diffusers 0.31.0__py3-none-any.whl → 0.32.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +66 -5
- diffusers/callbacks.py +56 -3
- diffusers/configuration_utils.py +1 -1
- diffusers/dependency_versions_table.py +1 -1
- diffusers/image_processor.py +25 -17
- diffusers/loaders/__init__.py +22 -3
- diffusers/loaders/ip_adapter.py +538 -15
- diffusers/loaders/lora_base.py +124 -118
- diffusers/loaders/lora_conversion_utils.py +318 -3
- diffusers/loaders/lora_pipeline.py +1688 -368
- diffusers/loaders/peft.py +379 -0
- diffusers/loaders/single_file_model.py +71 -4
- diffusers/loaders/single_file_utils.py +519 -9
- diffusers/loaders/textual_inversion.py +3 -3
- diffusers/loaders/transformer_flux.py +181 -0
- diffusers/loaders/transformer_sd3.py +89 -0
- diffusers/loaders/unet.py +17 -4
- diffusers/models/__init__.py +47 -14
- diffusers/models/activations.py +22 -9
- diffusers/models/attention.py +13 -4
- diffusers/models/attention_flax.py +1 -1
- diffusers/models/attention_processor.py +2059 -281
- diffusers/models/autoencoders/__init__.py +5 -0
- diffusers/models/autoencoders/autoencoder_dc.py +620 -0
- diffusers/models/autoencoders/autoencoder_kl.py +2 -1
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +36 -27
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
- diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
- diffusers/models/autoencoders/vae.py +18 -5
- diffusers/models/controlnet.py +47 -802
- diffusers/models/controlnet_flux.py +29 -495
- diffusers/models/controlnet_sd3.py +25 -379
- diffusers/models/controlnet_sparsectrl.py +46 -718
- diffusers/models/controlnets/__init__.py +23 -0
- diffusers/models/controlnets/controlnet.py +872 -0
- diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
- diffusers/models/controlnets/controlnet_flux.py +536 -0
- diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
- diffusers/models/controlnets/controlnet_sd3.py +489 -0
- diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
- diffusers/models/controlnets/controlnet_union.py +832 -0
- diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
- diffusers/models/controlnets/multicontrolnet.py +183 -0
- diffusers/models/embeddings.py +838 -43
- diffusers/models/model_loading_utils.py +88 -6
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +72 -26
- diffusers/models/normalization.py +78 -13
- diffusers/models/transformers/__init__.py +5 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +2 -2
- diffusers/models/transformers/cogvideox_transformer_3d.py +46 -11
- diffusers/models/transformers/dit_transformer_2d.py +1 -1
- diffusers/models/transformers/latte_transformer_3d.py +4 -4
- diffusers/models/transformers/pixart_transformer_2d.py +1 -1
- diffusers/models/transformers/sana_transformer.py +488 -0
- diffusers/models/transformers/stable_audio_transformer.py +1 -1
- diffusers/models/transformers/transformer_2d.py +1 -1
- diffusers/models/transformers/transformer_allegro.py +422 -0
- diffusers/models/transformers/transformer_cogview3plus.py +1 -1
- diffusers/models/transformers/transformer_flux.py +30 -9
- diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
- diffusers/models/transformers/transformer_ltx.py +469 -0
- diffusers/models/transformers/transformer_mochi.py +499 -0
- diffusers/models/transformers/transformer_sd3.py +105 -17
- diffusers/models/transformers/transformer_temporal.py +1 -1
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d.py +8 -1
- diffusers/models/unets/unet_2d_blocks.py +88 -21
- diffusers/models/unets/unet_2d_condition.py +1 -1
- diffusers/models/unets/unet_3d_blocks.py +9 -7
- diffusers/models/unets/unet_motion_model.py +5 -5
- diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
- diffusers/models/unets/unet_stable_cascade.py +2 -2
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +8 -0
- diffusers/pipelines/__init__.py +34 -0
- diffusers/pipelines/allegro/__init__.py +48 -0
- diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
- diffusers/pipelines/allegro/pipeline_output.py +23 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +8 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1 -1
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +0 -6
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +8 -8
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -8
- diffusers/pipelines/auto_pipeline.py +53 -6
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +50 -22
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +51 -20
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +69 -21
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +47 -21
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +1 -1
- diffusers/pipelines/controlnet/__init__.py +86 -80
- diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
- diffusers/pipelines/controlnet/pipeline_controlnet.py +11 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +1 -2
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +3 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +5 -1
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +53 -19
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -8
- diffusers/pipelines/flux/__init__.py +13 -1
- diffusers/pipelines/flux/modeling_flux.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +204 -29
- diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +49 -27
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +40 -30
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +78 -56
- diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
- diffusers/pipelines/flux/pipeline_flux_img2img.py +33 -27
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +36 -29
- diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
- diffusers/pipelines/flux/pipeline_output.py +16 -0
- diffusers/pipelines/hunyuan_video/__init__.py +48 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
- diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +5 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
- diffusers/pipelines/kolors/text_encoder.py +2 -2
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
- diffusers/pipelines/ltx/__init__.py +50 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
- diffusers/pipelines/ltx/pipeline_output.py +20 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +1 -8
- diffusers/pipelines/mochi/__init__.py +48 -0
- diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
- diffusers/pipelines/mochi/pipeline_output.py +20 -0
- diffusers/pipelines/pag/__init__.py +7 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1 -2
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +5 -1
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +6 -13
- diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +6 -6
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +3 -0
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
- diffusers/pipelines/pipeline_flax_utils.py +1 -1
- diffusers/pipelines/pipeline_loading_utils.py +25 -4
- diffusers/pipelines/pipeline_utils.py +35 -6
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +6 -13
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +6 -13
- diffusers/pipelines/sana/__init__.py +47 -0
- diffusers/pipelines/sana/pipeline_output.py +21 -0
- diffusers/pipelines/sana/pipeline_sana.py +884 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +216 -20
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +62 -9
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +57 -8
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -8
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -8
- diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/quantizers/auto.py +14 -1
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -1
- diffusers/quantizers/gguf/__init__.py +1 -0
- diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
- diffusers/quantizers/gguf/utils.py +456 -0
- diffusers/quantizers/quantization_config.py +280 -2
- diffusers/quantizers/torchao/__init__.py +15 -0
- diffusers/quantizers/torchao/torchao_quantizer.py +292 -0
- diffusers/schedulers/scheduling_ddpm.py +2 -6
- diffusers/schedulers/scheduling_ddpm_parallel.py +2 -6
- diffusers/schedulers/scheduling_deis_multistep.py +28 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +35 -9
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +35 -8
- diffusers/schedulers/scheduling_dpmsolver_sde.py +4 -4
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +48 -10
- diffusers/schedulers/scheduling_euler_discrete.py +4 -4
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
- diffusers/schedulers/scheduling_heun_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +4 -4
- diffusers/schedulers/scheduling_k_dpm_2_discrete.py +4 -4
- diffusers/schedulers/scheduling_lcm.py +2 -6
- diffusers/schedulers/scheduling_lms_discrete.py +4 -4
- diffusers/schedulers/scheduling_repaint.py +1 -1
- diffusers/schedulers/scheduling_sasolver.py +28 -9
- diffusers/schedulers/scheduling_tcd.py +2 -6
- diffusers/schedulers/scheduling_unipc_multistep.py +53 -8
- diffusers/training_utils.py +16 -2
- diffusers/utils/__init__.py +5 -0
- diffusers/utils/constants.py +1 -0
- diffusers/utils/dummy_pt_objects.py +180 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dynamic_modules_utils.py +3 -3
- diffusers/utils/hub_utils.py +31 -39
- diffusers/utils/import_utils.py +67 -0
- diffusers/utils/peft_utils.py +3 -0
- diffusers/utils/testing_utils.py +56 -1
- diffusers/utils/torch_utils.py +3 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/METADATA +6 -6
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/RECORD +214 -162
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/WHEEL +1 -1
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/LICENSE +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.31.0.dist-info → diffusers-0.32.1.dist-info}/top_level.txt +0 -0
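Beyond the long list of new pipelines (Allegro, HunyuanVideo, LTX, Mochi, Sana, the Flux Control/Fill/Redux variants, ControlNet-Union), the listing shows two new quantization backends under `diffusers/quantizers/` (GGUF and torchao). For orientation, a minimal sketch of the GGUF path this release enables; the GGUF repository and file name below are illustrative stand-ins, and only `GGUFQuantizationConfig` plus the `from_single_file(..., quantization_config=...)` hook come from 0.32.x:

```python
# Hedged sketch: load a pre-quantized GGUF FLUX transformer with the new GGUF
# quantization backend, then plug it into the regular pipeline.
# The GGUF repo/file below is an assumed community export, not part of this diff.
import torch

from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

transformer = FluxTransformer2DModel.from_single_file(
    "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q4_0.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload()
image = pipe("a cozy cabin in a snowy forest", num_inference_steps=28).images[0]
```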
diffusers/pipelines/stable_audio/pipeline_stable_audio.py

@@ -26,6 +26,7 @@ from ...models import AutoencoderOobleck, StableAudioDiTModel
 from ...models.embeddings import get_1d_rotary_pos_embed
 from ...schedulers import EDMDPMSolverMultistepScheduler
 from ...utils import (
+    is_torch_xla_available,
     logging,
     replace_example_docstring,
 )
@@ -34,6 +35,13 @@ from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 from .modeling_stable_audio import StableAudioProjectionModel


+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

 EXAMPLE_DOC_STRING = """
@@ -438,7 +446,7 @@ class StableAudioPipeline(DiffusionPipeline):
                 f"`initial_audio_waveforms` must be of shape `(batch_size, num_channels, audio_length)` or `(batch_size, audio_length)` but has `{initial_audio_waveforms.ndim}` dimensions"
             )

-        audio_vae_length = self.transformer.config.sample_size * self.vae.hop_length
+        audio_vae_length = int(self.transformer.config.sample_size) * self.vae.hop_length
         audio_shape = (batch_size // num_waveforms_per_prompt, audio_channels, audio_vae_length)

         # check num_channels
@@ -726,6 +734,9 @@ class StableAudioPipeline(DiffusionPipeline):
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)

+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         # 9. Post-processing
         if not output_type == "latent":
             audio = self.vae.decode(latents).sample
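The Stable Audio changes are the standard XLA hooks: an `is_torch_xla_available()` guard at import time and an `xm.mark_step()` after every scheduler step, which cuts the lazily traced graph so the denoising loop can run on TPU/XLA backends. A minimal sketch of how that is exercised; the checkpoint id and dtype are illustrative, not part of the diff:

```python
# Hedged sketch: run StableAudioPipeline on an XLA (TPU) device. The per-step
# xm.mark_step() added in this release keeps the lazy graph from growing across
# the whole denoising loop.
import torch
import torch_xla.core.xla_model as xm

from diffusers import StableAudioPipeline

device = xm.xla_device()
pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0", torch_dtype=torch.bfloat16  # assumed checkpoint
).to(device)

audio = pipe("gentle rain on a tin roof", num_inference_steps=100).audios[0]
```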
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

@@ -255,7 +255,12 @@ class StableDiffusionPipeline(
         is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
             version.parse(unet.config._diffusers_version).base_version
         ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        self._is_unet_config_sample_size_int = isinstance(unet.config.sample_size, int)
+        is_unet_sample_size_less_64 = (
+            hasattr(unet.config, "sample_size")
+            and self._is_unet_config_sample_size_int
+            and unet.config.sample_size < 64
+        )
         if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
             deprecation_message = (
                 "The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -902,8 +907,18 @@ class StableDiffusionPipeline(
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        if not height or not width:
+            height = (
+                self.unet.config.sample_size
+                if self._is_unet_config_sample_size_int
+                else self.unet.config.sample_size[0]
+            )
+            width = (
+                self.unet.config.sample_size
+                if self._is_unet_config_sample_size_int
+                else self.unet.config.sample_size[1]
+            )
+            height, width = height * self.vae_scale_factor, width * self.vae_scale_factor
         # to deal with lora scaling and other possible forward hooks

         # 1. Check inputs. Raise error if not correct
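These two hunks let `unet.config.sample_size` be an `(height, width)` pair rather than a single int: the deprecation check only fires for int configs, and the default output resolution is now derived per axis. A stand-alone sketch of the same defaulting logic (the `vae_scale_factor=8` value is the usual SD default, assumed here):

```python
# Stand-alone version of the new defaulting: sample_size may be an int or an
# (h, w) pair; the default resolution is sample_size * vae_scale_factor per axis.
from typing import Tuple, Union


def default_resolution(sample_size: Union[int, Tuple[int, int]], vae_scale_factor: int = 8):
    if isinstance(sample_size, int):
        h, w = sample_size, sample_size
    else:
        h, w = sample_size[0], sample_size[1]
    return h * vae_scale_factor, w * vae_scale_factor


print(default_resolution(64))        # (512, 512) -- classic SD 1.x config
print(default_resolution((96, 64)))  # (768, 512) -- non-square sample_size
```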
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py

@@ -1,4 +1,4 @@
-# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
+# Copyright 2024 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,14 +17,16 @@ from typing import Any, Callable, Dict, List, Optional, Union

 import torch
 from transformers import (
+    BaseImageProcessor,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
+    PreTrainedModel,
     T5EncoderModel,
     T5TokenizerFast,
 )

-from ...image_processor import VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import SD3Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -68,6 +70,20 @@ EXAMPLE_DOC_STRING = """
 """


+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
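`calculate_shift` is the same linear rule the FLUX pipelines use to pick the flow-matching shift `mu` from the latent sequence length: `base_seq_len` maps to `base_shift`, `max_seq_len` maps to `max_shift`, and everything in between is interpolated. A quick worked example with the defaults above; the 1024x1024 arithmetic assumes the usual SD3 `vae_scale_factor=8` and `patch_size=2`:

```python
# Worked example of the resolution-dependent shift (mu) computed above.
def calculate_shift(image_seq_len, base_seq_len=256, max_seq_len=4096, base_shift=0.5, max_shift=1.16):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b


# 1024x1024 image -> 128x128 latent (VAE /8) -> 64x64 patches (patch_size 2)
# -> image_seq_len = 64 * 64 = 4096, i.e. exactly max_seq_len.
print(calculate_shift(4096))  # 1.16   (max_shift)
print(calculate_shift(1024))  # ~0.632 (a 512x512 image)
print(calculate_shift(256))   # 0.5    (base_shift)
```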
@@ -128,7 +144,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps


-class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
+class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):
@@ -160,10 +176,14 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         tokenizer_3 (`T5TokenizerFast`):
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        image_encoder (`PreTrainedModel`, *optional*):
+            Pre-trained Vision Model for IP Adapter.
+        feature_extractor (`BaseImageProcessor`, *optional*):
+            Image processor for IP Adapter.
     """

-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
-    _optional_components = []
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
+    _optional_components = ["image_encoder", "feature_extractor"]
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]

     def __init__(
@@ -177,6 +197,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         tokenizer_2: CLIPTokenizer,
         text_encoder_3: T5EncoderModel,
         tokenizer_3: T5TokenizerFast,
+        image_encoder: PreTrainedModel = None,
+        feature_extractor: BaseImageProcessor = None,
     ):
         super().__init__()

@@ -190,6 +212,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             tokenizer_3=tokenizer_3,
             transformer=transformer,
             scheduler=scheduler,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
         )
         self.vae_scale_factor = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
@@ -642,6 +666,10 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
     def guidance_scale(self):
         return self._guidance_scale

+    @property
+    def skip_guidance_layers(self):
+        return self._skip_guidance_layers
+
     @property
     def clip_skip(self):
         return self._clip_skip
@@ -665,6 +693,83 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
     def interrupt(self):
         return self._interrupt

+    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_image
+    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
+        """Encodes the given image into a feature representation using a pre-trained image encoder.
+
+        Args:
+            image (`PipelineImageInput`):
+                Input image to be encoded.
+            device: (`torch.device`):
+                Torch device.
+
+        Returns:
+            `torch.Tensor`: The encoded image feature representation.
+        """
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=self.dtype)
+
+        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+
+    # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+    ) -> torch.Tensor:
+        """Prepares image embeddings for use in the IP-Adapter.
+
+        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
+
+        Args:
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                The input image to extract features from for IP-Adapter.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Precomputed image embeddings.
+            device: (`torch.device`, *optional*):
+                Torch device.
+            num_images_per_prompt (`int`, defaults to 1):
+                Number of images that should be generated per prompt.
+            do_classifier_free_guidance (`bool`, defaults to True):
+                Whether to use classifier free guidance or not.
+        """
+        device = device or self._execution_device
+
+        if ip_adapter_image_embeds is not None:
+            if do_classifier_free_guidance:
+                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
+            else:
+                single_image_embeds = ip_adapter_image_embeds
+        elif ip_adapter_image is not None:
+            single_image_embeds = self.encode_image(ip_adapter_image, device)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
+        else:
+            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
+
+        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
+
+        return image_embeds.to(device=device)
+
+    def enable_sequential_cpu_offload(self, *args, **kwargs):
+        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
+            logger.warning(
+                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
+                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
+                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
+            )
+
+        super().enable_sequential_cpu_offload(*args, **kwargs)
+
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
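Together with the new `SD3IPAdapterMixin` base class (and the expanded `diffusers/loaders/ip_adapter.py` in the listing), these methods give the SD3 pipeline an image-prompt path. A hedged usage sketch; the SD 3.5 checkpoint, the SigLIP encoder, and the InstantX adapter repo are assumptions for illustration, while the pipeline arguments themselves come from this diff:

```python
# Hedged sketch of SD3 image prompting (IP-Adapter). Model ids/URLs are examples.
import torch
from transformers import SiglipImageProcessor, SiglipVisionModel

from diffusers import StableDiffusion3Pipeline
from diffusers.utils import load_image

encoder_id = "google/siglip-so400m-patch14-384"  # assumed image encoder
image_encoder = SiglipVisionModel.from_pretrained(encoder_id, torch_dtype=torch.float16)
feature_extractor = SiglipImageProcessor.from_pretrained(encoder_id)

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",
    image_encoder=image_encoder,          # new optional component
    feature_extractor=feature_extractor,  # new optional component
    torch_dtype=torch.float16,
).to("cuda")
pipe.load_ip_adapter("InstantX/SD3.5-Large-IP-Adapter")  # via SD3IPAdapterMixin

reference = load_image("https://example.com/reference.png")  # placeholder URL
image = pipe(
    prompt="a watercolor landscape in the style of the reference",
    ip_adapter_image=reference,           # new __call__ argument
    num_inference_steps=28,
    guidance_scale=7.0,
).images[0]
```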
@@ -675,7 +780,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 28,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt_2: Optional[Union[str, List[str]]] = None,
@@ -687,6 +792,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -694,6 +801,11 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 256,
+        skip_guidance_layers: List[int] = None,
+        skip_layer_guidance_scale: float = 2.8,
+        skip_layer_guidance_stop: float = 0.2,
+        skip_layer_guidance_start: float = 0.01,
+        mu: Optional[float] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -715,10 +827,10 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
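`timesteps` is replaced by `sigmas`, matching the FLUX pipelines: with `FlowMatchEulerDiscreteScheduler`, a custom schedule is expressed as noise levels handed to `set_timesteps(sigmas=...)`, and the step count follows from its length. A small sketch of passing an explicit schedule; the checkpoint id and the linear ramp are illustrative:

```python
# Hedged sketch of the new `sigmas` argument that replaces `timesteps`.
import numpy as np
import torch

from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
).to("cuda")

steps = 10
sigmas = np.linspace(1.0, 1 / steps, steps).tolist()  # simple linear ramp

image = pipe(
    prompt="a macro photo of a dew-covered leaf",
    sigmas=sigmas,  # the scheduler derives the timesteps (and step count) from these
    guidance_scale=7.0,
).images[0]
```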
@@ -758,12 +870,17 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
+                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
+                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.
-
+                Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of
+                a plain tuple.
             joint_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                 `self.processor` in
@@ -778,6 +895,23 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
             max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+            skip_guidance_layers (`List[int]`, *optional*):
+                A list of integers that specify layers to skip during guidance. If not provided, all layers will be
+                used for guidance. If provided, the guidance will only be applied to the layers specified in the list.
+                Recommended value by StabiltyAI for Stable Diffusion 3.5 Medium is [7, 8, 9].
+            skip_layer_guidance_scale (`int`, *optional*): The scale of the guidance for the layers specified in
+                `skip_guidance_layers`. The guidance will be applied to the layers specified in `skip_guidance_layers`
+                with a scale of `skip_layer_guidance_scale`. The guidance will be applied to the rest of the layers
+                with a scale of `1`.
+            skip_layer_guidance_stop (`int`, *optional*): The step at which the guidance for the layers specified in
+                `skip_guidance_layers` will stop. The guidance will be applied to the layers specified in
+                `skip_guidance_layers` until the fraction specified in `skip_layer_guidance_stop`. Recommended value by
+                StabiltyAI for Stable Diffusion 3.5 Medium is 0.2.
+            skip_layer_guidance_start (`int`, *optional*): The step at which the guidance for the layers specified in
+                `skip_guidance_layers` will start. The guidance will be applied to the layers specified in
+                `skip_guidance_layers` from the fraction specified in `skip_layer_guidance_start`. Recommended value by
+                StabiltyAI for Stable Diffusion 3.5 Medium is 0.01.
+            mu (`float`, *optional*): `mu` value used for `dynamic_shifting`.

         Examples:

@@ -809,6 +943,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         )

         self._guidance_scale = guidance_scale
+        self._skip_layer_guidance_scale = skip_layer_guidance_scale
         self._clip_skip = clip_skip
         self._joint_attention_kwargs = joint_attention_kwargs
         self._interrupt = False
@@ -851,15 +986,13 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         )

         if self.do_classifier_free_guidance:
+            if skip_guidance_layers is not None:
+                original_prompt_embeds = prompt_embeds
+                original_pooled_prompt_embeds = pooled_prompt_embeds
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
             pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)

-        # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
-
-        # 5. Prepare latent variables
+        # 4. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
@@ -872,7 +1005,49 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             latents,
         )

-        # 6. Denoising loop
+        # 5. Prepare timesteps
+        scheduler_kwargs = {}
+        if self.scheduler.config.get("use_dynamic_shifting", None) and mu is None:
+            _, _, height, width = latents.shape
+            image_seq_len = (height // self.transformer.config.patch_size) * (
+                width // self.transformer.config.patch_size
+            )
+            mu = calculate_shift(
+                image_seq_len,
+                self.scheduler.config.base_image_seq_len,
+                self.scheduler.config.max_image_seq_len,
+                self.scheduler.config.base_shift,
+                self.scheduler.config.max_shift,
+            )
+            scheduler_kwargs["mu"] = mu
+        elif mu is not None:
+            scheduler_kwargs["mu"] = mu
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            **scheduler_kwargs,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Prepare image embeddings
+        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
+            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
+            if self.joint_attention_kwargs is None:
+                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
+            else:
+                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
+
+        # 7. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
@@ -896,6 +1071,27 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 if self.do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    should_skip_layers = (
+                        True
+                        if i > num_inference_steps * skip_layer_guidance_start
+                        and i < num_inference_steps * skip_layer_guidance_stop
+                        else False
+                    )
+                    if skip_guidance_layers is not None and should_skip_layers:
+                        timestep = t.expand(latents.shape[0])
+                        latent_model_input = latents
+                        noise_pred_skip_layers = self.transformer(
+                            hidden_states=latent_model_input,
+                            timestep=timestep,
+                            encoder_hidden_states=original_prompt_embeds,
+                            pooled_projections=original_pooled_prompt_embeds,
+                            joint_attention_kwargs=self.joint_attention_kwargs,
+                            return_dict=False,
+                            skip_layers=skip_guidance_layers,
+                        )[0]
+                        noise_pred = (
+                            noise_pred + (noise_pred_text - noise_pred_skip_layers) * self._skip_layer_guidance_scale
+                        )

                 # compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
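Skip-layer guidance adds a third transformer pass inside the configured step window, run with the listed layers skipped (`skip_layers=` on the transformer), and pushes the prediction away from that degraded result by `skip_layer_guidance_scale`. A hedged sketch using the values the docstring recommends for Stable Diffusion 3.5 Medium; the checkpoint id and prompt are assumptions:

```python
# Hedged sketch of the new skip-layer-guidance arguments (SD 3.5 Medium values).
import torch

from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.bfloat16
).to("cuda")

image = pipe(
    prompt="a close-up portrait of an owl, studio lighting",
    num_inference_steps=40,
    guidance_scale=4.5,
    skip_guidance_layers=[7, 8, 9],   # layers dropped in the extra pass
    skip_layer_guidance_scale=2.8,    # weight of the extra guidance term
    skip_layer_guidance_start=0.01,   # active from 1% of the steps ...
    skip_layer_guidance_stop=0.2,     # ... up to 20% of the steps
).images[0]
```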
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py

@@ -75,6 +75,20 @@ EXAMPLE_DOC_STRING = """
 """


+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
 def retrieve_latents(
     encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
@@ -218,6 +232,9 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         )
         self.tokenizer_max_length = self.tokenizer.model_max_length
         self.default_sample_size = self.transformer.config.sample_size
+        self.patch_size = (
+            self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
+        )

     # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
@@ -531,6 +548,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         prompt,
         prompt_2,
         prompt_3,
+        height,
+        width,
         strength,
         negative_prompt=None,
         negative_prompt_2=None,
@@ -542,6 +561,15 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
+        if (
+            height % (self.vae_scale_factor * self.patch_size) != 0
+            or width % (self.vae_scale_factor * self.patch_size) != 0
+        ):
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
+                f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
+            )
+
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

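With the SD3 VAE (`vae_scale_factor = 8`) and `patch_size = 2`, the new check requires `height` and `width` to be multiples of 16, and the error message points at the nearest smaller valid size. The same rounding in isolation, assuming those defaults:

```python
# Round a requested size down to the nearest value the img2img check accepts
# (vae_scale_factor=8, patch_size=2 -> multiples of 16, the usual SD3 setup).
def snap_to_valid(size: int, vae_scale_factor: int = 8, patch_size: int = 2) -> int:
    multiple = vae_scale_factor * patch_size
    return size - size % multiple


print(snap_to_valid(1000))  # 992
print(snap_to_valid(1024))  # 1024 (already a multiple of 16)
```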
@@ -710,10 +738,12 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         prompt: Union[str, List[str]] = None,
         prompt_2: Optional[Union[str, List[str]]] = None,
         prompt_3: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         image: PipelineImageInput = None,
         strength: float = 0.6,
         num_inference_steps: int = 50,
-        timesteps: List[int] = None,
+        sigmas: Optional[List[float]] = None,
         guidance_scale: float = 7.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt_2: Optional[Union[str, List[str]]] = None,
@@ -732,6 +762,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 256,
+        mu: Optional[float] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -753,10 +784,10 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -800,8 +831,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.
-
+                Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of
+                a plain tuple.
             joint_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                 `self.processor` in
@@ -816,6 +847,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
             max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
+            mu (`float`, *optional*): `mu` value used for `dynamic_shifting`.

         Examples:

@@ -824,12 +856,16 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             prompt,
             prompt_2,
             prompt_3,
+            height,
+            width,
             strength,
             negative_prompt=negative_prompt,
             negative_prompt_2=negative_prompt_2,
@@ -890,10 +926,27 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)

         # 3. Preprocess image
-        image = self.image_processor.preprocess(image)
+        image = self.image_processor.preprocess(image, height=height, width=width)

         # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        scheduler_kwargs = {}
+        if self.scheduler.config.get("use_dynamic_shifting", None) and mu is None:
+            image_seq_len = (int(height) // self.vae_scale_factor // self.transformer.config.patch_size) * (
+                int(width) // self.vae_scale_factor // self.transformer.config.patch_size
+            )
+            mu = calculate_shift(
+                image_seq_len,
+                self.scheduler.config.base_image_seq_len,
+                self.scheduler.config.max_image_seq_len,
+                self.scheduler.config.base_shift,
+                self.scheduler.config.max_shift,
+            )
+            scheduler_kwargs["mu"] = mu
+        elif mu is not None:
+            scheduler_kwargs["mu"] = mu
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, sigmas=sigmas, **scheduler_kwargs
+        )
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)

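Img2img picks up the same `sigmas`/`mu` handling and now takes explicit `height`/`width`, which drive both the image preprocessing and the dynamic-shift computation. A hedged sketch of the updated call; the checkpoint id and image URL are illustrative:

```python
# Hedged sketch of the updated SD3 img2img call (explicit height/width).
import torch

from diffusers import StableDiffusion3Img2ImgPipeline
from diffusers.utils import load_image

pipe = StableDiffusion3Img2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large", torch_dtype=torch.bfloat16
).to("cuda")

init_image = load_image("https://example.com/sketch.png")  # placeholder URL
image = pipe(
    prompt="a detailed oil painting based on the sketch",
    image=init_image,
    height=768,   # both must be divisible by vae_scale_factor * patch_size (16)
    width=1152,
    strength=0.6,
    num_inference_steps=40,
    # mu is derived from height/width via calculate_shift when the scheduler has
    # use_dynamic_shifting enabled; pass mu=... to override it explicitly.
).images[0]
```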