diffusers 0.29.2__py3-none-any.whl → 0.30.1__py3-none-any.whl
This diff compares the published contents of two publicly released versions of the package, as they appear in a supported registry. It is provided for informational purposes only.
- diffusers/__init__.py +94 -3
- diffusers/commands/env.py +1 -5
- diffusers/configuration_utils.py +4 -9
- diffusers/dependency_versions_table.py +2 -2
- diffusers/image_processor.py +1 -2
- diffusers/loaders/__init__.py +17 -2
- diffusers/loaders/ip_adapter.py +10 -7
- diffusers/loaders/lora_base.py +752 -0
- diffusers/loaders/lora_pipeline.py +2252 -0
- diffusers/loaders/peft.py +213 -5
- diffusers/loaders/single_file.py +3 -14
- diffusers/loaders/single_file_model.py +31 -10
- diffusers/loaders/single_file_utils.py +293 -8
- diffusers/loaders/textual_inversion.py +1 -6
- diffusers/loaders/unet.py +23 -208
- diffusers/models/__init__.py +20 -0
- diffusers/models/activations.py +22 -0
- diffusers/models/attention.py +386 -7
- diffusers/models/attention_processor.py +1937 -629
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_kl.py +14 -3
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1271 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
- diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vq_model.py +4 -4
- diffusers/models/controlnet.py +2 -3
- diffusers/models/controlnet_hunyuan.py +401 -0
- diffusers/models/controlnet_sd3.py +11 -11
- diffusers/models/controlnet_sparsectrl.py +789 -0
- diffusers/models/controlnet_xs.py +40 -10
- diffusers/models/downsampling.py +68 -0
- diffusers/models/embeddings.py +403 -36
- diffusers/models/model_loading_utils.py +1 -3
- diffusers/models/modeling_flax_utils.py +1 -6
- diffusers/models/modeling_utils.py +4 -16
- diffusers/models/normalization.py +203 -12
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +543 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +485 -0
- diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
- diffusers/models/transformers/latte_transformer_3d.py +327 -0
- diffusers/models/transformers/lumina_nextdit2d.py +340 -0
- diffusers/models/transformers/pixart_transformer_2d.py +102 -1
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/stable_audio_transformer.py +458 -0
- diffusers/models/transformers/transformer_flux.py +455 -0
- diffusers/models/transformers/transformer_sd3.py +18 -4
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_condition.py +8 -1
- diffusers/models/unets/unet_3d_blocks.py +51 -920
- diffusers/models/unets/unet_3d_condition.py +4 -1
- diffusers/models/unets/unet_i2vgen_xl.py +4 -1
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +1330 -84
- diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
- diffusers/models/unets/unet_stable_cascade.py +1 -3
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +64 -0
- diffusers/models/vq_model.py +8 -4
- diffusers/optimization.py +1 -1
- diffusers/pipelines/__init__.py +100 -3
- diffusers/pipelines/animatediff/__init__.py +4 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
- diffusers/pipelines/aura_flow/__init__.py +48 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
- diffusers/pipelines/auto_pipeline.py +97 -19
- diffusers/pipelines/cogvideo/__init__.py +48 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +746 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
- diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +749 -0
- diffusers/pipelines/flux/pipeline_output.py +21 -0
- diffusers/pipelines/free_init_utils.py +2 -0
- diffusers/pipelines/free_noise_utils.py +236 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
- diffusers/pipelines/kolors/__init__.py +54 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
- diffusers/pipelines/kolors/pipeline_output.py +21 -0
- diffusers/pipelines/kolors/text_encoder.py +889 -0
- diffusers/pipelines/kolors/tokenizer.py +334 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
- diffusers/pipelines/latte/__init__.py +48 -0
- diffusers/pipelines/latte/pipeline_latte.py +881 -0
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
- diffusers/pipelines/lumina/__init__.py +48 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
- diffusers/pipelines/pag/__init__.py +67 -0
- diffusers/pipelines/pag/pag_utils.py +237 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
- diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
- diffusers/pipelines/pia/pipeline_pia.py +30 -37
- diffusers/pipelines/pipeline_flax_utils.py +4 -9
- diffusers/pipelines/pipeline_loading_utils.py +0 -3
- diffusers/pipelines/pipeline_utils.py +2 -14
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
- diffusers/pipelines/stable_audio/__init__.py +50 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
- diffusers/schedulers/__init__.py +8 -0
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
- diffusers/schedulers/scheduling_ddim.py +1 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
- diffusers/schedulers/scheduling_ddpm.py +1 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
- diffusers/schedulers/scheduling_deis_multistep.py +2 -2
- diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
- diffusers/schedulers/scheduling_ipndm.py +1 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
- diffusers/schedulers/scheduling_utils.py +1 -3
- diffusers/schedulers/scheduling_utils_flax.py +1 -3
- diffusers/training_utils.py +99 -14
- diffusers/utils/__init__.py +2 -2
- diffusers/utils/dummy_pt_objects.py +210 -0
- diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
- diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
- diffusers/utils/dynamic_modules_utils.py +1 -11
- diffusers/utils/export_utils.py +50 -6
- diffusers/utils/hub_utils.py +45 -42
- diffusers/utils/import_utils.py +37 -15
- diffusers/utils/loading_utils.py +80 -3
- diffusers/utils/testing_utils.py +11 -8
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/METADATA +73 -83
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/RECORD +217 -164
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/WHEEL +1 -1
- diffusers/loaders/autoencoder.py +0 -146
- diffusers/loaders/controlnet.py +0 -136
- diffusers/loaders/lora.py +0 -1728
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/LICENSE +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/top_level.txt +0 -0
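The file list shows the headline 0.30 additions (Flux, CogVideoX, AuraFlow, Kolors, Latte, Lumina, PAG, Stable Audio) alongside a loader refactor that replaces `diffusers/loaders/lora.py` with `lora_base.py` and `lora_pipeline.py`. The hunks below walk through six of the touched files: the Stable Diffusion upscale and unCLIP pipelines picking up the renamed LoRA loader mixin, and the Stable Diffusion 3 pipelines gaining an inpainting variant and text-encoder LoRA scaling. For orientation, a minimal sketch of one of the new entry points; the checkpoint id and few-step settings are assumptions about the FLUX.1-schnell release, not taken from this diff:

```python
# Hedged sketch: trying one of the pipelines added in 0.30 (FluxPipeline).
import torch
from diffusers import FluxPipeline

# checkpoint id assumed; any Flux checkpoint in diffusers format loads the same way
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # optional: lowers VRAM use at some speed cost

image = pipe(
    "a tiny astronaut hatching from an egg on the moon",
    num_inference_steps=4,  # the schnell variant is distilled for few-step sampling
    guidance_scale=0.0,     # schnell is trained without classifier-free guidance
).images[0]
image.save("flux_schnell.png")
```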
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py

```diff
@@ -22,12 +22,10 @@ import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -68,7 +66,11 @@ def preprocess(image):
 
 
 class StableDiffusionUpscalePipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    FromSingleFileMixin,
 ):
     r"""
     Pipeline for text-guided image super-resolution using Stable Diffusion 2.
@@ -78,8 +80,8 @@ class StableDiffusionUpscalePipeline(
 
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
 
     Args:
@@ -245,7 +247,7 @@ class StableDiffusionUpscalePipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
             # dynamically adjust the LoRA scale
@@ -378,7 +380,7 @@ class StableDiffusionUpscalePipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
@@ -520,8 +522,6 @@ class StableDiffusionUpscalePipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need
@@ -616,7 +616,7 @@ class StableDiffusionUpscalePipeline(
         >>> # load model and scheduler
         >>> model_id = "stabilityai/stable-diffusion-x4-upscaler"
         >>> pipeline = StableDiffusionUpscalePipeline.from_pretrained(
-        ...     model_id, revision="fp16", torch_dtype=torch.float16
+        ...     model_id, variant="fp16", torch_dtype=torch.float16
        ...     )
         >>> pipeline = pipeline.to("cuda")
 
```
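Everything in this file is mechanical fallout from the loader refactor: `LoraLoaderMixin` became `StableDiffusionLoraLoaderMixin` (the old name appears to survive as a deprecated alias in `lora_pipeline.py`), and the long-deprecated `LoRAAttnProcessor2_0`/`LoRAXFormersAttnProcessor` entries drop out of the attention-processor checks. For downstream code the visible change is only the import path; a hedged sketch, with a hypothetical LoRA repo id:

```python
# Hedged sketch of the rename from the consumer side; the LoRA repo id is hypothetical.
import torch
from diffusers import StableDiffusionPipeline
from diffusers.loaders import StableDiffusionLoraLoaderMixin

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# load_lora_weights is now inherited from StableDiffusionLoraLoaderMixin
assert isinstance(pipe, StableDiffusionLoraLoaderMixin)
pipe.load_lora_weights("some-user/some-sd15-lora")  # hypothetical repo id
image = pipe("a photo of a corgi", cross_attention_kwargs={"scale": 0.8}).images[0]
```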
diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py

```diff
@@ -20,7 +20,7 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokeniz
 from transformers.models.clip.modeling_clip import CLIPTextModelOutput
 
 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel
 from ...models.embeddings import get_timestep_embedding
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -58,7 +58,9 @@ EXAMPLE_DOC_STRING = """
 """
 
 
-class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableUnCLIPPipeline(
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
+):
     """
     Pipeline for text-to-image generation using stable unCLIP.
 
@@ -67,8 +69,8 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
 
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
 
     Args:
         prior_tokenizer ([`CLIPTokenizer`]):
@@ -326,7 +328,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
             # dynamically adjust the LoRA scale
@@ -459,7 +461,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
```
diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py

```diff
@@ -20,7 +20,7 @@ import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.embeddings import get_timestep_embedding
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -70,7 +70,7 @@ EXAMPLE_DOC_STRING = """
 
 
 class StableUnCLIPImg2ImgPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
 ):
     """
     Pipeline for text-guided image-to-image generation using stable unCLIP.
@@ -80,8 +80,8 @@ class StableUnCLIPImg2ImgPipeline(
 
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
 
     Args:
         feature_extractor ([`CLIPImageProcessor`]):
@@ -290,7 +290,7 @@ class StableUnCLIPImg2ImgPipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
             # dynamically adjust the LoRA scale
@@ -423,7 +423,7 @@ class StableUnCLIPImg2ImgPipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
```
diffusers/pipelines/stable_diffusion_3/__init__.py

```diff
@@ -25,6 +25,7 @@ except OptionalDependencyNotAvailable:
 else:
     _import_structure["pipeline_stable_diffusion_3"] = ["StableDiffusion3Pipeline"]
     _import_structure["pipeline_stable_diffusion_3_img2img"] = ["StableDiffusion3Img2ImgPipeline"]
+    _import_structure["pipeline_stable_diffusion_3_inpaint"] = ["StableDiffusion3InpaintPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     else:
         from .pipeline_stable_diffusion_3 import StableDiffusion3Pipeline
         from .pipeline_stable_diffusion_3_img2img import StableDiffusion3Img2ImgPipeline
+        from .pipeline_stable_diffusion_3_inpaint import StableDiffusion3InpaintPipeline
 
 else:
     import sys
```
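These two hunks register the new `StableDiffusion3InpaintPipeline` (the +1201-line `pipeline_stable_diffusion_3_inpaint.py` in the file list) with the lazy-import machinery. A hedged usage sketch; the image and mask URLs are placeholders, and the SD3 checkpoint is gated on the Hub:

```python
# Hedged sketch of the newly registered SD3 inpainting pipeline.
import torch
from diffusers import StableDiffusion3InpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusion3InpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
).to("cuda")

image = load_image("https://example.com/source.png")  # placeholder URL
mask = load_image("https://example.com/mask.png")     # placeholder URL
result = pipe(
    prompt="a small robot sitting on a park bench",
    image=image,
    mask_image=mask,
    strength=0.8,  # how strongly the masked region is repainted
).images[0]
result.save("sd3_inpaint.png")
```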
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py

```diff
@@ -29,9 +29,12 @@ from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import SD3Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
+    USE_PEFT_BACKEND,
     is_torch_xla_available,
     logging,
     replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
 )
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
@@ -329,6 +332,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         clip_skip: Optional[int] = None,
         max_sequence_length: int = 256,
+        lora_scale: Optional[float] = None,
     ):
         r"""
 
@@ -374,9 +378,22 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             clip_skip (`int`, *optional*):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
         """
         device = device or self._execution_device
 
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
         prompt = [prompt] if isinstance(prompt, str) else prompt
         if prompt is not None:
             batch_size = len(prompt)
@@ -479,6 +496,16 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
             )
 
+        if self.text_encoder is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
         return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
 
     def check_inputs(
@@ -683,7 +710,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                 Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
-            guidance_scale (`float`, *optional*, defaults to
+            guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -746,8 +773,8 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
         Examples:
 
         Returns:
-            [`~pipelines.
-            [`~pipelines.
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
@@ -787,6 +814,9 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
 
         device = self._execution_device
 
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
         (
             prompt_embeds,
             negative_prompt_embeds,
@@ -808,6 +838,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
             clip_skip=self.clip_skip,
             num_images_per_prompt=num_images_per_prompt,
             max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
         )
 
         if self.do_classifier_free_guidance:
```
diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py

```diff
@@ -25,13 +25,17 @@ from transformers import (
 )
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import SD3Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
+    USE_PEFT_BACKEND,
     is_torch_xla_available,
     logging,
     replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
 )
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
@@ -62,7 +66,7 @@ EXAMPLE_DOC_STRING = """
         >>> pipe = pipe.to(device)
 
         >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
-        >>> init_image = load_image(url).resize((
+        >>> init_image = load_image(url).resize((1024, 1024))
 
         >>> prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
 
@@ -346,6 +350,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         clip_skip: Optional[int] = None,
         max_sequence_length: int = 256,
+        lora_scale: Optional[float] = None,
     ):
         r"""
 
@@ -391,9 +396,22 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
             clip_skip (`int`, *optional*):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
         """
         device = device or self._execution_device
 
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
         prompt = [prompt] if isinstance(prompt, str) else prompt
         if prompt is not None:
             batch_size = len(prompt)
@@ -496,6 +514,16 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
                 [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
             )
 
+        if self.text_encoder is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
         return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
 
     def check_inputs(
@@ -605,8 +633,6 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
         )
 
         image = image.to(device=device, dtype=dtype)
-        if image.shape[1] == self.vae.config.latent_channels:
-            init_latents = image
 
         batch_size = batch_size * num_images_per_prompt
         if image.shape[1] == self.vae.config.latent_channels:
@@ -726,7 +752,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
                 Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
-            guidance_scale (`float`, *optional*, defaults to
+            guidance_scale (`float`, *optional*, defaults to 7.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -785,8 +811,8 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
         Examples:
 
         Returns:
-            [`~pipelines.
-            [`~pipelines.
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
@@ -854,7 +880,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
         # 4. Prepare timesteps
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
-        latent_timestep = timesteps[:1].repeat(batch_size *
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
 
         # 5. Prepare latent variables
         if latents is None:
```
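The last hunk completes the standard img2img timestep trimming, where `strength` decides how much of the schedule actually runs. A hedged sketch of that arithmetic, mirroring the usual `get_timesteps` helper rather than quoting this diff:

```python
# Hedged sketch: how strength maps to the timesteps SD3 img2img actually runs.
num_inference_steps, strength = 28, 0.6
batch_size, num_images_per_prompt = 1, 2

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 16
t_start = max(num_inference_steps - init_timestep, 0)                          # 12
print(t_start)  # only scheduler.timesteps[12:] are run

# the fixed line above then noises the init latents at the first kept timestep:
# latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
```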
|