diffusers 0.29.2__py3-none-any.whl → 0.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +94 -3
- diffusers/commands/env.py +1 -5
- diffusers/configuration_utils.py +4 -9
- diffusers/dependency_versions_table.py +2 -2
- diffusers/image_processor.py +1 -2
- diffusers/loaders/__init__.py +17 -2
- diffusers/loaders/ip_adapter.py +10 -7
- diffusers/loaders/lora_base.py +752 -0
- diffusers/loaders/lora_pipeline.py +2222 -0
- diffusers/loaders/peft.py +213 -5
- diffusers/loaders/single_file.py +1 -12
- diffusers/loaders/single_file_model.py +31 -10
- diffusers/loaders/single_file_utils.py +262 -2
- diffusers/loaders/textual_inversion.py +1 -6
- diffusers/loaders/unet.py +23 -208
- diffusers/models/__init__.py +20 -0
- diffusers/models/activations.py +22 -0
- diffusers/models/attention.py +386 -7
- diffusers/models/attention_processor.py +1795 -629
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_kl.py +14 -3
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1035 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
- diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vq_model.py +4 -4
- diffusers/models/controlnet.py +2 -3
- diffusers/models/controlnet_hunyuan.py +401 -0
- diffusers/models/controlnet_sd3.py +11 -11
- diffusers/models/controlnet_sparsectrl.py +789 -0
- diffusers/models/controlnet_xs.py +40 -10
- diffusers/models/downsampling.py +68 -0
- diffusers/models/embeddings.py +319 -36
- diffusers/models/model_loading_utils.py +1 -3
- diffusers/models/modeling_flax_utils.py +1 -6
- diffusers/models/modeling_utils.py +4 -16
- diffusers/models/normalization.py +203 -12
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +527 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +345 -0
- diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
- diffusers/models/transformers/latte_transformer_3d.py +327 -0
- diffusers/models/transformers/lumina_nextdit2d.py +340 -0
- diffusers/models/transformers/pixart_transformer_2d.py +102 -1
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/stable_audio_transformer.py +458 -0
- diffusers/models/transformers/transformer_flux.py +455 -0
- diffusers/models/transformers/transformer_sd3.py +18 -4
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_condition.py +8 -1
- diffusers/models/unets/unet_3d_blocks.py +51 -920
- diffusers/models/unets/unet_3d_condition.py +4 -1
- diffusers/models/unets/unet_i2vgen_xl.py +4 -1
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +1330 -84
- diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
- diffusers/models/unets/unet_stable_cascade.py +1 -3
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +64 -0
- diffusers/models/vq_model.py +8 -4
- diffusers/optimization.py +1 -1
- diffusers/pipelines/__init__.py +100 -3
- diffusers/pipelines/animatediff/__init__.py +4 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
- diffusers/pipelines/aura_flow/__init__.py +48 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
- diffusers/pipelines/auto_pipeline.py +97 -19
- diffusers/pipelines/cogvideo/__init__.py +48 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +687 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
- diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +749 -0
- diffusers/pipelines/flux/pipeline_output.py +21 -0
- diffusers/pipelines/free_init_utils.py +2 -0
- diffusers/pipelines/free_noise_utils.py +236 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
- diffusers/pipelines/kolors/__init__.py +54 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
- diffusers/pipelines/kolors/pipeline_output.py +21 -0
- diffusers/pipelines/kolors/text_encoder.py +889 -0
- diffusers/pipelines/kolors/tokenizer.py +334 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
- diffusers/pipelines/latte/__init__.py +48 -0
- diffusers/pipelines/latte/pipeline_latte.py +881 -0
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
- diffusers/pipelines/lumina/__init__.py +48 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
- diffusers/pipelines/pag/__init__.py +67 -0
- diffusers/pipelines/pag/pag_utils.py +237 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
- diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
- diffusers/pipelines/pia/pipeline_pia.py +30 -37
- diffusers/pipelines/pipeline_flax_utils.py +4 -9
- diffusers/pipelines/pipeline_loading_utils.py +0 -3
- diffusers/pipelines/pipeline_utils.py +2 -14
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
- diffusers/pipelines/stable_audio/__init__.py +50 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
- diffusers/schedulers/__init__.py +8 -0
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
- diffusers/schedulers/scheduling_ddim.py +1 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
- diffusers/schedulers/scheduling_ddpm.py +1 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
- diffusers/schedulers/scheduling_deis_multistep.py +2 -2
- diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
- diffusers/schedulers/scheduling_ipndm.py +1 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
- diffusers/schedulers/scheduling_utils.py +1 -3
- diffusers/schedulers/scheduling_utils_flax.py +1 -3
- diffusers/training_utils.py +99 -14
- diffusers/utils/__init__.py +2 -2
- diffusers/utils/dummy_pt_objects.py +210 -0
- diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
- diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
- diffusers/utils/dynamic_modules_utils.py +1 -11
- diffusers/utils/export_utils.py +1 -4
- diffusers/utils/hub_utils.py +45 -42
- diffusers/utils/import_utils.py +19 -16
- diffusers/utils/loading_utils.py +76 -3
- diffusers/utils/testing_utils.py +11 -8
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/METADATA +73 -83
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/RECORD +217 -164
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/WHEEL +1 -1
- diffusers/loaders/autoencoder.py +0 -146
- diffusers/loaders/controlnet.py +0 -136
- diffusers/loaders/lora.py +0 -1728
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/LICENSE +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.0.dist-info}/top_level.txt +0 -0
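The dominant change running through the hunks below is the LoRA-loader refactor: the monolithic `diffusers/loaders/lora.py` (-1728 lines) is replaced by `lora_base.py` and `lora_pipeline.py`, and the old `LoraLoaderMixin` gives way to pipeline-specific mixins such as `StableDiffusionLoraLoaderMixin`. The release also ships entirely new model families (Flux, CogVideoX, AuraFlow, Kolors, Latte, Lumina, Stable Audio, and the PAG pipeline variants), visible above as added files. A minimal sketch of the renamed LoRA surface; the repo ids below are placeholders, not taken from this diff:

```python
# Minimal sketch of the renamed LoRA-loading API in 0.30.0. Both repo ids
# are placeholders for any SD 1.5-style checkpoint plus a compatible LoRA.
import torch
from diffusers import StableDiffusionPipeline
from diffusers.loaders import StableDiffusionLoraLoaderMixin

pipe = StableDiffusionPipeline.from_pretrained(
    "some-org/some-sd15-checkpoint", torch_dtype=torch.float16
)

# The pipeline inherits the renamed mixin, so the load/save entry points
# referenced in the docstring hunks below are available directly on it.
assert isinstance(pipe, StableDiffusionLoraLoaderMixin)
pipe.load_lora_weights("some-org/some-lora")  # save_lora_weights() is the counterpart
```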
diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py
CHANGED
@@ -22,7 +22,7 @@ from torch.nn import functional as F
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import Attention
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -323,7 +323,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
         # dynamically adjust the LoRA scale
@@ -456,7 +456,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionM
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
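The three hunks above are the rename pattern repeated in nearly every Stable Diffusion pipeline in this release: `encode_prompt` scales the text encoder's LoRA layers before encoding and restores them afterwards, now gated on the renamed mixin. A condensed sketch of that pattern; the function name and `pipe` argument are illustrative, not diffusers API:

```python
# Condensed sketch of the encode-prompt LoRA scaling pattern touched by the
# hunks above. `pipe` stands for any pipeline inheriting the renamed mixin.
from diffusers.loaders import StableDiffusionLoraLoaderMixin
from diffusers.utils import USE_PEFT_BACKEND, scale_lora_layers, unscale_lora_layers

def encode_prompt_with_lora_scale(pipe, input_ids, lora_scale=None):
    # set lora scale so that monkey-patched LoRA layers can access it
    if lora_scale is not None and isinstance(pipe, StableDiffusionLoraLoaderMixin):
        pipe._lora_scale = lora_scale
        if USE_PEFT_BACKEND:
            # dynamically adjust the LoRA scale for this encoding pass
            scale_lora_layers(pipe.text_encoder, lora_scale)

    prompt_embeds = pipe.text_encoder(input_ids)[0]

    if pipe.text_encoder is not None:
        if isinstance(pipe, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(pipe.text_encoder, lora_scale)
    return prompt_embeds
```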
diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py
CHANGED
@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import DDIMInverseScheduler, KarrasDiffusionSchedulers
@@ -234,7 +234,7 @@ def preprocess_mask(mask, batch_size: int = 1):
 
 
 class StableDiffusionDiffEditPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
 ):
     r"""
     <Tip warning={true}>
@@ -250,8 +250,8 @@ class StableDiffusionDiffEditPipeline(
 
     The pipeline also inherits the following loading and saving methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
 
     Args:
         vae ([`AutoencoderKL`]):
@@ -448,7 +448,7 @@ class StableDiffusionDiffEditPipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
         # dynamically adjust the LoRA scale
@@ -581,7 +581,7 @@ class StableDiffusionDiffEditPipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
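DiffEdit above (and K-Diffusion further down) gains the renamed mixin directly in its base-class list; inheriting it is all that exposes the `load_lora_weights`/`save_lora_weights` methods named in the docstring bullets. A toy illustration of the composition pattern; the class name is hypothetical:

```python
# Toy illustration of the class-composition change: a pipeline opts into
# LoRA loading by listing the mixin among its bases. Hypothetical class name.
from diffusers import DiffusionPipeline
from diffusers.loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin

class MyCustomPipeline(
    DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
):
    # load_lora_weights()/save_lora_weights() and load_textual_inversion()
    # are inherited; no further wiring is required for the basic path.
    pass
```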
diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
CHANGED
@@ -18,10 +18,10 @@ from typing import Any, Callable, Dict, List, Optional, Union
 
 import PIL.Image
 import torch
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention import GatedSelfAttentionDense
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -138,7 +138,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
         unet: UNet2DConditionModel,
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
     ):
         super().__init__()
@@ -249,7 +249,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
         # dynamically adjust the LoRA scale
@@ -382,7 +382,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
CHANGED
@@ -19,7 +19,7 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import PIL.Image
 import torch
 from transformers import (
-    CLIPFeatureExtractor,
+    CLIPImageProcessor,
     CLIPProcessor,
     CLIPTextModel,
     CLIPTokenizer,
@@ -27,7 +27,7 @@ from transformers import (
 )
 
 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention import GatedSelfAttentionDense
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -193,7 +193,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
         unet: UNet2DConditionModel,
         scheduler: KarrasDiffusionSchedulers,
         safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
+        feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
     ):
         super().__init__()
@@ -274,7 +274,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
         # dynamically adjust the LoRA scale
@@ -407,7 +407,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
CHANGED
@@ -21,7 +21,7 @@ from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
 from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras
 
 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import LMSDiscreteScheduler
 from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
@@ -48,7 +48,7 @@ class ModelWrapper:
 
 
 class StableDiffusionKDiffusionPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
 ):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion.
@@ -58,8 +58,8 @@ class StableDiffusionKDiffusionPipeline(
 
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
 
     <Tip warning={true}>
 
@@ -223,7 +223,7 @@ class StableDiffusionKDiffusionPipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
         # dynamically adjust the LoRA scale
@@ -356,7 +356,7 @@ class StableDiffusionKDiffusionPipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
CHANGED
@@ -36,8 +36,6 @@ from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
     FusedAttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -543,8 +541,6 @@ class StableDiffusionXLKDiffusionPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
                 FusedAttnProcessor2_0,
             ),
         )
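Both hunks in this file drop the deprecated `LoRAAttnProcessor2_0` and `LoRAXFormersAttnProcessor` (obsolete under the PEFT backend) from the import block and from the isinstance check that decides whether the VAE attention can stay in reduced precision. A sketch of the trimmed check; placing it in an `upcast_vae`-style helper is an inference from the hunk context, not shown in this diff:

```python
# Sketch of the trimmed processor check (assumed to sit in an upcast_vae-style
# helper): only the non-LoRA attention processors are tested now.
import torch
from diffusers.models.attention_processor import (
    AttnProcessor2_0,
    FusedAttnProcessor2_0,
    XFormersAttnProcessor,
)

def upcast_vae(vae):
    dtype = vae.dtype
    vae.to(torch.float32)  # run the VAE in float32 by default
    use_torch_2_0_or_xformers = isinstance(
        vae.decoder.mid_block.attentions[0].processor,
        (AttnProcessor2_0, XFormersAttnProcessor, FusedAttnProcessor2_0),
    )
    # with torch 2.0 / xformers attention, the attention blocks can stay in
    # the original dtype, which saves memory
    if use_torch_2_0_or_xformers:
        vae.post_quant_conv.to(dtype)
        vae.decoder.conv_in.to(dtype)
        vae.decoder.mid_block.to(dtype)
```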
diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
CHANGED
@@ -22,7 +22,7 @@ import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
 from ...image_processor import PipelineImageInput, VaeImageProcessorLDM3D
-from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -161,7 +161,7 @@ class StableDiffusionLDM3DPipeline(
     StableDiffusionMixin,
     TextualInversionLoaderMixin,
     IPAdapterMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
     FromSingleFileMixin,
 ):
     r"""
@@ -172,8 +172,8 @@ class StableDiffusionLDM3DPipeline(
 
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
 
@@ -323,7 +323,7 @@ class StableDiffusionLDM3DPipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
         # dynamically adjust the LoRA scale
@@ -456,7 +456,7 @@ class StableDiffusionLDM3DPipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
@@ -491,6 +491,9 @@ class StableDiffusionLDM3DPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -500,7 +503,6 @@ class StableDiffusionLDM3DPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )
 
-            image_embeds = []
            for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -508,36 +510,28 @@ class StableDiffusionLDM3DPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )
 
+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)
 
-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
 
     def run_safety_checker(self, image, device, dtype):
         if self.safety_checker is None:
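The `prepare_ip_adapter_image_embeds` rewrite above (repeated verbatim in the Panorama and SDXL sections below) replaces the per-adapter `torch.stack`/`repeat` juggling with a two-phase flow: first collect one positive (and, under classifier-free guidance, one negative) embedding per IP adapter, then tile everything by `num_images_per_prompt` and concatenate negative before positive. A standalone sketch of the new control flow, with the image-encoder call stubbed out as a plain input:

```python
# Standalone sketch of the new two-phase flow. encode_image is stubbed here:
# `encoded_pairs` stands in for one (positive, negative) embedding per adapter.
import torch

def prepare_ip_adapter_image_embeds_sketch(
    encoded_pairs, num_images_per_prompt, do_classifier_free_guidance, device
):
    # Phase 1: collect per-adapter embeddings with a leading batch axis.
    image_embeds, negative_image_embeds = [], []
    for single_image_embeds, single_negative_image_embeds in encoded_pairs:
        image_embeds.append(single_image_embeds[None, :])
        if do_classifier_free_guidance:
            negative_image_embeds.append(single_negative_image_embeds[None, :])

    # Phase 2: tile by num_images_per_prompt and, under CFG, prepend the
    # negative half so downstream code can split it back out with chunk(2).
    ip_adapter_image_embeds = []
    for i, single_image_embeds in enumerate(image_embeds):
        single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
        if do_classifier_free_guidance:
            single_negative_image_embeds = torch.cat(
                [negative_image_embeds[i]] * num_images_per_prompt, dim=0
            )
            single_image_embeds = torch.cat(
                [single_negative_image_embeds, single_image_embeds], dim=0
            )
        ip_adapter_image_embeds.append(single_image_embeds.to(device=device))
    return ip_adapter_image_embeds
```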
diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
CHANGED
@@ -19,7 +19,7 @@ import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import DDIMScheduler
@@ -135,7 +135,11 @@ def retrieve_timesteps(
 
 
 class StableDiffusionPanoramaPipeline(
-    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
+    IPAdapterMixin,
 ):
     r"""
     Pipeline for text-to-image generation using MultiDiffusion.
@@ -145,8 +149,8 @@ class StableDiffusionPanoramaPipeline(
 
     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
 
     Args:
@@ -295,7 +299,7 @@ class StableDiffusionPanoramaPipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
         # dynamically adjust the LoRA scale
@@ -428,7 +432,7 @@ class StableDiffusionPanoramaPipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
@@ -463,6 +467,9 @@ class StableDiffusionPanoramaPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -472,7 +479,6 @@ class StableDiffusionPanoramaPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )
 
-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -480,36 +486,28 @@ class StableDiffusionPanoramaPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )
 
+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)
 
-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, device, dtype):
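One behavioural detail worth spelling out from the unchanged `chunk(2)` branch: when callers pass precomputed `ip_adapter_image_embeds` with classifier-free guidance enabled, each tensor must carry the negative and positive halves concatenated along dim 0. A small illustration; the shapes are made up for the example:

```python
# Hypothetical illustration of the layout expected for precomputed embeds
# under classifier-free guidance: negative half first, positive half second,
# concatenated on dim 0 so the pipeline can split them with chunk(2).
import torch

pos = torch.randn(1, 257, 1280)   # example image-embedding shape
neg = torch.zeros_like(pos)
precomputed = torch.cat([neg, pos], dim=0)   # shape (2, 257, 1280)

neg_half, pos_half = precomputed.chunk(2)    # what the pipeline does internally
assert torch.equal(pos_half, pos) and torch.equal(neg_half, neg)
```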
diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
CHANGED
@@ -20,7 +20,7 @@ import torch.nn.functional as F
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -238,7 +238,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale
 
         # dynamically adjust the LoRA scale
@@ -371,7 +371,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)
 
diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
CHANGED
@@ -36,8 +36,6 @@ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
     FusedAttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -537,6 +535,9 @@ class StableDiffusionXLPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]
@@ -546,7 +547,6 @@ class StableDiffusionXLPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )
 
-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):
@@ -554,36 +554,28 @@ class StableDiffusionXLPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )
 
+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)
 
-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
     def prepare_extra_step_kwargs(self, generator, eta):
@@ -748,8 +740,6 @@ class StableDiffusionXLPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
                 FusedAttnProcessor2_0,
             ),
         )