diffusers-0.29.2-py3-none-any.whl → diffusers-0.30.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +94 -3
- diffusers/commands/env.py +1 -5
- diffusers/configuration_utils.py +4 -9
- diffusers/dependency_versions_table.py +2 -2
- diffusers/image_processor.py +1 -2
- diffusers/loaders/__init__.py +17 -2
- diffusers/loaders/ip_adapter.py +10 -7
- diffusers/loaders/lora_base.py +752 -0
- diffusers/loaders/lora_pipeline.py +2252 -0
- diffusers/loaders/peft.py +213 -5
- diffusers/loaders/single_file.py +3 -14
- diffusers/loaders/single_file_model.py +31 -10
- diffusers/loaders/single_file_utils.py +293 -8
- diffusers/loaders/textual_inversion.py +1 -6
- diffusers/loaders/unet.py +23 -208
- diffusers/models/__init__.py +20 -0
- diffusers/models/activations.py +22 -0
- diffusers/models/attention.py +386 -7
- diffusers/models/attention_processor.py +1937 -629
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_kl.py +14 -3
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1271 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
- diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vq_model.py +4 -4
- diffusers/models/controlnet.py +2 -3
- diffusers/models/controlnet_hunyuan.py +401 -0
- diffusers/models/controlnet_sd3.py +11 -11
- diffusers/models/controlnet_sparsectrl.py +789 -0
- diffusers/models/controlnet_xs.py +40 -10
- diffusers/models/downsampling.py +68 -0
- diffusers/models/embeddings.py +403 -36
- diffusers/models/model_loading_utils.py +1 -3
- diffusers/models/modeling_flax_utils.py +1 -6
- diffusers/models/modeling_utils.py +4 -16
- diffusers/models/normalization.py +203 -12
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +543 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +485 -0
- diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
- diffusers/models/transformers/latte_transformer_3d.py +327 -0
- diffusers/models/transformers/lumina_nextdit2d.py +340 -0
- diffusers/models/transformers/pixart_transformer_2d.py +102 -1
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/stable_audio_transformer.py +458 -0
- diffusers/models/transformers/transformer_flux.py +455 -0
- diffusers/models/transformers/transformer_sd3.py +18 -4
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_condition.py +8 -1
- diffusers/models/unets/unet_3d_blocks.py +51 -920
- diffusers/models/unets/unet_3d_condition.py +4 -1
- diffusers/models/unets/unet_i2vgen_xl.py +4 -1
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +1330 -84
- diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
- diffusers/models/unets/unet_stable_cascade.py +1 -3
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +64 -0
- diffusers/models/vq_model.py +8 -4
- diffusers/optimization.py +1 -1
- diffusers/pipelines/__init__.py +100 -3
- diffusers/pipelines/animatediff/__init__.py +4 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
- diffusers/pipelines/aura_flow/__init__.py +48 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
- diffusers/pipelines/auto_pipeline.py +97 -19
- diffusers/pipelines/cogvideo/__init__.py +48 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +746 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
- diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +749 -0
- diffusers/pipelines/flux/pipeline_output.py +21 -0
- diffusers/pipelines/free_init_utils.py +2 -0
- diffusers/pipelines/free_noise_utils.py +236 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
- diffusers/pipelines/kolors/__init__.py +54 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
- diffusers/pipelines/kolors/pipeline_output.py +21 -0
- diffusers/pipelines/kolors/text_encoder.py +889 -0
- diffusers/pipelines/kolors/tokenizer.py +334 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
- diffusers/pipelines/latte/__init__.py +48 -0
- diffusers/pipelines/latte/pipeline_latte.py +881 -0
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
- diffusers/pipelines/lumina/__init__.py +48 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
- diffusers/pipelines/pag/__init__.py +67 -0
- diffusers/pipelines/pag/pag_utils.py +237 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
- diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
- diffusers/pipelines/pia/pipeline_pia.py +30 -37
- diffusers/pipelines/pipeline_flax_utils.py +4 -9
- diffusers/pipelines/pipeline_loading_utils.py +0 -3
- diffusers/pipelines/pipeline_utils.py +2 -14
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
- diffusers/pipelines/stable_audio/__init__.py +50 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
- diffusers/schedulers/__init__.py +8 -0
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
- diffusers/schedulers/scheduling_ddim.py +1 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
- diffusers/schedulers/scheduling_ddpm.py +1 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
- diffusers/schedulers/scheduling_deis_multistep.py +2 -2
- diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
- diffusers/schedulers/scheduling_ipndm.py +1 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
- diffusers/schedulers/scheduling_utils.py +1 -3
- diffusers/schedulers/scheduling_utils_flax.py +1 -3
- diffusers/training_utils.py +99 -14
- diffusers/utils/__init__.py +2 -2
- diffusers/utils/dummy_pt_objects.py +210 -0
- diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
- diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
- diffusers/utils/dynamic_modules_utils.py +1 -11
- diffusers/utils/export_utils.py +50 -6
- diffusers/utils/hub_utils.py +45 -42
- diffusers/utils/import_utils.py +37 -15
- diffusers/utils/loading_utils.py +80 -3
- diffusers/utils/testing_utils.py +11 -8
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/METADATA +73 -83
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/RECORD +217 -164
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/WHEEL +1 -1
- diffusers/loaders/autoencoder.py +0 -146
- diffusers/loaders/controlnet.py +0 -136
- diffusers/loaders/lora.py +0 -1728
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/LICENSE +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/top_level.txt +0 -0
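The listing above shows the headline change of this release: the monolithic `diffusers/loaders/lora.py` (1728 lines removed) is split into the new `lora_base.py` and `lora_pipeline.py`, and the pipeline-facing mixin `LoraLoaderMixin` becomes `StableDiffusionLoraLoaderMixin`, a rename that recurs throughout the hunks below. A minimal migration sketch for downstream code follows; the `try`/`except` guard is our illustration, and the availability of the old name on 0.30.x as a deprecated alias is an assumption to verify against your install:

```python
# Hedged sketch: prefer the 0.30.x name, fall back to the pre-0.30 one.
try:
    from diffusers.loaders import StableDiffusionLoraLoaderMixin as LoraMixin  # 0.30.x
except ImportError:
    from diffusers.loaders import LoraLoaderMixin as LoraMixin  # 0.29.x and earlier

from diffusers import StableDiffusionPipeline

# The user-facing API is unchanged: pipelines still expose
# load_lora_weights() / save_lora_weights() through this mixin.
print(issubclass(StableDiffusionPipeline, LoraMixin))
```

Selected hunks from the most heavily reworked pipeline files follow.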
diffusers/pipelines/consistency_models/pipeline_consistency_models.py

@@ -52,7 +52,7 @@ EXAMPLE_DOC_STRING = """
         >>> image.save("cd_imagenet64_l2_onestep_sample_penguin.png")

         >>> # Multistep sampling, class-conditional image generation
-        >>> # Timesteps can be explicitly specified; the particular timesteps below are from the original
+        >>> # Timesteps can be explicitly specified; the particular timesteps below are from the original GitHub repo:
         >>> # https://github.com/openai/consistency_models/blob/main/scripts/launch.sh#L77
         >>> image = pipe(num_inference_steps=None, timesteps=[22, 0], class_labels=145).images[0]
         >>> image.save("cd_imagenet64_l2_multistep_sample_penguin.png")
diffusers/pipelines/controlnet/pipeline_controlnet.py

@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers

@@ -156,7 +156,7 @@ class StableDiffusionControlNetPipeline(
     DiffusionPipeline,
     StableDiffusionMixin,
     TextualInversionLoaderMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
     IPAdapterMixin,
     FromSingleFileMixin,
 ):

@@ -168,8 +168,8 @@ class StableDiffusionControlNetPipeline(

     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -331,7 +331,7 @@ class StableDiffusionControlNetPipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

             # dynamically adjust the LoRA scale

@@ -464,7 +464,7 @@ class StableDiffusionControlNetPipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

@@ -499,6 +499,9 @@ class StableDiffusionControlNetPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]

@@ -508,7 +511,6 @@ class StableDiffusionControlNetPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):

@@ -516,36 +518,28 @@ class StableDiffusionControlNetPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, device, dtype):
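The reworked `prepare_ip_adapter_image_embeds` above changes where duplication happens: 0.29.x repeated each adapter's embeddings as soon as they were encoded (`torch.stack`/`.repeat`), while 0.30.x first collects positives and negatives per adapter (note the leading batch axis added via `[None, :]`) and only then tiles both in a single final pass. A self-contained sketch of that final pass with dummy tensors; the function name and shapes are illustrative, not the pipeline's:

```python
# Stand-in for the new tail loop of prepare_ip_adapter_image_embeds.
import torch

def batch_ip_adapter_embeds(image_embeds, negative_image_embeds, num_images_per_prompt, do_cfg):
    out = []
    for i, emb in enumerate(image_embeds):
        # Duplicate once per requested image...
        emb = torch.cat([emb] * num_images_per_prompt, dim=0)
        if do_cfg:
            # ...and prepend the matching negatives so one UNet forward covers CFG.
            neg = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
            emb = torch.cat([neg, emb], dim=0)
        out.append(emb)
    return out

embeds = [torch.randn(1, 4, 768)]     # one IP adapter, leading batch axis already added
negatives = [torch.zeros(1, 4, 768)]
batched = batch_ip_adapter_embeds(embeds, negatives, num_images_per_prompt=2, do_cfg=True)
print(batched[0].shape)  # torch.Size([4, 4, 768]): ordered [neg, neg, pos, pos]
```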
@@ -1278,7 +1272,7 @@ class StableDiffusionControlNetPipeline(
                 )

                 if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                     # To apply the output of ControlNet to both the unconditional and conditional batches,
                     # add 0 to the unconditional batch to keep it unchanged.
                     down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
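The comment restored above is worth unpacking: in guess mode with classifier-free guidance, ControlNet is run only on the conditional half of the batch, and zero residuals are concatenated in front so that adding them to the unconditional half leaves it unchanged. A toy illustration (the tensor shape is made up):

```python
import torch

d = torch.randn(2, 320, 64, 64)             # ControlNet residual for the conditional batch
both = torch.cat([torch.zeros_like(d), d])  # [unconditional | conditional] along dim 0
print(both.shape)                   # torch.Size([4, 320, 64, 64])
print(both[:2].abs().max().item())  # 0.0 -- the unconditional half gets no control signal
```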
diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py

@@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers

@@ -134,7 +134,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
     DiffusionPipeline,
     StableDiffusionMixin,
     TextualInversionLoaderMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
     IPAdapterMixin,
     FromSingleFileMixin,
 ):

@@ -146,8 +146,8 @@ class StableDiffusionControlNetImg2ImgPipeline(

     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -309,7 +309,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

             # dynamically adjust the LoRA scale

@@ -442,7 +442,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

@@ -477,6 +477,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]

@@ -486,7 +489,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):

@@ -494,36 +496,28 @@ class StableDiffusionControlNetImg2ImgPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, device, dtype):

@@ -830,6 +824,13 @@ class StableDiffusionControlNetImg2ImgPipeline(
             )

         elif isinstance(generator, list):
+            if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
+                image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
+            elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
+                )
+
             init_latents = [
                 retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                 for i in range(batch_size)

@@ -1243,7 +1244,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
                 )

                 if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                     # To apply the output of ControlNet to both the unconditional and conditional batches,
                     # add 0 to the unconditional batch to keep it unchanged.
                     down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
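Beyond the shared changes, the `@@ -830,6 +824,13 @@` hunk above gives `prepare_latents` an explicit batch-expansion rule: an `image` batch smaller than the effective batch size is tiled up only when it divides evenly; otherwise a `ValueError` is raised. A standalone sketch of the rule (the function name is ours):

```python
import torch

def expand_image_batch(image: torch.Tensor, batch_size: int) -> torch.Tensor:
    """Tile a smaller image batch up to batch_size, mirroring the 0.30.x rule."""
    if image.shape[0] < batch_size:
        if batch_size % image.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
            )
        image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
    return image

print(expand_image_batch(torch.randn(2, 3, 64, 64), 6).shape)  # torch.Size([6, 3, 64, 64])
# expand_image_batch(torch.randn(2, 3, 64, 64), 5) raises ValueError
```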
diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py

@@ -25,7 +25,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers

@@ -118,134 +118,11 @@ def retrieve_latents(
     raise AttributeError("Could not access latents of provided encoder_output")


-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image
-def prepare_mask_and_masked_image(image, mask, height, width, return_image=False):
-    """
-    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
-    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
-    ``image`` and ``1`` for the ``mask``.
-
-    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
-    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
-
-    Args:
-        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
-            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
-            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
-        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
-            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
-            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
-
-
-    Raises:
-        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
-        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
-        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
-            (ot the other way around).
-
-    Returns:
-        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
-        dimensions: ``batch x channels x height x width``.
-    """
-    deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead"
-    deprecate(
-        "prepare_mask_and_masked_image",
-        "0.30.0",
-        deprecation_message,
-    )
-    if image is None:
-        raise ValueError("`image` input cannot be undefined.")
-
-    if mask is None:
-        raise ValueError("`mask_image` input cannot be undefined.")
-
-    if isinstance(image, torch.Tensor):
-        if not isinstance(mask, torch.Tensor):
-            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
-
-        # Batch single image
-        if image.ndim == 3:
-            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
-            image = image.unsqueeze(0)
-
-        # Batch and add channel dim for single mask
-        if mask.ndim == 2:
-            mask = mask.unsqueeze(0).unsqueeze(0)
-
-        # Batch single mask or add channel dim
-        if mask.ndim == 3:
-            # Single batched mask, no channel dim or single mask not batched but channel dim
-            if mask.shape[0] == 1:
-                mask = mask.unsqueeze(0)
-
-            # Batched masks no channel dim
-            else:
-                mask = mask.unsqueeze(1)
-
-        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
-        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
-        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
-
-        # Check image is in [-1, 1]
-        if image.min() < -1 or image.max() > 1:
-            raise ValueError("Image should be in [-1, 1] range")
-
-        # Check mask is in [0, 1]
-        if mask.min() < 0 or mask.max() > 1:
-            raise ValueError("Mask should be in [0, 1] range")
-
-        # Binarize mask
-        mask[mask < 0.5] = 0
-        mask[mask >= 0.5] = 1
-
-        # Image as float32
-        image = image.to(dtype=torch.float32)
-    elif isinstance(mask, torch.Tensor):
-        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
-    else:
-        # preprocess image
-        if isinstance(image, (PIL.Image.Image, np.ndarray)):
-            image = [image]
-        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
-            # resize all images w.r.t passed height an width
-            image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
-            image = [np.array(i.convert("RGB"))[None, :] for i in image]
-            image = np.concatenate(image, axis=0)
-        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
-            image = np.concatenate([i[None, :] for i in image], axis=0)
-
-        image = image.transpose(0, 3, 1, 2)
-        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
-        # preprocess mask
-        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
-            mask = [mask]
-
-        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
-            mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
-            mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
-            mask = mask.astype(np.float32) / 255.0
-        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
-            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
-
-        mask[mask < 0.5] = 0
-        mask[mask >= 0.5] = 1
-        mask = torch.from_numpy(mask)
-
-        masked_image = image * (mask < 0.5)
-
-    # n.b. ensure backwards compatibility as old function does not return image
-    if return_image:
-        return mask, masked_image, image
-
-    return mask, masked_image
-
-
 class StableDiffusionControlNetInpaintPipeline(
     DiffusionPipeline,
     StableDiffusionMixin,
     TextualInversionLoaderMixin,
-    LoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
     IPAdapterMixin,
     FromSingleFileMixin,
 ):

@@ -257,8 +134,8 @@ class StableDiffusionControlNetInpaintPipeline(

     The pipeline also inherits the following loading methods:
         - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
         - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

@@ -434,7 +311,7 @@ class StableDiffusionControlNetInpaintPipeline(
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
             self._lora_scale = lora_scale

             # dynamically adjust the LoRA scale

@@ -567,7 +444,7 @@ class StableDiffusionControlNetInpaintPipeline(
         negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

         if self.text_encoder is not None:
-            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                 # Retrieve the original scale by scaling back the LoRA layers
                 unscale_lora_layers(self.text_encoder, lora_scale)

@@ -602,6 +479,9 @@ class StableDiffusionControlNetInpaintPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]

@@ -611,7 +491,6 @@ class StableDiffusionControlNetInpaintPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):

@@ -619,36 +498,28 @@ class StableDiffusionControlNetInpaintPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, device, dtype):

@@ -1537,7 +1408,7 @@ class StableDiffusionControlNetInpaintPipeline(
                 )

                 if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                     # To apply the output of ControlNet to both the unconditional and conditional batches,
                     # add 0 to the unconditional batch to keep it unchanged.
                     down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
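With `prepare_mask_and_masked_image` deleted above (it had already been emitting a deprecation warning pointing at `VaeImageProcessor.preprocess`), inpainting callers preprocess through image processors instead. A hedged migration sketch; the constructor arguments mirror how the 0.30.x inpaint pipelines build their own `mask_processor`, but verify against your install:

```python
from PIL import Image
from diffusers.image_processor import VaeImageProcessor

pil_image = Image.new("RGB", (512, 512))    # stand-ins for real inputs
pil_mask = Image.new("L", (512, 512), 255)

image_processor = VaeImageProcessor(vae_scale_factor=8)
mask_processor = VaeImageProcessor(
    vae_scale_factor=8, do_normalize=False, do_binarize=True, do_convert_grayscale=True
)

init_image = image_processor.preprocess(pil_image, height=512, width=512)  # float32, [-1, 1]
mask = mask_processor.preprocess(pil_mask, height=512, width=512)          # binarized, [0, 1]
print(init_image.shape, mask.shape)  # torch.Size([1, 3, 512, 512]) torch.Size([1, 1, 512, 512])
```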
diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py

@@ -38,8 +38,6 @@ from ...loaders import (
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder

@@ -86,6 +84,7 @@ EXAMPLE_DOC_STRING = """
         >>> # !pip install transformers accelerate
         >>> from diffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel, DDIMScheduler
         >>> from diffusers.utils import load_image
+        >>> from PIL import Image
         >>> import numpy as np
         >>> import torch

@@ -534,6 +533,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
     def prepare_ip_adapter_image_embeds(
         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
     ):
+        image_embeds = []
+        if do_classifier_free_guidance:
+            negative_image_embeds = []
         if ip_adapter_image_embeds is None:
             if not isinstance(ip_adapter_image, list):
                 ip_adapter_image = [ip_adapter_image]

@@ -543,7 +545,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                 )

-            image_embeds = []
             for single_ip_adapter_image, image_proj_layer in zip(
                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
             ):

@@ -551,36 +552,28 @@ class StableDiffusionXLControlNetInpaintPipeline(
                 single_image_embeds, single_negative_image_embeds = self.encode_image(
                     single_ip_adapter_image, device, 1, output_hidden_state
                 )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )

+                image_embeds.append(single_image_embeds[None, :])
                 if do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
+                    negative_image_embeds.append(single_negative_image_embeds[None, :])
         else:
-            repeat_dims = [1]
-            image_embeds = []
             for single_image_embeds in ip_adapter_image_embeds:
                 if do_classifier_free_guidance:
                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
-                    single_negative_image_embeds = single_negative_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
-                    )
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                else:
-                    single_image_embeds = single_image_embeds.repeat(
-                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
-                    )
+                    negative_image_embeds.append(single_negative_image_embeds)
                 image_embeds.append(single_image_embeds)

-        return image_embeds
+        ip_adapter_image_embeds = []
+        for i, single_image_embeds in enumerate(image_embeds):
+            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
+            if do_classifier_free_guidance:
+                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
+                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
+
+            single_image_embeds = single_image_embeds.to(device=device)
+            ip_adapter_image_embeds.append(single_image_embeds)
+
+        return ip_adapter_image_embeds

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
     def prepare_extra_step_kwargs(self, generator, eta):

@@ -1117,8 +1110,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
             (
                 AttnProcessor2_0,
                 XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
             ),
         )
         # if xformers or torch_2_0 is used attention block does not need

@@ -1748,7 +1739,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
                 )

                 if guess_mode and self.do_classifier_free_guidance:
-                    # Infered ControlNet only for the conditional batch.
+                    # Inferred ControlNet only for the conditional batch.
                     # To apply the output of ControlNet to both the unconditional and conditional batches,
                     # add 0 to the unconditional batch to keep it unchanged.
                     down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
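The `@@ -1117,8 +1110,6 @@` hunk is the counterpart of the import change at the top of this file: with the fused LoRA attention processors removed from the library, the fp16-VAE upcast check only needs to recognize the two remaining processor types. A hedged sketch of that check against a randomly initialized VAE (a real pipeline would inspect its loaded `self.vae`):

```python
import torch
from diffusers import AutoencoderKL
from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor

vae = AutoencoderKL()          # random weights; stands in for a pipeline's VAE
vae.to(dtype=torch.float16)

use_torch_2_0_or_xformers = isinstance(
    vae.decoder.mid_block.attentions[0].processor,
    (AttnProcessor2_0, XFormersAttnProcessor),
)
# If xformers or torch 2.0 attention is used, the attention block does not need
# to run in float32, which saves memory; otherwise the VAE stays upcast.
print(use_torch_2_0_or_xformers)
```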