diffusers 0.17.1__py3-none-any.whl → 0.18.2__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (120) hide show
  1. diffusers/__init__.py +26 -1
  2. diffusers/configuration_utils.py +34 -29
  3. diffusers/dependency_versions_table.py +4 -0
  4. diffusers/image_processor.py +125 -12
  5. diffusers/loaders.py +169 -203
  6. diffusers/models/attention.py +24 -1
  7. diffusers/models/attention_flax.py +10 -5
  8. diffusers/models/attention_processor.py +3 -0
  9. diffusers/models/autoencoder_kl.py +114 -33
  10. diffusers/models/controlnet.py +131 -14
  11. diffusers/models/controlnet_flax.py +37 -26
  12. diffusers/models/cross_attention.py +17 -17
  13. diffusers/models/embeddings.py +67 -0
  14. diffusers/models/modeling_flax_utils.py +64 -56
  15. diffusers/models/modeling_utils.py +193 -104
  16. diffusers/models/prior_transformer.py +207 -37
  17. diffusers/models/resnet.py +26 -26
  18. diffusers/models/transformer_2d.py +36 -41
  19. diffusers/models/transformer_temporal.py +24 -21
  20. diffusers/models/unet_1d.py +31 -25
  21. diffusers/models/unet_2d.py +43 -30
  22. diffusers/models/unet_2d_blocks.py +210 -89
  23. diffusers/models/unet_2d_blocks_flax.py +12 -12
  24. diffusers/models/unet_2d_condition.py +172 -64
  25. diffusers/models/unet_2d_condition_flax.py +38 -24
  26. diffusers/models/unet_3d_blocks.py +34 -31
  27. diffusers/models/unet_3d_condition.py +101 -34
  28. diffusers/models/vae.py +5 -5
  29. diffusers/models/vae_flax.py +37 -34
  30. diffusers/models/vq_model.py +23 -14
  31. diffusers/pipelines/__init__.py +24 -1
  32. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +1 -1
  33. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -3
  34. diffusers/pipelines/consistency_models/__init__.py +1 -0
  35. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +337 -0
  36. diffusers/pipelines/controlnet/multicontrolnet.py +120 -1
  37. diffusers/pipelines/controlnet/pipeline_controlnet.py +59 -17
  38. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +60 -15
  39. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +60 -17
  40. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  41. diffusers/pipelines/kandinsky/__init__.py +1 -1
  42. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +4 -6
  43. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +1 -0
  44. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -0
  45. diffusers/pipelines/kandinsky2_2/__init__.py +7 -0
  46. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +317 -0
  47. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +372 -0
  48. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +434 -0
  49. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +398 -0
  50. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +531 -0
  51. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +541 -0
  52. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +605 -0
  53. diffusers/pipelines/pipeline_flax_utils.py +2 -2
  54. diffusers/pipelines/pipeline_utils.py +124 -146
  55. diffusers/pipelines/shap_e/__init__.py +27 -0
  56. diffusers/pipelines/shap_e/camera.py +147 -0
  57. diffusers/pipelines/shap_e/pipeline_shap_e.py +390 -0
  58. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +349 -0
  59. diffusers/pipelines/shap_e/renderer.py +709 -0
  60. diffusers/pipelines/stable_diffusion/__init__.py +2 -0
  61. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +261 -66
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -3
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -2
  65. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +1 -1
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +719 -0
  69. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +1 -1
  70. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +832 -0
  71. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +17 -7
  72. diffusers/pipelines/stable_diffusion_xl/__init__.py +26 -0
  73. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +823 -0
  74. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +896 -0
  75. diffusers/pipelines/stable_diffusion_xl/watermark.py +31 -0
  76. diffusers/pipelines/text_to_video_synthesis/__init__.py +2 -1
  77. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +5 -1
  78. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +771 -0
  79. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +92 -6
  80. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
  81. diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +209 -91
  82. diffusers/schedulers/__init__.py +3 -0
  83. diffusers/schedulers/scheduling_consistency_models.py +380 -0
  84. diffusers/schedulers/scheduling_ddim.py +28 -6
  85. diffusers/schedulers/scheduling_ddim_inverse.py +19 -4
  86. diffusers/schedulers/scheduling_ddim_parallel.py +642 -0
  87. diffusers/schedulers/scheduling_ddpm.py +53 -7
  88. diffusers/schedulers/scheduling_ddpm_parallel.py +604 -0
  89. diffusers/schedulers/scheduling_deis_multistep.py +66 -11
  90. diffusers/schedulers/scheduling_dpmsolver_multistep.py +55 -13
  91. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +19 -4
  92. diffusers/schedulers/scheduling_dpmsolver_sde.py +73 -11
  93. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +23 -7
  94. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +58 -9
  95. diffusers/schedulers/scheduling_euler_discrete.py +58 -8
  96. diffusers/schedulers/scheduling_heun_discrete.py +89 -14
  97. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +73 -11
  98. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +73 -11
  99. diffusers/schedulers/scheduling_lms_discrete.py +57 -8
  100. diffusers/schedulers/scheduling_pndm.py +46 -10
  101. diffusers/schedulers/scheduling_repaint.py +19 -4
  102. diffusers/schedulers/scheduling_sde_ve.py +5 -1
  103. diffusers/schedulers/scheduling_unclip.py +43 -4
  104. diffusers/schedulers/scheduling_unipc_multistep.py +48 -7
  105. diffusers/training_utils.py +1 -1
  106. diffusers/utils/__init__.py +2 -1
  107. diffusers/utils/dummy_pt_objects.py +60 -0
  108. diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py +32 -0
  109. diffusers/utils/dummy_torch_and_transformers_objects.py +180 -0
  110. diffusers/utils/hub_utils.py +1 -1
  111. diffusers/utils/import_utils.py +20 -3
  112. diffusers/utils/logging.py +15 -18
  113. diffusers/utils/outputs.py +3 -3
  114. diffusers/utils/testing_utils.py +15 -0
  115. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/METADATA +4 -2
  116. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/RECORD +120 -94
  117. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/WHEEL +1 -1
  118. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/LICENSE +0 -0
  119. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/entry_points.txt +0 -0
  120. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,896 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
22
+
23
+ from ...image_processor import VaeImageProcessor
24
+ from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
25
+ from ...models import AutoencoderKL, UNet2DConditionModel
26
+ from ...models.attention_processor import (
27
+ AttnProcessor2_0,
28
+ LoRAAttnProcessor2_0,
29
+ LoRAXFormersAttnProcessor,
30
+ XFormersAttnProcessor,
31
+ )
32
+ from ...schedulers import KarrasDiffusionSchedulers
33
+ from ...utils import (
34
+ is_accelerate_available,
35
+ is_accelerate_version,
36
+ logging,
37
+ randn_tensor,
38
+ replace_example_docstring,
39
+ )
40
+ from ..pipeline_utils import DiffusionPipeline
41
+ from . import StableDiffusionXLPipelineOutput
42
+ from .watermark import StableDiffusionXLWatermarker
43
+
44
+
45
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
46
+
47
+ EXAMPLE_DOC_STRING = """
48
+ Examples:
49
+ ```py
50
+ >>> import torch
51
+ >>> from diffusers import StableDiffusionXLImg2ImgPipeline
52
+ >>> from diffusers.utils import load_image
53
+
54
+ >>> pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
55
+ ... "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16
56
+ ... )
57
+ >>> pipe = pipe.to("cuda")
58
+ >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
59
+
60
+ >>> init_image = load_image(url).convert("RGB")
61
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
62
+ >>> image = pipe(prompt, image=init_image).images[0]
63
+ ```
64
+ """
65
+
66
+
67
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Blend the guided noise prediction with a std-matched copy of itself.

    Implements the rescaling described in Section 3.4 of [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://arxiv.org/pdf/2305.08891.pdf).

    Args:
        noise_cfg: Noise prediction obtained after applying guidance.
        noise_pred_text: Text-conditioned noise prediction used as the reference for the standard deviation.
        guidance_rescale: Interpolation weight. `0.0` returns `noise_cfg` unchanged, `1.0` returns the fully
            rescaled prediction.

    Returns:
        The interpolated noise prediction.
    """

    def _std_over_non_leading_dims(tensor):
        # standard deviation over every dimension except the first one, kept broadcastable
        return tensor.std(dim=list(range(1, tensor.ndim)), keepdim=True)

    # ratio that brings the guided prediction back to the spread of the text prediction (fixes overexposure)
    std_ratio = _std_over_non_leading_dims(noise_pred_text) / _std_over_non_leading_dims(noise_cfg)
    std_matched = noise_cfg * std_ratio
    # keep part of the original guided result to avoid "plain looking" images
    return guidance_rescale * std_matched + (1 - guidance_rescale) * noise_cfg
79
+
80
+
81
+ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
82
+ r"""
83
+ Pipeline for text-to-image generation using Stable Diffusion.
84
+
85
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
86
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
87
+
88
+ In addition the pipeline inherits the following loading methods:
89
+ - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
90
+ - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
91
+ - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
92
+
93
+ as well as the following saving methods:
94
+ - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
95
+
96
+ Args:
97
+ vae ([`AutoencoderKL`]):
98
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
99
+ text_encoder ([`CLIPTextModel`]):
100
+ Frozen text-encoder. Stable Diffusion uses the text portion of
101
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
102
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
103
+ tokenizer (`CLIPTokenizer`):
104
+ Tokenizer of class
105
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
106
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
107
+ scheduler ([`SchedulerMixin`]):
108
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
109
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
110
+ """
111
+ _optional_components = ["tokenizer", "text_encoder"]
112
+
113
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    text_encoder_2: CLIPTextModelWithProjection,
    tokenizer: CLIPTokenizer,
    tokenizer_2: CLIPTokenizer,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    requires_aesthetics_score: bool = False,
    force_zeros_for_empty_prompt: bool = True,
):
    r"""
    Register the pipeline components and derive the image-processing helpers.

    Args:
        vae: Autoencoder whose `config.block_out_channels` determines `vae_scale_factor`.
        text_encoder: First text encoder.
        text_encoder_2: Second text encoder.
        tokenizer: Tokenizer paired with `text_encoder`.
        tokenizer_2: Tokenizer paired with `text_encoder_2`.
        unet: Denoising U-Net.
        scheduler: Scheduler used together with `unet`.
        requires_aesthetics_score: Stored in the pipeline config.
        force_zeros_for_empty_prompt: Stored in the pipeline config; read by `encode_prompt`.
    """
    super().__init__()

    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "text_encoder_2": text_encoder_2,
        "tokenizer": tokenizer,
        "tokenizer_2": tokenizer_2,
        "unet": unet,
        "scheduler": scheduler,
    }
    self.register_modules(**components)

    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)

    # one factor of two per VAE block after the first
    num_vae_blocks = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (num_vae_blocks - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    self.watermark = StableDiffusionXLWatermarker()
142
+
143
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
144
def enable_vae_slicing(self):
    r"""
    Turn on sliced VAE decoding by delegating to `self.vae.enable_slicing()`.

    With slicing enabled the VAE splits the input tensor into slices and decodes them in several steps, which
    saves memory and allows larger batch sizes.
    """
    vae = self.vae
    vae.enable_slicing()
152
+
153
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
154
def disable_vae_slicing(self):
    r"""
    Turn off sliced VAE decoding by delegating to `self.vae.disable_slicing()`.

    If `enable_vae_slicing` was called before, decoding goes back to a single step.
    """
    vae = self.vae
    vae.disable_slicing()
160
+
161
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
162
def enable_vae_tiling(self):
    r"""
    Turn on tiled VAE processing by delegating to `self.vae.enable_tiling()`.

    With tiling enabled the VAE splits the input tensor into tiles and computes decoding and encoding in several
    steps, which saves a large amount of memory and allows larger images to be processed.
    """
    vae = self.vae
    vae.enable_tiling()
170
+
171
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
172
def disable_vae_tiling(self):
    r"""
    Turn off tiled VAE processing by delegating to `self.vae.disable_tiling()`.

    If `enable_vae_tiling` was called before, decoding goes back to a single step.
    """
    vae = self.vae
    vae.disable_tiling()
178
+
179
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_sequential_cpu_offload
180
def enable_sequential_cpu_offload(self, gpu_id=0):
    r"""
    Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
    text_encoder, text_encoder_2 and vae are handed to accelerate's `cpu_offload`, targeting the CUDA device
    selected by `gpu_id`. Memory savings are higher than with `enable_model_cpu_offload`, but performance is
    lower.

    Components that are not loaded (`text_encoder` is an optional component of this pipeline and may be `None`)
    are skipped instead of being passed to `cpu_offload`.

    Args:
        gpu_id (`int`, *optional*, defaults to 0):
            Index of the CUDA device used as the execution device.

    Raises:
        ImportError: If accelerate is not available or older than v0.14.0.
    """
    if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
        from accelerate import cpu_offload
    else:
        raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")

    device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

    for cpu_offloaded_model in [self.unet, self.text_encoder, self.text_encoder_2, self.vae]:
        # `text_encoder` is optional for this pipeline; nothing to offload when it is missing
        if cpu_offloaded_model is not None:
            cpu_offload(cpu_offloaded_model, device)
201
+
202
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_model_cpu_offload
203
def enable_model_cpu_offload(self, gpu_id=0):
    r"""
    Offload whole models to CPU with accelerate, chaining them so that each one is moved to the GPU when its
    `forward` runs and stays there until the next model in the chain runs.

    Compared to `enable_sequential_cpu_offload`, memory savings are lower but performance is much better due to
    the iterative execution of the `unet`.

    Args:
        gpu_id (`int`, *optional*, defaults to 0):
            Index of the CUDA device used as the execution device.

    Raises:
        ImportError: If accelerate is not available or older than v0.17.0.
    """
    accelerate_ok = is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0")
    if not accelerate_ok:
        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
    from accelerate import cpu_offload_with_hook

    target_device = torch.device(f"cuda:{gpu_id}")

    if self.device.type != "cpu":
        self.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

    # the first text encoder is optional, so only chain it when it is present
    if self.text_encoder is None:
        offload_order = [self.text_encoder_2]
    else:
        offload_order = [self.text_encoder, self.text_encoder_2]
    offload_order += [self.unet, self.vae]

    last_hook = None
    for model in offload_order:
        _, last_hook = cpu_offload_with_hook(model, target_device, prev_module_hook=last_hook)

    # We'll offload the last model manually.
    self.final_offload_hook = last_hook
232
+
233
+ @property
234
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
235
+ def _execution_device(self):
236
+ r"""
237
+ Returns the device on which the pipeline's models will be executed. After calling
238
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
239
+ hooks.
240
+ """
241
+ if not hasattr(self.unet, "_hf_hook"):
242
+ return self.device
243
+ for module in self.unet.modules():
244
+ if (
245
+ hasattr(module, "_hf_hook")
246
+ and hasattr(module._hf_hook, "execution_device")
247
+ and module._hf_hook.execution_device is not None
248
+ ):
249
+ return torch.device(module._hf_hook.execution_device)
250
+ return self.device
251
+
252
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
253
def encode_prompt(
    self,
    prompt,
    device: Optional[torch.device] = None,
    num_images_per_prompt: int = 1,
    do_classifier_free_guidance: bool = True,
    negative_prompt=None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    lora_scale: Optional[float] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        lora_scale (`float`, *optional*):
            A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.

    Returns:
        A 4-tuple `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds)`.
        The two negative entries are `None` when classifier free guidance is disabled and no negative embeddings
        were passed in.

    Raises:
        TypeError: If `negative_prompt` and `prompt` are of different types.
        ValueError: If `negative_prompt` is a list whose length differs from the prompt batch size.
    """
    device = device or self._execution_device

    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, LoraLoaderMixin):
        self._lora_scale = lora_scale

    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # Define tokenizers and text encoders
    tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
    text_encoders = (
        [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
    )

    if prompt_embeds is None:
        # textual inversion: process multi-vector tokens if necessary
        prompt_embeds_list = []
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, tokenizer)

            text_inputs = tokenizer(
                prompt,
                padding="max_length",
                max_length=tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {tokenizer.model_max_length} tokens: {removed_text}"
                )

            prompt_embeds = text_encoder(
                text_input_ids.to(device),
                output_hidden_states=True,
            )

            # We are only ALWAYS interested in the pooled output of the final text encoder
            pooled_prompt_embeds = prompt_embeds[0]
            prompt_embeds = prompt_embeds.hidden_states[-2]

            bs_embed, seq_len, _ = prompt_embeds.shape
            # duplicate text embeddings for each generation per prompt, using mps friendly method
            prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
            prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

            prompt_embeds_list.append(prompt_embeds)

        prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)

    # get unconditional embeddings for classifier free guidance
    zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
    if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
        negative_prompt_embeds = torch.zeros_like(prompt_embeds)
        negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
    elif do_classifier_free_guidance and negative_prompt_embeds is None:
        negative_prompt = negative_prompt or ""
        uncond_tokens: List[str]
        if prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        negative_prompt_embeds_list = []
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, tokenizer)

            max_length = prompt_embeds.shape[1]
            uncond_input = tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            negative_prompt_embeds = text_encoder(
                uncond_input.input_ids.to(device),
                output_hidden_states=True,
            )
            # We are only ALWAYS interested in the pooled output of the final text encoder
            negative_pooled_prompt_embeds = negative_prompt_embeds[0]
            negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

            if do_classifier_free_guidance:
                # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
                seq_len = negative_prompt_embeds.shape[1]

                negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device)

                negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
                negative_prompt_embeds = negative_prompt_embeds.view(
                    batch_size * num_images_per_prompt, seq_len, -1
                )

            negative_prompt_embeds_list.append(negative_prompt_embeds)

        negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)

    bs_embed = pooled_prompt_embeds.shape[0]
    pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
        bs_embed * num_images_per_prompt, -1
    )
    # Without classifier free guidance (and without user-supplied negative embeddings) there is no negative
    # pooled embedding to duplicate; calling `.repeat` on `None` would raise an AttributeError.
    if negative_pooled_prompt_embeds is not None:
        negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )

    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
438
+
439
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
440
def prepare_extra_step_kwargs(self, generator, eta):
    r"""
    Build the optional keyword arguments for `self.scheduler.step`.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only forwarded when the
    scheduler's `step` declares a parameter of that name. eta (η) corresponds to η in the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Args:
        generator: Value forwarded as `generator` when accepted.
        eta: Value forwarded as `eta` when accepted.

    Returns:
        `dict` holding the accepted subset of `{"eta": eta, "generator": generator}`.
    """
    step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())

    extra_step_kwargs = {}
    candidates = (("eta", eta), ("generator", generator))
    for name, value in candidates:
        if name in step_parameters:
            extra_step_kwargs[name] = value
    return extra_step_kwargs
456
+
457
def check_inputs(
    self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
):
    r"""
    Validate the user-facing arguments of the pipeline call.

    Args:
        prompt: Must be a `str` or `list` when given; mutually exclusive with `prompt_embeds`.
        strength: Rejected when below 0 or above 1.
        callback_steps: Must be a positive `int`.
        negative_prompt: Mutually exclusive with `negative_prompt_embeds`.
        prompt_embeds: Mutually exclusive with `prompt`.
        negative_prompt_embeds: Must have the same shape as `prompt_embeds` when both are given.

    Raises:
        ValueError: If any of the constraints above is violated.
    """
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

    callback_steps_valid = (
        callback_steps is not None and isinstance(callback_steps, int) and not callback_steps <= 0
    )
    if not callback_steps_valid:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    has_prompt = prompt is not None
    has_prompt_embeds = prompt_embeds is not None
    has_negative_embeds = negative_prompt_embeds is not None

    # exactly one of `prompt` / `prompt_embeds` has to be supplied
    if has_prompt and has_prompt_embeds:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if not has_prompt and not has_prompt_embeds:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if has_prompt and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and has_negative_embeds:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if has_prompt_embeds and has_negative_embeds and prompt_embeds.shape != negative_prompt_embeds.shape:
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )
496
+
497
def get_timesteps(self, num_inference_steps, strength, device):
    r"""
    Select the tail of the scheduler's timesteps that corresponds to `strength`.

    Args:
        num_inference_steps: Number of inference steps of the full schedule.
        strength: Multiplied with `num_inference_steps` (and truncated to an int) to get the number of steps that
            are actually run.
        device: Not used by this method.

    Returns:
        A tuple `(timesteps, num_steps)`: `self.scheduler.timesteps` sliced from the start index (scaled by
        `self.scheduler.order`), and the number of inference steps left after skipping the start.
    """
    # number of steps to run, capped at the full schedule length
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    skipped_steps = max(num_inference_steps - steps_to_run, 0)

    first_index = skipped_steps * self.scheduler.order
    remaining_timesteps = self.scheduler.timesteps[first_index:]
    return remaining_timesteps, num_inference_steps - skipped_steps
505
+
506
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
    r"""
    Turn the input image into noised latents for the given timestep.

    An input whose second dimension has size 4 is used as latents directly. Anything else is encoded with the
    VAE (temporarily switched to float32) and multiplied by `self.vae.config.scaling_factor`. The latents are
    then duplicated to the effective batch size `batch_size * num_images_per_prompt` and noised through
    `self.scheduler.add_noise`.

    Args:
        image: Input image; must pass the type check below and support `.to(device=..., dtype=...)`.
        timestep: Timestep forwarded to `self.scheduler.add_noise`.
        batch_size: Prompt batch size.
        num_images_per_prompt: Number of images generated per prompt.
        dtype: dtype of the returned latents.
        device: Device the image and the noise are placed on.
        generator: A single generator or a list with one generator per sample of the effective batch.

    Returns:
        The noised latents.

    Raises:
        ValueError: On an unsupported `image` type, a generator list of the wrong length, or an image batch that
            cannot be evenly duplicated to the effective batch size.
    """
    if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
        raise ValueError(
            f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
        )

    # Offload text encoder if `enable_model_cpu_offload` was enabled
    if getattr(self, "final_offload_hook", None) is not None:
        self.text_encoder_2.to("cpu")
        torch.cuda.empty_cache()

    image = image.to(device=device, dtype=dtype)

    # from here on `batch_size` is the effective batch size
    batch_size = batch_size * num_images_per_prompt

    if image.shape[1] == 4:
        init_latents = image
    else:
        # make sure the VAE is in float32 mode, as it overflows in float16
        image = image.float()
        self.vae.to(dtype=torch.float32)

        if isinstance(generator, list):
            if len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )
            # one generator per sample: encode and sample each image separately
            per_sample_latents = [
                self.vae.encode(image[idx : idx + 1]).latent_dist.sample(sample_generator)
                for idx, sample_generator in enumerate(generator)
            ]
            init_latents = torch.cat(per_sample_latents, dim=0)
        else:
            init_latents = self.vae.encode(image).latent_dist.sample(generator)

        # restore the VAE to the requested dtype
        self.vae.to(dtype)
        init_latents = init_latents.to(dtype)

        init_latents = self.vae.config.scaling_factor * init_latents

    if batch_size > init_latents.shape[0]:
        if batch_size % init_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        # expand init_latents for batch_size
        copies = batch_size // init_latents.shape[0]
        init_latents = torch.cat([init_latents] * copies, dim=0)
    else:
        init_latents = torch.cat([init_latents], dim=0)

    noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype)

    # get latents
    latents = self.scheduler.add_noise(init_latents, noise, timestep)
    return latents
567
+
568
+ def _get_add_time_ids(
569
+ self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype
570
+ ):
571
+ if self.config.requires_aesthetics_score:
572
+ add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
573
+ add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,))
574
+ else:
575
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
576
+ add_neg_time_ids = list(original_size + crops_coords_top_left + target_size)
577
+
578
+ passed_add_embed_dim = (
579
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim
580
+ )
581
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
582
+
583
+ if (
584
+ expected_add_embed_dim > passed_add_embed_dim
585
+ and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
586
+ ):
587
+ raise ValueError(
588
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
589
+ )
590
+ elif (
591
+ expected_add_embed_dim < passed_add_embed_dim
592
+ and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
593
+ ):
594
+ raise ValueError(
595
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
596
+ )
597
+ elif expected_add_embed_dim != passed_add_embed_dim:
598
+ raise ValueError(
599
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
600
+ )
601
+
602
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
603
+ add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
604
+
605
+ return add_time_ids, add_neg_time_ids
606
+
607
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    image: Union[
        torch.FloatTensor,
        PIL.Image.Image,
        np.ndarray,
        List[torch.FloatTensor],
        List[PIL.Image.Image],
        List[np.ndarray],
    ] = None,
    strength: float = 0.3,
    num_inference_steps: int = 50,
    guidance_scale: float = 5.0,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    guidance_rescale: float = 0.0,
    original_size: Tuple[int, int] = None,
    crops_coords_top_left: Tuple[int, int] = (0, 0),
    target_size: Tuple[int, int] = None,
    aesthetic_score: float = 6.0,
    negative_aesthetic_score: float = 2.5,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
            instead.
        image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
            The image(s) to modify with the pipeline.
        strength (`float`, *optional*, defaults to 0.3):
            Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
            will be used as a starting point, adding more noise to it the larger the `strength`. The number of
            denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
            be maximum and the denoising process will run for the full number of iterations specified in
            `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        guidance_scale (`float`, *optional*, defaults to 5.0):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
            usually at the expense of lower image quality.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            not greater than `1`).
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Currently not read by this method: the starting latents are always derived from `image`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. Pass `"latent"` to
            get the denoised latents back without VAE decoding, watermarking or post-processing.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
            plain tuple.
        callback (`Callable`, *optional*):
            A function that will be called every `callback_steps` steps during inference. The function will be
            called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called. If not specified, the callback will be
            called at every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
            Its `"scale"` entry, if present, is also forwarded to `encode_prompt` as the LoRA scale.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_rescale` is defined as `φ` in equation 16. of
            [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
            Guidance rescale factor should fix overexposure when using zero terminal SNR. Only applied when
            classifier-free guidance is active and the value is greater than 0.
        original_size (`Tuple[int]`, *optional*):
            First component of the added time ids passed to the UNet. Defaults to the `(height, width)` obtained
            by multiplying the latent resolution by the VAE scale factor.
        crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            Second component of the added time ids passed to the UNet.
        target_size (`Tuple[int]`, *optional*):
            Last component of the added time ids when `requires_aesthetics_score` is disabled in the pipeline
            config. Defaults to the same `(height, width)` as `original_size`.
        aesthetic_score (`float`, *optional*, defaults to 6.0):
            Last component of the positive added time ids when `requires_aesthetics_score` is enabled in the
            pipeline config.
        negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
            Last component of the negative (unconditional) added time ids when `requires_aesthetics_score` is
            enabled in the pipeline config.

    Examples:

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
        one-element `tuple` holding the generated images (or the latents when `output_type="latent"`).
    """
    # 1. Check inputs. Raise error if not correct
    self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = self.encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )

    # 4. Preprocess image
    image = self.image_processor.preprocess(image)

    # 5. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
    total_batch = batch_size * num_images_per_prompt
    latent_timestep = timesteps[:1].repeat(total_batch)

    # 6. Prepare latent variables
    latents = self.prepare_latents(
        image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
    )
    # 7. Prepare extra step kwargs.
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    height, width = latents.shape[-2:]
    height = height * self.vae_scale_factor
    width = width * self.vae_scale_factor

    original_size = original_size or (height, width)
    target_size = target_size or (height, width)

    # 8. Prepare added time ids & embeddings
    add_text_embeds = pooled_prompt_embeds
    add_time_ids, add_neg_time_ids = self._get_add_time_ids(
        original_size,
        crops_coords_top_left,
        target_size,
        aesthetic_score,
        negative_aesthetic_score,
        dtype=prompt_embeds.dtype,
    )

    # Expand each set of ids to the effective batch *before* stacking negative and
    # positive halves. Repeating after the concatenation would interleave the rows
    # (neg, pos, neg, pos, ...) while the text embeddings are laid out as
    # [all negatives, all positives].
    add_time_ids = add_time_ids.repeat(total_batch, 1)

    if do_classifier_free_guidance:
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
        add_neg_time_ids = add_neg_time_ids.repeat(total_batch, 1)
        add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)

    prompt_embeds = prompt_embeds.to(device)
    add_text_embeds = add_text_embeds.to(device)
    add_time_ids = add_time_ids.to(device)

    # 9. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    # The conditioning does not change between steps, so build it once.
    added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                if guidance_rescale > 0.0:
                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    if output_type == "latent":
        # Latents are returned untouched: no VAE upcast, decoding, watermark or
        # post-processing is needed.
        image = latents
    else:
        # make sure the VAE is in float32 mode, as it overflows in float16
        self.vae.to(dtype=torch.float32)

        use_torch_2_0_or_xformers = isinstance(
            self.vae.decoder.mid_block.attentions[0].processor,
            (
                AttnProcessor2_0,
                XFormersAttnProcessor,
                LoRAXFormersAttnProcessor,
                LoRAAttnProcessor2_0,
            ),
        )
        # if xformers or torch_2_0 is used attention block does not need
        # to be in float32 which can save lots of memory
        if use_torch_2_0_or_xformers:
            self.vae.post_quant_conv.to(latents.dtype)
            self.vae.decoder.conv_in.to(latents.dtype)
            self.vae.decoder.mid_block.to(latents.dtype)
        else:
            latents = latents.float()

        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        image = self.watermark.apply_watermark(image)
        image = self.image_processor.postprocess(image, output_type=output_type)

    # Offload last model to CPU (also on the latent path, which used to skip it)
    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
        self.final_offload_hook.offload()

    if not return_dict:
        return (image,)

    return StableDiffusionXLPipelineOutput(images=image)