diffusers 0.17.1__py3-none-any.whl → 0.18.2__py3-none-any.whl

Files changed (120)
  1. diffusers/__init__.py +26 -1
  2. diffusers/configuration_utils.py +34 -29
  3. diffusers/dependency_versions_table.py +4 -0
  4. diffusers/image_processor.py +125 -12
  5. diffusers/loaders.py +169 -203
  6. diffusers/models/attention.py +24 -1
  7. diffusers/models/attention_flax.py +10 -5
  8. diffusers/models/attention_processor.py +3 -0
  9. diffusers/models/autoencoder_kl.py +114 -33
  10. diffusers/models/controlnet.py +131 -14
  11. diffusers/models/controlnet_flax.py +37 -26
  12. diffusers/models/cross_attention.py +17 -17
  13. diffusers/models/embeddings.py +67 -0
  14. diffusers/models/modeling_flax_utils.py +64 -56
  15. diffusers/models/modeling_utils.py +193 -104
  16. diffusers/models/prior_transformer.py +207 -37
  17. diffusers/models/resnet.py +26 -26
  18. diffusers/models/transformer_2d.py +36 -41
  19. diffusers/models/transformer_temporal.py +24 -21
  20. diffusers/models/unet_1d.py +31 -25
  21. diffusers/models/unet_2d.py +43 -30
  22. diffusers/models/unet_2d_blocks.py +210 -89
  23. diffusers/models/unet_2d_blocks_flax.py +12 -12
  24. diffusers/models/unet_2d_condition.py +172 -64
  25. diffusers/models/unet_2d_condition_flax.py +38 -24
  26. diffusers/models/unet_3d_blocks.py +34 -31
  27. diffusers/models/unet_3d_condition.py +101 -34
  28. diffusers/models/vae.py +5 -5
  29. diffusers/models/vae_flax.py +37 -34
  30. diffusers/models/vq_model.py +23 -14
  31. diffusers/pipelines/__init__.py +24 -1
  32. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +1 -1
  33. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +5 -3
  34. diffusers/pipelines/consistency_models/__init__.py +1 -0
  35. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +337 -0
  36. diffusers/pipelines/controlnet/multicontrolnet.py +120 -1
  37. diffusers/pipelines/controlnet/pipeline_controlnet.py +59 -17
  38. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +60 -15
  39. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +60 -17
  40. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +1 -1
  41. diffusers/pipelines/kandinsky/__init__.py +1 -1
  42. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +4 -6
  43. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +1 -0
  44. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -0
  45. diffusers/pipelines/kandinsky2_2/__init__.py +7 -0
  46. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +317 -0
  47. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +372 -0
  48. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +434 -0
  49. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +398 -0
  50. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +531 -0
  51. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +541 -0
  52. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +605 -0
  53. diffusers/pipelines/pipeline_flax_utils.py +2 -2
  54. diffusers/pipelines/pipeline_utils.py +124 -146
  55. diffusers/pipelines/shap_e/__init__.py +27 -0
  56. diffusers/pipelines/shap_e/camera.py +147 -0
  57. diffusers/pipelines/shap_e/pipeline_shap_e.py +390 -0
  58. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +349 -0
  59. diffusers/pipelines/shap_e/renderer.py +709 -0
  60. diffusers/pipelines/stable_diffusion/__init__.py +2 -0
  61. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +261 -66
  62. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -3
  63. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +5 -3
  64. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +4 -2
  65. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
  66. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +1 -1
  67. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  68. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +719 -0
  69. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +1 -1
  70. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +832 -0
  71. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +17 -7
  72. diffusers/pipelines/stable_diffusion_xl/__init__.py +26 -0
  73. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +823 -0
  74. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +896 -0
  75. diffusers/pipelines/stable_diffusion_xl/watermark.py +31 -0
  76. diffusers/pipelines/text_to_video_synthesis/__init__.py +2 -1
  77. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +5 -1
  78. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +771 -0
  79. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +92 -6
  80. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
  81. diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +209 -91
  82. diffusers/schedulers/__init__.py +3 -0
  83. diffusers/schedulers/scheduling_consistency_models.py +380 -0
  84. diffusers/schedulers/scheduling_ddim.py +28 -6
  85. diffusers/schedulers/scheduling_ddim_inverse.py +19 -4
  86. diffusers/schedulers/scheduling_ddim_parallel.py +642 -0
  87. diffusers/schedulers/scheduling_ddpm.py +53 -7
  88. diffusers/schedulers/scheduling_ddpm_parallel.py +604 -0
  89. diffusers/schedulers/scheduling_deis_multistep.py +66 -11
  90. diffusers/schedulers/scheduling_dpmsolver_multistep.py +55 -13
  91. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +19 -4
  92. diffusers/schedulers/scheduling_dpmsolver_sde.py +73 -11
  93. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +23 -7
  94. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +58 -9
  95. diffusers/schedulers/scheduling_euler_discrete.py +58 -8
  96. diffusers/schedulers/scheduling_heun_discrete.py +89 -14
  97. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +73 -11
  98. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +73 -11
  99. diffusers/schedulers/scheduling_lms_discrete.py +57 -8
  100. diffusers/schedulers/scheduling_pndm.py +46 -10
  101. diffusers/schedulers/scheduling_repaint.py +19 -4
  102. diffusers/schedulers/scheduling_sde_ve.py +5 -1
  103. diffusers/schedulers/scheduling_unclip.py +43 -4
  104. diffusers/schedulers/scheduling_unipc_multistep.py +48 -7
  105. diffusers/training_utils.py +1 -1
  106. diffusers/utils/__init__.py +2 -1
  107. diffusers/utils/dummy_pt_objects.py +60 -0
  108. diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py +32 -0
  109. diffusers/utils/dummy_torch_and_transformers_objects.py +180 -0
  110. diffusers/utils/hub_utils.py +1 -1
  111. diffusers/utils/import_utils.py +20 -3
  112. diffusers/utils/logging.py +15 -18
  113. diffusers/utils/outputs.py +3 -3
  114. diffusers/utils/testing_utils.py +15 -0
  115. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/METADATA +4 -2
  116. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/RECORD +120 -94
  117. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/WHEEL +1 -1
  118. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/LICENSE +0 -0
  119. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/entry_points.txt +0 -0
  120. {diffusers-0.17.1.dist-info → diffusers-0.18.2.dist-info}/top_level.txt +0 -0
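The headline additions in this release are the new pipelines and schedulers listed above (consistency models, Kandinsky 2.2, Shap-E, Stable Diffusion XL, LDM3D, the "paradigms" parallel sampling pipeline, and the parallel DDIM/DDPM schedulers). As a quick post-upgrade check, the sketch below imports a few of them; the class names are assumptions inferred from the new module names in the file list, and SDXL additionally expects the optional invisible-watermark dependency hinted at by the new dummy-objects file.

```py
# Hedged smoke-test sketch: verify the 0.18 additions resolve after upgrading the wheel.
# Class names are assumptions inferred from the new modules listed above.
import diffusers

print(diffusers.__version__)  # expected: 0.18.2

from diffusers import (
    ConsistencyModelPipeline,      # pipelines/consistency_models
    KandinskyV22Pipeline,          # pipelines/kandinsky2_2
    ShapEPipeline,                 # pipelines/shap_e
    StableDiffusionLDM3DPipeline,  # pipelines/stable_diffusion (diff shown below)
    StableDiffusionXLPipeline,     # pipelines/stable_diffusion_xl
)
```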
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py (new file)
@@ -0,0 +1,719 @@
+# Copyright 2023 The Intel Labs Team Authors and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessorLDM3D
+from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    BaseOutput,
+    is_accelerate_available,
+    is_accelerate_version,
+    logging,
+    randn_tensor,
+    replace_example_docstring,
+)
+from ..pipeline_utils import DiffusionPipeline
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import StableDiffusionLDM3DPipeline
+
+        >>> pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d")
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "a photo of an astronaut riding a horse on mars"
+        >>> output = pipe(prompt)
+        >>> rgb_image, depth_image = output.rgb, output.depth
+        ```
+"""
+
+
+@dataclass
+class LDM3DPipelineOutput(BaseOutput):
+    """
+    Output class for Stable Diffusion pipelines.
+
+    Args:
+        rgb (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised RGB PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array represent the denoised RGB images of the diffusion pipeline.
+        depth (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised depth PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array represent the denoised depth maps of the diffusion pipeline.
+        nsfw_content_detected (`List[bool]`)
+            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, or `None` if safety checking could not be performed.
+    """
+
+    rgb: Union[List[PIL.Image.Image], np.ndarray]
+    depth: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+
+
+class StableDiffusionLDM3DPipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+):
+    r"""
+    Pipeline for text-to-image and 3d generation using LDM3D. LDM3D: Latent Diffusion Model for 3D:
+    https://arxiv.org/abs/2305.10853
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    In addition the pipeline inherits the following loading methods:
+        - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
+        - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
+        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
+
+    as well as the following saving methods:
+        - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode rgb and depth images to and from latent
+            representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded rgb and depth latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+    _optional_components = ["safety_checker", "feature_extractor"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding.
+
+        When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
+        steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
+        several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
+        """
+        self.vae.enable_tiling()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+        Note that offloading happens on a submodule basis. Memory savings are higher than with
+        `enable_model_cpu_offload`, but performance is lower.
+        """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+            cpu_offload(cpu_offloaded_model, device)
+
+        if self.safety_checker is not None:
+            cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
+    def enable_model_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        hook = None
+        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
+            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+        if self.safety_checker is not None:
+            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
+
+        # We'll offload the last model manually.
+        self.final_offload_hook = hook
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            rgb_feature_extractor_input = feature_extractor_input[0]
+            safety_checker_input = self.feature_extractor(rgb_feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 49,
+        guidance_scale: float = 5.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 49):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
+            element is a list of `bool`s denoting whether the corresponding generated image likely represents
+            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        rgb, depth = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return ((rgb, depth), has_nsfw_concept)
+
+        return LDM3DPipelineOutput(rgb=rgb, depth=depth, nsfw_content_detected=has_nsfw_concept)
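
For context on the new file shown above, here is a minimal usage sketch of the LDM3D pipeline's output, assuming the default `output_type="pil"` so that `rgb` and `depth` come back as lists of PIL images as documented in `LDM3DPipelineOutput`; the checkpoint name `Intel/ldm3d` is taken from the pipeline's own docstring example.

```py
# Hedged usage sketch for the new StableDiffusionLDM3DPipeline (mirrors EXAMPLE_DOC_STRING above).
import torch
from diffusers import StableDiffusionLDM3DPipeline

pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

output = pipe("a photo of an astronaut riding a horse on mars")
# With the default output_type="pil", rgb and depth are lists of PIL images (one per prompt);
# with return_dict=False the call instead returns ((rgb, depth), nsfw_content_detected).
output.rgb[0].save("astronaut_rgb.png")
output.depth[0].save("astronaut_depth.png")
```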