diffusers-0.23.0-py3-none-any.whl → diffusers-0.24.0-py3-none-any.whl

Files changed (177)
  1. diffusers/__init__.py +16 -2
  2. diffusers/configuration_utils.py +1 -0
  3. diffusers/dependency_versions_check.py +1 -14
  4. diffusers/dependency_versions_table.py +5 -4
  5. diffusers/image_processor.py +186 -14
  6. diffusers/loaders/__init__.py +82 -0
  7. diffusers/loaders/ip_adapter.py +157 -0
  8. diffusers/loaders/lora.py +1415 -0
  9. diffusers/loaders/lora_conversion_utils.py +284 -0
  10. diffusers/loaders/single_file.py +631 -0
  11. diffusers/loaders/textual_inversion.py +459 -0
  12. diffusers/loaders/unet.py +735 -0
  13. diffusers/loaders/utils.py +59 -0
  14. diffusers/models/__init__.py +12 -1
  15. diffusers/models/attention.py +165 -14
  16. diffusers/models/attention_flax.py +9 -1
  17. diffusers/models/attention_processor.py +286 -1
  18. diffusers/models/autoencoder_asym_kl.py +14 -9
  19. diffusers/models/autoencoder_kl.py +3 -18
  20. diffusers/models/autoencoder_kl_temporal_decoder.py +402 -0
  21. diffusers/models/autoencoder_tiny.py +20 -24
  22. diffusers/models/consistency_decoder_vae.py +37 -30
  23. diffusers/models/controlnet.py +59 -39
  24. diffusers/models/controlnet_flax.py +19 -18
  25. diffusers/models/embeddings_flax.py +2 -0
  26. diffusers/models/lora.py +131 -1
  27. diffusers/models/modeling_flax_utils.py +2 -1
  28. diffusers/models/modeling_outputs.py +17 -0
  29. diffusers/models/modeling_utils.py +27 -19
  30. diffusers/models/normalization.py +2 -2
  31. diffusers/models/resnet.py +390 -59
  32. diffusers/models/transformer_2d.py +20 -3
  33. diffusers/models/transformer_temporal.py +183 -1
  34. diffusers/models/unet_2d_blocks_flax.py +5 -0
  35. diffusers/models/unet_2d_condition.py +9 -0
  36. diffusers/models/unet_2d_condition_flax.py +13 -13
  37. diffusers/models/unet_3d_blocks.py +957 -173
  38. diffusers/models/unet_3d_condition.py +16 -8
  39. diffusers/models/unet_kandi3.py +589 -0
  40. diffusers/models/unet_motion_model.py +48 -33
  41. diffusers/models/unet_spatio_temporal_condition.py +489 -0
  42. diffusers/models/vae.py +63 -13
  43. diffusers/models/vae_flax.py +7 -0
  44. diffusers/models/vq_model.py +3 -1
  45. diffusers/optimization.py +16 -9
  46. diffusers/pipelines/__init__.py +65 -12
  47. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +93 -23
  48. diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +97 -25
  49. diffusers/pipelines/animatediff/pipeline_animatediff.py +34 -4
  50. diffusers/pipelines/audioldm/pipeline_audioldm.py +1 -0
  51. diffusers/pipelines/auto_pipeline.py +6 -0
  52. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -0
  53. diffusers/pipelines/controlnet/pipeline_controlnet.py +217 -31
  54. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +101 -32
  55. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +136 -39
  56. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +119 -37
  57. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +196 -35
  58. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +102 -31
  59. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +1 -0
  60. diffusers/pipelines/ddim/pipeline_ddim.py +1 -0
  61. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -0
  62. diffusers/pipelines/deepfloyd_if/pipeline_if.py +13 -1
  63. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +13 -1
  64. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +13 -1
  65. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +13 -1
  66. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +13 -1
  67. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +13 -1
  68. diffusers/pipelines/dit/pipeline_dit.py +1 -0
  69. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +1 -1
  70. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +3 -3
  71. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  72. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +1 -1
  73. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +1 -1
  74. diffusers/pipelines/kandinsky3/__init__.py +49 -0
  75. diffusers/pipelines/kandinsky3/kandinsky3_pipeline.py +452 -0
  76. diffusers/pipelines/kandinsky3/kandinsky3img2img_pipeline.py +460 -0
  77. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +65 -6
  78. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +55 -3
  79. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -0
  80. diffusers/pipelines/musicldm/pipeline_musicldm.py +1 -1
  81. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +7 -2
  82. diffusers/pipelines/pipeline_flax_utils.py +4 -2
  83. diffusers/pipelines/pipeline_utils.py +33 -13
  84. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +196 -36
  85. diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py +1 -0
  86. diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -0
  87. diffusers/pipelines/stable_diffusion/__init__.py +64 -21
  88. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +8 -3
  89. diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +18 -2
  90. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +2 -2
  91. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +2 -4
  92. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -0
  93. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py +1 -0
  94. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +88 -9
  95. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +1 -0
  96. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +8 -3
  97. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +1 -0
  98. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py +1 -0
  99. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py +1 -0
  100. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +1 -0
  101. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -9
  102. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +92 -9
  103. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +1 -0
  104. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -13
  105. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -0
  106. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +1 -0
  107. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +1 -0
  108. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +1 -0
  109. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +1 -0
  110. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +1 -0
  111. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +1 -0
  112. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +1 -0
  113. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +1 -0
  114. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +103 -8
  115. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +113 -8
  116. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +115 -9
  117. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +16 -12
  118. diffusers/pipelines/stable_video_diffusion/__init__.py +58 -0
  119. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +649 -0
  120. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +108 -12
  121. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +109 -14
  122. diffusers/pipelines/text_to_video_synthesis/__init__.py +2 -0
  123. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +1 -0
  124. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +18 -3
  125. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +4 -2
  126. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +872 -0
  127. diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +29 -40
  128. diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -0
  129. diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -0
  130. diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -0
  131. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +14 -4
  132. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +9 -5
  133. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +1 -1
  134. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +2 -2
  135. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +1 -1
  136. diffusers/schedulers/__init__.py +2 -4
  137. diffusers/schedulers/deprecated/__init__.py +50 -0
  138. diffusers/schedulers/{scheduling_karras_ve.py → deprecated/scheduling_karras_ve.py} +4 -4
  139. diffusers/schedulers/{scheduling_sde_vp.py → deprecated/scheduling_sde_vp.py} +4 -6
  140. diffusers/schedulers/scheduling_ddim.py +1 -3
  141. diffusers/schedulers/scheduling_ddim_inverse.py +1 -3
  142. diffusers/schedulers/scheduling_ddim_parallel.py +1 -3
  143. diffusers/schedulers/scheduling_ddpm.py +1 -3
  144. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -3
  145. diffusers/schedulers/scheduling_deis_multistep.py +15 -5
  146. diffusers/schedulers/scheduling_dpmsolver_multistep.py +15 -5
  147. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +15 -5
  148. diffusers/schedulers/scheduling_dpmsolver_sde.py +1 -3
  149. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +15 -5
  150. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +1 -3
  151. diffusers/schedulers/scheduling_euler_discrete.py +40 -13
  152. diffusers/schedulers/scheduling_heun_discrete.py +15 -5
  153. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +15 -5
  154. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +15 -5
  155. diffusers/schedulers/scheduling_lcm.py +123 -29
  156. diffusers/schedulers/scheduling_lms_discrete.py +1 -3
  157. diffusers/schedulers/scheduling_pndm.py +1 -3
  158. diffusers/schedulers/scheduling_repaint.py +1 -3
  159. diffusers/schedulers/scheduling_unipc_multistep.py +15 -5
  160. diffusers/utils/__init__.py +1 -0
  161. diffusers/utils/constants.py +11 -6
  162. diffusers/utils/dummy_pt_objects.py +45 -0
  163. diffusers/utils/dummy_torch_and_transformers_objects.py +60 -0
  164. diffusers/utils/dynamic_modules_utils.py +4 -4
  165. diffusers/utils/export_utils.py +8 -3
  166. diffusers/utils/logging.py +10 -10
  167. diffusers/utils/outputs.py +5 -5
  168. diffusers/utils/peft_utils.py +88 -44
  169. diffusers/utils/torch_utils.py +2 -2
  170. diffusers/utils/versions.py +117 -0
  171. {diffusers-0.23.0.dist-info → diffusers-0.24.0.dist-info}/METADATA +83 -64
  172. {diffusers-0.23.0.dist-info → diffusers-0.24.0.dist-info}/RECORD +176 -157
  173. {diffusers-0.23.0.dist-info → diffusers-0.24.0.dist-info}/WHEEL +1 -1
  174. {diffusers-0.23.0.dist-info → diffusers-0.24.0.dist-info}/entry_points.txt +1 -0
  175. diffusers/loaders.py +0 -3336
  176. {diffusers-0.23.0.dist-info → diffusers-0.24.0.dist-info}/LICENSE +0 -0
  177. {diffusers-0.23.0.dist-info → diffusers-0.24.0.dist-info}/top_level.txt +0 -0
@@ -16,11 +16,18 @@ import inspect
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union

  import torch
- from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+ from transformers import (
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+ )

- from ...image_processor import VaeImageProcessor
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
  from ...loaders import (
  FromSingleFileMixin,
+ IPAdapterMixin,
  StableDiffusionXLLoraLoaderMixin,
  TextualInversionLoaderMixin,
  )
@@ -93,8 +100,57 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
  return noise_cfg


+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+ def retrieve_timesteps(
+ scheduler,
+ num_inference_steps: Optional[int] = None,
+ device: Optional[Union[str, torch.device]] = None,
+ timesteps: Optional[List[int]] = None,
+ **kwargs,
+ ):
+ """
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+ Args:
+ scheduler (`SchedulerMixin`):
+ The scheduler to get timesteps from.
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model. If used,
+ `timesteps` must be `None`.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+ must be `None`.
+
+ Returns:
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+ second element is the number of inference steps.
+ """
+ if timesteps is not None:
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accepts_timesteps:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" timestep schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ return timesteps, num_inference_steps
+
+
  class StableDiffusionXLPipeline(
- DiffusionPipeline, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+ DiffusionPipeline,
+ FromSingleFileMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
  ):
  r"""
  Pipeline for text-to-image generation using Stable Diffusion XL.
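The new module-level `retrieve_timesteps` helper above is what the updated pipelines call instead of invoking `scheduler.set_timesteps` directly, so that either `num_inference_steps` or an explicit `timesteps` list can drive the schedule. A minimal sketch of the two code paths, assuming a default-constructed EulerDiscreteScheduler; whether the custom branch is usable depends on whether a given scheduler's `set_timesteps` accepts a `timesteps` argument:

from diffusers import EulerDiscreteScheduler
from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import retrieve_timesteps

scheduler = EulerDiscreteScheduler()

# Default branch: the schedule is derived from num_inference_steps.
timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=30, device="cpu")
print(num_inference_steps, timesteps[:3])  # 30, and the first three (largest) timesteps

# Custom branch: retrieve_timesteps(scheduler, timesteps=[999, 500, 0]) forwards the list to
# scheduler.set_timesteps(timesteps=...) and raises ValueError for schedulers whose
# set_timesteps has no `timesteps` parameter.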
@@ -140,8 +196,16 @@ class StableDiffusionXLPipeline(
  watermark output images. If not defined, it will default to True if the package is installed, otherwise no
  watermarker will be used.
  """
+
  model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
- _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
+ _optional_components = [
+ "tokenizer",
+ "tokenizer_2",
+ "text_encoder",
+ "text_encoder_2",
+ "image_encoder",
+ "feature_extractor",
+ ]
  _callback_tensor_inputs = [
  "latents",
  "prompt_embeds",
@@ -161,6 +225,8 @@ class StableDiffusionXLPipeline(
  tokenizer_2: CLIPTokenizer,
  unet: UNet2DConditionModel,
  scheduler: KarrasDiffusionSchedulers,
+ image_encoder: CLIPVisionModelWithProjection = None,
+ feature_extractor: CLIPImageProcessor = None,
  force_zeros_for_empty_prompt: bool = True,
  add_watermarker: Optional[bool] = None,
  ):
@@ -174,6 +240,8 @@ class StableDiffusionXLPipeline(
  tokenizer_2=tokenizer_2,
  unet=unet,
  scheduler=scheduler,
+ image_encoder=image_encoder,
+ feature_extractor=feature_extractor,
  )
  self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
@@ -455,6 +523,20 @@ class StableDiffusionXLPipeline(

  return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+ def encode_image(self, image, device, num_images_per_prompt):
+ dtype = next(self.image_encoder.parameters()).dtype
+
+ if not isinstance(image, torch.Tensor):
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+ image = image.to(device=device, dtype=dtype)
+ image_embeds = self.image_encoder(image).image_embeds
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+ uncond_image_embeds = torch.zeros_like(image_embeds)
+ return image_embeds, uncond_image_embeds
+
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
  def prepare_extra_step_kwargs(self, generator, eta):
  # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
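`encode_image` is the piece that turns an IP-Adapter reference image into CLIP image embeddings, with an all-zeros tensor standing in as the unconditional embedding for classifier-free guidance. A standalone sketch of the same logic outside the pipeline; the checkpoint name and file path are illustrative only, and the pipeline itself uses whatever `image_encoder`/`feature_extractor` it was constructed or loaded with:

import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
feature_extractor = CLIPImageProcessor()

image = Image.open("reference.png").convert("RGB")   # any local reference image
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values

image_embeds = image_encoder(pixel_values).image_embeds    # (1, projection_dim)
image_embeds = image_embeds.repeat_interleave(2, dim=0)    # num_images_per_prompt = 2
uncond_image_embeds = torch.zeros_like(image_embeds)       # negative branch for CFG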
@@ -705,6 +787,7 @@ class StableDiffusionXLPipeline(
  height: Optional[int] = None,
  width: Optional[int] = None,
  num_inference_steps: int = 50,
+ timesteps: List[int] = None,
  denoising_end: Optional[float] = None,
  guidance_scale: float = 5.0,
  negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -717,6 +800,7 @@ class StableDiffusionXLPipeline(
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
  negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -755,6 +839,10 @@ class StableDiffusionXLPipeline(
  num_inference_steps (`int`, *optional*, defaults to 50):
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
  expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+ passed will be used. Must be in descending order.
  denoising_end (`float`, *optional*):
  When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
  completed before it is intentionally prematurely terminated. As a result, the returned sample will
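With the `timesteps` argument documented above, the schedule can be supplied directly at call time instead of a step count. A hedged sketch; the step values are illustrative, and the default scheduler must be swapped for one whose `set_timesteps` accepts custom timesteps, otherwise `retrieve_timesteps` raises ValueError:

import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Descending custom schedule, forwarded to scheduler.set_timesteps(timesteps=...);
# num_inference_steps is then derived from its length.
image = pipe(
    "a photo of an astronaut riding a horse",
    timesteps=[999, 845, 730, 587, 443, 310, 192, 83, 0],
).images[0]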
@@ -801,6 +889,7 @@ class StableDiffusionXLPipeline(
  Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
  input argument.
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
  output_type (`str`, *optional*, defaults to `"pil"`):
  The output format of the generate image. Choose between
  [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -853,7 +942,7 @@ class StableDiffusionXLPipeline(
  callback_on_step_end_tensor_inputs (`List`, *optional*):
  The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
- `._callback_tensor_inputs` attribute of your pipeine class.
+ `._callback_tensor_inputs` attribute of your pipeline class.

  Examples:

@@ -945,9 +1034,7 @@ class StableDiffusionXLPipeline(
  )

  # 4. Prepare timesteps
- self.scheduler.set_timesteps(num_inference_steps, device=device)
-
- timesteps = self.scheduler.timesteps
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)

  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
@@ -999,6 +1086,12 @@ class StableDiffusionXLPipeline(
  add_text_embeds = add_text_embeds.to(device)
  add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

+ if ip_adapter_image is not None:
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
+ if self.do_classifier_free_guidance:
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
+ image_embeds = image_embeds.to(device)
+
  # 8. Denoising loop
  num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

@@ -1036,6 +1129,8 @@ class StableDiffusionXLPipeline(

  # predict the noise residual
  added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+ if ip_adapter_image is not None:
+ added_cond_kwargs["image_embeds"] = image_embeds
  noise_pred = self.unet(
  latent_model_input,
  t,
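Taken together, the text-to-image changes wire IP-Adapter conditioning through the SDXL call: `encode_image` produces the embeddings and they ride along in `added_cond_kwargs["image_embeds"]`. A usage sketch; the repo, subfolder, and weight file below are the names published for the SDXL IP-Adapter and may differ for other adapters:

import torch
from diffusers import StableDiffusionXLPipeline
from diffusers.utils import load_image

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# IPAdapterMixin.load_ip_adapter injects the adapter weights into the UNet attention
# processors and, if the pipeline has no image_encoder yet, loads a matching one from
# the adapter repo.
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

style_image = load_image("reference.png")  # placeholder path
image = pipe(
    "a cat, best quality",
    ip_adapter_image=style_image,
    num_inference_steps=30,
).images[0]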
@@ -17,10 +17,21 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

  import PIL.Image
  import torch
- from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+ from transformers import (
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+ )

  from ...image_processor import PipelineImageInput, VaeImageProcessor
- from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import (
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
+ )
  from ...models import AutoencoderKL, UNet2DConditionModel
  from ...models.attention_processor import (
  AttnProcessor2_0,
@@ -94,17 +105,70 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):


  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
- def retrieve_latents(encoder_output, generator):
- if hasattr(encoder_output, "latent_dist"):
+ def retrieve_latents(
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+ ):
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
  return encoder_output.latent_dist.sample(generator)
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+ return encoder_output.latent_dist.mode()
  elif hasattr(encoder_output, "latents"):
  return encoder_output.latents
  else:
  raise AttributeError("Could not access latents of provided encoder_output")


+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+ def retrieve_timesteps(
+ scheduler,
+ num_inference_steps: Optional[int] = None,
+ device: Optional[Union[str, torch.device]] = None,
+ timesteps: Optional[List[int]] = None,
+ **kwargs,
+ ):
+ """
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+ Args:
+ scheduler (`SchedulerMixin`):
+ The scheduler to get timesteps from.
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model. If used,
+ `timesteps` must be `None`.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+ must be `None`.
+
+ Returns:
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+ second element is the number of inference steps.
+ """
+ if timesteps is not None:
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accepts_timesteps:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" timestep schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ return timesteps, num_inference_steps
+
+
  class StableDiffusionXLImg2ImgPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin
+ DiffusionPipeline,
+ TextualInversionLoaderMixin,
+ FromSingleFileMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ IPAdapterMixin,
  ):
  r"""
  Pipeline for text-to-image generation using Stable Diffusion XL.
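`retrieve_latents` now distinguishes stochastic sampling from the distribution mode when pulling latents out of a VAE encode result, which the img2img and inpaint pipelines select via the new `sample_mode` argument. A short standalone sketch of the two modes; the checkpoint is illustrative only:

import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="vae")

image = torch.randn(1, 3, 512, 512)   # stand-in for a preprocessed image batch
posterior = vae.encode(image)          # AutoencoderKLOutput with a .latent_dist

sampled = posterior.latent_dist.sample(torch.Generator().manual_seed(0))  # sample_mode="sample"
deterministic = posterior.latent_dist.mode()                              # sample_mode="argmax"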
@@ -153,8 +217,16 @@ class StableDiffusionXLImg2ImgPipeline(
  watermark output images. If not defined, it will default to True if the package is installed, otherwise no
  watermarker will be used.
  """
+
  model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
- _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
+ _optional_components = [
+ "tokenizer",
+ "tokenizer_2",
+ "text_encoder",
+ "text_encoder_2",
+ "image_encoder",
+ "feature_extractor",
+ ]
  _callback_tensor_inputs = [
  "latents",
  "prompt_embeds",
@@ -174,6 +246,8 @@ class StableDiffusionXLImg2ImgPipeline(
  tokenizer_2: CLIPTokenizer,
  unet: UNet2DConditionModel,
  scheduler: KarrasDiffusionSchedulers,
+ image_encoder: CLIPVisionModelWithProjection = None,
+ feature_extractor: CLIPImageProcessor = None,
  requires_aesthetics_score: bool = False,
  force_zeros_for_empty_prompt: bool = True,
  add_watermarker: Optional[bool] = None,
@@ -187,6 +261,8 @@ class StableDiffusionXLImg2ImgPipeline(
  tokenizer=tokenizer,
  tokenizer_2=tokenizer_2,
  unet=unet,
+ image_encoder=image_encoder,
+ feature_extractor=feature_extractor,
  scheduler=scheduler,
  )
  self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
@@ -664,6 +740,20 @@ class StableDiffusionXLImg2ImgPipeline(

  return latents

+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+ def encode_image(self, image, device, num_images_per_prompt):
+ dtype = next(self.image_encoder.parameters()).dtype
+
+ if not isinstance(image, torch.Tensor):
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+ image = image.to(device=device, dtype=dtype)
+ image_embeds = self.image_encoder(image).image_embeds
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+ uncond_image_embeds = torch.zeros_like(image_embeds)
+ return image_embeds, uncond_image_embeds
+
  def _get_add_time_ids(
  self,
  original_size,
@@ -836,6 +926,7 @@ class StableDiffusionXLImg2ImgPipeline(
  image: PipelineImageInput = None,
  strength: float = 0.3,
  num_inference_steps: int = 50,
+ timesteps: List[int] = None,
  denoising_start: Optional[float] = None,
  denoising_end: Optional[float] = None,
  guidance_scale: float = 5.0,
@@ -849,6 +940,7 @@ class StableDiffusionXLImg2ImgPipeline(
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
  negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -888,6 +980,10 @@ class StableDiffusionXLImg2ImgPipeline(
  num_inference_steps (`int`, *optional*, defaults to 50):
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
  expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+ passed will be used. Must be in descending order.
  denoising_start (`float`, *optional*):
  When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
  bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
@@ -942,6 +1038,7 @@ class StableDiffusionXLImg2ImgPipeline(
  Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
  input argument.
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
  output_type (`str`, *optional*, defaults to `"pil"`):
  The output format of the generate image. Choose between
  [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -1005,7 +1102,7 @@ class StableDiffusionXLImg2ImgPipeline(
  callback_on_step_end_tensor_inputs (`List`, *optional*):
  The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
- `._callback_tensor_inputs` attribute of your pipeine class.
+ `._callback_tensor_inputs` attribute of your pipeline class.

  Examples:

@@ -1094,7 +1191,7 @@ class StableDiffusionXLImg2ImgPipeline(
  def denoising_value_valid(dnv):
  return isinstance(self.denoising_end, float) and 0 < dnv < 1

- self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
  timesteps, num_inference_steps = self.get_timesteps(
  num_inference_steps,
  strength,
@@ -1161,6 +1258,12 @@ class StableDiffusionXLImg2ImgPipeline(
  add_text_embeds = add_text_embeds.to(device)
  add_time_ids = add_time_ids.to(device)

+ if ip_adapter_image is not None:
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
+ if self.do_classifier_free_guidance:
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
+ image_embeds = image_embeds.to(device)
+
  # 9. Denoising loop
  num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

@@ -1204,6 +1307,8 @@ class StableDiffusionXLImg2ImgPipeline(

  # predict the noise residual
  added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+ if ip_adapter_image is not None:
+ added_cond_kwargs["image_embeds"] = image_embeds
  noise_pred = self.unet(
  latent_model_input,
  t,
@@ -18,10 +18,21 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
  import numpy as np
  import PIL.Image
  import torch
- from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+ from transformers import (
+ CLIPImageProcessor,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+ CLIPVisionModelWithProjection,
+ )

  from ...image_processor import PipelineImageInput, VaeImageProcessor
- from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import (
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
+ )
  from ...models import AutoencoderKL, UNet2DConditionModel
  from ...models.attention_processor import (
  AttnProcessor2_0,
@@ -239,17 +250,70 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool


  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
- def retrieve_latents(encoder_output, generator):
- if hasattr(encoder_output, "latent_dist"):
+ def retrieve_latents(
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+ ):
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
  return encoder_output.latent_dist.sample(generator)
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+ return encoder_output.latent_dist.mode()
  elif hasattr(encoder_output, "latents"):
  return encoder_output.latents
  else:
  raise AttributeError("Could not access latents of provided encoder_output")


+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+ def retrieve_timesteps(
+ scheduler,
+ num_inference_steps: Optional[int] = None,
+ device: Optional[Union[str, torch.device]] = None,
+ timesteps: Optional[List[int]] = None,
+ **kwargs,
+ ):
+ """
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+ Args:
+ scheduler (`SchedulerMixin`):
+ The scheduler to get timesteps from.
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model. If used,
+ `timesteps` must be `None`.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+ must be `None`.
+
+ Returns:
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+ second element is the number of inference steps.
+ """
+ if timesteps is not None:
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accepts_timesteps:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" timestep schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ return timesteps, num_inference_steps
+
+
  class StableDiffusionXLInpaintPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ TextualInversionLoaderMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ FromSingleFileMixin,
+ IPAdapterMixin,
  ):
  r"""
  Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -298,9 +362,17 @@ class StableDiffusionXLInpaintPipeline(
  watermark output images. If not defined, it will default to True if the package is installed, otherwise no
  watermarker will be used.
  """
+
  model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"

- _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
+ _optional_components = [
+ "tokenizer",
+ "tokenizer_2",
+ "text_encoder",
+ "text_encoder_2",
+ "image_encoder",
+ "feature_extractor",
+ ]
  _callback_tensor_inputs = [
  "latents",
  "prompt_embeds",
@@ -322,6 +394,8 @@ class StableDiffusionXLInpaintPipeline(
  tokenizer_2: CLIPTokenizer,
  unet: UNet2DConditionModel,
  scheduler: KarrasDiffusionSchedulers,
+ image_encoder: CLIPVisionModelWithProjection = None,
+ feature_extractor: CLIPImageProcessor = None,
  requires_aesthetics_score: bool = False,
  force_zeros_for_empty_prompt: bool = True,
  add_watermarker: Optional[bool] = None,
@@ -335,6 +409,8 @@ class StableDiffusionXLInpaintPipeline(
  tokenizer=tokenizer,
  tokenizer_2=tokenizer_2,
  unet=unet,
+ image_encoder=image_encoder,
+ feature_extractor=feature_extractor,
  scheduler=scheduler,
  )
  self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
@@ -385,6 +461,20 @@ class StableDiffusionXLInpaintPipeline(
  """
  self.vae.disable_tiling()

+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+ def encode_image(self, image, device, num_images_per_prompt):
+ dtype = next(self.image_encoder.parameters()).dtype
+
+ if not isinstance(image, torch.Tensor):
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+ image = image.to(device=device, dtype=dtype)
+ image_embeds = self.image_encoder(image).image_embeds
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+
+ uncond_image_embeds = torch.zeros_like(image_embeds)
+ return image_embeds, uncond_image_embeds
+
  # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
  def encode_prompt(
  self,
@@ -741,10 +831,11 @@ class StableDiffusionXLInpaintPipeline(

  if image.shape[1] == 4:
  image_latents = image.to(device=device, dtype=dtype)
+ image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
  elif return_image_latents or (latents is None and not is_strength_max):
  image = image.to(device=device, dtype=dtype)
  image_latents = self._encode_vae_image(image=image, generator=generator)
- image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+ image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)

  if latents is None and add_noise:
  noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
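The `prepare_latents` hunk above now broadcasts pre-encoded 4-channel latent inputs to the requested batch size, matching what was already done for VAE-encoded images. A toy sketch of the repeat arithmetic; the shapes are illustrative:

import torch

batch_size = 4                                 # prompts * num_images_per_prompt
image_latents = torch.randn(1, 4, 128, 128)    # a single pre-encoded latent image

# The same broadcast is applied in both branches of prepare_latents:
image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
assert image_latents.shape == (4, 4, 128, 128)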
@@ -1059,6 +1150,7 @@ class StableDiffusionXLInpaintPipeline(
  width: Optional[int] = None,
  strength: float = 0.9999,
  num_inference_steps: int = 50,
+ timesteps: List[int] = None,
  denoising_start: Optional[float] = None,
  denoising_end: Optional[float] = None,
  guidance_scale: float = 7.5,
@@ -1072,6 +1164,7 @@ class StableDiffusionXLInpaintPipeline(
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
  negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -1128,6 +1221,10 @@ class StableDiffusionXLInpaintPipeline(
  num_inference_steps (`int`, *optional*, defaults to 50):
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
  expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+ passed will be used. Must be in descending order.
  denoising_start (`float`, *optional*):
  When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
  bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
@@ -1170,6 +1267,7 @@ class StableDiffusionXLInpaintPipeline(
  Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
  input argument.
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
  num_images_per_prompt (`int`, *optional*, defaults to 1):
  The number of images to generate per prompt.
  eta (`float`, *optional*, defaults to 0.0):
@@ -1240,7 +1338,7 @@ class StableDiffusionXLInpaintPipeline(
  callback_on_step_end_tensor_inputs (`List`, *optional*):
  The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
- `._callback_tensor_inputs` attribute of your pipeine class.
+ `._callback_tensor_inputs` attribute of your pipeline class.

  Examples:

@@ -1332,7 +1430,7 @@ class StableDiffusionXLInpaintPipeline(
  def denoising_value_valid(dnv):
  return isinstance(self.denoising_end, float) and 0 < dnv < 1

- self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
  timesteps, num_inference_steps = self.get_timesteps(
  num_inference_steps,
  strength,
@@ -1469,6 +1567,12 @@ class StableDiffusionXLInpaintPipeline(
  add_text_embeds = add_text_embeds.to(device)
  add_time_ids = add_time_ids.to(device)

+ if ip_adapter_image is not None:
+ image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
+ if self.do_classifier_free_guidance:
+ image_embeds = torch.cat([negative_image_embeds, image_embeds])
+ image_embeds = image_embeds.to(device)
+
  # 11. Denoising loop
  num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

@@ -1515,6 +1619,8 @@ class StableDiffusionXLInpaintPipeline(

  # predict the noise residual
  added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+ if ip_adapter_image is not None:
+ added_cond_kwargs["image_embeds"] = image_embeds
  noise_pred = self.unet(
  latent_model_input,
  t,