diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +20 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +46 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
  229. diffusers/schedulers/scheduling_edm_euler.py +53 -30
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
  231. diffusers/schedulers/scheduling_euler_discrete.py +163 -67
  232. diffusers/schedulers/scheduling_heun_discrete.py +60 -38
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +27 -25
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. diffusers-0.27.1.dist-info/RECORD +0 -399
  267. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  268. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
  269. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py

@@ -23,6 +23,7 @@ import torch
 import torch.nn.functional as F
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
@@ -364,8 +365,8 @@ class StableDiffusionControlNetInpaintPipeline(
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         **kwargs,
     ):
@@ -397,8 +398,8 @@ class StableDiffusionControlNetInpaintPipeline(
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -418,10 +419,10 @@ class StableDiffusionControlNetInpaintPipeline(
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -972,7 +973,12 @@ class StableDiffusionControlNetInpaintPipeline(
         return_noise=False,
         return_image_latents=False,
     ):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -1116,11 +1122,11 @@ class StableDiffusionControlNetInpaintPipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -1129,7 +1135,9 @@ class StableDiffusionControlNetInpaintPipeline(
         control_guidance_start: Union[float, List[float]] = 0.0,
         control_guidance_end: Union[float, List[float]] = 1.0,
         clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         **kwargs,
     ):
@@ -1139,14 +1147,14 @@ class StableDiffusionControlNetInpaintPipeline(
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`,
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`,
                 `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, NumPy array or tensor representing an image batch to be used as the starting point. For both
                 NumPy array and PyTorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a
                 list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a NumPy array or
                 a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image
                 latents as `image`, but if passing latents directly it is not encoded again.
-            mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`,
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`,
                 `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, NumPy array or tensor representing an image batch to mask `image`. White pixels in the mask
                 are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
@@ -1154,24 +1162,25 @@ class StableDiffusionControlNetInpaintPipeline(
                 color channel (L) instead of 3, so the expected shape for PyTorch tensor would be `(B, 1, H, W)`, `(B,
                 H, W)`, `(1, H, W)`, `(H, W)`. And for NumPy array, it would be for `(B, H, W, 1)`, `(B, H, W)`, `(H,
                 W, 1)`, or `(H, W)`.
-            control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`,
-                `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`):
+            control_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`,
+                `List[List[torch.Tensor]]`, or `List[List[PIL.Image.Image]]`):
                 The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
-                specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
-                accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height
-                and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
-                `init`, images must be passed as a list such that each element of the list can be correctly batched for
-                input to a single ControlNet.
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
             height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The height in pixels of the generated image.
             width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The width in pixels of the generated image.
             padding_mask_crop (`int`, *optional*, defaults to `None`):
-                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
-                `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
-                contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
-                the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
-                and contain information inreleant for inpainging, such as background.
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
             strength (`float`, *optional*, defaults to 1.0):
                 Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
                 starting point and more noise is added the higher the `strength`. The number of denoising steps depends
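As the reflowed padding_mask_crop docstring above describes, the pipeline crops image and mask_image to the masked region plus the given margin, inpaints the crop, and resizes back to the original image size. A hedged usage sketch; pipe, init_image, mask, and control are placeholders for an instantiated pipeline and preloaded PIL images:

    result = pipe(
        prompt="a red sofa",
        image=init_image,
        mask_image=mask,
        control_image=control,
        padding_mask_crop=32,  # crop to the masked region plus a 32-pixel margin
    ).images[0]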
@@ -1195,22 +1204,22 @@ class StableDiffusionControlNetInpaintPipeline(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
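To make the documented ip_adapter_image_embeds contract concrete: one list entry per loaded IP-Adapter, each of shape (batch_size, num_images, emb_dim), with the negative embedding included when classifier-free guidance is on. A shape-only sketch; the dimensions are illustrative, and the negative-first stacking shown here follows the pipeline's own prepare_ip_adapter_image_embeds helper, which should be treated as the authoritative layout:

    import torch

    batch_size, num_images, emb_dim = 1, 1, 1024  # emb_dim depends on the image encoder
    positive = torch.randn(batch_size, num_images, emb_dim)
    negative = torch.zeros_like(positive)  # stands in for the negative image embedding

    # With CFG enabled the negative half is stacked in along the batch dimension.
    ip_adapter_image_embeds = [torch.cat([negative, positive], dim=0)]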
@@ -1233,15 +1242,15 @@ class StableDiffusionControlNetInpaintPipeline(
             clip_skip (`int`, *optional*):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
             callback_on_step_end_tensor_inputs (`List`, *optional*):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeine class.
+                `._callback_tensor_inputs` attribute of your pipeline class.

         Examples:

@@ -1269,6 +1278,9 @@ class StableDiffusionControlNetInpaintPipeline(
                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
             )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
         controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

         # align format for control guidance
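The hunks above wire the new callbacks module into this pipeline: callback_on_step_end may now be a PipelineCallback or MultiPipelineCallbacks object rather than a bare function, and in that case callback_on_step_end_tensor_inputs is taken from the object's tensor_inputs attribute, as the inserted isinstance branch shows. A hedged sketch of both forms, assuming the PipelineCallback interface from the new diffusers/callbacks.py; pipe and the image inputs are placeholders, and the logging logic is illustrative, not a diffusers built-in:

    from diffusers.callbacks import PipelineCallback

    # Plain-function form, unchanged from earlier releases.
    def log_step(pipeline, step, timestep, callback_kwargs):
        print(f"step {step}, timestep {timestep}")
        return callback_kwargs

    # Object form: tensor_inputs travels with the callback, so the caller no
    # longer has to pass callback_on_step_end_tensor_inputs separately.
    class LatentsLogger(PipelineCallback):
        tensor_inputs = ["latents"]

        def callback_fn(self, pipeline, step_index, timestep, callback_kwargs):
            print(f"step {step_index}: latents std {callback_kwargs['latents'].std():.4f}")
            return callback_kwargs

    result = pipe(
        prompt="a red sofa",
        image=init_image,
        mask_image=mask,
        control_image=control,
        callback_on_step_end=LatentsLogger(),
    ).images[0]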
diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py

@@ -27,6 +27,7 @@ from transformers import (
     CLIPVisionModelWithProjection,
 )

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
     FromSingleFileMixin,
@@ -151,7 +152,12 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):


 class StableDiffusionXLControlNetInpaintPipeline(
-    DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    TextualInversionLoaderMixin,
 ):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -160,6 +166,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
     library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

     The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
         - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
         - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
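With TextualInversionLoaderMixin added to the class above, the SDXL inpaint pipeline gains load_textual_inversion. SDXL embeddings usually carry one vector per text encoder, so a common pattern loads each half separately; a hedged sketch in which pipe, the file name, the token, and the state-dict keys are placeholders for a real SDXL textual-inversion checkpoint:

    from safetensors.torch import load_file

    state_dict = load_file("embeddings.safetensors")  # placeholder file
    pipe.load_textual_inversion(state_dict["clip_l"], token="<my-token>",
                                text_encoder=pipe.text_encoder, tokenizer=pipe.tokenizer)
    pipe.load_textual_inversion(state_dict["clip_g"], token="<my-token>",
                                text_encoder=pipe.text_encoder_2, tokenizer=pipe.tokenizer_2)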
@@ -191,8 +198,26 @@ class StableDiffusionXLControlNetInpaintPipeline(
     """

     model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
-    _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+        "mask",
+        "masked_image_latents",
+    ]

     def __init__(
         self,
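The expanded _callback_tensor_inputs above widens what step-end callbacks may request: besides latents and the prompt embeddings, the SDXL conditioning tensors (add_text_embeds, add_time_ids, and their negative counterparts) and the inpainting tensors (mask, masked_image_latents) are now exposable. A hedged sketch; pipe and the image inputs are placeholders:

    def watch_conditioning(pipeline, step, timestep, callback_kwargs):
        # "add_text_embeds" can be requested because it is now listed in the
        # pipeline's _callback_tensor_inputs.
        print(f"step {step}: add_text_embeds {callback_kwargs['add_text_embeds'].shape}")
        return callback_kwargs

    result = pipe(
        prompt="a red sofa",
        image=init_image,
        mask_image=mask,
        control_image=control,
        callback_on_step_end=watch_conditioning,
        callback_on_step_end_tensor_inputs=["latents", "add_text_embeds"],
    ).images[0]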
@@ -202,7 +227,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
         tokenizer: CLIPTokenizer,
         tokenizer_2: CLIPTokenizer,
         unet: UNet2DConditionModel,
-        controlnet: ControlNetModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
         scheduler: KarrasDiffusionSchedulers,
         requires_aesthetics_score: bool = False,
         force_zeros_for_empty_prompt: bool = True,
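The widened controlnet annotation above makes explicit what the runtime already supports: passing a list or tuple of ControlNets, which the pipeline wraps into a MultiControlNetModel. A hedged construction sketch; the ControlNet repository IDs are placeholders:

    from diffusers import ControlNetModel, StableDiffusionXLControlNetInpaintPipeline

    cn_canny = ControlNetModel.from_pretrained("some-org/controlnet-canny-sdxl")  # placeholder ID
    cn_depth = ControlNetModel.from_pretrained("some-org/controlnet-depth-sdxl")  # placeholder ID
    pipe = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        controlnet=[cn_canny, cn_depth],  # the list is wrapped into a MultiControlNetModel
    )

With multiple ControlNets, control_image must then be a list with one conditioning image per ControlNet, as the control_image docstring earlier in the diff notes.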
@@ -255,10 +280,10 @@ class StableDiffusionXLControlNetInpaintPipeline(
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,
         negative_prompt_2: Optional[str] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -284,17 +309,17 @@ class StableDiffusionXLControlNetInpaintPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -880,7 +905,12 @@ class StableDiffusionXLControlNetInpaintPipeline(
         return_noise=False,
         return_image_latents=False,
     ):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -1022,7 +1052,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
         # because `num_inference_steps` might be even given that every timestep
         # (except the highest one) is duplicated. If `num_inference_steps` is even it would
         # mean that we cut the timesteps in the middle of the denoising step
-        # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
+        # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
         # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler
         num_inference_steps = num_inference_steps + 1

@@ -1146,13 +1176,13 @@ class StableDiffusionXLControlNetInpaintPipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -1167,7 +1197,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
         aesthetic_score: float = 6.0,
         negative_aesthetic_score: float = 2.5,
         clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         **kwargs,
     ):
@@ -1194,11 +1226,12 @@ class StableDiffusionXLControlNetInpaintPipeline(
             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             padding_mask_crop (`int`, *optional*, defaults to `None`):
-                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
-                `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
-                contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
-                the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
-                and contain information inreleant for inpainging, such as background.
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
             strength (`float`, *optional*, defaults to 0.9999):
                 Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
                 between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
@@ -1238,23 +1271,23 @@ class StableDiffusionXLControlNetInpaintPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -1266,7 +1299,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
             generator (`torch.Generator`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will ge generated by sampling using the supplied random `generator`.
@@ -1305,15 +1338,15 @@ class StableDiffusionXLControlNetInpaintPipeline(
             clip_skip (`int`, *optional*):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
             callback_on_step_end_tensor_inputs (`List`, *optional*):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeine class.
+                `._callback_tensor_inputs` attribute of your pipeline class.

         Examples:

@@ -1339,6 +1372,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
             )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
         controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

         # align format for control guidance
@@ -1601,10 +1637,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
                 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
                 for s, e in zip(control_guidance_start, control_guidance_end)
             ]
-            if isinstance(self.controlnet, MultiControlNetModel):
-                controlnet_keep.append(keeps)
-            else:
-                controlnet_keep.append(keeps[0])
+            controlnet_keep.append(keeps if isinstance(controlnet, MultiControlNetModel) else keeps[0])

         # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         height, width = latents.shape[-2:]
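The refactor above collapses the four-line branch into a single append without changing the gating math: each entry of keeps is 1.0 only while the current step fraction lies inside [control_guidance_start, control_guidance_end]. A small self-contained example of the expression:

    timesteps = list(range(10))  # 10 denoising steps; the values themselves are irrelevant here
    control_guidance_start, control_guidance_end = [0.0], [0.5]

    controlnet_keep = []
    for i in range(len(timesteps)):
        keeps = [
            1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
            for s, e in zip(control_guidance_start, control_guidance_end)
        ]
        controlnet_keep.append(keeps[0])  # single-ControlNet case

    print(controlnet_keep)  # [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]

Note the replacement also tests isinstance on the local controlnet (the unwrapped module) rather than self.controlnet, which matters when the model has been wrapped by torch.compile.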
@@ -1721,7 +1754,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])

-                if ip_adapter_image is not None:
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
                    added_cond_kwargs["image_embeds"] = image_embeds

                if num_channels_unet == 9:
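The final hunk fixes the embeds-only IP-Adapter path: previously image_embeds reached added_cond_kwargs only when ip_adapter_image itself was passed, so supplying just the precomputed ip_adapter_image_embeds silently dropped the conditioning. A hedged end-to-end sketch; pipe and the image inputs are placeholders, the checkpoint shown is one commonly used IP-Adapter release, and the prepare_ip_adapter_image_embeds keyword names should be checked against the helper defined in this file:

    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models",
                         weight_name="ip-adapter_sdxl.bin")

    # Precompute the embeddings once and reuse them across calls; with this fix,
    # passing only the embeds (no ip_adapter_image) now conditions the UNet.
    image_embeds = pipe.prepare_ip_adapter_image_embeds(
        ip_adapter_image=style_image,
        ip_adapter_image_embeds=None,
        device=pipe.device,
        num_images_per_prompt=1,
        do_classifier_free_guidance=True,
    )
    result = pipe(
        prompt="a red sofa",
        image=init_image,
        mask_image=mask,
        control_image=control,
        ip_adapter_image_embeds=image_embeds,
    ).images[0]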