diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +20 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +46 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
  229. diffusers/schedulers/scheduling_edm_euler.py +53 -30
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
  231. diffusers/schedulers/scheduling_euler_discrete.py +163 -67
  232. diffusers/schedulers/scheduling_heun_discrete.py +60 -38
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +27 -25
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. diffusers-0.27.1.dist-info/RECORD +0 -399
  267. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  268. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
  269. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
@@ -26,6 +26,7 @@ from transformers import (
     CLIPVisionModelWithProjection,
 )
 
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
     FromSingleFileMixin,
@@ -269,6 +270,7 @@ def retrieve_timesteps(
     num_inference_steps: Optional[int] = None,
     device: Optional[Union[str, torch.device]] = None,
     timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
     """
@@ -279,19 +281,23 @@ def retrieve_timesteps(
         scheduler (`SchedulerMixin`):
             The scheduler to get timesteps from.
         num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used,
-            `timesteps` must be `None`.
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
         device (`str` or `torch.device`, *optional*):
             The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
         timesteps (`List[int]`, *optional*):
-            Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
-            timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
-            must be `None`.
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
 
     Returns:
         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
     """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
     if timesteps is not None:
         accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
         if not accepts_timesteps:
@@ -302,6 +308,16 @@ def retrieve_timesteps(
         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
         timesteps = scheduler.timesteps
         num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
     else:
         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
         timesteps = scheduler.timesteps
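
In practice the new `sigmas` path can be exercised directly against a scheduler; a minimal sketch, assuming `EulerDiscreteScheduler.set_timesteps` accepts `sigmas` in this release (the schedule values below are illustrative):

from diffusers import EulerDiscreteScheduler
from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint import (
    retrieve_timesteps,
)

scheduler = EulerDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")

# Illustrative 10-step schedule: sigmas must be descending and end at 0.0.
sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0]
timesteps, num_inference_steps = retrieve_timesteps(scheduler, device="cpu", sigmas=sigmas)
print(timesteps, num_inference_steps)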
@@ -516,10 +532,10 @@ class StableDiffusionXLInpaintPipeline(
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,
         negative_prompt_2: Optional[str] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -545,17 +561,17 @@ class StableDiffusionXLInpaintPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -880,7 +896,12 @@ class StableDiffusionXLInpaintPipeline(
         return_noise=False,
         return_image_latents=False,
     ):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -1027,7 +1048,7 @@ class StableDiffusionXLInpaintPipeline(
                 # because `num_inference_steps` might be even given that every timestep
                 # (except the highest one) is duplicated. If `num_inference_steps` is even it would
                 # mean that we cut the timesteps in the middle of the denoising step
-                # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1
+                # (between 1st and 2nd derivative) which leads to incorrect results. By adding 1
                 # we ensure that the denoising process always ends after the 2nd derivative step of the scheduler
                 num_inference_steps = num_inference_steps + 1
 
@@ -1110,20 +1131,22 @@ class StableDiffusionXLInpaintPipeline(
             self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+    def get_guidance_scale_embedding(
+        self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+    ) -> torch.Tensor:
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
 
         Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
+            w (`torch.Tensor`):
+                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
+                Dimension of the embeddings to generate.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
 
         Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
         """
         assert len(w.shape) == 1
         w = w * 1000.0
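
The function body continues beyond the lines shown; a self-contained sketch of the sinusoidal construction it performs, paraphrasing (not quoting) the LatentConsistencyModelPipeline helper it is copied from:

import torch

def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # Scale the guidance values, then build log-spaced frequencies over half the dimension.
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    # Outer product of guidance values and frequencies, then sin block followed by cos block.
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1))  # zero-pad odd dimensions
    return emb

print(guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=8).shape)  # torch.Size([1, 8])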
@@ -1185,13 +1208,14 @@ class StableDiffusionXLInpaintPipeline(
         prompt_2: Optional[Union[str, List[str]]] = None,
         image: PipelineImageInput = None,
         mask_image: PipelineImageInput = None,
-        masked_image_latents: torch.FloatTensor = None,
+        masked_image_latents: torch.Tensor = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
         padding_mask_crop: Optional[int] = None,
         strength: float = 0.9999,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
+        sigmas: List[float] = None,
         denoising_start: Optional[float] = None,
         denoising_end: Optional[float] = None,
         guidance_scale: float = 7.5,
@@ -1200,13 +1224,13 @@ class StableDiffusionXLInpaintPipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -1220,7 +1244,9 @@ class StableDiffusionXLInpaintPipeline(
         aesthetic_score: float = 6.0,
         negative_aesthetic_score: float = 2.5,
         clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         **kwargs,
     ):
@@ -1253,11 +1279,12 @@ class StableDiffusionXLInpaintPipeline(
                 [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
                 and checkpoints that are not specifically fine-tuned on low resolutions.
             padding_mask_crop (`int`, *optional*, defaults to `None`):
-                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
-                `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
-                contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
-                the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
-                and contain information inreleant for inpainging, such as background.
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ratio of the image that contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contains information irrelevant for inpainting, such as background.
             strength (`float`, *optional*, defaults to 0.9999):
                 Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
                 between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
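
A hedged usage sketch of `padding_mask_crop` with the SDXL inpaint pipeline; the checkpoint id is the usual SDXL base, and the image URLs are placeholders, not real assets:

import torch
from diffusers import StableDiffusionXLInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

image = load_image("https://example.com/large_photo.png")  # placeholder URL
mask = load_image("https://example.com/small_mask.png")    # placeholder URL

# Crop image and mask to the masked region plus a 32-pixel margin (preserving the
# aspect ratio), inpaint the crop at full resolution, then paste it back.
result = pipe(
    prompt="a red balloon",
    image=image,
    mask_image=mask,
    padding_mask_crop=32,
).images[0]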
@@ -1273,6 +1300,10 @@ class StableDiffusionXLInpaintPipeline(
                 Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
             denoising_start (`float`, *optional*):
                 When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
                 bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
@@ -1301,26 +1332,26 @@ class StableDiffusionXLInpaintPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
@@ -1329,7 +1360,7 @@ class StableDiffusionXLInpaintPipeline(
             generator (`torch.Generator`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will be generated by sampling using the supplied random `generator`.
@@ -1383,11 +1414,11 @@ class StableDiffusionXLInpaintPipeline(
             clip_skip (`int`, *optional*):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                 the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
             callback_on_step_end_tensor_inputs (`List`, *optional*):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1417,6 +1448,9 @@ class StableDiffusionXLInpaintPipeline(
                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
             )
 
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
         width = width or self.unet.config.sample_size * self.vae_scale_factor
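
The `tensor_inputs` handoff above is what lets callback objects declare their own tensor requirements; a sketch of a `PipelineCallback` subclass, assuming the `callback_fn(pipeline, step_index, timestep, callback_kwargs)` interface from `diffusers.callbacks` (the halfway cutoff and the two-chunk embedding layout are illustrative, SD-style simplifications):

from diffusers.callbacks import PipelineCallback

class CutoffCFGCallback(PipelineCallback):
    # The pipeline copies this list into callback_on_step_end_tensor_inputs,
    # so these tensors arrive in callback_kwargs on every step.
    tensor_inputs = ["prompt_embeds"]

    def callback_fn(self, pipeline, step_index, timestep, callback_kwargs):
        # Illustrative: drop classifier-free guidance halfway through sampling.
        if step_index == int(pipeline.num_timesteps * 0.5):
            prompt_embeds = callback_kwargs["prompt_embeds"]
            # Keep only the conditional half of the CFG-batched embeddings.
            callback_kwargs["prompt_embeds"] = prompt_embeds.chunk(2)[-1]
            pipeline._guidance_scale = 0.0
        return callback_kwargs

# image = pipe(prompt="...", callback_on_step_end=CutoffCFGCallback()).images[0]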
@@ -1490,7 +1524,9 @@ class StableDiffusionXLInpaintPipeline(
         def denoising_value_valid(dnv):
             return isinstance(dnv, float) and 0 < dnv < 1
 
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
         timesteps, num_inference_steps = self.get_timesteps(
             num_inference_steps,
             strength,
@@ -1718,7 +1754,12 @@ class StableDiffusionXLInpaintPipeline(
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
 
                 if num_channels_unet == 4:
                     init_latents_proper = image_latents
@@ -1770,6 +1811,10 @@ class StableDiffusionXLInpaintPipeline(
             if needs_upcasting:
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
 
             # unscale/denormalize the latents
             # denormalize with the mean and std if available and not None
@@ -169,6 +169,8 @@ class StableDiffusionXLInstructPix2PixPipeline(
             Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
             watermark output images. If not defined, it will default to True if the package is installed, otherwise no
             watermarker will be used.
+        is_cosxl_edit (`bool`, *optional*):
+            When set, the image latents are scaled.
     """
 
     model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
@@ -185,6 +187,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
         scheduler: KarrasDiffusionSchedulers,
         force_zeros_for_empty_prompt: bool = True,
         add_watermarker: Optional[bool] = None,
+        is_cosxl_edit: Optional[bool] = False,
     ):
         super().__init__()
 
@@ -201,6 +204,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.default_sample_size = self.unet.config.sample_size
+        self.is_cosxl_edit = is_cosxl_edit
 
         add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
 
@@ -218,10 +222,10 @@ class StableDiffusionXLInstructPix2PixPipeline(
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,
         negative_prompt_2: Optional[str] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
     ):
         r"""
@@ -246,17 +250,17 @@ class StableDiffusionXLInstructPix2PixPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -432,7 +436,6 @@ class StableDiffusionXLInstructPix2PixPipeline(
         extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.check_inputs
     def check_inputs(
         self,
         prompt,
@@ -483,7 +486,12 @@ class StableDiffusionXLInstructPix2PixPipeline(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -517,8 +525,8 @@ class StableDiffusionXLInstructPix2PixPipeline(
         # make sure the VAE is in float32 mode, as it overflows in float16
         needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
         if needs_upcasting:
+            image = image.float()
             self.upcast_vae()
-            image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
 
         image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax")
 
@@ -551,6 +559,9 @@ class StableDiffusionXLInstructPix2PixPipeline(
         if image_latents.dtype != self.vae.dtype:
             image_latents = image_latents.to(dtype=self.vae.dtype)
 
+        if self.is_cosxl_edit:
+            image_latents = image_latents * self.vae.config.scaling_factor
+
         return image_latents
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
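
With the constructor flag and this scaling branch in place, a CosXL Edit checkpoint can be loaded with latents scaling enabled; a sketch assuming single-file loading (the checkpoint URL and the `num_in_channels` value are illustrative, not confirmed by this diff):

import torch
from diffusers import StableDiffusionXLInstructPix2PixPipeline

# Illustrative checkpoint location; is_cosxl_edit=True enables the
# image_latents * vae.config.scaling_factor branch shown above.
pipe = StableDiffusionXLInstructPix2PixPipeline.from_single_file(
    "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors",
    num_in_channels=8,
    is_cosxl_edit=True,
    torch_dtype=torch.float16,
).to("cuda")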
@@ -611,14 +622,14 @@ class StableDiffusionXLInstructPix2PixPipeline(
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         guidance_rescale: float = 0.0,
@@ -636,7 +647,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
             prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                 used in both text-encoders
-            image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
+            image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`):
                 The image(s) to modify with the pipeline.
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
@@ -659,7 +670,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
             image_guidance_scale (`float`, *optional*, defaults to 1.5):
-                Image guidance scale is to push the generated image towards the inital image `image`. Image guidance
+                Image guidance scale is to push the generated image towards the initial image `image`. Image guidance
                 scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to
                 generate images that are closely linked to the source image `image`, usually at the expense of lower
                 image quality. This pipeline requires a value of at least `1`.
@@ -678,21 +689,21 @@ class StableDiffusionXLInstructPix2PixPipeline(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -704,7 +715,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
@@ -918,7 +929,12 @@ class StableDiffusionXLInstructPix2PixPipeline(
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
 
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -937,6 +953,10 @@ class StableDiffusionXLInstructPix2PixPipeline(
             if needs_upcasting:
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif latents.dtype != self.vae.dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    self.vae = self.vae.to(latents.dtype)
 
             # unscale/denormalize the latents
             # denormalize with the mean and std if available and not None
@@ -21,16 +21,22 @@ class StableDiffusionXLWatermarker:
 
         self.encoder.set_watermark("bits", self.watermark)
 
-    def apply_watermark(self, images: torch.FloatTensor):
+    def apply_watermark(self, images: torch.Tensor):
         # can't encode images that are smaller than 256
         if images.shape[-1] < 256:
             return images
 
         images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy()
 
-        images = [self.encoder.encode(image, "dwtDct") for image in images]
+        # Convert RGB to BGR, which is the channel order expected by the watermark encoder.
+        images = images[:, :, :, ::-1]
 
-        images = torch.from_numpy(np.array(images)).permute(0, 3, 1, 2)
+        # Add watermark and convert BGR back to RGB
+        images = [self.encoder.encode(image, "dwtDct")[:, :, ::-1] for image in images]
+
+        images = np.array(images)
+
+        images = torch.from_numpy(images).permute(0, 3, 1, 2)
 
         images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0)
         return images
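
The channel flip is a pure numpy view operation and is its own inverse; a small sketch verifying the round trip (the array values are illustrative):

import numpy as np

# NHWC batch with distinguishable channel values.
rgb = np.arange(2 * 2 * 3, dtype=np.uint8).reshape(1, 2, 2, 3)

bgr = rgb[:, :, :, ::-1]                    # RGB -> BGR, as fed to the encoder
assert (bgr[..., 0] == rgb[..., 2]).all()   # blue is now the first channel
assert (bgr[:, :, :, ::-1] == rgb).all()    # flipping again restores RGB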