diffusers-0.27.1-py3-none-any.whl → diffusers-0.28.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +20 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +46 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
  229. diffusers/schedulers/scheduling_edm_euler.py +53 -30
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
  231. diffusers/schedulers/scheduling_euler_discrete.py +163 -67
  232. diffusers/schedulers/scheduling_heun_discrete.py +60 -38
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +27 -25
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. diffusers-0.27.1.dist-info/RECORD +0 -399
  267. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  268. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
  269. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -74,7 +74,7 @@ class LDMTextToImagePipeline(DiffusionPipeline):
  guidance_scale: Optional[float] = 1.0,
  eta: Optional[float] = 0.0,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  **kwargs,
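The hunk above shows the change that recurs throughout this release: `torch.FloatTensor` in annotations and docstrings is replaced by `torch.Tensor`. `torch.FloatTensor` names the legacy CPU float32 tensor class, so as a type hint it excludes the fp16/bf16 and CUDA tensors these pipelines routinely handle, while `torch.Tensor` covers every dtype and device. A minimal sketch of the before/after (the function names are illustrative, not taken from the package):

```python
from typing import Optional

import torch


# Old annotation style: torch.FloatTensor is the legacy CPU float32 class,
# so the hint is misleading for fp16 or CUDA tensors.
def step_old(latents: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: ...


# New annotation style: torch.Tensor matches any dtype and device.
def step_new(latents: Optional[torch.Tensor] = None) -> torch.Tensor:
    if latents is None:
        latents = torch.randn(1, 4, 64, 64)  # sample latents when none are passed
    return latents
```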
@@ -98,7 +98,7 @@ class LDMTextToImagePipeline(DiffusionPipeline):
  generator (`torch.Generator`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
@@ -465,17 +465,17 @@ class LDMBertEncoderLayer(nn.Module):

  def forward(
  self,
- hidden_states: torch.FloatTensor,
- attention_mask: torch.FloatTensor,
- layer_head_mask: torch.FloatTensor,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ layer_head_mask: torch.Tensor,
  output_attentions: Optional[bool] = False,
- ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
  """
  Args:
- hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
- attention_mask (`torch.FloatTensor`): attention mask of size
+ hidden_states (`torch.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+ attention_mask (`torch.Tensor`): attention mask of size
  `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
- layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ layer_head_mask (`torch.Tensor`): mask for attention heads in a given layer of size
  `(encoder_attention_heads,)`.
  output_attentions (`bool`, *optional*):
  Whether or not to return the attentions tensors of all attention layers. See `attentions` under
@@ -587,7 +587,7 @@ class LDMBertEncoder(LDMBertPreTrainedModel):
  attention_mask: Optional[torch.Tensor] = None,
  position_ids: Optional[torch.LongTensor] = None,
  head_mask: Optional[torch.Tensor] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
@@ -615,7 +615,7 @@ class LDMBertEncoder(LDMBertPreTrainedModel):
  - 1 indicates the head is **not masked**,
  - 0 indicates the head is **masked**.

- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
  Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
  This is useful if you want more control over how to convert `input_ids` indices into associated vectors
  than the model's internal embedding lookup matrix.
diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py
@@ -40,30 +40,21 @@ EXAMPLE_DOC_STRING = """
  >>> from io import BytesIO

  >>> from diffusers import LEditsPPPipelineStableDiffusion
+ >>> from diffusers.utils import load_image

  >>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained(
  ... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
  ... )
  >>> pipe = pipe.to("cuda")

- >>> def download_image(url):
- ... response = requests.get(url)
- ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
-
  >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
- >>> image = download_image(img_url)
+ >>> image = load_image(img_url).convert("RGB")

- >>> _ = pipe.invert(
- ... image = image,
- ... num_inversion_steps=50,
- ... skip=0.1
- ... )
+ >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1)

  >>> edited_image = pipe(
- ... editing_prompt=["cherry blossom"],
- ... edit_guidance_scale=10.0,
- ... edit_threshold=0.75,
- ).images[0]
+ ... editing_prompt=["cherry blossom"], edit_guidance_scale=10.0, edit_threshold=0.75
+ ... ).images[0]
  ```
  """
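The updated example docstring replaces the hand-rolled `download_image` helper with `diffusers.utils.load_image`, which accepts an http(s) URL or a local file path and returns a `PIL.Image.Image`. Outside of a docstring, the equivalent call looks like this (same URL as above):

```python
from diffusers.utils import load_image

# load_image fetches the URL (or opens a local path) and returns a PIL image;
# .convert("RGB") drops any alpha channel, matching the old helper's behavior.
image = load_image(
    "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png"
).convert("RGB")
```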
@@ -279,8 +270,8 @@ class LEditsPPPipelineStableDiffusion(
  unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
  scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
  A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
- [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
- be set to [`DPMSolverMultistepScheduler`].
+ [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+ automatically be set to [`DPMSolverMultistepScheduler`].
  safety_checker ([`StableDiffusionSafetyChecker`]):
  Classification module that estimates whether generated images could be considered offensive or harmful.
  Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
@@ -511,8 +502,8 @@ class LEditsPPPipelineStableDiffusion(
  enable_edit_guidance,
  negative_prompt=None,
  editing_prompt=None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- editing_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ editing_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  ):
@@ -531,12 +522,11 @@ class LEditsPPPipelineStableDiffusion(
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
  editing_prompt (`str` or `List[str]`, *optional*):
- Editing prompt(s) to be encoded. If not defined, one has to pass
- `editing_prompt_embeds` instead.
- editing_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead.
+ editing_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -714,13 +704,13 @@ class LEditsPPPipelineStableDiffusion(
  return_dict: bool = True,
  editing_prompt: Optional[Union[str, List[str]]] = None,
  editing_prompt_embeds: Optional[torch.Tensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  reverse_editing_direction: Optional[Union[bool, List[bool]]] = False,
  edit_guidance_scale: Optional[Union[float, List[float]]] = 5,
  edit_warmup_steps: Optional[Union[int, List[int]]] = 0,
  edit_cooldown_steps: Optional[Union[int, List[int]]] = None,
  edit_threshold: Optional[Union[float, List[float]]] = 0.9,
- user_mask: Optional[torch.FloatTensor] = None,
+ user_mask: Optional[torch.Tensor] = None,
  sem_guidance: Optional[List[torch.Tensor]] = None,
  use_cross_attn_mask: bool = False,
  use_intersect_mask: bool = True,
@@ -734,8 +724,9 @@ class LEditsPPPipelineStableDiffusion(
  **kwargs,
  ):
  r"""
- The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`]
- method has to be called beforehand. Edits will always be performed for the last inverted image(s).
+ The call function to the pipeline for editing. The
+ [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] method has to be called beforehand. Edits will
+ always be performed for the last inverted image(s).

  Args:
  negative_prompt (`str` or `List[str]`, *optional*):
@@ -748,49 +739,51 @@ class LEditsPPPipelineStableDiffusion(
  The output format of the generate image. Choose between
  [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
  return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a
- plain tuple.
+ Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a plain
+ tuple.
  editing_prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide the image generation. The image is reconstructed by setting
- `editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
+ `editing_prompt = None`. Guidance direction of prompt should be specified via
+ `reverse_editing_direction`.
  editing_prompt_embeds (`torch.Tensor>`, *optional*):
- Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be
- specified via `reverse_editing_direction`.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should
+ be specified via `reverse_editing_direction`.
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
  reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
  Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
  edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
- Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`.
- `edit_guidance_scale` is defined as `s_e` of equation 12 of
- [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+ Guidance scale for guiding the image generation. If provided as list values should correspond to
+ `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+ Paper](https://arxiv.org/abs/2301.12247).
  edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
  Number of diffusion steps (for each prompt) for which guidance will not be applied.
  edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
  Number of diffusion steps (for each prompt) after which guidance will no longer be applied.
  edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
  Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
- 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
- user_mask (`torch.FloatTensor`, *optional*):
- User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit
- masks do not meet user preferences.
+ 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+ Paper](https://arxiv.org/abs/2301.12247).
+ user_mask (`torch.Tensor`, *optional*):
+ User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+ implicit masks do not meet user preferences.
  sem_guidance (`List[torch.Tensor]`, *optional*):
  List of pre-generated guidance vectors to be applied at generation. Length of the list has to
  correspond to `num_inference_steps`.
  use_cross_attn_mask (`bool`, defaults to `False`):
  Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
- is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
- [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+ is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+ paper](https://arxiv.org/pdf/2311.16711.pdf).
  use_intersect_mask (`bool`, defaults to `True`):
- Whether the masking term is calculated as intersection of cross-attention masks and masks derived
- from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
- estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+ Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+ the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+ are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
  attn_store_steps (`List[int]`, *optional*):
  Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
  store_averaged_over_steps (`bool`, defaults to `True`):
- Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
- If False, attention maps for each step are stores separately. Just for visualization purposes.
+ Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+ False, attention maps for each step are stores separately. Just for visualization purposes.
  cross_attention_kwargs (`dict`, *optional*):
  A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
  [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -815,10 +808,10 @@ class LEditsPPPipelineStableDiffusion(

  Returns:
  [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
- [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
- otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the
- second element is a list of `bool`s denoting whether the corresponding generated image likely represents
- "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+ [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+ returning a tuple, the first element is a list with the generated images, and the second element is a list
+ of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw)
+ content, according to the `safety_checker`.
  """

  if self.inversion_steps is None:
@@ -1219,11 +1212,11 @@ class LEditsPPPipelineStableDiffusion(
  crops_coords: Optional[Tuple[int, int, int, int]] = None,
  ):
  r"""
- The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
- If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140)
- will be performed instead.
+ The function to the pipeline for image inversion as described by the [LEDITS++
+ Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+ inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.

- Args:
+ Args:
  image (`PipelineImageInput`):
  Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
  ratio.
@@ -1238,8 +1231,8 @@ class LEditsPPPipelineStableDiffusion(
  Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
  will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
  generator (`torch.Generator`, *optional*):
- A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
- inversion deterministic.
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+ deterministic.
  cross_attention_kwargs (`dict`, *optional*):
  A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
  [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -1247,23 +1240,24 @@ class LEditsPPPipelineStableDiffusion(
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
  the output of the pre-final layer will be used for computing the prompt embeddings.
  height (`int`, *optional*, defaults to `None`):
- The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height.
+ The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
+ height.
  width (`int`, *optional*`, defaults to `None`):
- The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
+ The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
  resize_mode (`str`, *optional*, defaults to `default`):
- The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit
- within the specified width and height, and it may not maintaining the original aspect ratio.
- If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
- within the dimensions, filling empty with data from image.
- If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image
- within the dimensions, cropping the excess.
- Note that resize_mode `fill` and `crop` are only supported for PIL image input.
+ The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
+ the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
+ resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
+ center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
+ image to fit within the specified width and height, maintaining the aspect ratio, and then center the
+ image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
+ supported for PIL image input.
  crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
  The crop coordinates for each image in the batch. If `None`, will not crop the image.

  Returns:
- [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
- Output will contain the resized input image(s) and respective VAE reconstruction(s).
+ [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+ and respective VAE reconstruction(s).
  """
  # Reset attn processor, we do not want to store attn maps during inversion
  self.unet.set_attn_processor(AttnProcessor())
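The reflowed `resize_mode` documentation above describes behavior that `invert()` delegates to the image processor (`diffusers/image_processor.py` is also touched in this release). A short sketch of the three modes, assuming `VaeImageProcessor.preprocess` exposes the same `resize_mode` argument documented here:

```python
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import load_image

processor = VaeImageProcessor()
image = load_image("https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg")

# "default" may distort the aspect ratio; "fill" pads and "crop" center-crops,
# both preserving the aspect ratio (PIL input only, per the docstring above).
for mode in ("default", "fill", "crop"):
    tensor = processor.preprocess(image, height=512, width=512, resize_mode=mode)
    print(mode, tuple(tensor.shape))  # a (1, 3, 512, 512) tensor for each mode
```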
diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
@@ -85,25 +85,23 @@ EXAMPLE_DOC_STRING = """
  ... )
  >>> pipe = pipe.to("cuda")

+
  >>> def download_image(url):
  ... response = requests.get(url)
  ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")

+
  >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg"
  >>> image = download_image(img_url)

- >>> _ = pipe.invert(
- ... image = image,
- ... num_inversion_steps=50,
- ... skip=0.2
- ... )
+ >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2)

  >>> edited_image = pipe(
- ... editing_prompt=["tennis ball","tomato"],
- ... reverse_editing_direction=[True,False],
- ... edit_guidance_scale=[5.0,10.0],
- ... edit_threshold=[0.9,0.85],
- ).images[0]
+ ... editing_prompt=["tennis ball", "tomato"],
+ ... reverse_editing_direction=[True, False],
+ ... edit_guidance_scale=[5.0, 10.0],
+ ... edit_threshold=[0.9, 0.85],
+ ... ).images[0]
  ```
  """
@@ -292,9 +290,9 @@ class LEditsPPPipelineStableDiffusionXL(
  """
  Pipeline for textual image editing using LEDits++ with Stable Diffusion XL.

- This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the superclass
- documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular
- device, etc.).
+ This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the
+ superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a
+ particular device, etc.).

  In addition the pipeline inherits the following loading methods:
  - *LoRA*: [`LEditsPPPipelineStableDiffusionXL.load_lora_weights`]
@@ -325,8 +323,8 @@ class LEditsPPPipelineStableDiffusionXL(
  unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
  scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]):
  A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
- [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically
- be set to [`DPMSolverMultistepScheduler`].
+ [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will
+ automatically be set to [`DPMSolverMultistepScheduler`].
  force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
  Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
  `stabilityai/stable-diffusion-xl-base-1-0`.
@@ -411,14 +409,14 @@ class LEditsPPPipelineStableDiffusionXL(
  num_images_per_prompt: int = 1,
  negative_prompt: Optional[str] = None,
  negative_prompt_2: Optional[str] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  enable_edit_guidance: bool = True,
  editing_prompt: Optional[str] = None,
- editing_prompt_embeds: Optional[torch.FloatTensor] = None,
- editing_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ editing_prompt_embeds: Optional[torch.Tensor] = None,
+ editing_pooled_prompt_embeds: Optional[torch.Tensor] = None,
  ) -> object:
  r"""
  Encodes the prompt into text encoder hidden states.
@@ -434,11 +432,11 @@ class LEditsPPPipelineStableDiffusionXL(
  negative_prompt_2 (`str` or `List[str]`, *optional*):
  The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
  `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
  input argument.
@@ -452,11 +450,11 @@ class LEditsPPPipelineStableDiffusionXL(
  editing_prompt (`str` or `List[str]`, *optional*):
  Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass
  `editing_prompt_embeds` instead.
- editing_prompt_embeds (`torch.FloatTensor`, *optional*):
- Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
- weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input
- argument.
- editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ editing_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from
+ `editing_prompt` input argument.
+ editing_pooled_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt`
  input argument.
@@ -713,20 +711,22 @@ class LEditsPPPipelineStableDiffusionXL(
  self.vae.decoder.mid_block.to(dtype)

  # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
- def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+ def get_guidance_scale_embedding(
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+ ) -> torch.Tensor:
  """
  See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

  Args:
- timesteps (`torch.Tensor`):
- generate embedding vectors at these timesteps
+ w (`torch.Tensor`):
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
  embedding_dim (`int`, *optional*, defaults to 512):
- dimension of the embeddings to generate
- dtype:
- data type of the generated embeddings
+ Dimension of the embeddings to generate.
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+ Data type of the generated embeddings.

  Returns:
- `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
  """
  assert len(w.shape) == 1
  w = w * 1000.0
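The diff cuts off after the first two body lines of `get_guidance_scale_embedding`; the function builds a sinusoidal Fourier embedding of the guidance scale `w`, following the vdm reference linked in the docstring. A standalone sketch of that computation, reconstructed from the visible lines and the standard sinusoidal formulation rather than copied from the release:

```python
import torch


def get_guidance_scale_embedding(
    w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    # Sinusoidal embedding of the (scaled) guidance values, in the style of
    # the vdm reference; the visible body lines above match this layout.
    assert len(w.shape) == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero-pad odd target dimensions
        emb = torch.nn.functional.pad(emb, (0, 1))
    assert emb.shape == (w.shape[0], embedding_dim)
    return emb


# e.g. embed a guidance scale of 7.5 for a batch of one:
emb = get_guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=256)
```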
@@ -804,8 +804,8 @@ class LEditsPPPipelineStableDiffusionXL(
  denoising_end: Optional[float] = None,
  negative_prompt: Optional[Union[str, List[str]]] = None,
  negative_prompt_2: Optional[Union[str, List[str]]] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
@@ -824,7 +824,7 @@ class LEditsPPPipelineStableDiffusionXL(
  sem_guidance: Optional[List[torch.Tensor]] = None,
  use_cross_attn_mask: bool = False,
  use_intersect_mask: bool = False,
- user_mask: Optional[torch.FloatTensor] = None,
+ user_mask: Optional[torch.Tensor] = None,
  attn_store_steps: Optional[List[int]] = [],
  store_averaged_over_steps: bool = True,
  clip_skip: Optional[int] = None,
@@ -833,8 +833,9 @@ class LEditsPPPipelineStableDiffusionXL(
  **kwargs,
  ):
  r"""
- The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`]
- method has to be called beforehand. Edits will always be performed for the last inverted image(s).
+ The call function to the pipeline for editing. The
+ [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] method has to be called beforehand. Edits
+ will always be performed for the last inverted image(s).

  Args:
  denoising_end (`float`, *optional*):
@@ -850,11 +851,11 @@ class LEditsPPPipelineStableDiffusionXL(
  negative_prompt_2 (`str` or `List[str]`, *optional*):
  The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
  `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
  input argument.
@@ -868,7 +869,7 @@ class LEditsPPPipelineStableDiffusionXL(
  of a plain tuple.
  callback (`Callable`, *optional*):
  A function that will be called every `callback_steps` steps during inference. The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function will be called. If not specified, the callback will be
  called at every step.
@@ -892,11 +893,11 @@ class LEditsPPPipelineStableDiffusionXL(
  section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
  editing_prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide the image generation. The image is reconstructed by setting
- `editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`.
+ `editing_prompt = None`. Guidance direction of prompt should be specified via
+ `reverse_editing_direction`.
  editing_prompt_embeddings (`torch.Tensor`, *optional*):
- Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
- weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
- argument.
+ Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+ If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument.
  editing_pooled_prompt_embeddings (`torch.Tensor`, *optional*):
  Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input
@@ -904,35 +905,36 @@ class LEditsPPPipelineStableDiffusionXL(
  reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`):
  Whether the corresponding prompt in `editing_prompt` should be increased or decreased.
  edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5):
- Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`.
- `edit_guidance_scale` is defined as `s_e` of equation 12 of
- [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+ Guidance scale for guiding the image generation. If provided as list values should correspond to
+ `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++
+ Paper](https://arxiv.org/abs/2301.12247).
  edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10):
  Number of diffusion steps (for each prompt) for which guidance is not applied.
  edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`):
  Number of diffusion steps (for each prompt) after which guidance is no longer applied.
  edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9):
  Masking threshold of guidance. Threshold should be proportional to the image region that is modified.
- 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
+ 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++
+ Paper](https://arxiv.org/abs/2301.12247).
  sem_guidance (`List[torch.Tensor]`, *optional*):
  List of pre-generated guidance vectors to be applied at generation. Length of the list has to
  correspond to `num_inference_steps`.
  use_cross_attn_mask:
  Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask
- is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of
- [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+ is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++
+ paper](https://arxiv.org/pdf/2311.16711.pdf).
  use_intersect_mask:
- Whether the masking term is calculated as intersection of cross-attention masks and masks derived
- from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise
- estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
+ Whether the masking term is calculated as intersection of cross-attention masks and masks derived from
+ the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise estimate
+ are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf).
  user_mask:
- User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit
- masks do not meet user preferences.
+ User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s
+ implicit masks do not meet user preferences.
  attn_store_steps:
  Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes.
  store_averaged_over_steps:
- Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps.
- If False, attention maps for each step are stores separately. Just for visualization purposes.
+ Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If
+ False, attention maps for each step are stores separately. Just for visualization purposes.
  clip_skip (`int`, *optional*):
  Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
  the output of the pre-final layer will be used for computing the prompt embeddings.
@@ -950,8 +952,8 @@ class LEditsPPPipelineStableDiffusionXL(

  Returns:
  [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`:
- [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True,
- otherwise a `tuple. When returning a tuple, the first element is a list with the generated images.
+ [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+ returning a tuple, the first element is a list with the generated images.
  """
  if self.inversion_steps is None:
  raise ValueError(
@@ -1417,7 +1419,6 @@ class LEditsPPPipelineStableDiffusionXL(
  if needs_upcasting:
  image = image.float()
  self.upcast_vae()
- image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

  x0 = self.vae.encode(image).latent_dist.mode()
  x0 = x0.to(dtype)
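The deleted line above re-cast the image to the dtype of `vae.post_quant_conv` after upcasting; since `upcast_vae()` already runs the VAE in float32 and the image was just converted with `image.float()`, the extra cast is redundant. The surrounding pattern, sketched for an SDXL-style pipeline that exposes `upcast_vae()` (a hedged reconstruction, not a verbatim excerpt):

```python
import torch
from diffusers import DiffusionPipeline


def encode_image(pipe: DiffusionPipeline, image: torch.Tensor) -> torch.Tensor:
    dtype = image.dtype
    # fp16 VAEs flagged with `force_upcast` run in float32 for numerical stability.
    needs_upcasting = pipe.vae.dtype == torch.float16 and pipe.vae.config.force_upcast
    if needs_upcasting:
        image = image.float()  # already matches the upcast VAE; no per-layer cast needed
        pipe.upcast_vae()
    x0 = pipe.vae.encode(image).latent_dist.mode()
    return x0.to(dtype)  # back to the caller's dtype, as in the hunk above
```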
@@ -1444,11 +1445,11 @@ class LEditsPPPipelineStableDiffusionXL(
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  ):
  r"""
- The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247).
- If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140)
- will be performed instead.
+ The function to the pipeline for image inversion as described by the [LEDITS++
+ Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
+ inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.

- Args:
+ Args:
  image (`PipelineImageInput`):
  Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
  ratio.
@@ -1470,8 +1471,8 @@ class LEditsPPPipelineStableDiffusionXL(
  Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values
  will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
  generator (`torch.Generator`, *optional*):
- A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
- inversion deterministic.
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion
+ deterministic.
  crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
  `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
  `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
@@ -1486,8 +1487,8 @@ class LEditsPPPipelineStableDiffusionXL(
  [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

  Returns:
- [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]:
- Output will contain the resized input image(s) and respective VAE reconstruction(s).
+ [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s)
+ and respective VAE reconstruction(s).
  """

  # Reset attn processor, we do not want to store attn maps during inversion