diffusers 0.27.1__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they were published to a supported public registry. It is provided for informational purposes only.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +20 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -21
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +36 -22
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -42
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +23 -25
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +46 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +26 -22
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +90 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +78 -53
  229. diffusers/schedulers/scheduling_edm_euler.py +53 -30
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +26 -28
  231. diffusers/schedulers/scheduling_euler_discrete.py +163 -67
  232. diffusers/schedulers/scheduling_heun_discrete.py +60 -38
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +22 -18
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +22 -18
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +27 -25
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +115 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/METADATA +7 -7
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. diffusers-0.27.1.dist-info/RECORD +0 -399
  267. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  268. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/WHEEL +0 -0
  269. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.1.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
@@ -180,8 +180,8 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  **kwargs,
  ):
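
The hunk above is the template for most of this diff: public tensor annotations move from the legacy `torch.FloatTensor` type to the general `torch.Tensor`. `torch.FloatTensor` only ever described float32 CPU tensors, so it was never accurate for inputs that may be half precision or live on an accelerator. A minimal sketch of the distinction, in plain PyTorch with no diffusers imports:

```python
import torch

# torch.FloatTensor matches only float32 CPU tensors; torch.Tensor matches all
# tensors regardless of dtype and device, which is what these pipelines accept.
x = torch.randn(2, 4, dtype=torch.float16)
print(isinstance(x, torch.FloatTensor))  # False: fp16, not a float32 CPU tensor
print(isinstance(x, torch.Tensor))       # True
```

The same `FloatTensor` → `Tensor` substitution recurs in the signatures and docstrings of every pipeline hunk below.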
@@ -213,8 +213,8 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  ):
@@ -234,10 +234,10 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -469,14 +469,19 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
  )

  if len(gligen_phrases) != len(gligen_boxes):
- ValueError(
+ raise ValueError(
  "length of `gligen_phrases` and `gligen_boxes` has to be same, but"
  f" got: `gligen_phrases` {len(gligen_phrases)} != `gligen_boxes` {len(gligen_boxes)}"
  )

  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ shape = (
+ batch_size,
+ num_channels_latents,
+ int(height) // self.vae_scale_factor,
+ int(width) // self.vae_scale_factor,
+ )
  if isinstance(generator, list) and len(generator) != batch_size:
  raise ValueError(
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -536,12 +541,12 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
  num_images_per_prompt: Optional[int] = 1,
  eta: float = 0.0,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
  callback_steps: int = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  clip_skip: Optional[int] = None,
@@ -587,14 +592,14 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
  provided, text embeddings are generated from the `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
  output_type (`str`, *optional*, defaults to `"pil"`):
@@ -604,7 +609,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
  plain tuple.
  callback (`Callable`, *optional*):
  A function that calls every `callback_steps` steps during inference. The function is called with the
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function is called. If not specified, the callback is called at
  every step.
@@ -680,7 +685,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
  timesteps = self.scheduler.timesteps

  # 5. Prepare latent variables
- num_channels_latents = self.unet.in_channels
+ num_channels_latents = self.unet.config.in_channels
  latents = self.prepare_latents(
  batch_size * num_images_per_prompt,
  num_channels_latents,
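
Here `in_channels` is read from the UNet's `config` instead of from the module itself. Hyperparameters that diffusers models register via `register_to_config` are exposed on the `config` object, and direct attribute access on the module for such values is deprecated; the same pattern appears below for `cross_attention_dim`. A hedged sketch (the checkpoint name is illustrative):

```python
from diffusers import UNet2DConditionModel

# Registered hyperparameters live on .config; reading them off the module
# itself (unet.in_channels) is the deprecated spelling this hunk replaces.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"
)
print(unet.config.in_channels)          # 4 for Stable Diffusion v1 UNets
print(unet.config.cross_attention_dim)  # 768 for Stable Diffusion v1 UNets
```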
@@ -713,7 +718,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
  boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype)
  boxes[:n_objs] = torch.tensor(gligen_boxes)
  text_embeddings = torch.zeros(
- max_objs, self.unet.cross_attention_dim, device=device, dtype=self.text_encoder.dtype
+ max_objs, self.unet.config.cross_attention_dim, device=device, dtype=self.text_encoder.dtype
  )
  text_embeddings[:n_objs] = _text_embeddings
  # Generate a mask for each object that is entity described by phrases
diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
@@ -238,8 +238,8 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  ):
@@ -259,10 +259,10 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -500,7 +500,12 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM

  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ shape = (
+ batch_size,
+ num_channels_latents,
+ int(height) // self.vae_scale_factor,
+ int(width) // self.vae_scale_factor,
+ )
  if isinstance(generator, list) and len(generator) != batch_size:
  raise ValueError(
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -700,12 +705,12 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
  num_images_per_prompt: Optional[int] = 1,
  eta: float = 0.0,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
  callback_steps: int = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  gligen_normalize_constant: float = 28.7,
@@ -759,14 +764,14 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
  provided, text embeddings are generated from the `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
  output_type (`str`, *optional*, defaults to `"pil"`):
@@ -776,7 +781,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
  plain tuple.
  callback (`Callable`, *optional*):
  A function that calls every `callback_steps` steps during inference. The function is called with the
- following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function is called. If not specified, the callback is called at
  every step.
@@ -847,7 +852,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
  timesteps = self.scheduler.timesteps

  # 5. Prepare latent variables
- num_channels_latents = self.unet.in_channels
+ num_channels_latents = self.unet.config.in_channels
  latents = self.prepare_latents(
  batch_size * num_images_per_prompt,
  num_channels_latents,
diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -154,8 +154,8 @@ class StableDiffusionKDiffusionPipeline(
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  **kwargs,
  ):
@@ -187,8 +187,8 @@ class StableDiffusionKDiffusionPipeline(
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  ):
@@ -208,10 +208,10 @@ class StableDiffusionKDiffusionPipeline(
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -441,7 +441,12 @@ class StableDiffusionKDiffusionPipeline(
  )

  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ shape = (
+ batch_size,
+ num_channels_latents,
+ int(height) // self.vae_scale_factor,
+ int(width) // self.vae_scale_factor,
+ )
  if latents is None:
  latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
  else:
@@ -464,12 +469,12 @@ class StableDiffusionKDiffusionPipeline(
  num_images_per_prompt: Optional[int] = 1,
  eta: float = 0.0,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
  callback_steps: int = 1,
  use_karras_sigmas: Optional[bool] = False,
  noise_sampler_seed: Optional[int] = None,
@@ -507,14 +512,14 @@ class StableDiffusionKDiffusionPipeline(
  generator (`torch.Generator`, *optional*):
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
  to make generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor will ge generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -526,7 +531,7 @@ class StableDiffusionKDiffusionPipeline(
  plain tuple.
  callback (`Callable`, *optional*):
  A function that will be called every `callback_steps` steps during inference. The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function will be called. If not specified, the callback will be
  called at every step.
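
The K-diffusion hunks above only retype the legacy per-step `callback` argument. Worth noting alongside them: this release also introduces a `diffusers/callbacks.py` module (item 2 in the file list), and the mainline Stable Diffusion pipelines favor the `callback_on_step_end` style instead. A hedged sketch of that newer style, not part of these hunks:

```python
# callback_on_step_end receives the pipeline, the step index, the timestep,
# and a dict of requested tensors, and returns the (possibly modified) dict.
def on_step_end(pipe, step: int, timestep: int, callback_kwargs: dict) -> dict:
    latents = callback_kwargs["latents"]  # available when requested below
    return callback_kwargs

# image = pipe(
#     prompt,
#     callback_on_step_end=on_step_end,
#     callback_on_step_end_tensor_inputs=["latents"],
# ).images[0]
```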
diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
@@ -207,10 +207,10 @@ class StableDiffusionXLKDiffusionPipeline(
  do_classifier_free_guidance: bool = True,
  negative_prompt: Optional[str] = None,
  negative_prompt_2: Optional[str] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  ):
@@ -236,17 +236,17 @@ class StableDiffusionXLKDiffusionPipeline(
  negative_prompt_2 (`str` or `List[str]`, *optional*):
  The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
  `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
- pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
  If not provided, pooled text embeddings will be generated from `prompt` input argument.
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
  input argument.
@@ -497,7 +497,12 @@ class StableDiffusionXLKDiffusionPipeline(
  )

  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ shape = (
+ batch_size,
+ num_channels_latents,
+ int(height) // self.vae_scale_factor,
+ int(width) // self.vae_scale_factor,
+ )
  if isinstance(generator, list) and len(generator) != batch_size:
  raise ValueError(
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -579,11 +584,11 @@ class StableDiffusionXLKDiffusionPipeline(
  negative_prompt_2: Optional[Union[str, List[str]]] = None,
  num_images_per_prompt: Optional[int] = 1,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
- pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  original_size: Optional[Tuple[int, int]] = None,
@@ -637,21 +642,21 @@ class StableDiffusionXLKDiffusionPipeline(
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
  to make generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor will ge generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
- pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
  If not provided, pooled text embeddings will be generated from `prompt` input argument.
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
  input argument.
diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
@@ -80,6 +80,7 @@ def retrieve_timesteps(
  num_inference_steps: Optional[int] = None,
  device: Optional[Union[str, torch.device]] = None,
  timesteps: Optional[List[int]] = None,
+ sigmas: Optional[List[float]] = None,
  **kwargs,
  ):
  """
@@ -90,19 +91,23 @@ def retrieve_timesteps(
  scheduler (`SchedulerMixin`):
  The scheduler to get timesteps from.
  num_inference_steps (`int`):
- The number of diffusion steps used when generating samples with a pre-trained model. If used,
- `timesteps` must be `None`.
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+ must be `None`.
  device (`str` or `torch.device`, *optional*):
  The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
  timesteps (`List[int]`, *optional*):
- Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
- timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
- must be `None`.
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+ `num_inference_steps` and `sigmas` must be `None`.
+ sigmas (`List[float]`, *optional*):
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+ `num_inference_steps` and `timesteps` must be `None`.

  Returns:
  `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
  second element is the number of inference steps.
  """
+ if timesteps is not None and sigmas is not None:
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
  if timesteps is not None:
  accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
  if not accepts_timesteps:
@@ -113,6 +118,16 @@ def retrieve_timesteps(
  scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
  timesteps = scheduler.timesteps
  num_inference_steps = len(timesteps)
+ elif sigmas is not None:
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accept_sigmas:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
  else:
  scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
  timesteps = scheduler.timesteps
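
With this branch, `retrieve_timesteps` accepts mutually exclusive `timesteps` and `sigmas` overrides, and it probes `scheduler.set_timesteps` with `inspect.signature` before forwarding either one. The same capability check can be run directly against a scheduler; a sketch (the scheduler class is chosen for illustration):

```python
import inspect
from diffusers import EulerDiscreteScheduler

scheduler = EulerDiscreteScheduler()
params = set(inspect.signature(scheduler.set_timesteps).parameters)
# In 0.28.0 this scheduler is expected to accept both override styles:
print("timesteps" in params, "sigmas" in params)
```

Passing `timesteps` and `sigmas` together now fails fast with the `ValueError` added at the top of the function.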
@@ -239,8 +254,8 @@ class StableDiffusionLDM3DPipeline(
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  **kwargs,
  ):
@@ -272,8 +287,8 @@ class StableDiffusionLDM3DPipeline(
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  ):
@@ -293,10 +308,10 @@ class StableDiffusionLDM3DPipeline(
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -627,7 +642,12 @@ class StableDiffusionLDM3DPipeline(
  )

  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ shape = (
+ batch_size,
+ num_channels_latents,
+ int(height) // self.vae_scale_factor,
+ int(width) // self.vae_scale_factor,
+ )
  if isinstance(generator, list) and len(generator) != batch_size:
  raise ValueError(
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -644,20 +664,22 @@ class StableDiffusionLDM3DPipeline(
  return latents

  # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
- def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+ def get_guidance_scale_embedding(
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+ ) -> torch.Tensor:
  """
  See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

  Args:
- timesteps (`torch.Tensor`):
- generate embedding vectors at these timesteps
+ w (`torch.Tensor`):
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
  embedding_dim (`int`, *optional*, defaults to 512):
- dimension of the embeddings to generate
- dtype:
- data type of the generated embeddings
+ Dimension of the embeddings to generate.
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+ Data type of the generated embeddings.

  Returns:
- `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
  """
  assert len(w.shape) == 1
  w = w * 1000.0
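
Per the `# Copied from` marker, the body under this docstring is shared with `LatentConsistencyModelPipeline`: a sinusoidal embedding of the guidance scale `w`, in the style of the linked vdm code. A self-contained sketch of that computation:

```python
import torch

def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512,
                             dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # Sinusoidal embedding of per-sample guidance scales; mirrors the method
    # this hunk re-documents. w is 1-D, the result is (len(w), embedding_dim).
    assert len(w.shape) == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero-pad odd embedding widths
        emb = torch.nn.functional.pad(emb, (0, 1))
    return emb

print(guidance_scale_embedding(torch.tensor([7.5])).shape)  # torch.Size([1, 512])
```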
@@ -712,16 +734,17 @@ class StableDiffusionLDM3DPipeline(
  width: Optional[int] = None,
  num_inference_steps: int = 49,
  timesteps: List[int] = None,
+ sigmas: List[float] = None,
  guidance_scale: float = 5.0,
  negative_prompt: Optional[Union[str, List[str]]] = None,
  num_images_per_prompt: Optional[int] = 1,
  eta: float = 0.0,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
- ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -744,6 +767,14 @@ class StableDiffusionLDM3DPipeline(
  num_inference_steps (`int`, *optional*, defaults to 50):
  The number of denoising steps. More denoising steps usually lead to a higher quality image at the
  expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+ passed will be used. Must be in descending order.
+ sigmas (`List[float]`, *optional*):
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+ will be used.
  guidance_scale (`float`, *optional*, defaults to 5.0):
  A higher guidance scale value encourages the model to generate images closely linked to the text
  `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
@@ -758,23 +789,23 @@ class StableDiffusionLDM3DPipeline(
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
  provided, text embeddings are generated from the `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
  ip_adapter_image: (`PipelineImageInput`, *optional*):
  Optional image input to work with IP Adapters.
- ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
- Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
- Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
- if `do_classifier_free_guidance` is set to `True`.
- If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
  output_type (`str`, *optional*, defaults to `"pil"`):
  The output format of the generated image. Choose between `PIL.Image` or `np.array`.
  return_dict (`bool`, *optional*, defaults to `True`):
@@ -881,7 +912,9 @@ class StableDiffusionLDM3DPipeline(
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

  # 4. Prepare timesteps
- timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+ timesteps, num_inference_steps = retrieve_timesteps(
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
+ )

  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
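
Taken together, the LDM3D hunks plumb the new `sigmas` override from `__call__` through `retrieve_timesteps` into the scheduler. A hedged end-to-end sketch of supplying a custom Karras-style sigma schedule (the checkpoint, the scheduler swap, and the 10-step rho-7 schedule are illustrative choices, not taken from this diff):

```python
import numpy as np
import torch
from diffusers import EulerDiscreteScheduler, StableDiffusionLDM3DPipeline

def karras_sigmas(n: int, sigma_min: float = 0.03, sigma_max: float = 14.6,
                  rho: float = 7.0) -> list:
    # Karras et al. (2022) rho-spaced schedule, densest near sigma_min; the
    # trailing 0.0 terminates the schedule, as in the 0.28 release examples.
    ramp = np.linspace(0, 1, n)
    min_inv, max_inv = sigma_min ** (1 / rho), sigma_max ** (1 / rho)
    return ((max_inv + ramp * (min_inv - max_inv)) ** rho).tolist() + [0.0]

pipe = StableDiffusionLDM3DPipeline.from_pretrained(
    "Intel/ldm3d-4c", torch_dtype=torch.float16
).to("cuda")
# Swap in a scheduler whose set_timesteps accepts `sigmas`; passing sigmas to
# a scheduler without that parameter raises the ValueError shown above.
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)

out = pipe("a photo of a cat", sigmas=karras_sigmas(10))
# out.rgb and out.depth hold the generated image and its depth map.
```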