diffusers 0.27.2__py3-none-any.whl → 0.28.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. diffusers/__init__.py +26 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +33 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +8 -0
  21. diffusers/models/activations.py +23 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +475 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +363 -32
  35. diffusers/models/model_loading_utils.py +177 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_outputs.py +14 -0
  39. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  40. diffusers/models/modeling_utils.py +175 -99
  41. diffusers/models/normalization.py +2 -1
  42. diffusers/models/resnet.py +18 -23
  43. diffusers/models/transformer_temporal.py +3 -3
  44. diffusers/models/transformers/__init__.py +3 -0
  45. diffusers/models/transformers/dit_transformer_2d.py +240 -0
  46. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  47. diffusers/models/transformers/hunyuan_transformer_2d.py +427 -0
  48. diffusers/models/transformers/pixart_transformer_2d.py +336 -0
  49. diffusers/models/transformers/prior_transformer.py +7 -7
  50. diffusers/models/transformers/t5_film_transformer.py +17 -19
  51. diffusers/models/transformers/transformer_2d.py +292 -184
  52. diffusers/models/transformers/transformer_temporal.py +10 -10
  53. diffusers/models/unets/unet_1d.py +5 -5
  54. diffusers/models/unets/unet_1d_blocks.py +29 -29
  55. diffusers/models/unets/unet_2d.py +6 -6
  56. diffusers/models/unets/unet_2d_blocks.py +137 -128
  57. diffusers/models/unets/unet_2d_condition.py +19 -15
  58. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  59. diffusers/models/unets/unet_3d_blocks.py +79 -77
  60. diffusers/models/unets/unet_3d_condition.py +13 -9
  61. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  62. diffusers/models/unets/unet_kandinsky3.py +1 -1
  63. diffusers/models/unets/unet_motion_model.py +114 -14
  64. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  65. diffusers/models/unets/unet_stable_cascade.py +16 -13
  66. diffusers/models/upsampling.py +17 -20
  67. diffusers/models/vq_model.py +16 -15
  68. diffusers/pipelines/__init__.py +27 -3
  69. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  70. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  71. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  72. diffusers/pipelines/animatediff/__init__.py +2 -0
  73. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  74. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  75. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  76. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  77. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  78. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  79. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  80. diffusers/pipelines/auto_pipeline.py +21 -17
  81. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  82. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  83. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  84. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  85. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  86. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  87. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  88. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  89. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  90. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  91. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  92. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  93. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  94. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  95. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  96. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  97. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  98. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  99. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  100. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  101. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  102. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  103. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  104. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  105. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  106. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  107. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  108. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  109. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
  110. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  111. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  112. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  113. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  114. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  115. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  116. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  117. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  118. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  119. diffusers/pipelines/dit/pipeline_dit.py +7 -4
  120. diffusers/pipelines/free_init_utils.py +39 -38
  121. diffusers/pipelines/hunyuandit/__init__.py +48 -0
  122. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +881 -0
  123. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  124. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  125. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  126. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  127. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  128. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  129. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  130. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  131. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  132. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  133. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  134. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  135. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  136. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  137. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  138. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  139. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  140. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  141. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  142. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  143. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  144. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  145. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  146. diffusers/pipelines/marigold/__init__.py +50 -0
  147. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  148. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  149. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  150. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  151. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  152. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  153. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  154. diffusers/pipelines/pipeline_loading_utils.py +269 -23
  155. diffusers/pipelines/pipeline_utils.py +266 -37
  156. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  157. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +69 -79
  158. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  159. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  160. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  161. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  162. diffusers/pipelines/shap_e/renderer.py +1 -1
  163. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
  164. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  165. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  166. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  167. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  168. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  169. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  172. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  173. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  174. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  175. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  176. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  177. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  178. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  179. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  180. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  181. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  182. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
  183. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  184. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  185. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  186. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  187. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  188. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  189. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  190. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  191. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  192. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  193. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  194. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  195. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  196. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  197. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  198. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  199. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  200. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  201. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  202. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  203. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  204. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  205. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  206. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  207. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  208. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  209. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  210. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  211. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  212. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  213. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  214. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  215. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  216. diffusers/schedulers/__init__.py +2 -2
  217. diffusers/schedulers/deprecated/__init__.py +1 -1
  218. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  219. diffusers/schedulers/scheduling_amused.py +5 -5
  220. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  221. diffusers/schedulers/scheduling_consistency_models.py +20 -26
  222. diffusers/schedulers/scheduling_ddim.py +22 -24
  223. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  224. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  225. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  226. diffusers/schedulers/scheduling_ddpm.py +20 -22
  227. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  228. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  229. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  230. diffusers/schedulers/scheduling_deis_multistep.py +42 -42
  231. diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
  232. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  233. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  234. diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
  235. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
  236. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
  237. diffusers/schedulers/scheduling_edm_euler.py +50 -31
  238. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
  239. diffusers/schedulers/scheduling_euler_discrete.py +160 -68
  240. diffusers/schedulers/scheduling_heun_discrete.py +57 -39
  241. diffusers/schedulers/scheduling_ipndm.py +8 -8
  242. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
  243. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
  244. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  245. diffusers/schedulers/scheduling_lcm.py +21 -23
  246. diffusers/schedulers/scheduling_lms_discrete.py +24 -26
  247. diffusers/schedulers/scheduling_pndm.py +20 -20
  248. diffusers/schedulers/scheduling_repaint.py +20 -20
  249. diffusers/schedulers/scheduling_sasolver.py +55 -54
  250. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  251. diffusers/schedulers/scheduling_tcd.py +39 -30
  252. diffusers/schedulers/scheduling_unclip.py +15 -15
  253. diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
  254. diffusers/schedulers/scheduling_utils.py +14 -5
  255. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  256. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  257. diffusers/training_utils.py +56 -1
  258. diffusers/utils/__init__.py +7 -0
  259. diffusers/utils/doc_utils.py +1 -0
  260. diffusers/utils/dummy_pt_objects.py +75 -0
  261. diffusers/utils/dummy_torch_and_transformers_objects.py +105 -0
  262. diffusers/utils/dynamic_modules_utils.py +24 -11
  263. diffusers/utils/hub_utils.py +3 -2
  264. diffusers/utils/import_utils.py +91 -0
  265. diffusers/utils/loading_utils.py +2 -2
  266. diffusers/utils/logging.py +1 -1
  267. diffusers/utils/peft_utils.py +32 -5
  268. diffusers/utils/state_dict_utils.py +11 -2
  269. diffusers/utils/testing_utils.py +71 -6
  270. diffusers/utils/torch_utils.py +1 -0
  271. diffusers/video_processor.py +113 -0
  272. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/METADATA +7 -7
  273. diffusers-0.28.1.dist-info/RECORD +419 -0
  274. diffusers-0.27.2.dist-info/RECORD +0 -399
  275. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/LICENSE +0 -0
  276. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/WHEEL +0 -0
  277. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/entry_points.txt +0 -0
  278. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,6 @@ from ...models import Kandinsky3UNet, VQModel
8
8
  from ...schedulers import DDPMScheduler
9
9
  from ...utils import (
10
10
  deprecate,
11
- is_accelerate_available,
12
11
  logging,
13
12
  replace_example_docstring,
14
13
  )
@@ -24,7 +23,9 @@ EXAMPLE_DOC_STRING = """
24
23
  >>> from diffusers import AutoPipelineForText2Image
25
24
  >>> import torch
26
25
 
27
- >>> pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
26
+ >>> pipe = AutoPipelineForText2Image.from_pretrained(
27
+ ... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
28
+ ... )
28
29
  >>> pipe.enable_model_cpu_offload()
29
30
 
30
31
  >>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
@@ -70,20 +71,6 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin):
70
71
  tokenizer=tokenizer, text_encoder=text_encoder, unet=unet, scheduler=scheduler, movq=movq
71
72
  )
72
73
 
73
- def remove_all_hooks(self):
74
- if is_accelerate_available():
75
- from accelerate.hooks import remove_hook_from_module
76
- else:
77
- raise ImportError("Please install accelerate via `pip install accelerate`")
78
-
79
- for model in [self.text_encoder, self.unet, self.movq]:
80
- if model is not None:
81
- remove_hook_from_module(model, recurse=True)
82
-
83
- self.unet_offload_hook = None
84
- self.text_encoder_offload_hook = None
85
- self.final_offload_hook = None
86
-
87
74
  def process_embeds(self, embeddings, attention_mask, cut_context):
88
75
  if cut_context:
89
76
  embeddings[attention_mask == 0] = torch.zeros_like(embeddings[attention_mask == 0])
@@ -100,11 +87,11 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin):
100
87
  num_images_per_prompt=1,
101
88
  device=None,
102
89
  negative_prompt=None,
103
- prompt_embeds: Optional[torch.FloatTensor] = None,
104
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
90
+ prompt_embeds: Optional[torch.Tensor] = None,
91
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
105
92
  _cut_context=False,
106
- attention_mask: Optional[torch.FloatTensor] = None,
107
- negative_attention_mask: Optional[torch.FloatTensor] = None,
93
+ attention_mask: Optional[torch.Tensor] = None,
94
+ negative_attention_mask: Optional[torch.Tensor] = None,
108
95
  ):
109
96
  r"""
110
97
  Encodes the prompt into text encoder hidden states.
@@ -122,16 +109,16 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin):
122
109
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
123
110
  `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
124
111
  Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
125
- prompt_embeds (`torch.FloatTensor`, *optional*):
112
+ prompt_embeds (`torch.Tensor`, *optional*):
126
113
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
127
114
  provided, text embeddings will be generated from `prompt` input argument.
128
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
115
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
129
116
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
130
117
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
131
118
  argument.
132
- attention_mask (`torch.FloatTensor`, *optional*):
119
+ attention_mask (`torch.Tensor`, *optional*):
133
120
  Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
134
- negative_attention_mask (`torch.FloatTensor`, *optional*):
121
+ negative_attention_mask (`torch.Tensor`, *optional*):
135
122
  Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
136
123
  """
137
124
  if prompt is not None and negative_prompt is not None:
@@ -347,10 +334,10 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin):
347
334
  height: Optional[int] = 1024,
348
335
  width: Optional[int] = 1024,
349
336
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
350
- prompt_embeds: Optional[torch.FloatTensor] = None,
351
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
352
- attention_mask: Optional[torch.FloatTensor] = None,
353
- negative_attention_mask: Optional[torch.FloatTensor] = None,
337
+ prompt_embeds: Optional[torch.Tensor] = None,
338
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
339
+ attention_mask: Optional[torch.Tensor] = None,
340
+ negative_attention_mask: Optional[torch.Tensor] = None,
354
341
  output_type: Optional[str] = "pil",
355
342
  return_dict: bool = True,
356
343
  latents=None,
@@ -393,16 +380,16 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin):
393
380
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
394
381
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
395
382
  to make generation deterministic.
396
- prompt_embeds (`torch.FloatTensor`, *optional*):
383
+ prompt_embeds (`torch.Tensor`, *optional*):
397
384
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
398
385
  provided, text embeddings will be generated from `prompt` input argument.
399
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
386
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
400
387
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
401
388
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
402
389
  argument.
403
- attention_mask (`torch.FloatTensor`, *optional*):
390
+ attention_mask (`torch.Tensor`, *optional*):
404
391
  Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
405
- negative_attention_mask (`torch.FloatTensor`, *optional*):
392
+ negative_attention_mask (`torch.Tensor`, *optional*):
406
393
  Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
407
394
  output_type (`str`, *optional*, defaults to `"pil"`):
408
395
  The output format of the generate image. Choose between
@@ -411,7 +398,7 @@ class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin):
411
398
  Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
412
399
  callback (`Callable`, *optional*):
413
400
  A function that will be called every `callback_steps` steps during inference. The function will be
414
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
401
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
415
402
  callback_steps (`int`, *optional*, defaults to 1):
416
403
  The frequency at which the `callback` function will be called. If not specified, the callback will be
417
404
  called at every step.
@@ -12,7 +12,6 @@ from ...models import Kandinsky3UNet, VQModel
12
12
  from ...schedulers import DDPMScheduler
13
13
  from ...utils import (
14
14
  deprecate,
15
- is_accelerate_available,
16
15
  logging,
17
16
  replace_example_docstring,
18
17
  )
@@ -29,11 +28,15 @@ EXAMPLE_DOC_STRING = """
29
28
  >>> from diffusers.utils import load_image
30
29
  >>> import torch
31
30
 
32
- >>> pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
31
+ >>> pipe = AutoPipelineForImage2Image.from_pretrained(
32
+ ... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
33
+ ... )
33
34
  >>> pipe.enable_model_cpu_offload()
34
35
 
35
36
  >>> prompt = "A painting of the inside of a subway train with tiny raccoons."
36
- >>> image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png")
37
+ >>> image = load_image(
38
+ ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
39
+ ... )
37
40
 
38
41
  >>> generator = torch.Generator(device="cpu").manual_seed(0)
39
42
  >>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0]
@@ -92,20 +95,6 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
92
95
 
93
96
  return timesteps, num_inference_steps - t_start
94
97
 
95
- def remove_all_hooks(self):
96
- if is_accelerate_available():
97
- from accelerate.hooks import remove_hook_from_module
98
- else:
99
- raise ImportError("Please install accelerate via `pip install accelerate`")
100
-
101
- for model in [self.text_encoder, self.unet]:
102
- if model is not None:
103
- remove_hook_from_module(model, recurse=True)
104
-
105
- self.unet_offload_hook = None
106
- self.text_encoder_offload_hook = None
107
- self.final_offload_hook = None
108
-
109
98
  def _process_embeds(self, embeddings, attention_mask, cut_context):
110
99
  # return embeddings, attention_mask
111
100
  if cut_context:
@@ -123,11 +112,11 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
123
112
  num_images_per_prompt=1,
124
113
  device=None,
125
114
  negative_prompt=None,
126
- prompt_embeds: Optional[torch.FloatTensor] = None,
127
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
115
+ prompt_embeds: Optional[torch.Tensor] = None,
116
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
128
117
  _cut_context=False,
129
- attention_mask: Optional[torch.FloatTensor] = None,
130
- negative_attention_mask: Optional[torch.FloatTensor] = None,
118
+ attention_mask: Optional[torch.Tensor] = None,
119
+ negative_attention_mask: Optional[torch.Tensor] = None,
131
120
  ):
132
121
  r"""
133
122
  Encodes the prompt into text encoder hidden states.
@@ -145,16 +134,16 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
145
134
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
146
135
  `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
147
136
  Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
148
- prompt_embeds (`torch.FloatTensor`, *optional*):
137
+ prompt_embeds (`torch.Tensor`, *optional*):
149
138
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
150
139
  provided, text embeddings will be generated from `prompt` input argument.
151
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
140
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
152
141
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
153
142
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
154
143
  argument.
155
- attention_mask (`torch.FloatTensor`, *optional*):
144
+ attention_mask (`torch.Tensor`, *optional*):
156
145
  Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
157
- negative_attention_mask (`torch.FloatTensor`, *optional*):
146
+ negative_attention_mask (`torch.Tensor`, *optional*):
158
147
  Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
159
148
  """
160
149
  if prompt is not None and negative_prompt is not None:
@@ -414,17 +403,17 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
414
403
  def __call__(
415
404
  self,
416
405
  prompt: Union[str, List[str]] = None,
417
- image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
406
+ image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None,
418
407
  strength: float = 0.3,
419
408
  num_inference_steps: int = 25,
420
409
  guidance_scale: float = 3.0,
421
410
  negative_prompt: Optional[Union[str, List[str]]] = None,
422
411
  num_images_per_prompt: Optional[int] = 1,
423
412
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
424
- prompt_embeds: Optional[torch.FloatTensor] = None,
425
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
426
- attention_mask: Optional[torch.FloatTensor] = None,
427
- negative_attention_mask: Optional[torch.FloatTensor] = None,
413
+ prompt_embeds: Optional[torch.Tensor] = None,
414
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
415
+ attention_mask: Optional[torch.Tensor] = None,
416
+ negative_attention_mask: Optional[torch.Tensor] = None,
428
417
  output_type: Optional[str] = "pil",
429
418
  return_dict: bool = True,
430
419
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -438,7 +427,7 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
438
427
  prompt (`str` or `List[str]`, *optional*):
439
428
  The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
440
429
  instead.
441
- image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
430
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
442
431
  `Image`, or tensor representing an image batch, that will be used as the starting point for the
443
432
  process.
444
433
  strength (`float`, *optional*, defaults to 0.8):
@@ -465,16 +454,16 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
465
454
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
466
455
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
467
456
  to make generation deterministic.
468
- prompt_embeds (`torch.FloatTensor`, *optional*):
457
+ prompt_embeds (`torch.Tensor`, *optional*):
469
458
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
470
459
  provided, text embeddings will be generated from `prompt` input argument.
471
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
460
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
472
461
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
473
462
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
474
463
  argument.
475
- attention_mask (`torch.FloatTensor`, *optional*):
464
+ attention_mask (`torch.Tensor`, *optional*):
476
465
  Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
477
- negative_attention_mask (`torch.FloatTensor`, *optional*):
466
+ negative_attention_mask (`torch.Tensor`, *optional*):
478
467
  Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
479
468
  output_type (`str`, *optional*, defaults to `"pil"`):
480
469
  The output format of the generate image. Choose between
@@ -63,6 +63,7 @@ def retrieve_timesteps(
63
63
  num_inference_steps: Optional[int] = None,
64
64
  device: Optional[Union[str, torch.device]] = None,
65
65
  timesteps: Optional[List[int]] = None,
66
+ sigmas: Optional[List[float]] = None,
66
67
  **kwargs,
67
68
  ):
68
69
  """
@@ -73,19 +74,23 @@ def retrieve_timesteps(
73
74
  scheduler (`SchedulerMixin`):
74
75
  The scheduler to get timesteps from.
75
76
  num_inference_steps (`int`):
76
- The number of diffusion steps used when generating samples with a pre-trained model. If used,
77
- `timesteps` must be `None`.
77
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
78
+ must be `None`.
78
79
  device (`str` or `torch.device`, *optional*):
79
80
  The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
80
81
  timesteps (`List[int]`, *optional*):
81
- Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
82
- timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
83
- must be `None`.
82
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
83
+ `num_inference_steps` and `sigmas` must be `None`.
84
+ sigmas (`List[float]`, *optional*):
85
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
86
+ `num_inference_steps` and `timesteps` must be `None`.
84
87
 
85
88
  Returns:
86
89
  `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
87
90
  second element is the number of inference steps.
88
91
  """
92
+ if timesteps is not None and sigmas is not None:
93
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
89
94
  if timesteps is not None:
90
95
  accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
91
96
  if not accepts_timesteps:
@@ -96,6 +101,16 @@ def retrieve_timesteps(
96
101
  scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
97
102
  timesteps = scheduler.timesteps
98
103
  num_inference_steps = len(timesteps)
104
+ elif sigmas is not None:
105
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
106
+ if not accept_sigmas:
107
+ raise ValueError(
108
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
109
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
110
+ )
111
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
112
+ timesteps = scheduler.timesteps
113
+ num_inference_steps = len(timesteps)
99
114
  else:
100
115
  scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
101
116
  timesteps = scheduler.timesteps
@@ -222,8 +237,8 @@ class LatentConsistencyModelImg2ImgPipeline(
222
237
  num_images_per_prompt,
223
238
  do_classifier_free_guidance,
224
239
  negative_prompt=None,
225
- prompt_embeds: Optional[torch.FloatTensor] = None,
226
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
240
+ prompt_embeds: Optional[torch.Tensor] = None,
241
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
227
242
  lora_scale: Optional[float] = None,
228
243
  clip_skip: Optional[int] = None,
229
244
  ):
@@ -243,10 +258,10 @@ class LatentConsistencyModelImg2ImgPipeline(
243
258
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
244
259
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
245
260
  less than `1`).
246
- prompt_embeds (`torch.FloatTensor`, *optional*):
261
+ prompt_embeds (`torch.Tensor`, *optional*):
247
262
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
248
263
  provided, text embeddings will be generated from `prompt` input argument.
249
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
264
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
250
265
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
251
266
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
252
267
  argument.
@@ -548,20 +563,22 @@ class LatentConsistencyModelImg2ImgPipeline(
548
563
  return latents
549
564
 
550
565
  # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
551
- def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
566
+ def get_guidance_scale_embedding(
567
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
568
+ ) -> torch.Tensor:
552
569
  """
553
570
  See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
554
571
 
555
572
  Args:
556
- timesteps (`torch.Tensor`):
557
- generate embedding vectors at these timesteps
573
+ w (`torch.Tensor`):
574
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
558
575
  embedding_dim (`int`, *optional*, defaults to 512):
559
- dimension of the embeddings to generate
560
- dtype:
561
- data type of the generated embeddings
576
+ Dimension of the embeddings to generate.
577
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
578
+ Data type of the generated embeddings.
562
579
 
563
580
  Returns:
564
- `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
581
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
565
582
  """
566
583
  assert len(w.shape) == 1
567
584
  w = w * 1000.0
@@ -611,7 +628,7 @@ class LatentConsistencyModelImg2ImgPipeline(
611
628
  prompt: Union[str, List[str]],
612
629
  strength: float,
613
630
  callback_steps: int,
614
- prompt_embeds: Optional[torch.FloatTensor] = None,
631
+ prompt_embeds: Optional[torch.Tensor] = None,
615
632
  ip_adapter_image=None,
616
633
  ip_adapter_image_embeds=None,
617
634
  callback_on_step_end_tensor_inputs=None,
@@ -692,10 +709,10 @@ class LatentConsistencyModelImg2ImgPipeline(
692
709
  guidance_scale: float = 8.5,
693
710
  num_images_per_prompt: Optional[int] = 1,
694
711
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
695
- latents: Optional[torch.FloatTensor] = None,
696
- prompt_embeds: Optional[torch.FloatTensor] = None,
712
+ latents: Optional[torch.Tensor] = None,
713
+ prompt_embeds: Optional[torch.Tensor] = None,
697
714
  ip_adapter_image: Optional[PipelineImageInput] = None,
698
- ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
715
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
699
716
  output_type: Optional[str] = "pil",
700
717
  return_dict: bool = True,
701
718
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -737,20 +754,20 @@ class LatentConsistencyModelImg2ImgPipeline(
737
754
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
738
755
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
739
756
  generation deterministic.
740
- latents (`torch.FloatTensor`, *optional*):
757
+ latents (`torch.Tensor`, *optional*):
741
758
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
742
759
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
743
760
  tensor is generated by sampling using the supplied random `generator`.
744
- prompt_embeds (`torch.FloatTensor`, *optional*):
761
+ prompt_embeds (`torch.Tensor`, *optional*):
745
762
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
746
763
  provided, text embeddings are generated from the `prompt` input argument.
747
764
  ip_adapter_image: (`PipelineImageInput`, *optional*):
748
765
  Optional image input to work with IP Adapters.
749
- ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
750
- Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
751
- Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
752
- if `do_classifier_free_guidance` is set to `True`.
753
- If not provided, embeddings are computed from the `ip_adapter_image` input argument.
766
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
767
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
768
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
769
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
770
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
754
771
  output_type (`str`, *optional*, defaults to `"pil"`):
755
772
  The output format of the generated image. Choose between `PIL.Image` or `np.array`.
756
773
  return_dict (`bool`, *optional*, defaults to `True`):
@@ -870,9 +887,10 @@ class LatentConsistencyModelImg2ImgPipeline(
870
887
  else self.scheduler.config.original_inference_steps
871
888
  )
872
889
  latent_timestep = timesteps[:1]
873
- latents = self.prepare_latents(
874
- image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
875
- )
890
+ if latents is None:
891
+ latents = self.prepare_latents(
892
+ image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
893
+ )
876
894
  bs = batch_size * num_images_per_prompt
877
895
 
878
896
  # 6. Get Guidance Scale Embedding
@@ -67,6 +67,7 @@ def retrieve_timesteps(
67
67
  num_inference_steps: Optional[int] = None,
68
68
  device: Optional[Union[str, torch.device]] = None,
69
69
  timesteps: Optional[List[int]] = None,
70
+ sigmas: Optional[List[float]] = None,
70
71
  **kwargs,
71
72
  ):
72
73
  """
@@ -77,19 +78,23 @@ def retrieve_timesteps(
77
78
  scheduler (`SchedulerMixin`):
78
79
  The scheduler to get timesteps from.
79
80
  num_inference_steps (`int`):
80
- The number of diffusion steps used when generating samples with a pre-trained model. If used,
81
- `timesteps` must be `None`.
81
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
82
+ must be `None`.
82
83
  device (`str` or `torch.device`, *optional*):
83
84
  The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
84
85
  timesteps (`List[int]`, *optional*):
85
- Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
86
- timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
87
- must be `None`.
86
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
87
+ `num_inference_steps` and `sigmas` must be `None`.
88
+ sigmas (`List[float]`, *optional*):
89
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
90
+ `num_inference_steps` and `timesteps` must be `None`.
88
91
 
89
92
  Returns:
90
93
  `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
91
94
  second element is the number of inference steps.
92
95
  """
96
+ if timesteps is not None and sigmas is not None:
97
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
93
98
  if timesteps is not None:
94
99
  accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
95
100
  if not accepts_timesteps:
@@ -100,6 +105,16 @@ def retrieve_timesteps(
100
105
  scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
101
106
  timesteps = scheduler.timesteps
102
107
  num_inference_steps = len(timesteps)
108
+ elif sigmas is not None:
109
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
110
+ if not accept_sigmas:
111
+ raise ValueError(
112
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
113
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
114
+ )
115
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
116
+ timesteps = scheduler.timesteps
117
+ num_inference_steps = len(timesteps)
103
118
  else:
104
119
  scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
105
120
  timesteps = scheduler.timesteps
@@ -206,8 +221,8 @@ class LatentConsistencyModelPipeline(
206
221
  num_images_per_prompt,
207
222
  do_classifier_free_guidance,
208
223
  negative_prompt=None,
209
- prompt_embeds: Optional[torch.FloatTensor] = None,
210
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
224
+ prompt_embeds: Optional[torch.Tensor] = None,
225
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
211
226
  lora_scale: Optional[float] = None,
212
227
  clip_skip: Optional[int] = None,
213
228
  ):
@@ -227,10 +242,10 @@ class LatentConsistencyModelPipeline(
227
242
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
228
243
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
229
244
  less than `1`).
230
- prompt_embeds (`torch.FloatTensor`, *optional*):
245
+ prompt_embeds (`torch.Tensor`, *optional*):
231
246
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
232
247
  provided, text embeddings will be generated from `prompt` input argument.
233
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
248
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
234
249
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
235
250
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
236
251
  argument.
@@ -474,7 +489,12 @@ class LatentConsistencyModelPipeline(
474
489
 
475
490
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
476
491
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
477
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
492
+ shape = (
493
+ batch_size,
494
+ num_channels_latents,
495
+ int(height) // self.vae_scale_factor,
496
+ int(width) // self.vae_scale_factor,
497
+ )
478
498
  if isinstance(generator, list) and len(generator) != batch_size:
479
499
  raise ValueError(
480
500
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -490,20 +510,22 @@ class LatentConsistencyModelPipeline(
490
510
  latents = latents * self.scheduler.init_noise_sigma
491
511
  return latents
492
512
 
493
- def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
513
+ def get_guidance_scale_embedding(
514
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
515
+ ) -> torch.Tensor:
494
516
  """
495
517
  See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
496
518
 
497
519
  Args:
498
- timesteps (`torch.Tensor`):
499
- generate embedding vectors at these timesteps
520
+ w (`torch.Tensor`):
521
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
500
522
  embedding_dim (`int`, *optional*, defaults to 512):
501
- dimension of the embeddings to generate
502
- dtype:
503
- data type of the generated embeddings
523
+ Dimension of the embeddings to generate.
524
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
525
+ Data type of the generated embeddings.
504
526
 
505
527
  Returns:
506
- `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
528
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
507
529
  """
508
530
  assert len(w.shape) == 1
509
531
  w = w * 1000.0
@@ -543,7 +565,7 @@ class LatentConsistencyModelPipeline(
543
565
  height: int,
544
566
  width: int,
545
567
  callback_steps: int,
546
- prompt_embeds: Optional[torch.FloatTensor] = None,
568
+ prompt_embeds: Optional[torch.Tensor] = None,
547
569
  ip_adapter_image=None,
548
570
  ip_adapter_image_embeds=None,
549
571
  callback_on_step_end_tensor_inputs=None,
@@ -624,10 +646,10 @@ class LatentConsistencyModelPipeline(
624
646
  guidance_scale: float = 8.5,
625
647
  num_images_per_prompt: Optional[int] = 1,
626
648
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
627
- latents: Optional[torch.FloatTensor] = None,
628
- prompt_embeds: Optional[torch.FloatTensor] = None,
649
+ latents: Optional[torch.Tensor] = None,
650
+ prompt_embeds: Optional[torch.Tensor] = None,
629
651
  ip_adapter_image: Optional[PipelineImageInput] = None,
630
- ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
652
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
631
653
  output_type: Optional[str] = "pil",
632
654
  return_dict: bool = True,
633
655
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -669,20 +691,20 @@ class LatentConsistencyModelPipeline(
669
691
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
670
692
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
671
693
  generation deterministic.
672
- latents (`torch.FloatTensor`, *optional*):
694
+ latents (`torch.Tensor`, *optional*):
673
695
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
674
696
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
675
697
  tensor is generated by sampling using the supplied random `generator`.
676
- prompt_embeds (`torch.FloatTensor`, *optional*):
698
+ prompt_embeds (`torch.Tensor`, *optional*):
677
699
  Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
678
700
  provided, text embeddings are generated from the `prompt` input argument.
679
701
  ip_adapter_image: (`PipelineImageInput`, *optional*):
680
702
  Optional image input to work with IP Adapters.
681
- ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
682
- Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
683
- Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
684
- if `do_classifier_free_guidance` is set to `True`.
685
- If not provided, embeddings are computed from the `ip_adapter_image` input argument.
703
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
704
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
705
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
706
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
707
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
686
708
  output_type (`str`, *optional*, defaults to `"pil"`):
687
709
  The output format of the generated image. Choose between `PIL.Image` or `np.array`.
688
710
  return_dict (`bool`, *optional*, defaults to `True`):