diffusers 0.27.2__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +19 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +20 -26
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +42 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
  229. diffusers/schedulers/scheduling_edm_euler.py +50 -31
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
  231. diffusers/schedulers/scheduling_euler_discrete.py +160 -68
  232. diffusers/schedulers/scheduling_heun_discrete.py +57 -39
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +24 -26
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/METADATA +47 -47
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/WHEEL +1 -1
  267. diffusers-0.27.2.dist-info/RECORD +0 -399
  268. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  269. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py (file 189 above)

@@ -21,11 +21,12 @@ import PIL.Image
  import torch
  from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

- from ...image_processor import PipelineImageInput, VaeImageProcessor
+ from ...image_processor import PipelineImageInput
  from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
  from ...schedulers import EulerDiscreteScheduler
  from ...utils import BaseOutput, logging, replace_example_docstring
  from ...utils.torch_utils import is_compiled_module, randn_tensor
+ from ...video_processor import VideoProcessor
  from ..pipeline_utils import DiffusionPipeline


@@ -37,10 +38,14 @@ EXAMPLE_DOC_STRING = """
  >>> from diffusers import StableVideoDiffusionPipeline
  >>> from diffusers.utils import load_image, export_to_video

- >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
+ >>> pipe = StableVideoDiffusionPipeline.from_pretrained(
+ ... "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+ ... )
  >>> pipe.to("cuda")

- >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg")
+ >>> image = load_image(
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
+ ... )
  >>> image = image.resize((1024, 576))

  >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
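Note: the reformatted docstring example above is cut off by the hunk boundary before the generated frames are written out. Assembled end to end, with an assumed export_to_video call whose output path and fps value are illustrative rather than taken from this hunk, the 0.28.0 usage reads roughly as follows:

    import torch
    from diffusers import StableVideoDiffusionPipeline
    from diffusers.utils import load_image, export_to_video

    pipe = StableVideoDiffusionPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
    )
    pipe.to("cuda")

    image = load_image(
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
    )
    image = image.resize((1024, 576))

    # 25 frames, decoded 8 at a time to limit memory use
    frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
    # assumed continuation: write the frames to disk
    export_to_video(frames, "generated.mp4", fps=7)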
@@ -57,26 +62,64 @@ def _append_dims(x, target_dims):
  return x[(...,) + (None,) * dims_to_append]


- # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
- def tensor2vid(video: torch.Tensor, processor: VaeImageProcessor, output_type: str = "np"):
- batch_size, channels, num_frames, height, width = video.shape
- outputs = []
- for batch_idx in range(batch_size):
- batch_vid = video[batch_idx].permute(1, 0, 2, 3)
- batch_output = processor.postprocess(batch_vid, output_type)
-
- outputs.append(batch_output)
-
- if output_type == "np":
- outputs = np.stack(outputs)
-
- elif output_type == "pt":
- outputs = torch.stack(outputs)
-
- elif not output_type == "pil":
- raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+ def retrieve_timesteps(
+ scheduler,
+ num_inference_steps: Optional[int] = None,
+ device: Optional[Union[str, torch.device]] = None,
+ timesteps: Optional[List[int]] = None,
+ sigmas: Optional[List[float]] = None,
+ **kwargs,
+ ):
+ """
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

- return outputs
+ Args:
+ scheduler (`SchedulerMixin`):
+ The scheduler to get timesteps from.
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+ must be `None`.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+ `num_inference_steps` and `sigmas` must be `None`.
+ sigmas (`List[float]`, *optional*):
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+ `num_inference_steps` and `timesteps` must be `None`.
+
+ Returns:
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+ second element is the number of inference steps.
+ """
+ if timesteps is not None and sigmas is not None:
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+ if timesteps is not None:
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accepts_timesteps:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" timestep schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ elif sigmas is not None:
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accept_sigmas:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ return timesteps, num_inference_steps


  @dataclass
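The new retrieve_timesteps helper treats `num_inference_steps`, `timesteps` and `sigmas` as mutually exclusive ways of driving `scheduler.set_timesteps`. A minimal sketch of how a caller might use it; the import path assumes the hunks above belong to pipeline_stable_video_diffusion.py (as the file list indicates), the sigma values are purely illustrative, and EulerDiscreteScheduler's expanded set_timesteps is assumed to accept a `sigmas` argument in 0.28.0:

    from diffusers import EulerDiscreteScheduler
    from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import retrieve_timesteps

    scheduler = EulerDiscreteScheduler.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt", subfolder="scheduler"
    )

    # default path: derive the schedule from a step count
    timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=25, device="cpu")

    # alternative path: hand the scheduler an explicit sigma schedule and infer the step count from it
    custom_sigmas = [700.0, 100.0, 30.0, 10.0, 3.0, 1.0, 0.3, 0.1, 0.03, 0.01, 0.0]
    timesteps, num_inference_steps = retrieve_timesteps(scheduler, device="cpu", sigmas=custom_sigmas)

    # passing both timesteps and sigmas raises a ValueError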
@@ -85,12 +128,12 @@ class StableVideoDiffusionPipelineOutput(BaseOutput):
  Output class for Stable Video Diffusion pipeline.

  Args:
- frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]):
- List of denoised PIL images of length `batch_size` or numpy array or torch tensor
- of shape `(batch_size, num_frames, height, width, num_channels)`.
+ frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]):
+ List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
+ num_frames, height, width, num_channels)`.
  """

- frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor]
+ frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]


  class StableVideoDiffusionPipeline(DiffusionPipeline):
@@ -104,7 +147,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  vae ([`AutoencoderKLTemporalDecoder`]):
  Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
  image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
- Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
+ Frozen CLIP image-encoder
+ ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
  unet ([`UNetSpatioTemporalConditionModel`]):
  A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
  scheduler ([`EulerDiscreteScheduler`]):
@@ -134,7 +178,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  feature_extractor=feature_extractor,
  )
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
- self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+ self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)

  def _encode_image(
  self,
@@ -142,12 +186,12 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  device: Union[str, torch.device],
  num_videos_per_prompt: int,
  do_classifier_free_guidance: bool,
- ) -> torch.FloatTensor:
+ ) -> torch.Tensor:
  dtype = next(self.image_encoder.parameters()).dtype

  if not isinstance(image, torch.Tensor):
- image = self.image_processor.pil_to_numpy(image)
- image = self.image_processor.numpy_to_pt(image)
+ image = self.video_processor.pil_to_numpy(image)
+ image = self.video_processor.numpy_to_pt(image)

  # We normalize the image before resizing to match with the original implementation.
  # Then we unnormalize it after resizing.
@@ -194,6 +238,9 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  image = image.to(device=device)
  image_latents = self.vae.encode(image).latent_dist.mode()

+ # duplicate image_latents for each generation per prompt, using mps friendly method
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
+
  if do_classifier_free_guidance:
  negative_image_latents = torch.zeros_like(image_latents)

@@ -202,9 +249,6 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  # to avoid doing two forward passes
  image_latents = torch.cat([negative_image_latents, image_latents])

- # duplicate image_latents for each generation per prompt, using mps friendly method
- image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
-
  return image_latents

  def _get_add_time_ids(
@@ -235,7 +279,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

  return add_time_ids

- def decode_latents(self, latents: torch.FloatTensor, num_frames: int, decode_chunk_size: int = 14):
+ def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
  # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
  latents = latents.flatten(0, 1)

@@ -271,7 +315,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  and not isinstance(image, list)
  ):
  raise ValueError(
- "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+ "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
  f" {type(image)}"
  )

@@ -288,7 +332,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  dtype: torch.dtype,
  device: Union[str, torch.device],
  generator: torch.Generator,
- latents: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
  ):
  shape = (
  batch_size,
@@ -333,11 +377,12 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  @replace_example_docstring(EXAMPLE_DOC_STRING)
  def __call__(
  self,
- image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
  height: int = 576,
  width: int = 1024,
  num_frames: Optional[int] = None,
  num_inference_steps: int = 25,
+ sigmas: Optional[List[float]] = None,
  min_guidance_scale: float = 1.0,
  max_guidance_scale: float = 3.0,
  fps: int = 7,
@@ -346,7 +391,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  decode_chunk_size: Optional[int] = None,
  num_videos_per_prompt: Optional[int] = 1,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
  callback_on_step_end_tensor_inputs: List[str] = ["latents"],
@@ -356,39 +401,46 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  The call function to the pipeline for generation.

  Args:
- image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
- Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`.
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
+ Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
+ 1]`.
  height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
  The height in pixels of the generated image.
  width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
  The width in pixels of the generated image.
  num_frames (`int`, *optional*):
- The number of video frames to generate. Defaults to `self.unet.config.num_frames`
- (14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
+ The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
+ `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
  num_inference_steps (`int`, *optional*, defaults to 25):
  The number of denoising steps. More denoising steps usually lead to a higher quality video at the
  expense of slower inference. This parameter is modulated by `strength`.
+ sigmas (`List[float]`, *optional*):
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+ will be used.
  min_guidance_scale (`float`, *optional*, defaults to 1.0):
  The minimum guidance scale. Used for the classifier free guidance with first frame.
  max_guidance_scale (`float`, *optional*, defaults to 3.0):
  The maximum guidance scale. Used for the classifier free guidance with last frame.
  fps (`int`, *optional*, defaults to 7):
- Frames per second. The rate at which the generated images shall be exported to a video after generation.
- Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
+ Frames per second. The rate at which the generated images shall be exported to a video after
+ generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
  motion_bucket_id (`int`, *optional*, defaults to 127):
  Used for conditioning the amount of motion for the generation. The higher the number the more motion
  will be in the video.
  noise_aug_strength (`float`, *optional*, defaults to 0.02):
- The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion.
+ The amount of noise added to the init image, the higher it is the less the video will look like the
+ init image. Increase it for more motion.
  decode_chunk_size (`int`, *optional*):
- The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. By default, the decoder decodes all frames at once for maximal
- quality. For lower memory usage, reduce `decode_chunk_size`.
+ The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
+ expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
+ For lower memory usage, reduce `decode_chunk_size`.
  num_videos_per_prompt (`int`, *optional*, defaults to 1):
  The number of videos to generate per prompt.
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
  generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor is generated by sampling using the supplied random `generator`.
@@ -398,7 +450,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  A function that is called at the end of each denoising step during inference. The function is called
  with the following arguments:
  `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
- `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+ `callback_kwargs` will include a list of all tensors as specified by
+ `callback_on_step_end_tensor_inputs`.
  callback_on_step_end_tensor_inputs (`List`, *optional*):
  The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
  will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -411,8 +464,9 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

  Returns:
  [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
- If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
- otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned.
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
+ returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.Tensor`) is
+ returned.
  """
  # 0. Default height and width to unet
  height = height or self.unet.config.sample_size * self.vae_scale_factor
@@ -445,7 +499,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  fps = fps - 1

  # 4. Encode input image using VAE
- image = self.image_processor.preprocess(image, height=height, width=width).to(device)
+ image = self.video_processor.preprocess(image, height=height, width=width).to(device)
  noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
  image = image + noise_aug_strength * noise

@@ -482,8 +536,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  added_time_ids = added_time_ids.to(device)

  # 6. Prepare timesteps
- self.scheduler.set_timesteps(num_inference_steps, device=device)
- timesteps = self.scheduler.timesteps
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, sigmas)

  # 7. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
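Taken together with the new `sigmas` argument in `__call__`, this hunk means a caller can now hand the Stable Video Diffusion pipeline an explicit noise schedule instead of a step count. A short sketch, reusing the `pipe` and `image` objects from the docstring example earlier; the sigma values are illustrative and are not taken from this diff:

    # custom schedule: forwarded to retrieve_timesteps and from there to scheduler.set_timesteps
    custom_sigmas = [700.0, 100.0, 30.0, 10.0, 3.0, 1.0, 0.3, 0.1, 0.03, 0.01, 0.0]
    frames = pipe(image, sigmas=custom_sigmas, decode_chunk_size=8).frames[0]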
@@ -552,7 +605,7 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
  if needs_upcasting:
  self.vae.to(dtype=torch.float16)
  frames = self.decode_latents(latents, num_frames, decode_chunk_size)
- frames = tensor2vid(frames, self.image_processor, output_type=output_type)
+ frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)
  else:
  frames = latents

@@ -627,7 +680,7 @@ def _filter2d(input, kernel):

  height, width = tmp_kernel.shape[-2:]

- padding_shape: list[int] = _compute_padding([height, width])
+ padding_shape: List[int] = _compute_padding([height, width])
  input = torch.nn.functional.pad(input, padding_shape, mode="reflect")

  # kernel and input tensor reshape to align element-wise or batch-wise params
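The hunks above retire the module-local tensor2vid helper in favour of the new diffusers/video_processor.py module (file 263 in the listing). A rough standalone sketch of the replacement call, with a random tensor standing in for decoded frames; the tensor layout and value range are inferred from the removed tensor2vid code and are assumptions, not part of this diff:

    import torch
    from diffusers.video_processor import VideoProcessor

    # stand-in for decoded output: (batch, channels, num_frames, height, width), values in [-1, 1]
    video = torch.rand(1, 3, 14, 64, 64) * 2 - 1

    video_processor = VideoProcessor(do_resize=True, vae_scale_factor=8)
    pil_frames = video_processor.postprocess_video(video=video, output_type="pil")  # list (per batch item) of PIL frames
    np_frames = video_processor.postprocess_video(video=video, output_type="np")    # numpy array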
diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py (file 190 above)

@@ -124,6 +124,7 @@ def retrieve_timesteps(
  num_inference_steps: Optional[int] = None,
  device: Optional[Union[str, torch.device]] = None,
  timesteps: Optional[List[int]] = None,
+ sigmas: Optional[List[float]] = None,
  **kwargs,
  ):
  """
@@ -134,19 +135,23 @@ def retrieve_timesteps(
  scheduler (`SchedulerMixin`):
  The scheduler to get timesteps from.
  num_inference_steps (`int`):
- The number of diffusion steps used when generating samples with a pre-trained model. If used,
- `timesteps` must be `None`.
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+ must be `None`.
  device (`str` or `torch.device`, *optional*):
  The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
  timesteps (`List[int]`, *optional*):
- Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
- timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
- must be `None`.
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+ `num_inference_steps` and `sigmas` must be `None`.
+ sigmas (`List[float]`, *optional*):
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+ `num_inference_steps` and `timesteps` must be `None`.

  Returns:
  `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
  second element is the number of inference steps.
  """
+ if timesteps is not None and sigmas is not None:
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
  if timesteps is not None:
  accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
  if not accepts_timesteps:
@@ -157,6 +162,16 @@ def retrieve_timesteps(
  scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
  timesteps = scheduler.timesteps
  num_inference_steps = len(timesteps)
+ elif sigmas is not None:
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+ if not accept_sigmas:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
  else:
  scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
  timesteps = scheduler.timesteps
@@ -256,8 +271,8 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  **kwargs,
  ):
@@ -289,8 +304,8 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  num_images_per_prompt,
  do_classifier_free_guidance,
  negative_prompt=None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  lora_scale: Optional[float] = None,
  clip_skip: Optional[int] = None,
  ):
@@ -310,10 +325,10 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  The prompt or prompts not to guide the image generation. If not defined, one has to pass
  `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
  less than `1`).
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -569,7 +584,12 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):

  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
  def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+ shape = (
+ batch_size,
+ num_channels_latents,
+ int(height) // self.vae_scale_factor,
+ int(width) // self.vae_scale_factor,
+ )
  if isinstance(generator, list) and len(generator) != batch_size:
  raise ValueError(
  f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
@@ -613,20 +633,22 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  return height, width

  # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
- def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+ def get_guidance_scale_embedding(
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
+ ) -> torch.Tensor:
  """
  See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

  Args:
- timesteps (`torch.Tensor`):
- generate embedding vectors at these timesteps
+ w (`torch.Tensor`):
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
  embedding_dim (`int`, *optional*, defaults to 512):
- dimension of the embeddings to generate
- dtype:
- data type of the generated embeddings
+ Dimension of the embeddings to generate.
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+ Data type of the generated embeddings.

  Returns:
- `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
  """
  assert len(w.shape) == 1
  w = w * 1000.0
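This hunk only touches the signature and docstring of get_guidance_scale_embedding; the body lies outside the hunk. For readers of the docstring change, the computation it describes (copied from the LCM pipeline) is a sinusoidal embedding of the scaled guidance weight. A hedged standalone sketch under that assumption, with an illustrative function name rather than the library's method:

    import torch

    def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32) -> torch.Tensor:
        # sinusoidal features of the scaled guidance weight, as in the VDM reference linked above
        w = w * 1000.0
        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:  # zero-pad to the requested width for odd dims
            emb = torch.nn.functional.pad(emb, (0, 1))
        return emb

    # e.g. embed a guidance scale of 7.5 for a batch of one -> shape (1, 256)
    emb = guidance_scale_embedding(torch.tensor([7.5]), embedding_dim=256)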
@@ -662,17 +684,18 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  width: Optional[int] = None,
  num_inference_steps: int = 50,
  timesteps: List[int] = None,
+ sigmas: List[float] = None,
  guidance_scale: float = 7.5,
  negative_prompt: Optional[Union[str, List[str]]] = None,
  num_images_per_prompt: Optional[int] = 1,
  eta: float = 0.0,
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
- latents: Optional[torch.FloatTensor] = None,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
  callback_steps: int = 1,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
  adapter_conditioning_scale: Union[float, List[float]] = 1.0,
@@ -685,9 +708,9 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  prompt (`str` or `List[str]`, *optional*):
  The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
  instead.
- image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
+ image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
  The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
- type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
+ type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
  accepted as an image. The control image is automatically resized to fit the output image.
  height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
  The height in pixels of the generated image.
@@ -700,6 +723,10 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
  in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
  passed will be used. Must be in descending order.
+ sigmas (`List[float]`, *optional*):
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+ will be used.
  guidance_scale (`float`, *optional*, defaults to 7.5):
  Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
  `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -718,14 +745,14 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
  One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
  to make generation deterministic.
- latents (`torch.FloatTensor`, *optional*):
+ latents (`torch.Tensor`, *optional*):
  Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
  generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
  tensor will ge generated by sampling using the supplied random `generator`.
- prompt_embeds (`torch.FloatTensor`, *optional*):
+ prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
  provided, text embeddings will be generated from `prompt` input argument.
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
  Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
  weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
  argument.
@@ -737,7 +764,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  of a plain tuple.
  callback (`Callable`, *optional*):
  A function that will be called every `callback_steps` steps during inference. The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
  callback_steps (`int`, *optional*, defaults to 1):
  The frequency at which the `callback` function will be called. If not specified, the callback will be
  called at every step.
@@ -809,7 +836,9 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

  # 4. Prepare timesteps
- timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+ timesteps, num_inference_steps = retrieve_timesteps(
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
+ )

  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
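End to end, the adapter pipeline's new `sigmas` argument mirrors the existing `timesteps` one: exactly one of `num_inference_steps`, `timesteps` or `sigmas` drives the schedule, and the underlying scheduler must expose a `sigmas` argument on `set_timesteps`. A hedged usage sketch; the checkpoint IDs, placeholder conditioning image and sigma values below are illustrative and not taken from this diff:

    import torch
    from PIL import Image
    from diffusers import StableDiffusionAdapterPipeline, T2IAdapter, EulerDiscreteScheduler

    adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd15v2", torch_dtype=torch.float16)
    pipe = StableDiffusionAdapterPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", adapter=adapter, torch_dtype=torch.float16
    ).to("cuda")
    # swap in a scheduler assumed to accept `sigmas` in 0.28.0 (see the scheduling_euler_discrete.py entry in the file list)
    pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)

    control = Image.new("RGB", (512, 512))  # placeholder conditioning image; a real canny map in practice
    custom_sigmas = [14.6, 8.0, 4.4, 2.4, 1.3, 0.7, 0.3, 0.1, 0.0]  # illustrative, descending, ending at 0.0
    image = pipe(
        "a photo of a house", image=control, sigmas=custom_sigmas, guidance_scale=7.5
    ).images[0]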