diffusers 0.27.2__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (270)
  1. diffusers/__init__.py +18 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +16 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +2 -0
  21. diffusers/models/activations.py +11 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +367 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +147 -24
  35. diffusers/models/model_loading_utils.py +149 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  39. diffusers/models/modeling_utils.py +118 -98
  40. diffusers/models/resnet.py +18 -23
  41. diffusers/models/transformer_temporal.py +3 -3
  42. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  43. diffusers/models/transformers/prior_transformer.py +7 -7
  44. diffusers/models/transformers/t5_film_transformer.py +17 -19
  45. diffusers/models/transformers/transformer_2d.py +272 -156
  46. diffusers/models/transformers/transformer_temporal.py +10 -10
  47. diffusers/models/unets/unet_1d.py +5 -5
  48. diffusers/models/unets/unet_1d_blocks.py +29 -29
  49. diffusers/models/unets/unet_2d.py +6 -6
  50. diffusers/models/unets/unet_2d_blocks.py +137 -128
  51. diffusers/models/unets/unet_2d_condition.py +19 -15
  52. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  53. diffusers/models/unets/unet_3d_blocks.py +79 -77
  54. diffusers/models/unets/unet_3d_condition.py +13 -9
  55. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  56. diffusers/models/unets/unet_kandinsky3.py +1 -1
  57. diffusers/models/unets/unet_motion_model.py +114 -14
  58. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  59. diffusers/models/unets/unet_stable_cascade.py +16 -13
  60. diffusers/models/upsampling.py +17 -20
  61. diffusers/models/vq_model.py +16 -15
  62. diffusers/pipelines/__init__.py +25 -3
  63. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  64. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  65. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  66. diffusers/pipelines/animatediff/__init__.py +2 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  70. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  71. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  72. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  73. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  74. diffusers/pipelines/auto_pipeline.py +21 -17
  75. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  76. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  77. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  78. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  79. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  80. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  81. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  82. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  83. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  84. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  85. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  86. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  87. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  88. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  90. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  91. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  96. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  97. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  98. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  99. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  100. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  101. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  103. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
  104. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  105. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  106. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  107. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  108. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  109. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  110. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  111. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  112. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  113. diffusers/pipelines/dit/pipeline_dit.py +3 -0
  114. diffusers/pipelines/free_init_utils.py +39 -38
  115. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  116. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  117. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  118. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  119. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  120. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  121. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  122. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  123. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  124. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  125. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  126. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  127. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  128. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  129. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  130. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  131. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  132. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  133. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  134. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  135. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  136. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  137. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  138. diffusers/pipelines/marigold/__init__.py +50 -0
  139. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  140. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  141. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  142. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  143. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  144. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  145. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  146. diffusers/pipelines/pipeline_loading_utils.py +268 -23
  147. diffusers/pipelines/pipeline_utils.py +266 -37
  148. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  149. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +65 -75
  150. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  151. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  152. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  153. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  154. diffusers/pipelines/shap_e/renderer.py +1 -1
  155. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
  156. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  157. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  158. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  159. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  160. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  161. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  163. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  164. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  165. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  166. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  167. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  168. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  169. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  172. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  173. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  174. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
  175. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  176. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  177. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  178. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  179. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  180. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  181. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  182. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  183. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  184. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  185. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  186. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  187. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  188. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  189. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  190. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  191. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  192. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  193. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  194. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  195. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  196. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  197. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  198. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  199. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  200. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  201. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  202. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  203. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  204. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  205. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  206. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  207. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  208. diffusers/schedulers/__init__.py +2 -2
  209. diffusers/schedulers/deprecated/__init__.py +1 -1
  210. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  211. diffusers/schedulers/scheduling_amused.py +5 -5
  212. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  213. diffusers/schedulers/scheduling_consistency_models.py +20 -26
  214. diffusers/schedulers/scheduling_ddim.py +22 -24
  215. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  216. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  217. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  218. diffusers/schedulers/scheduling_ddpm.py +20 -22
  219. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  220. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  221. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  222. diffusers/schedulers/scheduling_deis_multistep.py +42 -42
  223. diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
  224. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  225. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  226. diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
  227. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
  228. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
  229. diffusers/schedulers/scheduling_edm_euler.py +50 -31
  230. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
  231. diffusers/schedulers/scheduling_euler_discrete.py +160 -68
  232. diffusers/schedulers/scheduling_heun_discrete.py +57 -39
  233. diffusers/schedulers/scheduling_ipndm.py +8 -8
  234. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
  235. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
  236. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  237. diffusers/schedulers/scheduling_lcm.py +21 -23
  238. diffusers/schedulers/scheduling_lms_discrete.py +24 -26
  239. diffusers/schedulers/scheduling_pndm.py +20 -20
  240. diffusers/schedulers/scheduling_repaint.py +20 -20
  241. diffusers/schedulers/scheduling_sasolver.py +55 -54
  242. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  243. diffusers/schedulers/scheduling_tcd.py +39 -30
  244. diffusers/schedulers/scheduling_unclip.py +15 -15
  245. diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
  246. diffusers/schedulers/scheduling_utils.py +14 -5
  247. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  248. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  249. diffusers/training_utils.py +56 -1
  250. diffusers/utils/__init__.py +7 -0
  251. diffusers/utils/doc_utils.py +1 -0
  252. diffusers/utils/dummy_pt_objects.py +30 -0
  253. diffusers/utils/dummy_torch_and_transformers_objects.py +90 -0
  254. diffusers/utils/dynamic_modules_utils.py +24 -11
  255. diffusers/utils/hub_utils.py +3 -2
  256. diffusers/utils/import_utils.py +91 -0
  257. diffusers/utils/loading_utils.py +2 -2
  258. diffusers/utils/logging.py +1 -1
  259. diffusers/utils/peft_utils.py +32 -5
  260. diffusers/utils/state_dict_utils.py +11 -2
  261. diffusers/utils/testing_utils.py +71 -6
  262. diffusers/utils/torch_utils.py +1 -0
  263. diffusers/video_processor.py +113 -0
  264. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/METADATA +47 -47
  265. diffusers-0.28.0.dist-info/RECORD +414 -0
  266. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/WHEEL +1 -1
  267. diffusers-0.27.2.dist-info/RECORD +0 -399
  268. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/LICENSE +0 -0
  269. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/entry_points.txt +0 -0
  270. {diffusers-0.27.2.dist-info → diffusers-0.28.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/__init__.py

@@ -24,6 +24,7 @@ _import_structure = {
     "deprecated": [],
     "latent_diffusion": [],
     "ledits_pp": [],
+    "marigold": [],
     "stable_diffusion": [],
     "stable_diffusion_xl": [],
 }
@@ -114,6 +115,7 @@ else:
     _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"]
     _import_structure["animatediff"] = [
         "AnimateDiffPipeline",
+        "AnimateDiffSDXLPipeline",
         "AnimateDiffVideoToVideoPipeline",
     ]
     _import_structure["audioldm"] = ["AudioLDMPipeline"]
@@ -134,6 +136,12 @@ else:
             "StableDiffusionXLControlNetPipeline",
         ]
     )
+    _import_structure["controlnet_xs"].extend(
+        [
+            "StableDiffusionControlNetXSPipeline",
+            "StableDiffusionXLControlNetXSPipeline",
+        ]
+    )
     _import_structure["deepfloyd_if"] = [
         "IFImg2ImgPipeline",
         "IFImg2ImgSuperResolutionPipeline",
@@ -178,10 +186,16 @@ else:
             "LEditsPPPipelineStableDiffusionXL",
         ]
     )
+    _import_structure["marigold"].extend(
+        [
+            "MarigoldDepthPipeline",
+            "MarigoldNormalsPipeline",
+        ]
+    )
     _import_structure["musicldm"] = ["MusicLDMPipeline"]
     _import_structure["paint_by_example"] = ["PaintByExamplePipeline"]
     _import_structure["pia"] = ["PIAPipeline"]
-    _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline"]
+    _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline", "PixArtSigmaPipeline"]
     _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
     _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"]
     _import_structure["stable_cascade"] = [
@@ -361,7 +375,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         from ..utils.dummy_torch_and_transformers_objects import *
     else:
         from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline
-        from .animatediff import AnimateDiffPipeline, AnimateDiffVideoToVideoPipeline
+        from .animatediff import AnimateDiffPipeline, AnimateDiffSDXLPipeline, AnimateDiffVideoToVideoPipeline
         from .audioldm import AudioLDMPipeline
         from .audioldm2 import (
             AudioLDM2Pipeline,
@@ -378,6 +392,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             StableDiffusionXLControlNetInpaintPipeline,
             StableDiffusionXLControlNetPipeline,
         )
+        from .controlnet_xs import (
+            StableDiffusionControlNetXSPipeline,
+            StableDiffusionXLControlNetXSPipeline,
+        )
         from .deepfloyd_if import (
             IFImg2ImgPipeline,
             IFImg2ImgSuperResolutionPipeline,
@@ -437,10 +455,14 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             LEditsPPPipelineStableDiffusion,
             LEditsPPPipelineStableDiffusionXL,
         )
+        from .marigold import (
+            MarigoldDepthPipeline,
+            MarigoldNormalsPipeline,
+        )
         from .musicldm import MusicLDMPipeline
         from .paint_by_example import PaintByExamplePipeline
         from .pia import PIAPipeline
-        from .pixart_alpha import PixArtAlphaPipeline
+        from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
         from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
         from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
         from .stable_cascade import (
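
Taken together, these registry changes expose six new pipeline classes at the top level of the package. A minimal sketch of the new imports under 0.28.0 (the PixArt-Sigma checkpoint id is an assumption for illustration, not part of this diff):

import torch
from diffusers import (
    AnimateDiffSDXLPipeline,
    MarigoldDepthPipeline,
    MarigoldNormalsPipeline,
    PixArtSigmaPipeline,
    StableDiffusionControlNetXSPipeline,
    StableDiffusionXLControlNetXSPipeline,
)

# For example, the new PixArt-Sigma text-to-image pipeline loads like any other
# DiffusionPipeline (checkpoint id assumed for illustration):
pipe = PixArtSigmaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
)
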
diffusers/pipelines/amused/pipeline_amused.py

@@ -30,9 +30,7 @@ EXAMPLE_DOC_STRING = """
         >>> import torch
         >>> from diffusers import AmusedPipeline
 
-        >>> pipe = AmusedPipeline.from_pretrained(
-        ...     "amused/amused-512", variant="fp16", torch_dtype=torch.float16
-        ... )
+        >>> pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
         >>> pipe = pipe.to("cuda")
 
         >>> prompt = "a photo of an astronaut riding a horse on mars"
@@ -90,7 +88,7 @@ class AmusedPipeline(DiffusionPipeline):
         negative_encoder_hidden_states: Optional[torch.Tensor] = None,
         output_type="pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         micro_conditioning_aesthetic_score: int = 6,
@@ -124,16 +122,16 @@ class AmusedPipeline(DiffusionPipeline):
             latents (`torch.IntTensor`, *optional*):
                 Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image
                 gneration. If not provided, the starting latents will be completely masked.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument. A single vector from the
                 pooled and projected final hidden states.
-            encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            encoder_hidden_states (`torch.Tensor`, *optional*):
                 Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            negative_encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            negative_encoder_hidden_states (`torch.Tensor`, *optional*):
                 Analogous to `encoder_hidden_states` for the positive prompt.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
@@ -142,7 +140,7 @@ class AmusedPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -150,10 +148,12 @@ class AmusedPipeline(DiffusionPipeline):
                 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
-                The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/
-                and the micro-conditioning section of https://arxiv.org/abs/2307.01952.
+                The targeted aesthetic score according to the laion aesthetic classifier. See
+                https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
             micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
-                The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952.
+                The targeted height, width crop coordinates. See the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
             temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
                 Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
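
Every edit in this file belongs to the release-wide migration of public type hints and docstrings from `torch.FloatTensor` to `torch.Tensor`; runtime behavior is unchanged. A callback written against the updated signature would look like this (a sketch reusing the checkpoint from the example docstring above):

import torch
from diffusers import AmusedPipeline

pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Per the 0.28.0 docstring, the callback receives the latents as a plain torch.Tensor.
def log_step(step: int, timestep: int, latents: torch.Tensor) -> None:
    print(f"step {step}: timestep {timestep}, latents shape {tuple(latents.shape)}")

image = pipe("a photo of an astronaut riding a horse on mars", callback=log_step, callback_steps=4).images[0]
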
diffusers/pipelines/amused/pipeline_amused_img2img.py

@@ -102,7 +102,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
         negative_encoder_hidden_states: Optional[torch.Tensor] = None,
         output_type="pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         micro_conditioning_aesthetic_score: int = 6,
@@ -115,7 +115,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
                 numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
                 or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
@@ -127,7 +127,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
                 on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
                 process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
                 essentially ignores `image`.
-            num_inference_steps (`int`, *optional*, defaults to 16):
+            num_inference_steps (`int`, *optional*, defaults to 12):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, *optional*, defaults to 10.0):
@@ -141,16 +141,16 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument. A single vector from the
                 pooled and projected final hidden states.
-            encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            encoder_hidden_states (`torch.Tensor`, *optional*):
                 Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            negative_encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            negative_encoder_hidden_states (`torch.Tensor`, *optional*):
                 Analogous to `encoder_hidden_states` for the positive prompt.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
@@ -159,7 +159,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -167,10 +167,12 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
                 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
-                The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/
-                and the micro-conditioning section of https://arxiv.org/abs/2307.01952.
+                The targeted aesthetic score according to the laion aesthetic classifier. See
+                https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
             micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
-                The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952.
+                The targeted height, width crop coordinates. See the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
             temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
                 Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
@@ -191,7 +193,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
             negative_prompt_embeds is None and negative_encoder_hidden_states is not None
         ):
             raise ValueError(
-                "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither"
+                "pass either both `negative_prompt_embeds` and `negative_encoder_hidden_states` or neither"
             )
 
         if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None):
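
As the `strength` description above spells out, lower values preserve more of the input image while `strength=1` effectively ignores it. A sketch of an img2img call (the input path is a placeholder):

import torch
from PIL import Image
from diffusers import AmusedImg2ImgPipeline

pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

init_image = Image.open("input.png").convert("RGB").resize((512, 512))  # placeholder input

# strength=0.5 runs roughly half of the denoising schedule on top of init_image;
# strength=1.0 would add maximum noise and essentially ignore it (see docstring above).
image = pipe("winter mountains", image=init_image, strength=0.5).images[0]
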
diffusers/pipelines/amused/pipeline_amused_inpaint.py

@@ -119,7 +119,7 @@ class AmusedInpaintPipeline(DiffusionPipeline):
         negative_encoder_hidden_states: Optional[torch.Tensor] = None,
         output_type="pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         micro_conditioning_aesthetic_score: int = 6,
@@ -132,13 +132,13 @@ class AmusedInpaintPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
                 numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
                 or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
                 list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
                 latents as `image`, but if passing latents directly it is not encoded again.
-            mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
                 are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
                 single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
@@ -165,16 +165,16 @@ class AmusedInpaintPipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument. A single vector from the
                 pooled and projected final hidden states.
-            encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            encoder_hidden_states (`torch.Tensor`, *optional*):
                 Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            negative_encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            negative_encoder_hidden_states (`torch.Tensor`, *optional*):
                 Analogous to `encoder_hidden_states` for the positive prompt.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
@@ -183,7 +183,7 @@ class AmusedInpaintPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
@@ -191,10 +191,12 @@ class AmusedInpaintPipeline(DiffusionPipeline):
                 A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6):
-                The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/
-                and the micro-conditioning section of https://arxiv.org/abs/2307.01952.
+                The targeted aesthetic score according to the laion aesthetic classifier. See
+                https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
             micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)):
-                The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952.
+                The targeted height, width crop coordinates. See the micro-conditioning section of
+                https://arxiv.org/abs/2307.01952.
             temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)):
                 Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`.
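
The inpaint variant receives the same type migration; per the docstrings above, `image` and `mask_image` accept PIL images, numpy arrays, or tensors, with white mask pixels marking the region to repaint. A sketch of that input contract (file paths are placeholders):

import torch
from PIL import Image
from diffusers import AmusedInpaintPipeline

pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

init_image = Image.open("input.png").convert("RGB").resize((512, 512))  # placeholder input
mask_image = Image.open("mask.png").convert("L").resize((512, 512))     # white = repaint

image = pipe("fall mountains", image=init_image, mask_image=mask_image).images[0]
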
diffusers/pipelines/animatediff/__init__.py

@@ -22,6 +22,7 @@ except OptionalDependencyNotAvailable:
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"]
+    _import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"]
     _import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -33,6 +34,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
 
     else:
         from .pipeline_animatediff import AnimateDiffPipeline
+        from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline
         from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline
         from .pipeline_output import AnimateDiffPipelineOutput
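
With the class registered here, the SDXL variant loads the same way as the existing AnimateDiff pipeline. A sketch, assuming an SDXL motion-adapter checkpoint and the SDXL base model (the checkpoint ids are assumptions; this diff only registers the class):

import torch
from diffusers import AnimateDiffSDXLPipeline, MotionAdapter

# Checkpoint ids below are assumptions for illustration.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16)
pipe = AnimateDiffSDXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    motion_adapter=adapter,
    torch_dtype=torch.float16,
).to("cuda")

frames = pipe(prompt="a panda surfing on a wave, high quality", num_frames=16).frames[0]
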
diffusers/pipelines/animatediff/pipeline_animatediff.py

@@ -15,11 +15,10 @@
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Union
 
-import numpy as np
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
-from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...image_processor import PipelineImageInput
 from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
 from ...models.lora import adjust_lora_scale_text_encoder
@@ -41,6 +40,7 @@ from ...utils import (
     unscale_lora_layers,
 )
 from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
 from ..free_init_utils import FreeInitMixin
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from .pipeline_output import AnimateDiffPipelineOutput
@@ -65,27 +65,6 @@ EXAMPLE_DOC_STRING = """
 """
 
 
-def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
-    batch_size, channels, num_frames, height, width = video.shape
-    outputs = []
-    for batch_idx in range(batch_size):
-        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
-        batch_output = processor.postprocess(batch_vid, output_type)
-
-        outputs.append(batch_output)
-
-    if output_type == "np":
-        outputs = np.stack(outputs)
-
-    elif output_type == "pt":
-        outputs = torch.stack(outputs)
-
-    elif not output_type == "pil":
-        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
-
-    return outputs
-
-
 class AnimateDiffPipeline(
     DiffusionPipeline,
     StableDiffusionMixin,
@@ -131,7 +110,7 @@ class AnimateDiffPipeline(
         vae: AutoencoderKL,
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
+        unet: Union[UNet2DConditionModel, UNetMotionModel],
         motion_adapter: MotionAdapter,
         scheduler: Union[
             DDIMScheduler,
@@ -159,7 +138,7 @@ class AnimateDiffPipeline(
             image_encoder=image_encoder,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
     def encode_prompt(
@@ -169,8 +148,8 @@ class AnimateDiffPipeline(
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -190,10 +169,10 @@ class AnimateDiffPipeline(
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -584,11 +563,11 @@ class AnimateDiffPipeline(
         num_videos_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -625,27 +604,26 @@ class AnimateDiffPipeline(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                 `(batch_size, num_channel, num_frames, height, width)`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*):
                 Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
-                `np.array`.
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                 of a plain tuple.
@@ -663,7 +641,7 @@ class AnimateDiffPipeline(
             callback_on_step_end_tensor_inputs (`List`, *optional*):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeine class.
+                `._callback_tensor_inputs` attribute of your pipeline class.
 
         Examples:
 
@@ -792,7 +770,7 @@ class AnimateDiffPipeline(
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
 
         # 8. Denoising loop
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
+        with self.progress_bar(total=self._num_timesteps) as progress_bar:
             for i, t in enumerate(timesteps):
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
@@ -836,7 +814,7 @@ class AnimateDiffPipeline(
             video = latents
         else:
            video_tensor = self.decode_latents(latents)
-            video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
 
         # 10. Offload all models
         self.maybe_free_model_hooks()
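
The removed module-level `tensor2vid` helper is superseded by the new `VideoProcessor` class (see `diffusers/video_processor.py`, +113 lines, in the file list above). A rough equivalent for code that previously called the helper directly, based on the constructor arguments and call site shown in this hunk:

import torch
from diffusers.video_processor import VideoProcessor

# A dummy decoded video batch: (batch, channels, num_frames, height, width),
# as produced by decode_latents().
video_tensor = torch.rand(1, 3, 16, 64, 64)

processor = VideoProcessor(do_resize=False, vae_scale_factor=8)

# 0.27.x: video = tensor2vid(video_tensor, image_processor, output_type="np")
# 0.28.0:
video = processor.postprocess_video(video=video_tensor, output_type="np")
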