diffusers 0.26.3__py3-none-any.whl → 0.27.0__py3-none-any.whl

Files changed (299)
  1. diffusers/__init__.py +20 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/configuration_utils.py +7 -3
  7. diffusers/dependency_versions_check.py +1 -1
  8. diffusers/dependency_versions_table.py +2 -2
  9. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  10. diffusers/image_processor.py +110 -4
  11. diffusers/loaders/autoencoder.py +7 -8
  12. diffusers/loaders/controlnet.py +17 -8
  13. diffusers/loaders/ip_adapter.py +86 -23
  14. diffusers/loaders/lora.py +105 -310
  15. diffusers/loaders/lora_conversion_utils.py +1 -1
  16. diffusers/loaders/peft.py +1 -1
  17. diffusers/loaders/single_file.py +51 -12
  18. diffusers/loaders/single_file_utils.py +274 -49
  19. diffusers/loaders/textual_inversion.py +23 -4
  20. diffusers/loaders/unet.py +195 -41
  21. diffusers/loaders/utils.py +1 -1
  22. diffusers/models/__init__.py +3 -1
  23. diffusers/models/activations.py +9 -9
  24. diffusers/models/attention.py +26 -36
  25. diffusers/models/attention_flax.py +1 -1
  26. diffusers/models/attention_processor.py +171 -114
  27. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  28. diffusers/models/autoencoders/autoencoder_kl.py +3 -1
  29. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  30. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  31. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  32. diffusers/models/autoencoders/vae.py +1 -1
  33. diffusers/models/controlnet.py +1 -1
  34. diffusers/models/controlnet_flax.py +1 -1
  35. diffusers/models/downsampling.py +8 -12
  36. diffusers/models/dual_transformer_2d.py +1 -1
  37. diffusers/models/embeddings.py +3 -4
  38. diffusers/models/embeddings_flax.py +1 -1
  39. diffusers/models/lora.py +33 -10
  40. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  41. diffusers/models/modeling_flax_utils.py +1 -1
  42. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  43. diffusers/models/modeling_utils.py +4 -6
  44. diffusers/models/normalization.py +1 -1
  45. diffusers/models/resnet.py +31 -58
  46. diffusers/models/resnet_flax.py +1 -1
  47. diffusers/models/t5_film_transformer.py +1 -1
  48. diffusers/models/transformer_2d.py +1 -1
  49. diffusers/models/transformer_temporal.py +1 -1
  50. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  51. diffusers/models/transformers/t5_film_transformer.py +1 -1
  52. diffusers/models/transformers/transformer_2d.py +29 -31
  53. diffusers/models/transformers/transformer_temporal.py +1 -1
  54. diffusers/models/unet_1d.py +1 -1
  55. diffusers/models/unet_1d_blocks.py +1 -1
  56. diffusers/models/unet_2d.py +1 -1
  57. diffusers/models/unet_2d_blocks.py +1 -1
  58. diffusers/models/unet_2d_condition.py +1 -1
  59. diffusers/models/unets/__init__.py +1 -0
  60. diffusers/models/unets/unet_1d.py +1 -1
  61. diffusers/models/unets/unet_1d_blocks.py +1 -1
  62. diffusers/models/unets/unet_2d.py +4 -4
  63. diffusers/models/unets/unet_2d_blocks.py +238 -98
  64. diffusers/models/unets/unet_2d_blocks_flax.py +1 -1
  65. diffusers/models/unets/unet_2d_condition.py +420 -323
  66. diffusers/models/unets/unet_2d_condition_flax.py +21 -12
  67. diffusers/models/unets/unet_3d_blocks.py +50 -40
  68. diffusers/models/unets/unet_3d_condition.py +47 -8
  69. diffusers/models/unets/unet_i2vgen_xl.py +75 -30
  70. diffusers/models/unets/unet_kandinsky3.py +1 -1
  71. diffusers/models/unets/unet_motion_model.py +48 -8
  72. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  73. diffusers/models/unets/unet_stable_cascade.py +610 -0
  74. diffusers/models/unets/uvit_2d.py +1 -1
  75. diffusers/models/upsampling.py +10 -16
  76. diffusers/models/vae_flax.py +1 -1
  77. diffusers/models/vq_model.py +1 -1
  78. diffusers/optimization.py +1 -1
  79. diffusers/pipelines/__init__.py +26 -0
  80. diffusers/pipelines/amused/pipeline_amused.py +1 -1
  81. diffusers/pipelines/amused/pipeline_amused_img2img.py +1 -1
  82. diffusers/pipelines/amused/pipeline_amused_inpaint.py +1 -1
  83. diffusers/pipelines/animatediff/pipeline_animatediff.py +162 -417
  84. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +165 -137
  85. diffusers/pipelines/animatediff/pipeline_output.py +7 -6
  86. diffusers/pipelines/audioldm/pipeline_audioldm.py +3 -19
  87. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  88. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +3 -3
  89. diffusers/pipelines/auto_pipeline.py +7 -16
  90. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  93. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  94. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  95. diffusers/pipelines/controlnet/pipeline_controlnet.py +90 -90
  96. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  97. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +98 -90
  98. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +92 -90
  99. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +145 -70
  100. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +126 -89
  101. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +108 -96
  102. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  103. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +1 -1
  104. diffusers/pipelines/ddim/pipeline_ddim.py +1 -1
  105. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  106. diffusers/pipelines/deepfloyd_if/pipeline_if.py +4 -4
  107. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +4 -4
  108. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +5 -5
  109. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +4 -4
  110. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +5 -5
  111. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +5 -5
  112. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +10 -120
  113. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -91
  114. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  115. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +1 -1
  116. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  117. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +1 -1
  118. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  119. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  120. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  121. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  122. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  123. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  124. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +5 -4
  125. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +5 -4
  126. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +7 -22
  127. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -39
  128. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +5 -5
  129. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  130. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -22
  131. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  132. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  133. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -2
  134. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  135. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  136. diffusers/pipelines/free_init_utils.py +184 -0
  137. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +22 -104
  138. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +1 -1
  139. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  140. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +1 -1
  141. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +2 -2
  142. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -1
  143. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +1 -1
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -1
  145. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +1 -1
  146. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
  147. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  148. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +2 -2
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +104 -93
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +112 -74
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/ledits_pp/__init__.py +55 -0
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +1505 -0
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +1797 -0
  155. diffusers/pipelines/ledits_pp/pipeline_output.py +43 -0
  156. diffusers/pipelines/musicldm/pipeline_musicldm.py +3 -19
  157. diffusers/pipelines/onnx_utils.py +1 -1
  158. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  159. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +3 -3
  160. diffusers/pipelines/pia/pipeline_pia.py +168 -327
  161. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  162. diffusers/pipelines/pipeline_loading_utils.py +508 -0
  163. diffusers/pipelines/pipeline_utils.py +188 -534
  164. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +56 -10
  165. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +3 -3
  166. diffusers/pipelines/shap_e/camera.py +1 -1
  167. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  168. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  169. diffusers/pipelines/shap_e/renderer.py +1 -1
  170. diffusers/pipelines/stable_cascade/__init__.py +50 -0
  171. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +482 -0
  172. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +311 -0
  173. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +638 -0
  174. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  175. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +4 -1
  176. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  177. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +2 -2
  178. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  179. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +1 -1
  180. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +1 -1
  181. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -1
  182. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +1 -1
  183. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +90 -146
  184. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  185. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +4 -32
  186. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -119
  187. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +92 -119
  188. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +13 -59
  189. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +3 -31
  190. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -33
  191. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +5 -21
  192. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -21
  193. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  194. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  195. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  196. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +5 -21
  197. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +9 -38
  198. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -34
  199. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +6 -35
  200. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +7 -6
  201. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +4 -124
  202. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +282 -80
  203. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +94 -46
  204. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +3 -3
  205. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  206. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +6 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  208. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +96 -148
  209. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +98 -154
  210. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +98 -153
  211. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +25 -87
  212. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +89 -80
  213. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +5 -49
  214. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +80 -88
  215. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +8 -6
  216. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +15 -86
  217. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +20 -93
  218. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +5 -5
  219. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +3 -19
  220. diffusers/pipelines/unclip/pipeline_unclip.py +1 -1
  221. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +1 -1
  222. diffusers/pipelines/unclip/text_proj.py +1 -1
  223. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +35 -35
  224. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  225. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +4 -21
  226. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +2 -2
  227. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -5
  228. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  229. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +1 -1
  230. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +2 -2
  231. diffusers/schedulers/__init__.py +7 -1
  232. diffusers/schedulers/deprecated/scheduling_karras_ve.py +1 -1
  233. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  234. diffusers/schedulers/scheduling_consistency_models.py +42 -19
  235. diffusers/schedulers/scheduling_ddim.py +2 -4
  236. diffusers/schedulers/scheduling_ddim_flax.py +13 -5
  237. diffusers/schedulers/scheduling_ddim_inverse.py +2 -4
  238. diffusers/schedulers/scheduling_ddim_parallel.py +2 -4
  239. diffusers/schedulers/scheduling_ddpm.py +2 -4
  240. diffusers/schedulers/scheduling_ddpm_flax.py +1 -1
  241. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -4
  242. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +1 -1
  243. diffusers/schedulers/scheduling_deis_multistep.py +46 -19
  244. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -21
  245. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +1 -1
  246. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +9 -7
  247. diffusers/schedulers/scheduling_dpmsolver_sde.py +35 -35
  248. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +49 -18
  249. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +683 -0
  250. diffusers/schedulers/scheduling_edm_euler.py +381 -0
  251. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +43 -15
  252. diffusers/schedulers/scheduling_euler_discrete.py +42 -17
  253. diffusers/schedulers/scheduling_euler_discrete_flax.py +1 -1
  254. diffusers/schedulers/scheduling_heun_discrete.py +35 -35
  255. diffusers/schedulers/scheduling_ipndm.py +37 -11
  256. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +44 -44
  257. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +44 -44
  258. diffusers/schedulers/scheduling_karras_ve_flax.py +1 -1
  259. diffusers/schedulers/scheduling_lcm.py +38 -14
  260. diffusers/schedulers/scheduling_lms_discrete.py +43 -15
  261. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  262. diffusers/schedulers/scheduling_pndm.py +2 -4
  263. diffusers/schedulers/scheduling_pndm_flax.py +2 -4
  264. diffusers/schedulers/scheduling_repaint.py +1 -1
  265. diffusers/schedulers/scheduling_sasolver.py +41 -9
  266. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  267. diffusers/schedulers/scheduling_sde_ve_flax.py +1 -1
  268. diffusers/schedulers/scheduling_tcd.py +686 -0
  269. diffusers/schedulers/scheduling_unclip.py +1 -1
  270. diffusers/schedulers/scheduling_unipc_multistep.py +46 -19
  271. diffusers/schedulers/scheduling_utils.py +2 -1
  272. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  273. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  274. diffusers/training_utils.py +9 -2
  275. diffusers/utils/__init__.py +2 -1
  276. diffusers/utils/accelerate_utils.py +1 -1
  277. diffusers/utils/constants.py +1 -1
  278. diffusers/utils/doc_utils.py +1 -1
  279. diffusers/utils/dummy_pt_objects.py +60 -0
  280. diffusers/utils/dummy_torch_and_transformers_objects.py +75 -0
  281. diffusers/utils/dynamic_modules_utils.py +1 -1
  282. diffusers/utils/export_utils.py +3 -3
  283. diffusers/utils/hub_utils.py +60 -16
  284. diffusers/utils/import_utils.py +15 -1
  285. diffusers/utils/loading_utils.py +2 -0
  286. diffusers/utils/logging.py +1 -1
  287. diffusers/utils/model_card_template.md +24 -0
  288. diffusers/utils/outputs.py +14 -7
  289. diffusers/utils/peft_utils.py +1 -1
  290. diffusers/utils/state_dict_utils.py +1 -1
  291. diffusers/utils/testing_utils.py +2 -0
  292. diffusers/utils/torch_utils.py +1 -1
  293. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/METADATA +46 -46
  294. diffusers-0.27.0.dist-info/RECORD +399 -0
  295. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/WHEEL +1 -1
  296. diffusers-0.26.3.dist-info/RECORD +0 -384
  297. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/LICENSE +0 -0
  298. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/entry_points.txt +0 -0
  299. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -34,7 +34,8 @@ from ...schedulers import (
  )
  from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
  from ...utils.torch_utils import randn_tensor
- from ..pipeline_utils import DiffusionPipeline
+ from ..free_init_utils import FreeInitMixin
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
  from .pipeline_output import AnimateDiffPipelineOutput


@@ -99,7 +100,7 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"):
  outputs = torch.stack(outputs)

  elif not output_type == "pil":
- raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil]")
+ raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")

  return outputs

@@ -163,7 +164,14 @@ def retrieve_timesteps(
  return timesteps, num_inference_steps


- class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
+ class AnimateDiffVideoToVideoPipeline(
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FreeInitMixin,
+ ):
  r"""
  Pipeline for video-to-video generation.

@@ -193,7 +201,7 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  """

  model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
- _optional_components = ["feature_extractor", "image_encoder"]
+ _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
  _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

  def __init__(
@@ -215,7 +223,8 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  image_encoder: CLIPVisionModelWithProjection = None,
  ):
  super().__init__()
- unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+ if isinstance(unet, UNet2DConditionModel):
+ unet = UNetMotionModel.from_unet2d(unet, motion_adapter)

  self.register_modules(
  vae=vae,
@@ -291,7 +300,7 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  batch_size = prompt_embeds.shape[0]

  if prompt_embeds is None:
- # textual inversion: procecss multi-vector tokens if necessary
+ # textual inversion: process multi-vector tokens if necessary
  if isinstance(self, TextualInversionLoaderMixin):
  prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

@@ -373,7 +382,7 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  else:
  uncond_tokens = negative_prompt

- # textual inversion: procecss multi-vector tokens if necessary
+ # textual inversion: process multi-vector tokens if necessary
  if isinstance(self, TextualInversionLoaderMixin):
  uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

@@ -437,6 +446,58 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM

  return image_embeds, uncond_image_embeds

+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+ def prepare_ip_adapter_image_embeds(
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+ ):
+ if ip_adapter_image_embeds is None:
+ if not isinstance(ip_adapter_image, list):
+ ip_adapter_image = [ip_adapter_image]
+
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+ raise ValueError(
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+ )
+
+ image_embeds = []
+ for single_ip_adapter_image, image_proj_layer in zip(
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+ ):
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
+ single_ip_adapter_image, device, 1, output_hidden_state
+ )
+ single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+ single_negative_image_embeds = torch.stack(
+ [single_negative_image_embeds] * num_images_per_prompt, dim=0
+ )
+
+ if do_classifier_free_guidance:
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+ single_image_embeds = single_image_embeds.to(device)
+
+ image_embeds.append(single_image_embeds)
+ else:
+ repeat_dims = [1]
+ image_embeds = []
+ for single_image_embeds in ip_adapter_image_embeds:
+ if do_classifier_free_guidance:
+ single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ single_negative_image_embeds = single_negative_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+ )
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+ else:
+ single_image_embeds = single_image_embeds.repeat(
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+ )
+ image_embeds.append(single_image_embeds)
+
+ return image_embeds
+
  # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
  def decode_latents(self, latents):
  latents = 1 / self.vae.config.scaling_factor * latents
@@ -445,83 +506,11 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

  image = self.vae.decode(latents).sample
- video = (
- image[None, :]
- .reshape(
- (
- batch_size,
- num_frames,
- -1,
- )
- + image.shape[2:]
- )
- .permute(0, 2, 1, 3, 4)
- )
+ video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
  # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
  video = video.float()
  return video

- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
  def prepare_extra_step_kwargs(self, generator, eta):
  # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -551,6 +540,8 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  negative_prompt=None,
  prompt_embeds=None,
  negative_prompt_embeds=None,
+ ip_adapter_image=None,
+ ip_adapter_image_embeds=None,
  callback_on_step_end_tensor_inputs=None,
  ):
  if strength < 0 or strength > 1:
@@ -595,12 +586,27 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  if video is not None and latents is not None:
  raise ValueError("Only one of `video` or `latents` should be provided")

- def get_timesteps(self, num_inference_steps, strength, device):
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+ raise ValueError(
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+ )
+
+ if ip_adapter_image_embeds is not None:
+ if not isinstance(ip_adapter_image_embeds, list):
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+ )
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+ raise ValueError(
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+ )
+
+ def get_timesteps(self, num_inference_steps, timesteps, strength, device):
  # get the original timestep using init_timestep
  init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

  t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+ timesteps = timesteps[t_start * self.scheduler.order :]

  return timesteps, num_inference_steps - t_start

@@ -742,6 +748,7 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  prompt_embeds: Optional[torch.FloatTensor] = None,
  negative_prompt_embeds: Optional[torch.FloatTensor] = None,
  ip_adapter_image: Optional[PipelineImageInput] = None,
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
  output_type: Optional[str] = "pil",
  return_dict: bool = True,
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -791,6 +798,11 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
  ip_adapter_image: (`PipelineImageInput`, *optional*):
  Optional image input to work with IP Adapters.
+ ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
+ Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
+ if `do_classifier_free_guidance` is set to `True`.
+ If not provided, embeddings are computed from the `ip_adapter_image` input argument.
  output_type (`str`, *optional*, defaults to `"pil"`):
  The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
  `np.array`.
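As a usage note on the `ip_adapter_image_embeds` argument documented in the hunk above, here is a minimal, hedged sketch of precomputing the IP-Adapter embeddings once and reusing them across calls. The checkpoint names, reference image, and input frames are illustrative placeholders, not taken from this diff; the call shapes follow the docstring and the `check_inputs` changes.

    import torch
    from diffusers import AnimateDiffVideoToVideoPipeline, MotionAdapter
    from diffusers.utils import load_image

    # Illustrative checkpoints: any SD 1.5 base plus a motion adapter should work here.
    adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
    pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
        "emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16
    ).to("cuda")
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

    style_image = load_image("style_reference.png")   # placeholder reference image
    input_frames = [style_image] * 16                  # placeholder: list of PIL frames from the source video

    # Encode the reference image once: the helper returns one tensor per loaded IP-Adapter,
    # with the negative embedding concatenated because classifier-free guidance is enabled.
    image_embeds = pipe.prepare_ip_adapter_image_embeds(
        ip_adapter_image=[style_image],
        ip_adapter_image_embeds=None,
        device=pipe.device,
        num_images_per_prompt=1,
        do_classifier_free_guidance=True,
    )

    # Reuse the precomputed embeddings on later calls instead of re-encoding the image.
    result = pipe(prompt="a corgi surfing", video=input_frames, ip_adapter_image_embeds=image_embeds)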
@@ -816,8 +828,8 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  Examples:

  Returns:
- [`AnimateDiffPipelineOutput`] or `tuple`:
- If `return_dict` is `True`, [`AnimateDiffPipelineOutput`] is
+ [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+ If `return_dict` is `True`, [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
  returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
  """

@@ -838,6 +850,8 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  negative_prompt_embeds=negative_prompt_embeds,
  video=video,
  latents=latents,
+ ip_adapter_image=ip_adapter_image,
+ ip_adapter_image_embeds=ip_adapter_image_embeds,
  callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
  )

@@ -877,19 +891,19 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  if self.do_classifier_free_guidance:
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

- if ip_adapter_image is not None:
- output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
- image_embeds, negative_image_embeds = self.encode_image(
- ip_adapter_image, device, num_videos_per_prompt, output_hidden_state
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+ image_embeds = self.prepare_ip_adapter_image_embeds(
+ ip_adapter_image,
+ ip_adapter_image_embeds,
+ device,
+ batch_size * num_videos_per_prompt,
+ self.do_classifier_free_guidance,
  )
- if self.do_classifier_free_guidance:
- image_embeds = torch.cat([negative_image_embeds, image_embeds])

  # 4. Prepare timesteps
  timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
- timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
  latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
- self._num_timesteps = len(timesteps)

  # 5. Prepare latent variables
  num_channels_latents = self.unet.config.in_channels
@@ -910,54 +924,68 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

  # 7. Add image embeds for IP-Adapter
- added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
- # 8. Denoising loop
- num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
- with self.progress_bar(total=num_inference_steps) as progress_bar:
- for i, t in enumerate(timesteps):
- # expand the latents if we are doing classifier free guidance
- latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
- # predict the noise residual
- noise_pred = self.unet(
- latent_model_input,
- t,
- encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=self.cross_attention_kwargs,
- added_cond_kwargs=added_cond_kwargs,
- ).sample
-
- # perform guidance
- if self.do_classifier_free_guidance:
- noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
- # compute the previous noisy sample x_t -> x_t-1
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
- if callback_on_step_end is not None:
- callback_kwargs = {}
- for k in callback_on_step_end_tensor_inputs:
- callback_kwargs[k] = locals()[k]
- callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
- latents = callback_outputs.pop("latents", latents)
- prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
- negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
- progress_bar.update()
+ added_cond_kwargs = (
+ {"image_embeds": image_embeds}
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+ else None
+ )

- if output_type == "latent":
- return AnimateDiffPipelineOutput(frames=latents)
+ num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+ for free_init_iter in range(num_free_init_iters):
+ if self.free_init_enabled:
+ latents, timesteps = self._apply_free_init(
+ latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+ )
+ num_inference_steps = len(timesteps)
+ # make sure to readjust timesteps based on strength
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
+
+ self._num_timesteps = len(timesteps)
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+ # 8. Denoising loop
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet(
+ latent_model_input,
+ t,
+ encoder_hidden_states=prompt_embeds,
+ cross_attention_kwargs=self.cross_attention_kwargs,
+ added_cond_kwargs=added_cond_kwargs,
+ ).sample
+
+ # perform guidance
+ if self.do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+ if callback_on_step_end is not None:
+ callback_kwargs = {}
+ for k in callback_on_step_end_tensor_inputs:
+ callback_kwargs[k] = locals()[k]
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+ latents = callback_outputs.pop("latents", latents)
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+ # call the callback, if provided
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ progress_bar.update()

  # 9. Post-processing
- video_tensor = self.decode_latents(latents)
-
- if output_type == "pt":
- video = video_tensor
+ if output_type == "latent":
+ video = latents
  else:
+ video_tensor = self.decode_latents(latents)
  video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)

  # 10. Offload all models
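To make the new control flow above concrete: FreeInit wraps the denoising loop in `num_free_init_iters` refinement passes. A hedged sketch of turning it on from user code, reusing the `pipe` and `input_frames` objects from the earlier sketch and assuming `FreeInitMixin` exposes the same `enable_free_init()` / `disable_free_init()` helpers as the other AnimateDiff pipelines:

    # Each extra FreeInit iteration re-runs denoising after refining the low-frequency
    # component of the initial noise, so generation cost scales with num_iters.
    pipe.enable_free_init(num_iters=3, use_fast_sampling=False)
    output = pipe(prompt="a corgi surfing, watercolor style", video=input_frames, strength=0.6)
    pipe.disable_free_init()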
@@ -11,12 +11,13 @@ from ...utils import BaseOutput
  @dataclass
  class AnimateDiffPipelineOutput(BaseOutput):
  r"""
- Output class for AnimateDiff pipelines.
+ Output class for AnimateDiff pipelines.

- Args:
- frames (`List[List[PIL.Image.Image]]` or `torch.Tensor` or `np.ndarray`):
- List of PIL Images of length `batch_size` or torch.Tensor or np.ndarray of shape
- `(batch_size, num_frames, height, width, num_channels)`.
+ Args:
+ frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+ List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
+ PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+ `(batch_size, num_frames, channels, height, width)`
  """

- frames: Union[List[List[PIL.Image.Image]], torch.Tensor, np.ndarray]
+ frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
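Given the `frames` layout documented above, a short sketch of consuming the output, continuing the earlier sketches; `export_to_gif` is the existing helper in `diffusers.utils`:

    from diffusers.utils import export_to_gif

    # With the default output_type="pil", frames[0] is the list of PIL frames for the
    # first (and here only) video in the batch.
    export_to_gif(output.frames[0], "animation.gif")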
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel
  from ...schedulers import KarrasDiffusionSchedulers
  from ...utils import logging, replace_example_docstring
  from ...utils.torch_utils import randn_tensor
- from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+ from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin


  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -49,7 +49,7 @@ EXAMPLE_DOC_STRING = """
  """


- class AudioLDMPipeline(DiffusionPipeline):
+ class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
  r"""
  Pipeline for text-to-audio generation using AudioLDM.

@@ -96,22 +96,6 @@ class AudioLDMPipeline(DiffusionPipeline):
  )
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)

- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
  def _encode_prompt(
  self,
  prompt,
@@ -1,4 +1,4 @@
- # Copyright 2023 CVSSP, ByteDance and The HuggingFace Team. All rights reserved.
+ # Copyright 2024 CVSSP, ByteDance and The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -173,7 +173,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  )
  self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)

- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+ # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing
  def enable_vae_slicing(self):
  r"""
  Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
@@ -181,7 +181,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
  """
  self.vae.enable_slicing()

- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+ # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing
  def disable_vae_slicing(self):
  r"""
  Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
@@ -1,5 +1,5 @@
  # coding=utf-8
- # Copyright 2023 The HuggingFace Inc. team.
+ # Copyright 2024 The HuggingFace Inc. team.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import inspect
  from collections import OrderedDict

  from huggingface_hub.utils import validate_hf_hub_args
@@ -164,14 +163,6 @@ def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool
  raise ValueError(f"AutoPipeline can't find a pipeline linked to {pipeline_class_name} for {model_name}")


- def _get_signature_keys(obj):
- parameters = inspect.signature(obj.__init__).parameters
- required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
- optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
- expected_modules = set(required_parameters.keys()) - {"self"}
- return expected_modules, optional_parameters
-
-
  class AutoPipelineForText2Image(ConfigMixin):
  r"""

@@ -352,7 +343,7 @@ class AutoPipelineForText2Image(ConfigMixin):
  pipeline linked to the pipeline class using pattern matching on pipeline class name.

  All the modules the pipeline contains will be used to initialize the new pipeline without reallocating
- additional memoery.
+ additional memory.

  The pipeline is set in evaluation mode (`model.eval()`) by default.

@@ -391,7 +382,7 @@ class AutoPipelineForText2Image(ConfigMixin):
  )

  # define expected module and optional kwargs given the pipeline signature
- expected_modules, optional_kwargs = _get_signature_keys(text_2_image_cls)
+ expected_modules, optional_kwargs = text_2_image_cls._get_signature_keys(text_2_image_cls)
  pretrained_model_name_or_path = original_config.pop("_name_or_path", None)


@@ -625,7 +616,7 @@ class AutoPipelineForImage2Image(ConfigMixin):
  image-to-image pipeline linked to the pipeline class using pattern matching on pipeline class name.

  All the modules the pipeline contains will be used to initialize the new pipeline without reallocating
- additional memoery.
+ additional memory.

  The pipeline is set in evaluation mode (`model.eval()`) by default.

@@ -668,7 +659,7 @@ class AutoPipelineForImage2Image(ConfigMixin):
  )

  # define expected module and optional kwargs given the pipeline signature
- expected_modules, optional_kwargs = _get_signature_keys(image_2_image_cls)
+ expected_modules, optional_kwargs = image_2_image_cls._get_signature_keys(image_2_image_cls)
  pretrained_model_name_or_path = original_config.pop("_name_or_path", None)


@@ -901,7 +892,7 @@ class AutoPipelineForInpainting(ConfigMixin):
  pipeline linked to the pipeline class using pattern matching on pipeline class name.

  All the modules the pipeline class contain will be used to initialize the new pipeline without reallocating
- additional memoery.
+ additional memory.

  The pipeline is set in evaluation mode (`model.eval()`) by default.

@@ -943,7 +934,7 @@ class AutoPipelineForInpainting(ConfigMixin):
  )

  # define expected module and optional kwargs given the pipeline signature
- expected_modules, optional_kwargs = _get_signature_keys(inpainting_cls)
+ expected_modules, optional_kwargs = inpainting_cls._get_signature_keys(inpainting_cls)
  pretrained_model_name_or_path = original_config.pop("_name_or_path", None)


@@ -1,5 +1,5 @@
  # coding=utf-8
- # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -1,5 +1,5 @@
- # Copyright 2023 Salesforce.com, inc.
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 Salesforce.com, inc.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -1,5 +1,5 @@
- # Copyright 2023 Salesforce.com, inc.
- # Copyright 2023 The HuggingFace Team. All rights reserved.#
+ # Copyright 2024 Salesforce.com, inc.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.#
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.