diffusers 0.26.3__py3-none-any.whl → 0.27.0__py3-none-any.whl

Files changed (299)
  1. diffusers/__init__.py +20 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/configuration_utils.py +7 -3
  7. diffusers/dependency_versions_check.py +1 -1
  8. diffusers/dependency_versions_table.py +2 -2
  9. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  10. diffusers/image_processor.py +110 -4
  11. diffusers/loaders/autoencoder.py +7 -8
  12. diffusers/loaders/controlnet.py +17 -8
  13. diffusers/loaders/ip_adapter.py +86 -23
  14. diffusers/loaders/lora.py +105 -310
  15. diffusers/loaders/lora_conversion_utils.py +1 -1
  16. diffusers/loaders/peft.py +1 -1
  17. diffusers/loaders/single_file.py +51 -12
  18. diffusers/loaders/single_file_utils.py +274 -49
  19. diffusers/loaders/textual_inversion.py +23 -4
  20. diffusers/loaders/unet.py +195 -41
  21. diffusers/loaders/utils.py +1 -1
  22. diffusers/models/__init__.py +3 -1
  23. diffusers/models/activations.py +9 -9
  24. diffusers/models/attention.py +26 -36
  25. diffusers/models/attention_flax.py +1 -1
  26. diffusers/models/attention_processor.py +171 -114
  27. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  28. diffusers/models/autoencoders/autoencoder_kl.py +3 -1
  29. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  30. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  31. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  32. diffusers/models/autoencoders/vae.py +1 -1
  33. diffusers/models/controlnet.py +1 -1
  34. diffusers/models/controlnet_flax.py +1 -1
  35. diffusers/models/downsampling.py +8 -12
  36. diffusers/models/dual_transformer_2d.py +1 -1
  37. diffusers/models/embeddings.py +3 -4
  38. diffusers/models/embeddings_flax.py +1 -1
  39. diffusers/models/lora.py +33 -10
  40. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  41. diffusers/models/modeling_flax_utils.py +1 -1
  42. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  43. diffusers/models/modeling_utils.py +4 -6
  44. diffusers/models/normalization.py +1 -1
  45. diffusers/models/resnet.py +31 -58
  46. diffusers/models/resnet_flax.py +1 -1
  47. diffusers/models/t5_film_transformer.py +1 -1
  48. diffusers/models/transformer_2d.py +1 -1
  49. diffusers/models/transformer_temporal.py +1 -1
  50. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  51. diffusers/models/transformers/t5_film_transformer.py +1 -1
  52. diffusers/models/transformers/transformer_2d.py +29 -31
  53. diffusers/models/transformers/transformer_temporal.py +1 -1
  54. diffusers/models/unet_1d.py +1 -1
  55. diffusers/models/unet_1d_blocks.py +1 -1
  56. diffusers/models/unet_2d.py +1 -1
  57. diffusers/models/unet_2d_blocks.py +1 -1
  58. diffusers/models/unet_2d_condition.py +1 -1
  59. diffusers/models/unets/__init__.py +1 -0
  60. diffusers/models/unets/unet_1d.py +1 -1
  61. diffusers/models/unets/unet_1d_blocks.py +1 -1
  62. diffusers/models/unets/unet_2d.py +4 -4
  63. diffusers/models/unets/unet_2d_blocks.py +238 -98
  64. diffusers/models/unets/unet_2d_blocks_flax.py +1 -1
  65. diffusers/models/unets/unet_2d_condition.py +420 -323
  66. diffusers/models/unets/unet_2d_condition_flax.py +21 -12
  67. diffusers/models/unets/unet_3d_blocks.py +50 -40
  68. diffusers/models/unets/unet_3d_condition.py +47 -8
  69. diffusers/models/unets/unet_i2vgen_xl.py +75 -30
  70. diffusers/models/unets/unet_kandinsky3.py +1 -1
  71. diffusers/models/unets/unet_motion_model.py +48 -8
  72. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  73. diffusers/models/unets/unet_stable_cascade.py +610 -0
  74. diffusers/models/unets/uvit_2d.py +1 -1
  75. diffusers/models/upsampling.py +10 -16
  76. diffusers/models/vae_flax.py +1 -1
  77. diffusers/models/vq_model.py +1 -1
  78. diffusers/optimization.py +1 -1
  79. diffusers/pipelines/__init__.py +26 -0
  80. diffusers/pipelines/amused/pipeline_amused.py +1 -1
  81. diffusers/pipelines/amused/pipeline_amused_img2img.py +1 -1
  82. diffusers/pipelines/amused/pipeline_amused_inpaint.py +1 -1
  83. diffusers/pipelines/animatediff/pipeline_animatediff.py +162 -417
  84. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +165 -137
  85. diffusers/pipelines/animatediff/pipeline_output.py +7 -6
  86. diffusers/pipelines/audioldm/pipeline_audioldm.py +3 -19
  87. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  88. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +3 -3
  89. diffusers/pipelines/auto_pipeline.py +7 -16
  90. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  93. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  94. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  95. diffusers/pipelines/controlnet/pipeline_controlnet.py +90 -90
  96. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  97. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +98 -90
  98. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +92 -90
  99. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +145 -70
  100. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +126 -89
  101. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +108 -96
  102. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  103. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +1 -1
  104. diffusers/pipelines/ddim/pipeline_ddim.py +1 -1
  105. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  106. diffusers/pipelines/deepfloyd_if/pipeline_if.py +4 -4
  107. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +4 -4
  108. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +5 -5
  109. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +4 -4
  110. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +5 -5
  111. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +5 -5
  112. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +10 -120
  113. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -91
  114. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  115. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +1 -1
  116. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  117. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +1 -1
  118. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  119. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  120. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  121. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  122. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  123. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  124. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +5 -4
  125. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +5 -4
  126. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +7 -22
  127. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -39
  128. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +5 -5
  129. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  130. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -22
  131. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  132. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  133. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -2
  134. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  135. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  136. diffusers/pipelines/free_init_utils.py +184 -0
  137. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +22 -104
  138. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +1 -1
  139. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  140. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +1 -1
  141. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +2 -2
  142. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -1
  143. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +1 -1
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -1
  145. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +1 -1
  146. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
  147. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  148. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +2 -2
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +104 -93
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +112 -74
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/ledits_pp/__init__.py +55 -0
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +1505 -0
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +1797 -0
  155. diffusers/pipelines/ledits_pp/pipeline_output.py +43 -0
  156. diffusers/pipelines/musicldm/pipeline_musicldm.py +3 -19
  157. diffusers/pipelines/onnx_utils.py +1 -1
  158. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  159. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +3 -3
  160. diffusers/pipelines/pia/pipeline_pia.py +168 -327
  161. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  162. diffusers/pipelines/pipeline_loading_utils.py +508 -0
  163. diffusers/pipelines/pipeline_utils.py +188 -534
  164. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +56 -10
  165. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +3 -3
  166. diffusers/pipelines/shap_e/camera.py +1 -1
  167. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  168. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  169. diffusers/pipelines/shap_e/renderer.py +1 -1
  170. diffusers/pipelines/stable_cascade/__init__.py +50 -0
  171. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +482 -0
  172. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +311 -0
  173. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +638 -0
  174. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  175. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +4 -1
  176. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  177. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +2 -2
  178. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  179. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +1 -1
  180. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +1 -1
  181. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -1
  182. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +1 -1
  183. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +90 -146
  184. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  185. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +4 -32
  186. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -119
  187. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +92 -119
  188. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +13 -59
  189. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +3 -31
  190. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -33
  191. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +5 -21
  192. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -21
  193. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  194. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  195. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  196. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +5 -21
  197. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +9 -38
  198. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -34
  199. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +6 -35
  200. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +7 -6
  201. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +4 -124
  202. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +282 -80
  203. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +94 -46
  204. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +3 -3
  205. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  206. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +6 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  208. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +96 -148
  209. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +98 -154
  210. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +98 -153
  211. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +25 -87
  212. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +89 -80
  213. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +5 -49
  214. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +80 -88
  215. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +8 -6
  216. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +15 -86
  217. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +20 -93
  218. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +5 -5
  219. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +3 -19
  220. diffusers/pipelines/unclip/pipeline_unclip.py +1 -1
  221. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +1 -1
  222. diffusers/pipelines/unclip/text_proj.py +1 -1
  223. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +35 -35
  224. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  225. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +4 -21
  226. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +2 -2
  227. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -5
  228. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  229. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +1 -1
  230. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +2 -2
  231. diffusers/schedulers/__init__.py +7 -1
  232. diffusers/schedulers/deprecated/scheduling_karras_ve.py +1 -1
  233. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  234. diffusers/schedulers/scheduling_consistency_models.py +42 -19
  235. diffusers/schedulers/scheduling_ddim.py +2 -4
  236. diffusers/schedulers/scheduling_ddim_flax.py +13 -5
  237. diffusers/schedulers/scheduling_ddim_inverse.py +2 -4
  238. diffusers/schedulers/scheduling_ddim_parallel.py +2 -4
  239. diffusers/schedulers/scheduling_ddpm.py +2 -4
  240. diffusers/schedulers/scheduling_ddpm_flax.py +1 -1
  241. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -4
  242. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +1 -1
  243. diffusers/schedulers/scheduling_deis_multistep.py +46 -19
  244. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -21
  245. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +1 -1
  246. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +9 -7
  247. diffusers/schedulers/scheduling_dpmsolver_sde.py +35 -35
  248. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +49 -18
  249. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +683 -0
  250. diffusers/schedulers/scheduling_edm_euler.py +381 -0
  251. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +43 -15
  252. diffusers/schedulers/scheduling_euler_discrete.py +42 -17
  253. diffusers/schedulers/scheduling_euler_discrete_flax.py +1 -1
  254. diffusers/schedulers/scheduling_heun_discrete.py +35 -35
  255. diffusers/schedulers/scheduling_ipndm.py +37 -11
  256. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +44 -44
  257. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +44 -44
  258. diffusers/schedulers/scheduling_karras_ve_flax.py +1 -1
  259. diffusers/schedulers/scheduling_lcm.py +38 -14
  260. diffusers/schedulers/scheduling_lms_discrete.py +43 -15
  261. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  262. diffusers/schedulers/scheduling_pndm.py +2 -4
  263. diffusers/schedulers/scheduling_pndm_flax.py +2 -4
  264. diffusers/schedulers/scheduling_repaint.py +1 -1
  265. diffusers/schedulers/scheduling_sasolver.py +41 -9
  266. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  267. diffusers/schedulers/scheduling_sde_ve_flax.py +1 -1
  268. diffusers/schedulers/scheduling_tcd.py +686 -0
  269. diffusers/schedulers/scheduling_unclip.py +1 -1
  270. diffusers/schedulers/scheduling_unipc_multistep.py +46 -19
  271. diffusers/schedulers/scheduling_utils.py +2 -1
  272. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  273. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  274. diffusers/training_utils.py +9 -2
  275. diffusers/utils/__init__.py +2 -1
  276. diffusers/utils/accelerate_utils.py +1 -1
  277. diffusers/utils/constants.py +1 -1
  278. diffusers/utils/doc_utils.py +1 -1
  279. diffusers/utils/dummy_pt_objects.py +60 -0
  280. diffusers/utils/dummy_torch_and_transformers_objects.py +75 -0
  281. diffusers/utils/dynamic_modules_utils.py +1 -1
  282. diffusers/utils/export_utils.py +3 -3
  283. diffusers/utils/hub_utils.py +60 -16
  284. diffusers/utils/import_utils.py +15 -1
  285. diffusers/utils/loading_utils.py +2 -0
  286. diffusers/utils/logging.py +1 -1
  287. diffusers/utils/model_card_template.md +24 -0
  288. diffusers/utils/outputs.py +14 -7
  289. diffusers/utils/peft_utils.py +1 -1
  290. diffusers/utils/state_dict_utils.py +1 -1
  291. diffusers/utils/testing_utils.py +2 -0
  292. diffusers/utils/torch_utils.py +1 -1
  293. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/METADATA +46 -46
  294. diffusers-0.27.0.dist-info/RECORD +399 -0
  295. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/WHEEL +1 -1
  296. diffusers-0.26.3.dist-info/RECORD +0 -384
  297. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/LICENSE +0 -0
  298. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/entry_points.txt +0 -0
  299. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/top_level.txt +0 -0
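Among the 299 files, the headline additions are the Stable Cascade pipelines (`diffusers/pipelines/stable_cascade/`), the `unet_stable_cascade.py` model, the LEDITS++ pipelines, and the new EDM and TCD schedulers. A minimal sketch of the two-stage prior/decoder flow those files introduce follows; the model ids, dtypes, and call arguments are assumptions based on the release, not confirmed by this file list:

```python
# Hedged sketch: Stable Cascade generates in two stages, a prior (stage C) that
# maps text to compressed image embeddings and a decoder (stage B) that renders them.
import torch
from diffusers import StableCascadePriorPipeline, StableCascadeDecoderPipeline

prompt = "an astronaut riding a horse, photorealistic"

prior = StableCascadePriorPipeline.from_pretrained(
    "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16  # illustrative id/dtype
).to("cuda")
prior_output = prior(prompt=prompt, height=1024, width=1024, num_inference_steps=20)

decoder = StableCascadeDecoderPipeline.from_pretrained(
    "stabilityai/stable-cascade", torch_dtype=torch.float16  # illustrative id/dtype
).to("cuda")
image = decoder(
    image_embeddings=prior_output.image_embeddings.to(torch.float16),
    prompt=prompt,
    num_inference_steps=10,
).images[0]
image.save("cascade.png")
```

The diff below is for `diffusers/pipelines/animatediff/pipeline_animatediff.py` (item 83 in the list above, +162 −417).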
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -13,12 +13,10 @@
  # limitations under the License.

  import inspect
- import math
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+ from typing import Any, Callable, Dict, List, Optional, Union

  import numpy as np
  import torch
- import torch.fft as fft
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

  from ...image_processor import PipelineImageInput, VaeImageProcessor
@@ -43,7 +41,8 @@ from ...utils import (
      unscale_lora_layers,
  )
  from ...utils.torch_utils import randn_tensor
- from ..pipeline_utils import DiffusionPipeline
+ from ..free_init_utils import FreeInitMixin
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
  from .pipeline_output import AnimateDiffPipelineOutput


@@ -82,77 +81,19 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type:
          outputs = torch.stack(outputs)

      elif not output_type == "pil":
-         raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil]")
+         raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")

      return outputs


- def _get_freeinit_freq_filter(
-     shape: Tuple[int, ...],
-     device: Union[str, torch.dtype],
-     filter_type: str,
-     order: float,
-     spatial_stop_frequency: float,
-     temporal_stop_frequency: float,
- ) -> torch.Tensor:
-     r"""Returns the FreeInit filter based on filter type and other input conditions."""
-
-     T, H, W = shape[-3], shape[-2], shape[-1]
-     mask = torch.zeros(shape)
-
-     if spatial_stop_frequency == 0 or temporal_stop_frequency == 0:
-         return mask
-
-     if filter_type == "butterworth":
-
-         def retrieve_mask(x):
-             return 1 / (1 + (x / spatial_stop_frequency**2) ** order)
-     elif filter_type == "gaussian":
-
-         def retrieve_mask(x):
-             return math.exp(-1 / (2 * spatial_stop_frequency**2) * x)
-     elif filter_type == "ideal":
-
-         def retrieve_mask(x):
-             return 1 if x <= spatial_stop_frequency * 2 else 0
-     else:
-         raise NotImplementedError("`filter_type` must be one of gaussian, butterworth or ideal")
-
-     for t in range(T):
-         for h in range(H):
-             for w in range(W):
-                 d_square = (
-                     ((spatial_stop_frequency / temporal_stop_frequency) * (2 * t / T - 1)) ** 2
-                     + (2 * h / H - 1) ** 2
-                     + (2 * w / W - 1) ** 2
-                 )
-                 mask[..., t, h, w] = retrieve_mask(d_square)
-
-     return mask.to(device)
-
-
- def _freq_mix_3d(x: torch.Tensor, noise: torch.Tensor, LPF: torch.Tensor) -> torch.Tensor:
-     r"""Noise reinitialization."""
-     # FFT
-     x_freq = fft.fftn(x, dim=(-3, -2, -1))
-     x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1))
-     noise_freq = fft.fftn(noise, dim=(-3, -2, -1))
-     noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1))
-
-     # frequency mix
-     HPF = 1 - LPF
-     x_freq_low = x_freq * LPF
-     noise_freq_high = noise_freq * HPF
-     x_freq_mixed = x_freq_low + noise_freq_high  # mix in freq domain
-
-     # IFFT
-     x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1))
-     x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real
-
-     return x_mixed
-
-
- class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
+ class AnimateDiffPipeline(
+     DiffusionPipeline,
+     StableDiffusionMixin,
+     TextualInversionLoaderMixin,
+     IPAdapterMixin,
+     LoraLoaderMixin,
+     FreeInitMixin,
+ ):
      r"""
      Pipeline for text-to-video generation.

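The class refactor above is the theme of this release: the FreeInit helpers (`_get_freeinit_freq_filter`, `_freq_mix_3d`, and the `enable_free_init`/`disable_free_init` methods removed further down) move into the new `FreeInitMixin` (`diffusers/pipelines/free_init_utils.py`, +184 lines in the file list), and the generic VAE/FreeU helpers move into `StableDiffusionMixin`. The public API stays essentially the same; a minimal sketch of FreeInit on the refactored pipeline (model ids are illustrative, and the mixin's `enable_free_init` appears to no longer take a `generator`, since the new `__call__` code below passes the pipeline's own `generator` to `_apply_free_init`):

```python
import torch
from diffusers import AnimateDiffPipeline, MotionAdapter

# illustrative model ids
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained(
    "emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

# same knobs the removed enable_free_init() exposed, now inherited from FreeInitMixin
pipe.enable_free_init(num_iters=3, method="butterworth", order=4)
frames = pipe(prompt="a panda surfing", num_frames=16, num_inference_steps=25).frames[0]
pipe.disable_free_init()
```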
@@ -182,7 +123,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
      """

      model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
-     _optional_components = ["feature_extractor", "image_encoder"]
+     _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
      _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

      def __init__(
@@ -204,7 +145,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
          image_encoder: CLIPVisionModelWithProjection = None,
      ):
          super().__init__()
-         unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+         if isinstance(unet, UNet2DConditionModel):
+             unet = UNetMotionModel.from_unet2d(unet, motion_adapter)

          self.register_modules(
              vae=vae,
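The new `isinstance` guard means the constructor no longer unconditionally converts: a caller can hand over an already-assembled `UNetMotionModel` (which is also why `motion_adapter` joins `_optional_components` above). A hedged sketch of that path, with illustrative model ids and an assumed component override:

```python
from diffusers import AnimateDiffPipeline, MotionAdapter, UNet2DConditionModel, UNetMotionModel

unet2d = UNet2DConditionModel.from_pretrained("emilianJR/epiCRealism", subfolder="unet")
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")

# convert once up front; __init__ now skips from_unet2d for a UNetMotionModel
motion_unet = UNetMotionModel.from_unet2d(unet2d, adapter)
pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", unet=motion_unet)
```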
@@ -280,7 +222,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
              batch_size = prompt_embeds.shape[0]

          if prompt_embeds is None:
-             # textual inversion: procecss multi-vector tokens if necessary
+             # textual inversion: process multi-vector tokens if necessary
              if isinstance(self, TextualInversionLoaderMixin):
                  prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

@@ -362,7 +304,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
              else:
                  uncond_tokens = negative_prompt

-             # textual inversion: procecss multi-vector tokens if necessary
+             # textual inversion: process multi-vector tokens if necessary
              if isinstance(self, TextualInversionLoaderMixin):
                  uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

@@ -427,31 +369,54 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
          return image_embeds, uncond_image_embeds

      # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
-     def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt):
-         if not isinstance(ip_adapter_image, list):
-             ip_adapter_image = [ip_adapter_image]
+     def prepare_ip_adapter_image_embeds(
+         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+     ):
+         if ip_adapter_image_embeds is None:
+             if not isinstance(ip_adapter_image, list):
+                 ip_adapter_image = [ip_adapter_image]

-         if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
-             raise ValueError(
-                 f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
-             )
+             if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                 raise ValueError(
+                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                 )

-         image_embeds = []
-         for single_ip_adapter_image, image_proj_layer in zip(
-             ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
-         ):
-             output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
-             single_image_embeds, single_negative_image_embeds = self.encode_image(
-                 single_ip_adapter_image, device, 1, output_hidden_state
-             )
-             single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-             single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+             image_embeds = []
+             for single_ip_adapter_image, image_proj_layer in zip(
+                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+             ):
+                 output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                 single_image_embeds, single_negative_image_embeds = self.encode_image(
+                     single_ip_adapter_image, device, 1, output_hidden_state
+                 )
+                 single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                 single_negative_image_embeds = torch.stack(
+                     [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                 )

-             if self.do_classifier_free_guidance:
-                 single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                 single_image_embeds = single_image_embeds.to(device)
+                 if do_classifier_free_guidance:
+                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                     single_image_embeds = single_image_embeds.to(device)

-             image_embeds.append(single_image_embeds)
+                 image_embeds.append(single_image_embeds)
+         else:
+             repeat_dims = [1]
+             image_embeds = []
+             for single_image_embeds in ip_adapter_image_embeds:
+                 if do_classifier_free_guidance:
+                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                     single_image_embeds = single_image_embeds.repeat(
+                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                     )
+                     single_negative_image_embeds = single_negative_image_embeds.repeat(
+                         num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                     )
+                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                 else:
+                     single_image_embeds = single_image_embeds.repeat(
+                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                     )
+                 image_embeds.append(single_image_embeds)

          return image_embeds

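The rewritten `prepare_ip_adapter_image_embeds` adds a second path: precomputed embeddings can be passed straight through instead of re-encoding images on every call. Per the `chunk(2)` branch above, under classifier-free guidance each tensor is expected to pack negative then positive embeddings along dim 0. A hedged sketch, continuing from the FreeInit example above (shapes and `emb_dim` are illustrative; the `load_ip_adapter` weights are the usual SD 1.5 ones, not taken from this diff):

```python
import torch

pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

# one tensor per loaded IP-Adapter; [negative, positive] stacked on dim 0 for CFG
neg = torch.zeros(1, 4, 768, dtype=torch.float16, device="cuda")  # emb_dim illustrative
pos = torch.randn(1, 4, 768, dtype=torch.float16, device="cuda")
packed = torch.cat([neg, pos], dim=0)  # the chunk(2) above splits this back apart

frames = pipe(
    prompt="a rocket lifting off",
    ip_adapter_image_embeds=[packed],  # list: one entry per IP-Adapter
    num_frames=16,
).frames[0]
```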
@@ -463,135 +428,11 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
          latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

          image = self.vae.decode(latents).sample
-         video = (
-             image[None, :]
-             .reshape(
-                 (
-                     batch_size,
-                     num_frames,
-                     -1,
-                 )
-                 + image.shape[2:]
-             )
-             .permute(0, 2, 1, 3, 4)
-         )
+         video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
          # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
          video = video.float()
          return video

-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
-     def enable_vae_slicing(self):
-         r"""
-         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-         compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-         """
-         self.vae.enable_slicing()
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
-     def disable_vae_slicing(self):
-         r"""
-         Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
-         computing decoding in one step.
-         """
-         self.vae.disable_slicing()
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
-     def enable_vae_tiling(self):
-         r"""
-         Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
-         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
-         processing larger images.
-         """
-         self.vae.enable_tiling()
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
-     def disable_vae_tiling(self):
-         r"""
-         Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
-         computing decoding in one step.
-         """
-         self.vae.disable_tiling()
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
-     def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
-         r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
-         The suffixes after the scaling factors represent the stages where they are being applied.
-
-         Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
-         that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
-         Args:
-             s1 (`float`):
-                 Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
-                 mitigate "oversmoothing effect" in the enhanced denoising process.
-             s2 (`float`):
-                 Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
-                 mitigate "oversmoothing effect" in the enhanced denoising process.
-             b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
-             b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
-         """
-         if not hasattr(self, "unet"):
-             raise ValueError("The pipeline must have `unet` for using FreeU.")
-         self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
-     def disable_freeu(self):
-         """Disables the FreeU mechanism if enabled."""
-         self.unet.disable_freeu()
-
-     @property
-     def free_init_enabled(self):
-         return hasattr(self, "_free_init_num_iters") and self._free_init_num_iters is not None
-
-     def enable_free_init(
-         self,
-         num_iters: int = 3,
-         use_fast_sampling: bool = False,
-         method: str = "butterworth",
-         order: int = 4,
-         spatial_stop_frequency: float = 0.25,
-         temporal_stop_frequency: float = 0.25,
-         generator: torch.Generator = None,
-     ):
-         """Enables the FreeInit mechanism as in https://arxiv.org/abs/2312.07537.
-
-         This implementation has been adapted from the [official repository](https://github.com/TianxingWu/FreeInit).
-
-         Args:
-             num_iters (`int`, *optional*, defaults to `3`):
-                 Number of FreeInit noise re-initialization iterations.
-             use_fast_sampling (`bool`, *optional*, defaults to `False`):
-                 Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables
-                 the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
-             method (`str`, *optional*, defaults to `butterworth`):
-                 Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
-                 FreeInit low pass filter.
-             order (`int`, *optional*, defaults to `4`):
-                 Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
-                 whereas lower values lead to `gaussian` method behaviour.
-             spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                 Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in
-                 the original implementation.
-             temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                 Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in
-                 the original implementation.
-             generator (`torch.Generator`, *optional*, defaults to `0.25`):
-                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                 FreeInit generation deterministic.
-         """
-         self._free_init_num_iters = num_iters
-         self._free_init_use_fast_sampling = use_fast_sampling
-         self._free_init_method = method
-         self._free_init_order = order
-         self._free_init_spatial_stop_frequency = spatial_stop_frequency
-         self._free_init_temporal_stop_frequency = temporal_stop_frequency
-         self._free_init_generator = generator
-
-     def disable_free_init(self):
-         """Disables the FreeInit mechanism if enabled."""
-         self._free_init_num_iters = None
-
      # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
      def prepare_extra_step_kwargs(self, generator, eta):
          # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
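None of the methods deleted above change behavior for callers: `enable_vae_slicing`/`enable_vae_tiling` and the FreeU toggles are now inherited from `StableDiffusionMixin` (see `pipeline_utils.py` in the file list), and the FreeInit block from `FreeInitMixin`. Existing call sites keep working, e.g. (the FreeU values are the ones commonly suggested for SD 1.5, not taken from this diff):

```python
pipe.enable_vae_slicing()                          # decode the VAE in slices to save memory
pipe.enable_vae_tiling()                           # tile VAE decode/encode for large frames
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)  # FreeU, now via StableDiffusionMixin
pipe.disable_freeu()
```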
@@ -620,6 +461,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
          negative_prompt=None,
          prompt_embeds=None,
          negative_prompt_embeds=None,
+         ip_adapter_image=None,
+         ip_adapter_image_embeds=None,
          callback_on_step_end_tensor_inputs=None,
      ):
          if height % 8 != 0 or width % 8 != 0:
@@ -663,6 +506,21 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                  f" {negative_prompt_embeds.shape}."
              )

+         if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+             raise ValueError(
+                 "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+             )
+
+         if ip_adapter_image_embeds is not None:
+             if not isinstance(ip_adapter_image_embeds, list):
+                 raise ValueError(
+                     f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                 )
+             elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                 raise ValueError(
+                     f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                 )
+
      # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
      def prepare_latents(
          self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
@@ -689,158 +547,6 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
          latents = latents * self.scheduler.init_noise_sigma
          return latents

-     def _denoise_loop(
-         self,
-         timesteps,
-         num_inference_steps,
-         do_classifier_free_guidance,
-         guidance_scale,
-         num_warmup_steps,
-         prompt_embeds,
-         negative_prompt_embeds,
-         latents,
-         cross_attention_kwargs,
-         added_cond_kwargs,
-         extra_step_kwargs,
-         callback,
-         callback_steps,
-         callback_on_step_end,
-         callback_on_step_end_tensor_inputs,
-     ):
-         """Denoising loop for AnimateDiff."""
-         with self.progress_bar(total=num_inference_steps) as progress_bar:
-             for i, t in enumerate(timesteps):
-                 # expand the latents if we are doing classifier free guidance
-                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-                 # predict the noise residual
-                 noise_pred = self.unet(
-                     latent_model_input,
-                     t,
-                     encoder_hidden_states=prompt_embeds,
-                     cross_attention_kwargs=cross_attention_kwargs,
-                     added_cond_kwargs=added_cond_kwargs,
-                 ).sample
-
-                 # perform guidance
-                 if do_classifier_free_guidance:
-                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                 # compute the previous noisy sample x_t -> x_t-1
-                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-                 if callback_on_step_end is not None:
-                     callback_kwargs = {}
-                     for k in callback_on_step_end_tensor_inputs:
-                         callback_kwargs[k] = locals()[k]
-                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                     latents = callback_outputs.pop("latents", latents)
-                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                 # call the callback, if provided
-                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                     progress_bar.update()
-                     if callback is not None and i % callback_steps == 0:
-                         callback(i, t, latents)
-
-         return latents
-
-     def _free_init_loop(
-         self,
-         height,
-         width,
-         num_frames,
-         num_channels_latents,
-         batch_size,
-         num_videos_per_prompt,
-         denoise_args,
-         device,
-     ):
-         """Denoising loop for AnimateDiff using FreeInit noise reinitialization technique."""
-
-         latents = denoise_args.get("latents")
-         prompt_embeds = denoise_args.get("prompt_embeds")
-         timesteps = denoise_args.get("timesteps")
-         num_inference_steps = denoise_args.get("num_inference_steps")
-
-         latent_shape = (
-             batch_size * num_videos_per_prompt,
-             num_channels_latents,
-             num_frames,
-             height // self.vae_scale_factor,
-             width // self.vae_scale_factor,
-         )
-         free_init_filter_shape = (
-             1,
-             num_channels_latents,
-             num_frames,
-             height // self.vae_scale_factor,
-             width // self.vae_scale_factor,
-         )
-         free_init_freq_filter = _get_freeinit_freq_filter(
-             shape=free_init_filter_shape,
-             device=device,
-             filter_type=self._free_init_method,
-             order=self._free_init_order,
-             spatial_stop_frequency=self._free_init_spatial_stop_frequency,
-             temporal_stop_frequency=self._free_init_temporal_stop_frequency,
-         )
-
-         with self.progress_bar(total=self._free_init_num_iters) as free_init_progress_bar:
-             for i in range(self._free_init_num_iters):
-                 # For the first FreeInit iteration, the original latent is used without modification.
-                 # Subsequent iterations apply the noise reinitialization technique.
-                 if i == 0:
-                     initial_noise = latents.detach().clone()
-                 else:
-                     current_diffuse_timestep = (
-                         self.scheduler.config.num_train_timesteps - 1
-                     )  # diffuse to t=999 noise level
-                     diffuse_timesteps = torch.full((batch_size,), current_diffuse_timestep).long()
-                     z_T = self.scheduler.add_noise(
-                         original_samples=latents, noise=initial_noise, timesteps=diffuse_timesteps.to(device)
-                     ).to(dtype=torch.float32)
-                     z_rand = randn_tensor(
-                         shape=latent_shape,
-                         generator=self._free_init_generator,
-                         device=device,
-                         dtype=torch.float32,
-                     )
-                     latents = _freq_mix_3d(z_T, z_rand, LPF=free_init_freq_filter)
-                     latents = latents.to(prompt_embeds.dtype)
-
-                 # Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
-                 if self._free_init_use_fast_sampling:
-                     current_num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (i + 1))
-                     self.scheduler.set_timesteps(current_num_inference_steps, device=device)
-                     timesteps = self.scheduler.timesteps
-                     denoise_args.update({"timesteps": timesteps, "num_inference_steps": current_num_inference_steps})
-
-                 num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-                 denoise_args.update({"latents": latents, "num_warmup_steps": num_warmup_steps})
-                 latents = self._denoise_loop(**denoise_args)
-
-                 free_init_progress_bar.update()
-
-         return latents
-
-     def _retrieve_video_frames(self, latents, output_type, return_dict):
-         """Helper function to handle latents to output conversion."""
-         if output_type == "latent":
-             return AnimateDiffPipelineOutput(frames=latents)
-
-         video_tensor = self.decode_latents(latents)
-         video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
-
-         if not return_dict:
-             return (video,)
-
-         return AnimateDiffPipelineOutput(frames=video)
-
      @property
      def guidance_scale(self):
          return self._guidance_scale
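For reference, the noise re-initialization that `_free_init_loop` drove (and that `FreeInitMixin._apply_free_init` now performs per iteration) reduces to one frequency-domain mix: keep the low frequencies of the re-diffused latents and take the high frequencies from fresh noise. Condensed from the `_freq_mix_3d` helper removed earlier in this file:

```python
import torch
import torch.fft as fft

def freq_mix_3d(x: torch.Tensor, noise: torch.Tensor, lpf: torch.Tensor) -> torch.Tensor:
    dims = (-3, -2, -1)  # (frames, height, width)
    # move both signals to the (shifted) frequency domain
    x_freq = fft.fftshift(fft.fftn(x, dim=dims), dim=dims)
    noise_freq = fft.fftshift(fft.fftn(noise, dim=dims), dim=dims)
    # low-pass the latents, high-pass the noise, then invert the transform
    mixed = x_freq * lpf + noise_freq * (1 - lpf)
    return fft.ifftn(fft.ifftshift(mixed, dim=dims), dim=dims).real
```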
@@ -882,6 +588,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
          prompt_embeds: Optional[torch.FloatTensor] = None,
          negative_prompt_embeds: Optional[torch.FloatTensor] = None,
          ip_adapter_image: Optional[PipelineImageInput] = None,
+         ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
          output_type: Optional[str] = "pil",
          return_dict: bool = True,
          cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -931,6 +638,11 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
                  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
              ip_adapter_image: (`PipelineImageInput`, *optional*):
                  Optional image input to work with IP Adapters.
+             ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                 Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                 IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                 contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                 provided, embeddings are computed from the `ip_adapter_image` input argument.
              output_type (`str`, *optional*, defaults to `"pil"`):
                  The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
                  `np.array`.
@@ -956,8 +668,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
          Examples:

          Returns:
-             [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
-                 If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
+             [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
+                 If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                  returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
          """

@@ -992,6 +704,8 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
              negative_prompt,
              prompt_embeds,
              negative_prompt_embeds,
+             ip_adapter_image,
+             ip_adapter_image_embeds,
              callback_on_step_end_tensor_inputs,
          )

@@ -1030,15 +744,18 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
          if self.do_classifier_free_guidance:
              prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

-         if ip_adapter_image is not None:
+         if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
              image_embeds = self.prepare_ip_adapter_image_embeds(
-                 ip_adapter_image, device, batch_size * num_videos_per_prompt
+                 ip_adapter_image,
+                 ip_adapter_image_embeds,
+                 device,
+                 batch_size * num_videos_per_prompt,
+                 self.do_classifier_free_guidance,
              )

          # 4. Prepare timesteps
          self.scheduler.set_timesteps(num_inference_steps, device=device)
          timesteps = self.scheduler.timesteps
-         self._num_timesteps = len(timesteps)

          # 5. Prepare latent variables
          num_channels_latents = self.unet.config.in_channels
@@ -1058,45 +775,73 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap
          extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

          # 7. Add image embeds for IP-Adapter
-         added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
-         # 8. Denoising loop
-         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-         denoise_args = {
-             "timesteps": timesteps,
-             "num_inference_steps": num_inference_steps,
-             "do_classifier_free_guidance": self.do_classifier_free_guidance,
-             "guidance_scale": guidance_scale,
-             "num_warmup_steps": num_warmup_steps,
-             "prompt_embeds": prompt_embeds,
-             "negative_prompt_embeds": negative_prompt_embeds,
-             "latents": latents,
-             "cross_attention_kwargs": self.cross_attention_kwargs,
-             "added_cond_kwargs": added_cond_kwargs,
-             "extra_step_kwargs": extra_step_kwargs,
-             "callback": callback,
-             "callback_steps": callback_steps,
-             "callback_on_step_end": callback_on_step_end,
-             "callback_on_step_end_tensor_inputs": callback_on_step_end_tensor_inputs,
-         }
-
-         if self.free_init_enabled:
-             latents = self._free_init_loop(
-                 height=height,
-                 width=width,
-                 num_frames=num_frames,
-                 num_channels_latents=num_channels_latents,
-                 batch_size=batch_size,
-                 num_videos_per_prompt=num_videos_per_prompt,
-                 denoise_args=denoise_args,
-                 device=device,
-             )
-         else:
-             latents = self._denoise_loop(**denoise_args)
-
-         video = self._retrieve_video_frames(latents, output_type, return_dict)
-
-         # 9. Offload all models
+         added_cond_kwargs = (
+             {"image_embeds": image_embeds}
+             if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+             else None
+         )
+
+         num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+         for free_init_iter in range(num_free_init_iters):
+             if self.free_init_enabled:
+                 latents, timesteps = self._apply_free_init(
+                     latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                 )
+
+             self._num_timesteps = len(timesteps)
+             num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+             # 8. Denoising loop
+             with self.progress_bar(total=num_inference_steps) as progress_bar:
+                 for i, t in enumerate(timesteps):
+                     # expand the latents if we are doing classifier free guidance
+                     latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                     latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                     # predict the noise residual
+                     noise_pred = self.unet(
+                         latent_model_input,
+                         t,
+                         encoder_hidden_states=prompt_embeds,
+                         cross_attention_kwargs=cross_attention_kwargs,
+                         added_cond_kwargs=added_cond_kwargs,
+                     ).sample
+
+                     # perform guidance
+                     if self.do_classifier_free_guidance:
+                         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                     # compute the previous noisy sample x_t -> x_t-1
+                     latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                     if callback_on_step_end is not None:
+                         callback_kwargs = {}
+                         for k in callback_on_step_end_tensor_inputs:
+                             callback_kwargs[k] = locals()[k]
+                         callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                         latents = callback_outputs.pop("latents", latents)
+                         prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                         negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                     # call the callback, if provided
+                     if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                         progress_bar.update()
+                         if callback is not None and i % callback_steps == 0:
+                             callback(i, t, latents)
+
+         # 9. Post processing
+         if output_type == "latent":
+             video = latents
+         else:
+             video_tensor = self.decode_latents(latents)
+             video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+
+         # 10. Offload all models
          self.maybe_free_model_hooks()

-         return video
+         if not return_dict:
+             return (video,)
+
+         return AnimateDiffPipelineOutput(frames=video)