diffusers 0.26.3__py3-none-any.whl → 0.27.0__py3-none-any.whl

Files changed (299)
  1. diffusers/__init__.py +20 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/configuration_utils.py +7 -3
  7. diffusers/dependency_versions_check.py +1 -1
  8. diffusers/dependency_versions_table.py +2 -2
  9. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  10. diffusers/image_processor.py +110 -4
  11. diffusers/loaders/autoencoder.py +7 -8
  12. diffusers/loaders/controlnet.py +17 -8
  13. diffusers/loaders/ip_adapter.py +86 -23
  14. diffusers/loaders/lora.py +105 -310
  15. diffusers/loaders/lora_conversion_utils.py +1 -1
  16. diffusers/loaders/peft.py +1 -1
  17. diffusers/loaders/single_file.py +51 -12
  18. diffusers/loaders/single_file_utils.py +274 -49
  19. diffusers/loaders/textual_inversion.py +23 -4
  20. diffusers/loaders/unet.py +195 -41
  21. diffusers/loaders/utils.py +1 -1
  22. diffusers/models/__init__.py +3 -1
  23. diffusers/models/activations.py +9 -9
  24. diffusers/models/attention.py +26 -36
  25. diffusers/models/attention_flax.py +1 -1
  26. diffusers/models/attention_processor.py +171 -114
  27. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  28. diffusers/models/autoencoders/autoencoder_kl.py +3 -1
  29. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  30. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  31. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  32. diffusers/models/autoencoders/vae.py +1 -1
  33. diffusers/models/controlnet.py +1 -1
  34. diffusers/models/controlnet_flax.py +1 -1
  35. diffusers/models/downsampling.py +8 -12
  36. diffusers/models/dual_transformer_2d.py +1 -1
  37. diffusers/models/embeddings.py +3 -4
  38. diffusers/models/embeddings_flax.py +1 -1
  39. diffusers/models/lora.py +33 -10
  40. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  41. diffusers/models/modeling_flax_utils.py +1 -1
  42. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  43. diffusers/models/modeling_utils.py +4 -6
  44. diffusers/models/normalization.py +1 -1
  45. diffusers/models/resnet.py +31 -58
  46. diffusers/models/resnet_flax.py +1 -1
  47. diffusers/models/t5_film_transformer.py +1 -1
  48. diffusers/models/transformer_2d.py +1 -1
  49. diffusers/models/transformer_temporal.py +1 -1
  50. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  51. diffusers/models/transformers/t5_film_transformer.py +1 -1
  52. diffusers/models/transformers/transformer_2d.py +29 -31
  53. diffusers/models/transformers/transformer_temporal.py +1 -1
  54. diffusers/models/unet_1d.py +1 -1
  55. diffusers/models/unet_1d_blocks.py +1 -1
  56. diffusers/models/unet_2d.py +1 -1
  57. diffusers/models/unet_2d_blocks.py +1 -1
  58. diffusers/models/unet_2d_condition.py +1 -1
  59. diffusers/models/unets/__init__.py +1 -0
  60. diffusers/models/unets/unet_1d.py +1 -1
  61. diffusers/models/unets/unet_1d_blocks.py +1 -1
  62. diffusers/models/unets/unet_2d.py +4 -4
  63. diffusers/models/unets/unet_2d_blocks.py +238 -98
  64. diffusers/models/unets/unet_2d_blocks_flax.py +1 -1
  65. diffusers/models/unets/unet_2d_condition.py +420 -323
  66. diffusers/models/unets/unet_2d_condition_flax.py +21 -12
  67. diffusers/models/unets/unet_3d_blocks.py +50 -40
  68. diffusers/models/unets/unet_3d_condition.py +47 -8
  69. diffusers/models/unets/unet_i2vgen_xl.py +75 -30
  70. diffusers/models/unets/unet_kandinsky3.py +1 -1
  71. diffusers/models/unets/unet_motion_model.py +48 -8
  72. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  73. diffusers/models/unets/unet_stable_cascade.py +610 -0
  74. diffusers/models/unets/uvit_2d.py +1 -1
  75. diffusers/models/upsampling.py +10 -16
  76. diffusers/models/vae_flax.py +1 -1
  77. diffusers/models/vq_model.py +1 -1
  78. diffusers/optimization.py +1 -1
  79. diffusers/pipelines/__init__.py +26 -0
  80. diffusers/pipelines/amused/pipeline_amused.py +1 -1
  81. diffusers/pipelines/amused/pipeline_amused_img2img.py +1 -1
  82. diffusers/pipelines/amused/pipeline_amused_inpaint.py +1 -1
  83. diffusers/pipelines/animatediff/pipeline_animatediff.py +162 -417
  84. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +165 -137
  85. diffusers/pipelines/animatediff/pipeline_output.py +7 -6
  86. diffusers/pipelines/audioldm/pipeline_audioldm.py +3 -19
  87. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  88. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +3 -3
  89. diffusers/pipelines/auto_pipeline.py +7 -16
  90. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  93. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  94. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  95. diffusers/pipelines/controlnet/pipeline_controlnet.py +90 -90
  96. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  97. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +98 -90
  98. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +92 -90
  99. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +145 -70
  100. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +126 -89
  101. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +108 -96
  102. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  103. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +1 -1
  104. diffusers/pipelines/ddim/pipeline_ddim.py +1 -1
  105. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  106. diffusers/pipelines/deepfloyd_if/pipeline_if.py +4 -4
  107. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +4 -4
  108. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +5 -5
  109. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +4 -4
  110. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +5 -5
  111. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +5 -5
  112. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +10 -120
  113. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -91
  114. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  115. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +1 -1
  116. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  117. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +1 -1
  118. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  119. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  120. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  121. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  122. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  123. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  124. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +5 -4
  125. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +5 -4
  126. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +7 -22
  127. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -39
  128. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +5 -5
  129. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  130. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -22
  131. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  132. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  133. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -2
  134. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  135. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  136. diffusers/pipelines/free_init_utils.py +184 -0
  137. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +22 -104
  138. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +1 -1
  139. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  140. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +1 -1
  141. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +2 -2
  142. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -1
  143. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +1 -1
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -1
  145. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +1 -1
  146. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
  147. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  148. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +2 -2
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +104 -93
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +112 -74
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/ledits_pp/__init__.py +55 -0
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +1505 -0
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +1797 -0
  155. diffusers/pipelines/ledits_pp/pipeline_output.py +43 -0
  156. diffusers/pipelines/musicldm/pipeline_musicldm.py +3 -19
  157. diffusers/pipelines/onnx_utils.py +1 -1
  158. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  159. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +3 -3
  160. diffusers/pipelines/pia/pipeline_pia.py +168 -327
  161. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  162. diffusers/pipelines/pipeline_loading_utils.py +508 -0
  163. diffusers/pipelines/pipeline_utils.py +188 -534
  164. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +56 -10
  165. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +3 -3
  166. diffusers/pipelines/shap_e/camera.py +1 -1
  167. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  168. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  169. diffusers/pipelines/shap_e/renderer.py +1 -1
  170. diffusers/pipelines/stable_cascade/__init__.py +50 -0
  171. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +482 -0
  172. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +311 -0
  173. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +638 -0
  174. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  175. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +4 -1
  176. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  177. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +2 -2
  178. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  179. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +1 -1
  180. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +1 -1
  181. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -1
  182. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +1 -1
  183. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +90 -146
  184. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  185. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +4 -32
  186. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -119
  187. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +92 -119
  188. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +13 -59
  189. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +3 -31
  190. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -33
  191. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +5 -21
  192. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -21
  193. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  194. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  195. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  196. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +5 -21
  197. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +9 -38
  198. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -34
  199. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +6 -35
  200. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +7 -6
  201. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +4 -124
  202. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +282 -80
  203. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +94 -46
  204. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +3 -3
  205. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  206. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +6 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  208. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +96 -148
  209. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +98 -154
  210. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +98 -153
  211. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +25 -87
  212. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +89 -80
  213. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +5 -49
  214. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +80 -88
  215. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +8 -6
  216. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +15 -86
  217. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +20 -93
  218. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +5 -5
  219. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +3 -19
  220. diffusers/pipelines/unclip/pipeline_unclip.py +1 -1
  221. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +1 -1
  222. diffusers/pipelines/unclip/text_proj.py +1 -1
  223. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +35 -35
  224. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  225. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +4 -21
  226. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +2 -2
  227. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -5
  228. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  229. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +1 -1
  230. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +2 -2
  231. diffusers/schedulers/__init__.py +7 -1
  232. diffusers/schedulers/deprecated/scheduling_karras_ve.py +1 -1
  233. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  234. diffusers/schedulers/scheduling_consistency_models.py +42 -19
  235. diffusers/schedulers/scheduling_ddim.py +2 -4
  236. diffusers/schedulers/scheduling_ddim_flax.py +13 -5
  237. diffusers/schedulers/scheduling_ddim_inverse.py +2 -4
  238. diffusers/schedulers/scheduling_ddim_parallel.py +2 -4
  239. diffusers/schedulers/scheduling_ddpm.py +2 -4
  240. diffusers/schedulers/scheduling_ddpm_flax.py +1 -1
  241. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -4
  242. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +1 -1
  243. diffusers/schedulers/scheduling_deis_multistep.py +46 -19
  244. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -21
  245. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +1 -1
  246. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +9 -7
  247. diffusers/schedulers/scheduling_dpmsolver_sde.py +35 -35
  248. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +49 -18
  249. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +683 -0
  250. diffusers/schedulers/scheduling_edm_euler.py +381 -0
  251. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +43 -15
  252. diffusers/schedulers/scheduling_euler_discrete.py +42 -17
  253. diffusers/schedulers/scheduling_euler_discrete_flax.py +1 -1
  254. diffusers/schedulers/scheduling_heun_discrete.py +35 -35
  255. diffusers/schedulers/scheduling_ipndm.py +37 -11
  256. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +44 -44
  257. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +44 -44
  258. diffusers/schedulers/scheduling_karras_ve_flax.py +1 -1
  259. diffusers/schedulers/scheduling_lcm.py +38 -14
  260. diffusers/schedulers/scheduling_lms_discrete.py +43 -15
  261. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  262. diffusers/schedulers/scheduling_pndm.py +2 -4
  263. diffusers/schedulers/scheduling_pndm_flax.py +2 -4
  264. diffusers/schedulers/scheduling_repaint.py +1 -1
  265. diffusers/schedulers/scheduling_sasolver.py +41 -9
  266. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  267. diffusers/schedulers/scheduling_sde_ve_flax.py +1 -1
  268. diffusers/schedulers/scheduling_tcd.py +686 -0
  269. diffusers/schedulers/scheduling_unclip.py +1 -1
  270. diffusers/schedulers/scheduling_unipc_multistep.py +46 -19
  271. diffusers/schedulers/scheduling_utils.py +2 -1
  272. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  273. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  274. diffusers/training_utils.py +9 -2
  275. diffusers/utils/__init__.py +2 -1
  276. diffusers/utils/accelerate_utils.py +1 -1
  277. diffusers/utils/constants.py +1 -1
  278. diffusers/utils/doc_utils.py +1 -1
  279. diffusers/utils/dummy_pt_objects.py +60 -0
  280. diffusers/utils/dummy_torch_and_transformers_objects.py +75 -0
  281. diffusers/utils/dynamic_modules_utils.py +1 -1
  282. diffusers/utils/export_utils.py +3 -3
  283. diffusers/utils/hub_utils.py +60 -16
  284. diffusers/utils/import_utils.py +15 -1
  285. diffusers/utils/loading_utils.py +2 -0
  286. diffusers/utils/logging.py +1 -1
  287. diffusers/utils/model_card_template.md +24 -0
  288. diffusers/utils/outputs.py +14 -7
  289. diffusers/utils/peft_utils.py +1 -1
  290. diffusers/utils/state_dict_utils.py +1 -1
  291. diffusers/utils/testing_utils.py +2 -0
  292. diffusers/utils/torch_utils.py +1 -1
  293. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/METADATA +46 -46
  294. diffusers-0.27.0.dist-info/RECORD +399 -0
  295. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/WHEEL +1 -1
  296. diffusers-0.26.3.dist-info/RECORD +0 -384
  297. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/LICENSE +0 -0
  298. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/entry_points.txt +0 -0
  299. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/top_level.txt +0 -0
diffusers/pipelines/pia/pipeline_pia.py
@@ -1,4 +1,4 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ import torch.fft as fft
  from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

  from ...image_processor import PipelineImageInput, VaeImageProcessor
- from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
  from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
  from ...models.lora import adjust_lora_scale_text_encoder
  from ...models.unets.unet_motion_model import MotionAdapter
@@ -45,7 +45,8 @@ from ...utils import (
      unscale_lora_layers,
  )
  from ...utils.torch_utils import randn_tensor
- from ..pipeline_utils import DiffusionPipeline
+ from ..free_init_utils import FreeInitMixin
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin


  logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -106,7 +107,7 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type:
          outputs = torch.stack(outputs)

      elif not output_type == "pil":
-         raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil]")
+         raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")

      return outputs

@@ -200,16 +201,24 @@ class PIAPipelineOutput(BaseOutput):
      Output class for PIAPipeline.

      Args:
-         frames (`torch.Tensor`, `np.ndarray`, or List[PIL.Image.Image]):
+         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
              Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`,
              NumPy array of shape `(batch_size, num_frames, channels, height, width,
              Torch tensor of shape `(batch_size, num_frames, channels, height, width)`.
      """

-     frames: Union[torch.Tensor, np.ndarray, PIL.Image.Image]
+     frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]


- class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
+ class PIAPipeline(
+     DiffusionPipeline,
+     StableDiffusionMixin,
+     TextualInversionLoaderMixin,
+     IPAdapterMixin,
+     LoraLoaderMixin,
+     FromSingleFileMixin,
+     FreeInitMixin,
+ ):
      r"""
      Pipeline for text-to-video generation.

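PIAPipeline now composes its shared behaviour from mixins instead of private helpers. A minimal sketch of the FreeInit toggle that FreeInitMixin contributes, reusing the same arguments as the enable_free_init method removed further down in this diff (`pipe` and `init_image` are assumed to be an already-loaded PIAPipeline and a PIL condition image):

# Sketch only: FreeInitMixin supplies these toggles in 0.27.
pipe.enable_free_init(num_iters=3, method="butterworth", use_fast_sampling=False)
frames = pipe(image=init_image, prompt="a boat sailing at sunset").frames[0]
pipe.disable_free_init()  # restore the plain single-pass denoising loop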
@@ -338,7 +347,7 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
              batch_size = prompt_embeds.shape[0]

          if prompt_embeds is None:
-             # textual inversion: procecss multi-vector tokens if necessary
+             # textual inversion: process multi-vector tokens if necessary
              if isinstance(self, TextualInversionLoaderMixin):
                  prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

@@ -420,7 +429,7 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
              else:
                  uncond_tokens = negative_prompt

-             # textual inversion: procecss multi-vector tokens if necessary
+             # textual inversion: process multi-vector tokens if necessary
              if isinstance(self, TextualInversionLoaderMixin):
                  uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

@@ -492,135 +501,11 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
          latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

          image = self.vae.decode(latents).sample
-         video = (
-             image[None, :]
-             .reshape(
-                 (
-                     batch_size,
-                     num_frames,
-                     -1,
-                 )
-                 + image.shape[2:]
-             )
-             .permute(0, 2, 1, 3, 4)
-         )
+         video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
          # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
          video = video.float()
          return video

-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
-     def enable_vae_slicing(self):
-         r"""
-         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-         compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-         """
-         self.vae.enable_slicing()
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
-     def disable_vae_slicing(self):
-         r"""
-         Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
-         computing decoding in one step.
-         """
-         self.vae.disable_slicing()
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
-     def enable_vae_tiling(self):
-         r"""
-         Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
-         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
-         processing larger images.
-         """
-         self.vae.enable_tiling()
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
-     def disable_vae_tiling(self):
-         r"""
-         Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
-         computing decoding in one step.
-         """
-         self.vae.disable_tiling()
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
-     def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
-         r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
-         The suffixes after the scaling factors represent the stages where they are being applied.
-
-         Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
-         that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
-         Args:
-             s1 (`float`):
-                 Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
-                 mitigate "oversmoothing effect" in the enhanced denoising process.
-             s2 (`float`):
-                 Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
-                 mitigate "oversmoothing effect" in the enhanced denoising process.
-             b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
-             b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
-         """
-         if not hasattr(self, "unet"):
-             raise ValueError("The pipeline must have `unet` for using FreeU.")
-         self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
-     def disable_freeu(self):
-         """Disables the FreeU mechanism if enabled."""
-         self.unet.disable_freeu()
-
-     @property
-     def free_init_enabled(self):
-         return hasattr(self, "_free_init_num_iters") and self._free_init_num_iters is not None
-
-     def enable_free_init(
-         self,
-         num_iters: int = 3,
-         use_fast_sampling: bool = False,
-         method: str = "butterworth",
-         order: int = 4,
-         spatial_stop_frequency: float = 0.25,
-         temporal_stop_frequency: float = 0.25,
-         generator: Optional[torch.Generator] = None,
-     ):
-         """Enables the FreeInit mechanism as in https://arxiv.org/abs/2312.07537.
-
-         This implementation has been adapted from the [official repository](https://github.com/TianxingWu/FreeInit).
-
-         Args:
-             num_iters (`int`, *optional*, defaults to `3`):
-                 Number of FreeInit noise re-initialization iterations.
-             use_fast_sampling (`bool`, *optional*, defaults to `False`):
-                 Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables
-                 the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
-             method (`str`, *optional*, defaults to `butterworth`):
-                 Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
-                 FreeInit low pass filter.
-             order (`int`, *optional*, defaults to `4`):
-                 Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
-                 whereas lower values lead to `gaussian` method behaviour.
-             spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                 Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in
-                 the original implementation.
-             temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                 Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in
-                 the original implementation.
-             generator (`torch.Generator`, *optional*, defaults to `0.25`):
-                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                 FreeInit generation deterministic.
-         """
-         self._free_init_num_iters = num_iters
-         self._free_init_use_fast_sampling = use_fast_sampling
-         self._free_init_method = method
-         self._free_init_order = order
-         self._free_init_spatial_stop_frequency = spatial_stop_frequency
-         self._free_init_temporal_stop_frequency = temporal_stop_frequency
-         self._free_init_generator = generator
-
-     def disable_free_init(self):
-         """Disables the FreeInit mechanism if enabled."""
-         self._free_init_num_iters = None
-
      # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
      def prepare_extra_step_kwargs(self, generator, eta):
          # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
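The VAE slicing/tiling and FreeU helpers deleted above move to StableDiffusionMixin, so they should remain callable on the pipeline instance. A sketch; the FreeU factors are assumptions in the SD v1.x range suggested by the removed docstring's pointer to the FreeU repository:

pipe.enable_vae_slicing()  # decode the frame batch slice by slice to save memory
pipe.enable_vae_tiling()   # tile the VAE decode for large spatial resolutions
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)  # assumed SD1.x-style factors
# ... run inference ...
pipe.disable_freeu()
pipe.disable_vae_tiling()
pipe.disable_vae_slicing()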
@@ -647,6 +532,8 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
          negative_prompt=None,
          prompt_embeds=None,
          negative_prompt_embeds=None,
+         ip_adapter_image=None,
+         ip_adapter_image_embeds=None,
          callback_on_step_end_tensor_inputs=None,
      ):
          if height % 8 != 0 or width % 8 != 0:
@@ -685,6 +572,73 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
                  f" {negative_prompt_embeds.shape}."
              )

+         if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+             raise ValueError(
+                 "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+             )
+
+         if ip_adapter_image_embeds is not None:
+             if not isinstance(ip_adapter_image_embeds, list):
+                 raise ValueError(
+                     f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                 )
+             elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                 raise ValueError(
+                     f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                 )
+
+     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+     def prepare_ip_adapter_image_embeds(
+         self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+     ):
+         if ip_adapter_image_embeds is None:
+             if not isinstance(ip_adapter_image, list):
+                 ip_adapter_image = [ip_adapter_image]
+
+             if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                 raise ValueError(
+                     f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                 )
+
+             image_embeds = []
+             for single_ip_adapter_image, image_proj_layer in zip(
+                 ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+             ):
+                 output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                 single_image_embeds, single_negative_image_embeds = self.encode_image(
+                     single_ip_adapter_image, device, 1, output_hidden_state
+                 )
+                 single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                 single_negative_image_embeds = torch.stack(
+                     [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                 )
+
+                 if do_classifier_free_guidance:
+                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                     single_image_embeds = single_image_embeds.to(device)
+
+                 image_embeds.append(single_image_embeds)
+         else:
+             repeat_dims = [1]
+             image_embeds = []
+             for single_image_embeds in ip_adapter_image_embeds:
+                 if do_classifier_free_guidance:
+                     single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                     single_image_embeds = single_image_embeds.repeat(
+                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                     )
+                     single_negative_image_embeds = single_negative_image_embeds.repeat(
+                         num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                     )
+                     single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                 else:
+                     single_image_embeds = single_image_embeds.repeat(
+                         num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                     )
+                 image_embeds.append(single_image_embeds)
+
+         return image_embeds
+
      # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
      def prepare_latents(
          self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
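prepare_ip_adapter_image_embeds accepts either raw reference images (one per loaded IP-Adapter) or precomputed embedding tensors. A usage sketch from the caller's side; the adapter repository, weight name, and URL are illustrative assumptions, not pinned by this diff:

from diffusers.utils import load_image

pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
style_image = load_image("https://example.com/style.png")  # hypothetical reference image

output = pipe(
    image=init_image,                # the PIA condition image
    prompt="a dog running on grass",
    ip_adapter_image=style_image,    # encoded internally via prepare_ip_adapter_image_embeds
)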
@@ -762,143 +716,6 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin

          return mask, masked_image

-     def _denoise_loop(
-         self,
-         timesteps,
-         num_inference_steps,
-         do_classifier_free_guidance,
-         guidance_scale,
-         num_warmup_steps,
-         prompt_embeds,
-         negative_prompt_embeds,
-         latents,
-         mask,
-         masked_image,
-         cross_attention_kwargs,
-         added_cond_kwargs,
-         extra_step_kwargs,
-         callback_on_step_end,
-         callback_on_step_end_tensor_inputs,
-     ):
-         """Denoising loop for PIA."""
-         with self.progress_bar(total=num_inference_steps) as progress_bar:
-             for i, t in enumerate(timesteps):
-                 # expand the latents if we are doing classifier free guidance
-                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                 latent_model_input = torch.cat([latent_model_input, mask, masked_image], dim=1)
-
-                 # predict the noise residual
-                 noise_pred = self.unet(
-                     latent_model_input,
-                     t,
-                     encoder_hidden_states=prompt_embeds,
-                     cross_attention_kwargs=cross_attention_kwargs,
-                     added_cond_kwargs=added_cond_kwargs,
-                 ).sample
-
-                 # perform guidance
-                 if do_classifier_free_guidance:
-                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                 # compute the previous noisy sample x_t -> x_t-1
-                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-                 if callback_on_step_end is not None:
-                     callback_kwargs = {}
-                     for k in callback_on_step_end_tensor_inputs:
-                         callback_kwargs[k] = locals()[k]
-                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                     latents = callback_outputs.pop("latents", latents)
-                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                 # call the callback, if provided
-                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                     progress_bar.update()
-
-         return latents
-
-     def _free_init_loop(
-         self,
-         height,
-         width,
-         num_frames,
-         batch_size,
-         num_videos_per_prompt,
-         denoise_args,
-         device,
-     ):
-         """Denoising loop for PIA using FreeInit noise reinitialization technique."""
-
-         latents = denoise_args.get("latents")
-         prompt_embeds = denoise_args.get("prompt_embeds")
-         timesteps = denoise_args.get("timesteps")
-         num_inference_steps = denoise_args.get("num_inference_steps")
-
-         latent_shape = (
-             batch_size * num_videos_per_prompt,
-             4,
-             num_frames,
-             height // self.vae_scale_factor,
-             width // self.vae_scale_factor,
-         )
-         free_init_filter_shape = (
-             1,
-             4,
-             num_frames,
-             height // self.vae_scale_factor,
-             width // self.vae_scale_factor,
-         )
-         free_init_freq_filter = _get_freeinit_freq_filter(
-             shape=free_init_filter_shape,
-             device=device,
-             filter_type=self._free_init_method,
-             order=self._free_init_order,
-             spatial_stop_frequency=self._free_init_spatial_stop_frequency,
-             temporal_stop_frequency=self._free_init_temporal_stop_frequency,
-         )
-
-         with self.progress_bar(total=self._free_init_num_iters) as free_init_progress_bar:
-             for i in range(self._free_init_num_iters):
-                 # For the first FreeInit iteration, the original latent is used without modification.
-                 # Subsequent iterations apply the noise reinitialization technique.
-                 if i == 0:
-                     initial_noise = latents.detach().clone()
-                 else:
-                     current_diffuse_timestep = (
-                         self.scheduler.config.num_train_timesteps - 1
-                     )  # diffuse to t=999 noise level
-                     diffuse_timesteps = torch.full((batch_size,), current_diffuse_timestep).long()
-                     z_T = self.scheduler.add_noise(
-                         original_samples=latents, noise=initial_noise, timesteps=diffuse_timesteps.to(device)
-                     ).to(dtype=torch.float32)
-                     z_rand = randn_tensor(
-                         shape=latent_shape,
-                         generator=self._free_init_generator,
-                         device=device,
-                         dtype=torch.float32,
-                     )
-                     latents = _freq_mix_3d(z_T, z_rand, LPF=free_init_freq_filter)
-                     latents = latents.to(prompt_embeds.dtype)
-
-                 # Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
-                 if self._free_init_use_fast_sampling:
-                     current_num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (i + 1))
-                     self.scheduler.set_timesteps(current_num_inference_steps, device=device)
-                     timesteps = self.scheduler.timesteps
-                     denoise_args.update({"timesteps": timesteps, "num_inference_steps": current_num_inference_steps})
-
-                 num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-                 denoise_args.update({"latents": latents, "num_warmup_steps": num_warmup_steps})
-                 latents = self._denoise_loop(**denoise_args)
-
-                 free_init_progress_bar.update()
-
-         return latents
-
      # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
      def get_timesteps(self, num_inference_steps, strength, device):
          # get the original timestep using init_timestep
@@ -906,22 +723,11 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin

          t_start = max(num_inference_steps - init_timestep, 0)
          timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+         if hasattr(self.scheduler, "set_begin_index"):
+             self.scheduler.set_begin_index(t_start * self.scheduler.order)

          return timesteps, num_inference_steps - t_start

-     def _retrieve_video_frames(self, latents, output_type, return_dict):
-         """Helper function to handle latents to output conversion."""
-         if output_type == "latent":
-             return PIAPipelineOutput(frames=latents)
-
-         video_tensor = self.decode_latents(latents)
-         video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
-
-         if not return_dict:
-             return (video,)
-
-         return PIAPipelineOutput(frames=video)
-
      @property
      def guidance_scale(self):
          return self._guidance_scale
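The set_begin_index handoff matters for schedulers that keep an internal step counter. The arithmetic around it is easiest to check with numbers; a worked sketch for a first-order scheduler, using the standard img2img formula that precedes the lines shown above:

num_inference_steps, strength, order = 50, 0.6, 1

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 30
t_start = max(num_inference_steps - init_timestep, 0)                          # 20
# timesteps = scheduler.timesteps[t_start * order:] keeps the last 30 of 50 steps;
# set_begin_index(20) then tells a compatible scheduler where counting starts.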
@@ -965,6 +771,7 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
          prompt_embeds: Optional[torch.FloatTensor] = None,
          negative_prompt_embeds: Optional[torch.FloatTensor] = None,
          ip_adapter_image: Optional[PipelineImageInput] = None,
+         ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
          motion_scale: int = 0,
          output_type: Optional[str] = "pil",
          return_dict: bool = True,
@@ -1017,6 +824,11 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
                  not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
              ip_adapter_image: (`PipelineImageInput`, *optional*):
                  Optional image input to work with IP Adapters.
+             ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                 Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
+                 Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
+                 if `do_classifier_free_guidance` is set to `True`.
+                 If not provided, embeddings are computed from the `ip_adapter_image` input argument.
              motion_scale: (`int`, *optional*, defaults to 0):
                  Parameter that controls the amount and type of motion that is added to the image. Increasing the value increases the amount of motion, while specific
                  ranges of values control the type of motion that is added. Must be between 0 and 8.
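The shape contract above is easy to get wrong, so here is a hand-built sketch for a single IP-Adapter with classifier-free guidance enabled; the embedding width and the zero negative embedding are illustrative assumptions, not values mandated by the pipeline:

import torch

batch_size, num_images, emb_dim = 1, 1, 1024  # emb_dim depends on the image encoder
positive = torch.randn(batch_size, num_images, emb_dim)
negative = torch.zeros_like(positive)  # stand-in unconditional embedding

# Negative half first, matching the chunk(2) split in prepare_ip_adapter_image_embeds.
ip_adapter_image_embeds = [torch.cat([negative, positive], dim=0)]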
@@ -1048,8 +860,8 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
          Examples:

          Returns:
-             [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`:
-                 If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is
+             [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] or `tuple`:
+                 If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is
                  returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
          """
          # 0. Default height and width to unet
@@ -1066,6 +878,8 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
              negative_prompt,
              prompt_embeds,
              negative_prompt_embeds,
+             ip_adapter_image,
+             ip_adapter_image_embeds,
              callback_on_step_end_tensor_inputs,
          )

@@ -1104,13 +918,14 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
          if self.do_classifier_free_guidance:
              prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

-         if ip_adapter_image is not None:
-             output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-             image_embeds, negative_image_embeds = self.encode_image(
-                 ip_adapter_image, device, num_videos_per_prompt, output_hidden_state
+         if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+             image_embeds = self.prepare_ip_adapter_image_embeds(
+                 ip_adapter_image,
+                 ip_adapter_image_embeds,
+                 device,
+                 batch_size * num_videos_per_prompt,
+                 self.do_classifier_free_guidance,
              )
-             if self.do_classifier_free_guidance:
-                 image_embeds = torch.cat([negative_image_embeds, image_embeds])

          # 4. Prepare timesteps
          self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -1150,44 +965,70 @@ class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin
          extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

          # 7. Add image embeds for IP-Adapter
-         added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+         added_cond_kwargs = (
+             {"image_embeds": image_embeds}
+             if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+             else None
+         )

          # 8. Denoising loop
-         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-         denoise_args = {
-             "timesteps": timesteps,
-             "num_inference_steps": num_inference_steps,
-             "do_classifier_free_guidance": self.do_classifier_free_guidance,
-             "guidance_scale": guidance_scale,
-             "num_warmup_steps": num_warmup_steps,
-             "prompt_embeds": prompt_embeds,
-             "negative_prompt_embeds": negative_prompt_embeds,
-             "latents": latents,
-             "mask": mask,
-             "masked_image": masked_image,
-             "cross_attention_kwargs": self.cross_attention_kwargs,
-             "added_cond_kwargs": added_cond_kwargs,
-             "extra_step_kwargs": extra_step_kwargs,
-             "callback_on_step_end": callback_on_step_end,
-             "callback_on_step_end_tensor_inputs": callback_on_step_end_tensor_inputs,
-         }
-
-         if self.free_init_enabled:
-             latents = self._free_init_loop(
-                 height=height,
-                 width=width,
-                 num_frames=num_frames,
-                 batch_size=batch_size,
-                 num_videos_per_prompt=num_videos_per_prompt,
-                 denoise_args=denoise_args,
-                 device=device,
-             )
-         else:
-             latents = self._denoise_loop(**denoise_args)
+         num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
+         for free_init_iter in range(num_free_init_iters):
+             if self.free_init_enabled:
+                 latents, timesteps = self._apply_free_init(
+                     latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
+                 )

-         video = self._retrieve_video_frames(latents, output_type, return_dict)
+             num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+             with self.progress_bar(total=num_inference_steps) as progress_bar:
+                 for i, t in enumerate(timesteps):
+                     # expand the latents if we are doing classifier free guidance
+                     latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                     latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                     latent_model_input = torch.cat([latent_model_input, mask, masked_image], dim=1)
+
+                     # predict the noise residual
+                     noise_pred = self.unet(
+                         latent_model_input,
+                         t,
+                         encoder_hidden_states=prompt_embeds,
+                         cross_attention_kwargs=cross_attention_kwargs,
+                         added_cond_kwargs=added_cond_kwargs,
+                     ).sample
+
+                     # perform guidance
+                     if self.do_classifier_free_guidance:
+                         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                     # compute the previous noisy sample x_t -> x_t-1
+                     latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                     if callback_on_step_end is not None:
+                         callback_kwargs = {}
+                         for k in callback_on_step_end_tensor_inputs:
+                             callback_kwargs[k] = locals()[k]
+                         callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                         latents = callback_outputs.pop("latents", latents)
+                         prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                         negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                     # call the callback, if provided
+                     if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                         progress_bar.update()
+
+         # 9. Post processing
+         if output_type == "latent":
+             video = latents
+         else:
+             video_tensor = self.decode_latents(latents)
+             video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)

-         # 9. Offload all models
+         # 10. Offload all models
          self.maybe_free_model_hooks()

-         return video
+         if not return_dict:
+             return (video,)
+
+         return PIAPipelineOutput(frames=video)
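End to end, the new return path means callers get a PIAPipelineOutput whose frames field follows the List[List[PIL.Image.Image]] typing fixed earlier in this diff. A usage sketch; the checkpoint IDs and image URL are assumptions:

import torch
from diffusers import MotionAdapter, PIAPipeline
from diffusers.utils import export_to_gif, load_image

adapter = MotionAdapter.from_pretrained("openmmlab/PIA-condition-adapter")
pipe = PIAPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

image = load_image("https://example.com/cat.png")  # hypothetical input image
output = pipe(image=image, prompt="a cat dancing", num_inference_steps=25)

frames = output.frames[0]         # first batch element: a list of PIL images
export_to_gif(frames, "pia.gif")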
@@ -1,5 +1,5 @@
  # coding=utf-8
- # Copyright 2023 The HuggingFace Inc. team.
+ # Copyright 2024 The HuggingFace Inc. team.
  # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");