diffusers 0.26.3__py3-none-any.whl → 0.27.0__py3-none-any.whl

Files changed (299)
  1. diffusers/__init__.py +20 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/configuration_utils.py +7 -3
  7. diffusers/dependency_versions_check.py +1 -1
  8. diffusers/dependency_versions_table.py +2 -2
  9. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  10. diffusers/image_processor.py +110 -4
  11. diffusers/loaders/autoencoder.py +7 -8
  12. diffusers/loaders/controlnet.py +17 -8
  13. diffusers/loaders/ip_adapter.py +86 -23
  14. diffusers/loaders/lora.py +105 -310
  15. diffusers/loaders/lora_conversion_utils.py +1 -1
  16. diffusers/loaders/peft.py +1 -1
  17. diffusers/loaders/single_file.py +51 -12
  18. diffusers/loaders/single_file_utils.py +274 -49
  19. diffusers/loaders/textual_inversion.py +23 -4
  20. diffusers/loaders/unet.py +195 -41
  21. diffusers/loaders/utils.py +1 -1
  22. diffusers/models/__init__.py +3 -1
  23. diffusers/models/activations.py +9 -9
  24. diffusers/models/attention.py +26 -36
  25. diffusers/models/attention_flax.py +1 -1
  26. diffusers/models/attention_processor.py +171 -114
  27. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  28. diffusers/models/autoencoders/autoencoder_kl.py +3 -1
  29. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  30. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  31. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  32. diffusers/models/autoencoders/vae.py +1 -1
  33. diffusers/models/controlnet.py +1 -1
  34. diffusers/models/controlnet_flax.py +1 -1
  35. diffusers/models/downsampling.py +8 -12
  36. diffusers/models/dual_transformer_2d.py +1 -1
  37. diffusers/models/embeddings.py +3 -4
  38. diffusers/models/embeddings_flax.py +1 -1
  39. diffusers/models/lora.py +33 -10
  40. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  41. diffusers/models/modeling_flax_utils.py +1 -1
  42. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  43. diffusers/models/modeling_utils.py +4 -6
  44. diffusers/models/normalization.py +1 -1
  45. diffusers/models/resnet.py +31 -58
  46. diffusers/models/resnet_flax.py +1 -1
  47. diffusers/models/t5_film_transformer.py +1 -1
  48. diffusers/models/transformer_2d.py +1 -1
  49. diffusers/models/transformer_temporal.py +1 -1
  50. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  51. diffusers/models/transformers/t5_film_transformer.py +1 -1
  52. diffusers/models/transformers/transformer_2d.py +29 -31
  53. diffusers/models/transformers/transformer_temporal.py +1 -1
  54. diffusers/models/unet_1d.py +1 -1
  55. diffusers/models/unet_1d_blocks.py +1 -1
  56. diffusers/models/unet_2d.py +1 -1
  57. diffusers/models/unet_2d_blocks.py +1 -1
  58. diffusers/models/unet_2d_condition.py +1 -1
  59. diffusers/models/unets/__init__.py +1 -0
  60. diffusers/models/unets/unet_1d.py +1 -1
  61. diffusers/models/unets/unet_1d_blocks.py +1 -1
  62. diffusers/models/unets/unet_2d.py +4 -4
  63. diffusers/models/unets/unet_2d_blocks.py +238 -98
  64. diffusers/models/unets/unet_2d_blocks_flax.py +1 -1
  65. diffusers/models/unets/unet_2d_condition.py +420 -323
  66. diffusers/models/unets/unet_2d_condition_flax.py +21 -12
  67. diffusers/models/unets/unet_3d_blocks.py +50 -40
  68. diffusers/models/unets/unet_3d_condition.py +47 -8
  69. diffusers/models/unets/unet_i2vgen_xl.py +75 -30
  70. diffusers/models/unets/unet_kandinsky3.py +1 -1
  71. diffusers/models/unets/unet_motion_model.py +48 -8
  72. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  73. diffusers/models/unets/unet_stable_cascade.py +610 -0
  74. diffusers/models/unets/uvit_2d.py +1 -1
  75. diffusers/models/upsampling.py +10 -16
  76. diffusers/models/vae_flax.py +1 -1
  77. diffusers/models/vq_model.py +1 -1
  78. diffusers/optimization.py +1 -1
  79. diffusers/pipelines/__init__.py +26 -0
  80. diffusers/pipelines/amused/pipeline_amused.py +1 -1
  81. diffusers/pipelines/amused/pipeline_amused_img2img.py +1 -1
  82. diffusers/pipelines/amused/pipeline_amused_inpaint.py +1 -1
  83. diffusers/pipelines/animatediff/pipeline_animatediff.py +162 -417
  84. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +165 -137
  85. diffusers/pipelines/animatediff/pipeline_output.py +7 -6
  86. diffusers/pipelines/audioldm/pipeline_audioldm.py +3 -19
  87. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  88. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +3 -3
  89. diffusers/pipelines/auto_pipeline.py +7 -16
  90. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  93. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  94. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  95. diffusers/pipelines/controlnet/pipeline_controlnet.py +90 -90
  96. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  97. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +98 -90
  98. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +92 -90
  99. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +145 -70
  100. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +126 -89
  101. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +108 -96
  102. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  103. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +1 -1
  104. diffusers/pipelines/ddim/pipeline_ddim.py +1 -1
  105. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  106. diffusers/pipelines/deepfloyd_if/pipeline_if.py +4 -4
  107. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +4 -4
  108. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +5 -5
  109. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +4 -4
  110. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +5 -5
  111. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +5 -5
  112. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +10 -120
  113. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -91
  114. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  115. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +1 -1
  116. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  117. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +1 -1
  118. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  119. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  120. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  121. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  122. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  123. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  124. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +5 -4
  125. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +5 -4
  126. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +7 -22
  127. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -39
  128. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +5 -5
  129. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  130. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -22
  131. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  132. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  133. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -2
  134. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  135. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  136. diffusers/pipelines/free_init_utils.py +184 -0
  137. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +22 -104
  138. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +1 -1
  139. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  140. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +1 -1
  141. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +2 -2
  142. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -1
  143. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +1 -1
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -1
  145. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +1 -1
  146. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
  147. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  148. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +2 -2
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +104 -93
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +112 -74
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/ledits_pp/__init__.py +55 -0
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +1505 -0
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +1797 -0
  155. diffusers/pipelines/ledits_pp/pipeline_output.py +43 -0
  156. diffusers/pipelines/musicldm/pipeline_musicldm.py +3 -19
  157. diffusers/pipelines/onnx_utils.py +1 -1
  158. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  159. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +3 -3
  160. diffusers/pipelines/pia/pipeline_pia.py +168 -327
  161. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  162. diffusers/pipelines/pipeline_loading_utils.py +508 -0
  163. diffusers/pipelines/pipeline_utils.py +188 -534
  164. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +56 -10
  165. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +3 -3
  166. diffusers/pipelines/shap_e/camera.py +1 -1
  167. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  168. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  169. diffusers/pipelines/shap_e/renderer.py +1 -1
  170. diffusers/pipelines/stable_cascade/__init__.py +50 -0
  171. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +482 -0
  172. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +311 -0
  173. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +638 -0
  174. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  175. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +4 -1
  176. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  177. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +2 -2
  178. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  179. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +1 -1
  180. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +1 -1
  181. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -1
  182. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +1 -1
  183. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +90 -146
  184. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  185. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +4 -32
  186. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -119
  187. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +92 -119
  188. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +13 -59
  189. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +3 -31
  190. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -33
  191. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +5 -21
  192. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -21
  193. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  194. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  195. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  196. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +5 -21
  197. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +9 -38
  198. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -34
  199. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +6 -35
  200. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +7 -6
  201. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +4 -124
  202. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +282 -80
  203. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +94 -46
  204. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +3 -3
  205. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  206. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +6 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  208. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +96 -148
  209. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +98 -154
  210. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +98 -153
  211. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +25 -87
  212. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +89 -80
  213. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +5 -49
  214. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +80 -88
  215. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +8 -6
  216. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +15 -86
  217. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +20 -93
  218. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +5 -5
  219. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +3 -19
  220. diffusers/pipelines/unclip/pipeline_unclip.py +1 -1
  221. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +1 -1
  222. diffusers/pipelines/unclip/text_proj.py +1 -1
  223. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +35 -35
  224. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  225. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +4 -21
  226. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +2 -2
  227. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -5
  228. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  229. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +1 -1
  230. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +2 -2
  231. diffusers/schedulers/__init__.py +7 -1
  232. diffusers/schedulers/deprecated/scheduling_karras_ve.py +1 -1
  233. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  234. diffusers/schedulers/scheduling_consistency_models.py +42 -19
  235. diffusers/schedulers/scheduling_ddim.py +2 -4
  236. diffusers/schedulers/scheduling_ddim_flax.py +13 -5
  237. diffusers/schedulers/scheduling_ddim_inverse.py +2 -4
  238. diffusers/schedulers/scheduling_ddim_parallel.py +2 -4
  239. diffusers/schedulers/scheduling_ddpm.py +2 -4
  240. diffusers/schedulers/scheduling_ddpm_flax.py +1 -1
  241. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -4
  242. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +1 -1
  243. diffusers/schedulers/scheduling_deis_multistep.py +46 -19
  244. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -21
  245. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +1 -1
  246. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +9 -7
  247. diffusers/schedulers/scheduling_dpmsolver_sde.py +35 -35
  248. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +49 -18
  249. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +683 -0
  250. diffusers/schedulers/scheduling_edm_euler.py +381 -0
  251. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +43 -15
  252. diffusers/schedulers/scheduling_euler_discrete.py +42 -17
  253. diffusers/schedulers/scheduling_euler_discrete_flax.py +1 -1
  254. diffusers/schedulers/scheduling_heun_discrete.py +35 -35
  255. diffusers/schedulers/scheduling_ipndm.py +37 -11
  256. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +44 -44
  257. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +44 -44
  258. diffusers/schedulers/scheduling_karras_ve_flax.py +1 -1
  259. diffusers/schedulers/scheduling_lcm.py +38 -14
  260. diffusers/schedulers/scheduling_lms_discrete.py +43 -15
  261. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  262. diffusers/schedulers/scheduling_pndm.py +2 -4
  263. diffusers/schedulers/scheduling_pndm_flax.py +2 -4
  264. diffusers/schedulers/scheduling_repaint.py +1 -1
  265. diffusers/schedulers/scheduling_sasolver.py +41 -9
  266. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  267. diffusers/schedulers/scheduling_sde_ve_flax.py +1 -1
  268. diffusers/schedulers/scheduling_tcd.py +686 -0
  269. diffusers/schedulers/scheduling_unclip.py +1 -1
  270. diffusers/schedulers/scheduling_unipc_multistep.py +46 -19
  271. diffusers/schedulers/scheduling_utils.py +2 -1
  272. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  273. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  274. diffusers/training_utils.py +9 -2
  275. diffusers/utils/__init__.py +2 -1
  276. diffusers/utils/accelerate_utils.py +1 -1
  277. diffusers/utils/constants.py +1 -1
  278. diffusers/utils/doc_utils.py +1 -1
  279. diffusers/utils/dummy_pt_objects.py +60 -0
  280. diffusers/utils/dummy_torch_and_transformers_objects.py +75 -0
  281. diffusers/utils/dynamic_modules_utils.py +1 -1
  282. diffusers/utils/export_utils.py +3 -3
  283. diffusers/utils/hub_utils.py +60 -16
  284. diffusers/utils/import_utils.py +15 -1
  285. diffusers/utils/loading_utils.py +2 -0
  286. diffusers/utils/logging.py +1 -1
  287. diffusers/utils/model_card_template.md +24 -0
  288. diffusers/utils/outputs.py +14 -7
  289. diffusers/utils/peft_utils.py +1 -1
  290. diffusers/utils/state_dict_utils.py +1 -1
  291. diffusers/utils/testing_utils.py +2 -0
  292. diffusers/utils/torch_utils.py +1 -1
  293. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/METADATA +46 -46
  294. diffusers-0.27.0.dist-info/RECORD +399 -0
  295. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/WHEEL +1 -1
  296. diffusers-0.26.3.dist-info/RECORD +0 -384
  297. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/LICENSE +0 -0
  298. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/entry_points.txt +0 -0
  299. {diffusers-0.26.3.dist-info → diffusers-0.27.0.dist-info}/top_level.txt +0 -0
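Highlights visible in the listing above: the new Stable Cascade pipelines (items 170-173) and `unet_stable_cascade.py` (item 73), the LEDITS++ pipelines (items 152-155), the EDM and TCD schedulers (items 249-250 and 268), the consolidation of FreeU/VAE helpers into a shared `StableDiffusionMixin` (visible in the pipeline diffs below), and IP-Adapter image-embedding plumbing across most pipelines. As a quick orientation, here is a minimal sketch of the new two-stage Stable Cascade API; the checkpoint names, dtypes, and step counts are assumptions from common usage, not taken from this diff:

```python
import torch
from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline

# Assumed checkpoints; the diff itself only shows the new pipeline modules.
prior = StableCascadePriorPipeline.from_pretrained(
    "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16
).to("cuda")
decoder = StableCascadeDecoderPipeline.from_pretrained(
    "stabilityai/stable-cascade", torch_dtype=torch.float16
).to("cuda")

prompt = "an astronaut riding a horse, cinematic lighting"

# Stage 1: the prior turns the prompt into compact image embeddings.
prior_output = prior(prompt=prompt, num_inference_steps=20, guidance_scale=4.0)

# Stage 2: the decoder turns those embeddings into the final image.
image = decoder(
    image_embeddings=prior_output.image_embeddings.to(torch.float16),
    prompt=prompt,
    num_inference_steps=10,
    guidance_scale=0.0,
).images[0]
image.save("cascade.png")
```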
diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py

@@ -1,4 +1,4 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@ from ...utils import (
     unscale_lora_layers,
 )
 from ...utils.torch_utils import is_compiled_module, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from .multicontrolnet import MultiControlNetModel
@@ -241,7 +241,12 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False


 class StableDiffusionControlNetInpaintPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+    DiffusionPipeline,
+    StableDiffusionMixin,
+    TextualInversionLoaderMixin,
+    LoraLoaderMixin,
+    IPAdapterMixin,
+    FromSingleFileMixin,
 ):
     r"""
     Pipeline for image inpainting using Stable Diffusion with ControlNet guidance.
@@ -351,39 +356,6 @@ class StableDiffusionControlNetInpaintPipeline(
         )
         self.register_to_config(requires_safety_checker=requires_safety_checker)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
-    def enable_vae_slicing(self):
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.vae.enable_slicing()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
-    def disable_vae_slicing(self):
-        r"""
-        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_slicing()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
-    def enable_vae_tiling(self):
-        r"""
-        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
-        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
-        processing larger images.
-        """
-        self.vae.enable_tiling()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
-    def disable_vae_tiling(self):
-        r"""
-        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_tiling()
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
@@ -478,7 +450,7 @@ class StableDiffusionControlNetInpaintPipeline(
             batch_size = prompt_embeds.shape[0]

         if prompt_embeds is None:
-            # textual inversion: procecss multi-vector tokens if necessary
+            # textual inversion: process multi-vector tokens if necessary
             if isinstance(self, TextualInversionLoaderMixin):
                 prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

@@ -560,7 +532,7 @@ class StableDiffusionControlNetInpaintPipeline(
             else:
                 uncond_tokens = negative_prompt

-            # textual inversion: procecss multi-vector tokens if necessary
+            # textual inversion: process multi-vector tokens if necessary
             if isinstance(self, TextualInversionLoaderMixin):
                 uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

@@ -625,31 +597,54 @@ class StableDiffusionControlNetInpaintPipeline(
         return image_embeds, uncond_image_embeds

     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
-    def prepare_ip_adapter_image_embeds(self, ip_adapter_image, device, num_images_per_prompt):
-        if not isinstance(ip_adapter_image, list):
-            ip_adapter_image = [ip_adapter_image]
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]

-        if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
-            raise ValueError(
-                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
-            )
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )

-        image_embeds = []
-        for single_ip_adapter_image, image_proj_layer in zip(
-            ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
-        ):
-            output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
-            single_image_embeds, single_negative_image_embeds = self.encode_image(
-                single_ip_adapter_image, device, 1, output_hidden_state
-            )
-            single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-            single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0)
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )

-            if self.do_classifier_free_guidance:
-                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                single_image_embeds = single_image_embeds.to(device)
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)

-            image_embeds.append(single_image_embeds)
+                image_embeds.append(single_image_embeds)
+        else:
+            repeat_dims = [1]
+            image_embeds = []
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                    )
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                else:
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                image_embeds.append(single_image_embeds)

         return image_embeds

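The new second branch is the interesting part: when callers pass `ip_adapter_image_embeds` directly, each entry is repeated per prompt and, under classifier-free guidance, split back into its negative and positive halves with `.chunk(2)`. A small sketch of the tensor layout that branch expects; the embedding width is an illustrative assumption:

```python
import torch

emb_dim = 1024  # illustrative width; the real value depends on the image encoder
positive = torch.randn(1, 1, emb_dim)  # (batch_size, num_images, emb_dim)
negative = torch.zeros_like(positive)

# With classifier-free guidance the negative half is concatenated first along
# dim 0, which is exactly what .chunk(2) undoes inside the pipeline.
entry = torch.cat([negative, positive])  # 3D tensor, shape (2, 1, emb_dim)

ip_adapter_image_embeds = [entry]  # one entry per loaded IP-Adapter
```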
@@ -705,6 +700,8 @@

         t_start = max(num_inference_steps - init_timestep, 0)
         timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)

         return timesteps, num_inference_steps - t_start

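`set_begin_index` is new scheduler API in this release (the scheduler diffs in the listing above, e.g. items 234 and 251-252, add it across the board): schedulers now track their step index internally, so pipelines that start mid-schedule because of `strength` declare the offset once instead of having the scheduler search for it at every step. A hedged sketch with one of the updated schedulers:

```python
from diffusers import EulerDiscreteScheduler

scheduler = EulerDiscreteScheduler(
    beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
)
scheduler.set_timesteps(num_inference_steps=50)

t_start = 20  # e.g. strength=0.6 on a 50-step img2img run
if hasattr(scheduler, "set_begin_index"):  # guarded, exactly as in the diff
    scheduler.set_begin_index(t_start * scheduler.order)
```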
@@ -720,6 +717,8 @@
         negative_prompt=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
         controlnet_conditioning_scale=1.0,
         control_guidance_start=0.0,
         control_guidance_end=1.0,
@@ -869,6 +868,21 @@
             if end > 1.0:
                 raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")

+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
     # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image
     def check_image(self, image, prompt, prompt_embeds):
         image_is_pil = isinstance(image, PIL.Image.Image)
@@ -1061,34 +1075,6 @@

         return image_latents

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
-    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
-        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
-        The suffixes after the scaling factors represent the stages where they are being applied.
-
-        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
-        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
-        Args:
-            s1 (`float`):
-                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            s2 (`float`):
-                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
-            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
-        """
-        if not hasattr(self, "unet"):
-            raise ValueError("The pipeline must have `unet` for using FreeU.")
-        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
-    def disable_freeu(self):
-        """Disables the FreeU mechanism if enabled."""
-        self.unet.disable_freeu()
-
     @property
     def guidance_scale(self):
         return self._guidance_scale
@@ -1134,6 +1120,7 @@
         prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -1219,6 +1206,11 @@
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
@@ -1303,6 +1295,8 @@
             negative_prompt,
             prompt_embeds,
             negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
             controlnet_conditioning_scale,
             control_guidance_start,
             control_guidance_end,
@@ -1363,9 +1357,13 @@
         if self.do_classifier_free_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

-        if ip_adapter_image is not None:
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
             image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image, device, batch_size * num_images_per_prompt
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
             )

         # 4. Prepare image
@@ -1474,7 +1472,11 @@
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

         # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )

         # 7.2 Create tensor stating which controlnets to keep
         controlnet_keep = []
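The net effect for this pipeline: IP-Adapter conditioning can now come from precomputed embeddings, so the image encoder runs once even when the same style reference is reused across many generations. A minimal sketch; the checkpoint names and blank placeholder images are assumptions for illustration:

```python
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline

# Assumed checkpoints, not mandated by the diff.
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

style_image = Image.new("RGB", (512, 512))  # placeholder style reference

# Encode once with the new helper signature...
image_embeds = pipe.prepare_ip_adapter_image_embeds(
    style_image, None, "cuda", num_images_per_prompt=1, do_classifier_free_guidance=True
)

# ...then reuse the embeddings across calls via `ip_adapter_image_embeds`.
init_image = Image.new("RGB", (512, 512))    # placeholder inputs
mask_image = Image.new("L", (512, 512), 255)
control_image = Image.new("RGB", (512, 512))
result = pipe(
    prompt="a renaissance painting of a cat",
    image=init_image,
    mask_image=mask_image,
    control_image=control_image,
    ip_adapter_image_embeds=image_embeds,
    num_inference_steps=20,
).images[0]
```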
diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py

@@ -1,4 +1,4 @@
-# Copyright 2023 Harutatsu Akiyama, Jinbin Bai, and The HuggingFace Team. All rights reserved.
+# Copyright 2024 Harutatsu Akiyama, Jinbin Bai, and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,11 +19,22 @@ import numpy as np
 import PIL.Image
 import torch
 import torch.nn.functional as F
-from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
+from ...loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
 from ...models.attention_processor import (
     AttnProcessor2_0,
     LoRAAttnProcessor2_0,
@@ -42,7 +53,7 @@ from ...utils import (
     unscale_lora_layers,
 )
 from ...utils.torch_utils import is_compiled_module, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 from .multicontrolnet import MultiControlNetModel

@@ -140,7 +151,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):


 class StableDiffusionXLControlNetInpaintPipeline(
-    DiffusionPipeline, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin
 ):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -152,6 +163,7 @@
         - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
         - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
         - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

     Args:
         vae ([`AutoencoderKL`]):
@@ -195,6 +207,8 @@
         requires_aesthetics_score: bool = False,
         force_zeros_for_empty_prompt: bool = True,
         add_watermarker: Optional[bool] = None,
+        feature_extractor: Optional[CLIPImageProcessor] = None,
+        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
     ):
         super().__init__()

@@ -210,6 +224,8 @@
             unet=unet,
             controlnet=controlnet,
             scheduler=scheduler,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
         )
         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
         self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
@@ -229,39 +245,6 @@
         else:
             self.watermark = None

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
-    def enable_vae_slicing(self):
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.vae.enable_slicing()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
-    def disable_vae_slicing(self):
-        r"""
-        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_slicing()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
-    def enable_vae_tiling(self):
-        r"""
-        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
-        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
-        processing larger images.
-        """
-        self.vae.enable_tiling()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
-    def disable_vae_tiling(self):
-        r"""
-        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_tiling()
-
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
     def encode_prompt(
         self,
@@ -358,7 +341,7 @@
         prompt_2 = prompt_2 or prompt
         prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

-        # textual inversion: procecss multi-vector tokens if necessary
+        # textual inversion: process multi-vector tokens if necessary
         prompt_embeds_list = []
         prompts = [prompt, prompt_2]
         for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
@@ -497,6 +480,83 @@

         return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
+    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        if output_hidden_states:
+            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_enc_hidden_states = self.image_encoder(
+                torch.zeros_like(image), output_hidden_states=True
+            ).hidden_states[-2]
+            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
+                num_images_per_prompt, dim=0
+            )
+            return image_enc_hidden_states, uncond_image_enc_hidden_states
+        else:
+            image_embeds = self.image_encoder(image).image_embeds
+            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+            uncond_image_embeds = torch.zeros_like(image_embeds)
+
+            return image_embeds, uncond_image_embeds
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            repeat_dims = [1]
+            image_embeds = []
+            for single_image_embeds in ip_adapter_image_embeds:
+                if do_classifier_free_guidance:
+                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                    single_negative_image_embeds = single_negative_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
+                    )
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                else:
+                    single_image_embeds = single_image_embeds.repeat(
+                        num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
+                    )
+                image_embeds.append(single_image_embeds)
+
+        return image_embeds
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
     def prepare_extra_step_kwargs(self, generator, eta):
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -566,6 +626,8 @@
         negative_prompt_2=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,
+        ip_adapter_image=None,
+        ip_adapter_image_embeds=None,
         pooled_prompt_embeds=None,
         negative_pooled_prompt_embeds=None,
         controlnet_conditioning_scale=1.0,
@@ -752,6 +814,21 @@
             if end > 1.0:
                 raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")

+        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
+            )
+
+        if ip_adapter_image_embeds is not None:
+            if not isinstance(ip_adapter_image_embeds, list):
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
+                )
+            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
+                raise ValueError(
+                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
+                )
+
     def prepare_control_image(
         self,
         image,
@@ -1021,34 +1098,6 @@
         self.vae.decoder.conv_in.to(dtype)
         self.vae.decoder.mid_block.to(dtype)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
-    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
-        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
-        The suffixes after the scaling factors represent the stages where they are being applied.
-
-        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
-        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
-        Args:
-            s1 (`float`):
-                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            s2 (`float`):
-                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
-            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
-        """
-        if not hasattr(self, "unet"):
-            raise ValueError("The pipeline must have `unet` for using FreeU.")
-        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
-    def disable_freeu(self):
-        """Disables the FreeU mechanism if enabled."""
-        self.unet.disable_freeu()
-
     @property
     def guidance_scale(self):
         return self._guidance_scale
@@ -1100,6 +1149,8 @@
         latents: Optional[torch.FloatTensor] = None,
         prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
@@ -1194,6 +1245,12 @@
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
@@ -1326,6 +1383,8 @@
             negative_prompt_2,
             prompt_embeds,
             negative_prompt_embeds,
+            ip_adapter_image,
+            ip_adapter_image_embeds,
             pooled_prompt_embeds,
             negative_pooled_prompt_embeds,
             controlnet_conditioning_scale,
@@ -1378,13 +1437,26 @@
             clip_skip=self.clip_skip,
         )

+        # 3.1 Encode ip_adapter_image
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+
         # 4. set timesteps
         def denoising_value_valid(dnv):
-            return isinstance(denoising_end, float) and 0 < dnv < 1
+            return isinstance(dnv, float) and 0 < dnv < 1

         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps, num_inference_steps = self.get_timesteps(
-            num_inference_steps, strength, device, denoising_start=denoising_start if denoising_value_valid else None
+            num_inference_steps,
+            strength,
+            device,
+            denoising_start=denoising_start if denoising_value_valid(denoising_start) else None,
         )
         # check that number of inference steps is not < 1 - as this doesn't make sense
         if num_inference_steps < 1:
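The `denoising_value_valid` edit fixes two genuine bugs at once: the helper tested the enclosing `denoising_end` variable instead of its own argument, and the call site never actually called it, passing the bare function object, which is always truthy. A quick check of the corrected predicate:

```python
def denoising_value_valid(dnv):
    return isinstance(dnv, float) and 0 < dnv < 1

# Old call site: `x if denoising_value_valid else None` tested the function
# object itself, which is always truthy, so the guard never filtered anything.
assert bool(denoising_value_valid) is True

# New call site evaluates the predicate on the value.
assert denoising_value_valid(0.8) is True
assert denoising_value_valid(None) is False
assert denoising_value_valid(1.0) is False  # must lie strictly inside (0, 1)
```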
@@ -1649,6 +1721,9 @@
                 down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
                 mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])

+            if ip_adapter_image is not None:
+                added_cond_kwargs["image_embeds"] = image_embeds
+
             if num_channels_unet == 9:
                 latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
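With `IPAdapterMixin` plus the optional `feature_extractor` and `image_encoder` components wired into the constructor, the SDXL ControlNet inpaint pipeline gains IP-Adapter support for the first time in this release. A hedged end-to-end sketch; the checkpoint names and blank placeholder images are illustrative assumptions:

```python
import torch
from PIL import Image
from diffusers import ControlNetModel, StableDiffusionXLControlNetInpaintPipeline

# Assumed checkpoints, not mandated by the diff.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
)
pipe = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

# Newly available on this pipeline via IPAdapterMixin; load_ip_adapter also
# pulls in an image encoder when the pipeline was created without one.
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

init_image = Image.new("RGB", (1024, 1024))     # placeholder inputs
mask_image = Image.new("L", (1024, 1024), 255)
control_image = Image.new("RGB", (1024, 1024))
style_image = Image.new("RGB", (512, 512))      # placeholder style reference

result = pipe(
    prompt="a photo of a modern living room",
    image=init_image,
    mask_image=mask_image,
    control_image=control_image,
    ip_adapter_image=style_image,
    num_inference_steps=30,
).images[0]
```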