diffusers 0.26.2__py3-none-any.whl → 0.27.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (299) hide show
  1. diffusers/__init__.py +20 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/configuration_utils.py +7 -3
  7. diffusers/dependency_versions_check.py +1 -1
  8. diffusers/dependency_versions_table.py +2 -2
  9. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  10. diffusers/image_processor.py +110 -4
  11. diffusers/loaders/autoencoder.py +28 -8
  12. diffusers/loaders/controlnet.py +17 -8
  13. diffusers/loaders/ip_adapter.py +86 -23
  14. diffusers/loaders/lora.py +105 -310
  15. diffusers/loaders/lora_conversion_utils.py +1 -1
  16. diffusers/loaders/peft.py +1 -1
  17. diffusers/loaders/single_file.py +51 -12
  18. diffusers/loaders/single_file_utils.py +278 -49
  19. diffusers/loaders/textual_inversion.py +23 -4
  20. diffusers/loaders/unet.py +195 -41
  21. diffusers/loaders/utils.py +1 -1
  22. diffusers/models/__init__.py +3 -1
  23. diffusers/models/activations.py +9 -9
  24. diffusers/models/attention.py +26 -36
  25. diffusers/models/attention_flax.py +1 -1
  26. diffusers/models/attention_processor.py +171 -114
  27. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  28. diffusers/models/autoencoders/autoencoder_kl.py +3 -1
  29. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  30. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  31. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  32. diffusers/models/autoencoders/vae.py +1 -1
  33. diffusers/models/controlnet.py +1 -1
  34. diffusers/models/controlnet_flax.py +1 -1
  35. diffusers/models/downsampling.py +8 -12
  36. diffusers/models/dual_transformer_2d.py +1 -1
  37. diffusers/models/embeddings.py +3 -4
  38. diffusers/models/embeddings_flax.py +1 -1
  39. diffusers/models/lora.py +33 -10
  40. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  41. diffusers/models/modeling_flax_utils.py +1 -1
  42. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  43. diffusers/models/modeling_utils.py +4 -6
  44. diffusers/models/normalization.py +1 -1
  45. diffusers/models/resnet.py +31 -58
  46. diffusers/models/resnet_flax.py +1 -1
  47. diffusers/models/t5_film_transformer.py +1 -1
  48. diffusers/models/transformer_2d.py +1 -1
  49. diffusers/models/transformer_temporal.py +1 -1
  50. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  51. diffusers/models/transformers/t5_film_transformer.py +1 -1
  52. diffusers/models/transformers/transformer_2d.py +29 -31
  53. diffusers/models/transformers/transformer_temporal.py +1 -1
  54. diffusers/models/unet_1d.py +1 -1
  55. diffusers/models/unet_1d_blocks.py +1 -1
  56. diffusers/models/unet_2d.py +1 -1
  57. diffusers/models/unet_2d_blocks.py +1 -1
  58. diffusers/models/unet_2d_condition.py +1 -1
  59. diffusers/models/unets/__init__.py +1 -0
  60. diffusers/models/unets/unet_1d.py +1 -1
  61. diffusers/models/unets/unet_1d_blocks.py +1 -1
  62. diffusers/models/unets/unet_2d.py +4 -4
  63. diffusers/models/unets/unet_2d_blocks.py +238 -98
  64. diffusers/models/unets/unet_2d_blocks_flax.py +1 -1
  65. diffusers/models/unets/unet_2d_condition.py +420 -323
  66. diffusers/models/unets/unet_2d_condition_flax.py +21 -12
  67. diffusers/models/unets/unet_3d_blocks.py +50 -40
  68. diffusers/models/unets/unet_3d_condition.py +47 -8
  69. diffusers/models/unets/unet_i2vgen_xl.py +75 -30
  70. diffusers/models/unets/unet_kandinsky3.py +1 -1
  71. diffusers/models/unets/unet_motion_model.py +48 -8
  72. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  73. diffusers/models/unets/unet_stable_cascade.py +610 -0
  74. diffusers/models/unets/uvit_2d.py +1 -1
  75. diffusers/models/upsampling.py +10 -16
  76. diffusers/models/vae_flax.py +1 -1
  77. diffusers/models/vq_model.py +1 -1
  78. diffusers/optimization.py +1 -1
  79. diffusers/pipelines/__init__.py +26 -0
  80. diffusers/pipelines/amused/pipeline_amused.py +1 -1
  81. diffusers/pipelines/amused/pipeline_amused_img2img.py +1 -1
  82. diffusers/pipelines/amused/pipeline_amused_inpaint.py +1 -1
  83. diffusers/pipelines/animatediff/pipeline_animatediff.py +162 -417
  84. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +165 -137
  85. diffusers/pipelines/animatediff/pipeline_output.py +7 -6
  86. diffusers/pipelines/audioldm/pipeline_audioldm.py +3 -19
  87. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  88. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +3 -3
  89. diffusers/pipelines/auto_pipeline.py +7 -16
  90. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  93. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  94. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  95. diffusers/pipelines/controlnet/pipeline_controlnet.py +90 -90
  96. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  97. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +98 -90
  98. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +92 -90
  99. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +145 -70
  100. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +126 -89
  101. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +108 -96
  102. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  103. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +1 -1
  104. diffusers/pipelines/ddim/pipeline_ddim.py +1 -1
  105. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  106. diffusers/pipelines/deepfloyd_if/pipeline_if.py +4 -4
  107. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +4 -4
  108. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +5 -5
  109. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +4 -4
  110. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +5 -5
  111. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +5 -5
  112. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +10 -120
  113. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -91
  114. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  115. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +1 -1
  116. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  117. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +1 -1
  118. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  119. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  120. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  121. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  122. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  123. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  124. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +5 -4
  125. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +5 -4
  126. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +7 -22
  127. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -39
  128. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +5 -5
  129. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  130. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -22
  131. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  132. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  133. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -2
  134. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  135. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  136. diffusers/pipelines/free_init_utils.py +184 -0
  137. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +22 -104
  138. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +1 -1
  139. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  140. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +1 -1
  141. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +2 -2
  142. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -1
  143. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +1 -1
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -1
  145. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +1 -1
  146. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
  147. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  148. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +2 -2
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +104 -93
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +112 -74
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/ledits_pp/__init__.py +55 -0
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +1505 -0
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +1797 -0
  155. diffusers/pipelines/ledits_pp/pipeline_output.py +43 -0
  156. diffusers/pipelines/musicldm/pipeline_musicldm.py +3 -19
  157. diffusers/pipelines/onnx_utils.py +1 -1
  158. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  159. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +3 -3
  160. diffusers/pipelines/pia/pipeline_pia.py +168 -327
  161. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  162. diffusers/pipelines/pipeline_loading_utils.py +508 -0
  163. diffusers/pipelines/pipeline_utils.py +188 -534
  164. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +56 -10
  165. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +3 -3
  166. diffusers/pipelines/shap_e/camera.py +1 -1
  167. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  168. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  169. diffusers/pipelines/shap_e/renderer.py +1 -1
  170. diffusers/pipelines/stable_cascade/__init__.py +50 -0
  171. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +482 -0
  172. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +311 -0
  173. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +638 -0
  174. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  175. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +4 -1
  176. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  177. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +2 -2
  178. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  179. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +1 -1
  180. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +1 -1
  181. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -1
  182. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +1 -1
  183. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +90 -146
  184. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  185. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +4 -32
  186. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -119
  187. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +92 -119
  188. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +13 -59
  189. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +3 -31
  190. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -33
  191. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +5 -21
  192. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -21
  193. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  194. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  195. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  196. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +5 -21
  197. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +9 -38
  198. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -34
  199. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +6 -35
  200. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +7 -6
  201. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +4 -124
  202. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +282 -80
  203. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +94 -46
  204. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +3 -3
  205. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  206. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +6 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  208. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +96 -148
  209. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +98 -154
  210. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +98 -153
  211. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +25 -87
  212. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +89 -80
  213. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +5 -49
  214. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +80 -88
  215. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +8 -6
  216. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +15 -86
  217. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +20 -93
  218. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +5 -5
  219. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +3 -19
  220. diffusers/pipelines/unclip/pipeline_unclip.py +1 -1
  221. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +1 -1
  222. diffusers/pipelines/unclip/text_proj.py +1 -1
  223. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +35 -35
  224. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  225. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +4 -21
  226. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +2 -2
  227. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -5
  228. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  229. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +1 -1
  230. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +2 -2
  231. diffusers/schedulers/__init__.py +7 -1
  232. diffusers/schedulers/deprecated/scheduling_karras_ve.py +1 -1
  233. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  234. diffusers/schedulers/scheduling_consistency_models.py +42 -19
  235. diffusers/schedulers/scheduling_ddim.py +2 -4
  236. diffusers/schedulers/scheduling_ddim_flax.py +13 -5
  237. diffusers/schedulers/scheduling_ddim_inverse.py +2 -4
  238. diffusers/schedulers/scheduling_ddim_parallel.py +2 -4
  239. diffusers/schedulers/scheduling_ddpm.py +2 -4
  240. diffusers/schedulers/scheduling_ddpm_flax.py +1 -1
  241. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -4
  242. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +1 -1
  243. diffusers/schedulers/scheduling_deis_multistep.py +46 -19
  244. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -21
  245. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +1 -1
  246. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +9 -7
  247. diffusers/schedulers/scheduling_dpmsolver_sde.py +35 -35
  248. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +52 -21
  249. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +683 -0
  250. diffusers/schedulers/scheduling_edm_euler.py +381 -0
  251. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +43 -15
  252. diffusers/schedulers/scheduling_euler_discrete.py +42 -17
  253. diffusers/schedulers/scheduling_euler_discrete_flax.py +1 -1
  254. diffusers/schedulers/scheduling_heun_discrete.py +35 -35
  255. diffusers/schedulers/scheduling_ipndm.py +37 -11
  256. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +44 -44
  257. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +44 -44
  258. diffusers/schedulers/scheduling_karras_ve_flax.py +1 -1
  259. diffusers/schedulers/scheduling_lcm.py +38 -14
  260. diffusers/schedulers/scheduling_lms_discrete.py +43 -15
  261. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  262. diffusers/schedulers/scheduling_pndm.py +2 -4
  263. diffusers/schedulers/scheduling_pndm_flax.py +2 -4
  264. diffusers/schedulers/scheduling_repaint.py +1 -1
  265. diffusers/schedulers/scheduling_sasolver.py +41 -9
  266. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  267. diffusers/schedulers/scheduling_sde_ve_flax.py +1 -1
  268. diffusers/schedulers/scheduling_tcd.py +686 -0
  269. diffusers/schedulers/scheduling_unclip.py +1 -1
  270. diffusers/schedulers/scheduling_unipc_multistep.py +46 -19
  271. diffusers/schedulers/scheduling_utils.py +2 -1
  272. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  273. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  274. diffusers/training_utils.py +9 -2
  275. diffusers/utils/__init__.py +2 -1
  276. diffusers/utils/accelerate_utils.py +1 -1
  277. diffusers/utils/constants.py +1 -1
  278. diffusers/utils/doc_utils.py +1 -1
  279. diffusers/utils/dummy_pt_objects.py +60 -0
  280. diffusers/utils/dummy_torch_and_transformers_objects.py +75 -0
  281. diffusers/utils/dynamic_modules_utils.py +1 -1
  282. diffusers/utils/export_utils.py +3 -3
  283. diffusers/utils/hub_utils.py +60 -16
  284. diffusers/utils/import_utils.py +15 -1
  285. diffusers/utils/loading_utils.py +2 -0
  286. diffusers/utils/logging.py +1 -1
  287. diffusers/utils/model_card_template.md +24 -0
  288. diffusers/utils/outputs.py +14 -7
  289. diffusers/utils/peft_utils.py +1 -1
  290. diffusers/utils/state_dict_utils.py +1 -1
  291. diffusers/utils/testing_utils.py +2 -0
  292. diffusers/utils/torch_utils.py +1 -1
  293. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/METADATA +5 -5
  294. diffusers-0.27.0.dist-info/RECORD +399 -0
  295. diffusers-0.26.2.dist-info/RECORD +0 -384
  296. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/LICENSE +0 -0
  297. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/WHEEL +0 -0
  298. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/entry_points.txt +0 -0
  299. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/top_level.txt +0 -0
@@ -90,7 +90,7 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
90
90
  layers_per_block: Union[int, Tuple[int]] = 2,
91
91
  cross_attention_dim: Union[int, Tuple[int]] = 1024,
92
92
  transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
93
- num_attention_heads: Union[int, Tuple[int]] = (5, 10, 10, 20),
93
+ num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20),
94
94
  num_frames: int = 25,
95
95
  ):
96
96
  super().__init__()
@@ -0,0 +1,610 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from dataclasses import dataclass
17
+ from typing import Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from ...configuration_utils import ConfigMixin, register_to_config
24
+ from ...loaders.unet import FromOriginalUNetMixin
25
+ from ...utils import BaseOutput
26
+ from ..attention_processor import Attention
27
+ from ..modeling_utils import ModelMixin
28
+
29
+
30
+ # Copied from diffusers.pipelines.wuerstchen.modeling_wuerstchen_common.WuerstchenLayerNorm with WuerstchenLayerNorm -> SDCascadeLayerNorm
31
+ class SDCascadeLayerNorm(nn.LayerNorm):
32
+ def __init__(self, *args, **kwargs):
33
+ super().__init__(*args, **kwargs)
34
+
35
+ def forward(self, x):
36
+ x = x.permute(0, 2, 3, 1)
37
+ x = super().forward(x)
38
+ return x.permute(0, 3, 1, 2)
39
+
40
+
41
+ class SDCascadeTimestepBlock(nn.Module):
42
+ def __init__(self, c, c_timestep, conds=[]):
43
+ super().__init__()
44
+ linear_cls = nn.Linear
45
+ self.mapper = linear_cls(c_timestep, c * 2)
46
+ self.conds = conds
47
+ for cname in conds:
48
+ setattr(self, f"mapper_{cname}", linear_cls(c_timestep, c * 2))
49
+
50
+ def forward(self, x, t):
51
+ t = t.chunk(len(self.conds) + 1, dim=1)
52
+ a, b = self.mapper(t[0])[:, :, None, None].chunk(2, dim=1)
53
+ for i, c in enumerate(self.conds):
54
+ ac, bc = getattr(self, f"mapper_{c}")(t[i + 1])[:, :, None, None].chunk(2, dim=1)
55
+ a, b = a + ac, b + bc
56
+ return x * (1 + a) + b
57
+
58
+
59
+ class SDCascadeResBlock(nn.Module):
60
+ def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0):
61
+ super().__init__()
62
+ self.depthwise = nn.Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c)
63
+ self.norm = SDCascadeLayerNorm(c, elementwise_affine=False, eps=1e-6)
64
+ self.channelwise = nn.Sequential(
65
+ nn.Linear(c + c_skip, c * 4),
66
+ nn.GELU(),
67
+ GlobalResponseNorm(c * 4),
68
+ nn.Dropout(dropout),
69
+ nn.Linear(c * 4, c),
70
+ )
71
+
72
+ def forward(self, x, x_skip=None):
73
+ x_res = x
74
+ x = self.norm(self.depthwise(x))
75
+ if x_skip is not None:
76
+ x = torch.cat([x, x_skip], dim=1)
77
+ x = self.channelwise(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
78
+ return x + x_res
79
+
80
+
81
+ # from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105
82
+ class GlobalResponseNorm(nn.Module):
83
+ def __init__(self, dim):
84
+ super().__init__()
85
+ self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
86
+ self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
87
+
88
+ def forward(self, x):
89
+ agg_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
90
+ stand_div_norm = agg_norm / (agg_norm.mean(dim=-1, keepdim=True) + 1e-6)
91
+ return self.gamma * (x * stand_div_norm) + self.beta + x
92
+
93
+
94
+ class SDCascadeAttnBlock(nn.Module):
95
+ def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0):
96
+ super().__init__()
97
+ linear_cls = nn.Linear
98
+
99
+ self.self_attn = self_attn
100
+ self.norm = SDCascadeLayerNorm(c, elementwise_affine=False, eps=1e-6)
101
+ self.attention = Attention(query_dim=c, heads=nhead, dim_head=c // nhead, dropout=dropout, bias=True)
102
+ self.kv_mapper = nn.Sequential(nn.SiLU(), linear_cls(c_cond, c))
103
+
104
+ def forward(self, x, kv):
105
+ kv = self.kv_mapper(kv)
106
+ norm_x = self.norm(x)
107
+ if self.self_attn:
108
+ batch_size, channel, _, _ = x.shape
109
+ kv = torch.cat([norm_x.view(batch_size, channel, -1).transpose(1, 2), kv], dim=1)
110
+ x = x + self.attention(norm_x, encoder_hidden_states=kv)
111
+ return x
112
+
113
+
114
+ class UpDownBlock2d(nn.Module):
115
+ def __init__(self, in_channels, out_channels, mode, enabled=True):
116
+ super().__init__()
117
+ if mode not in ["up", "down"]:
118
+ raise ValueError(f"{mode} not supported")
119
+ interpolation = (
120
+ nn.Upsample(scale_factor=2 if mode == "up" else 0.5, mode="bilinear", align_corners=True)
121
+ if enabled
122
+ else nn.Identity()
123
+ )
124
+ mapping = nn.Conv2d(in_channels, out_channels, kernel_size=1)
125
+ self.blocks = nn.ModuleList([interpolation, mapping] if mode == "up" else [mapping, interpolation])
126
+
127
+ def forward(self, x):
128
+ for block in self.blocks:
129
+ x = block(x)
130
+ return x
131
+
132
+
133
+ @dataclass
134
+ class StableCascadeUNetOutput(BaseOutput):
135
+ sample: torch.FloatTensor = None
136
+
137
+
138
+ class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalUNetMixin):
139
+ _supports_gradient_checkpointing = True
140
+
141
+ @register_to_config
142
+ def __init__(
143
+ self,
144
+ in_channels: int = 16,
145
+ out_channels: int = 16,
146
+ timestep_ratio_embedding_dim: int = 64,
147
+ patch_size: int = 1,
148
+ conditioning_dim: int = 2048,
149
+ block_out_channels: Tuple[int] = (2048, 2048),
150
+ num_attention_heads: Tuple[int] = (32, 32),
151
+ down_num_layers_per_block: Tuple[int] = (8, 24),
152
+ up_num_layers_per_block: Tuple[int] = (24, 8),
153
+ down_blocks_repeat_mappers: Optional[Tuple[int]] = (
154
+ 1,
155
+ 1,
156
+ ),
157
+ up_blocks_repeat_mappers: Optional[Tuple[int]] = (1, 1),
158
+ block_types_per_layer: Tuple[Tuple[str]] = (
159
+ ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
160
+ ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
161
+ ),
162
+ clip_text_in_channels: Optional[int] = None,
163
+ clip_text_pooled_in_channels=1280,
164
+ clip_image_in_channels: Optional[int] = None,
165
+ clip_seq=4,
166
+ effnet_in_channels: Optional[int] = None,
167
+ pixel_mapper_in_channels: Optional[int] = None,
168
+ kernel_size=3,
169
+ dropout: Union[float, Tuple[float]] = (0.1, 0.1),
170
+ self_attn: Union[bool, Tuple[bool]] = True,
171
+ timestep_conditioning_type: Tuple[str] = ("sca", "crp"),
172
+ switch_level: Optional[Tuple[bool]] = None,
173
+ ):
174
+ """
175
+
176
+ Parameters:
177
+ in_channels (`int`, defaults to 16):
178
+ Number of channels in the input sample.
179
+ out_channels (`int`, defaults to 16):
180
+ Number of channels in the output sample.
181
+ timestep_ratio_embedding_dim (`int`, defaults to 64):
182
+ Dimension of the projected time embedding.
183
+ patch_size (`int`, defaults to 1):
184
+ Patch size to use for pixel unshuffling layer
185
+ conditioning_dim (`int`, defaults to 2048):
186
+ Dimension of the image and text conditional embedding.
187
+ block_out_channels (Tuple[int], defaults to (2048, 2048)):
188
+ Tuple of output channels for each block.
189
+ num_attention_heads (Tuple[int], defaults to (32, 32)):
190
+ Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have attention.
191
+ down_num_layers_per_block (Tuple[int], defaults to [8, 24]):
192
+ Number of layers in each down block.
193
+ up_num_layers_per_block (Tuple[int], defaults to [24, 8]):
194
+ Number of layers in each up block.
195
+ down_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]):
196
+ Number of 1x1 Convolutional layers to repeat in each down block.
197
+ up_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]):
198
+ Number of 1x1 Convolutional layers to repeat in each up block.
199
+ block_types_per_layer (Tuple[Tuple[str]], optional,
200
+ defaults to (
201
+ ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"),
202
+ ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock")
203
+ ):
204
+ Block types used in each layer of the up/down blocks.
205
+ clip_text_in_channels (`int`, *optional*, defaults to `None`):
206
+ Number of input channels for CLIP based text conditioning.
207
+ clip_text_pooled_in_channels (`int`, *optional*, defaults to 1280):
208
+ Number of input channels for pooled CLIP text embeddings.
209
+ clip_image_in_channels (`int`, *optional*):
210
+ Number of input channels for CLIP based image conditioning.
211
+ clip_seq (`int`, *optional*, defaults to 4):
212
+ effnet_in_channels (`int`, *optional*, defaults to `None`):
213
+ Number of input channels for effnet conditioning.
214
+ pixel_mapper_in_channels (`int`, defaults to `None`):
215
+ Number of input channels for pixel mapper conditioning.
216
+ kernel_size (`int`, *optional*, defaults to 3):
217
+ Kernel size to use in the block convolutional layers.
218
+ dropout (Tuple[float], *optional*, defaults to (0.1, 0.1)):
219
+ Dropout to use per block.
220
+ self_attn (Union[bool, Tuple[bool]]):
221
+ Tuple of booleans that determine whether to use self attention in a block or not.
222
+ timestep_conditioning_type (Tuple[str], defaults to ("sca", "crp")):
223
+ Timestep conditioning type.
224
+ switch_level (Optional[Tuple[bool]], *optional*, defaults to `None`):
225
+ Tuple that indicates whether upsampling or downsampling should be applied in a block
226
+ """
227
+
228
+ super().__init__()
229
+
230
+ if len(block_out_channels) != len(down_num_layers_per_block):
231
+ raise ValueError(
232
+ f"Number of elements in `down_num_layers_per_block` must match the length of `block_out_channels`: {len(block_out_channels)}"
233
+ )
234
+
235
+ elif len(block_out_channels) != len(up_num_layers_per_block):
236
+ raise ValueError(
237
+ f"Number of elements in `up_num_layers_per_block` must match the length of `block_out_channels`: {len(block_out_channels)}"
238
+ )
239
+
240
+ elif len(block_out_channels) != len(down_blocks_repeat_mappers):
241
+ raise ValueError(
242
+ f"Number of elements in `down_blocks_repeat_mappers` must match the length of `block_out_channels`: {len(block_out_channels)}"
243
+ )
244
+
245
+ elif len(block_out_channels) != len(up_blocks_repeat_mappers):
246
+ raise ValueError(
247
+ f"Number of elements in `up_blocks_repeat_mappers` must match the length of `block_out_channels`: {len(block_out_channels)}"
248
+ )
249
+
250
+ elif len(block_out_channels) != len(block_types_per_layer):
251
+ raise ValueError(
252
+ f"Number of elements in `block_types_per_layer` must match the length of `block_out_channels`: {len(block_out_channels)}"
253
+ )
254
+
255
+ if isinstance(dropout, float):
256
+ dropout = (dropout,) * len(block_out_channels)
257
+ if isinstance(self_attn, bool):
258
+ self_attn = (self_attn,) * len(block_out_channels)
259
+
260
+ # CONDITIONING
261
+ if effnet_in_channels is not None:
262
+ self.effnet_mapper = nn.Sequential(
263
+ nn.Conv2d(effnet_in_channels, block_out_channels[0] * 4, kernel_size=1),
264
+ nn.GELU(),
265
+ nn.Conv2d(block_out_channels[0] * 4, block_out_channels[0], kernel_size=1),
266
+ SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
267
+ )
268
+ if pixel_mapper_in_channels is not None:
269
+ self.pixels_mapper = nn.Sequential(
270
+ nn.Conv2d(pixel_mapper_in_channels, block_out_channels[0] * 4, kernel_size=1),
271
+ nn.GELU(),
272
+ nn.Conv2d(block_out_channels[0] * 4, block_out_channels[0], kernel_size=1),
273
+ SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
274
+ )
275
+
276
+ self.clip_txt_pooled_mapper = nn.Linear(clip_text_pooled_in_channels, conditioning_dim * clip_seq)
277
+ if clip_text_in_channels is not None:
278
+ self.clip_txt_mapper = nn.Linear(clip_text_in_channels, conditioning_dim)
279
+ if clip_image_in_channels is not None:
280
+ self.clip_img_mapper = nn.Linear(clip_image_in_channels, conditioning_dim * clip_seq)
281
+ self.clip_norm = nn.LayerNorm(conditioning_dim, elementwise_affine=False, eps=1e-6)
282
+
283
+ self.embedding = nn.Sequential(
284
+ nn.PixelUnshuffle(patch_size),
285
+ nn.Conv2d(in_channels * (patch_size**2), block_out_channels[0], kernel_size=1),
286
+ SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
287
+ )
288
+
289
+ def get_block(block_type, in_channels, nhead, c_skip=0, dropout=0, self_attn=True):
290
+ if block_type == "SDCascadeResBlock":
291
+ return SDCascadeResBlock(in_channels, c_skip, kernel_size=kernel_size, dropout=dropout)
292
+ elif block_type == "SDCascadeAttnBlock":
293
+ return SDCascadeAttnBlock(in_channels, conditioning_dim, nhead, self_attn=self_attn, dropout=dropout)
294
+ elif block_type == "SDCascadeTimestepBlock":
295
+ return SDCascadeTimestepBlock(
296
+ in_channels, timestep_ratio_embedding_dim, conds=timestep_conditioning_type
297
+ )
298
+ else:
299
+ raise ValueError(f"Block type {block_type} not supported")
300
+
301
+ # BLOCKS
302
+ # -- down blocks
303
+ self.down_blocks = nn.ModuleList()
304
+ self.down_downscalers = nn.ModuleList()
305
+ self.down_repeat_mappers = nn.ModuleList()
306
+ for i in range(len(block_out_channels)):
307
+ if i > 0:
308
+ self.down_downscalers.append(
309
+ nn.Sequential(
310
+ SDCascadeLayerNorm(block_out_channels[i - 1], elementwise_affine=False, eps=1e-6),
311
+ UpDownBlock2d(
312
+ block_out_channels[i - 1], block_out_channels[i], mode="down", enabled=switch_level[i - 1]
313
+ )
314
+ if switch_level is not None
315
+ else nn.Conv2d(block_out_channels[i - 1], block_out_channels[i], kernel_size=2, stride=2),
316
+ )
317
+ )
318
+ else:
319
+ self.down_downscalers.append(nn.Identity())
320
+
321
+ down_block = nn.ModuleList()
322
+ for _ in range(down_num_layers_per_block[i]):
323
+ for block_type in block_types_per_layer[i]:
324
+ block = get_block(
325
+ block_type,
326
+ block_out_channels[i],
327
+ num_attention_heads[i],
328
+ dropout=dropout[i],
329
+ self_attn=self_attn[i],
330
+ )
331
+ down_block.append(block)
332
+ self.down_blocks.append(down_block)
333
+
334
+ if down_blocks_repeat_mappers is not None:
335
+ block_repeat_mappers = nn.ModuleList()
336
+ for _ in range(down_blocks_repeat_mappers[i] - 1):
337
+ block_repeat_mappers.append(nn.Conv2d(block_out_channels[i], block_out_channels[i], kernel_size=1))
338
+ self.down_repeat_mappers.append(block_repeat_mappers)
339
+
340
+ # -- up blocks
341
+ self.up_blocks = nn.ModuleList()
342
+ self.up_upscalers = nn.ModuleList()
343
+ self.up_repeat_mappers = nn.ModuleList()
344
+ for i in reversed(range(len(block_out_channels))):
345
+ if i > 0:
346
+ self.up_upscalers.append(
347
+ nn.Sequential(
348
+ SDCascadeLayerNorm(block_out_channels[i], elementwise_affine=False, eps=1e-6),
349
+ UpDownBlock2d(
350
+ block_out_channels[i], block_out_channels[i - 1], mode="up", enabled=switch_level[i - 1]
351
+ )
352
+ if switch_level is not None
353
+ else nn.ConvTranspose2d(
354
+ block_out_channels[i], block_out_channels[i - 1], kernel_size=2, stride=2
355
+ ),
356
+ )
357
+ )
358
+ else:
359
+ self.up_upscalers.append(nn.Identity())
360
+
361
+ up_block = nn.ModuleList()
362
+ for j in range(up_num_layers_per_block[::-1][i]):
363
+ for k, block_type in enumerate(block_types_per_layer[i]):
364
+ c_skip = block_out_channels[i] if i < len(block_out_channels) - 1 and j == k == 0 else 0
365
+ block = get_block(
366
+ block_type,
367
+ block_out_channels[i],
368
+ num_attention_heads[i],
369
+ c_skip=c_skip,
370
+ dropout=dropout[i],
371
+ self_attn=self_attn[i],
372
+ )
373
+ up_block.append(block)
374
+ self.up_blocks.append(up_block)
375
+
376
+ if up_blocks_repeat_mappers is not None:
377
+ block_repeat_mappers = nn.ModuleList()
378
+ for _ in range(up_blocks_repeat_mappers[::-1][i] - 1):
379
+ block_repeat_mappers.append(nn.Conv2d(block_out_channels[i], block_out_channels[i], kernel_size=1))
380
+ self.up_repeat_mappers.append(block_repeat_mappers)
381
+
382
+ # OUTPUT
383
+ self.clf = nn.Sequential(
384
+ SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6),
385
+ nn.Conv2d(block_out_channels[0], out_channels * (patch_size**2), kernel_size=1),
386
+ nn.PixelShuffle(patch_size),
387
+ )
388
+
389
+ self.gradient_checkpointing = False
390
+
391
+ def _set_gradient_checkpointing(self, value=False):
392
+ self.gradient_checkpointing = value
393
+
394
+ def _init_weights(self, m):
395
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
396
+ torch.nn.init.xavier_uniform_(m.weight)
397
+ if m.bias is not None:
398
+ nn.init.constant_(m.bias, 0)
399
+
400
+ nn.init.normal_(self.clip_txt_pooled_mapper.weight, std=0.02)
401
+ nn.init.normal_(self.clip_txt_mapper.weight, std=0.02) if hasattr(self, "clip_txt_mapper") else None
402
+ nn.init.normal_(self.clip_img_mapper.weight, std=0.02) if hasattr(self, "clip_img_mapper") else None
403
+
404
+ if hasattr(self, "effnet_mapper"):
405
+ nn.init.normal_(self.effnet_mapper[0].weight, std=0.02) # conditionings
406
+ nn.init.normal_(self.effnet_mapper[2].weight, std=0.02) # conditionings
407
+
408
+ if hasattr(self, "pixels_mapper"):
409
+ nn.init.normal_(self.pixels_mapper[0].weight, std=0.02) # conditionings
410
+ nn.init.normal_(self.pixels_mapper[2].weight, std=0.02) # conditionings
411
+
412
+ torch.nn.init.xavier_uniform_(self.embedding[1].weight, 0.02) # inputs
413
+ nn.init.constant_(self.clf[1].weight, 0) # outputs
414
+
415
+ # blocks
416
+ for level_block in self.down_blocks + self.up_blocks:
417
+ for block in level_block:
418
+ if isinstance(block, SDCascadeResBlock):
419
+ block.channelwise[-1].weight.data *= np.sqrt(1 / sum(self.config.blocks[0]))
420
+ elif isinstance(block, SDCascadeTimestepBlock):
421
+ nn.init.constant_(block.mapper.weight, 0)
422
+
423
def get_timestep_ratio_embedding(self, timestep_ratio, max_positions=10000):
    """Build a sinusoidal embedding for a batch of timestep ratios.

    Args:
        timestep_ratio: Tensor of ratios; it is indexed as ``[:, None]``, i.e. one value per batch element.
        max_positions: Factor the ratios are multiplied by, and the base of the geometric frequency schedule.

    Returns:
        Tensor with ``config.timestep_ratio_embedding_dim`` columns (sines first, then cosines, plus one
        zero-padded column when the dimension is odd), cast to the dtype of the scaled input.
    """
    embedding_dim = self.config.timestep_ratio_embedding_dim
    half_dim = embedding_dim // 2
    scaled_ratio = timestep_ratio * max_positions

    log_step = math.log(max_positions) / (half_dim - 1)
    frequencies = torch.exp(torch.arange(half_dim, device=scaled_ratio.device).float().mul(-log_step))
    angles = scaled_ratio[:, None] * frequencies[None, :]
    embedding = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)

    if embedding_dim % 2 == 1:
        # Odd target width: append a single zero column on the right.
        embedding = nn.functional.pad(embedding, (0, 1), mode="constant")

    return embedding.to(dtype=scaled_ratio.dtype)
436
+
437
def get_clip_embeddings(self, clip_txt_pooled, clip_txt=None, clip_img=None):
    """Project the CLIP conditionings into the model's conditioning space and normalise them.

    Args:
        clip_txt_pooled: Pooled text embedding. A 2-D tensor is treated as a single token per sample and gets a
            token axis inserted at dim 1.
        clip_txt: Optional per-token text embedding, projected by ``self.clip_txt_mapper``.
        clip_img: Optional image embedding; a 2-D tensor also gets a token axis inserted at dim 1.

    Returns:
        ``self.clip_norm`` applied to either the concatenation ``[text, pooled text, image]`` along dim 1 (when
        both ``clip_txt`` and ``clip_img`` are given) or to the pooled-text projection alone. If only one of
        ``clip_txt`` / ``clip_img`` is supplied it is ignored.
    """
    if len(clip_txt_pooled.shape) == 2:
        # Rebind the input itself: the projection and the view below must both see the added token axis,
        # otherwise size(1) would be the feature width instead of the token count.
        clip_txt_pooled = clip_txt_pooled.unsqueeze(1)
    clip_txt_pool = self.clip_txt_pooled_mapper(clip_txt_pooled).view(
        clip_txt_pooled.size(0), clip_txt_pooled.size(1) * self.config.clip_seq, -1
    )

    if clip_txt is None or clip_img is None:
        return self.clip_norm(clip_txt_pool)

    clip_txt = self.clip_txt_mapper(clip_txt)
    if len(clip_img.shape) == 2:
        clip_img = clip_img.unsqueeze(1)
    clip_img = self.clip_img_mapper(clip_img).view(clip_img.size(0), clip_img.size(1) * self.config.clip_seq, -1)
    return self.clip_norm(torch.cat([clip_txt, clip_txt_pool, clip_img], dim=1))
454
+
455
+ def _down_encode(self, x, r_embed, clip):
456
+ level_outputs = []
457
+ block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
458
+
459
+ if self.training and self.gradient_checkpointing:
460
+
461
+ def create_custom_forward(module):
462
+ def custom_forward(*inputs):
463
+ return module(*inputs)
464
+
465
+ return custom_forward
466
+
467
+ for down_block, downscaler, repmap in block_group:
468
+ x = downscaler(x)
469
+ for i in range(len(repmap) + 1):
470
+ for block in down_block:
471
+ if isinstance(block, SDCascadeResBlock):
472
+ x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False)
473
+ elif isinstance(block, SDCascadeAttnBlock):
474
+ x = torch.utils.checkpoint.checkpoint(
475
+ create_custom_forward(block), x, clip, use_reentrant=False
476
+ )
477
+ elif isinstance(block, SDCascadeTimestepBlock):
478
+ x = torch.utils.checkpoint.checkpoint(
479
+ create_custom_forward(block), x, r_embed, use_reentrant=False
480
+ )
481
+ else:
482
+ x = x = torch.utils.checkpoint.checkpoint(
483
+ create_custom_forward(block), use_reentrant=False
484
+ )
485
+ if i < len(repmap):
486
+ x = repmap[i](x)
487
+ level_outputs.insert(0, x)
488
+ else:
489
+ for down_block, downscaler, repmap in block_group:
490
+ x = downscaler(x)
491
+ for i in range(len(repmap) + 1):
492
+ for block in down_block:
493
+ if isinstance(block, SDCascadeResBlock):
494
+ x = block(x)
495
+ elif isinstance(block, SDCascadeAttnBlock):
496
+ x = block(x, clip)
497
+ elif isinstance(block, SDCascadeTimestepBlock):
498
+ x = block(x, r_embed)
499
+ else:
500
+ x = block(x)
501
+ if i < len(repmap):
502
+ x = repmap[i](x)
503
+ level_outputs.insert(0, x)
504
+ return level_outputs
505
+
506
+ def _up_decode(self, level_outputs, r_embed, clip):
507
+ x = level_outputs[0]
508
+ block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
509
+
510
+ if self.training and self.gradient_checkpointing:
511
+
512
+ def create_custom_forward(module):
513
+ def custom_forward(*inputs):
514
+ return module(*inputs)
515
+
516
+ return custom_forward
517
+
518
+ for i, (up_block, upscaler, repmap) in enumerate(block_group):
519
+ for j in range(len(repmap) + 1):
520
+ for k, block in enumerate(up_block):
521
+ if isinstance(block, SDCascadeResBlock):
522
+ skip = level_outputs[i] if k == 0 and i > 0 else None
523
+ if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)):
524
+ x = torch.nn.functional.interpolate(
525
+ x.float(), skip.shape[-2:], mode="bilinear", align_corners=True
526
+ )
527
+ x = torch.utils.checkpoint.checkpoint(
528
+ create_custom_forward(block), x, skip, use_reentrant=False
529
+ )
530
+ elif isinstance(block, SDCascadeAttnBlock):
531
+ x = torch.utils.checkpoint.checkpoint(
532
+ create_custom_forward(block), x, clip, use_reentrant=False
533
+ )
534
+ elif isinstance(block, SDCascadeTimestepBlock):
535
+ x = torch.utils.checkpoint.checkpoint(
536
+ create_custom_forward(block), x, r_embed, use_reentrant=False
537
+ )
538
+ else:
539
+ x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False)
540
+ if j < len(repmap):
541
+ x = repmap[j](x)
542
+ x = upscaler(x)
543
+ else:
544
+ for i, (up_block, upscaler, repmap) in enumerate(block_group):
545
+ for j in range(len(repmap) + 1):
546
+ for k, block in enumerate(up_block):
547
+ if isinstance(block, SDCascadeResBlock):
548
+ skip = level_outputs[i] if k == 0 and i > 0 else None
549
+ if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)):
550
+ x = torch.nn.functional.interpolate(
551
+ x.float(), skip.shape[-2:], mode="bilinear", align_corners=True
552
+ )
553
+ x = block(x, skip)
554
+ elif isinstance(block, SDCascadeAttnBlock):
555
+ x = block(x, clip)
556
+ elif isinstance(block, SDCascadeTimestepBlock):
557
+ x = block(x, r_embed)
558
+ else:
559
+ x = block(x)
560
+ if j < len(repmap):
561
+ x = repmap[j](x)
562
+ x = upscaler(x)
563
+ return x
564
+
565
def forward(
    self,
    sample,
    timestep_ratio,
    clip_text_pooled,
    clip_text=None,
    clip_img=None,
    effnet=None,
    pixels=None,
    sca=None,
    crp=None,
    return_dict=True,
):
    """Run the network on ``sample`` for the given timestep ratio and conditionings.

    Args:
        sample: Input tensor fed to ``self.embedding``.
        timestep_ratio: Tensor of timestep ratios, embedded via ``get_timestep_ratio_embedding``.
        clip_text_pooled: Pooled CLIP text embedding (always required).
        clip_text: Optional CLIP text embedding; only used together with ``clip_img``.
        clip_img: Optional CLIP image embedding; only used together with ``clip_text``.
        effnet: Optional conditioning map, resized to the embedded sample and added through
            ``self.effnet_mapper`` when that mapper exists.
        pixels: Optional conditioning map for ``self.pixels_mapper``; defaults to zeros of shape
            ``(batch, 3, 8, 8)``.
        sca: Optional tensor for the ``"sca"`` timestep conditioning; zeros are used when omitted.
        crp: Optional tensor for the ``"crp"`` timestep conditioning; zeros are used when omitted.
        return_dict: When falsy, return a plain 1-tuple instead of ``StableCascadeUNetOutput``.

    Returns:
        ``StableCascadeUNetOutput(sample=...)`` or ``(sample,)`` depending on ``return_dict``.
    """
    if pixels is None:
        pixels = sample.new_zeros(sample.size(0), 3, 8, 8)

    # Process the conditioning embeddings
    extra_timestep_conds = {"sca": sca, "crp": crp}
    timestep_ratio_embed = self.get_timestep_ratio_embedding(timestep_ratio)
    for cond_name in self.config.timestep_conditioning_type:
        cond = extra_timestep_conds.get(cond_name)
        # Compare against None explicitly: the truth value of a multi-element tensor is ambiguous and raises.
        t_cond = cond if cond is not None else torch.zeros_like(timestep_ratio)
        timestep_ratio_embed = torch.cat([timestep_ratio_embed, self.get_timestep_ratio_embedding(t_cond)], dim=1)
    clip = self.get_clip_embeddings(clip_txt_pooled=clip_text_pooled, clip_txt=clip_text, clip_img=clip_img)

    # Model Blocks
    x = self.embedding(sample)
    if hasattr(self, "effnet_mapper") and effnet is not None:
        x = x + self.effnet_mapper(
            nn.functional.interpolate(effnet, size=x.shape[-2:], mode="bilinear", align_corners=True)
        )
    if hasattr(self, "pixels_mapper"):
        x = x + nn.functional.interpolate(
            self.pixels_mapper(pixels), size=x.shape[-2:], mode="bilinear", align_corners=True
        )
    level_outputs = self._down_encode(x, timestep_ratio_embed, clip)
    x = self._up_decode(level_outputs, timestep_ratio_embed, clip)
    sample = self.clf(x)

    if not return_dict:
        return (sample,)
    return StableCascadeUNetOutput(sample=sample)
@@ -1,5 +1,5 @@
1
1
  # coding=utf-8
2
- # Copyright 2023 The HuggingFace Inc. team.
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
5
  # you may not use this file except in compliance with the License.
@@ -1,4 +1,4 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -18,8 +18,7 @@ import torch
18
18
  import torch.nn as nn
19
19
  import torch.nn.functional as F
20
20
 
21
- from ..utils import USE_PEFT_BACKEND
22
- from .lora import LoRACompatibleConv
21
+ from ..utils import deprecate
23
22
  from .normalization import RMSNorm
24
23
 
25
24
 
@@ -111,7 +110,7 @@ class Upsample2D(nn.Module):
111
110
  self.use_conv_transpose = use_conv_transpose
112
111
  self.name = name
113
112
  self.interpolate = interpolate
114
- conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
113
+ conv_cls = nn.Conv2d
115
114
 
116
115
  if norm_type == "ln_norm":
117
116
  self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
@@ -141,11 +140,12 @@ class Upsample2D(nn.Module):
141
140
  self.Conv2d_0 = conv
142
141
 
143
142
  def forward(
144
- self,
145
- hidden_states: torch.FloatTensor,
146
- output_size: Optional[int] = None,
147
- scale: float = 1.0,
143
+ self, hidden_states: torch.FloatTensor, output_size: Optional[int] = None, *args, **kwargs
148
144
  ) -> torch.FloatTensor:
145
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
146
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
147
+ deprecate("scale", "1.0.0", deprecation_message)
148
+
149
149
  assert hidden_states.shape[1] == self.channels
150
150
 
151
151
  if self.norm is not None:
@@ -180,15 +180,9 @@ class Upsample2D(nn.Module):
180
180
  # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
181
181
  if self.use_conv:
182
182
  if self.name == "conv":
183
- if isinstance(self.conv, LoRACompatibleConv) and not USE_PEFT_BACKEND:
184
- hidden_states = self.conv(hidden_states, scale)
185
- else:
186
- hidden_states = self.conv(hidden_states)
183
+ hidden_states = self.conv(hidden_states)
187
184
  else:
188
- if isinstance(self.Conv2d_0, LoRACompatibleConv) and not USE_PEFT_BACKEND:
189
- hidden_states = self.Conv2d_0(hidden_states, scale)
190
- else:
191
- hidden_states = self.Conv2d_0(hidden_states)
185
+ hidden_states = self.Conv2d_0(hidden_states)
192
186
 
193
187
  return hidden_states
194
188