diffusers 0.27.2__py3-none-any.whl → 0.28.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. diffusers/__init__.py +26 -1
  2. diffusers/callbacks.py +156 -0
  3. diffusers/commands/env.py +110 -6
  4. diffusers/configuration_utils.py +33 -11
  5. diffusers/dependency_versions_table.py +2 -1
  6. diffusers/image_processor.py +158 -45
  7. diffusers/loaders/__init__.py +2 -5
  8. diffusers/loaders/autoencoder.py +4 -4
  9. diffusers/loaders/controlnet.py +4 -4
  10. diffusers/loaders/ip_adapter.py +80 -22
  11. diffusers/loaders/lora.py +134 -20
  12. diffusers/loaders/lora_conversion_utils.py +46 -43
  13. diffusers/loaders/peft.py +4 -3
  14. diffusers/loaders/single_file.py +401 -170
  15. diffusers/loaders/single_file_model.py +290 -0
  16. diffusers/loaders/single_file_utils.py +616 -672
  17. diffusers/loaders/textual_inversion.py +41 -20
  18. diffusers/loaders/unet.py +168 -115
  19. diffusers/loaders/unet_loader_utils.py +163 -0
  20. diffusers/models/__init__.py +8 -0
  21. diffusers/models/activations.py +23 -3
  22. diffusers/models/attention.py +10 -11
  23. diffusers/models/attention_processor.py +475 -148
  24. diffusers/models/autoencoders/autoencoder_asym_kl.py +14 -16
  25. diffusers/models/autoencoders/autoencoder_kl.py +18 -19
  26. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -11
  27. diffusers/models/autoencoders/autoencoder_tiny.py +16 -16
  28. diffusers/models/autoencoders/consistency_decoder_vae.py +36 -11
  29. diffusers/models/autoencoders/vae.py +23 -24
  30. diffusers/models/controlnet.py +12 -9
  31. diffusers/models/controlnet_flax.py +4 -4
  32. diffusers/models/controlnet_xs.py +1915 -0
  33. diffusers/models/downsampling.py +17 -18
  34. diffusers/models/embeddings.py +363 -32
  35. diffusers/models/model_loading_utils.py +177 -0
  36. diffusers/models/modeling_flax_pytorch_utils.py +2 -1
  37. diffusers/models/modeling_flax_utils.py +4 -4
  38. diffusers/models/modeling_outputs.py +14 -0
  39. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  40. diffusers/models/modeling_utils.py +175 -99
  41. diffusers/models/normalization.py +2 -1
  42. diffusers/models/resnet.py +18 -23
  43. diffusers/models/transformer_temporal.py +3 -3
  44. diffusers/models/transformers/__init__.py +3 -0
  45. diffusers/models/transformers/dit_transformer_2d.py +240 -0
  46. diffusers/models/transformers/dual_transformer_2d.py +4 -4
  47. diffusers/models/transformers/hunyuan_transformer_2d.py +427 -0
  48. diffusers/models/transformers/pixart_transformer_2d.py +336 -0
  49. diffusers/models/transformers/prior_transformer.py +7 -7
  50. diffusers/models/transformers/t5_film_transformer.py +17 -19
  51. diffusers/models/transformers/transformer_2d.py +292 -184
  52. diffusers/models/transformers/transformer_temporal.py +10 -10
  53. diffusers/models/unets/unet_1d.py +5 -5
  54. diffusers/models/unets/unet_1d_blocks.py +29 -29
  55. diffusers/models/unets/unet_2d.py +6 -6
  56. diffusers/models/unets/unet_2d_blocks.py +137 -128
  57. diffusers/models/unets/unet_2d_condition.py +19 -15
  58. diffusers/models/unets/unet_2d_condition_flax.py +6 -5
  59. diffusers/models/unets/unet_3d_blocks.py +79 -77
  60. diffusers/models/unets/unet_3d_condition.py +13 -9
  61. diffusers/models/unets/unet_i2vgen_xl.py +14 -13
  62. diffusers/models/unets/unet_kandinsky3.py +1 -1
  63. diffusers/models/unets/unet_motion_model.py +114 -14
  64. diffusers/models/unets/unet_spatio_temporal_condition.py +15 -14
  65. diffusers/models/unets/unet_stable_cascade.py +16 -13
  66. diffusers/models/upsampling.py +17 -20
  67. diffusers/models/vq_model.py +16 -15
  68. diffusers/pipelines/__init__.py +27 -3
  69. diffusers/pipelines/amused/pipeline_amused.py +12 -12
  70. diffusers/pipelines/amused/pipeline_amused_img2img.py +14 -12
  71. diffusers/pipelines/amused/pipeline_amused_inpaint.py +13 -11
  72. diffusers/pipelines/animatediff/__init__.py +2 -0
  73. diffusers/pipelines/animatediff/pipeline_animatediff.py +24 -46
  74. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +1284 -0
  75. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +82 -72
  76. diffusers/pipelines/animatediff/pipeline_output.py +3 -2
  77. diffusers/pipelines/audioldm/pipeline_audioldm.py +14 -14
  78. diffusers/pipelines/audioldm2/modeling_audioldm2.py +54 -35
  79. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +120 -36
  80. diffusers/pipelines/auto_pipeline.py +21 -17
  81. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  82. diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -5
  83. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +1 -1
  84. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  85. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +5 -5
  86. diffusers/pipelines/controlnet/multicontrolnet.py +4 -8
  87. diffusers/pipelines/controlnet/pipeline_controlnet.py +87 -52
  88. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  89. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +50 -43
  90. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +52 -40
  91. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +80 -47
  92. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +147 -49
  93. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +89 -55
  94. diffusers/pipelines/controlnet_xs/__init__.py +68 -0
  95. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +911 -0
  96. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +1115 -0
  97. diffusers/pipelines/deepfloyd_if/pipeline_if.py +14 -28
  98. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +18 -33
  99. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +21 -39
  100. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +20 -36
  101. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +23 -39
  102. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +17 -32
  103. diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +11 -11
  104. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +43 -20
  105. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +36 -18
  106. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +2 -2
  107. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +7 -7
  108. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +12 -12
  109. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +18 -18
  110. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +20 -15
  111. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +20 -15
  112. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +30 -25
  113. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +69 -59
  114. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +13 -13
  115. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +10 -5
  116. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +11 -6
  117. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +10 -5
  118. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +5 -5
  119. diffusers/pipelines/dit/pipeline_dit.py +7 -4
  120. diffusers/pipelines/free_init_utils.py +39 -38
  121. diffusers/pipelines/hunyuandit/__init__.py +48 -0
  122. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +881 -0
  123. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +33 -48
  124. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +8 -8
  125. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +23 -20
  126. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +11 -11
  127. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +12 -12
  128. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +10 -10
  129. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +6 -6
  130. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +32 -29
  131. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +10 -10
  132. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +10 -10
  133. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +6 -6
  134. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +8 -8
  135. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -7
  136. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +6 -6
  137. diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +3 -3
  138. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +20 -33
  139. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +24 -35
  140. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +48 -30
  141. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +50 -28
  142. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +11 -11
  143. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +61 -67
  144. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +70 -69
  145. diffusers/pipelines/ledits_pp/pipeline_output.py +2 -2
  146. diffusers/pipelines/marigold/__init__.py +50 -0
  147. diffusers/pipelines/marigold/marigold_image_processing.py +561 -0
  148. diffusers/pipelines/marigold/pipeline_marigold_depth.py +813 -0
  149. diffusers/pipelines/marigold/pipeline_marigold_normals.py +690 -0
  150. diffusers/pipelines/musicldm/pipeline_musicldm.py +14 -14
  151. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +17 -12
  152. diffusers/pipelines/pia/pipeline_pia.py +39 -125
  153. diffusers/pipelines/pipeline_flax_utils.py +4 -4
  154. diffusers/pipelines/pipeline_loading_utils.py +269 -23
  155. diffusers/pipelines/pipeline_utils.py +266 -37
  156. diffusers/pipelines/pixart_alpha/__init__.py +8 -1
  157. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +69 -79
  158. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +880 -0
  159. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +10 -5
  160. diffusers/pipelines/shap_e/pipeline_shap_e.py +3 -3
  161. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +14 -14
  162. diffusers/pipelines/shap_e/renderer.py +1 -1
  163. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +18 -18
  164. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +23 -19
  165. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +33 -32
  166. diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  167. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +18 -11
  168. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
  169. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +6 -6
  170. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +73 -39
  171. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +24 -17
  172. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +13 -8
  173. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +66 -36
  174. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +82 -46
  175. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +123 -28
  176. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +6 -6
  177. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +16 -16
  178. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +24 -19
  179. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +37 -31
  180. diffusers/pipelines/stable_diffusion/safety_checker.py +2 -1
  181. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +23 -15
  182. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +44 -39
  183. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +23 -18
  184. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +19 -14
  185. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +20 -15
  186. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +24 -19
  187. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +65 -32
  188. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +274 -38
  189. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +10 -5
  190. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  191. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +92 -25
  192. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +88 -44
  193. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +108 -56
  194. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +96 -51
  195. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +45 -25
  196. diffusers/pipelines/stable_diffusion_xl/watermark.py +9 -3
  197. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +110 -57
  198. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +59 -30
  199. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +71 -42
  200. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +3 -2
  201. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +18 -41
  202. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +21 -85
  203. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -19
  204. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +39 -33
  205. diffusers/pipelines/unclip/pipeline_unclip.py +6 -6
  206. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +6 -6
  207. diffusers/pipelines/unidiffuser/modeling_text_decoder.py +1 -1
  208. diffusers/pipelines/unidiffuser/modeling_uvit.py +9 -9
  209. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +23 -23
  210. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +5 -5
  211. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +5 -10
  212. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -6
  213. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +4 -4
  214. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +12 -12
  215. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +10 -10
  216. diffusers/schedulers/__init__.py +2 -2
  217. diffusers/schedulers/deprecated/__init__.py +1 -1
  218. diffusers/schedulers/deprecated/scheduling_karras_ve.py +25 -25
  219. diffusers/schedulers/scheduling_amused.py +5 -5
  220. diffusers/schedulers/scheduling_consistency_decoder.py +11 -11
  221. diffusers/schedulers/scheduling_consistency_models.py +20 -26
  222. diffusers/schedulers/scheduling_ddim.py +22 -24
  223. diffusers/schedulers/scheduling_ddim_flax.py +2 -1
  224. diffusers/schedulers/scheduling_ddim_inverse.py +16 -16
  225. diffusers/schedulers/scheduling_ddim_parallel.py +28 -30
  226. diffusers/schedulers/scheduling_ddpm.py +20 -22
  227. diffusers/schedulers/scheduling_ddpm_flax.py +7 -3
  228. diffusers/schedulers/scheduling_ddpm_parallel.py +26 -28
  229. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +14 -14
  230. diffusers/schedulers/scheduling_deis_multistep.py +42 -42
  231. diffusers/schedulers/scheduling_dpmsolver_multistep.py +103 -77
  232. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +2 -2
  233. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +46 -46
  234. diffusers/schedulers/scheduling_dpmsolver_sde.py +23 -23
  235. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +86 -65
  236. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +75 -54
  237. diffusers/schedulers/scheduling_edm_euler.py +50 -31
  238. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +23 -29
  239. diffusers/schedulers/scheduling_euler_discrete.py +160 -68
  240. diffusers/schedulers/scheduling_heun_discrete.py +57 -39
  241. diffusers/schedulers/scheduling_ipndm.py +8 -8
  242. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +19 -19
  243. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +19 -19
  244. diffusers/schedulers/scheduling_karras_ve_flax.py +6 -6
  245. diffusers/schedulers/scheduling_lcm.py +21 -23
  246. diffusers/schedulers/scheduling_lms_discrete.py +24 -26
  247. diffusers/schedulers/scheduling_pndm.py +20 -20
  248. diffusers/schedulers/scheduling_repaint.py +20 -20
  249. diffusers/schedulers/scheduling_sasolver.py +55 -54
  250. diffusers/schedulers/scheduling_sde_ve.py +19 -19
  251. diffusers/schedulers/scheduling_tcd.py +39 -30
  252. diffusers/schedulers/scheduling_unclip.py +15 -15
  253. diffusers/schedulers/scheduling_unipc_multistep.py +111 -41
  254. diffusers/schedulers/scheduling_utils.py +14 -5
  255. diffusers/schedulers/scheduling_utils_flax.py +3 -3
  256. diffusers/schedulers/scheduling_vq_diffusion.py +10 -10
  257. diffusers/training_utils.py +56 -1
  258. diffusers/utils/__init__.py +7 -0
  259. diffusers/utils/doc_utils.py +1 -0
  260. diffusers/utils/dummy_pt_objects.py +75 -0
  261. diffusers/utils/dummy_torch_and_transformers_objects.py +105 -0
  262. diffusers/utils/dynamic_modules_utils.py +24 -11
  263. diffusers/utils/hub_utils.py +3 -2
  264. diffusers/utils/import_utils.py +91 -0
  265. diffusers/utils/loading_utils.py +2 -2
  266. diffusers/utils/logging.py +1 -1
  267. diffusers/utils/peft_utils.py +32 -5
  268. diffusers/utils/state_dict_utils.py +11 -2
  269. diffusers/utils/testing_utils.py +71 -6
  270. diffusers/utils/torch_utils.py +1 -0
  271. diffusers/video_processor.py +113 -0
  272. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/METADATA +7 -7
  273. diffusers-0.28.1.dist-info/RECORD +419 -0
  274. diffusers-0.27.2.dist-info/RECORD +0 -399
  275. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/LICENSE +0 -0
  276. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/WHEEL +0 -0
  277. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/entry_points.txt +0 -0
  278. {diffusers-0.27.2.dist-info → diffusers-0.28.1.dist-info}/top_level.txt +0 -0
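Of the changes listed above, the largest single addition is the new ControlNet-XS model (diffusers/models/controlnet_xs.py, shown below) together with its pipelines under diffusers/pipelines/controlnet_xs/. As the file's docstrings explain, the ControlNetXSAdapter is never run on its own; it is fused with a UNet2DConditionModel into a UNetControlNetXSModel via the from_unet classmethods. A minimal sketch of that wiring follows; the ControlNet-XS names are taken from the diff below, while the base-model loading call and the checkpoint id are ordinary diffusers usage chosen only for illustration.

import torch
from diffusers import UNet2DConditionModel
from diffusers.models.controlnet_xs import ControlNetXSAdapter, UNetControlNetXSModel

# Base UNet to be controlled (example checkpoint id, not prescribed by this diff).
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16
)

# Control adapter whose block widths are a fraction of the base UNet's widths.
adapter = ControlNetXSAdapter.from_unet(unet, size_ratio=0.1)

# Fuse base UNet and adapter; the fused model returns the final sample directly
# (see ControlNetXSOutput below) rather than a residual added to the base output.
model = UNetControlNetXSModel.from_unet(unet, adapter)
model.freeze_unet_params()  # leave only the adapter-side parameters trainable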
diffusers/models/controlnet_xs.py (new file)
@@ -0,0 +1,1915 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from math import gcd
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import Tensor, nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, is_torch_version, logging
+from ..utils.torch_utils import apply_freeu
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from .controlnet import ControlNetConditioningEmbedding
+from .embeddings import TimestepEmbedding, Timesteps
+from .modeling_utils import ModelMixin
+from .unets.unet_2d_blocks import (
+    CrossAttnDownBlock2D,
+    CrossAttnUpBlock2D,
+    Downsample2D,
+    ResnetBlock2D,
+    Transformer2DModel,
+    UNetMidBlock2DCrossAttn,
+    Upsample2D,
+)
+from .unets.unet_2d_condition import UNet2DConditionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class ControlNetXSOutput(BaseOutput):
+    """
+    The output of [`UNetControlNetXSModel`].
+
+    Args:
+        sample (`Tensor` of shape `(batch_size, num_channels, height, width)`):
+            The output of the `UNetControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to the base
+            model output, but is already the final output.
+    """
+
+    sample: Tensor = None
+
+
+class DownBlockControlNetXSAdapter(nn.Module):
+    """Components that together with corresponding components from the base model will form a
+    `ControlNetXSCrossAttnDownBlock2D`"""
+
+    def __init__(
+        self,
+        resnets: nn.ModuleList,
+        base_to_ctrl: nn.ModuleList,
+        ctrl_to_base: nn.ModuleList,
+        attentions: Optional[nn.ModuleList] = None,
+        downsampler: Optional[nn.Conv2d] = None,
+    ):
+        super().__init__()
+        self.resnets = resnets
+        self.base_to_ctrl = base_to_ctrl
+        self.ctrl_to_base = ctrl_to_base
+        self.attentions = attentions
+        self.downsamplers = downsampler
+
+
+class MidBlockControlNetXSAdapter(nn.Module):
+    """Components that together with corresponding components from the base model will form a
+    `ControlNetXSCrossAttnMidBlock2D`"""
+
+    def __init__(self, midblock: UNetMidBlock2DCrossAttn, base_to_ctrl: nn.ModuleList, ctrl_to_base: nn.ModuleList):
+        super().__init__()
+        self.midblock = midblock
+        self.base_to_ctrl = base_to_ctrl
+        self.ctrl_to_base = ctrl_to_base
+
+
+class UpBlockControlNetXSAdapter(nn.Module):
+    """Components that together with corresponding components from the base model will form a `ControlNetXSCrossAttnUpBlock2D`"""
+
+    def __init__(self, ctrl_to_base: nn.ModuleList):
+        super().__init__()
+        self.ctrl_to_base = ctrl_to_base
+
+
+def get_down_block_adapter(
+    base_in_channels: int,
+    base_out_channels: int,
+    ctrl_in_channels: int,
+    ctrl_out_channels: int,
+    temb_channels: int,
+    max_norm_num_groups: Optional[int] = 32,
+    has_crossattn=True,
+    transformer_layers_per_block: Optional[Union[int, Tuple[int]]] = 1,
+    num_attention_heads: Optional[int] = 1,
+    cross_attention_dim: Optional[int] = 1024,
+    add_downsample: bool = True,
+    upcast_attention: Optional[bool] = False,
+):
+    num_layers = 2  # only support sd + sdxl
+
+    resnets = []
+    attentions = []
+    ctrl_to_base = []
+    base_to_ctrl = []
+
+    if isinstance(transformer_layers_per_block, int):
+        transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
+    for i in range(num_layers):
+        base_in_channels = base_in_channels if i == 0 else base_out_channels
+        ctrl_in_channels = ctrl_in_channels if i == 0 else ctrl_out_channels
+
+        # Before the resnet/attention application, information is concatted from base to control.
+        # Concat doesn't require change in number of channels
+        base_to_ctrl.append(make_zero_conv(base_in_channels, base_in_channels))
+
+        resnets.append(
+            ResnetBlock2D(
+                in_channels=ctrl_in_channels + base_in_channels,  # information from base is concatted to ctrl
+                out_channels=ctrl_out_channels,
+                temb_channels=temb_channels,
+                groups=find_largest_factor(ctrl_in_channels + base_in_channels, max_factor=max_norm_num_groups),
+                groups_out=find_largest_factor(ctrl_out_channels, max_factor=max_norm_num_groups),
+                eps=1e-5,
+            )
+        )
+
+        if has_crossattn:
+            attentions.append(
+                Transformer2DModel(
+                    num_attention_heads,
+                    ctrl_out_channels // num_attention_heads,
+                    in_channels=ctrl_out_channels,
+                    num_layers=transformer_layers_per_block[i],
+                    cross_attention_dim=cross_attention_dim,
+                    use_linear_projection=True,
+                    upcast_attention=upcast_attention,
+                    norm_num_groups=find_largest_factor(ctrl_out_channels, max_factor=max_norm_num_groups),
+                )
+            )
+
+        # After the resnet/attention application, information is added from control to base
+        # Addition requires change in number of channels
+        ctrl_to_base.append(make_zero_conv(ctrl_out_channels, base_out_channels))
+
+    if add_downsample:
+        # Before the downsampler application, information is concatted from base to control
+        # Concat doesn't require change in number of channels
+        base_to_ctrl.append(make_zero_conv(base_out_channels, base_out_channels))
+
+        downsamplers = Downsample2D(
+            ctrl_out_channels + base_out_channels, use_conv=True, out_channels=ctrl_out_channels, name="op"
+        )
+
+        # After the downsampler application, information is added from control to base
+        # Addition requires change in number of channels
+        ctrl_to_base.append(make_zero_conv(ctrl_out_channels, base_out_channels))
+    else:
+        downsamplers = None
+
+    down_block_components = DownBlockControlNetXSAdapter(
+        resnets=nn.ModuleList(resnets),
+        base_to_ctrl=nn.ModuleList(base_to_ctrl),
+        ctrl_to_base=nn.ModuleList(ctrl_to_base),
+    )
+
+    if has_crossattn:
+        down_block_components.attentions = nn.ModuleList(attentions)
+    if downsamplers is not None:
+        down_block_components.downsamplers = downsamplers
+
+    return down_block_components
+
+
+def get_mid_block_adapter(
+    base_channels: int,
+    ctrl_channels: int,
+    temb_channels: Optional[int] = None,
+    max_norm_num_groups: Optional[int] = 32,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = 1,
+    cross_attention_dim: Optional[int] = 1024,
+    upcast_attention: bool = False,
+):
+    # Before the midblock application, information is concatted from base to control.
+    # Concat doesn't require change in number of channels
+    base_to_ctrl = make_zero_conv(base_channels, base_channels)
+
+    midblock = UNetMidBlock2DCrossAttn(
+        transformer_layers_per_block=transformer_layers_per_block,
+        in_channels=ctrl_channels + base_channels,
+        out_channels=ctrl_channels,
+        temb_channels=temb_channels,
+        # number or norm groups must divide both in_channels and out_channels
+        resnet_groups=find_largest_factor(gcd(ctrl_channels, ctrl_channels + base_channels), max_norm_num_groups),
+        cross_attention_dim=cross_attention_dim,
+        num_attention_heads=num_attention_heads,
+        use_linear_projection=True,
+        upcast_attention=upcast_attention,
+    )
+
+    # After the midblock application, information is added from control to base
+    # Addition requires change in number of channels
+    ctrl_to_base = make_zero_conv(ctrl_channels, base_channels)
+
+    return MidBlockControlNetXSAdapter(base_to_ctrl=base_to_ctrl, midblock=midblock, ctrl_to_base=ctrl_to_base)
+
+
+def get_up_block_adapter(
+    out_channels: int,
+    prev_output_channel: int,
+    ctrl_skip_channels: List[int],
+):
+    ctrl_to_base = []
+    num_layers = 3  # only support sd + sdxl
+    for i in range(num_layers):
+        resnet_in_channels = prev_output_channel if i == 0 else out_channels
+        ctrl_to_base.append(make_zero_conv(ctrl_skip_channels[i], resnet_in_channels))
+
+    return UpBlockControlNetXSAdapter(ctrl_to_base=nn.ModuleList(ctrl_to_base))
+
+
+class ControlNetXSAdapter(ModelMixin, ConfigMixin):
+    r"""
+    A `ControlNetXSAdapter` model. To use it, pass it into a `UNetControlNetXSModel` (together with a
+    `UNet2DConditionModel` base model).
+
+    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
+    methods implemented for all models (such as downloading or saving).
+
+    Like `UNetControlNetXSModel`, `ControlNetXSAdapter` is compatible with StableDiffusion and StableDiffusion-XL. It's
+    default parameters are compatible with StableDiffusion.
+
+    Parameters:
+        conditioning_channels (`int`, defaults to 3):
+            Number of channels of conditioning input (e.g. an image)
+        conditioning_channel_order (`str`, defaults to `"rgb"`):
+            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
+        conditioning_embedding_out_channels (`tuple[int]`, defaults to `(16, 32, 96, 256)`):
+            The tuple of output channels for each block in the `controlnet_cond_embedding` layer.
+        time_embedding_mix (`float`, defaults to 1.0):
+            If 0, then only the control adapters's time embedding is used. If 1, then only the base unet's time
+            embedding is used. Otherwise, both are combined.
+        learn_time_embedding (`bool`, defaults to `False`):
+            Whether a time embedding should be learned. If yes, `UNetControlNetXSModel` will combine the time
+            embeddings of the base model and the control adapter. If no, `UNetControlNetXSModel` will use the base
+            model's time embedding.
+        num_attention_heads (`list[int]`, defaults to `[4]`):
+            The number of attention heads.
+        block_out_channels (`list[int]`, defaults to `[4, 8, 16, 16]`):
+            The tuple of output channels for each block.
+        base_block_out_channels (`list[int]`, defaults to `[320, 640, 1280, 1280]`):
+            The tuple of output channels for each block in the base unet.
+        cross_attention_dim (`int`, defaults to 1024):
+            The dimension of the cross attention features.
+        down_block_types (`list[str]`, defaults to `["CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"]`):
+            The tuple of downsample blocks to use.
+        sample_size (`int`, defaults to 96):
+            Height and width of input/output sample.
+        transformer_layers_per_block (`Union[int, Tuple[int]]`, defaults to 1):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        upcast_attention (`bool`, defaults to `True`):
+            Whether the attention computation should always be upcasted.
+        max_norm_num_groups (`int`, defaults to 32):
+            Maximum number of groups in group normal. The actual number will the the largest divisor of the respective
+            channels, that is <= max_norm_num_groups.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        conditioning_channels: int = 3,
+        conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        time_embedding_mix: float = 1.0,
+        learn_time_embedding: bool = False,
+        num_attention_heads: Union[int, Tuple[int]] = 4,
+        block_out_channels: Tuple[int] = (4, 8, 16, 16),
+        base_block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        cross_attention_dim: int = 1024,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        sample_size: Optional[int] = 96,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        upcast_attention: bool = True,
+        max_norm_num_groups: int = 32,
+    ):
+        super().__init__()
+
+        time_embedding_input_dim = base_block_out_channels[0]
+        time_embedding_dim = base_block_out_channels[0] * 4
+
+        # Check inputs
+        if conditioning_channel_order not in ["rgb", "bgr"]:
+            raise ValueError(f"unknown `conditioning_channel_order`: {conditioning_channel_order}")
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(transformer_layers_per_block, (list, tuple)):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+        if not isinstance(cross_attention_dim, (list, tuple)):
+            cross_attention_dim = [cross_attention_dim] * len(down_block_types)
+        # see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why `ControlNetXSAdapter` takes `num_attention_heads` instead of `attention_head_dim`
+        if not isinstance(num_attention_heads, (list, tuple)):
+            num_attention_heads = [num_attention_heads] * len(down_block_types)
+
+        if len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        # 5 - Create conditioning hint embedding
+        self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
+            conditioning_embedding_channels=block_out_channels[0],
+            block_out_channels=conditioning_embedding_out_channels,
+            conditioning_channels=conditioning_channels,
+        )
+
+        # time
+        if learn_time_embedding:
+            self.time_embedding = TimestepEmbedding(time_embedding_input_dim, time_embedding_dim)
+        else:
+            self.time_embedding = None
+
+        self.down_blocks = nn.ModuleList([])
+        self.up_connections = nn.ModuleList([])
+
+        # input
+        self.conv_in = nn.Conv2d(4, block_out_channels[0], kernel_size=3, padding=1)
+        self.control_to_base_for_conv_in = make_zero_conv(block_out_channels[0], base_block_out_channels[0])
+
+        # down
+        base_out_channels = base_block_out_channels[0]
+        ctrl_out_channels = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            base_in_channels = base_out_channels
+            base_out_channels = base_block_out_channels[i]
+            ctrl_in_channels = ctrl_out_channels
+            ctrl_out_channels = block_out_channels[i]
+            has_crossattn = "CrossAttn" in down_block_type
+            is_final_block = i == len(down_block_types) - 1
+
+            self.down_blocks.append(
+                get_down_block_adapter(
+                    base_in_channels=base_in_channels,
+                    base_out_channels=base_out_channels,
+                    ctrl_in_channels=ctrl_in_channels,
+                    ctrl_out_channels=ctrl_out_channels,
+                    temb_channels=time_embedding_dim,
+                    max_norm_num_groups=max_norm_num_groups,
+                    has_crossattn=has_crossattn,
+                    transformer_layers_per_block=transformer_layers_per_block[i],
+                    num_attention_heads=num_attention_heads[i],
+                    cross_attention_dim=cross_attention_dim[i],
+                    add_downsample=not is_final_block,
+                    upcast_attention=upcast_attention,
+                )
+            )
+
+        # mid
+        self.mid_block = get_mid_block_adapter(
+            base_channels=base_block_out_channels[-1],
+            ctrl_channels=block_out_channels[-1],
+            temb_channels=time_embedding_dim,
+            transformer_layers_per_block=transformer_layers_per_block[-1],
+            num_attention_heads=num_attention_heads[-1],
+            cross_attention_dim=cross_attention_dim[-1],
+            upcast_attention=upcast_attention,
+        )
+
+        # up
+        # The skip connection channels are the output of the conv_in and of all the down subblocks
+        ctrl_skip_channels = [block_out_channels[0]]
+        for i, out_channels in enumerate(block_out_channels):
+            number_of_subblocks = (
+                3 if i < len(block_out_channels) - 1 else 2
+            )  # every block has 3 subblocks, except last one, which has 2 as it has no downsampler
+            ctrl_skip_channels.extend([out_channels] * number_of_subblocks)
+
+        reversed_base_block_out_channels = list(reversed(base_block_out_channels))
+
+        base_out_channels = reversed_base_block_out_channels[0]
+        for i in range(len(down_block_types)):
+            prev_base_output_channel = base_out_channels
+            base_out_channels = reversed_base_block_out_channels[i]
+            ctrl_skip_channels_ = [ctrl_skip_channels.pop() for _ in range(3)]
+
+            self.up_connections.append(
+                get_up_block_adapter(
+                    out_channels=base_out_channels,
+                    prev_output_channel=prev_base_output_channel,
+                    ctrl_skip_channels=ctrl_skip_channels_,
+                )
+            )
+
+    @classmethod
+    def from_unet(
+        cls,
+        unet: UNet2DConditionModel,
+        size_ratio: Optional[float] = None,
+        block_out_channels: Optional[List[int]] = None,
+        num_attention_heads: Optional[List[int]] = None,
+        learn_time_embedding: bool = False,
+        time_embedding_mix: int = 1.0,
+        conditioning_channels: int = 3,
+        conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+    ):
+        r"""
+        Instantiate a [`ControlNetXSAdapter`] from a [`UNet2DConditionModel`].
+
+        Parameters:
+            unet (`UNet2DConditionModel`):
+                The UNet model we want to control. The dimensions of the ControlNetXSAdapter will be adapted to it.
+            size_ratio (float, *optional*, defaults to `None`):
+                When given, block_out_channels is set to a fraction of the base model's block_out_channels. Either this
+                or `block_out_channels` must be given.
+            block_out_channels (`List[int]`, *optional*, defaults to `None`):
+                Down blocks output channels in control model. Either this or `size_ratio` must be given.
+            num_attention_heads (`List[int]`, *optional*, defaults to `None`):
+                The dimension of the attention heads. The naming seems a bit confusing and it is, see
+                https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
+            learn_time_embedding (`bool`, defaults to `False`):
+                Whether the `ControlNetXSAdapter` should learn a time embedding.
+            time_embedding_mix (`float`, defaults to 1.0):
+                If 0, then only the control adapter's time embedding is used. If 1, then only the base unet's time
+                embedding is used. Otherwise, both are combined.
+            conditioning_channels (`int`, defaults to 3):
+                Number of channels of conditioning input (e.g. an image)
+            conditioning_channel_order (`str`, defaults to `"rgb"`):
+                The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
+            conditioning_embedding_out_channels (`Tuple[int]`, defaults to `(16, 32, 96, 256)`):
+                The tuple of output channel for each block in the `controlnet_cond_embedding` layer.
+        """
+
+        # Check input
+        fixed_size = block_out_channels is not None
+        relative_size = size_ratio is not None
+        if not (fixed_size ^ relative_size):
+            raise ValueError(
+                "Pass exactly one of `block_out_channels` (for absolute sizing) or `size_ratio` (for relative sizing)."
+            )
+
+        # Create model
+        block_out_channels = block_out_channels or [int(b * size_ratio) for b in unet.config.block_out_channels]
+        if num_attention_heads is None:
+            # The naming seems a bit confusing and it is, see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
+            num_attention_heads = unet.config.attention_head_dim
+
+        model = cls(
+            conditioning_channels=conditioning_channels,
+            conditioning_channel_order=conditioning_channel_order,
+            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
+            time_embedding_mix=time_embedding_mix,
+            learn_time_embedding=learn_time_embedding,
+            num_attention_heads=num_attention_heads,
+            block_out_channels=block_out_channels,
+            base_block_out_channels=unet.config.block_out_channels,
+            cross_attention_dim=unet.config.cross_attention_dim,
+            down_block_types=unet.config.down_block_types,
+            sample_size=unet.config.sample_size,
+            transformer_layers_per_block=unet.config.transformer_layers_per_block,
+            upcast_attention=unet.config.upcast_attention,
+            max_norm_num_groups=unet.config.norm_num_groups,
+        )
+
+        # ensure that the ControlNetXSAdapter is the same dtype as the UNet2DConditionModel
+        model.to(unet.dtype)
+
+        return model
+
+    def forward(self, *args, **kwargs):
+        raise ValueError(
+            "A ControlNetXSAdapter cannot be run by itself. Use it together with a UNet2DConditionModel to instantiate a UNetControlNetXSModel."
+        )
+
+
+class UNetControlNetXSModel(ModelMixin, ConfigMixin):
+    r"""
+    A UNet fused with a ControlNet-XS adapter model
+
+    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
+    methods implemented for all models (such as downloading or saving).
+
+    `UNetControlNetXSModel` is compatible with StableDiffusion and StableDiffusion-XL. It's default parameters are
+    compatible with StableDiffusion.
+
+    It's parameters are either passed to the underlying `UNet2DConditionModel` or used exactly like in
+    `ControlNetXSAdapter` . See their documentation for details.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        # unet configs
+        sample_size: Optional[int] = 96,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        norm_num_groups: Optional[int] = 32,
+        cross_attention_dim: Union[int, Tuple[int]] = 1024,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        num_attention_heads: Union[int, Tuple[int]] = 8,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        upcast_attention: bool = True,
+        time_cond_proj_dim: Optional[int] = None,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        # additional controlnet configs
+        time_embedding_mix: float = 1.0,
+        ctrl_conditioning_channels: int = 3,
+        ctrl_conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        ctrl_conditioning_channel_order: str = "rgb",
+        ctrl_learn_time_embedding: bool = False,
+        ctrl_block_out_channels: Tuple[int] = (4, 8, 16, 16),
+        ctrl_num_attention_heads: Union[int, Tuple[int]] = 4,
+        ctrl_max_norm_num_groups: int = 32,
+    ):
+        super().__init__()
+
+        if time_embedding_mix < 0 or time_embedding_mix > 1:
+            raise ValueError("`time_embedding_mix` needs to be between 0 and 1.")
+        if time_embedding_mix < 1 and not ctrl_learn_time_embedding:
+            raise ValueError("To use `time_embedding_mix` < 1, `ctrl_learn_time_embedding` must be `True`")
+
+        if addition_embed_type is not None and addition_embed_type != "text_time":
+            raise ValueError(
+                "As `UNetControlNetXSModel` currently only supports StableDiffusion and StableDiffusion-XL, `addition_embed_type` must be `None` or `'text_time'`."
+            )
+
+        if not isinstance(transformer_layers_per_block, (list, tuple)):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+        if not isinstance(cross_attention_dim, (list, tuple)):
+            cross_attention_dim = [cross_attention_dim] * len(down_block_types)
+        if not isinstance(num_attention_heads, (list, tuple)):
+            num_attention_heads = [num_attention_heads] * len(down_block_types)
+        if not isinstance(ctrl_num_attention_heads, (list, tuple)):
+            ctrl_num_attention_heads = [ctrl_num_attention_heads] * len(down_block_types)
+
+        base_num_attention_heads = num_attention_heads
+
+        self.in_channels = 4
+
+        # # Input
+        self.base_conv_in = nn.Conv2d(4, block_out_channels[0], kernel_size=3, padding=1)
+        self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
+            conditioning_embedding_channels=ctrl_block_out_channels[0],
+            block_out_channels=ctrl_conditioning_embedding_out_channels,
+            conditioning_channels=ctrl_conditioning_channels,
+        )
+        self.ctrl_conv_in = nn.Conv2d(4, ctrl_block_out_channels[0], kernel_size=3, padding=1)
+        self.control_to_base_for_conv_in = make_zero_conv(ctrl_block_out_channels[0], block_out_channels[0])
+
+        # # Time
+        time_embed_input_dim = block_out_channels[0]
+        time_embed_dim = block_out_channels[0] * 4
+
+        self.base_time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.base_time_embedding = TimestepEmbedding(
+            time_embed_input_dim,
+            time_embed_dim,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+        self.ctrl_time_embedding = TimestepEmbedding(in_channels=time_embed_input_dim, time_embed_dim=time_embed_dim)
+
+        if addition_embed_type is None:
+            self.base_add_time_proj = None
+            self.base_add_embedding = None
+        else:
+            self.base_add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
+            self.base_add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+
+        # # Create down blocks
+        down_blocks = []
+        base_out_channels = block_out_channels[0]
+        ctrl_out_channels = ctrl_block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            base_in_channels = base_out_channels
+            base_out_channels = block_out_channels[i]
+            ctrl_in_channels = ctrl_out_channels
+            ctrl_out_channels = ctrl_block_out_channels[i]
+            has_crossattn = "CrossAttn" in down_block_type
+            is_final_block = i == len(down_block_types) - 1
+
+            down_blocks.append(
+                ControlNetXSCrossAttnDownBlock2D(
+                    base_in_channels=base_in_channels,
+                    base_out_channels=base_out_channels,
+                    ctrl_in_channels=ctrl_in_channels,
+                    ctrl_out_channels=ctrl_out_channels,
+                    temb_channels=time_embed_dim,
+                    norm_num_groups=norm_num_groups,
+                    ctrl_max_norm_num_groups=ctrl_max_norm_num_groups,
+                    has_crossattn=has_crossattn,
+                    transformer_layers_per_block=transformer_layers_per_block[i],
+                    base_num_attention_heads=base_num_attention_heads[i],
+                    ctrl_num_attention_heads=ctrl_num_attention_heads[i],
+                    cross_attention_dim=cross_attention_dim[i],
+                    add_downsample=not is_final_block,
+                    upcast_attention=upcast_attention,
+                )
+            )
+
+        # # Create mid block
+        self.mid_block = ControlNetXSCrossAttnMidBlock2D(
+            base_channels=block_out_channels[-1],
+            ctrl_channels=ctrl_block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            norm_num_groups=norm_num_groups,
+            ctrl_max_norm_num_groups=ctrl_max_norm_num_groups,
+            transformer_layers_per_block=transformer_layers_per_block[-1],
+            base_num_attention_heads=base_num_attention_heads[-1],
+            ctrl_num_attention_heads=ctrl_num_attention_heads[-1],
+            cross_attention_dim=cross_attention_dim[-1],
+            upcast_attention=upcast_attention,
+        )
+
+        # # Create up blocks
+        up_blocks = []
+        rev_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+        rev_num_attention_heads = list(reversed(base_num_attention_heads))
+        rev_cross_attention_dim = list(reversed(cross_attention_dim))
+
+        # The skip connection channels are the output of the conv_in and of all the down subblocks
+        ctrl_skip_channels = [ctrl_block_out_channels[0]]
+        for i, out_channels in enumerate(ctrl_block_out_channels):
+            number_of_subblocks = (
+                3 if i < len(ctrl_block_out_channels) - 1 else 2
+            )  # every block has 3 subblocks, except last one, which has 2 as it has no downsampler
+            ctrl_skip_channels.extend([out_channels] * number_of_subblocks)
+
+        reversed_block_out_channels = list(reversed(block_out_channels))
+
+        out_channels = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = out_channels
+            out_channels = reversed_block_out_channels[i]
+            in_channels = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+            ctrl_skip_channels_ = [ctrl_skip_channels.pop() for _ in range(3)]
+
+            has_crossattn = "CrossAttn" in up_block_type
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_blocks.append(
+                ControlNetXSCrossAttnUpBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    prev_output_channel=prev_output_channel,
+                    ctrl_skip_channels=ctrl_skip_channels_,
+                    temb_channels=time_embed_dim,
+                    resolution_idx=i,
+                    has_crossattn=has_crossattn,
+                    transformer_layers_per_block=rev_transformer_layers_per_block[i],
+                    num_attention_heads=rev_num_attention_heads[i],
+                    cross_attention_dim=rev_cross_attention_dim[i],
+                    add_upsample=not is_final_block,
+                    upcast_attention=upcast_attention,
+                    norm_num_groups=norm_num_groups,
+                )
+            )
+
+        self.down_blocks = nn.ModuleList(down_blocks)
+        self.up_blocks = nn.ModuleList(up_blocks)
+
+        self.base_conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups)
+        self.base_conv_act = nn.SiLU()
+        self.base_conv_out = nn.Conv2d(block_out_channels[0], 4, kernel_size=3, padding=1)
+
+    @classmethod
+    def from_unet(
+        cls,
+        unet: UNet2DConditionModel,
+        controlnet: Optional[ControlNetXSAdapter] = None,
+        size_ratio: Optional[float] = None,
+        ctrl_block_out_channels: Optional[List[float]] = None,
+        time_embedding_mix: Optional[float] = None,
+        ctrl_optional_kwargs: Optional[Dict] = None,
+    ):
+        r"""
+        Instantiate a [`UNetControlNetXSModel`] from a [`UNet2DConditionModel`] and an optional [`ControlNetXSAdapter`]
+        .
+
+        Parameters:
+            unet (`UNet2DConditionModel`):
+                The UNet model we want to control.
+            controlnet (`ControlNetXSAdapter`):
+                The ConntrolNet-XS adapter with which the UNet will be fused. If none is given, a new ConntrolNet-XS
+                adapter will be created.
+            size_ratio (float, *optional*, defaults to `None`):
+                Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
+            ctrl_block_out_channels (`List[int]`, *optional*, defaults to `None`):
+                Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
+                where this parameter is called `block_out_channels`.
+            time_embedding_mix (`float`, *optional*, defaults to None):
+                Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
+            ctrl_optional_kwargs (`Dict`, *optional*, defaults to `None`):
+                Passed to the `init` of the new controlent if no controlent was given.
+        """
+        if controlnet is None:
+            controlnet = ControlNetXSAdapter.from_unet(
+                unet, size_ratio, ctrl_block_out_channels, **ctrl_optional_kwargs
+            )
+        else:
+            if any(
+                o is not None for o in (size_ratio, ctrl_block_out_channels, time_embedding_mix, ctrl_optional_kwargs)
+            ):
+                raise ValueError(
+                    "When a controlnet is passed, none of these parameters should be passed: size_ratio, ctrl_block_out_channels, time_embedding_mix, ctrl_optional_kwargs."
+                )
+
+        # # get params
+        params_for_unet = [
+            "sample_size",
+            "down_block_types",
+            "up_block_types",
+            "block_out_channels",
+            "norm_num_groups",
+            "cross_attention_dim",
+            "transformer_layers_per_block",
+            "addition_embed_type",
+            "addition_time_embed_dim",
+            "upcast_attention",
+            "time_cond_proj_dim",
+            "projection_class_embeddings_input_dim",
+        ]
+        params_for_unet = {k: v for k, v in unet.config.items() if k in params_for_unet}
+        # The naming seems a bit confusing and it is, see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
+        params_for_unet["num_attention_heads"] = unet.config.attention_head_dim
+
+        params_for_controlnet = [
+            "conditioning_channels",
+            "conditioning_embedding_out_channels",
+            "conditioning_channel_order",
+            "learn_time_embedding",
+            "block_out_channels",
+            "num_attention_heads",
+            "max_norm_num_groups",
+        ]
+        params_for_controlnet = {"ctrl_" + k: v for k, v in controlnet.config.items() if k in params_for_controlnet}
+        params_for_controlnet["time_embedding_mix"] = controlnet.config.time_embedding_mix
+
+        # # create model
+        model = cls.from_config({**params_for_unet, **params_for_controlnet})
+
+        # # load weights
+        # from unet
+        modules_from_unet = [
+            "time_embedding",
+            "conv_in",
+            "conv_norm_out",
+            "conv_out",
+        ]
+        for m in modules_from_unet:
+            getattr(model, "base_" + m).load_state_dict(getattr(unet, m).state_dict())
+
+        optional_modules_from_unet = [
+            "add_time_proj",
+            "add_embedding",
+        ]
+        for m in optional_modules_from_unet:
+            if hasattr(unet, m) and getattr(unet, m) is not None:
+                getattr(model, "base_" + m).load_state_dict(getattr(unet, m).state_dict())
+
+        # from controlnet
+        model.controlnet_cond_embedding.load_state_dict(controlnet.controlnet_cond_embedding.state_dict())
+        model.ctrl_conv_in.load_state_dict(controlnet.conv_in.state_dict())
+        if controlnet.time_embedding is not None:
+            model.ctrl_time_embedding.load_state_dict(controlnet.time_embedding.state_dict())
+        model.control_to_base_for_conv_in.load_state_dict(controlnet.control_to_base_for_conv_in.state_dict())
+
+        # from both
+        model.down_blocks = nn.ModuleList(
+            ControlNetXSCrossAttnDownBlock2D.from_modules(b, c)
+            for b, c in zip(unet.down_blocks, controlnet.down_blocks)
+        )
+        model.mid_block = ControlNetXSCrossAttnMidBlock2D.from_modules(unet.mid_block, controlnet.mid_block)
+        model.up_blocks = nn.ModuleList(
+            ControlNetXSCrossAttnUpBlock2D.from_modules(b, c)
+            for b, c in zip(unet.up_blocks, controlnet.up_connections)
+        )
+
+        # ensure that the UNetControlNetXSModel is the same dtype as the UNet2DConditionModel
+        model.to(unet.dtype)
+
+        return model
+
821
+ def freeze_unet_params(self) -> None:
822
+ """Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
823
+ tuning."""
824
+ # Freeze everything
825
+ for param in self.parameters():
826
+ param.requires_grad = True
827
+
828
+ # Unfreeze ControlNetXSAdapter
829
+ base_parts = [
830
+ "base_time_proj",
831
+ "base_time_embedding",
832
+ "base_add_time_proj",
833
+ "base_add_embedding",
834
+ "base_conv_in",
835
+ "base_conv_norm_out",
836
+ "base_conv_act",
837
+ "base_conv_out",
838
+ ]
839
+ base_parts = [getattr(self, part) for part in base_parts if getattr(self, part) is not None]
840
+ for part in base_parts:
841
+ for param in part.parameters():
842
+ param.requires_grad = False
843
+
844
+ for d in self.down_blocks:
845
+ d.freeze_base_params()
846
+ self.mid_block.freeze_base_params()
847
+ for u in self.up_blocks:
848
+ u.freeze_base_params()
849
+
850
+ def _set_gradient_checkpointing(self, module, value=False):
851
+ if hasattr(module, "gradient_checkpointing"):
852
+ module.gradient_checkpointing = value
853
+
854
+ # copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel
855
+ @property
856
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
857
+ r"""
858
+ Returns:
859
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
860
+ indexed by its weight name.
861
+ """
862
+ # set recursively
863
+ processors = {}
864
+
865
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
866
+ if hasattr(module, "get_processor"):
867
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
868
+
869
+ for sub_name, child in module.named_children():
870
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
871
+
872
+ return processors
873
+
874
+ for name, module in self.named_children():
875
+ fn_recursive_add_processors(name, module, processors)
876
+
877
+ return processors
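+ # Usage note (illustrative): the returned dict is keyed by module path, e.g. something like
+ # "down_blocks.0.base_attentions.0.transformer_blocks.0.attn1.processor"; these keys are the ones
+ # expected when passing a dict to `set_attn_processor` below.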
878
+
879
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
880
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
881
+ r"""
882
+ Sets the attention processor to use to compute attention.
883
+
884
+ Parameters:
885
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
886
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
887
+ for **all** `Attention` layers.
888
+
889
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
890
+ processor. This is strongly recommended when setting trainable attention processors.
891
+
892
+ """
893
+ count = len(self.attn_processors.keys())
894
+
895
+ if isinstance(processor, dict) and len(processor) != count:
896
+ raise ValueError(
897
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
898
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
899
+ )
900
+
901
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
902
+ if hasattr(module, "set_processor"):
903
+ if not isinstance(processor, dict):
904
+ module.set_processor(processor)
905
+ else:
906
+ module.set_processor(processor.pop(f"{name}.processor"))
907
+
908
+ for sub_name, child in module.named_children():
909
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
910
+
911
+ for name, module in self.named_children():
912
+ fn_recursive_attn_processor(name, module, processor)
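+ # Usage sketch (hedged): set a single processor for every attention layer, or a dict keyed by the
+ # names returned from `attn_processors`, e.g.
+ #     model.set_attn_processor(AttnProcessor())
+ #     model.set_attn_processor({name: AttnProcessor() for name in model.attn_processors})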
913
+
914
+ # copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
915
+ def set_default_attn_processor(self):
916
+ """
917
+ Disables custom attention processors and sets the default attention implementation.
918
+ """
919
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
920
+ processor = AttnAddedKVProcessor()
921
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
922
+ processor = AttnProcessor()
923
+ else:
924
+ raise ValueError(
925
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
926
+ )
927
+
928
+ self.set_attn_processor(processor)
929
+
930
+ # copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.enable_freeu
931
+ def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
932
+ r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
933
+
934
+ The suffixes after the scaling factors represent the stage blocks where they are being applied.
935
+
936
+ Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
937
+ are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
938
+
939
+ Args:
940
+ s1 (`float`):
941
+ Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
942
+ mitigate the "oversmoothing effect" in the enhanced denoising process.
943
+ s2 (`float`):
944
+ Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
945
+ mitigate the "oversmoothing effect" in the enhanced denoising process.
946
+ b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
947
+ b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
948
+ """
949
+ for i, upsample_block in enumerate(self.up_blocks):
950
+ setattr(upsample_block, "s1", s1)
951
+ setattr(upsample_block, "s2", s2)
952
+ setattr(upsample_block, "b1", b1)
953
+ setattr(upsample_block, "b2", b2)
954
+
955
+ # copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.disable_freeu
956
+ def disable_freeu(self):
957
+ """Disables the FreeU mechanism."""
958
+ freeu_keys = {"s1", "s2", "b1", "b2"}
959
+ for i, upsample_block in enumerate(self.up_blocks):
960
+ for k in freeu_keys:
961
+ if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
962
+ setattr(upsample_block, k, None)
963
+
964
+ # copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
965
+ def fuse_qkv_projections(self):
966
+ """
967
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
968
+ are fused. For cross-attention modules, key and value projection matrices are fused.
969
+
970
+ <Tip warning={true}>
971
+
972
+ This API is 🧪 experimental.
973
+
974
+ </Tip>
975
+ """
976
+ self.original_attn_processors = None
977
+
978
+ for _, attn_processor in self.attn_processors.items():
979
+ if "Added" in str(attn_processor.__class__.__name__):
980
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
981
+
982
+ self.original_attn_processors = self.attn_processors
983
+
984
+ for module in self.modules():
985
+ if isinstance(module, Attention):
986
+ module.fuse_projections(fuse=True)
987
+
988
+ # copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
989
+ def unfuse_qkv_projections(self):
990
+ """Disables the fused QKV projection if enabled.
991
+
992
+ <Tip warning={true}>
993
+
994
+ This API is 🧪 experimental.
995
+
996
+ </Tip>
997
+
998
+ """
999
+ if self.original_attn_processors is not None:
1000
+ self.set_attn_processor(self.original_attn_processors)
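+ # Usage note (hedged): `fuse_qkv_projections()` and `unfuse_qkv_projections()` are meant to be used
+ # as a pair, e.g. fuse before a batch of inference calls and unfuse afterwards to restore the
+ # original attention processors saved in `self.original_attn_processors`.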
1001
+
1002
+ def forward(
1003
+ self,
1004
+ sample: Tensor,
1005
+ timestep: Union[torch.Tensor, float, int],
1006
+ encoder_hidden_states: torch.Tensor,
1007
+ controlnet_cond: Optional[torch.Tensor] = None,
1008
+ conditioning_scale: Optional[float] = 1.0,
1009
+ class_labels: Optional[torch.Tensor] = None,
1010
+ timestep_cond: Optional[torch.Tensor] = None,
1011
+ attention_mask: Optional[torch.Tensor] = None,
1012
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1013
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
1014
+ return_dict: bool = True,
1015
+ apply_control: bool = True,
1016
+ ) -> Union[ControlNetXSOutput, Tuple]:
1017
+ """
1018
+ The [`UNetControlNetXSModel`] forward method.
1019
+
1020
+ Args:
1021
+ sample (`Tensor`):
1022
+ The noisy input tensor.
1023
+ timestep (`Union[torch.Tensor, float, int]`):
1024
+ The number of timesteps to denoise an input.
1025
+ encoder_hidden_states (`torch.Tensor`):
1026
+ The encoder hidden states.
1027
+ controlnet_cond (`Tensor`):
1028
+ The conditional input tensor (e.g. a conditioning image) of shape `(batch_size, conditioning_channels, height, width)`.
1029
+ conditioning_scale (`float`, defaults to `1.0`):
1030
+ How much the control model affects the base model outputs.
1031
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
1032
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
1033
+ timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
1034
+ Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
1035
+ timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
1036
+ embeddings.
1037
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
1038
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
1039
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
1040
+ negative values to the attention scores corresponding to "discard" tokens.
1041
+ cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
1042
+ A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
1043
+ added_cond_kwargs (`dict`):
1044
+ Additional conditions for the Stable Diffusion XL UNet.
1045
+ return_dict (`bool`, defaults to `True`):
1046
+ Whether or not to return a [`~models.controlnetxs.ControlNetXSOutput`] instead of a plain tuple.
1047
+ apply_control (`bool`, defaults to `True`):
1048
+ If `False`, the input is run only through the base model.
1049
+
1050
+ Returns:
1051
+ [`~models.controlnetxs.ControlNetXSOutput`] **or** `tuple`:
1052
+ If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a
1053
+ tuple is returned where the first element is the sample tensor.
1054
+ """
1055
+
1056
+ # check channel order
1057
+ if self.config.ctrl_conditioning_channel_order == "bgr":
1058
+ controlnet_cond = torch.flip(controlnet_cond, dims=[1])
1059
+
1060
+ # prepare attention_mask
1061
+ if attention_mask is not None:
1062
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
1063
+ attention_mask = attention_mask.unsqueeze(1)
1064
+
1065
+ # 1. time
1066
+ timesteps = timestep
1067
+ if not torch.is_tensor(timesteps):
1068
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
1069
+ # This would be a good case for the `match` statement (Python 3.10+)
1070
+ is_mps = sample.device.type == "mps"
1071
+ if isinstance(timestep, float):
1072
+ dtype = torch.float32 if is_mps else torch.float64
1073
+ else:
1074
+ dtype = torch.int32 if is_mps else torch.int64
1075
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
1076
+ elif len(timesteps.shape) == 0:
1077
+ timesteps = timesteps[None].to(sample.device)
1078
+
1079
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
1080
+ timesteps = timesteps.expand(sample.shape[0])
1081
+
1082
+ t_emb = self.base_time_proj(timesteps)
1083
+
1084
+ # timesteps does not contain any weights and will always return f32 tensors
1085
+ # but time_embedding might actually be running in fp16. so we need to cast here.
1086
+ # there might be better ways to encapsulate this.
1087
+ t_emb = t_emb.to(dtype=sample.dtype)
1088
+
1089
+ if self.config.ctrl_learn_time_embedding and apply_control:
1090
+ ctrl_temb = self.ctrl_time_embedding(t_emb, timestep_cond)
1091
+ base_temb = self.base_time_embedding(t_emb, timestep_cond)
1092
+ interpolation_param = self.config.time_embedding_mix**0.3
1093
+
1094
+ temb = ctrl_temb * interpolation_param + base_temb * (1 - interpolation_param)
1095
+ else:
1096
+ temb = self.base_time_embedding(t_emb)
1097
+
1098
+ # added time & text embeddings
1099
+ aug_emb = None
1100
+
1101
+ if self.config.addition_embed_type is None:
1102
+ pass
1103
+ elif self.config.addition_embed_type == "text_time":
1104
+ # SDXL - style
1105
+ if "text_embeds" not in added_cond_kwargs:
1106
+ raise ValueError(
1107
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
1108
+ )
1109
+ text_embeds = added_cond_kwargs.get("text_embeds")
1110
+ if "time_ids" not in added_cond_kwargs:
1111
+ raise ValueError(
1112
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
1113
+ )
1114
+ time_ids = added_cond_kwargs.get("time_ids")
1115
+ time_embeds = self.base_add_time_proj(time_ids.flatten())
1116
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
1117
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
1118
+ add_embeds = add_embeds.to(temb.dtype)
1119
+ aug_emb = self.base_add_embedding(add_embeds)
1120
+ else:
1121
+ raise ValueError(
1122
+ f"ControlNet-XS currently only supports StableDiffusion and StableDiffusion-XL, so addition_embed_type = {self.config.addition_embed_type} is currently not supported."
1123
+ )
1124
+
1125
+ temb = temb + aug_emb if aug_emb is not None else temb
1126
+
1127
+ # text embeddings
1128
+ cemb = encoder_hidden_states
1129
+
1130
+ # Preparation
1131
+ h_ctrl = h_base = sample
1132
+ hs_base, hs_ctrl = [], []
1133
+
1134
+ # Cross Control
1135
+ guided_hint = self.controlnet_cond_embedding(controlnet_cond)
1136
+
1137
+ # 1 - conv in & down
1138
+
1139
+ h_base = self.base_conv_in(h_base)
1140
+ h_ctrl = self.ctrl_conv_in(h_ctrl)
1141
+ if guided_hint is not None:
1142
+ h_ctrl += guided_hint
1143
+ if apply_control:
1144
+ h_base = h_base + self.control_to_base_for_conv_in(h_ctrl) * conditioning_scale # add ctrl -> base
1145
+
1146
+ hs_base.append(h_base)
1147
+ hs_ctrl.append(h_ctrl)
1148
+
1149
+ for down in self.down_blocks:
1150
+ h_base, h_ctrl, residual_hb, residual_hc = down(
1151
+ hidden_states_base=h_base,
1152
+ hidden_states_ctrl=h_ctrl,
1153
+ temb=temb,
1154
+ encoder_hidden_states=cemb,
1155
+ conditioning_scale=conditioning_scale,
1156
+ cross_attention_kwargs=cross_attention_kwargs,
1157
+ attention_mask=attention_mask,
1158
+ apply_control=apply_control,
1159
+ )
1160
+ hs_base.extend(residual_hb)
1161
+ hs_ctrl.extend(residual_hc)
1162
+
1163
+ # 2 - mid
1164
+ h_base, h_ctrl = self.mid_block(
1165
+ hidden_states_base=h_base,
1166
+ hidden_states_ctrl=h_ctrl,
1167
+ temb=temb,
1168
+ encoder_hidden_states=cemb,
1169
+ conditioning_scale=conditioning_scale,
1170
+ cross_attention_kwargs=cross_attention_kwargs,
1171
+ attention_mask=attention_mask,
1172
+ apply_control=apply_control,
1173
+ )
1174
+
1175
+ # 3 - up
1176
+ for up in self.up_blocks:
1177
+ n_resnets = len(up.resnets)
1178
+ skips_hb = hs_base[-n_resnets:]
1179
+ skips_hc = hs_ctrl[-n_resnets:]
1180
+ hs_base = hs_base[:-n_resnets]
1181
+ hs_ctrl = hs_ctrl[:-n_resnets]
1182
+ h_base = up(
1183
+ hidden_states=h_base,
1184
+ res_hidden_states_tuple_base=skips_hb,
1185
+ res_hidden_states_tuple_ctrl=skips_hc,
1186
+ temb=temb,
1187
+ encoder_hidden_states=cemb,
1188
+ conditioning_scale=conditioning_scale,
1189
+ cross_attention_kwargs=cross_attention_kwargs,
1190
+ attention_mask=attention_mask,
1191
+ apply_control=apply_control,
1192
+ )
1193
+
1194
+ # 4 - conv out
1195
+ h_base = self.base_conv_norm_out(h_base)
1196
+ h_base = self.base_conv_act(h_base)
1197
+ h_base = self.base_conv_out(h_base)
1198
+
1199
+ if not return_dict:
1200
+ return (h_base,)
1201
+
1202
+ return ControlNetXSOutput(sample=h_base)
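+ # Call sketch (hedged; shapes are illustrative for an SD v1.5-like configuration):
+ #     out = model(
+ #         sample=torch.randn(1, 4, 64, 64),
+ #         timestep=10,
+ #         encoder_hidden_states=torch.randn(1, 77, 768),
+ #         controlnet_cond=torch.randn(1, 3, 512, 512),
+ #         conditioning_scale=1.0,
+ #     ).sample
+ # SDXL-style configurations additionally require `added_cond_kwargs` with `text_embeds` and
+ # `time_ids`, as checked in the `text_time` branch above.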
1203
+
1204
+
1205
+ class ControlNetXSCrossAttnDownBlock2D(nn.Module):
1206
+ def __init__(
1207
+ self,
1208
+ base_in_channels: int,
1209
+ base_out_channels: int,
1210
+ ctrl_in_channels: int,
1211
+ ctrl_out_channels: int,
1212
+ temb_channels: int,
1213
+ norm_num_groups: int = 32,
1214
+ ctrl_max_norm_num_groups: int = 32,
1215
+ has_crossattn=True,
1216
+ transformer_layers_per_block: Optional[Union[int, Tuple[int]]] = 1,
1217
+ base_num_attention_heads: Optional[int] = 1,
1218
+ ctrl_num_attention_heads: Optional[int] = 1,
1219
+ cross_attention_dim: Optional[int] = 1024,
1220
+ add_downsample: bool = True,
1221
+ upcast_attention: Optional[bool] = False,
1222
+ ):
1223
+ super().__init__()
1224
+ base_resnets = []
1225
+ base_attentions = []
1226
+ ctrl_resnets = []
1227
+ ctrl_attentions = []
1228
+ ctrl_to_base = []
1229
+ base_to_ctrl = []
1230
+
1231
+ num_layers = 2 # only support sd + sdxl
1232
+
1233
+ if isinstance(transformer_layers_per_block, int):
1234
+ transformer_layers_per_block = [transformer_layers_per_block] * num_layers
1235
+
1236
+ for i in range(num_layers):
1237
+ base_in_channels = base_in_channels if i == 0 else base_out_channels
1238
+ ctrl_in_channels = ctrl_in_channels if i == 0 else ctrl_out_channels
1239
+
1240
+ # Before the resnet/attention application, information is concatted from base to control.
1241
+ # Concat doesn't require change in number of channels
1242
+ base_to_ctrl.append(make_zero_conv(base_in_channels, base_in_channels))
1243
+
1244
+ base_resnets.append(
1245
+ ResnetBlock2D(
1246
+ in_channels=base_in_channels,
1247
+ out_channels=base_out_channels,
1248
+ temb_channels=temb_channels,
1249
+ groups=norm_num_groups,
1250
+ )
1251
+ )
1252
+ ctrl_resnets.append(
1253
+ ResnetBlock2D(
1254
+ in_channels=ctrl_in_channels + base_in_channels, # information from base is concatted to ctrl
1255
+ out_channels=ctrl_out_channels,
1256
+ temb_channels=temb_channels,
1257
+ groups=find_largest_factor(
1258
+ ctrl_in_channels + base_in_channels, max_factor=ctrl_max_norm_num_groups
1259
+ ),
1260
+ groups_out=find_largest_factor(ctrl_out_channels, max_factor=ctrl_max_norm_num_groups),
1261
+ eps=1e-5,
1262
+ )
1263
+ )
1264
+
1265
+ if has_crossattn:
1266
+ base_attentions.append(
1267
+ Transformer2DModel(
1268
+ base_num_attention_heads,
1269
+ base_out_channels // base_num_attention_heads,
1270
+ in_channels=base_out_channels,
1271
+ num_layers=transformer_layers_per_block[i],
1272
+ cross_attention_dim=cross_attention_dim,
1273
+ use_linear_projection=True,
1274
+ upcast_attention=upcast_attention,
1275
+ norm_num_groups=norm_num_groups,
1276
+ )
1277
+ )
1278
+ ctrl_attentions.append(
1279
+ Transformer2DModel(
1280
+ ctrl_num_attention_heads,
1281
+ ctrl_out_channels // ctrl_num_attention_heads,
1282
+ in_channels=ctrl_out_channels,
1283
+ num_layers=transformer_layers_per_block[i],
1284
+ cross_attention_dim=cross_attention_dim,
1285
+ use_linear_projection=True,
1286
+ upcast_attention=upcast_attention,
1287
+ norm_num_groups=find_largest_factor(ctrl_out_channels, max_factor=ctrl_max_norm_num_groups),
1288
+ )
1289
+ )
1290
+
1291
+ # After the resnet/attention application, information is added from control to base
1292
+ # Addition requires change in number of channels
1293
+ ctrl_to_base.append(make_zero_conv(ctrl_out_channels, base_out_channels))
1294
+
1295
+ if add_downsample:
1296
+ # Before the downsampler application, information is concatted from base to control
1297
+ # Concat doesn't require change in number of channels
1298
+ base_to_ctrl.append(make_zero_conv(base_out_channels, base_out_channels))
1299
+
1300
+ self.base_downsamplers = Downsample2D(
1301
+ base_out_channels, use_conv=True, out_channels=base_out_channels, name="op"
1302
+ )
1303
+ self.ctrl_downsamplers = Downsample2D(
1304
+ ctrl_out_channels + base_out_channels, use_conv=True, out_channels=ctrl_out_channels, name="op"
1305
+ )
1306
+
1307
+ # After the downsampler application, information is added from control to base
1308
+ # Addition requires change in number of channels
1309
+ ctrl_to_base.append(make_zero_conv(ctrl_out_channels, base_out_channels))
1310
+ else:
1311
+ self.base_downsamplers = None
1312
+ self.ctrl_downsamplers = None
1313
+
1314
+ self.base_resnets = nn.ModuleList(base_resnets)
1315
+ self.ctrl_resnets = nn.ModuleList(ctrl_resnets)
1316
+ self.base_attentions = nn.ModuleList(base_attentions) if has_crossattn else [None] * num_layers
1317
+ self.ctrl_attentions = nn.ModuleList(ctrl_attentions) if has_crossattn else [None] * num_layers
1318
+ self.base_to_ctrl = nn.ModuleList(base_to_ctrl)
1319
+ self.ctrl_to_base = nn.ModuleList(ctrl_to_base)
1320
+
1321
+ self.gradient_checkpointing = False
1322
+
1323
+ @classmethod
1324
+ def from_modules(cls, base_downblock: CrossAttnDownBlock2D, ctrl_downblock: DownBlockControlNetXSAdapter):
1325
+ # get params
1326
+ def get_first_cross_attention(block):
1327
+ return block.attentions[0].transformer_blocks[0].attn2
1328
+
1329
+ base_in_channels = base_downblock.resnets[0].in_channels
1330
+ base_out_channels = base_downblock.resnets[0].out_channels
1331
+ ctrl_in_channels = (
1332
+ ctrl_downblock.resnets[0].in_channels - base_in_channels
1333
+ ) # base channels are concatted to ctrl channels in init
1334
+ ctrl_out_channels = ctrl_downblock.resnets[0].out_channels
1335
+ temb_channels = base_downblock.resnets[0].time_emb_proj.in_features
1336
+ num_groups = base_downblock.resnets[0].norm1.num_groups
1337
+ ctrl_num_groups = ctrl_downblock.resnets[0].norm1.num_groups
1338
+ if hasattr(base_downblock, "attentions"):
1339
+ has_crossattn = True
1340
+ transformer_layers_per_block = len(base_downblock.attentions[0].transformer_blocks)
1341
+ base_num_attention_heads = get_first_cross_attention(base_downblock).heads
1342
+ ctrl_num_attention_heads = get_first_cross_attention(ctrl_downblock).heads
1343
+ cross_attention_dim = get_first_cross_attention(base_downblock).cross_attention_dim
1344
+ upcast_attention = get_first_cross_attention(base_downblock).upcast_attention
1345
+ else:
1346
+ has_crossattn = False
1347
+ transformer_layers_per_block = None
1348
+ base_num_attention_heads = None
1349
+ ctrl_num_attention_heads = None
1350
+ cross_attention_dim = None
1351
+ upcast_attention = None
1352
+ add_downsample = base_downblock.downsamplers is not None
1353
+
1354
+ # create model
1355
+ model = cls(
1356
+ base_in_channels=base_in_channels,
1357
+ base_out_channels=base_out_channels,
1358
+ ctrl_in_channels=ctrl_in_channels,
1359
+ ctrl_out_channels=ctrl_out_channels,
1360
+ temb_channels=temb_channels,
1361
+ norm_num_groups=num_groups,
1362
+ ctrl_max_norm_num_groups=ctrl_num_groups,
1363
+ has_crossattn=has_crossattn,
1364
+ transformer_layers_per_block=transformer_layers_per_block,
1365
+ base_num_attention_heads=base_num_attention_heads,
1366
+ ctrl_num_attention_heads=ctrl_num_attention_heads,
1367
+ cross_attention_dim=cross_attention_dim,
1368
+ add_downsample=add_downsample,
1369
+ upcast_attention=upcast_attention,
1370
+ )
1371
+
1372
+ # load weights
1373
+ model.base_resnets.load_state_dict(base_downblock.resnets.state_dict())
1374
+ model.ctrl_resnets.load_state_dict(ctrl_downblock.resnets.state_dict())
1375
+ if has_crossattn:
1376
+ model.base_attentions.load_state_dict(base_downblock.attentions.state_dict())
1377
+ model.ctrl_attentions.load_state_dict(ctrl_downblock.attentions.state_dict())
1378
+ if add_downsample:
1379
+ model.base_downsamplers.load_state_dict(base_downblock.downsamplers[0].state_dict())
1380
+ model.ctrl_downsamplers.load_state_dict(ctrl_downblock.downsamplers.state_dict())
1381
+ model.base_to_ctrl.load_state_dict(ctrl_downblock.base_to_ctrl.state_dict())
1382
+ model.ctrl_to_base.load_state_dict(ctrl_downblock.ctrl_to_base.state_dict())
1383
+
1384
+ return model
1385
+
1386
+ def freeze_base_params(self) -> None:
1387
+ """Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
1388
+ tuning."""
1389
+ # Unfreeze everything
1390
+ for param in self.parameters():
1391
+ param.requires_grad = True
1392
+
1393
+ # Freeze base part
1394
+ base_parts = [self.base_resnets]
1395
+ if isinstance(self.base_attentions, nn.ModuleList): # attentions can be a list of Nones
1396
+ base_parts.append(self.base_attentions)
1397
+ if self.base_downsamplers is not None:
1398
+ base_parts.append(self.base_downsamplers)
1399
+ for part in base_parts:
1400
+ for param in part.parameters():
1401
+ param.requires_grad = False
1402
+
1403
+ def forward(
1404
+ self,
1405
+ hidden_states_base: Tensor,
1406
+ temb: Tensor,
1407
+ encoder_hidden_states: Optional[Tensor] = None,
1408
+ hidden_states_ctrl: Optional[Tensor] = None,
1409
+ conditioning_scale: Optional[float] = 1.0,
1410
+ attention_mask: Optional[Tensor] = None,
1411
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1412
+ encoder_attention_mask: Optional[Tensor] = None,
1413
+ apply_control: bool = True,
1414
+ ) -> Tuple[Tensor, Tensor, Tuple[Tensor, ...], Tuple[Tensor, ...]]:
1415
+ if cross_attention_kwargs is not None:
1416
+ if cross_attention_kwargs.get("scale", None) is not None:
1417
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
1418
+
1419
+ h_base = hidden_states_base
1420
+ h_ctrl = hidden_states_ctrl
1421
+
1422
+ base_output_states = ()
1423
+ ctrl_output_states = ()
1424
+
1425
+ base_blocks = list(zip(self.base_resnets, self.base_attentions))
1426
+ ctrl_blocks = list(zip(self.ctrl_resnets, self.ctrl_attentions))
1427
+
1428
+ def create_custom_forward(module, return_dict=None):
1429
+ def custom_forward(*inputs):
1430
+ if return_dict is not None:
1431
+ return module(*inputs, return_dict=return_dict)
1432
+ else:
1433
+ return module(*inputs)
1434
+
1435
+ return custom_forward
1436
+
1437
+ for (b_res, b_attn), (c_res, c_attn), b2c, c2b in zip(
1438
+ base_blocks, ctrl_blocks, self.base_to_ctrl, self.ctrl_to_base
1439
+ ):
1440
+ # concat base -> ctrl
1441
+ if apply_control:
1442
+ h_ctrl = torch.cat([h_ctrl, b2c(h_base)], dim=1)
1443
+
1444
+ # apply base subblock
1445
+ if self.training and self.gradient_checkpointing:
1446
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1447
+ h_base = torch.utils.checkpoint.checkpoint(
1448
+ create_custom_forward(b_res),
1449
+ h_base,
1450
+ temb,
1451
+ **ckpt_kwargs,
1452
+ )
1453
+ else:
1454
+ h_base = b_res(h_base, temb)
1455
+
1456
+ if b_attn is not None:
1457
+ h_base = b_attn(
1458
+ h_base,
1459
+ encoder_hidden_states=encoder_hidden_states,
1460
+ cross_attention_kwargs=cross_attention_kwargs,
1461
+ attention_mask=attention_mask,
1462
+ encoder_attention_mask=encoder_attention_mask,
1463
+ return_dict=False,
1464
+ )[0]
1465
+
1466
+ # apply ctrl subblock
1467
+ if apply_control:
1468
+ if self.training and self.gradient_checkpointing:
1469
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1470
+ h_ctrl = torch.utils.checkpoint.checkpoint(
1471
+ create_custom_forward(c_res),
1472
+ h_ctrl,
1473
+ temb,
1474
+ **ckpt_kwargs,
1475
+ )
1476
+ else:
1477
+ h_ctrl = c_res(h_ctrl, temb)
1478
+ if c_attn is not None:
1479
+ h_ctrl = c_attn(
1480
+ h_ctrl,
1481
+ encoder_hidden_states=encoder_hidden_states,
1482
+ cross_attention_kwargs=cross_attention_kwargs,
1483
+ attention_mask=attention_mask,
1484
+ encoder_attention_mask=encoder_attention_mask,
1485
+ return_dict=False,
1486
+ )[0]
1487
+
1488
+ # add ctrl -> base
1489
+ if apply_control:
1490
+ h_base = h_base + c2b(h_ctrl) * conditioning_scale
1491
+
1492
+ base_output_states = base_output_states + (h_base,)
1493
+ ctrl_output_states = ctrl_output_states + (h_ctrl,)
1494
+
1495
+ if self.base_downsamplers is not None: # if we have a base_downsampler, then also a ctrl_downsampler
1496
+ b2c = self.base_to_ctrl[-1]
1497
+ c2b = self.ctrl_to_base[-1]
1498
+
1499
+ # concat base -> ctrl
1500
+ if apply_control:
1501
+ h_ctrl = torch.cat([h_ctrl, b2c(h_base)], dim=1)
1502
+ # apply base subblock
1503
+ h_base = self.base_downsamplers(h_base)
1504
+ # apply ctrl subblock
1505
+ if apply_control:
1506
+ h_ctrl = self.ctrl_downsamplers(h_ctrl)
1507
+ # add ctrl -> base
1508
+ if apply_control:
1509
+ h_base = h_base + c2b(h_ctrl) * conditioning_scale
1510
+
1511
+ base_output_states = base_output_states + (h_base,)
1512
+ ctrl_output_states = ctrl_output_states + (h_ctrl,)
1513
+
1514
+ return h_base, h_ctrl, base_output_states, ctrl_output_states
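+ # Return value note (descriptive): the tuple is (final base hidden states, final control hidden
+ # states, per-subblock base residuals, per-subblock control residuals); the residual tuples are what
+ # `UNetControlNetXSModel.forward` collects into `hs_base` / `hs_ctrl` and later feeds to the up
+ # blocks as skip connections.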
1515
+
1516
+
1517
+ class ControlNetXSCrossAttnMidBlock2D(nn.Module):
1518
+ def __init__(
1519
+ self,
1520
+ base_channels: int,
1521
+ ctrl_channels: int,
1522
+ temb_channels: Optional[int] = None,
1523
+ norm_num_groups: int = 32,
1524
+ ctrl_max_norm_num_groups: int = 32,
1525
+ transformer_layers_per_block: int = 1,
1526
+ base_num_attention_heads: Optional[int] = 1,
1527
+ ctrl_num_attention_heads: Optional[int] = 1,
1528
+ cross_attention_dim: Optional[int] = 1024,
1529
+ upcast_attention: bool = False,
1530
+ ):
1531
+ super().__init__()
1532
+
1533
+ # Before the midblock application, information is concatted from base to control.
1534
+ # Concat doesn't require change in number of channels
1535
+ self.base_to_ctrl = make_zero_conv(base_channels, base_channels)
1536
+
1537
+ self.base_midblock = UNetMidBlock2DCrossAttn(
1538
+ transformer_layers_per_block=transformer_layers_per_block,
1539
+ in_channels=base_channels,
1540
+ temb_channels=temb_channels,
1541
+ resnet_groups=norm_num_groups,
1542
+ cross_attention_dim=cross_attention_dim,
1543
+ num_attention_heads=base_num_attention_heads,
1544
+ use_linear_projection=True,
1545
+ upcast_attention=upcast_attention,
1546
+ )
1547
+
1548
+ self.ctrl_midblock = UNetMidBlock2DCrossAttn(
1549
+ transformer_layers_per_block=transformer_layers_per_block,
1550
+ in_channels=ctrl_channels + base_channels,
1551
+ out_channels=ctrl_channels,
1552
+ temb_channels=temb_channels,
1553
+ # the number of norm groups must divide both in_channels and out_channels
1554
+ resnet_groups=find_largest_factor(
1555
+ gcd(ctrl_channels, ctrl_channels + base_channels), ctrl_max_norm_num_groups
1556
+ ),
1557
+ cross_attention_dim=cross_attention_dim,
1558
+ num_attention_heads=ctrl_num_attention_heads,
1559
+ use_linear_projection=True,
1560
+ upcast_attention=upcast_attention,
1561
+ )
1562
+
1563
+ # After the midblock application, information is added from control to base
1564
+ # Addition requires change in number of channels
1565
+ self.ctrl_to_base = make_zero_conv(ctrl_channels, base_channels)
1566
+
1567
+ self.gradient_checkpointing = False
1568
+
1569
+ @classmethod
1570
+ def from_modules(
1571
+ cls,
1572
+ base_midblock: UNetMidBlock2DCrossAttn,
1573
+ ctrl_midblock: MidBlockControlNetXSAdapter,
1574
+ ):
1575
+ base_to_ctrl = ctrl_midblock.base_to_ctrl
1576
+ ctrl_to_base = ctrl_midblock.ctrl_to_base
1577
+ ctrl_midblock = ctrl_midblock.midblock
1578
+
1579
+ # get params
1580
+ def get_first_cross_attention(midblock):
1581
+ return midblock.attentions[0].transformer_blocks[0].attn2
1582
+
1583
+ base_channels = ctrl_to_base.out_channels
1584
+ ctrl_channels = ctrl_to_base.in_channels
1585
+ transformer_layers_per_block = len(base_midblock.attentions[0].transformer_blocks)
1586
+ temb_channels = base_midblock.resnets[0].time_emb_proj.in_features
1587
+ num_groups = base_midblock.resnets[0].norm1.num_groups
1588
+ ctrl_num_groups = ctrl_midblock.resnets[0].norm1.num_groups
1589
+ base_num_attention_heads = get_first_cross_attention(base_midblock).heads
1590
+ ctrl_num_attention_heads = get_first_cross_attention(ctrl_midblock).heads
1591
+ cross_attention_dim = get_first_cross_attention(base_midblock).cross_attention_dim
1592
+ upcast_attention = get_first_cross_attention(base_midblock).upcast_attention
1593
+
1594
+ # create model
1595
+ model = cls(
1596
+ base_channels=base_channels,
1597
+ ctrl_channels=ctrl_channels,
1598
+ temb_channels=temb_channels,
1599
+ norm_num_groups=num_groups,
1600
+ ctrl_max_norm_num_groups=ctrl_num_groups,
1601
+ transformer_layers_per_block=transformer_layers_per_block,
1602
+ base_num_attention_heads=base_num_attention_heads,
1603
+ ctrl_num_attention_heads=ctrl_num_attention_heads,
1604
+ cross_attention_dim=cross_attention_dim,
1605
+ upcast_attention=upcast_attention,
1606
+ )
1607
+
1608
+ # load weights
1609
+ model.base_to_ctrl.load_state_dict(base_to_ctrl.state_dict())
1610
+ model.base_midblock.load_state_dict(base_midblock.state_dict())
1611
+ model.ctrl_midblock.load_state_dict(ctrl_midblock.state_dict())
1612
+ model.ctrl_to_base.load_state_dict(ctrl_to_base.state_dict())
1613
+
1614
+ return model
1615
+
1616
+ def freeze_base_params(self) -> None:
1617
+ """Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
1618
+ tuning."""
1619
+ # Unfreeze everything
1620
+ for param in self.parameters():
1621
+ param.requires_grad = True
1622
+
1623
+ # Freeze base part
1624
+ for param in self.base_midblock.parameters():
1625
+ param.requires_grad = False
1626
+
1627
+ def forward(
1628
+ self,
1629
+ hidden_states_base: Tensor,
1630
+ temb: Tensor,
1631
+ encoder_hidden_states: Tensor,
1632
+ hidden_states_ctrl: Optional[Tensor] = None,
1633
+ conditioning_scale: Optional[float] = 1.0,
1634
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1635
+ attention_mask: Optional[Tensor] = None,
1636
+ encoder_attention_mask: Optional[Tensor] = None,
1637
+ apply_control: bool = True,
1638
+ ) -> Tuple[Tensor, Tensor]:
1639
+ if cross_attention_kwargs is not None:
1640
+ if cross_attention_kwargs.get("scale", None) is not None:
1641
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
1642
+
1643
+ h_base = hidden_states_base
1644
+ h_ctrl = hidden_states_ctrl
1645
+
1646
+ joint_args = {
1647
+ "temb": temb,
1648
+ "encoder_hidden_states": encoder_hidden_states,
1649
+ "attention_mask": attention_mask,
1650
+ "cross_attention_kwargs": cross_attention_kwargs,
1651
+ "encoder_attention_mask": encoder_attention_mask,
1652
+ }
1653
+
1654
+ if apply_control:
1655
+ h_ctrl = torch.cat([h_ctrl, self.base_to_ctrl(h_base)], dim=1) # concat base -> ctrl
1656
+ h_base = self.base_midblock(h_base, **joint_args) # apply base mid block
1657
+ if apply_control:
1658
+ h_ctrl = self.ctrl_midblock(h_ctrl, **joint_args) # apply ctrl mid block
1659
+ h_base = h_base + self.ctrl_to_base(h_ctrl) * conditioning_scale # add ctrl -> base
1660
+
1661
+ return h_base, h_ctrl
1662
+
1663
+
1664
+ class ControlNetXSCrossAttnUpBlock2D(nn.Module):
1665
+ def __init__(
1666
+ self,
1667
+ in_channels: int,
1668
+ out_channels: int,
1669
+ prev_output_channel: int,
1670
+ ctrl_skip_channels: List[int],
1671
+ temb_channels: int,
1672
+ norm_num_groups: int = 32,
1673
+ resolution_idx: Optional[int] = None,
1674
+ has_crossattn=True,
1675
+ transformer_layers_per_block: int = 1,
1676
+ num_attention_heads: int = 1,
1677
+ cross_attention_dim: int = 1024,
1678
+ add_upsample: bool = True,
1679
+ upcast_attention: bool = False,
1680
+ ):
1681
+ super().__init__()
1682
+ resnets = []
1683
+ attentions = []
1684
+ ctrl_to_base = []
1685
+
1686
+ num_layers = 3 # only support sd + sdxl
1687
+
1688
+ self.has_cross_attention = has_crossattn
1689
+ self.num_attention_heads = num_attention_heads
1690
+
1691
+ if isinstance(transformer_layers_per_block, int):
1692
+ transformer_layers_per_block = [transformer_layers_per_block] * num_layers
1693
+
1694
+ for i in range(num_layers):
1695
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1696
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1697
+
1698
+ ctrl_to_base.append(make_zero_conv(ctrl_skip_channels[i], resnet_in_channels))
1699
+
1700
+ resnets.append(
1701
+ ResnetBlock2D(
1702
+ in_channels=resnet_in_channels + res_skip_channels,
1703
+ out_channels=out_channels,
1704
+ temb_channels=temb_channels,
1705
+ groups=norm_num_groups,
1706
+ )
1707
+ )
1708
+
1709
+ if has_crossattn:
1710
+ attentions.append(
1711
+ Transformer2DModel(
1712
+ num_attention_heads,
1713
+ out_channels // num_attention_heads,
1714
+ in_channels=out_channels,
1715
+ num_layers=transformer_layers_per_block[i],
1716
+ cross_attention_dim=cross_attention_dim,
1717
+ use_linear_projection=True,
1718
+ upcast_attention=upcast_attention,
1719
+ norm_num_groups=norm_num_groups,
1720
+ )
1721
+ )
1722
+
1723
+ self.resnets = nn.ModuleList(resnets)
1724
+ self.attentions = nn.ModuleList(attentions) if has_crossattn else [None] * num_layers
1725
+ self.ctrl_to_base = nn.ModuleList(ctrl_to_base)
1726
+
1727
+ if add_upsample:
1728
+ self.upsamplers = Upsample2D(out_channels, use_conv=True, out_channels=out_channels)
1729
+ else:
1730
+ self.upsamplers = None
1731
+
1732
+ self.gradient_checkpointing = False
1733
+ self.resolution_idx = resolution_idx
1734
+
1735
+ @classmethod
1736
+ def from_modules(cls, base_upblock: CrossAttnUpBlock2D, ctrl_upblock: UpBlockControlNetXSAdapter):
1737
+ ctrl_to_base_skip_connections = ctrl_upblock.ctrl_to_base
1738
+
1739
+ # get params
1740
+ def get_first_cross_attention(block):
1741
+ return block.attentions[0].transformer_blocks[0].attn2
1742
+
1743
+ out_channels = base_upblock.resnets[0].out_channels
1744
+ in_channels = base_upblock.resnets[-1].in_channels - out_channels
1745
+ prev_output_channels = base_upblock.resnets[0].in_channels - out_channels
1746
+ ctrl_skip_channels = [c.in_channels for c in ctrl_to_base_skip_connections]
1747
+ temb_channels = base_upblock.resnets[0].time_emb_proj.in_features
1748
+ num_groups = base_upblock.resnets[0].norm1.num_groups
1749
+ resolution_idx = base_upblock.resolution_idx
1750
+ if hasattr(base_upblock, "attentions"):
1751
+ has_crossattn = True
1752
+ transformer_layers_per_block = len(base_upblock.attentions[0].transformer_blocks)
1753
+ num_attention_heads = get_first_cross_attention(base_upblock).heads
1754
+ cross_attention_dim = get_first_cross_attention(base_upblock).cross_attention_dim
1755
+ upcast_attention = get_first_cross_attention(base_upblock).upcast_attention
1756
+ else:
1757
+ has_crossattn = False
1758
+ transformer_layers_per_block = None
1759
+ num_attention_heads = None
1760
+ cross_attention_dim = None
1761
+ upcast_attention = None
1762
+ add_upsample = base_upblock.upsamplers is not None
1763
+
1764
+ # create model
1765
+ model = cls(
1766
+ in_channels=in_channels,
1767
+ out_channels=out_channels,
1768
+ prev_output_channel=prev_output_channels,
1769
+ ctrl_skip_channels=ctrl_skip_channels,
1770
+ temb_channels=temb_channels,
1771
+ norm_num_groups=num_groups,
1772
+ resolution_idx=resolution_idx,
1773
+ has_crossattn=has_crossattn,
1774
+ transformer_layers_per_block=transformer_layers_per_block,
1775
+ num_attention_heads=num_attention_heads,
1776
+ cross_attention_dim=cross_attention_dim,
1777
+ add_upsample=add_upsample,
1778
+ upcast_attention=upcast_attention,
1779
+ )
1780
+
1781
+ # load weights
1782
+ model.resnets.load_state_dict(base_upblock.resnets.state_dict())
1783
+ if has_crossattn:
1784
+ model.attentions.load_state_dict(base_upblock.attentions.state_dict())
1785
+ if add_upsample:
1786
+ model.upsamplers.load_state_dict(base_upblock.upsamplers[0].state_dict())
1787
+ model.ctrl_to_base.load_state_dict(ctrl_to_base_skip_connections.state_dict())
1788
+
1789
+ return model
1790
+
1791
+ def freeze_base_params(self) -> None:
1792
+ """Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
1793
+ tuning."""
1794
+ # Unfreeze everything
1795
+ for param in self.parameters():
1796
+ param.requires_grad = True
1797
+
1798
+ # Freeze base part
1799
+ base_parts = [self.resnets]
1800
+ if isinstance(self.attentions, nn.ModuleList): # attentions can be a list of Nones
1801
+ base_parts.append(self.attentions)
1802
+ if self.upsamplers is not None:
1803
+ base_parts.append(self.upsamplers)
1804
+ for part in base_parts:
1805
+ for param in part.parameters():
1806
+ param.requires_grad = False
1807
+
1808
+ def forward(
1809
+ self,
1810
+ hidden_states: Tensor,
1811
+ res_hidden_states_tuple_base: Tuple[Tensor, ...],
1812
+ res_hidden_states_tuple_ctrl: Tuple[Tensor, ...],
1813
+ temb: Tensor,
1814
+ encoder_hidden_states: Optional[Tensor] = None,
1815
+ conditioning_scale: Optional[float] = 1.0,
1816
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1817
+ attention_mask: Optional[Tensor] = None,
1818
+ upsample_size: Optional[int] = None,
1819
+ encoder_attention_mask: Optional[Tensor] = None,
1820
+ apply_control: bool = True,
1821
+ ) -> Tensor:
1822
+ if cross_attention_kwargs is not None:
1823
+ if cross_attention_kwargs.get("scale", None) is not None:
1824
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
1825
+
1826
+ is_freeu_enabled = (
1827
+ getattr(self, "s1", None)
1828
+ and getattr(self, "s2", None)
1829
+ and getattr(self, "b1", None)
1830
+ and getattr(self, "b2", None)
1831
+ )
1832
+
1833
+ def create_custom_forward(module, return_dict=None):
1834
+ def custom_forward(*inputs):
1835
+ if return_dict is not None:
1836
+ return module(*inputs, return_dict=return_dict)
1837
+ else:
1838
+ return module(*inputs)
1839
+
1840
+ return custom_forward
1841
+
1842
+ def maybe_apply_freeu_to_subblock(hidden_states, res_h_base):
1843
+ # FreeU: Only operate on the first two stages
1844
+ if is_freeu_enabled:
1845
+ return apply_freeu(
1846
+ self.resolution_idx,
1847
+ hidden_states,
1848
+ res_h_base,
1849
+ s1=self.s1,
1850
+ s2=self.s2,
1851
+ b1=self.b1,
1852
+ b2=self.b2,
1853
+ )
1854
+ else:
1855
+ return hidden_states, res_h_base
1856
+
1857
+ for resnet, attn, c2b, res_h_base, res_h_ctrl in zip(
1858
+ self.resnets,
1859
+ self.attentions,
1860
+ self.ctrl_to_base,
1861
+ reversed(res_hidden_states_tuple_base),
1862
+ reversed(res_hidden_states_tuple_ctrl),
1863
+ ):
1864
+ if apply_control:
1865
+ hidden_states += c2b(res_h_ctrl) * conditioning_scale
1866
+
1867
+ hidden_states, res_h_base = maybe_apply_freeu_to_subblock(hidden_states, res_h_base)
1868
+ hidden_states = torch.cat([hidden_states, res_h_base], dim=1)
1869
+
1870
+ if self.training and self.gradient_checkpointing:
1871
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1872
+ hidden_states = torch.utils.checkpoint.checkpoint(
1873
+ create_custom_forward(resnet),
1874
+ hidden_states,
1875
+ temb,
1876
+ **ckpt_kwargs,
1877
+ )
1878
+ else:
1879
+ hidden_states = resnet(hidden_states, temb)
1880
+
1881
+ if attn is not None:
1882
+ hidden_states = attn(
1883
+ hidden_states,
1884
+ encoder_hidden_states=encoder_hidden_states,
1885
+ cross_attention_kwargs=cross_attention_kwargs,
1886
+ attention_mask=attention_mask,
1887
+ encoder_attention_mask=encoder_attention_mask,
1888
+ return_dict=False,
1889
+ )[0]
1890
+
1891
+ if self.upsamplers is not None:
1892
+ hidden_states = self.upsamplers(hidden_states, upsample_size)
1893
+
1894
+ return hidden_states
1895
+
1896
+
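+ # The two helpers below implement the ControlNet-style "zero convolution": a 1x1 convolution whose
+ # weights and bias start at zero, so the control branch initially adds nothing to the base
+ # activations and training starts from the unmodified base UNet behaviour.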
1897
+ def make_zero_conv(in_channels, out_channels=None):
1898
+ return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0))
1899
+
1900
+
1901
+ def zero_module(module):
1902
+ for p in module.parameters():
1903
+ nn.init.zeros_(p)
1904
+ return module
1905
+
1906
+
1907
+ def find_largest_factor(number, max_factor):
1908
+ factor = max_factor
1909
+ if factor >= number:
1910
+ return number
1911
+ while factor != 0:
1912
+ residual = number % factor
1913
+ if residual == 0:
1914
+ return factor
1915
+ factor -= 1
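+ 
+ 
+ # A minimal, hedged illustration of `find_largest_factor` (hypothetical helper, not part of the
+ # library API): the function returns the largest divisor of `number` not exceeding `max_factor`,
+ # which is how the control branch picks a valid GroupNorm group count for arbitrary channel counts.
+ def _find_largest_factor_examples():
+     assert find_largest_factor(64, max_factor=32) == 32  # 32 divides 64
+     assert find_largest_factor(80, max_factor=32) == 20  # 20 is the largest divisor of 80 that is <= 32
+     assert find_largest_factor(6, max_factor=32) == 6  # when max_factor >= number, number itself is returned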