diffusers 0.30.3__py3-none-any.whl → 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. diffusers/__init__.py +97 -4
  2. diffusers/callbacks.py +56 -3
  3. diffusers/configuration_utils.py +13 -1
  4. diffusers/image_processor.py +282 -71
  5. diffusers/loaders/__init__.py +24 -3
  6. diffusers/loaders/ip_adapter.py +543 -16
  7. diffusers/loaders/lora_base.py +138 -125
  8. diffusers/loaders/lora_conversion_utils.py +647 -0
  9. diffusers/loaders/lora_pipeline.py +2216 -230
  10. diffusers/loaders/peft.py +380 -0
  11. diffusers/loaders/single_file_model.py +71 -4
  12. diffusers/loaders/single_file_utils.py +597 -10
  13. diffusers/loaders/textual_inversion.py +5 -3
  14. diffusers/loaders/transformer_flux.py +181 -0
  15. diffusers/loaders/transformer_sd3.py +89 -0
  16. diffusers/loaders/unet.py +56 -12
  17. diffusers/models/__init__.py +49 -12
  18. diffusers/models/activations.py +22 -9
  19. diffusers/models/adapter.py +53 -53
  20. diffusers/models/attention.py +98 -13
  21. diffusers/models/attention_flax.py +1 -1
  22. diffusers/models/attention_processor.py +2160 -346
  23. diffusers/models/autoencoders/__init__.py +5 -0
  24. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  25. diffusers/models/autoencoders/autoencoder_kl.py +73 -12
  26. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  27. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +213 -105
  28. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  29. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  30. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  31. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
  32. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  33. diffusers/models/autoencoders/vae.py +18 -5
  34. diffusers/models/controlnet.py +47 -802
  35. diffusers/models/controlnet_flux.py +70 -0
  36. diffusers/models/controlnet_sd3.py +26 -376
  37. diffusers/models/controlnet_sparsectrl.py +46 -719
  38. diffusers/models/controlnets/__init__.py +23 -0
  39. diffusers/models/controlnets/controlnet.py +872 -0
  40. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
  41. diffusers/models/controlnets/controlnet_flux.py +536 -0
  42. diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
  43. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  44. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  45. diffusers/models/controlnets/controlnet_union.py +832 -0
  46. diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
  47. diffusers/models/controlnets/multicontrolnet.py +183 -0
  48. diffusers/models/embeddings.py +996 -92
  49. diffusers/models/embeddings_flax.py +23 -9
  50. diffusers/models/model_loading_utils.py +264 -14
  51. diffusers/models/modeling_flax_utils.py +1 -1
  52. diffusers/models/modeling_utils.py +334 -51
  53. diffusers/models/normalization.py +157 -13
  54. diffusers/models/transformers/__init__.py +6 -0
  55. diffusers/models/transformers/auraflow_transformer_2d.py +3 -2
  56. diffusers/models/transformers/cogvideox_transformer_3d.py +69 -13
  57. diffusers/models/transformers/dit_transformer_2d.py +1 -1
  58. diffusers/models/transformers/latte_transformer_3d.py +4 -4
  59. diffusers/models/transformers/pixart_transformer_2d.py +10 -2
  60. diffusers/models/transformers/sana_transformer.py +488 -0
  61. diffusers/models/transformers/stable_audio_transformer.py +1 -1
  62. diffusers/models/transformers/transformer_2d.py +1 -1
  63. diffusers/models/transformers/transformer_allegro.py +422 -0
  64. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  65. diffusers/models/transformers/transformer_flux.py +189 -51
  66. diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
  67. diffusers/models/transformers/transformer_ltx.py +469 -0
  68. diffusers/models/transformers/transformer_mochi.py +499 -0
  69. diffusers/models/transformers/transformer_sd3.py +112 -18
  70. diffusers/models/transformers/transformer_temporal.py +1 -1
  71. diffusers/models/unets/unet_1d_blocks.py +1 -1
  72. diffusers/models/unets/unet_2d.py +8 -1
  73. diffusers/models/unets/unet_2d_blocks.py +88 -21
  74. diffusers/models/unets/unet_2d_condition.py +9 -9
  75. diffusers/models/unets/unet_3d_blocks.py +9 -7
  76. diffusers/models/unets/unet_motion_model.py +46 -68
  77. diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
  78. diffusers/models/unets/unet_stable_cascade.py +2 -2
  79. diffusers/models/unets/uvit_2d.py +1 -1
  80. diffusers/models/upsampling.py +14 -6
  81. diffusers/pipelines/__init__.py +69 -6
  82. diffusers/pipelines/allegro/__init__.py +48 -0
  83. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  84. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  85. diffusers/pipelines/animatediff/__init__.py +2 -0
  86. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  87. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +52 -22
  88. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  89. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +3 -1
  90. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -72
  91. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  92. diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
  93. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +2 -9
  94. diffusers/pipelines/auto_pipeline.py +88 -10
  95. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  96. diffusers/pipelines/cogvideo/__init__.py +2 -0
  97. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +80 -39
  98. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
  99. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +108 -50
  100. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +89 -50
  101. diffusers/pipelines/cogview3/__init__.py +47 -0
  102. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  103. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  104. diffusers/pipelines/controlnet/__init__.py +86 -80
  105. diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
  106. diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -3
  107. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -2
  108. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +9 -2
  109. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +37 -15
  110. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +12 -4
  111. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -4
  112. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  113. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  114. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  115. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +22 -4
  116. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  117. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +56 -20
  118. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  119. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  120. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  121. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  122. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  123. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +32 -9
  124. diffusers/pipelines/flux/__init__.py +23 -1
  125. diffusers/pipelines/flux/modeling_flux.py +47 -0
  126. diffusers/pipelines/flux/pipeline_flux.py +256 -48
  127. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  128. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  129. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  130. diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
  131. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
  132. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
  133. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  134. diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
  135. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
  136. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  137. diffusers/pipelines/flux/pipeline_output.py +16 -0
  138. diffusers/pipelines/free_noise_utils.py +365 -5
  139. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  140. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  141. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  142. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +20 -4
  143. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  145. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  146. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  147. diffusers/pipelines/kolors/text_encoder.py +2 -2
  148. diffusers/pipelines/kolors/tokenizer.py +4 -0
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  155. diffusers/pipelines/ltx/__init__.py +50 -0
  156. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  157. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  158. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  159. diffusers/pipelines/lumina/pipeline_lumina.py +3 -10
  160. diffusers/pipelines/mochi/__init__.py +48 -0
  161. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  162. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  163. diffusers/pipelines/pag/__init__.py +13 -0
  164. diffusers/pipelines/pag/pag_utils.py +8 -2
  165. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +2 -3
  166. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
  167. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +3 -5
  168. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
  169. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +22 -6
  170. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  171. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +7 -14
  172. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  173. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  174. diffusers/pipelines/pag/pipeline_pag_sd_3.py +18 -9
  175. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  176. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  177. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
  178. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  179. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  180. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  181. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  182. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  183. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  184. diffusers/pipelines/pipeline_loading_utils.py +250 -31
  185. diffusers/pipelines/pipeline_utils.py +158 -186
  186. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +7 -14
  187. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +7 -14
  188. diffusers/pipelines/sana/__init__.py +47 -0
  189. diffusers/pipelines/sana/pipeline_output.py +21 -0
  190. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  191. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
  192. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  193. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  194. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +46 -9
  195. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  196. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  197. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  198. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +228 -23
  199. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +82 -13
  200. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +60 -11
  201. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
  202. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  203. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  204. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  205. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -12
  206. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -22
  208. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  209. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  210. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  211. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  212. diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
  213. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  214. diffusers/quantizers/__init__.py +16 -0
  215. diffusers/quantizers/auto.py +139 -0
  216. diffusers/quantizers/base.py +233 -0
  217. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  218. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
  219. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  220. diffusers/quantizers/gguf/__init__.py +1 -0
  221. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  222. diffusers/quantizers/gguf/utils.py +456 -0
  223. diffusers/quantizers/quantization_config.py +669 -0
  224. diffusers/quantizers/torchao/__init__.py +15 -0
  225. diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
  226. diffusers/schedulers/scheduling_ddim.py +4 -1
  227. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  228. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  229. diffusers/schedulers/scheduling_ddpm.py +6 -7
  230. diffusers/schedulers/scheduling_ddpm_parallel.py +6 -7
  231. diffusers/schedulers/scheduling_deis_multistep.py +102 -6
  232. diffusers/schedulers/scheduling_dpmsolver_multistep.py +113 -6
  233. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +111 -5
  234. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  235. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +126 -7
  236. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  237. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  238. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  239. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
  240. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  241. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  242. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  243. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  244. diffusers/schedulers/scheduling_lcm.py +2 -6
  245. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  246. diffusers/schedulers/scheduling_repaint.py +1 -1
  247. diffusers/schedulers/scheduling_sasolver.py +102 -6
  248. diffusers/schedulers/scheduling_tcd.py +2 -6
  249. diffusers/schedulers/scheduling_unclip.py +4 -1
  250. diffusers/schedulers/scheduling_unipc_multistep.py +127 -5
  251. diffusers/training_utils.py +63 -19
  252. diffusers/utils/__init__.py +7 -1
  253. diffusers/utils/constants.py +1 -0
  254. diffusers/utils/dummy_pt_objects.py +240 -0
  255. diffusers/utils/dummy_torch_and_transformers_objects.py +435 -0
  256. diffusers/utils/dynamic_modules_utils.py +3 -3
  257. diffusers/utils/hub_utils.py +44 -40
  258. diffusers/utils/import_utils.py +98 -8
  259. diffusers/utils/loading_utils.py +28 -4
  260. diffusers/utils/peft_utils.py +6 -3
  261. diffusers/utils/testing_utils.py +115 -1
  262. diffusers/utils/torch_utils.py +3 -0
  263. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/METADATA +73 -72
  264. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/RECORD +268 -193
  265. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
  266. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
  267. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
  268. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,11 @@
1
1
  from .autoencoder_asym_kl import AsymmetricAutoencoderKL
2
+ from .autoencoder_dc import AutoencoderDC
2
3
  from .autoencoder_kl import AutoencoderKL
4
+ from .autoencoder_kl_allegro import AutoencoderKLAllegro
3
5
  from .autoencoder_kl_cogvideox import AutoencoderKLCogVideoX
6
+ from .autoencoder_kl_hunyuan_video import AutoencoderKLHunyuanVideo
7
+ from .autoencoder_kl_ltx import AutoencoderKLLTXVideo
8
+ from .autoencoder_kl_mochi import AutoencoderKLMochi
4
9
  from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
5
10
  from .autoencoder_oobleck import AutoencoderOobleck
6
11
  from .autoencoder_tiny import AutoencoderTiny
@@ -0,0 +1,620 @@
1
+ # Copyright 2024 MIT, Tsinghua University, NVIDIA CORPORATION and The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+
22
+ from ...configuration_utils import ConfigMixin, register_to_config
23
+ from ...loaders import FromOriginalModelMixin
24
+ from ...utils.accelerate_utils import apply_forward_hook
25
+ from ..activations import get_activation
26
+ from ..attention_processor import SanaMultiscaleLinearAttention
27
+ from ..modeling_utils import ModelMixin
28
+ from ..normalization import RMSNorm, get_normalization
29
+ from ..transformers.sana_transformer import GLUMBConv
30
+ from .vae import DecoderOutput, EncoderOutput
31
+
32
+
33
class ResBlock(nn.Module):
    """Residual block: conv -> activation -> conv -> normalization, added back onto the input.

    Args:
        in_channels: Channels of the input; the first convolution keeps this width.
        out_channels: Channels produced by the second convolution and seen by the norm layer.
        norm_type: Name handed to ``get_normalization``. The value ``"rms_norm"`` is
            special-cased in ``forward`` so the norm is applied with channels last.
        act_fn: Name handed to ``get_activation``. ``None`` selects ``nn.Identity``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        norm_type: str = "batch_norm",
        act_fn: str = "relu6",
    ) -> None:
        super().__init__()

        self.norm_type = norm_type

        if act_fn is None:
            self.nonlinearity = nn.Identity()
        else:
            self.nonlinearity = get_activation(act_fn)
        self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        # The second convolution carries no bias; it is followed directly by the norm layer.
        self.conv2 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.norm = get_normalization(norm_type, out_channels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Run the block and return the transformed features plus the untouched input."""
        out = self.conv2(self.nonlinearity(self.conv1(hidden_states)))

        if self.norm_type != "rms_norm":
            out = self.norm(out)
        else:
            # Move channels to the last axis so the RMS norm acts across channels,
            # then restore the original layout.
            channels_last = out.movedim(1, -1)
            out = self.norm(channels_last).movedim(-1, 1)

        # NOTE(review): the skip connection adds the raw input, so this presumably
        # expects out_channels == in_channels — confirm against callers.
        return out + hidden_states
63
+
64
+
65
class EfficientViTBlock(nn.Module):
    """Attention stage followed by a ``GLUMBConv`` stage, both built with ``in_channels`` in and out.

    Args:
        in_channels: Channel count used as both input and output width of the two stages.
        mult: Forwarded unchanged to ``SanaMultiscaleLinearAttention``.
        attention_head_dim: Forwarded unchanged to ``SanaMultiscaleLinearAttention``.
        qkv_multiscales: Forwarded to the attention layer as ``kernel_sizes``.
        norm_type: Normalization name for the attention layer only; the convolutional
            stage is always built with ``"rms_norm"``.
    """

    def __init__(
        self,
        in_channels: int,
        mult: float = 1.0,
        attention_head_dim: int = 32,
        qkv_multiscales: Tuple[int, ...] = (5,),
        norm_type: str = "batch_norm",
    ) -> None:
        super().__init__()

        attention_kwargs = {
            "in_channels": in_channels,
            "out_channels": in_channels,
            "mult": mult,
            "attention_head_dim": attention_head_dim,
            "norm_type": norm_type,
            "kernel_sizes": qkv_multiscales,
            "residual_connection": True,
        }
        self.attn = SanaMultiscaleLinearAttention(**attention_kwargs)

        self.conv_out = GLUMBConv(in_channels=in_channels, out_channels=in_channels, norm_type="rms_norm")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the attention stage, then the convolutional stage."""
        return self.conv_out(self.attn(x))
96
+
97
+
98
def get_block(
    block_type: str,
    in_channels: int,
    out_channels: int,
    attention_head_dim: int,
    norm_type: str,
    act_fn: str,
    qkv_mutliscales: Tuple[int] = (),
):
    """Build the block named by ``block_type``.

    Args:
        block_type: ``"ResBlock"`` or ``"EfficientViTBlock"``.
        in_channels: Input channels of the block.
        out_channels: Output channels; only used by ``ResBlock``.
        attention_head_dim: Only used by ``EfficientViTBlock``.
        norm_type: Normalization name passed to the chosen block.
        act_fn: Activation name; only used by ``ResBlock``.
        qkv_mutliscales: Passed to ``EfficientViTBlock`` as ``qkv_multiscales``. The
            spelling of this keyword is kept as-is because callers pass it by name.

    Returns:
        The constructed block.

    Raises:
        ValueError: If ``block_type`` is not one of the supported names.
    """
    if block_type == "ResBlock":
        return ResBlock(in_channels, out_channels, norm_type, act_fn)

    if block_type == "EfficientViTBlock":
        return EfficientViTBlock(
            in_channels,
            attention_head_dim=attention_head_dim,
            norm_type=norm_type,
            qkv_multiscales=qkv_mutliscales,
        )

    raise ValueError(f"Block with {block_type=} is not supported.")
119
+
120
+
121
class DCDownBlock2d(nn.Module):
    """Downsampling block that halves the spatial size, with an optional parameter-free shortcut.

    Two main paths are supported:

    * ``downsample=True``: a stride-1 convolution producing ``out_channels // 4`` channels,
      followed by ``pixel_unshuffle`` with factor 2 (which multiplies channels by 4).
    * ``downsample=False``: a stride-2 convolution producing ``out_channels`` channels.

    When ``shortcut`` is enabled, the input is pixel-unshuffled and consecutive groups of
    ``group_size`` channels are averaged; the result is added to the main path.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Channels of the output tensor.
        downsample: Selects the pixel-unshuffle path (``True``) or the strided
            convolution path (``False``).
        shortcut: Whether to add the averaged pixel-unshuffle shortcut.

    Raises:
        ValueError: If ``downsample`` is set and ``out_channels`` is not divisible by 4.
    """

    def __init__(self, in_channels: int, out_channels: int, downsample: bool = False, shortcut: bool = True) -> None:
        super().__init__()

        self.downsample = downsample
        self.factor = 2
        # With pixel-unshuffle the convolution keeps resolution; otherwise it strides by 2.
        self.stride = 1 if downsample else 2
        # Number of unshuffled input channels averaged into one output channel by the shortcut.
        self.group_size = in_channels * self.factor**2 // out_channels
        self.shortcut = shortcut

        out_ratio = self.factor**2
        if downsample:
            # Raised explicitly (not asserted) so the check survives `python -O`; without it
            # the block would silently emit a channel count different from `out_channels`.
            if out_channels % out_ratio != 0:
                raise ValueError(
                    f"`out_channels` must be divisible by {out_ratio} when `downsample=True`, "
                    f"but got {out_channels}."
                )
            out_channels = out_channels // out_ratio

        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=self.stride,
            padding=1,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Downsample ``hidden_states`` and, if enabled, add the averaged shortcut."""
        x = self.conv(hidden_states)
        if self.downsample:
            x = F.pixel_unshuffle(x, self.factor)

        if self.shortcut:
            # Fold space into channels, then average each run of `group_size` channels.
            y = F.pixel_unshuffle(hidden_states, self.factor)
            y = y.unflatten(1, (-1, self.group_size))
            y = y.mean(dim=2)
            hidden_states = x + y
        else:
            hidden_states = x

        return hidden_states
158
+
159
+
160
class DCUpBlock2d(nn.Module):
    """Upsampling block that doubles the spatial size, with an optional parameter-free shortcut.

    Two main paths are supported:

    * ``interpolate=False``: a convolution producing ``out_channels * 4`` channels,
      followed by ``pixel_shuffle`` with factor 2.
    * ``interpolate=True``: ``F.interpolate`` with scale factor 2, followed by a
      convolution producing ``out_channels`` channels.

    When ``shortcut`` is enabled, the input channels are repeated ``repeats`` times,
    pixel-shuffled, and added to the main path.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Channels of the output tensor.
        interpolate: Selects the interpolation path instead of pixel-shuffle.
        shortcut: Whether to add the repeated pixel-shuffle shortcut.
        interpolation_mode: ``mode`` argument used by ``F.interpolate``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        interpolate: bool = False,
        shortcut: bool = True,
        interpolation_mode: str = "nearest",
    ) -> None:
        super().__init__()

        self.interpolate = interpolate
        self.interpolation_mode = interpolation_mode
        self.shortcut = shortcut
        self.factor = 2

        upscale_area = self.factor**2
        # How many times each input channel is duplicated before the shortcut's pixel-shuffle.
        self.repeats = out_channels * upscale_area // in_channels

        # Pixel-shuffle divides channels by factor**2, so the convolution must over-produce.
        conv_channels = out_channels if interpolate else out_channels * upscale_area
        self.conv = nn.Conv2d(in_channels, conv_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Upsample ``hidden_states`` and, if enabled, add the repeated shortcut."""
        if self.interpolate:
            resized = F.interpolate(hidden_states, scale_factor=self.factor, mode=self.interpolation_mode)
            upsampled = self.conv(resized)
        else:
            upsampled = F.pixel_shuffle(self.conv(hidden_states), self.factor)

        if not self.shortcut:
            return upsampled

        skip = hidden_states.repeat_interleave(self.repeats, dim=1)
        skip = F.pixel_shuffle(skip, self.factor)
        return upsampled + skip
200
+
201
+
202
class Encoder(nn.Module):
    """Encoder made of an input projection, a stack of down blocks, and an output convolution.

    Args:
        in_channels: Channels of the input samples.
        latent_channels: Channels of the produced latents.
        attention_head_dim: Passed to every block built through ``get_block``.
        block_type: One block name for all stages, or one name per stage.
        block_out_channels: Channel width of each stage.
        layers_per_block: Number of blocks in each stage. A first entry of ``0`` makes the
            input projection itself a ``DCDownBlock2d`` that maps straight to the width of
            the second stage.
        qkv_multiscales: Per-stage multiscale configuration passed to ``get_block``.
        downsample_block_type: ``"pixel_unshuffle"`` selects the pixel-unshuffle path of
            ``DCDownBlock2d``; any other value selects its strided-convolution path.
        out_shortcut: Whether to add a channel-averaged copy of the final features to the
            output convolution.
    """

    def __init__(
        self,
        in_channels: int,
        latent_channels: int,
        attention_head_dim: int = 32,
        block_type: Union[str, Tuple[str]] = "ResBlock",
        block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
        layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
        qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
        downsample_block_type: str = "pixel_unshuffle",
        out_shortcut: bool = True,
    ):
        super().__init__()

        num_blocks = len(block_out_channels)

        if isinstance(block_type, str):
            block_type = (block_type,) * num_blocks

        if layers_per_block[0] > 0:
            # The first stage has layers, so project to its width at full resolution.
            self.conv_in = nn.Conv2d(
                in_channels,
                block_out_channels[0],
                kernel_size=3,
                stride=1,
                padding=1,
            )
        else:
            # The first stage is empty: downsample immediately to the second stage's width.
            self.conv_in = DCDownBlock2d(
                in_channels=in_channels,
                out_channels=block_out_channels[1],
                downsample=downsample_block_type == "pixel_unshuffle",
                shortcut=False,
            )

        down_blocks = []
        for i, (out_channel, num_layers) in enumerate(zip(block_out_channels, layers_per_block)):
            down_block_list = []

            for _ in range(num_layers):
                block = get_block(
                    block_type[i],
                    out_channel,
                    out_channel,
                    attention_head_dim=attention_head_dim,
                    norm_type="rms_norm",
                    act_fn="silu",
                    qkv_mutliscales=qkv_multiscales[i],
                )
                down_block_list.append(block)

            # Every non-empty stage except the last ends with a downsample to the next width.
            if i < num_blocks - 1 and num_layers > 0:
                downsample_block = DCDownBlock2d(
                    in_channels=out_channel,
                    out_channels=block_out_channels[i + 1],
                    downsample=downsample_block_type == "pixel_unshuffle",
                    shortcut=True,
                )
                down_block_list.append(downsample_block)

            down_blocks.append(nn.Sequential(*down_block_list))

        self.down_blocks = nn.ModuleList(down_blocks)

        self.conv_out = nn.Conv2d(block_out_channels[-1], latent_channels, 3, 1, 1)

        self.out_shortcut = out_shortcut
        if out_shortcut:
            # Number of consecutive feature channels averaged into one latent channel.
            self.out_shortcut_average_group_size = block_out_channels[-1] // latent_channels

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Encode ``hidden_states`` into a tensor with ``latent_channels`` channels."""
        hidden_states = self.conv_in(hidden_states)
        for down_block in self.down_blocks:
            hidden_states = down_block(hidden_states)

        if self.out_shortcut:
            # Shortcut: average groups of channels so the result matches the latent width.
            x = hidden_states.unflatten(1, (-1, self.out_shortcut_average_group_size))
            x = x.mean(dim=2)
            hidden_states = self.conv_out(hidden_states) + x
        else:
            hidden_states = self.conv_out(hidden_states)

        return hidden_states
286
+
287
+
288
class Decoder(nn.Module):
    """Decoder made of an input convolution, a stack of up blocks, and an output head.

    Args:
        in_channels: Channels of the reconstructed output.
        latent_channels: Channels of the incoming latents.
        attention_head_dim: Passed to every block built through ``get_block``.
        block_type: One block name for all stages, or one name per stage.
        block_out_channels: Channel width of each stage.
        layers_per_block: Number of blocks in each stage. A first entry of ``0`` makes the
            output head a ``DCUpBlock2d`` fed with the width of the second stage.
        qkv_multiscales: Per-stage multiscale configuration passed to ``get_block``.
        norm_type: One normalization name for all stages, or one name per stage.
        act_fn: One activation name for all stages, or one name per stage.
        upsample_block_type: ``"interpolate"`` selects the interpolation path of
            ``DCUpBlock2d``; any other value selects its pixel-shuffle path.
        in_shortcut: Whether to add a channel-repeated copy of the latents to the input
            convolution.
    """

    def __init__(
        self,
        in_channels: int,
        latent_channels: int,
        attention_head_dim: int = 32,
        block_type: Union[str, Tuple[str]] = "ResBlock",
        block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
        layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
        qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
        norm_type: Union[str, Tuple[str]] = "rms_norm",
        act_fn: Union[str, Tuple[str]] = "silu",
        upsample_block_type: str = "pixel_shuffle",
        in_shortcut: bool = True,
    ):
        super().__init__()

        num_blocks = len(block_out_channels)

        # Broadcast single names to one entry per stage.
        if isinstance(block_type, str):
            block_type = (block_type,) * num_blocks
        if isinstance(norm_type, str):
            norm_type = (norm_type,) * num_blocks
        if isinstance(act_fn, str):
            act_fn = (act_fn,) * num_blocks

        self.conv_in = nn.Conv2d(latent_channels, block_out_channels[-1], 3, 1, 1)

        self.in_shortcut = in_shortcut
        if in_shortcut:
            # How many times each latent channel is repeated to reach the widest stage.
            self.in_shortcut_repeats = block_out_channels[-1] // latent_channels

        up_blocks = []
        # Stages are constructed from the last (deepest) to the first, but stored in index
        # order: each new stage is inserted at the front of the list.
        for i, (out_channel, num_layers) in reversed(list(enumerate(zip(block_out_channels, layers_per_block)))):
            up_block_list = []

            # Every non-empty stage except the last starts by upsampling from the next width.
            if i < num_blocks - 1 and num_layers > 0:
                upsample_block = DCUpBlock2d(
                    block_out_channels[i + 1],
                    out_channel,
                    interpolate=upsample_block_type == "interpolate",
                    shortcut=True,
                )
                up_block_list.append(upsample_block)

            for _ in range(num_layers):
                block = get_block(
                    block_type[i],
                    out_channel,
                    out_channel,
                    attention_head_dim=attention_head_dim,
                    norm_type=norm_type[i],
                    act_fn=act_fn[i],
                    qkv_mutliscales=qkv_multiscales[i],
                )
                up_block_list.append(block)

            up_blocks.insert(0, nn.Sequential(*up_block_list))

        self.up_blocks = nn.ModuleList(up_blocks)

        # Width entering the output head: the first stage, or the second if the first is empty.
        channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]

        self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
        self.conv_act = nn.ReLU()

        if layers_per_block[0] > 0:
            self.conv_out = nn.Conv2d(channels, in_channels, 3, 1, 1)
        else:
            # The first stage is empty: the output head performs the final upsampling.
            self.conv_out = DCUpBlock2d(
                channels, in_channels, interpolate=upsample_block_type == "interpolate", shortcut=False
            )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Decode latents into a tensor with ``in_channels`` channels."""
        if self.in_shortcut:
            # Shortcut: repeat latent channels so they match the input convolution's width.
            x = hidden_states.repeat_interleave(self.in_shortcut_repeats, dim=1)
            hidden_states = self.conv_in(hidden_states) + x
        else:
            hidden_states = self.conv_in(hidden_states)

        # Blocks are stored in index order, so run them from the last stage to the first.
        for up_block in reversed(self.up_blocks):
            hidden_states = up_block(hidden_states)

        # Apply the norm with channels on the last axis, then restore the layout.
        hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
        hidden_states = self.conv_act(hidden_states)
        hidden_states = self.conv_out(hidden_states)
        return hidden_states
376
+
377
+
378
class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin):
    r"""
    An Autoencoder model introduced in [DCAE](https://arxiv.org/abs/2410.10733) and used in
    [SANA](https://arxiv.org/abs/2410.10629).

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
    for all models (such as downloading or saving).

    Args:
        in_channels (`int`, defaults to `3`):
            The number of input channels in samples.
        latent_channels (`int`, defaults to `32`):
            The number of channels in the latent space representation.
        attention_head_dim (`int`, defaults to `32`):
            The attention head dimension forwarded to both the encoder and the decoder.
        encoder_block_types (`Union[str, Tuple[str]]`, defaults to `"ResBlock"`):
            The type(s) of block to use in the encoder.
        decoder_block_types (`Union[str, Tuple[str]]`, defaults to `"ResBlock"`):
            The type(s) of block to use in the decoder.
        encoder_block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512, 1024, 1024)`):
            The number of output channels for each block in the encoder.
        decoder_block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512, 1024, 1024)`):
            The number of output channels for each block in the decoder.
        encoder_layers_per_block (`Tuple[int]`, defaults to `(2, 2, 2, 3, 3, 3)`):
            The number of layers per block in the encoder.
        decoder_layers_per_block (`Tuple[int]`, defaults to `(3, 3, 3, 3, 3, 3)`):
            The number of layers per block in the decoder.
        encoder_qkv_multiscales (`Tuple[Tuple[int, ...], ...]`, defaults to `((), (), (), (5,), (5,), (5,))`):
            Multi-scale configurations for the encoder's QKV (query-key-value) transformations.
        decoder_qkv_multiscales (`Tuple[Tuple[int, ...], ...]`, defaults to `((), (), (), (5,), (5,), (5,))`):
            Multi-scale configurations for the decoder's QKV (query-key-value) transformations.
        upsample_block_type (`str`, defaults to `"pixel_shuffle"`):
            The type of block to use for upsampling in the decoder.
        downsample_block_type (`str`, defaults to `"pixel_unshuffle"`):
            The type of block to use for downsampling in the encoder.
        decoder_norm_types (`Union[str, Tuple[str]]`, defaults to `"rms_norm"`):
            The normalization type(s) to use in the decoder.
        decoder_act_fns (`Union[str, Tuple[str]]`, defaults to `"silu"`):
            The activation function(s) to use in the decoder.
        scaling_factor (`float`, defaults to `1.0`):
            The multiplicative inverse of the root mean square of the latent features. This is used to scale the latent
            space to have unit variance when training the diffusion model. The latents are scaled with the formula `z =
            z * scaling_factor` before being passed to the diffusion model. When decoding, the latents are scaled back
            to the original scale with the formula: `z = 1 / scaling_factor * z`.
    """

    _supports_gradient_checkpointing = False

    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,
        latent_channels: int = 32,
        attention_head_dim: int = 32,
        encoder_block_types: Union[str, Tuple[str]] = "ResBlock",
        decoder_block_types: Union[str, Tuple[str]] = "ResBlock",
        encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
        decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
        encoder_layers_per_block: Tuple[int] = (2, 2, 2, 3, 3, 3),
        decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3, 3, 3),
        encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
        decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
        upsample_block_type: str = "pixel_shuffle",
        downsample_block_type: str = "pixel_unshuffle",
        decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
        decoder_act_fns: Union[str, Tuple[str]] = "silu",
        scaling_factor: float = 1.0,
    ) -> None:
        super().__init__()

        self.encoder = Encoder(
            in_channels=in_channels,
            latent_channels=latent_channels,
            attention_head_dim=attention_head_dim,
            block_type=encoder_block_types,
            block_out_channels=encoder_block_out_channels,
            layers_per_block=encoder_layers_per_block,
            qkv_multiscales=encoder_qkv_multiscales,
            downsample_block_type=downsample_block_type,
        )
        self.decoder = Decoder(
            in_channels=in_channels,
            latent_channels=latent_channels,
            attention_head_dim=attention_head_dim,
            block_type=decoder_block_types,
            block_out_channels=decoder_block_out_channels,
            layers_per_block=decoder_layers_per_block,
            qkv_multiscales=decoder_qkv_multiscales,
            norm_type=decoder_norm_types,
            act_fn=decoder_act_fns,
            upsample_block_type=upsample_block_type,
        )

        self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
        self.temporal_compression_ratio = 1

        # When processing a batch, memory can be saved by slicing across the batch dimension and handling one
        # element at a time.
        self.use_slicing = False

        # When processing spatially large inputs, the memory requirement is very high. Splitting the input into
        # smaller spatial tiles, running several forward passes and blending the results lowers that requirement.
        self.use_tiling = False

        # The minimal tile height and width (in sample space) for spatial tiling to be used
        self.tile_sample_min_height = 512
        self.tile_sample_min_width = 512

        # The stride (in sample space) between two consecutive spatial tiles
        self.tile_sample_stride_height = 448
        self.tile_sample_stride_width = 448

        # Latent-space counterparts of the minimal tile sizes; `_decode` compares latent dimensions against these.
        self.tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
        self.tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio

    def enable_tiling(
        self,
        tile_sample_min_height: Optional[int] = None,
        tile_sample_min_width: Optional[int] = None,
        tile_sample_stride_height: Optional[float] = None,
        tile_sample_stride_width: Optional[float] = None,
    ) -> None:
        r"""
        Enable tiled AE decoding. When this option is enabled, the AE will split the input tensor into tiles to compute
        decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.

        Any argument left as `None` keeps the value currently stored on the model.

        Args:
            tile_sample_min_height (`int`, *optional*):
                The minimum height required for a sample to be separated into tiles across the height dimension.
            tile_sample_min_width (`int`, *optional*):
                The minimum width required for a sample to be separated into tiles across the width dimension.
            tile_sample_stride_height (`int`, *optional*):
                The stride between two consecutive vertical tiles. This is to ensure that there are no tiling
                artifacts produced across the height dimension.
            tile_sample_stride_width (`int`, *optional*):
                The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
                artifacts produced across the width dimension.
        """
        self.use_tiling = True
        self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
        self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
        self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
        self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width

        # Keep the latent-space thresholds in sync with the (possibly updated) sample-space ones.
        self.tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
        self.tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio

    def disable_tiling(self) -> None:
        r"""
        Disable tiled AE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        """
        self.use_tiling = False

    def enable_slicing(self) -> None:
        r"""
        Enable sliced AE decoding. When this option is enabled, the AE will split the input tensor in slices to compute
        decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
        self.use_slicing = True

    def disable_slicing(self) -> None:
        r"""
        Disable sliced AE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        """
        self.use_slicing = False

    def _encode(self, x: torch.Tensor) -> torch.Tensor:
        r"""
        Encode `x` with the encoder, dispatching to `tiled_encode` when tiling is enabled and the input is larger
        than the minimal sample tile size in either spatial dimension.
        """
        # The unpacking also enforces a 4-dimensional input.
        _, _, height, width = x.shape

        if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
            return self.tiled_encode(x, return_dict=False)[0]

        return self.encoder(x)

    @apply_forward_hook
    def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[EncoderOutput, Tuple[torch.Tensor]]:
        r"""
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, defaults to `True`):
                Whether to return a [`~models.vae.EncoderOutput`] instead of a plain tuple.

        Returns:
            The latent representations of the encoded images. If `return_dict` is True, a
            [`~models.vae.EncoderOutput`] is returned, otherwise a plain `tuple` is returned.
        """
        if self.use_slicing and x.shape[0] > 1:
            # Encode one batch element at a time and stitch the results back together.
            encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
            encoded = torch.cat(encoded_slices)
        else:
            encoded = self._encode(x)

        if not return_dict:
            return (encoded,)
        return EncoderOutput(latent=encoded)

    def _decode(self, z: torch.Tensor) -> torch.Tensor:
        r"""
        Decode `z` with the decoder, dispatching to `tiled_decode` when tiling is enabled and the latent is larger
        than the minimal latent tile size in either spatial dimension.
        """
        # The unpacking also enforces a 4-dimensional input.
        _, _, height, width = z.shape

        if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height):
            return self.tiled_decode(z, return_dict=False)[0]

        return self.decoder(z)

    @apply_forward_hook
    def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
        r"""
        Decode a batch of images.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        """
        if self.use_slicing and z.size(0) > 1:
            # `_decode` already returns a plain tensor, so the slices can be concatenated directly.
            decoded_slices = [self._decode(z_slice) for z_slice in z.split(1)]
            decoded = torch.cat(decoded_slices)
        else:
            decoded = self._decode(z)

        if not return_dict:
            return (decoded,)
        return DecoderOutput(sample=decoded)

    def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> torch.Tensor:
        r"""Placeholder for tiled encoding; always raises `NotImplementedError`."""
        raise NotImplementedError("`tiled_encode` has not been implemented for AutoencoderDC.")

    def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
        r"""Placeholder for tiled decoding; always raises `NotImplementedError`."""
        raise NotImplementedError("`tiled_decode` has not been implemented for AutoencoderDC.")

    def forward(self, sample: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
        r"""
        Encode `sample` into latents and decode them back.

        Args:
            sample (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        """
        encoded = self.encode(sample, return_dict=False)[0]
        decoded = self.decode(encoded, return_dict=False)[0]
        if not return_dict:
            return (decoded,)
        return DecoderOutput(sample=decoded)