diffusers 0.31.0__py3-none-any.whl → 0.32.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (214)
  1. diffusers/__init__.py +66 -5
  2. diffusers/callbacks.py +56 -3
  3. diffusers/configuration_utils.py +1 -1
  4. diffusers/dependency_versions_table.py +1 -1
  5. diffusers/image_processor.py +25 -17
  6. diffusers/loaders/__init__.py +22 -3
  7. diffusers/loaders/ip_adapter.py +538 -15
  8. diffusers/loaders/lora_base.py +124 -118
  9. diffusers/loaders/lora_conversion_utils.py +318 -3
  10. diffusers/loaders/lora_pipeline.py +1688 -368
  11. diffusers/loaders/peft.py +379 -0
  12. diffusers/loaders/single_file_model.py +71 -4
  13. diffusers/loaders/single_file_utils.py +519 -9
  14. diffusers/loaders/textual_inversion.py +3 -3
  15. diffusers/loaders/transformer_flux.py +181 -0
  16. diffusers/loaders/transformer_sd3.py +89 -0
  17. diffusers/loaders/unet.py +17 -4
  18. diffusers/models/__init__.py +47 -14
  19. diffusers/models/activations.py +22 -9
  20. diffusers/models/attention.py +13 -4
  21. diffusers/models/attention_flax.py +1 -1
  22. diffusers/models/attention_processor.py +2059 -281
  23. diffusers/models/autoencoders/__init__.py +5 -0
  24. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  25. diffusers/models/autoencoders/autoencoder_kl.py +2 -1
  26. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  27. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +36 -27
  28. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  29. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  30. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  31. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
  32. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  33. diffusers/models/autoencoders/vae.py +18 -5
  34. diffusers/models/controlnet.py +47 -802
  35. diffusers/models/controlnet_flux.py +29 -495
  36. diffusers/models/controlnet_sd3.py +25 -379
  37. diffusers/models/controlnet_sparsectrl.py +46 -718
  38. diffusers/models/controlnets/__init__.py +23 -0
  39. diffusers/models/controlnets/controlnet.py +872 -0
  40. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
  41. diffusers/models/controlnets/controlnet_flux.py +536 -0
  42. diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
  43. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  44. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  45. diffusers/models/controlnets/controlnet_union.py +832 -0
  46. diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
  47. diffusers/models/controlnets/multicontrolnet.py +183 -0
  48. diffusers/models/embeddings.py +838 -43
  49. diffusers/models/model_loading_utils.py +88 -6
  50. diffusers/models/modeling_flax_utils.py +1 -1
  51. diffusers/models/modeling_utils.py +74 -28
  52. diffusers/models/normalization.py +78 -13
  53. diffusers/models/transformers/__init__.py +5 -0
  54. diffusers/models/transformers/auraflow_transformer_2d.py +2 -2
  55. diffusers/models/transformers/cogvideox_transformer_3d.py +46 -11
  56. diffusers/models/transformers/dit_transformer_2d.py +1 -1
  57. diffusers/models/transformers/latte_transformer_3d.py +4 -4
  58. diffusers/models/transformers/pixart_transformer_2d.py +1 -1
  59. diffusers/models/transformers/sana_transformer.py +488 -0
  60. diffusers/models/transformers/stable_audio_transformer.py +1 -1
  61. diffusers/models/transformers/transformer_2d.py +1 -1
  62. diffusers/models/transformers/transformer_allegro.py +422 -0
  63. diffusers/models/transformers/transformer_cogview3plus.py +1 -1
  64. diffusers/models/transformers/transformer_flux.py +30 -9
  65. diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
  66. diffusers/models/transformers/transformer_ltx.py +469 -0
  67. diffusers/models/transformers/transformer_mochi.py +499 -0
  68. diffusers/models/transformers/transformer_sd3.py +105 -17
  69. diffusers/models/transformers/transformer_temporal.py +1 -1
  70. diffusers/models/unets/unet_1d_blocks.py +1 -1
  71. diffusers/models/unets/unet_2d.py +8 -1
  72. diffusers/models/unets/unet_2d_blocks.py +88 -21
  73. diffusers/models/unets/unet_2d_condition.py +1 -1
  74. diffusers/models/unets/unet_3d_blocks.py +9 -7
  75. diffusers/models/unets/unet_motion_model.py +5 -5
  76. diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
  77. diffusers/models/unets/unet_stable_cascade.py +2 -2
  78. diffusers/models/unets/uvit_2d.py +1 -1
  79. diffusers/models/upsampling.py +8 -0
  80. diffusers/pipelines/__init__.py +34 -0
  81. diffusers/pipelines/allegro/__init__.py +48 -0
  82. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  83. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  84. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +8 -2
  85. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1 -1
  86. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +0 -6
  87. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +8 -8
  88. diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
  89. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +1 -8
  90. diffusers/pipelines/auto_pipeline.py +53 -6
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +50 -22
  93. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +51 -20
  94. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +69 -21
  95. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +47 -21
  96. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +1 -1
  97. diffusers/pipelines/controlnet/__init__.py +86 -80
  98. diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
  99. diffusers/pipelines/controlnet/pipeline_controlnet.py +11 -2
  100. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -2
  101. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +1 -2
  102. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +1 -2
  103. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +3 -3
  104. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -3
  105. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  106. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  107. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  108. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +5 -1
  109. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +53 -19
  110. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +7 -7
  111. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -8
  112. diffusers/pipelines/flux/__init__.py +13 -1
  113. diffusers/pipelines/flux/modeling_flux.py +47 -0
  114. diffusers/pipelines/flux/pipeline_flux.py +204 -29
  115. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  116. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  117. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  118. diffusers/pipelines/flux/pipeline_flux_controlnet.py +49 -27
  119. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +40 -30
  120. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +78 -56
  121. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  122. diffusers/pipelines/flux/pipeline_flux_img2img.py +33 -27
  123. diffusers/pipelines/flux/pipeline_flux_inpaint.py +36 -29
  124. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  125. diffusers/pipelines/flux/pipeline_output.py +16 -0
  126. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  127. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  128. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  129. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +5 -1
  130. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
  131. diffusers/pipelines/kolors/text_encoder.py +2 -2
  132. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  133. diffusers/pipelines/ltx/__init__.py +50 -0
  134. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  135. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  136. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  137. diffusers/pipelines/lumina/pipeline_lumina.py +1 -8
  138. diffusers/pipelines/mochi/__init__.py +48 -0
  139. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  140. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  141. diffusers/pipelines/pag/__init__.py +7 -0
  142. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1 -2
  143. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1 -2
  144. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1 -3
  145. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1 -3
  146. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +5 -1
  147. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +6 -13
  148. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  149. diffusers/pipelines/pag/pipeline_pag_sd_3.py +6 -6
  150. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  151. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +3 -0
  152. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  153. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  154. diffusers/pipelines/pipeline_loading_utils.py +25 -4
  155. diffusers/pipelines/pipeline_utils.py +35 -6
  156. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +6 -13
  157. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +6 -13
  158. diffusers/pipelines/sana/__init__.py +47 -0
  159. diffusers/pipelines/sana/pipeline_output.py +21 -0
  160. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  161. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
  162. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -3
  163. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +216 -20
  164. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +62 -9
  165. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +57 -8
  166. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
  167. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -8
  168. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -8
  169. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -8
  170. diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
  171. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  172. diffusers/quantizers/auto.py +14 -1
  173. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +4 -1
  174. diffusers/quantizers/gguf/__init__.py +1 -0
  175. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  176. diffusers/quantizers/gguf/utils.py +456 -0
  177. diffusers/quantizers/quantization_config.py +280 -2
  178. diffusers/quantizers/torchao/__init__.py +15 -0
  179. diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
  180. diffusers/schedulers/scheduling_ddpm.py +2 -6
  181. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -6
  182. diffusers/schedulers/scheduling_deis_multistep.py +28 -9
  183. diffusers/schedulers/scheduling_dpmsolver_multistep.py +35 -9
  184. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +35 -8
  185. diffusers/schedulers/scheduling_dpmsolver_sde.py +4 -4
  186. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +48 -10
  187. diffusers/schedulers/scheduling_euler_discrete.py +4 -4
  188. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
  189. diffusers/schedulers/scheduling_heun_discrete.py +4 -4
  190. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +4 -4
  191. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +4 -4
  192. diffusers/schedulers/scheduling_lcm.py +2 -6
  193. diffusers/schedulers/scheduling_lms_discrete.py +4 -4
  194. diffusers/schedulers/scheduling_repaint.py +1 -1
  195. diffusers/schedulers/scheduling_sasolver.py +28 -9
  196. diffusers/schedulers/scheduling_tcd.py +2 -6
  197. diffusers/schedulers/scheduling_unipc_multistep.py +53 -8
  198. diffusers/training_utils.py +16 -2
  199. diffusers/utils/__init__.py +5 -0
  200. diffusers/utils/constants.py +1 -0
  201. diffusers/utils/dummy_pt_objects.py +180 -0
  202. diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
  203. diffusers/utils/dynamic_modules_utils.py +3 -3
  204. diffusers/utils/hub_utils.py +31 -39
  205. diffusers/utils/import_utils.py +67 -0
  206. diffusers/utils/peft_utils.py +3 -0
  207. diffusers/utils/testing_utils.py +56 -1
  208. diffusers/utils/torch_utils.py +3 -0
  209. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/METADATA +69 -69
  210. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/RECORD +214 -162
  211. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
  212. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
  213. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
  214. {diffusers-0.31.0.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
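
The hunk below is the new file `diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py` (entry 28 in the list above), which adds the causal 3D VAE used by HunyuanVideo. Its mid-block attention is masked by the `prepare_causal_attention_mask` helper defined near the top of the hunk: every spatial token may attend to all tokens of its own frame and of earlier frames, but never to later frames. A minimal sketch of the pattern the helper produces, with toy sizes chosen purely for illustration:

```python
import torch

# Mirrors prepare_causal_attention_mask from the hunk below, for
# 3 frames of 2 spatial tokens each (height * width = 2).
num_frames, height_width = 3, 2
seq_len = num_frames * height_width
mask = torch.full((seq_len, seq_len), float("-inf"))
for i in range(seq_len):
    i_frame = i // height_width  # frame that token i belongs to
    mask[i, : (i_frame + 1) * height_width] = 0  # allow own and earlier frames

print((mask == 0).int())
# tensor([[1, 1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1, 1],
#         [1, 1, 1, 1, 1, 1]])
```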
@@ -0,0 +1,1176 @@
+ # Copyright 2024 The Hunyuan Team and The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any, Dict, Optional, Tuple, Union
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.utils.checkpoint
+
+ from ...configuration_utils import ConfigMixin, register_to_config
+ from ...utils import is_torch_version, logging
+ from ...utils.accelerate_utils import apply_forward_hook
+ from ..activations import get_activation
+ from ..attention_processor import Attention
+ from ..modeling_outputs import AutoencoderKLOutput
+ from ..modeling_utils import ModelMixin
+ from .vae import DecoderOutput, DiagonalGaussianDistribution
+
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+ def prepare_causal_attention_mask(
+     num_frames: int, height_width: int, dtype: torch.dtype, device: torch.device, batch_size: int = None
+ ) -> torch.Tensor:
+     seq_len = num_frames * height_width
+     mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
+     for i in range(seq_len):
+         i_frame = i // height_width
+         mask[i, : (i_frame + 1) * height_width] = 0
+     if batch_size is not None:
+         mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
+     return mask
+
+
+ class HunyuanVideoCausalConv3d(nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         kernel_size: Union[int, Tuple[int, int, int]] = 3,
+         stride: Union[int, Tuple[int, int, int]] = 1,
+         padding: Union[int, Tuple[int, int, int]] = 0,
+         dilation: Union[int, Tuple[int, int, int]] = 1,
+         bias: bool = True,
+         pad_mode: str = "replicate",
+     ) -> None:
+         super().__init__()
+
+         kernel_size = (kernel_size, kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+
+         self.pad_mode = pad_mode
+         self.time_causal_padding = (
+             kernel_size[0] // 2,
+             kernel_size[0] // 2,
+             kernel_size[1] // 2,
+             kernel_size[1] // 2,
+             kernel_size[2] - 1,
+             0,
+         )
+
+         self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         hidden_states = F.pad(hidden_states, self.time_causal_padding, mode=self.pad_mode)
+         return self.conv(hidden_states)
+
+
+ class HunyuanVideoUpsampleCausal3D(nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: Optional[int] = None,
+         kernel_size: int = 3,
+         stride: int = 1,
+         bias: bool = True,
+         upsample_factor: Tuple[float, float, float] = (2, 2, 2),
+     ) -> None:
+         super().__init__()
+
+         out_channels = out_channels or in_channels
+         self.upsample_factor = upsample_factor
+
+         self.conv = HunyuanVideoCausalConv3d(in_channels, out_channels, kernel_size, stride, bias=bias)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         num_frames = hidden_states.size(2)
+
+         first_frame, other_frames = hidden_states.split((1, num_frames - 1), dim=2)
+         first_frame = F.interpolate(
+             first_frame.squeeze(2), scale_factor=self.upsample_factor[1:], mode="nearest"
+         ).unsqueeze(2)
+
+         if num_frames > 1:
+             # See: https://github.com/pytorch/pytorch/issues/81665
+             # Unless you have a version of pytorch where non-contiguous implementation of F.interpolate
+             # is fixed, this will raise either a runtime error, or fail silently with bad outputs.
+             # If you are encountering an error here, make sure to try running encoding/decoding with
+             # `vae.enable_tiling()` first. If that doesn't work, open an issue at:
+             # https://github.com/huggingface/diffusers/issues
+             other_frames = other_frames.contiguous()
+             other_frames = F.interpolate(other_frames, scale_factor=self.upsample_factor, mode="nearest")
+             hidden_states = torch.cat((first_frame, other_frames), dim=2)
+         else:
+             hidden_states = first_frame
+
+         hidden_states = self.conv(hidden_states)
+         return hidden_states
+
+
+ class HunyuanVideoDownsampleCausal3D(nn.Module):
+     def __init__(
+         self,
+         channels: int,
+         out_channels: Optional[int] = None,
+         padding: int = 1,
+         kernel_size: int = 3,
+         bias: bool = True,
+         stride=2,
+     ) -> None:
+         super().__init__()
+         out_channels = out_channels or channels
+
+         self.conv = HunyuanVideoCausalConv3d(channels, out_channels, kernel_size, stride, padding, bias=bias)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         hidden_states = self.conv(hidden_states)
+         return hidden_states
+
+
+ class HunyuanVideoResnetBlockCausal3D(nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: Optional[int] = None,
+         dropout: float = 0.0,
+         groups: int = 32,
+         eps: float = 1e-6,
+         non_linearity: str = "swish",
+     ) -> None:
+         super().__init__()
+         out_channels = out_channels or in_channels
+
+         self.nonlinearity = get_activation(non_linearity)
+
+         self.norm1 = nn.GroupNorm(groups, in_channels, eps=eps, affine=True)
+         self.conv1 = HunyuanVideoCausalConv3d(in_channels, out_channels, 3, 1, 0)
+
+         self.norm2 = nn.GroupNorm(groups, out_channels, eps=eps, affine=True)
+         self.dropout = nn.Dropout(dropout)
+         self.conv2 = HunyuanVideoCausalConv3d(out_channels, out_channels, 3, 1, 0)
+
+         self.conv_shortcut = None
+         if in_channels != out_channels:
+             self.conv_shortcut = HunyuanVideoCausalConv3d(in_channels, out_channels, 1, 1, 0)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         hidden_states = hidden_states.contiguous()
+         residual = hidden_states
+
+         hidden_states = self.norm1(hidden_states)
+         hidden_states = self.nonlinearity(hidden_states)
+         hidden_states = self.conv1(hidden_states)
+
+         hidden_states = self.norm2(hidden_states)
+         hidden_states = self.nonlinearity(hidden_states)
+         hidden_states = self.dropout(hidden_states)
+         hidden_states = self.conv2(hidden_states)
+
+         if self.conv_shortcut is not None:
+             residual = self.conv_shortcut(residual)
+
+         hidden_states = hidden_states + residual
+         return hidden_states
+
+
+ class HunyuanVideoMidBlock3D(nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         dropout: float = 0.0,
+         num_layers: int = 1,
+         resnet_eps: float = 1e-6,
+         resnet_act_fn: str = "swish",
+         resnet_groups: int = 32,
+         add_attention: bool = True,
+         attention_head_dim: int = 1,
+     ) -> None:
+         super().__init__()
+         resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+         self.add_attention = add_attention
+
+         # There is always at least one resnet
+         resnets = [
+             HunyuanVideoResnetBlockCausal3D(
+                 in_channels=in_channels,
+                 out_channels=in_channels,
+                 eps=resnet_eps,
+                 groups=resnet_groups,
+                 dropout=dropout,
+                 non_linearity=resnet_act_fn,
+             )
+         ]
+         attentions = []
+
+         for _ in range(num_layers):
+             if self.add_attention:
+                 attentions.append(
+                     Attention(
+                         in_channels,
+                         heads=in_channels // attention_head_dim,
+                         dim_head=attention_head_dim,
+                         eps=resnet_eps,
+                         norm_num_groups=resnet_groups,
+                         residual_connection=True,
+                         bias=True,
+                         upcast_softmax=True,
+                         _from_deprecated_attn_block=True,
+                     )
+                 )
+             else:
+                 attentions.append(None)
+
+             resnets.append(
+                 HunyuanVideoResnetBlockCausal3D(
+                     in_channels=in_channels,
+                     out_channels=in_channels,
+                     eps=resnet_eps,
+                     groups=resnet_groups,
+                     dropout=dropout,
+                     non_linearity=resnet_act_fn,
+                 )
+             )
+
+         self.attentions = nn.ModuleList(attentions)
+         self.resnets = nn.ModuleList(resnets)
+
+         self.gradient_checkpointing = False
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+             def create_custom_forward(module, return_dict=None):
+                 def custom_forward(*inputs):
+                     if return_dict is not None:
+                         return module(*inputs, return_dict=return_dict)
+                     else:
+                         return module(*inputs)
+
+                 return custom_forward
+
+             ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+             hidden_states = torch.utils.checkpoint.checkpoint(
+                 create_custom_forward(self.resnets[0]), hidden_states, **ckpt_kwargs
+             )
+
+             for attn, resnet in zip(self.attentions, self.resnets[1:]):
+                 if attn is not None:
+                     batch_size, num_channels, num_frames, height, width = hidden_states.shape
+                     hidden_states = hidden_states.permute(0, 2, 3, 4, 1).flatten(1, 3)
+                     attention_mask = prepare_causal_attention_mask(
+                         num_frames, height * width, hidden_states.dtype, hidden_states.device, batch_size=batch_size
+                     )
+                     hidden_states = attn(hidden_states, attention_mask=attention_mask)
+                     hidden_states = hidden_states.unflatten(1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
+
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(resnet), hidden_states, **ckpt_kwargs
+                 )
+
+         else:
+             hidden_states = self.resnets[0](hidden_states)
+
+             for attn, resnet in zip(self.attentions, self.resnets[1:]):
+                 if attn is not None:
+                     batch_size, num_channels, num_frames, height, width = hidden_states.shape
+                     hidden_states = hidden_states.permute(0, 2, 3, 4, 1).flatten(1, 3)
+                     attention_mask = prepare_causal_attention_mask(
+                         num_frames, height * width, hidden_states.dtype, hidden_states.device, batch_size=batch_size
+                     )
+                     hidden_states = attn(hidden_states, attention_mask=attention_mask)
+                     hidden_states = hidden_states.unflatten(1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
+
+                 hidden_states = resnet(hidden_states)
+
+         return hidden_states
+
+
+ class HunyuanVideoDownBlock3D(nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         dropout: float = 0.0,
+         num_layers: int = 1,
+         resnet_eps: float = 1e-6,
+         resnet_act_fn: str = "swish",
+         resnet_groups: int = 32,
+         add_downsample: bool = True,
+         downsample_stride: int = 2,
+         downsample_padding: int = 1,
+     ) -> None:
+         super().__init__()
+         resnets = []
+
+         for i in range(num_layers):
+             in_channels = in_channels if i == 0 else out_channels
+             resnets.append(
+                 HunyuanVideoResnetBlockCausal3D(
+                     in_channels=in_channels,
+                     out_channels=out_channels,
+                     eps=resnet_eps,
+                     groups=resnet_groups,
+                     dropout=dropout,
+                     non_linearity=resnet_act_fn,
+                 )
+             )
+
+         self.resnets = nn.ModuleList(resnets)
+
+         if add_downsample:
+             self.downsamplers = nn.ModuleList(
+                 [
+                     HunyuanVideoDownsampleCausal3D(
+                         out_channels,
+                         out_channels=out_channels,
+                         padding=downsample_padding,
+                         stride=downsample_stride,
+                     )
+                 ]
+             )
+         else:
+             self.downsamplers = None
+
+         self.gradient_checkpointing = False
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+             def create_custom_forward(module, return_dict=None):
+                 def custom_forward(*inputs):
+                     if return_dict is not None:
+                         return module(*inputs, return_dict=return_dict)
+                     else:
+                         return module(*inputs)
+
+                 return custom_forward
+
+             ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+             for resnet in self.resnets:
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(resnet), hidden_states, **ckpt_kwargs
+                 )
+         else:
+             for resnet in self.resnets:
+                 hidden_states = resnet(hidden_states)
+
+         if self.downsamplers is not None:
+             for downsampler in self.downsamplers:
+                 hidden_states = downsampler(hidden_states)
+
+         return hidden_states
+
+
+ class HunyuanVideoUpBlock3D(nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         dropout: float = 0.0,
+         num_layers: int = 1,
+         resnet_eps: float = 1e-6,
+         resnet_act_fn: str = "swish",
+         resnet_groups: int = 32,
+         add_upsample: bool = True,
+         upsample_scale_factor: Tuple[int, int, int] = (2, 2, 2),
+     ) -> None:
+         super().__init__()
+         resnets = []
+
+         for i in range(num_layers):
+             input_channels = in_channels if i == 0 else out_channels
+
+             resnets.append(
+                 HunyuanVideoResnetBlockCausal3D(
+                     in_channels=input_channels,
+                     out_channels=out_channels,
+                     eps=resnet_eps,
+                     groups=resnet_groups,
+                     dropout=dropout,
+                     non_linearity=resnet_act_fn,
+                 )
+             )
+
+         self.resnets = nn.ModuleList(resnets)
+
+         if add_upsample:
+             self.upsamplers = nn.ModuleList(
+                 [
+                     HunyuanVideoUpsampleCausal3D(
+                         out_channels,
+                         out_channels=out_channels,
+                         upsample_factor=upsample_scale_factor,
+                     )
+                 ]
+             )
+         else:
+             self.upsamplers = None
+
+         self.gradient_checkpointing = False
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+             def create_custom_forward(module, return_dict=None):
+                 def custom_forward(*inputs):
+                     if return_dict is not None:
+                         return module(*inputs, return_dict=return_dict)
+                     else:
+                         return module(*inputs)
+
+                 return custom_forward
+
+             ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+             for resnet in self.resnets:
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(resnet), hidden_states, **ckpt_kwargs
+                 )
+
+         else:
+             for resnet in self.resnets:
+                 hidden_states = resnet(hidden_states)
+
+         if self.upsamplers is not None:
+             for upsampler in self.upsamplers:
+                 hidden_states = upsampler(hidden_states)
+
+         return hidden_states
+
+
+ class HunyuanVideoEncoder3D(nn.Module):
+     r"""
+     Causal encoder for 3D video-like data introduced in [Hunyuan Video](https://huggingface.co/papers/2412.03603).
+     """
+
+     def __init__(
+         self,
+         in_channels: int = 3,
+         out_channels: int = 3,
+         down_block_types: Tuple[str, ...] = (
+             "HunyuanVideoDownBlock3D",
+             "HunyuanVideoDownBlock3D",
+             "HunyuanVideoDownBlock3D",
+             "HunyuanVideoDownBlock3D",
+         ),
+         block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
+         layers_per_block: int = 2,
+         norm_num_groups: int = 32,
+         act_fn: str = "silu",
+         double_z: bool = True,
+         mid_block_add_attention=True,
+         temporal_compression_ratio: int = 4,
+         spatial_compression_ratio: int = 8,
+     ) -> None:
+         super().__init__()
+
+         self.conv_in = HunyuanVideoCausalConv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1)
+         self.mid_block = None
+         self.down_blocks = nn.ModuleList([])
+
+         output_channel = block_out_channels[0]
+         for i, down_block_type in enumerate(down_block_types):
+             if down_block_type != "HunyuanVideoDownBlock3D":
+                 raise ValueError(f"Unsupported down_block_type: {down_block_type}")
+
+             input_channel = output_channel
+             output_channel = block_out_channels[i]
+             is_final_block = i == len(block_out_channels) - 1
+             num_spatial_downsample_layers = int(np.log2(spatial_compression_ratio))
+             num_time_downsample_layers = int(np.log2(temporal_compression_ratio))
+
+             if temporal_compression_ratio == 4:
+                 add_spatial_downsample = bool(i < num_spatial_downsample_layers)
+                 add_time_downsample = bool(
+                     i >= (len(block_out_channels) - 1 - num_time_downsample_layers) and not is_final_block
+                 )
+             elif temporal_compression_ratio == 8:
+                 add_spatial_downsample = bool(i < num_spatial_downsample_layers)
+                 add_time_downsample = bool(i < num_time_downsample_layers)
+             else:
+                 raise ValueError(f"Unsupported time_compression_ratio: {temporal_compression_ratio}")
+
+             downsample_stride_HW = (2, 2) if add_spatial_downsample else (1, 1)
+             downsample_stride_T = (2,) if add_time_downsample else (1,)
+             downsample_stride = tuple(downsample_stride_T + downsample_stride_HW)
+
+             down_block = HunyuanVideoDownBlock3D(
+                 num_layers=layers_per_block,
+                 in_channels=input_channel,
+                 out_channels=output_channel,
+                 add_downsample=bool(add_spatial_downsample or add_time_downsample),
+                 resnet_eps=1e-6,
+                 resnet_act_fn=act_fn,
+                 resnet_groups=norm_num_groups,
+                 downsample_stride=downsample_stride,
+                 downsample_padding=0,
+             )
+
+             self.down_blocks.append(down_block)
+
+         self.mid_block = HunyuanVideoMidBlock3D(
+             in_channels=block_out_channels[-1],
+             resnet_eps=1e-6,
+             resnet_act_fn=act_fn,
+             attention_head_dim=block_out_channels[-1],
+             resnet_groups=norm_num_groups,
+             add_attention=mid_block_add_attention,
+         )
+
+         self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
+         self.conv_act = nn.SiLU()
+
+         conv_out_channels = 2 * out_channels if double_z else out_channels
+         self.conv_out = HunyuanVideoCausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3)
+
+         self.gradient_checkpointing = False
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         hidden_states = self.conv_in(hidden_states)
+
+         if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+             def create_custom_forward(module, return_dict=None):
+                 def custom_forward(*inputs):
+                     if return_dict is not None:
+                         return module(*inputs, return_dict=return_dict)
+                     else:
+                         return module(*inputs)
+
+                 return custom_forward
+
+             ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+             for down_block in self.down_blocks:
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(down_block), hidden_states, **ckpt_kwargs
+                 )
+
+             hidden_states = torch.utils.checkpoint.checkpoint(
+                 create_custom_forward(self.mid_block), hidden_states, **ckpt_kwargs
+             )
+         else:
+             for down_block in self.down_blocks:
+                 hidden_states = down_block(hidden_states)
+
+             hidden_states = self.mid_block(hidden_states)
+
+         hidden_states = self.conv_norm_out(hidden_states)
+         hidden_states = self.conv_act(hidden_states)
+         hidden_states = self.conv_out(hidden_states)
+
+         return hidden_states
+
+
+ class HunyuanVideoDecoder3D(nn.Module):
+     r"""
+     Causal decoder for 3D video-like data introduced in [Hunyuan Video](https://huggingface.co/papers/2412.03603).
+     """
+
+     def __init__(
+         self,
+         in_channels: int = 3,
+         out_channels: int = 3,
+         up_block_types: Tuple[str, ...] = (
+             "HunyuanVideoUpBlock3D",
+             "HunyuanVideoUpBlock3D",
+             "HunyuanVideoUpBlock3D",
+             "HunyuanVideoUpBlock3D",
+         ),
+         block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
+         layers_per_block: int = 2,
+         norm_num_groups: int = 32,
+         act_fn: str = "silu",
+         mid_block_add_attention=True,
+         time_compression_ratio: int = 4,
+         spatial_compression_ratio: int = 8,
+     ):
+         super().__init__()
+         self.layers_per_block = layers_per_block
+
+         self.conv_in = HunyuanVideoCausalConv3d(in_channels, block_out_channels[-1], kernel_size=3, stride=1)
+         self.up_blocks = nn.ModuleList([])
+
+         # mid
+         self.mid_block = HunyuanVideoMidBlock3D(
+             in_channels=block_out_channels[-1],
+             resnet_eps=1e-6,
+             resnet_act_fn=act_fn,
+             attention_head_dim=block_out_channels[-1],
+             resnet_groups=norm_num_groups,
+             add_attention=mid_block_add_attention,
+         )
+
+         # up
+         reversed_block_out_channels = list(reversed(block_out_channels))
+         output_channel = reversed_block_out_channels[0]
+         for i, up_block_type in enumerate(up_block_types):
+             if up_block_type != "HunyuanVideoUpBlock3D":
+                 raise ValueError(f"Unsupported up_block_type: {up_block_type}")
+
+             prev_output_channel = output_channel
+             output_channel = reversed_block_out_channels[i]
+             is_final_block = i == len(block_out_channels) - 1
+             num_spatial_upsample_layers = int(np.log2(spatial_compression_ratio))
+             num_time_upsample_layers = int(np.log2(time_compression_ratio))
+
+             if time_compression_ratio == 4:
+                 add_spatial_upsample = bool(i < num_spatial_upsample_layers)
+                 add_time_upsample = bool(
+                     i >= len(block_out_channels) - 1 - num_time_upsample_layers and not is_final_block
+                 )
+             else:
+                 raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}")
+
+             upsample_scale_factor_HW = (2, 2) if add_spatial_upsample else (1, 1)
+             upsample_scale_factor_T = (2,) if add_time_upsample else (1,)
+             upsample_scale_factor = tuple(upsample_scale_factor_T + upsample_scale_factor_HW)
+
+             up_block = HunyuanVideoUpBlock3D(
+                 num_layers=self.layers_per_block + 1,
+                 in_channels=prev_output_channel,
+                 out_channels=output_channel,
+                 add_upsample=bool(add_spatial_upsample or add_time_upsample),
+                 upsample_scale_factor=upsample_scale_factor,
+                 resnet_eps=1e-6,
+                 resnet_act_fn=act_fn,
+                 resnet_groups=norm_num_groups,
+             )
+
+             self.up_blocks.append(up_block)
+             prev_output_channel = output_channel
+
+         # out
+         self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+         self.conv_act = nn.SiLU()
+         self.conv_out = HunyuanVideoCausalConv3d(block_out_channels[0], out_channels, kernel_size=3)
+
+         self.gradient_checkpointing = False
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         hidden_states = self.conv_in(hidden_states)
+
+         if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+             def create_custom_forward(module, return_dict=None):
+                 def custom_forward(*inputs):
+                     if return_dict is not None:
+                         return module(*inputs, return_dict=return_dict)
+                     else:
+                         return module(*inputs)
+
+                 return custom_forward
+
+             ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+             hidden_states = torch.utils.checkpoint.checkpoint(
+                 create_custom_forward(self.mid_block), hidden_states, **ckpt_kwargs
+             )
+
+             for up_block in self.up_blocks:
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(up_block), hidden_states, **ckpt_kwargs
+                 )
+         else:
+             hidden_states = self.mid_block(hidden_states)
+
+             for up_block in self.up_blocks:
+                 hidden_states = up_block(hidden_states)
+
+         # post-process
+         hidden_states = self.conv_norm_out(hidden_states)
+         hidden_states = self.conv_act(hidden_states)
+         hidden_states = self.conv_out(hidden_states)
+
+         return hidden_states
+
+
+ class AutoencoderKLHunyuanVideo(ModelMixin, ConfigMixin):
+     r"""
+     A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.
+     Introduced in [HunyuanVideo](https://huggingface.co/papers/2412.03603).
+
+     This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+     for all models (such as downloading or saving).
+     """
+
+     _supports_gradient_checkpointing = True
+
+     @register_to_config
+     def __init__(
+         self,
+         in_channels: int = 3,
+         out_channels: int = 3,
+         latent_channels: int = 16,
+         down_block_types: Tuple[str, ...] = (
+             "HunyuanVideoDownBlock3D",
+             "HunyuanVideoDownBlock3D",
+             "HunyuanVideoDownBlock3D",
+             "HunyuanVideoDownBlock3D",
+         ),
+         up_block_types: Tuple[str, ...] = (
+             "HunyuanVideoUpBlock3D",
+             "HunyuanVideoUpBlock3D",
+             "HunyuanVideoUpBlock3D",
+             "HunyuanVideoUpBlock3D",
+         ),
+         block_out_channels: Tuple[int] = (128, 256, 512, 512),
+         layers_per_block: int = 2,
+         act_fn: str = "silu",
+         norm_num_groups: int = 32,
+         scaling_factor: float = 0.476986,
+         spatial_compression_ratio: int = 8,
+         temporal_compression_ratio: int = 4,
+         mid_block_add_attention: bool = True,
+     ) -> None:
+         super().__init__()
+
+         self.time_compression_ratio = temporal_compression_ratio
+
+         self.encoder = HunyuanVideoEncoder3D(
+             in_channels=in_channels,
+             out_channels=latent_channels,
+             down_block_types=down_block_types,
+             block_out_channels=block_out_channels,
+             layers_per_block=layers_per_block,
+             norm_num_groups=norm_num_groups,
+             act_fn=act_fn,
+             double_z=True,
+             mid_block_add_attention=mid_block_add_attention,
+             temporal_compression_ratio=temporal_compression_ratio,
+             spatial_compression_ratio=spatial_compression_ratio,
+         )
+
+         self.decoder = HunyuanVideoDecoder3D(
+             in_channels=latent_channels,
+             out_channels=out_channels,
+             up_block_types=up_block_types,
+             block_out_channels=block_out_channels,
+             layers_per_block=layers_per_block,
+             norm_num_groups=norm_num_groups,
+             act_fn=act_fn,
+             time_compression_ratio=temporal_compression_ratio,
+             spatial_compression_ratio=spatial_compression_ratio,
+             mid_block_add_attention=mid_block_add_attention,
+         )
+
+         self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
+         self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)
+
+         self.spatial_compression_ratio = spatial_compression_ratio
+         self.temporal_compression_ratio = temporal_compression_ratio
+
+         # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
+         # to perform decoding of a single video latent at a time.
+         self.use_slicing = False
+
+         # When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
+         # frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
+         # intermediate tiles together, the memory requirement can be lowered.
+         self.use_tiling = False
+
+         # When decoding temporally long video latents, the memory requirement is very high. By decoding latent frames
+         # at a fixed frame batch size (based on `self.num_latent_frames_batch_sizes`), the memory requirement can be lowered.
+         self.use_framewise_encoding = True
+         self.use_framewise_decoding = True
+
+         # The minimal tile height and width for spatial tiling to be used
+         self.tile_sample_min_height = 256
+         self.tile_sample_min_width = 256
+         self.tile_sample_min_num_frames = 16
+
+         # The minimal distance between two spatial tiles
+         self.tile_sample_stride_height = 192
+         self.tile_sample_stride_width = 192
+         self.tile_sample_stride_num_frames = 12
+
+     def _set_gradient_checkpointing(self, module, value=False):
+         if isinstance(module, (HunyuanVideoEncoder3D, HunyuanVideoDecoder3D)):
+             module.gradient_checkpointing = value
+
+     def enable_tiling(
+         self,
+         tile_sample_min_height: Optional[int] = None,
+         tile_sample_min_width: Optional[int] = None,
+         tile_sample_min_num_frames: Optional[int] = None,
+         tile_sample_stride_height: Optional[float] = None,
+         tile_sample_stride_width: Optional[float] = None,
+         tile_sample_stride_num_frames: Optional[float] = None,
+     ) -> None:
+         r"""
+         Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+         processing larger images.
+
+         Args:
+             tile_sample_min_height (`int`, *optional*):
+                 The minimum height required for a sample to be separated into tiles across the height dimension.
+             tile_sample_min_width (`int`, *optional*):
+                 The minimum width required for a sample to be separated into tiles across the width dimension.
+             tile_sample_min_num_frames (`int`, *optional*):
+                 The minimum number of frames required for a sample to be separated into tiles across the frame
+                 dimension.
+             tile_sample_stride_height (`int`, *optional*):
+                 The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
+                 no tiling artifacts produced across the height dimension.
+             tile_sample_stride_width (`int`, *optional*):
+                 The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
+                 artifacts produced across the width dimension.
+             tile_sample_stride_num_frames (`int`, *optional*):
+                 The stride between two consecutive frame tiles. This is to ensure that there are no tiling artifacts
+                 produced across the frame dimension.
+         """
+         self.use_tiling = True
+         self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
+         self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
+         self.tile_sample_min_num_frames = tile_sample_min_num_frames or self.tile_sample_min_num_frames
+         self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
+         self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
+         self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames
+
+     def disable_tiling(self) -> None:
+         r"""
+         Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+         decoding in one step.
+         """
+         self.use_tiling = False
+
+     def enable_slicing(self) -> None:
+         r"""
+         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+         compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+         """
+         self.use_slicing = True
+
+     def disable_slicing(self) -> None:
+         r"""
+         Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+         decoding in one step.
+         """
+         self.use_slicing = False
+
+     def _encode(self, x: torch.Tensor) -> torch.Tensor:
+         batch_size, num_channels, num_frames, height, width = x.shape
+
+         if self.use_framewise_decoding and num_frames > self.tile_sample_min_num_frames:
+             return self._temporal_tiled_encode(x)
+
+         if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
+             return self.tiled_encode(x)
+
+         x = self.encoder(x)
+         enc = self.quant_conv(x)
+         return enc
+
+     @apply_forward_hook
+     def encode(
+         self, x: torch.Tensor, return_dict: bool = True
+     ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+         r"""
+         Encode a batch of images into latents.
+
+         Args:
+             x (`torch.Tensor`): Input batch of images.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+         Returns:
+             The latent representations of the encoded videos. If `return_dict` is True, a
+             [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+         """
+         if self.use_slicing and x.shape[0] > 1:
+             encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
+             h = torch.cat(encoded_slices)
+         else:
+             h = self._encode(x)
+
+         posterior = DiagonalGaussianDistribution(h)
+
+         if not return_dict:
+             return (posterior,)
+         return AutoencoderKLOutput(latent_dist=posterior)
+
+     def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+         batch_size, num_channels, num_frames, height, width = z.shape
+         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+         tile_latent_min_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+         tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+
+         if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames:
+             return self._temporal_tiled_decode(z, return_dict=return_dict)
+
+         if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
+             return self.tiled_decode(z, return_dict=return_dict)
+
+         z = self.post_quant_conv(z)
+         dec = self.decoder(z)
+
+         if not return_dict:
+             return (dec,)
+
+         return DecoderOutput(sample=dec)
+
+     @apply_forward_hook
+     def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+         r"""
+         Decode a batch of images.
+
+         Args:
+             z (`torch.Tensor`): Input batch of latent vectors.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+         Returns:
+             [`~models.vae.DecoderOutput`] or `tuple`:
+                 If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                 returned.
+         """
+         if self.use_slicing and z.shape[0] > 1:
+             decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
+             decoded = torch.cat(decoded_slices)
+         else:
+             decoded = self._decode(z).sample
+
+         if not return_dict:
+             return (decoded,)
+
+         return DecoderOutput(sample=decoded)
+
+     def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+         blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
+         for y in range(blend_extent):
+             b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
+                 y / blend_extent
+             )
+         return b
+
+     def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+         blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
+         for x in range(blend_extent):
+             b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
+                 x / blend_extent
+             )
+         return b
+
+     def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+         blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
+         for x in range(blend_extent):
+             b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
+                 x / blend_extent
+             )
+         return b
+
+     def tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
+         r"""Encode a batch of images using a tiled encoder.
+
+         Args:
+             x (`torch.Tensor`): Input batch of videos.
+
+         Returns:
+             `torch.Tensor`:
+                 The latent representation of the encoded videos.
+         """
+         batch_size, num_channels, num_frames, height, width = x.shape
+         latent_height = height // self.spatial_compression_ratio
+         latent_width = width // self.spatial_compression_ratio
+
+         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+         tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+         tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
+         tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+
+         blend_height = tile_latent_min_height - tile_latent_stride_height
+         blend_width = tile_latent_min_width - tile_latent_stride_width
+
+         # Split x into overlapping tiles and encode them separately.
+         # The tiles have an overlap to avoid seams between tiles.
+         rows = []
+         for i in range(0, height, self.tile_sample_stride_height):
+             row = []
+             for j in range(0, width, self.tile_sample_stride_width):
+                 tile = x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
+                 tile = self.encoder(tile)
+                 tile = self.quant_conv(tile)
+                 row.append(tile)
+             rows.append(row)
+
+         result_rows = []
+         for i, row in enumerate(rows):
+             result_row = []
+             for j, tile in enumerate(row):
+                 # blend the above tile and the left tile
+                 # to the current tile and add the current tile to the result row
+                 if i > 0:
+                     tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                 if j > 0:
+                     tile = self.blend_h(row[j - 1], tile, blend_width)
+                 result_row.append(tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width])
+             result_rows.append(torch.cat(result_row, dim=4))
+
+         enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
+         return enc
+
+     def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+         r"""
+         Decode a batch of images using a tiled decoder.
+
+         Args:
+             z (`torch.Tensor`): Input batch of latent vectors.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+         Returns:
+             [`~models.vae.DecoderOutput`] or `tuple`:
+                 If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                 returned.
+         """
+
+         batch_size, num_channels, num_frames, height, width = z.shape
+         sample_height = height * self.spatial_compression_ratio
+         sample_width = width * self.spatial_compression_ratio
+
+         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+         tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+         tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
+         tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+
+         blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
+         blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
+
+         # Split z into overlapping tiles and decode them separately.
+         # The tiles have an overlap to avoid seams between tiles.
+         rows = []
+         for i in range(0, height, tile_latent_stride_height):
+             row = []
+             for j in range(0, width, tile_latent_stride_width):
+                 tile = z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width]
+                 tile = self.post_quant_conv(tile)
+                 decoded = self.decoder(tile)
+                 row.append(decoded)
+             rows.append(row)
+
+         result_rows = []
+         for i, row in enumerate(rows):
+             result_row = []
+             for j, tile in enumerate(row):
+                 # blend the above tile and the left tile
+                 # to the current tile and add the current tile to the result row
+                 if i > 0:
+                     tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                 if j > 0:
+                     tile = self.blend_h(row[j - 1], tile, blend_width)
+                 result_row.append(tile[:, :, :, : self.tile_sample_stride_height, : self.tile_sample_stride_width])
+             result_rows.append(torch.cat(result_row, dim=-1))
+
+         dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
+
+         if not return_dict:
+             return (dec,)
+         return DecoderOutput(sample=dec)
+
+     def _temporal_tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
+         batch_size, num_channels, num_frames, height, width = x.shape
+         latent_num_frames = (num_frames - 1) // self.temporal_compression_ratio + 1
+
+         tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+         tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
+         blend_num_frames = tile_latent_min_num_frames - tile_latent_stride_num_frames
+
+         row = []
+         for i in range(0, num_frames, self.tile_sample_stride_num_frames):
+             tile = x[:, :, i : i + self.tile_sample_min_num_frames + 1, :, :]
+             if self.use_tiling and (height > self.tile_sample_min_height or width > self.tile_sample_min_width):
+                 tile = self.tiled_encode(tile)
+             else:
+                 tile = self.encoder(tile)
+                 tile = self.quant_conv(tile)
+             if i > 0:
+                 tile = tile[:, :, 1:, :, :]
+             row.append(tile)
+
+         result_row = []
+         for i, tile in enumerate(row):
+             if i > 0:
+                 tile = self.blend_t(row[i - 1], tile, blend_num_frames)
+                 result_row.append(tile[:, :, :tile_latent_stride_num_frames, :, :])
+             else:
+                 result_row.append(tile[:, :, : tile_latent_stride_num_frames + 1, :, :])
+
+         enc = torch.cat(result_row, dim=2)[:, :, :latent_num_frames]
+         return enc
+
+     def _temporal_tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+         batch_size, num_channels, num_frames, height, width = z.shape
+         num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+         tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+         tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+         tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
+         blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames
+
+         row = []
+         for i in range(0, num_frames, tile_latent_stride_num_frames):
+             tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :]
+             if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height):
+                 decoded = self.tiled_decode(tile, return_dict=True).sample
+             else:
+                 tile = self.post_quant_conv(tile)
+                 decoded = self.decoder(tile)
+             if i > 0:
+                 decoded = decoded[:, :, 1:, :, :]
+             row.append(decoded)
+
+         result_row = []
+         for i, tile in enumerate(row):
+             if i > 0:
+                 tile = self.blend_t(row[i - 1], tile, blend_num_frames)
+                 result_row.append(tile[:, :, : self.tile_sample_stride_num_frames, :, :])
+             else:
+                 result_row.append(tile[:, :, : self.tile_sample_stride_num_frames + 1, :, :])
+
+         dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames]
+
+         if not return_dict:
+             return (dec,)
+         return DecoderOutput(sample=dec)
+
+     def forward(
+         self,
+         sample: torch.Tensor,
+         sample_posterior: bool = False,
+         return_dict: bool = True,
+         generator: Optional[torch.Generator] = None,
+     ) -> Union[DecoderOutput, torch.Tensor]:
+         r"""
+         Args:
+             sample (`torch.Tensor`): Input sample.
+             sample_posterior (`bool`, *optional*, defaults to `False`):
+                 Whether to sample from the posterior.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+         """
+         x = sample
+         posterior = self.encode(x).latent_dist
+         if sample_posterior:
+             z = posterior.sample(generator=generator)
+         else:
+             z = posterior.mode()
+         dec = self.decode(z, return_dict=return_dict)
+         return dec
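
For reference, here is a minimal usage sketch of the `AutoencoderKLHunyuanVideo` class added above. It builds the model with the default config from this file and round-trips a small random video; the weights are random and purely illustrative (in practice they would be loaded with `from_pretrained` from a HunyuanVideo checkpoint):

```python
import torch

from diffusers import AutoencoderKLHunyuanVideo  # new top-level export in 0.32.0

# Default config from the class above: 16 latent channels,
# 8x spatial and 4x temporal compression.
vae = AutoencoderKLHunyuanVideo().eval()

# Memory savers defined in this file; tiling only kicks in above the
# tile thresholds (256 px spatially, 16 frames temporally).
vae.enable_tiling()
vae.enable_slicing()

# Random video batch: (batch, channels, frames, height, width).
video = torch.randn(1, 3, 5, 128, 128)

with torch.no_grad():
    latents = vae.encode(video).latent_dist.sample()
    # latents: (1, 16, 2, 16, 16) -- (5 - 1) // 4 + 1 = 2 frames, 128 // 8 = 16
    reconstruction = vae.decode(latents).sample  # back to (1, 3, 5, 128, 128)
```

When tiling does trigger, the overlapping tiles are stitched with the `blend_v`, `blend_h`, and `blend_t` helpers above, which linearly cross-fade the overlap region to avoid visible seams.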