diffusers 0.30.3__py3-none-any.whl → 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. diffusers/__init__.py +97 -4
  2. diffusers/callbacks.py +56 -3
  3. diffusers/configuration_utils.py +13 -1
  4. diffusers/image_processor.py +282 -71
  5. diffusers/loaders/__init__.py +24 -3
  6. diffusers/loaders/ip_adapter.py +543 -16
  7. diffusers/loaders/lora_base.py +138 -125
  8. diffusers/loaders/lora_conversion_utils.py +647 -0
  9. diffusers/loaders/lora_pipeline.py +2216 -230
  10. diffusers/loaders/peft.py +380 -0
  11. diffusers/loaders/single_file_model.py +71 -4
  12. diffusers/loaders/single_file_utils.py +597 -10
  13. diffusers/loaders/textual_inversion.py +5 -3
  14. diffusers/loaders/transformer_flux.py +181 -0
  15. diffusers/loaders/transformer_sd3.py +89 -0
  16. diffusers/loaders/unet.py +56 -12
  17. diffusers/models/__init__.py +49 -12
  18. diffusers/models/activations.py +22 -9
  19. diffusers/models/adapter.py +53 -53
  20. diffusers/models/attention.py +98 -13
  21. diffusers/models/attention_flax.py +1 -1
  22. diffusers/models/attention_processor.py +2160 -346
  23. diffusers/models/autoencoders/__init__.py +5 -0
  24. diffusers/models/autoencoders/autoencoder_dc.py +620 -0
  25. diffusers/models/autoencoders/autoencoder_kl.py +73 -12
  26. diffusers/models/autoencoders/autoencoder_kl_allegro.py +1149 -0
  27. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +213 -105
  28. diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +1176 -0
  29. diffusers/models/autoencoders/autoencoder_kl_ltx.py +1338 -0
  30. diffusers/models/autoencoders/autoencoder_kl_mochi.py +1166 -0
  31. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +3 -10
  32. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  33. diffusers/models/autoencoders/vae.py +18 -5
  34. diffusers/models/controlnet.py +47 -802
  35. diffusers/models/controlnet_flux.py +70 -0
  36. diffusers/models/controlnet_sd3.py +26 -376
  37. diffusers/models/controlnet_sparsectrl.py +46 -719
  38. diffusers/models/controlnets/__init__.py +23 -0
  39. diffusers/models/controlnets/controlnet.py +872 -0
  40. diffusers/models/{controlnet_flax.py → controlnets/controlnet_flax.py} +5 -5
  41. diffusers/models/controlnets/controlnet_flux.py +536 -0
  42. diffusers/models/{controlnet_hunyuan.py → controlnets/controlnet_hunyuan.py} +7 -7
  43. diffusers/models/controlnets/controlnet_sd3.py +489 -0
  44. diffusers/models/controlnets/controlnet_sparsectrl.py +788 -0
  45. diffusers/models/controlnets/controlnet_union.py +832 -0
  46. diffusers/models/{controlnet_xs.py → controlnets/controlnet_xs.py} +14 -13
  47. diffusers/models/controlnets/multicontrolnet.py +183 -0
  48. diffusers/models/embeddings.py +996 -92
  49. diffusers/models/embeddings_flax.py +23 -9
  50. diffusers/models/model_loading_utils.py +264 -14
  51. diffusers/models/modeling_flax_utils.py +1 -1
  52. diffusers/models/modeling_utils.py +334 -51
  53. diffusers/models/normalization.py +157 -13
  54. diffusers/models/transformers/__init__.py +6 -0
  55. diffusers/models/transformers/auraflow_transformer_2d.py +3 -2
  56. diffusers/models/transformers/cogvideox_transformer_3d.py +69 -13
  57. diffusers/models/transformers/dit_transformer_2d.py +1 -1
  58. diffusers/models/transformers/latte_transformer_3d.py +4 -4
  59. diffusers/models/transformers/pixart_transformer_2d.py +10 -2
  60. diffusers/models/transformers/sana_transformer.py +488 -0
  61. diffusers/models/transformers/stable_audio_transformer.py +1 -1
  62. diffusers/models/transformers/transformer_2d.py +1 -1
  63. diffusers/models/transformers/transformer_allegro.py +422 -0
  64. diffusers/models/transformers/transformer_cogview3plus.py +386 -0
  65. diffusers/models/transformers/transformer_flux.py +189 -51
  66. diffusers/models/transformers/transformer_hunyuan_video.py +789 -0
  67. diffusers/models/transformers/transformer_ltx.py +469 -0
  68. diffusers/models/transformers/transformer_mochi.py +499 -0
  69. diffusers/models/transformers/transformer_sd3.py +112 -18
  70. diffusers/models/transformers/transformer_temporal.py +1 -1
  71. diffusers/models/unets/unet_1d_blocks.py +1 -1
  72. diffusers/models/unets/unet_2d.py +8 -1
  73. diffusers/models/unets/unet_2d_blocks.py +88 -21
  74. diffusers/models/unets/unet_2d_condition.py +9 -9
  75. diffusers/models/unets/unet_3d_blocks.py +9 -7
  76. diffusers/models/unets/unet_motion_model.py +46 -68
  77. diffusers/models/unets/unet_spatio_temporal_condition.py +23 -0
  78. diffusers/models/unets/unet_stable_cascade.py +2 -2
  79. diffusers/models/unets/uvit_2d.py +1 -1
  80. diffusers/models/upsampling.py +14 -6
  81. diffusers/pipelines/__init__.py +69 -6
  82. diffusers/pipelines/allegro/__init__.py +48 -0
  83. diffusers/pipelines/allegro/pipeline_allegro.py +938 -0
  84. diffusers/pipelines/allegro/pipeline_output.py +23 -0
  85. diffusers/pipelines/animatediff/__init__.py +2 -0
  86. diffusers/pipelines/animatediff/pipeline_animatediff.py +45 -21
  87. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +52 -22
  88. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +18 -4
  89. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +3 -1
  90. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +104 -72
  91. diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +1341 -0
  92. diffusers/pipelines/audioldm2/modeling_audioldm2.py +3 -3
  93. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +2 -9
  94. diffusers/pipelines/auto_pipeline.py +88 -10
  95. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  96. diffusers/pipelines/cogvideo/__init__.py +2 -0
  97. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +80 -39
  98. diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +825 -0
  99. diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +108 -50
  100. diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +89 -50
  101. diffusers/pipelines/cogview3/__init__.py +47 -0
  102. diffusers/pipelines/cogview3/pipeline_cogview3plus.py +674 -0
  103. diffusers/pipelines/cogview3/pipeline_output.py +21 -0
  104. diffusers/pipelines/controlnet/__init__.py +86 -80
  105. diffusers/pipelines/controlnet/multicontrolnet.py +7 -178
  106. diffusers/pipelines/controlnet/pipeline_controlnet.py +20 -3
  107. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -2
  108. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +9 -2
  109. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +37 -15
  110. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +12 -4
  111. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -4
  112. diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +1790 -0
  113. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +1501 -0
  114. diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +1627 -0
  115. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +22 -4
  116. diffusers/pipelines/controlnet_sd3/__init__.py +4 -0
  117. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +56 -20
  118. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +1153 -0
  119. diffusers/pipelines/ddpm/pipeline_ddpm.py +2 -2
  120. diffusers/pipelines/deepfloyd_if/pipeline_output.py +6 -5
  121. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +16 -4
  122. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +1 -1
  123. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +32 -9
  124. diffusers/pipelines/flux/__init__.py +23 -1
  125. diffusers/pipelines/flux/modeling_flux.py +47 -0
  126. diffusers/pipelines/flux/pipeline_flux.py +256 -48
  127. diffusers/pipelines/flux/pipeline_flux_control.py +889 -0
  128. diffusers/pipelines/flux/pipeline_flux_control_img2img.py +945 -0
  129. diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +1141 -0
  130. diffusers/pipelines/flux/pipeline_flux_controlnet.py +1006 -0
  131. diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +998 -0
  132. diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +1204 -0
  133. diffusers/pipelines/flux/pipeline_flux_fill.py +969 -0
  134. diffusers/pipelines/flux/pipeline_flux_img2img.py +856 -0
  135. diffusers/pipelines/flux/pipeline_flux_inpaint.py +1022 -0
  136. diffusers/pipelines/flux/pipeline_flux_prior_redux.py +492 -0
  137. diffusers/pipelines/flux/pipeline_output.py +16 -0
  138. diffusers/pipelines/free_noise_utils.py +365 -5
  139. diffusers/pipelines/hunyuan_video/__init__.py +48 -0
  140. diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +687 -0
  141. diffusers/pipelines/hunyuan_video/pipeline_output.py +20 -0
  142. diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +20 -4
  143. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +9 -9
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +2 -2
  145. diffusers/pipelines/kolors/pipeline_kolors.py +1 -1
  146. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +14 -11
  147. diffusers/pipelines/kolors/text_encoder.py +2 -2
  148. diffusers/pipelines/kolors/tokenizer.py +4 -0
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +1 -1
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +1 -1
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/latte/pipeline_latte.py +2 -2
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +15 -3
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +15 -3
  155. diffusers/pipelines/ltx/__init__.py +50 -0
  156. diffusers/pipelines/ltx/pipeline_ltx.py +789 -0
  157. diffusers/pipelines/ltx/pipeline_ltx_image2video.py +885 -0
  158. diffusers/pipelines/ltx/pipeline_output.py +20 -0
  159. diffusers/pipelines/lumina/pipeline_lumina.py +3 -10
  160. diffusers/pipelines/mochi/__init__.py +48 -0
  161. diffusers/pipelines/mochi/pipeline_mochi.py +748 -0
  162. diffusers/pipelines/mochi/pipeline_output.py +20 -0
  163. diffusers/pipelines/pag/__init__.py +13 -0
  164. diffusers/pipelines/pag/pag_utils.py +8 -2
  165. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +2 -3
  166. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +1543 -0
  167. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +3 -5
  168. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +1683 -0
  169. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +22 -6
  170. diffusers/pipelines/pag/pipeline_pag_kolors.py +1 -1
  171. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +7 -14
  172. diffusers/pipelines/pag/pipeline_pag_sana.py +886 -0
  173. diffusers/pipelines/pag/pipeline_pag_sd.py +18 -6
  174. diffusers/pipelines/pag/pipeline_pag_sd_3.py +18 -9
  175. diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +1058 -0
  176. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +5 -1
  177. diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +1094 -0
  178. diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +1356 -0
  179. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +18 -6
  180. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +31 -16
  181. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +42 -19
  182. diffusers/pipelines/pia/pipeline_pia.py +2 -0
  183. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  184. diffusers/pipelines/pipeline_loading_utils.py +250 -31
  185. diffusers/pipelines/pipeline_utils.py +158 -186
  186. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +7 -14
  187. diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +7 -14
  188. diffusers/pipelines/sana/__init__.py +47 -0
  189. diffusers/pipelines/sana/pipeline_output.py +21 -0
  190. diffusers/pipelines/sana/pipeline_sana.py +884 -0
  191. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +12 -1
  192. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +35 -3
  193. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +2 -2
  194. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +46 -9
  195. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -1
  196. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -1
  197. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +241 -81
  198. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +228 -23
  199. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +82 -13
  200. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +60 -11
  201. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +11 -1
  202. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +1 -1
  203. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +16 -4
  204. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +16 -4
  205. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +16 -12
  206. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +29 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +29 -22
  208. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +1 -1
  209. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +1 -1
  210. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +16 -4
  211. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +15 -3
  212. diffusers/pipelines/unidiffuser/modeling_uvit.py +2 -2
  213. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  214. diffusers/quantizers/__init__.py +16 -0
  215. diffusers/quantizers/auto.py +139 -0
  216. diffusers/quantizers/base.py +233 -0
  217. diffusers/quantizers/bitsandbytes/__init__.py +2 -0
  218. diffusers/quantizers/bitsandbytes/bnb_quantizer.py +561 -0
  219. diffusers/quantizers/bitsandbytes/utils.py +306 -0
  220. diffusers/quantizers/gguf/__init__.py +1 -0
  221. diffusers/quantizers/gguf/gguf_quantizer.py +159 -0
  222. diffusers/quantizers/gguf/utils.py +456 -0
  223. diffusers/quantizers/quantization_config.py +669 -0
  224. diffusers/quantizers/torchao/__init__.py +15 -0
  225. diffusers/quantizers/torchao/torchao_quantizer.py +285 -0
  226. diffusers/schedulers/scheduling_ddim.py +4 -1
  227. diffusers/schedulers/scheduling_ddim_cogvideox.py +4 -1
  228. diffusers/schedulers/scheduling_ddim_parallel.py +4 -1
  229. diffusers/schedulers/scheduling_ddpm.py +6 -7
  230. diffusers/schedulers/scheduling_ddpm_parallel.py +6 -7
  231. diffusers/schedulers/scheduling_deis_multistep.py +102 -6
  232. diffusers/schedulers/scheduling_dpmsolver_multistep.py +113 -6
  233. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +111 -5
  234. diffusers/schedulers/scheduling_dpmsolver_sde.py +125 -10
  235. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +126 -7
  236. diffusers/schedulers/scheduling_edm_euler.py +8 -6
  237. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +4 -1
  238. diffusers/schedulers/scheduling_euler_discrete.py +92 -7
  239. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +153 -6
  240. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +4 -5
  241. diffusers/schedulers/scheduling_heun_discrete.py +114 -8
  242. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +116 -11
  243. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +110 -8
  244. diffusers/schedulers/scheduling_lcm.py +2 -6
  245. diffusers/schedulers/scheduling_lms_discrete.py +76 -1
  246. diffusers/schedulers/scheduling_repaint.py +1 -1
  247. diffusers/schedulers/scheduling_sasolver.py +102 -6
  248. diffusers/schedulers/scheduling_tcd.py +2 -6
  249. diffusers/schedulers/scheduling_unclip.py +4 -1
  250. diffusers/schedulers/scheduling_unipc_multistep.py +127 -5
  251. diffusers/training_utils.py +63 -19
  252. diffusers/utils/__init__.py +7 -1
  253. diffusers/utils/constants.py +1 -0
  254. diffusers/utils/dummy_pt_objects.py +240 -0
  255. diffusers/utils/dummy_torch_and_transformers_objects.py +435 -0
  256. diffusers/utils/dynamic_modules_utils.py +3 -3
  257. diffusers/utils/hub_utils.py +44 -40
  258. diffusers/utils/import_utils.py +98 -8
  259. diffusers/utils/loading_utils.py +28 -4
  260. diffusers/utils/peft_utils.py +6 -3
  261. diffusers/utils/testing_utils.py +115 -1
  262. diffusers/utils/torch_utils.py +3 -0
  263. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/METADATA +73 -72
  264. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/RECORD +268 -193
  265. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/WHEEL +1 -1
  266. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/LICENSE +0 -0
  267. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/entry_points.txt +0 -0
  268. {diffusers-0.30.3.dist-info → diffusers-0.32.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1338 @@
1
+ # Copyright 2024 The Lightricks team and The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from ...configuration_utils import ConfigMixin, register_to_config
22
+ from ...loaders import FromOriginalModelMixin
23
+ from ...utils.accelerate_utils import apply_forward_hook
24
+ from ..activations import get_activation
25
+ from ..embeddings import PixArtAlphaCombinedTimestepSizeEmbeddings
26
+ from ..modeling_outputs import AutoencoderKLOutput
27
+ from ..modeling_utils import ModelMixin
28
+ from ..normalization import RMSNorm
29
+ from .vae import DecoderOutput, DiagonalGaussianDistribution
30
+
31
+
32
class LTXVideoCausalConv3d(nn.Module):
    r"""
    3D convolution that pads the frame axis by replicating edge frames before convolving.

    Height and width are padded inside the wrapped `nn.Conv3d` (half of the respective kernel size, using
    `padding_mode`). The frame axis (dim 2 of the input) receives no padding from the convolution itself; it is
    padded in `forward` instead:

    - causal: the first frame is repeated `kernel_size[0] - 1` times in front of the input;
    - non-causal: the first and the last frame are each repeated `(kernel_size[0] - 1) // 2` times on their side.

    Args:
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`):
            Number of output channels.
        kernel_size (`int` or `Tuple[int, int, int]`, defaults to `3`):
            Kernel size. An `int` is used for all three axes.
        stride (`int` or `Tuple[int, int, int]`, defaults to `1`):
            Stride. An `int` is used for all three axes.
        dilation (`int` or `Tuple[int, int, int]`, defaults to `1`):
            Dilation. An `int` is applied to the first (frame) axis only, i.e. it becomes `(dilation, 1, 1)`.
        groups (`int`, defaults to `1`):
            Forwarded to `nn.Conv3d`.
        padding_mode (`str`, defaults to `"zeros"`):
            Forwarded to `nn.Conv3d`; it only affects the height/width padding.
        is_causal (`bool`, defaults to `True`):
            Selects the frame-axis padding scheme described above.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int, int]] = 3,
        stride: Union[int, Tuple[int, int, int]] = 1,
        dilation: Union[int, Tuple[int, int, int]] = 1,
        groups: int = 1,
        padding_mode: str = "zeros",
        is_causal: bool = True,
    ):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.is_causal = is_causal

        # Scalars are broadcast to (frames, height, width) triples; tuples are taken as given.
        if not isinstance(kernel_size, tuple):
            kernel_size = (kernel_size,) * 3
        self.kernel_size = kernel_size

        if not isinstance(stride, tuple):
            stride = (stride,) * 3

        # A scalar dilation only dilates the frame axis.
        if not isinstance(dilation, tuple):
            dilation = (dilation, 1, 1)

        # No frame-axis padding here: `forward` prepends/appends replicated frames instead.
        spatial_padding = (0, self.kernel_size[1] // 2, self.kernel_size[2] // 2)

        self.conv = nn.Conv3d(
            in_channels,
            out_channels,
            self.kernel_size,
            stride=stride,
            dilation=dilation,
            groups=groups,
            padding=spatial_padding,
            padding_mode=padding_mode,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Pad the frame axis with replicated edge frames, then apply the convolution."""
        temporal_kernel = self.kernel_size[0]
        first_frame = hidden_states[:, :, :1, :, :]

        if self.is_causal:
            # All padding goes in front so an output frame never sees later input frames.
            front = first_frame.repeat((1, 1, temporal_kernel - 1, 1, 1))
            pieces = [front, hidden_states]
        else:
            per_side = (temporal_kernel - 1) // 2
            last_frame = hidden_states[:, :, -1:, :, :]
            front = first_frame.repeat((1, 1, per_side, 1, 1))
            back = last_frame.repeat((1, 1, per_side, 1, 1))
            pieces = [front, hidden_states, back]

        return self.conv(torch.concatenate(pieces, dim=2))
81
+
82
+
83
class LTXVideoResnetBlock3d(nn.Module):
    r"""
    A 3D ResNet block used in the LTXVideo model.

    The block runs two stages of `RMSNorm -> (optional scale/shift) -> activation -> causal conv -> (optional noise)`
    and adds the result to the input. When `in_channels != out_channels`, the skip path is passed through a
    `LayerNorm` and a kernel-size-1 `LTXVideoCausalConv3d` first.

    Args:
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`, *optional*):
            Number of output channels. If None, defaults to `in_channels`.
        dropout (`float`, defaults to `0.0`):
            Dropout rate, applied between the second activation and the second convolution.
        eps (`float`, defaults to `1e-6`):
            Epsilon of the skip-path `LayerNorm`. The two `RMSNorm` layers use a fixed epsilon of `1e-8`.
        elementwise_affine (`bool`, defaults to `False`):
            Whether to enable elementwise affinity in the two `RMSNorm` layers.
        non_linearity (`str`, defaults to `"swish"`):
            Activation function to use.
        is_causal (`bool`, defaults to `True`):
            Forwarded to every `LTXVideoCausalConv3d` in the block.
        inject_noise (`bool`, defaults to `False`):
            Whether to create the two zero-initialised per-channel scales and add scaled random spatial noise after
            each convolution.
        timestep_conditioning (`bool`, defaults to `False`):
            Whether to create a `(4, in_channels)` scale/shift table. When present, `forward` combines it with `temb`
            to modulate the output of both norms.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        dropout: float = 0.0,
        eps: float = 1e-6,
        elementwise_affine: bool = False,
        non_linearity: str = "swish",
        is_causal: bool = True,
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
    ) -> None:
        super().__init__()

        if not out_channels:
            out_channels = in_channels
        changes_width = in_channels != out_channels

        self.nonlinearity = get_activation(non_linearity)

        # Stage 1
        self.norm1 = RMSNorm(in_channels, eps=1e-8, elementwise_affine=elementwise_affine)
        self.conv1 = LTXVideoCausalConv3d(
            in_channels=in_channels, out_channels=out_channels, kernel_size=3, is_causal=is_causal
        )

        # Stage 2
        self.norm2 = RMSNorm(out_channels, eps=1e-8, elementwise_affine=elementwise_affine)
        self.dropout = nn.Dropout(dropout)
        self.conv2 = LTXVideoCausalConv3d(
            in_channels=out_channels, out_channels=out_channels, kernel_size=3, is_causal=is_causal
        )

        # Skip-path projection, only needed when the channel count changes.
        self.norm3 = nn.LayerNorm(in_channels, eps=eps, elementwise_affine=True, bias=True) if changes_width else None
        self.conv_shortcut = (
            LTXVideoCausalConv3d(
                in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, is_causal=is_causal
            )
            if changes_width
            else None
        )

        # Optional learned per-channel gains for the injected noise (start at zero, i.e. no noise).
        self.per_channel_scale1 = nn.Parameter(torch.zeros(in_channels, 1, 1)) if inject_noise else None
        self.per_channel_scale2 = nn.Parameter(torch.zeros(in_channels, 1, 1)) if inject_noise else None

        # Optional learned offsets added to the timestep embedding: rows are (shift_1, scale_1, shift_2, scale_2).
        self.scale_shift_table = (
            nn.Parameter(torch.randn(4, in_channels) / in_channels**0.5) if timestep_conditioning else None
        )

    @staticmethod
    def _norm_over_channels(norm: nn.Module, tensor: torch.Tensor) -> torch.Tensor:
        # Move channels (dim 1) to the last axis for the norm call, then move them back.
        return norm(tensor.movedim(1, -1)).movedim(-1, 1)

    @staticmethod
    def _add_spatial_noise(
        hidden_states: torch.Tensor, per_channel_scale: torch.Tensor, generator: Optional[torch.Generator]
    ) -> torch.Tensor:
        # One (height, width) noise map is drawn, scaled per channel, and broadcast over batch and frames.
        noise = torch.randn(
            hidden_states.shape[-2:], generator=generator, device=hidden_states.device, dtype=hidden_states.dtype
        )[None]
        return hidden_states + (noise * per_channel_scale)[None, :, None, ...]

    def forward(
        self, inputs: torch.Tensor, temb: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None
    ) -> torch.Tensor:
        """
        Apply the residual block.

        `temb` is only read when the block was built with `timestep_conditioning=True`; it is then split along dim 1
        into four groups (shift/scale for each stage). `generator` is only used for the optional noise injection.
        """
        modulated = self.scale_shift_table is not None

        hidden_states = self._norm_over_channels(self.norm1, inputs)

        if modulated:
            temb = temb.unflatten(1, (4, -1)) + self.scale_shift_table[None, ..., None, None, None]
            shift_1, scale_1, shift_2, scale_2 = temb.unbind(dim=1)
            hidden_states = hidden_states * (1 + scale_1) + shift_1

        hidden_states = self.conv1(self.nonlinearity(hidden_states))

        if self.per_channel_scale1 is not None:
            hidden_states = self._add_spatial_noise(hidden_states, self.per_channel_scale1, generator)

        hidden_states = self._norm_over_channels(self.norm2, hidden_states)

        if modulated:
            hidden_states = hidden_states * (1 + scale_2) + shift_2

        hidden_states = self.conv2(self.dropout(self.nonlinearity(hidden_states)))

        if self.per_channel_scale2 is not None:
            hidden_states = self._add_spatial_noise(hidden_states, self.per_channel_scale2, generator)

        shortcut = inputs
        if self.norm3 is not None:
            shortcut = self._norm_over_channels(self.norm3, shortcut)
        if self.conv_shortcut is not None:
            shortcut = self.conv_shortcut(shortcut)

        return hidden_states + shortcut
197
+
198
+
199
class LTXVideoUpsampler3d(nn.Module):
    r"""
    Upsampler that expands channels with a causal convolution and then rearranges them into frames/height/width.

    The convolution produces `in_channels * stride[0] * stride[1] * stride[2] // upscale_factor` channels. In
    `forward`, groups of `stride[0] * stride[1] * stride[2]` channels are unfolded onto the frame, height and width
    axes (multiplying them by the respective stride), after which the first `stride[0] - 1` frames are dropped.

    Args:
        in_channels (`int`):
            Number of input channels.
        stride (`int` or `Tuple[int, int, int]`, defaults to `1`):
            Upsampling factor per (frame, height, width) axis. An `int` is used for all three axes.
        is_causal (`bool`, defaults to `True`):
            Forwarded to the internal `LTXVideoCausalConv3d`.
        residual (`bool`, defaults to `False`):
            Whether to add a skip connection built by applying the same channel-to-space rearrangement directly to
            the input and tiling the result along the channel axis.
        upscale_factor (`int`, defaults to `1`):
            Divisor applied to the number of channels produced by the convolution.
    """

    def __init__(
        self,
        in_channels: int,
        stride: Union[int, Tuple[int, int, int]] = 1,
        is_causal: bool = True,
        residual: bool = False,
        upscale_factor: int = 1,
    ) -> None:
        super().__init__()

        self.stride = stride if isinstance(stride, tuple) else (stride, stride, stride)
        self.residual = residual
        self.upscale_factor = upscale_factor

        # Index the normalized tuple, not the raw argument: `stride` may be a plain int (the default is `1`),
        # and subscripting an int raises `TypeError`.
        stride_volume = self.stride[0] * self.stride[1] * self.stride[2]
        out_channels = (in_channels * stride_volume) // upscale_factor

        self.conv = LTXVideoCausalConv3d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=1,
            is_causal=is_causal,
        )

    def _channels_to_space_time(
        self, tensor: torch.Tensor, batch_size: int, num_frames: int, height: int, width: int
    ) -> torch.Tensor:
        # Split channels into (C', s_t, s_h, s_w) and interleave each stride factor with its axis:
        # (B, C', s_t, s_h, s_w, F, H, W) -> (B, C', F * s_t, H * s_h, W * s_w).
        tensor = tensor.reshape(
            batch_size, -1, self.stride[0], self.stride[1], self.stride[2], num_frames, height, width
        )
        return tensor.permute(0, 1, 5, 2, 6, 3, 7, 4).flatten(6, 7).flatten(4, 5).flatten(2, 3)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Upsample `hidden_states` (indexed as batch, channels, frames, height, width)."""
        batch_size, _, num_frames, height, width = hidden_states.shape

        if self.residual:
            residual = self._channels_to_space_time(hidden_states, batch_size, num_frames, height, width)
            # Tile channels so the skip path has as many channels as the convolution path after unfolding.
            repeats = (self.stride[0] * self.stride[1] * self.stride[2]) // self.upscale_factor
            residual = residual.repeat(1, repeats, 1, 1, 1)
            residual = residual[:, :, self.stride[0] - 1 :]

        # The convolution uses stride 1, so the input's frame/height/width sizes are reused for the reshape.
        hidden_states = self.conv(hidden_states)
        hidden_states = self._channels_to_space_time(hidden_states, batch_size, num_frames, height, width)
        # Drop the leading frames introduced by unfolding the first frame.
        hidden_states = hidden_states[:, :, self.stride[0] - 1 :]

        if self.residual:
            hidden_states = hidden_states + residual

        return hidden_states
247
+
248
+
249
class LTXVideoDownBlock3D(nn.Module):
    r"""
    Down block used in the LTXVideo model.

    The block applies `num_layers` resnets that keep `in_channels`, then an optional strided convolution, then (only
    when the channel count changes) one more resnet that maps `in_channels` to `out_channels`.

    Args:
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`, *optional*):
            Number of output channels. If None, defaults to `in_channels`.
        num_layers (`int`, defaults to `1`):
            Number of resnet layers.
        dropout (`float`, defaults to `0.0`):
            Dropout rate.
        resnet_eps (`float`, defaults to `1e-6`):
            Epsilon value passed to the resnet blocks.
        resnet_act_fn (`str`, defaults to `"swish"`):
            Activation function to use.
        spatio_temporal_scale (`bool`, defaults to `True`):
            Whether to append a causal convolution with stride `(2, 2, 2)` after the resnets, which strides over the
            frame, height and width axes. If not used, the resnet output is passed on unchanged.
        is_causal (`bool`, defaults to `True`):
            Whether this layer behaves causally (future frames depend only on past frames) or not.
    """

    _supports_gradient_checkpointing = True

    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        resnet_eps: float = 1e-6,
        resnet_act_fn: str = "swish",
        spatio_temporal_scale: bool = True,
        is_causal: bool = True,
    ):
        super().__init__()

        if not out_channels:
            out_channels = in_channels

        self.resnets = nn.ModuleList(
            [
                LTXVideoResnetBlock3d(
                    in_channels=in_channels,
                    out_channels=in_channels,
                    dropout=dropout,
                    eps=resnet_eps,
                    non_linearity=resnet_act_fn,
                    is_causal=is_causal,
                )
                for _ in range(num_layers)
            ]
        )

        if spatio_temporal_scale:
            strided_conv = LTXVideoCausalConv3d(
                in_channels=in_channels,
                out_channels=in_channels,
                kernel_size=3,
                stride=(2, 2, 2),
                is_causal=is_causal,
            )
            self.downsamplers = nn.ModuleList([strided_conv])
        else:
            self.downsamplers = None

        # The channel change, if any, happens in a dedicated resnet after the optional downsampling.
        if in_channels != out_channels:
            self.conv_out = LTXVideoResnetBlock3d(
                in_channels=in_channels,
                out_channels=out_channels,
                dropout=dropout,
                eps=resnet_eps,
                non_linearity=resnet_act_fn,
                is_causal=is_causal,
            )
        else:
            self.conv_out = None

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        temb: Optional[torch.Tensor] = None,
        generator: Optional[torch.Generator] = None,
    ) -> torch.Tensor:
        r"""Forward method of the `LTXVideoDownBlock3D` class."""

        # Checkpointing is only used while gradients are being recorded.
        use_checkpointing = torch.is_grad_enabled() and self.gradient_checkpointing

        for resnet in self.resnets:
            if use_checkpointing:
                hidden_states = torch.utils.checkpoint.checkpoint(resnet, hidden_states, temb, generator)
            else:
                hidden_states = resnet(hidden_states, temb, generator)

        for downsampler in self.downsamplers or ():
            hidden_states = downsampler(hidden_states)

        if self.conv_out is None:
            return hidden_states
        return self.conv_out(hidden_states, temb, generator)
362
+
363
+
364
+ # Adapted from diffusers.models.autoencoders.autoencoder_kl_cogvideox.CogVideoMidBlock3d
365
class LTXVideoMidBlock3d(nn.Module):
    r"""
    A middle block used in the LTXVideo model.

    The block is a stack of `num_layers` resnets that all keep `in_channels`. With `timestep_conditioning` enabled it
    also owns a timestep embedder whose output is handed to every resnet as `temb`.

    Args:
        in_channels (`int`):
            Number of input channels.
        num_layers (`int`, defaults to `1`):
            Number of resnet layers.
        dropout (`float`, defaults to `0.0`):
            Dropout rate.
        resnet_eps (`float`, defaults to `1e-6`):
            Epsilon value passed to the resnet blocks.
        resnet_act_fn (`str`, defaults to `"swish"`):
            Activation function to use.
        is_causal (`bool`, defaults to `True`):
            Whether this layer behaves causally (future frames depend only on past frames) or not.
        inject_noise (`bool`, defaults to `False`):
            Forwarded to every resnet block.
        timestep_conditioning (`bool`, defaults to `False`):
            Forwarded to every resnet block. When enabled, a `PixArtAlphaCombinedTimestepSizeEmbeddings` with
            embedding size `in_channels * 4` is created and applied to `temb` in `forward`.
    """

    _supports_gradient_checkpointing = True

    def __init__(
        self,
        in_channels: int,
        num_layers: int = 1,
        dropout: float = 0.0,
        resnet_eps: float = 1e-6,
        resnet_act_fn: str = "swish",
        is_causal: bool = True,
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
    ) -> None:
        super().__init__()

        if timestep_conditioning:
            # Four values per channel: the resnets split them into two (shift, scale) pairs.
            self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(in_channels * 4, 0)
        else:
            self.time_embedder = None

        self.resnets = nn.ModuleList(
            [
                LTXVideoResnetBlock3d(
                    in_channels=in_channels,
                    out_channels=in_channels,
                    dropout=dropout,
                    eps=resnet_eps,
                    non_linearity=resnet_act_fn,
                    is_causal=is_causal,
                    inject_noise=inject_noise,
                    timestep_conditioning=timestep_conditioning,
                )
                for _ in range(num_layers)
            ]
        )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        temb: Optional[torch.Tensor] = None,
        generator: Optional[torch.Generator] = None,
    ) -> torch.Tensor:
        r"""Forward method of the `LTXVideoMidBlock3d` class."""

        if self.time_embedder is not None:
            batch_size = hidden_states.size(0)
            embedded = self.time_embedder(
                timestep=temb.flatten(),
                resolution=None,
                aspect_ratio=None,
                batch_size=batch_size,
                hidden_dtype=hidden_states.dtype,
            )
            # Trailing singleton axes let the embedding broadcast over frames, height and width.
            temb = embedded.view(batch_size, -1, 1, 1, 1)

        # Checkpointing is only used while gradients are being recorded.
        use_checkpointing = torch.is_grad_enabled() and self.gradient_checkpointing

        for resnet in self.resnets:
            if use_checkpointing:
                hidden_states = torch.utils.checkpoint.checkpoint(resnet, hidden_states, temb, generator)
            else:
                hidden_states = resnet(hidden_states, temb, generator)

        return hidden_states
455
+
456
+
457
class LTXVideoUpBlock3d(nn.Module):
    r"""
    Up block used in the LTXVideo model.

    Args:
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`, *optional*):
            Number of output channels. If None (or `0`), defaults to `in_channels`.
        num_layers (`int`, defaults to `1`):
            Number of resnet layers.
        dropout (`float`, defaults to `0.0`):
            Dropout rate.
        resnet_eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        resnet_act_fn (`str`, defaults to `"swish"`):
            Activation function to use.
        spatio_temporal_scale (`bool`, defaults to `True`):
            Whether or not to add an upsampling layer (`LTXVideoUpsampler3d` with stride `(2, 2, 2)`).
        is_causal (`bool`, defaults to `True`):
            Whether this layer behaves causally (future frames depend only on past frames) or not.
        inject_noise (`bool`, defaults to `False`):
            Forwarded unchanged to each resnet layer.
        timestep_conditioning (`bool`, defaults to `False`):
            When `True`, a timestep embedder is created and used in `forward`. The flag is also forwarded to each
            resnet layer.
        upsample_residual (`bool`, defaults to `False`):
            Forwarded to the upsampling layer as `residual`.
        upscale_factor (`int`, defaults to `1`):
            Forwarded to the upsampling layer, whose channel count is `out_channels * upscale_factor`.
    """

    _supports_gradient_checkpointing = True

    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        resnet_eps: float = 1e-6,
        resnet_act_fn: str = "swish",
        spatio_temporal_scale: bool = True,
        is_causal: bool = True,
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
        upsample_residual: bool = False,
        upscale_factor: int = 1,
    ):
        super().__init__()

        out_channels = out_channels or in_channels

        # Keyword arguments shared by every resnet layer created below.
        resnet_kwargs = {
            "out_channels": out_channels,
            "dropout": dropout,
            "eps": resnet_eps,
            "non_linearity": resnet_act_fn,
            "is_causal": is_causal,
            "inject_noise": inject_noise,
            "timestep_conditioning": timestep_conditioning,
        }

        self.time_embedder = (
            PixArtAlphaCombinedTimestepSizeEmbeddings(in_channels * 4, 0) if timestep_conditioning else None
        )

        # A channel-changing resnet is only needed when input and output widths differ.
        self.conv_in = (
            LTXVideoResnetBlock3d(in_channels=in_channels, **resnet_kwargs) if in_channels != out_channels else None
        )

        if spatio_temporal_scale:
            self.upsamplers = nn.ModuleList(
                [
                    LTXVideoUpsampler3d(
                        out_channels * upscale_factor,
                        stride=(2, 2, 2),
                        is_causal=is_causal,
                        residual=upsample_residual,
                        upscale_factor=upscale_factor,
                    )
                ]
            )
        else:
            self.upsamplers = None

        self.resnets = nn.ModuleList(
            [LTXVideoResnetBlock3d(in_channels=out_channels, **resnet_kwargs) for _ in range(num_layers)]
        )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        temb: Optional[torch.Tensor] = None,
        generator: Optional[torch.Generator] = None,
    ) -> torch.Tensor:
        r"""
        Apply, in order: the optional input resnet, the optional upsamplers, then the resnet stack.

        Note that `conv_in` receives `temb` as passed by the caller; the timestep embedding (when enabled) is only
        computed afterwards and is what the resnet stack sees.
        """
        if self.conv_in is not None:
            hidden_states = self.conv_in(hidden_states, temb, generator)

        if self.time_embedder is not None:
            batch = hidden_states.size(0)
            embedded = self.time_embedder(
                timestep=temb.flatten(),
                resolution=None,
                aspect_ratio=None,
                batch_size=batch,
                hidden_dtype=hidden_states.dtype,
            )
            temb = embedded.view(hidden_states.size(0), -1, 1, 1, 1)

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states)

        def as_checkpoint_fn(module):
            def run(*inputs):
                return module(*inputs)

            return run

        use_checkpointing = torch.is_grad_enabled() and self.gradient_checkpointing

        for resnet in self.resnets:
            if use_checkpointing:
                hidden_states = torch.utils.checkpoint.checkpoint(
                    as_checkpoint_fn(resnet), hidden_states, temb, generator
                )
            else:
                hidden_states = resnet(hidden_states, temb, generator)

        return hidden_states
590
+
591
+
592
class LTXVideoEncoder3d(nn.Module):
    r"""
    The `LTXVideoEncoder3d` layer of a variational autoencoder that encodes input video samples to its latent
    representation.

    Args:
        in_channels (`int`, defaults to 3):
            Number of input channels.
        out_channels (`int`, defaults to 128):
            Number of latent channels.
        block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512)`):
            The number of output channels for each block.
        spatio_temporal_scaling (`Tuple[bool, ...]`, defaults to `(True, True, True, False)`):
            Whether a block should contain spatio-temporal downscaling layers or not.
        layers_per_block (`Tuple[int, ...]`, defaults to `(4, 3, 3, 3, 4)`):
            The number of layers per block. Entry `i` is used for down block `i`; the last entry is used for the mid
            block.
        patch_size (`int`, defaults to `4`):
            The size of spatial patches.
        patch_size_t (`int`, defaults to `1`):
            The size of temporal patches.
        resnet_norm_eps (`float`, defaults to `1e-6`):
            Epsilon value for ResNet normalization layers.
        is_causal (`bool`, defaults to `True`):
            Whether this layer behaves causally (future frames depend only on past frames) or not.
    """

    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 128,
        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
        spatio_temporal_scaling: Tuple[bool, ...] = (True, True, True, False),
        layers_per_block: Tuple[int, ...] = (4, 3, 3, 3, 4),
        patch_size: int = 4,
        patch_size_t: int = 1,
        resnet_norm_eps: float = 1e-6,
        is_causal: bool = True,
    ):
        super().__init__()

        self.patch_size = patch_size
        self.patch_size_t = patch_size_t
        self.in_channels = in_channels * patch_size**2

        output_channel = block_out_channels[0]

        self.conv_in = LTXVideoCausalConv3d(
            in_channels=self.in_channels,
            out_channels=output_channel,
            kernel_size=3,
            stride=1,
            is_causal=is_causal,
        )

        # down blocks: block `i` maps to the width of block `i + 1`; the final block keeps its own width
        num_blocks = len(block_out_channels)
        self.down_blocks = nn.ModuleList([])
        for block_idx in range(num_blocks):
            input_channel = output_channel
            output_channel = block_out_channels[min(block_idx + 1, num_blocks - 1)]

            self.down_blocks.append(
                LTXVideoDownBlock3D(
                    in_channels=input_channel,
                    out_channels=output_channel,
                    num_layers=layers_per_block[block_idx],
                    resnet_eps=resnet_norm_eps,
                    spatio_temporal_scale=spatio_temporal_scaling[block_idx],
                    is_causal=is_causal,
                )
            )

        # mid block
        self.mid_block = LTXVideoMidBlock3d(
            in_channels=output_channel,
            num_layers=layers_per_block[-1],
            resnet_eps=resnet_norm_eps,
            is_causal=is_causal,
        )

        # out: the output convolution produces one channel more than `out_channels`
        self.norm_out = RMSNorm(out_channels, eps=1e-8, elementwise_affine=False)
        self.conv_act = nn.SiLU()
        self.conv_out = LTXVideoCausalConv3d(
            in_channels=output_channel, out_channels=out_channels + 1, kernel_size=3, stride=1, is_causal=is_causal
        )

        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        r"""
        Patchify the input, run it through the down blocks and the mid block, then project it with the output
        convolution. The last output channel is finally repeated and appended along the channel dimension.
        """
        patch = self.patch_size
        patch_t = self.patch_size_t

        batch_size, num_channels, num_frames, height, width = hidden_states.shape

        # Split each of the frame/height/width axes into (number of patches, patch size).
        hidden_states = hidden_states.reshape(
            batch_size,
            num_channels,
            num_frames // patch_t,
            patch_t,
            height // patch,
            patch,
            width // patch,
            patch,
        )
        # Fold the patch axes into the channel axis. Channel order after flattening is
        # (channel, temporal patch, width patch, height patch) -- width comes before height.
        hidden_states = hidden_states.permute(0, 1, 3, 7, 5, 2, 4, 6).flatten(1, 4)
        hidden_states = self.conv_in(hidden_states)

        if torch.is_grad_enabled() and self.gradient_checkpointing:

            def as_checkpoint_fn(module):
                def run(*inputs):
                    return module(*inputs)

                return run

            for down_block in self.down_blocks:
                hidden_states = torch.utils.checkpoint.checkpoint(as_checkpoint_fn(down_block), hidden_states)

            hidden_states = torch.utils.checkpoint.checkpoint(as_checkpoint_fn(self.mid_block), hidden_states)
        else:
            for down_block in self.down_blocks:
                hidden_states = down_block(hidden_states)

            hidden_states = self.mid_block(hidden_states)

        # The norm acts on the last dimension, so channels are moved there and back.
        hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
        hidden_states = self.conv_act(hidden_states)
        hidden_states = self.conv_out(hidden_states)

        # Repeat the last channel `C - 2` times (C = current channel count) and append the copies.
        num_repeats = hidden_states.size(1) - 2
        repeated_last = hidden_states[:, -1:].repeat(1, num_repeats, 1, 1, 1)
        return torch.cat([hidden_states, repeated_last], dim=1)
726
+
727
+
728
class LTXVideoDecoder3d(nn.Module):
    r"""
    The `LTXVideoDecoder3d` layer of a variational autoencoder that decodes its latent representation into an output
    sample.

    Args:
        in_channels (`int`, defaults to 128):
            Number of latent channels.
        out_channels (`int`, defaults to 3):
            Number of output channels.
        block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512)`):
            The number of output channels for each block.
        spatio_temporal_scaling (`Tuple[bool, ...]`, defaults to `(True, True, True, False)`):
            Whether a block should contain spatio-temporal upscaling layers or not.
        layers_per_block (`Tuple[int, ...]`, defaults to `(4, 3, 3, 3, 4)`):
            The number of layers per block.
        patch_size (`int`, defaults to `4`):
            The size of spatial patches.
        patch_size_t (`int`, defaults to `1`):
            The size of temporal patches.
        resnet_norm_eps (`float`, defaults to `1e-6`):
            Epsilon value for ResNet normalization layers.
        is_causal (`bool`, defaults to `False`):
            Whether this layer behaves causally (future frames depend only on past frames) or not.
        inject_noise (`Tuple[bool, ...]`, defaults to `(False, False, False, False, False)`):
            Per-block noise injection flags. Needs one entry more than `block_out_channels`: after reversal, entry
            `0` is used for the mid block and entry `i + 1` for up block `i`.
        timestep_conditioning (`bool`, defaults to `False`):
            Whether to condition the model on timesteps.
        upsample_residual (`Tuple[bool, ...]`, defaults to `(False, False, False, False)`):
            Per-block flag forwarded to each up block as `upsample_residual`.
        upsample_factor (`Tuple[int, ...]`, defaults to `(1, 1, 1, 1)`):
            Per-block factor; it divides the block's channel counts and is forwarded as `upscale_factor`.
    """

    def __init__(
        self,
        in_channels: int = 128,
        out_channels: int = 3,
        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
        spatio_temporal_scaling: Tuple[bool, ...] = (True, True, True, False),
        layers_per_block: Tuple[int, ...] = (4, 3, 3, 3, 4),
        patch_size: int = 4,
        patch_size_t: int = 1,
        resnet_norm_eps: float = 1e-6,
        is_causal: bool = False,
        # One entry per up block plus one for the mid block. The previous 4-entry default raised an IndexError
        # at `inject_noise[i + 1]` when used with the default 4-entry `block_out_channels`.
        inject_noise: Tuple[bool, ...] = (False, False, False, False, False),
        timestep_conditioning: bool = False,
        upsample_residual: Tuple[bool, ...] = (False, False, False, False),
        upsample_factor: Tuple[int, ...] = (1, 1, 1, 1),
    ) -> None:
        super().__init__()

        self.patch_size = patch_size
        self.patch_size_t = patch_size_t
        self.out_channels = out_channels * patch_size**2

        # The per-block settings are given in encoder order; the decoder walks them back to front.
        block_out_channels = tuple(reversed(block_out_channels))
        spatio_temporal_scaling = tuple(reversed(spatio_temporal_scaling))
        layers_per_block = tuple(reversed(layers_per_block))
        inject_noise = tuple(reversed(inject_noise))
        upsample_residual = tuple(reversed(upsample_residual))
        upsample_factor = tuple(reversed(upsample_factor))
        output_channel = block_out_channels[0]

        self.conv_in = LTXVideoCausalConv3d(
            in_channels=in_channels, out_channels=output_channel, kernel_size=3, stride=1, is_causal=is_causal
        )

        self.mid_block = LTXVideoMidBlock3d(
            in_channels=output_channel,
            num_layers=layers_per_block[0],
            resnet_eps=resnet_norm_eps,
            is_causal=is_causal,
            inject_noise=inject_noise[0],
            timestep_conditioning=timestep_conditioning,
        )

        # up blocks
        num_block_out_channels = len(block_out_channels)
        self.up_blocks = nn.ModuleList([])
        for i in range(num_block_out_channels):
            input_channel = output_channel // upsample_factor[i]
            output_channel = block_out_channels[i] // upsample_factor[i]

            up_block = LTXVideoUpBlock3d(
                in_channels=input_channel,
                out_channels=output_channel,
                num_layers=layers_per_block[i + 1],
                resnet_eps=resnet_norm_eps,
                spatio_temporal_scale=spatio_temporal_scaling[i],
                is_causal=is_causal,
                inject_noise=inject_noise[i + 1],
                timestep_conditioning=timestep_conditioning,
                upsample_residual=upsample_residual[i],
                upscale_factor=upsample_factor[i],
            )

            self.up_blocks.append(up_block)

        # out
        self.norm_out = RMSNorm(out_channels, eps=1e-8, elementwise_affine=False)
        self.conv_act = nn.SiLU()
        self.conv_out = LTXVideoCausalConv3d(
            in_channels=output_channel, out_channels=self.out_channels, kernel_size=3, stride=1, is_causal=is_causal
        )

        # timestep embedding (only present when timestep conditioning is enabled)
        self.time_embedder = None
        self.scale_shift_table = None
        if timestep_conditioning:
            self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(output_channel * 2, 0)
            self.scale_shift_table = nn.Parameter(torch.randn(2, output_channel) / output_channel**0.5)

        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
        r"""
        Decode latents: input convolution, mid block, up blocks, output normalization (with an optional
        timestep-dependent shift/scale), output convolution, and finally un-patchification.
        """
        hidden_states = self.conv_in(hidden_states)

        if torch.is_grad_enabled() and self.gradient_checkpointing:

            def create_custom_forward(module):
                def create_forward(*inputs):
                    return module(*inputs)

                return create_forward

            hidden_states = torch.utils.checkpoint.checkpoint(
                create_custom_forward(self.mid_block), hidden_states, temb
            )

            for up_block in self.up_blocks:
                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), hidden_states, temb)
        else:
            hidden_states = self.mid_block(hidden_states, temb)

            for up_block in self.up_blocks:
                hidden_states = up_block(hidden_states, temb)

        # The norm acts on the last dimension, so channels are moved there and back.
        hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)

        if self.time_embedder is not None:
            temb = self.time_embedder(
                timestep=temb.flatten(),
                resolution=None,
                aspect_ratio=None,
                batch_size=hidden_states.size(0),
                hidden_dtype=hidden_states.dtype,
            )
            # Split the embedding into two halves along dim 1, add the learned table, and use the halves
            # as shift and scale.
            temb = temb.view(hidden_states.size(0), -1, 1, 1, 1).unflatten(1, (2, -1))
            temb = temb + self.scale_shift_table[None, ..., None, None, None]
            shift, scale = temb.unbind(dim=1)
            hidden_states = hidden_states * (1 + scale) + shift

        hidden_states = self.conv_act(hidden_states)
        hidden_states = self.conv_out(hidden_states)

        p = self.patch_size
        p_t = self.patch_size_t

        # Un-patchify: unfold the channel axis into (channel, p_t, p, p) and merge each patch axis back
        # into its frame/height/width axis.
        batch_size, _, num_frames, height, width = hidden_states.shape
        hidden_states = hidden_states.reshape(batch_size, -1, p_t, p, p, num_frames, height, width)
        hidden_states = hidden_states.permute(0, 1, 5, 2, 6, 4, 7, 3).flatten(6, 7).flatten(4, 5).flatten(2, 3)

        return hidden_states
886
+
887
+
888
+ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
889
+ r"""
890
+ A VAE model with KL loss for encoding images into latents and decoding latent representations into images. Used in
891
+ [LTX](https://huggingface.co/Lightricks/LTX-Video).
892
+
893
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
894
+ for all models (such as downloading or saving).
895
+
896
+ Args:
897
+ in_channels (`int`, defaults to `3`):
898
+ Number of input channels.
899
+ out_channels (`int`, defaults to `3`):
900
+ Number of output channels.
901
+ latent_channels (`int`, defaults to `128`):
902
+ Number of latent channels.
903
+ block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512)`):
904
+ The number of output channels for each block.
905
+ spatio_temporal_scaling (`Tuple[bool, ...], defaults to `(True, True, True, False)`:
906
+ Whether a block should contain spatio-temporal downscaling or not.
907
+ layers_per_block (`Tuple[int, ...]`, defaults to `(4, 3, 3, 3, 4)`):
908
+ The number of layers per block.
909
+ patch_size (`int`, defaults to `4`):
910
+ The size of spatial patches.
911
+ patch_size_t (`int`, defaults to `1`):
912
+ The size of temporal patches.
913
+ resnet_norm_eps (`float`, defaults to `1e-6`):
914
+ Epsilon value for ResNet normalization layers.
915
+ scaling_factor (`float`, *optional*, defaults to `1.0`):
916
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
917
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
918
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
919
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
920
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
921
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
922
+ encoder_causal (`bool`, defaults to `True`):
923
+ Whether the encoder should behave causally (future frames depend only on past frames) or not.
924
+ decoder_causal (`bool`, defaults to `False`):
925
+ Whether the decoder should behave causally (future frames depend only on past frames) or not.
926
+ """
927
+
928
+ _supports_gradient_checkpointing = True
929
+
930
@register_to_config
def __init__(
    self,
    in_channels: int = 3,
    out_channels: int = 3,
    latent_channels: int = 128,
    block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
    decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
    layers_per_block: Tuple[int, ...] = (4, 3, 3, 3, 4),
    decoder_layers_per_block: Tuple[int, ...] = (4, 3, 3, 3, 4),
    spatio_temporal_scaling: Tuple[bool, ...] = (True, True, True, False),
    decoder_spatio_temporal_scaling: Tuple[bool, ...] = (True, True, True, False),
    decoder_inject_noise: Tuple[bool, ...] = (False, False, False, False, False),
    upsample_residual: Tuple[bool, ...] = (False, False, False, False),
    upsample_factor: Tuple[int, ...] = (1, 1, 1, 1),
    timestep_conditioning: bool = False,
    patch_size: int = 4,
    patch_size_t: int = 1,
    resnet_norm_eps: float = 1e-6,
    scaling_factor: float = 1.0,
    encoder_causal: bool = True,
    decoder_causal: bool = False,
) -> None:
    """Build the encoder and decoder, register the latent statistics buffers and set the tiling defaults."""
    super().__init__()

    # Settings shared by the encoder and the decoder.
    shared_kwargs = {
        "patch_size": patch_size,
        "patch_size_t": patch_size_t,
        "resnet_norm_eps": resnet_norm_eps,
    }

    self.encoder = LTXVideoEncoder3d(
        in_channels=in_channels,
        out_channels=latent_channels,
        block_out_channels=block_out_channels,
        spatio_temporal_scaling=spatio_temporal_scaling,
        layers_per_block=layers_per_block,
        is_causal=encoder_causal,
        **shared_kwargs,
    )
    self.decoder = LTXVideoDecoder3d(
        in_channels=latent_channels,
        out_channels=out_channels,
        block_out_channels=decoder_block_out_channels,
        spatio_temporal_scaling=decoder_spatio_temporal_scaling,
        layers_per_block=decoder_layers_per_block,
        is_causal=decoder_causal,
        timestep_conditioning=timestep_conditioning,
        inject_noise=decoder_inject_noise,
        upsample_residual=upsample_residual,
        upsample_factor=upsample_factor,
        **shared_kwargs,
    )

    # Per-channel latent statistics, initialised to zero mean / unit std and stored in the state dict.
    self.register_buffer("latents_mean", torch.zeros((latent_channels,), requires_grad=False), persistent=True)
    self.register_buffer("latents_std", torch.ones((latent_channels,), requires_grad=False), persistent=True)

    # Each enabled scaling stage contributes a factor of 2 on top of the patch size.
    num_scaling_stages = sum(spatio_temporal_scaling)
    self.spatial_compression_ratio = patch_size * 2**num_scaling_stages
    self.temporal_compression_ratio = patch_size_t * 2**num_scaling_stages

    # Slicing: process a batch one sample at a time to save memory.
    self.use_slicing = False

    # Tiling: process spatially large inputs as overlapping tiles that are blended together afterwards,
    # which lowers the memory requirement.
    self.use_tiling = False

    # Frame-wise processing: intended to handle temporally long inputs in fixed-size frame batches
    # (see `num_sample_frames_batch_size` / `num_latent_frames_batch_size`).
    self.use_framewise_encoding = False
    self.use_framewise_decoding = False

    # This can be configured based on the amount of GPU memory available.
    # `16` for sample frames and `2` for latent frames are sensible defaults for consumer GPUs.
    # Setting it to higher values results in higher memory usage.
    self.num_sample_frames_batch_size = 16
    self.num_latent_frames_batch_size = 2

    # The minimal tile height and width for spatial tiling to be used
    self.tile_sample_min_height = 512
    self.tile_sample_min_width = 512

    # The stride between the starts of two neighbouring spatial tiles
    self.tile_sample_stride_height = 448
    self.tile_sample_stride_width = 448
1017
+
1018
def _set_gradient_checkpointing(self, module, value=False):
    """Set `gradient_checkpointing` on `module` if it is the LTX encoder or decoder; ignore any other module."""
    if not isinstance(module, (LTXVideoEncoder3d, LTXVideoDecoder3d)):
        return
    module.gradient_checkpointing = value
1021
+
1022
def enable_tiling(
    self,
    tile_sample_min_height: Optional[int] = None,
    tile_sample_min_width: Optional[int] = None,
    tile_sample_stride_height: Optional[float] = None,
    tile_sample_stride_width: Optional[float] = None,
) -> None:
    r"""
    Turn on tiled VAE processing and optionally override the tile geometry. With tiling enabled, inputs larger
    than the minimum tile size are split into overlapping tiles that are processed in several steps, which saves
    memory and allows larger inputs.

    Any argument left as `None` (or given a falsy value such as `0`) keeps the currently stored setting.

    Args:
        tile_sample_min_height (`int`, *optional*):
            The minimum height required for a sample to be separated into tiles across the height dimension.
        tile_sample_min_width (`int`, *optional*):
            The minimum width required for a sample to be separated into tiles across the width dimension.
        tile_sample_stride_height (`int`, *optional*):
            The stride between two consecutive vertical tiles. Keeping it below the minimum tile height makes
            neighbouring tiles overlap.
        tile_sample_stride_width (`int`, *optional*):
            The stride between two consecutive horizontal tiles. Keeping it below the minimum tile width makes
            neighbouring tiles overlap.
    """
    self.use_tiling = True

    overrides = {
        "tile_sample_min_height": tile_sample_min_height,
        "tile_sample_min_width": tile_sample_min_width,
        "tile_sample_stride_height": tile_sample_stride_height,
        "tile_sample_stride_width": tile_sample_stride_width,
    }
    for attr_name, new_value in overrides.items():
        setattr(self, attr_name, new_value or getattr(self, attr_name))
1051
+
1052
def disable_tiling(self) -> None:
    r"""
    Turn tiled VAE processing off again. The tile sizes and strides stored on the model are left untouched, so a
    later `enable_tiling()` call reuses them.
    """
    self.use_tiling = False
1058
+
1059
def enable_slicing(self) -> None:
    r"""
    Turn on sliced VAE processing. With slicing enabled, a batch with more than one sample is split along the
    batch dimension and handled one sample at a time, which saves memory and allows larger batch sizes.
    """
    self.use_slicing = True
1065
+
1066
def disable_slicing(self) -> None:
    r"""
    Turn sliced VAE processing off again, so that a whole batch is handled in a single step.
    """
    self.use_slicing = False
1072
+
1073
def _encode(self, x: torch.Tensor) -> torch.Tensor:
    """
    Encode `x` (a 5-D tensor) with the encoder, switching to tiled encoding when tiling is enabled and the input
    is wider or taller than the minimum tile size.

    Raises:
        NotImplementedError: If frame-wise encoding is enabled and the tiled path is not taken.
    """
    _, _, _, height, width = x.shape

    exceeds_tile = width > self.tile_sample_min_width or height > self.tile_sample_min_height
    if self.use_tiling and exceeds_tile:
        return self.tiled_encode(x)

    if self.use_framewise_encoding:
        # TODO(aryan): requires investigation
        raise NotImplementedError(
            "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
            "quality issues caused by splitting inference across frame dimension. If you believe this "
            "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
        )

    return self.encoder(x)
1090
+
1091
@apply_forward_hook
def encode(
    self, x: torch.Tensor, return_dict: bool = True
) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
    """
    Encode a batch of inputs into a latent distribution.

    Args:
        x (`torch.Tensor`): Input batch to encode.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

    Returns:
        The posterior built from the encoder output. If `return_dict` is True, it is wrapped in a
        [`~models.autoencoder_kl.AutoencoderKLOutput`], otherwise it is returned as a one-element `tuple`.
    """
    # With slicing enabled, every sample of the batch is encoded on its own and the results are concatenated.
    encode_per_sample = self.use_slicing and x.shape[0] > 1
    if encode_per_sample:
        h = torch.cat([self._encode(sample) for sample in x.split(1)])
    else:
        h = self._encode(x)

    posterior = DiagonalGaussianDistribution(h)

    return AutoencoderKLOutput(latent_dist=posterior) if return_dict else (posterior,)
1117
+
1118
def _decode(
    self, z: torch.Tensor, temb: Optional[torch.Tensor] = None, return_dict: bool = True
) -> Union[DecoderOutput, torch.Tensor]:
    """
    Decode the latents `z` (a 5-D tensor), switching to tiled decoding when tiling is enabled and the latents are
    wider or taller than the minimum tile size expressed in latent units.

    Args:
        z (`torch.Tensor`): Latents to decode.
        temb (`torch.Tensor`, *optional*): Timestep input handed to the decoder as its second argument.
        return_dict (`bool`, defaults to `True`):
            Whether to wrap the decoded sample in a `DecoderOutput` instead of a one-element tuple.

    Raises:
        NotImplementedError: If frame-wise decoding is enabled and the tiled path is not taken.
    """
    batch_size, num_channels, num_frames, height, width = z.shape
    tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
    # Use the minimum tile *width* here, mirroring the height threshold above and `tiled_encode`.
    # The stride is smaller than the minimum tile size, so deriving the threshold from it sent latents
    # that still fit into a single tile down the tiled path.
    tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio

    if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
        return self.tiled_decode(z, temb, return_dict=return_dict)

    if self.use_framewise_decoding:
        # TODO(aryan): requires investigation
        raise NotImplementedError(
            "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
            "quality issues caused by splitting inference across frame dimension. If you believe this "
            "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
        )
    else:
        dec = self.decoder(z, temb)

    if not return_dict:
        return (dec,)

    return DecoderOutput(sample=dec)
1142
+
1143
@apply_forward_hook
def decode(
    self, z: torch.Tensor, temb: Optional[torch.Tensor] = None, return_dict: bool = True
) -> Union[DecoderOutput, torch.Tensor]:
    """
    Decode a batch of latents.

    Args:
        z (`torch.Tensor`): Input batch of latent vectors.
        temb (`torch.Tensor`, *optional*):
            Timestep input forwarded to the decoder. When slicing is enabled it is split along the batch
            dimension together with `z`, so it must then have the same batch size as `z`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

    Returns:
        [`~models.vae.DecoderOutput`] or `tuple`:
            If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
            returned.
    """
    if self.use_slicing and z.shape[0] > 1:
        if temb is not None:
            # Pair every latent slice with its own timestep slice. Without `zip`, the loop iterated over the
            # two split tuples themselves, which mis-paired slices and only unpacked for a batch size of 2.
            decoded_slices = [
                self._decode(z_slice, t_slice).sample for z_slice, t_slice in zip(z.split(1), temb.split(1))
            ]
        else:
            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
        decoded = torch.cat(decoded_slices)
    else:
        decoded = self._decode(z, temb).sample

    if not return_dict:
        return (decoded,)

    return DecoderOutput(sample=decoded)
1175
+
1176
def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
    """
    Cross-fade the last rows of `a` into the first rows of `b` along dimension 3.

    The number of blended rows is `blend_extent`, capped by the size of dimension 3 of both tensors. The weight
    of `b` grows linearly from 0 at the first row. `b` is modified in place and returned.
    """
    extent = min(a.shape[3], b.shape[3], blend_extent)
    for row in range(extent):
        weight_b = row / extent
        # `row - extent` is a negative index: it walks over the last `extent` rows of `a`.
        b[:, :, :, row, :] = a[:, :, :, row - extent, :] * (1 - weight_b) + b[:, :, :, row, :] * weight_b
    return b
1183
+
1184
def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
    """
    Cross-fade the last columns of `a` into the first columns of `b` along dimension 4.

    The number of blended columns is `blend_extent`, capped by the size of dimension 4 of both tensors. The
    weight of `b` grows linearly from 0 at the first column. `b` is modified in place and returned.
    """
    extent = min(a.shape[4], b.shape[4], blend_extent)
    for col in range(extent):
        weight_b = col / extent
        # `col - extent` is a negative index: it walks over the last `extent` columns of `a`.
        b[:, :, :, :, col] = a[:, :, :, :, col - extent] * (1 - weight_b) + b[:, :, :, :, col] * weight_b
    return b
1191
+
1192
def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
    r"""Encode a batch of videos using a tiled encoder.

    The input is cut into overlapping spatial tiles, every tile is encoded on its own, neighbouring tiles are
    blended over their overlap, and the cropped tiles are stitched back together.

    Args:
        x (`torch.Tensor`): Input batch of videos.

    Returns:
        `torch.Tensor`:
            The latent representation of the encoded videos.
    """
    _, _, _, height, width = x.shape
    latent_height = height // self.spatial_compression_ratio
    latent_width = width // self.spatial_compression_ratio

    # Tile geometry expressed in latent units.
    min_height_latent = self.tile_sample_min_height // self.spatial_compression_ratio
    min_width_latent = self.tile_sample_min_width // self.spatial_compression_ratio
    stride_height_latent = self.tile_sample_stride_height // self.spatial_compression_ratio
    stride_width_latent = self.tile_sample_stride_width // self.spatial_compression_ratio

    # The overlap between neighbouring tiles is the region that gets blended.
    blend_height = min_height_latent - stride_height_latent
    blend_width = min_width_latent - stride_width_latent

    # Pass 1: encode every tile. Tiles start one stride apart and span the minimum tile size, so they overlap.
    tile_grid = []
    for top in range(0, height, self.tile_sample_stride_height):
        encoded_row = []
        for left in range(0, width, self.tile_sample_stride_width):
            if self.use_framewise_encoding:
                # TODO(aryan): requires investigation
                raise NotImplementedError(
                    "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
                    "quality issues caused by splitting inference across frame dimension. If you believe this "
                    "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
                )

            bottom = top + self.tile_sample_min_height
            right = left + self.tile_sample_min_width
            encoded_row.append(self.encoder(x[:, :, :, top:bottom, left:right]))
        tile_grid.append(encoded_row)

    # Pass 2: blend each tile with the tile above and the tile to its left, crop it to one stride and stitch.
    # The tiles stored in `tile_grid` are passed as the blend target, so the order of the blends matters.
    stitched_rows = []
    for row_idx, encoded_row in enumerate(tile_grid):
        cropped_tiles = []
        for col_idx, tile in enumerate(encoded_row):
            if row_idx > 0:
                tile = self.blend_v(tile_grid[row_idx - 1][col_idx], tile, blend_height)
            if col_idx > 0:
                tile = self.blend_h(encoded_row[col_idx - 1], tile, blend_width)
            cropped_tiles.append(tile[:, :, :, :stride_height_latent, :stride_width_latent])
        stitched_rows.append(torch.cat(cropped_tiles, dim=4))

    return torch.cat(stitched_rows, dim=3)[:, :, :, :latent_height, :latent_width]
1250
+
1251
def tiled_decode(
    self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True
) -> Union[DecoderOutput, torch.Tensor]:
    r"""
    Decode a batch of latents tile by tile and stitch the decoded tiles back together.

    The latents are cut into overlapping spatial tiles (tile size and stride are the sample-space tile settings
    divided by `spatial_compression_ratio`). Each tile is passed through `self.decoder` together with `temb`; the
    decoded tiles are then blended with their upper and left neighbours in sample space, cropped to the sample
    stride and concatenated.

    Args:
        z (`torch.Tensor`): Input batch of latent vectors, shaped `[batch, channels, frames, height, width]`.
        temb (`torch.Tensor`, *optional*): Forwarded unchanged to `self.decoder` for every tile.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

    Returns:
        [`~models.vae.DecoderOutput`] or `tuple`:
            If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` whose
            only element is the decoded tensor is returned.

    Raises:
        NotImplementedError: If `use_framewise_decoding` is enabled.
    """
    _, _, _, height, width = z.shape
    ratio = self.spatial_compression_ratio
    sample_height = height * ratio
    sample_width = width * ratio

    sample_tile_height = self.tile_sample_min_height
    sample_tile_width = self.tile_sample_min_width
    sample_stride_height = self.tile_sample_stride_height
    sample_stride_width = self.tile_sample_stride_width

    latent_tile_height = sample_tile_height // ratio
    latent_tile_width = sample_tile_width // ratio
    latent_stride_height = sample_stride_height // ratio
    latent_stride_width = sample_stride_width // ratio

    # Size of the overlap between neighbouring decoded tiles, measured in sample space.
    blend_height = sample_tile_height - sample_stride_height
    blend_width = sample_tile_width - sample_stride_width

    # First pass: decode every overlapping latent tile independently.
    decoded_rows = []
    for top in range(0, height, latent_stride_height):
        decoded_row = []
        for left in range(0, width, latent_stride_width):
            if self.use_framewise_decoding:
                # TODO(aryan): requires investigation
                raise NotImplementedError(
                    "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
                    "quality issues caused by splitting inference across frame dimension. If you believe this "
                    "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
                )
            patch = z[:, :, :, top : top + latent_tile_height, left : left + latent_tile_width]
            decoded_row.append(self.decoder(patch, temb))
        decoded_rows.append(decoded_row)

    # Second pass: blend each tile with the tile above and the tile to its left, keep only the
    # non-overlapping part and stitch everything together.
    stitched_rows = []
    for row_idx, decoded_row in enumerate(decoded_rows):
        stitched_row = []
        for col_idx, tile in enumerate(decoded_row):
            if row_idx > 0:
                tile = self.blend_v(decoded_rows[row_idx - 1][col_idx], tile, blend_height)
            if col_idx > 0:
                tile = self.blend_h(decoded_row[col_idx - 1], tile, blend_width)
            stitched_row.append(tile[:, :, :, :sample_stride_height, :sample_stride_width])
        stitched_rows.append(torch.cat(stitched_row, dim=4))

    dec = torch.cat(stitched_rows, dim=3)[:, :, :, :sample_height, :sample_width]

    if return_dict:
        return DecoderOutput(sample=dec)
    return (dec,)
1320
+
1321
def forward(
    self,
    sample: torch.Tensor,
    temb: Optional[torch.Tensor] = None,
    sample_posterior: bool = False,
    return_dict: bool = True,
    generator: Optional[torch.Generator] = None,
) -> Union["DecoderOutput", tuple]:
    r"""
    Run a full encode/decode round trip on `sample`.

    Args:
        sample (`torch.Tensor`): Input batch passed to `self.encode`.
        temb (`torch.Tensor`, *optional*): Forwarded unchanged to `self.decode`.
        sample_posterior (`bool`, *optional*, defaults to `False`):
            If `True`, draw the latent from the posterior with `generator`; otherwise use the posterior mode.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return the decoder output object instead of a plain tuple.
        generator (`torch.Generator`, *optional*):
            Random generator used only when `sample_posterior` is `True`.

    Returns:
        [`~models.vae.DecoderOutput`] or `tuple`:
            If return_dict is True, the object returned by `self.decode` is returned, otherwise a plain `tuple`
            whose only element is the decoded tensor is returned.
    """
    posterior = self.encode(sample).latent_dist
    z = posterior.sample(generator=generator) if sample_posterior else posterior.mode()
    dec = self.decode(z, temb)
    if not return_dict:
        # `self.decode` returns an output object; unwrap it so the tuple holds the tensor itself,
        # matching what `decode`/`tiled_decode` return for `return_dict=False`.
        return (dec.sample,)
    return dec