diffusers 0.32.2__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffusers/__init__.py +186 -3
- diffusers/configuration_utils.py +40 -12
- diffusers/dependency_versions_table.py +9 -2
- diffusers/hooks/__init__.py +9 -0
- diffusers/hooks/faster_cache.py +653 -0
- diffusers/hooks/group_offloading.py +793 -0
- diffusers/hooks/hooks.py +236 -0
- diffusers/hooks/layerwise_casting.py +245 -0
- diffusers/hooks/pyramid_attention_broadcast.py +311 -0
- diffusers/loaders/__init__.py +6 -0
- diffusers/loaders/ip_adapter.py +38 -30
- diffusers/loaders/lora_base.py +121 -86
- diffusers/loaders/lora_conversion_utils.py +504 -44
- diffusers/loaders/lora_pipeline.py +1769 -181
- diffusers/loaders/peft.py +167 -57
- diffusers/loaders/single_file.py +17 -2
- diffusers/loaders/single_file_model.py +53 -5
- diffusers/loaders/single_file_utils.py +646 -72
- diffusers/loaders/textual_inversion.py +9 -9
- diffusers/loaders/transformer_flux.py +8 -9
- diffusers/loaders/transformer_sd3.py +120 -39
- diffusers/loaders/unet.py +20 -7
- diffusers/models/__init__.py +22 -0
- diffusers/models/activations.py +9 -9
- diffusers/models/attention.py +0 -1
- diffusers/models/attention_processor.py +163 -25
- diffusers/models/auto_model.py +169 -0
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_asym_kl.py +2 -0
- diffusers/models/autoencoders/autoencoder_dc.py +106 -4
- diffusers/models/autoencoders/autoencoder_kl.py +0 -4
- diffusers/models/autoencoders/autoencoder_kl_allegro.py +5 -23
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +17 -55
- diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +17 -97
- diffusers/models/autoencoders/autoencoder_kl_ltx.py +326 -107
- diffusers/models/autoencoders/autoencoder_kl_magvit.py +1094 -0
- diffusers/models/autoencoders/autoencoder_kl_mochi.py +21 -56
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +11 -42
- diffusers/models/autoencoders/autoencoder_kl_wan.py +855 -0
- diffusers/models/autoencoders/autoencoder_oobleck.py +1 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +0 -4
- diffusers/models/autoencoders/consistency_decoder_vae.py +3 -1
- diffusers/models/autoencoders/vae.py +31 -141
- diffusers/models/autoencoders/vq_model.py +3 -0
- diffusers/models/cache_utils.py +108 -0
- diffusers/models/controlnets/__init__.py +1 -0
- diffusers/models/controlnets/controlnet.py +3 -8
- diffusers/models/controlnets/controlnet_flux.py +14 -42
- diffusers/models/controlnets/controlnet_sd3.py +58 -34
- diffusers/models/controlnets/controlnet_sparsectrl.py +4 -7
- diffusers/models/controlnets/controlnet_union.py +27 -18
- diffusers/models/controlnets/controlnet_xs.py +7 -46
- diffusers/models/controlnets/multicontrolnet_union.py +196 -0
- diffusers/models/embeddings.py +18 -7
- diffusers/models/model_loading_utils.py +122 -80
- diffusers/models/modeling_flax_pytorch_utils.py +1 -1
- diffusers/models/modeling_flax_utils.py +1 -1
- diffusers/models/modeling_pytorch_flax_utils.py +1 -1
- diffusers/models/modeling_utils.py +617 -272
- diffusers/models/normalization.py +67 -14
- diffusers/models/resnet.py +1 -1
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +9 -35
- diffusers/models/transformers/cogvideox_transformer_3d.py +13 -24
- diffusers/models/transformers/consisid_transformer_3d.py +789 -0
- diffusers/models/transformers/dit_transformer_2d.py +5 -19
- diffusers/models/transformers/hunyuan_transformer_2d.py +4 -3
- diffusers/models/transformers/latte_transformer_3d.py +20 -15
- diffusers/models/transformers/lumina_nextdit2d.py +3 -1
- diffusers/models/transformers/pixart_transformer_2d.py +4 -19
- diffusers/models/transformers/prior_transformer.py +5 -1
- diffusers/models/transformers/sana_transformer.py +144 -40
- diffusers/models/transformers/stable_audio_transformer.py +5 -20
- diffusers/models/transformers/transformer_2d.py +7 -22
- diffusers/models/transformers/transformer_allegro.py +9 -17
- diffusers/models/transformers/transformer_cogview3plus.py +6 -17
- diffusers/models/transformers/transformer_cogview4.py +462 -0
- diffusers/models/transformers/transformer_easyanimate.py +527 -0
- diffusers/models/transformers/transformer_flux.py +68 -110
- diffusers/models/transformers/transformer_hunyuan_video.py +404 -46
- diffusers/models/transformers/transformer_ltx.py +53 -35
- diffusers/models/transformers/transformer_lumina2.py +548 -0
- diffusers/models/transformers/transformer_mochi.py +6 -17
- diffusers/models/transformers/transformer_omnigen.py +469 -0
- diffusers/models/transformers/transformer_sd3.py +56 -86
- diffusers/models/transformers/transformer_temporal.py +5 -11
- diffusers/models/transformers/transformer_wan.py +469 -0
- diffusers/models/unets/unet_1d.py +3 -1
- diffusers/models/unets/unet_2d.py +21 -20
- diffusers/models/unets/unet_2d_blocks.py +19 -243
- diffusers/models/unets/unet_2d_condition.py +4 -6
- diffusers/models/unets/unet_3d_blocks.py +14 -127
- diffusers/models/unets/unet_3d_condition.py +8 -12
- diffusers/models/unets/unet_i2vgen_xl.py +5 -13
- diffusers/models/unets/unet_kandinsky3.py +0 -4
- diffusers/models/unets/unet_motion_model.py +20 -114
- diffusers/models/unets/unet_spatio_temporal_condition.py +7 -8
- diffusers/models/unets/unet_stable_cascade.py +8 -35
- diffusers/models/unets/uvit_2d.py +1 -4
- diffusers/optimization.py +2 -2
- diffusers/pipelines/__init__.py +57 -8
- diffusers/pipelines/allegro/pipeline_allegro.py +22 -2
- diffusers/pipelines/amused/pipeline_amused.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_img2img.py +15 -2
- diffusers/pipelines/amused/pipeline_amused_inpaint.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +15 -3
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +24 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +15 -2
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +16 -4
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.py +16 -4
- diffusers/pipelines/audioldm/pipeline_audioldm.py +13 -2
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +13 -68
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +39 -9
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +63 -7
- diffusers/pipelines/auto_pipeline.py +35 -14
- diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
- diffusers/pipelines/blip_diffusion/modeling_blip2.py +5 -8
- diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +12 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_fun_control.py +22 -6
- diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +22 -5
- diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +22 -6
- diffusers/pipelines/cogview3/pipeline_cogview3plus.py +12 -4
- diffusers/pipelines/cogview4/__init__.py +49 -0
- diffusers/pipelines/cogview4/pipeline_cogview4.py +684 -0
- diffusers/pipelines/cogview4/pipeline_cogview4_control.py +732 -0
- diffusers/pipelines/cogview4/pipeline_output.py +21 -0
- diffusers/pipelines/consisid/__init__.py +49 -0
- diffusers/pipelines/consisid/consisid_utils.py +357 -0
- diffusers/pipelines/consisid/pipeline_consisid.py +974 -0
- diffusers/pipelines/consisid/pipeline_output.py +20 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +11 -0
- diffusers/pipelines/controlnet/pipeline_controlnet.py +6 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +13 -0
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -5
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +31 -12
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +26 -7
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +22 -3
- diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py +26 -25
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py +224 -109
- diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py +25 -29
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +7 -4
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +3 -5
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +121 -10
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +122 -11
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +12 -1
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +20 -3
- diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +14 -2
- diffusers/pipelines/ddim/pipeline_ddim.py +14 -1
- diffusers/pipelines/ddpm/pipeline_ddpm.py +15 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +12 -0
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +14 -1
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +14 -1
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -7
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -7
- diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py +2 -2
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +11 -7
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +1 -1
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +10 -105
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
- diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -1
- diffusers/pipelines/dit/pipeline_dit.py +15 -2
- diffusers/pipelines/easyanimate/__init__.py +52 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate.py +770 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_control.py +994 -0
- diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.py +1234 -0
- diffusers/pipelines/easyanimate/pipeline_output.py +20 -0
- diffusers/pipelines/flux/pipeline_flux.py +53 -21
- diffusers/pipelines/flux/pipeline_flux_control.py +9 -12
- diffusers/pipelines/flux/pipeline_flux_control_img2img.py +6 -10
- diffusers/pipelines/flux/pipeline_flux_control_inpaint.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet.py +185 -13
- diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +8 -10
- diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +16 -16
- diffusers/pipelines/flux/pipeline_flux_fill.py +107 -39
- diffusers/pipelines/flux/pipeline_flux_img2img.py +193 -15
- diffusers/pipelines/flux/pipeline_flux_inpaint.py +199 -19
- diffusers/pipelines/free_noise_utils.py +3 -3
- diffusers/pipelines/hunyuan_video/__init__.py +4 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_skyreels_image2video.py +804 -0
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py +90 -23
- diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py +924 -0
- diffusers/pipelines/hunyuandit/pipeline_hunyuandit.py +3 -5
- diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +12 -0
- diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +13 -1
- diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +13 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +12 -1
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +12 -0
- diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +12 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +12 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +10 -8
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +6 -4
- diffusers/pipelines/kolors/text_encoder.py +7 -34
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +12 -1
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +13 -1
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +14 -13
- diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +12 -1
- diffusers/pipelines/latte/pipeline_latte.py +36 -7
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +67 -13
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +60 -15
- diffusers/pipelines/ltx/__init__.py +2 -0
- diffusers/pipelines/ltx/pipeline_ltx.py +25 -13
- diffusers/pipelines/ltx/pipeline_ltx_condition.py +1194 -0
- diffusers/pipelines/ltx/pipeline_ltx_image2video.py +31 -17
- diffusers/pipelines/lumina/__init__.py +2 -2
- diffusers/pipelines/lumina/pipeline_lumina.py +83 -20
- diffusers/pipelines/lumina2/__init__.py +48 -0
- diffusers/pipelines/lumina2/pipeline_lumina2.py +790 -0
- diffusers/pipelines/marigold/__init__.py +2 -0
- diffusers/pipelines/marigold/marigold_image_processing.py +127 -14
- diffusers/pipelines/marigold/pipeline_marigold_depth.py +31 -16
- diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py +721 -0
- diffusers/pipelines/marigold/pipeline_marigold_normals.py +31 -16
- diffusers/pipelines/mochi/pipeline_mochi.py +14 -18
- diffusers/pipelines/musicldm/pipeline_musicldm.py +16 -1
- diffusers/pipelines/omnigen/__init__.py +50 -0
- diffusers/pipelines/omnigen/pipeline_omnigen.py +512 -0
- diffusers/pipelines/omnigen/processor_omnigen.py +327 -0
- diffusers/pipelines/onnx_utils.py +5 -3
- diffusers/pipelines/pag/pag_utils.py +1 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +12 -1
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_inpaint.py +15 -4
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +20 -3
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +1 -3
- diffusers/pipelines/pag/pipeline_pag_kolors.py +6 -4
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +16 -3
- diffusers/pipelines/pag/pipeline_pag_sana.py +65 -8
- diffusers/pipelines/pag/pipeline_pag_sd.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +3 -5
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +13 -1
- diffusers/pipelines/pag/pipeline_pag_sd_img2img.py +23 -7
- diffusers/pipelines/pag/pipeline_pag_sd_inpaint.py +26 -10
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +12 -4
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +7 -3
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +10 -6
- diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +13 -3
- diffusers/pipelines/pia/pipeline_pia.py +13 -1
- diffusers/pipelines/pipeline_flax_utils.py +7 -7
- diffusers/pipelines/pipeline_loading_utils.py +193 -83
- diffusers/pipelines/pipeline_utils.py +221 -106
- diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +17 -5
- diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +17 -4
- diffusers/pipelines/sana/__init__.py +2 -0
- diffusers/pipelines/sana/pipeline_sana.py +183 -58
- diffusers/pipelines/sana/pipeline_sana_sprint.py +889 -0
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +12 -2
- diffusers/pipelines/shap_e/pipeline_shap_e.py +12 -0
- diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +12 -0
- diffusers/pipelines/shap_e/renderer.py +6 -6
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +1 -1
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +15 -4
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +12 -8
- diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +12 -1
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +3 -2
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +3 -3
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +14 -10
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +4 -3
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +5 -4
- diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +2 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +18 -13
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +30 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +24 -10
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +28 -12
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +39 -18
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +17 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +13 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +20 -3
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +14 -2
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +13 -1
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +16 -17
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +136 -18
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +150 -21
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +15 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +26 -11
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +15 -3
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +22 -4
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +30 -13
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +12 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +15 -3
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +15 -3
- diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +26 -12
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +16 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +12 -4
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +10 -6
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +11 -4
- diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +13 -2
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +18 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +26 -5
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +13 -1
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +28 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +26 -4
- diffusers/pipelines/transformers_loading_utils.py +121 -0
- diffusers/pipelines/unclip/pipeline_unclip.py +11 -1
- diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +11 -1
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +19 -2
- diffusers/pipelines/wan/__init__.py +51 -0
- diffusers/pipelines/wan/pipeline_output.py +20 -0
- diffusers/pipelines/wan/pipeline_wan.py +593 -0
- diffusers/pipelines/wan/pipeline_wan_i2v.py +722 -0
- diffusers/pipelines/wan/pipeline_wan_video2video.py +725 -0
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +7 -31
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +12 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +12 -1
- diffusers/quantizers/auto.py +5 -1
- diffusers/quantizers/base.py +5 -9
- diffusers/quantizers/bitsandbytes/bnb_quantizer.py +41 -29
- diffusers/quantizers/bitsandbytes/utils.py +30 -20
- diffusers/quantizers/gguf/gguf_quantizer.py +1 -0
- diffusers/quantizers/gguf/utils.py +4 -2
- diffusers/quantizers/quantization_config.py +59 -4
- diffusers/quantizers/quanto/__init__.py +1 -0
- diffusers/quantizers/quanto/quanto_quantizer.py +177 -0
- diffusers/quantizers/quanto/utils.py +60 -0
- diffusers/quantizers/torchao/__init__.py +1 -1
- diffusers/quantizers/torchao/torchao_quantizer.py +47 -2
- diffusers/schedulers/__init__.py +2 -1
- diffusers/schedulers/scheduling_consistency_models.py +1 -2
- diffusers/schedulers/scheduling_ddim_inverse.py +1 -1
- diffusers/schedulers/scheduling_ddpm.py +2 -3
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -2
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +12 -4
- diffusers/schedulers/scheduling_edm_euler.py +45 -10
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +116 -28
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +7 -6
- diffusers/schedulers/scheduling_heun_discrete.py +1 -1
- diffusers/schedulers/scheduling_lcm.py +1 -2
- diffusers/schedulers/scheduling_lms_discrete.py +1 -1
- diffusers/schedulers/scheduling_repaint.py +5 -1
- diffusers/schedulers/scheduling_scm.py +265 -0
- diffusers/schedulers/scheduling_tcd.py +1 -2
- diffusers/schedulers/scheduling_utils.py +2 -1
- diffusers/training_utils.py +14 -7
- diffusers/utils/__init__.py +9 -1
- diffusers/utils/constants.py +13 -1
- diffusers/utils/deprecation_utils.py +1 -1
- diffusers/utils/dummy_bitsandbytes_objects.py +17 -0
- diffusers/utils/dummy_gguf_objects.py +17 -0
- diffusers/utils/dummy_optimum_quanto_objects.py +17 -0
- diffusers/utils/dummy_pt_objects.py +233 -0
- diffusers/utils/dummy_torch_and_transformers_and_opencv_objects.py +17 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +270 -0
- diffusers/utils/dummy_torchao_objects.py +17 -0
- diffusers/utils/dynamic_modules_utils.py +1 -1
- diffusers/utils/export_utils.py +28 -3
- diffusers/utils/hub_utils.py +52 -102
- diffusers/utils/import_utils.py +121 -221
- diffusers/utils/loading_utils.py +2 -1
- diffusers/utils/logging.py +1 -2
- diffusers/utils/peft_utils.py +6 -14
- diffusers/utils/remote_utils.py +425 -0
- diffusers/utils/source_code_parsing_utils.py +52 -0
- diffusers/utils/state_dict_utils.py +15 -1
- diffusers/utils/testing_utils.py +243 -13
- diffusers/utils/torch_utils.py +10 -0
- diffusers/utils/typing_utils.py +91 -0
- diffusers/video_processor.py +1 -1
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/METADATA +76 -44
- diffusers-0.33.0.dist-info/RECORD +608 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/WHEEL +1 -1
- diffusers-0.32.2.dist-info/RECORD +0 -550
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/LICENSE +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/entry_points.txt +0 -0
- {diffusers-0.32.2.dist-info → diffusers-0.33.0.dist-info}/top_level.txt +0 -0
@@ -196,6 +196,55 @@ class LTXVideoResnetBlock3d(nn.Module):
         return hidden_states
 
 
+class LTXVideoDownsampler3d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        stride: Union[int, Tuple[int, int, int]] = 1,
+        is_causal: bool = True,
+        padding_mode: str = "zeros",
+    ) -> None:
+        super().__init__()
+
+        self.stride = stride if isinstance(stride, tuple) else (stride, stride, stride)
+        self.group_size = (in_channels * stride[0] * stride[1] * stride[2]) // out_channels
+
+        out_channels = out_channels // (self.stride[0] * self.stride[1] * self.stride[2])
+
+        self.conv = LTXVideoCausalConv3d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=1,
+            is_causal=is_causal,
+            padding_mode=padding_mode,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = torch.cat([hidden_states[:, :, : self.stride[0] - 1], hidden_states], dim=2)
+
+        residual = (
+            hidden_states.unflatten(4, (-1, self.stride[2]))
+            .unflatten(3, (-1, self.stride[1]))
+            .unflatten(2, (-1, self.stride[0]))
+        )
+        residual = residual.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(1, 4)
+        residual = residual.unflatten(1, (-1, self.group_size))
+        residual = residual.mean(dim=2)
+
+        hidden_states = self.conv(hidden_states)
+        hidden_states = (
+            hidden_states.unflatten(4, (-1, self.stride[2]))
+            .unflatten(3, (-1, self.stride[1]))
+            .unflatten(2, (-1, self.stride[0]))
+        )
+        hidden_states = hidden_states.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(1, 4)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
 class LTXVideoUpsampler3d(nn.Module):
     def __init__(
         self,
@@ -204,6 +253,7 @@ class LTXVideoUpsampler3d(nn.Module):
         is_causal: bool = True,
         residual: bool = False,
         upscale_factor: int = 1,
+        padding_mode: str = "zeros",
     ) -> None:
         super().__init__()
 
@@ -219,6 +269,7 @@ class LTXVideoUpsampler3d(nn.Module):
             kernel_size=3,
             stride=1,
             is_causal=is_causal,
+            padding_mode=padding_mode,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -338,16 +389,122 @@ class LTXVideoDownBlock3D(nn.Module):
 
         for i, resnet in enumerate(self.resnets):
             if torch.is_grad_enabled() and self.gradient_checkpointing:
+                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states, temb, generator)
+            else:
+                hidden_states = resnet(hidden_states, temb, generator)
 
-                def create_custom_forward(module):
-                    def create_forward(*inputs):
-                        return module(*inputs)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
 
-                    return create_forward
+        if self.conv_out is not None:
+            hidden_states = self.conv_out(hidden_states, temb, generator)
+
+        return hidden_states
+
+
+class LTXVideo095DownBlock3D(nn.Module):
+    r"""
+    Down block used in the LTXVideo model.
+
+    Args:
+        in_channels (`int`):
+            Number of input channels.
+        out_channels (`int`, *optional*):
+            Number of output channels. If None, defaults to `in_channels`.
+        num_layers (`int`, defaults to `1`):
+            Number of resnet layers.
+        dropout (`float`, defaults to `0.0`):
+            Dropout rate.
+        resnet_eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        resnet_act_fn (`str`, defaults to `"swish"`):
+            Activation function to use.
+        spatio_temporal_scale (`bool`, defaults to `True`):
+            Whether or not to use a downsampling layer. If not used, output dimension would be same as input dimension.
+            Whether or not to downsample across temporal dimension.
+        is_causal (`bool`, defaults to `True`):
+            Whether this layer behaves causally (future frames depend only on past frames) or not.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        resnet_eps: float = 1e-6,
+        resnet_act_fn: str = "swish",
+        spatio_temporal_scale: bool = True,
+        is_causal: bool = True,
+        downsample_type: str = "conv",
+    ):
+        super().__init__()
+
+        out_channels = out_channels or in_channels
+
+        resnets = []
+        for _ in range(num_layers):
+            resnets.append(
+                LTXVideoResnetBlock3d(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    dropout=dropout,
+                    eps=resnet_eps,
+                    non_linearity=resnet_act_fn,
+                    is_causal=is_causal,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+
+        self.downsamplers = None
+        if spatio_temporal_scale:
+            self.downsamplers = nn.ModuleList()
 
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(resnet), hidden_states, temb, generator
+            if downsample_type == "conv":
+                self.downsamplers.append(
+                    LTXVideoCausalConv3d(
+                        in_channels=in_channels,
+                        out_channels=in_channels,
+                        kernel_size=3,
+                        stride=(2, 2, 2),
+                        is_causal=is_causal,
+                    )
+                )
+            elif downsample_type == "spatial":
+                self.downsamplers.append(
+                    LTXVideoDownsampler3d(
+                        in_channels=in_channels, out_channels=out_channels, stride=(1, 2, 2), is_causal=is_causal
+                    )
+                )
+            elif downsample_type == "temporal":
+                self.downsamplers.append(
+                    LTXVideoDownsampler3d(
+                        in_channels=in_channels, out_channels=out_channels, stride=(2, 1, 1), is_causal=is_causal
+                    )
+                )
+            elif downsample_type == "spatiotemporal":
+                self.downsamplers.append(
+                    LTXVideoDownsampler3d(
+                        in_channels=in_channels, out_channels=out_channels, stride=(2, 2, 2), is_causal=is_causal
+                    )
                 )
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: Optional[torch.Tensor] = None,
+        generator: Optional[torch.Generator] = None,
+    ) -> torch.Tensor:
+        r"""Forward method of the `LTXDownBlock3D` class."""
+
+        for i, resnet in enumerate(self.resnets):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states, temb, generator)
             else:
                 hidden_states = resnet(hidden_states, temb, generator)
 
@@ -355,9 +512,6 @@ class LTXVideoDownBlock3D(nn.Module):
             for downsampler in self.downsamplers:
                 hidden_states = downsampler(hidden_states)
 
-        if self.conv_out is not None:
-            hidden_states = self.conv_out(hidden_states, temb, generator)
-
         return hidden_states
 
 
@@ -438,16 +592,7 @@ class LTXVideoMidBlock3d(nn.Module):
 
         for i, resnet in enumerate(self.resnets):
             if torch.is_grad_enabled() and self.gradient_checkpointing:
-
-                def create_custom_forward(module):
-                    def create_forward(*inputs):
-                        return module(*inputs)
-
-                    return create_forward
-
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(resnet), hidden_states, temb, generator
-                )
+                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states, temb, generator)
             else:
                 hidden_states = resnet(hidden_states, temb, generator)
 
@@ -573,16 +718,7 @@ class LTXVideoUpBlock3d(nn.Module):
 
         for i, resnet in enumerate(self.resnets):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
-
-                def create_custom_forward(module):
-                    def create_forward(*inputs):
-                        return module(*inputs)
-
-                    return create_forward
-
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(resnet), hidden_states, temb, generator
-                )
+                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states, temb, generator)
             else:
                 hidden_states = resnet(hidden_states, temb, generator)
 
@@ -620,8 +756,15 @@ class LTXVideoEncoder3d(nn.Module):
         in_channels: int = 3,
         out_channels: int = 128,
         block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
+        down_block_types: Tuple[str, ...] = (
+            "LTXVideoDownBlock3D",
+            "LTXVideoDownBlock3D",
+            "LTXVideoDownBlock3D",
+            "LTXVideoDownBlock3D",
+        ),
         spatio_temporal_scaling: Tuple[bool, ...] = (True, True, True, False),
         layers_per_block: Tuple[int, ...] = (4, 3, 3, 3, 4),
+        downsample_type: Tuple[str, ...] = ("conv", "conv", "conv", "conv"),
         patch_size: int = 4,
         patch_size_t: int = 1,
         resnet_norm_eps: float = 1e-6,
@@ -644,20 +787,37 @@ class LTXVideoEncoder3d(nn.Module):
         )
 
         # down blocks
-        num_block_out_channels = len(block_out_channels)
+        is_ltx_095 = down_block_types[-1] == "LTXVideo095DownBlock3D"
+        num_block_out_channels = len(block_out_channels) - (1 if is_ltx_095 else 0)
         self.down_blocks = nn.ModuleList([])
         for i in range(num_block_out_channels):
             input_channel = output_channel
-            output_channel = block_out_channels[i + 1] if i + 1 < num_block_out_channels else block_out_channels[i]
-
-            down_block = LTXVideoDownBlock3D(
-                in_channels=input_channel,
-                out_channels=output_channel,
-                num_layers=layers_per_block[i],
-                resnet_eps=resnet_norm_eps,
-                spatio_temporal_scale=spatio_temporal_scaling[i],
-                is_causal=is_causal,
-            )
+            if not is_ltx_095:
+                output_channel = block_out_channels[i + 1] if i + 1 < num_block_out_channels else block_out_channels[i]
+            else:
+                output_channel = block_out_channels[i + 1]
+
+            if down_block_types[i] == "LTXVideoDownBlock3D":
+                down_block = LTXVideoDownBlock3D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    num_layers=layers_per_block[i],
+                    resnet_eps=resnet_norm_eps,
+                    spatio_temporal_scale=spatio_temporal_scaling[i],
+                    is_causal=is_causal,
+                )
+            elif down_block_types[i] == "LTXVideo095DownBlock3D":
+                down_block = LTXVideo095DownBlock3D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    num_layers=layers_per_block[i],
+                    resnet_eps=resnet_norm_eps,
+                    spatio_temporal_scale=spatio_temporal_scaling[i],
+                    is_causal=is_causal,
+                    downsample_type=downsample_type[i],
+                )
+            else:
+                raise ValueError(f"Unknown down block type: {down_block_types[i]}")
 
             self.down_blocks.append(down_block)
 
@@ -697,17 +857,10 @@ class LTXVideoEncoder3d(nn.Module):
         hidden_states = self.conv_in(hidden_states)
 
         if torch.is_grad_enabled() and self.gradient_checkpointing:
-
-            def create_custom_forward(module):
-                def create_forward(*inputs):
-                    return module(*inputs)
-
-                return create_forward
-
             for down_block in self.down_blocks:
-                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), hidden_states)
+                hidden_states = self._gradient_checkpointing_func(down_block, hidden_states)
 
-            hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), hidden_states)
+            hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states)
         else:
             for down_block in self.down_blocks:
                 hidden_states = down_block(hidden_states)
@@ -828,7 +981,9 @@ class LTXVideoDecoder3d(nn.Module):
         # timestep embedding
         self.time_embedder = None
         self.scale_shift_table = None
+        self.timestep_scale_multiplier = None
         if timestep_conditioning:
+            self.timestep_scale_multiplier = nn.Parameter(torch.tensor(1000.0, dtype=torch.float32))
             self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(output_channel * 2, 0)
             self.scale_shift_table = nn.Parameter(torch.randn(2, output_channel) / output_channel**0.5)
 
@@ -837,20 +992,14 @@ class LTXVideoDecoder3d(nn.Module):
     def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
         hidden_states = self.conv_in(hidden_states)
 
-        if torch.is_grad_enabled() and self.gradient_checkpointing:
+        if self.timestep_scale_multiplier is not None:
+            temb = temb * self.timestep_scale_multiplier
 
-            def create_custom_forward(module):
-                def create_forward(*inputs):
-                    return module(*inputs)
-
-                return create_forward
-
-            hidden_states = torch.utils.checkpoint.checkpoint(
-                create_custom_forward(self.mid_block), hidden_states, temb
-            )
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states, temb)
 
             for up_block in self.up_blocks:
-                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), hidden_states, temb)
+                hidden_states = self._gradient_checkpointing_func(up_block, hidden_states, temb)
         else:
             hidden_states = self.mid_block(hidden_states, temb)
 
@@ -934,12 +1083,19 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         out_channels: int = 3,
         latent_channels: int = 128,
         block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
+        down_block_types: Tuple[str, ...] = (
+            "LTXVideoDownBlock3D",
+            "LTXVideoDownBlock3D",
+            "LTXVideoDownBlock3D",
+            "LTXVideoDownBlock3D",
+        ),
         decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
         layers_per_block: Tuple[int, ...] = (4, 3, 3, 3, 4),
         decoder_layers_per_block: Tuple[int, ...] = (4, 3, 3, 3, 4),
         spatio_temporal_scaling: Tuple[bool, ...] = (True, True, True, False),
         decoder_spatio_temporal_scaling: Tuple[bool, ...] = (True, True, True, False),
         decoder_inject_noise: Tuple[bool, ...] = (False, False, False, False, False),
+        downsample_type: Tuple[str, ...] = ("conv", "conv", "conv", "conv"),
         upsample_residual: Tuple[bool, ...] = (False, False, False, False),
         upsample_factor: Tuple[int, ...] = (1, 1, 1, 1),
         timestep_conditioning: bool = False,
@@ -949,6 +1105,8 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         scaling_factor: float = 1.0,
         encoder_causal: bool = True,
         decoder_causal: bool = False,
+        spatial_compression_ratio: int = None,
+        temporal_compression_ratio: int = None,
     ) -> None:
         super().__init__()
 
@@ -956,8 +1114,10 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             in_channels=in_channels,
             out_channels=latent_channels,
             block_out_channels=block_out_channels,
+            down_block_types=down_block_types,
             spatio_temporal_scaling=spatio_temporal_scaling,
             layers_per_block=layers_per_block,
+            downsample_type=downsample_type,
             patch_size=patch_size,
             patch_size_t=patch_size_t,
             resnet_norm_eps=resnet_norm_eps,
@@ -984,8 +1144,16 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         self.register_buffer("latents_mean", latents_mean, persistent=True)
         self.register_buffer("latents_std", latents_std, persistent=True)
 
-        self.spatial_compression_ratio = patch_size * 2 ** sum(spatio_temporal_scaling)
-        self.temporal_compression_ratio = patch_size_t * 2 ** sum(spatio_temporal_scaling)
+        self.spatial_compression_ratio = (
+            patch_size * 2 ** sum(spatio_temporal_scaling)
+            if spatial_compression_ratio is None
+            else spatial_compression_ratio
+        )
+        self.temporal_compression_ratio = (
+            patch_size_t * 2 ** sum(spatio_temporal_scaling)
+            if temporal_compression_ratio is None
+            else temporal_compression_ratio
+        )
 
         # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
         # to perform decoding of a single video latent at a time.
@@ -1010,21 +1178,21 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         # The minimal tile height and width for spatial tiling to be used
         self.tile_sample_min_height = 512
         self.tile_sample_min_width = 512
+        self.tile_sample_min_num_frames = 16
 
         # The minimal distance between two spatial tiles
         self.tile_sample_stride_height = 448
         self.tile_sample_stride_width = 448
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (LTXVideoEncoder3d, LTXVideoDecoder3d)):
-            module.gradient_checkpointing = value
+        self.tile_sample_stride_num_frames = 8
 
     def enable_tiling(
         self,
         tile_sample_min_height: Optional[int] = None,
         tile_sample_min_width: Optional[int] = None,
+        tile_sample_min_num_frames: Optional[int] = None,
         tile_sample_stride_height: Optional[float] = None,
         tile_sample_stride_width: Optional[float] = None,
+        tile_sample_stride_num_frames: Optional[float] = None,
     ) -> None:
         r"""
         Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
@@ -1046,8 +1214,10 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         self.use_tiling = True
         self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
         self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
+        self.tile_sample_min_num_frames = tile_sample_min_num_frames or self.tile_sample_min_num_frames
         self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
         self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
+        self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames
 
     def disable_tiling(self) -> None:
         r"""
@@ -1073,18 +1243,13 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
     def _encode(self, x: torch.Tensor) -> torch.Tensor:
         batch_size, num_channels, num_frames, height, width = x.shape
 
+        if self.use_framewise_decoding and num_frames > self.tile_sample_min_num_frames:
+            return self._temporal_tiled_encode(x)
+
         if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
             return self.tiled_encode(x)
 
-        if self.use_framewise_encoding:
-            # TODO(aryan): requires investigation
-            raise NotImplementedError(
-                "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                "quality issues caused by splitting inference across frame dimension. If you believe this "
-                "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-            )
-        else:
-            enc = self.encoder(x)
+        enc = self.encoder(x)
 
         return enc
 
@@ -1121,19 +1286,15 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         batch_size, num_channels, num_frames, height, width = z.shape
         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
         tile_latent_min_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+
+        if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames:
+            return self._temporal_tiled_decode(z, temb, return_dict=return_dict)
 
         if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
             return self.tiled_decode(z, temb, return_dict=return_dict)
 
-        if self.use_framewise_decoding:
-            # TODO(aryan): requires investigation
-            raise NotImplementedError(
-                "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                "quality issues caused by splitting inference across frame dimension. If you believe this "
-                "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-            )
-        else:
-            dec = self.decoder(z, temb)
+        dec = self.decoder(z, temb)
 
         if not return_dict:
             return (dec,)
@@ -1189,6 +1350,14 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         )
         return b
 
+    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
+                x / blend_extent
+            )
+        return b
+
     def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
         r"""Encode a batch of images using a tiled encoder.
 
@@ -1217,17 +1386,9 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         for i in range(0, height, self.tile_sample_stride_height):
             row = []
             for j in range(0, width, self.tile_sample_stride_width):
-                if self.use_framewise_encoding:
-                    # TODO(aryan): requires investigation
-                    raise NotImplementedError(
-                        "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                        "quality issues caused by splitting inference across frame dimension. If you believe this "
-                        "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-                    )
-                else:
-                    time = self.encoder(
-                        x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
-                    )
+                time = self.encoder(
+                    x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
+                )
 
                 row.append(time)
             rows.append(row)
@@ -1283,17 +1444,7 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         for i in range(0, height, tile_latent_stride_height):
             row = []
             for j in range(0, width, tile_latent_stride_width):
-                if self.use_framewise_decoding:
-                    # TODO(aryan): requires investigation
-                    raise NotImplementedError(
-                        "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to "
-                        "quality issues caused by splitting inference across frame dimension. If you believe this "
-                        "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls."
-                    )
-                else:
-                    time = self.decoder(
-                        z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb
-                    )
+                time = self.decoder(z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb)
 
                 row.append(time)
             rows.append(row)
@@ -1318,6 +1469,74 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
 
         return DecoderOutput(sample=dec)
 
+    def _temporal_tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
+        batch_size, num_channels, num_frames, height, width = x.shape
+        latent_num_frames = (num_frames - 1) // self.temporal_compression_ratio + 1
+
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+        tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
+        blend_num_frames = tile_latent_min_num_frames - tile_latent_stride_num_frames
+
+        row = []
+        for i in range(0, num_frames, self.tile_sample_stride_num_frames):
+            tile = x[:, :, i : i + self.tile_sample_min_num_frames + 1, :, :]
+            if self.use_tiling and (height > self.tile_sample_min_height or width > self.tile_sample_min_width):
+                tile = self.tiled_encode(tile)
+            else:
+                tile = self.encoder(tile)
+            if i > 0:
+                tile = tile[:, :, 1:, :, :]
+            row.append(tile)
+
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_num_frames)
+                result_row.append(tile[:, :, :tile_latent_stride_num_frames, :, :])
+            else:
+                result_row.append(tile[:, :, : tile_latent_stride_num_frames + 1, :, :])
+
+        enc = torch.cat(result_row, dim=2)[:, :, :latent_num_frames]
+        return enc
+
+    def _temporal_tiled_decode(
+        self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True
+    ) -> Union[DecoderOutput, torch.Tensor]:
+        batch_size, num_channels, num_frames, height, width = z.shape
+        num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
+
+        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+        tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
+        tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
+        blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames
+
+        row = []
+        for i in range(0, num_frames, tile_latent_stride_num_frames):
+            tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :]
+            if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height):
+                decoded = self.tiled_decode(tile, temb, return_dict=True).sample
+            else:
+                decoded = self.decoder(tile, temb)
+            if i > 0:
+                decoded = decoded[:, :, :-1, :, :]
+            row.append(decoded)
+
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_num_frames)
+                tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :]
+                result_row.append(tile)
+            else:
+                result_row.append(tile[:, :, : self.tile_sample_stride_num_frames + 1, :, :])
+
+        dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames]
+
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
+
     def forward(
         self,
         sample: torch.Tensor,
@@ -1334,5 +1553,5 @@ class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             z = posterior.mode()
         dec = self.decode(z, temb)
         if not return_dict:
-            return (dec,)
+            return (dec.sample,)
        return dec
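
For orientation, and not part of the diff itself: the hunks above add frame-wise temporal tiling to `AutoencoderKLLTXVideo`, exposed through the new `tile_sample_min_num_frames` / `tile_sample_stride_num_frames` arguments of `enable_tiling()` and blended across tile boundaries by the new `blend_t()` helper. Below is a minimal, hedged sketch of how those knobs might be exercised; the `Lightricks/LTX-Video` repo id, the `vae` subfolder, and toggling the `use_framewise_*` attributes directly are assumptions for illustration, not something this diff specifies.

```python
# Hedged sketch of the 0.33.0 temporal-tiling additions shown in the hunks above.
# Assumptions: the "Lightricks/LTX-Video" checkpoint layout, and that setting the
# use_framewise_* flags directly is an acceptable way to opt into frame-wise tiling.
import torch
from diffusers import AutoencoderKLLTXVideo

vae = AutoencoderKLLTXVideo.from_pretrained(
    "Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32
)
vae.enable_tiling(
    tile_sample_min_height=512,
    tile_sample_min_width=512,
    tile_sample_min_num_frames=16,    # new in 0.33.0
    tile_sample_stride_height=448,
    tile_sample_stride_width=448,
    tile_sample_stride_num_frames=8,  # new in 0.33.0
)
# Assumption: these flags gate _temporal_tiled_encode/_temporal_tiled_decode above.
vae.use_framewise_encoding = True
vae.use_framewise_decoding = True

# Dummy clip: batch x channels x frames x height x width. With more frames than
# tile_sample_min_num_frames, encoding and decoding run tile-by-tile along the
# frame axis and the overlapping frames are blended by blend_t().
video = torch.randn(1, 3, 49, 256, 256)
with torch.no_grad():
    latents = vae.encode(video).latent_dist.sample()
    frames = vae.decode(latents).sample
print(frames.shape)
```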