PyPI - diffusers - Versions diffs - 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl - Mend

diffusers 0.23.1 (py3-none-any.whl) → 0.25.0 (py3-none-any.whl)

Files changed (238) hide show

diffusers/models/resnet.py CHANGED Viewed

@@ -23,530 +23,23 @@ import torch.nn.functional as F
 from ..utils import USE_PEFT_BACKEND
 from .activations import get_activation
 from .attention_processor import SpatialNorm
+from .downsampling import (  # noqa
+    Downsample1D,
+    Downsample2D,
+    FirDownsample2D,
+    KDownsample2D,
+    downsample_2d,
+)
 from .lora import LoRACompatibleConv, LoRACompatibleLinear
 from .normalization import AdaGroupNorm
-class Upsample1D(nn.Module):
-    """A 1D upsampling layer with an optional convolution.
-    Parameters:
-        channels (`int`):
-            number of channels in the inputs and outputs.
-        use_conv (`bool`, default `False`):
-            option to use a convolution.
-        use_conv_transpose (`bool`, default `False`):
-            option to use a convolution transpose.
-        out_channels (`int`, optional):
-            number of output channels. Defaults to `channels`.
-        name (`str`, default `conv`):
-            name of the upsampling 1D layer.
-    """
-    def __init__(
-        self,
-        channels: int,
-        use_conv: bool = False,
-        use_conv_transpose: bool = False,
-        out_channels: Optional[int] = None,
-        name: str = "conv",
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_conv_transpose = use_conv_transpose
-        self.name = name
-        self.conv = None
-        if use_conv_transpose:
-            self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
-        elif use_conv:
-            self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        assert inputs.shape[1] == self.channels
-        if self.use_conv_transpose:
-            return self.conv(inputs)
-        outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
-        if self.use_conv:
-            outputs = self.conv(outputs)
-        return outputs
-class Downsample1D(nn.Module):
-    """A 1D downsampling layer with an optional convolution.
-    Parameters:
-        channels (`int`):
-            number of channels in the inputs and outputs.
-        use_conv (`bool`, default `False`):
-            option to use a convolution.
-        out_channels (`int`, optional):
-            number of output channels. Defaults to `channels`.
-        padding (`int`, default `1`):
-            padding for the convolution.
-        name (`str`, default `conv`):
-            name of the downsampling 1D layer.
-    """
-    def __init__(
-        self,
-        channels: int,
-        use_conv: bool = False,
-        out_channels: Optional[int] = None,
-        padding: int = 1,
-        name: str = "conv",
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.padding = padding
-        stride = 2
-        self.name = name
-        if use_conv:
-            self.conv = nn.Conv1d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
-        else:
-            assert self.channels == self.out_channels
-            self.conv = nn.AvgPool1d(kernel_size=stride, stride=stride)
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        assert inputs.shape[1] == self.channels
-        return self.conv(inputs)
-class Upsample2D(nn.Module):
-    """A 2D upsampling layer with an optional convolution.
-    Parameters:
-        channels (`int`):
-            number of channels in the inputs and outputs.
-        use_conv (`bool`, default `False`):
-            option to use a convolution.
-        use_conv_transpose (`bool`, default `False`):
-            option to use a convolution transpose.
-        out_channels (`int`, optional):
-            number of output channels. Defaults to `channels`.
-        name (`str`, default `conv`):
-            name of the upsampling 2D layer.
-    """
-    def __init__(
-        self,
-        channels: int,
-        use_conv: bool = False,
-        use_conv_transpose: bool = False,
-        out_channels: Optional[int] = None,
-        name: str = "conv",
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_conv_transpose = use_conv_transpose
-        self.name = name
-        conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
-        conv = None
-        if use_conv_transpose:
-            conv = nn.ConvTranspose2d(channels, self.out_channels, 4, 2, 1)
-        elif use_conv:
-            conv = conv_cls(self.channels, self.out_channels, 3, padding=1)
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if name == "conv":
-            self.conv = conv
-        else:
-            self.Conv2d_0 = conv
-    def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None, scale: float = 1.0):
-        assert hidden_states.shape[1] == self.channels
-        if self.use_conv_transpose:
-            return self.conv(hidden_states)
-        # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16
-        # TODO(Suraj): Remove this cast once the issue is fixed in PyTorch
-        # https://github.com/pytorch/pytorch/issues/86679
-        dtype = hidden_states.dtype
-        if dtype == torch.bfloat16:
-            hidden_states = hidden_states.to(torch.float32)
-        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
-        if hidden_states.shape[0] >= 64:
-            hidden_states = hidden_states.contiguous()
-        # if `output_size` is passed we force the interpolation output
-        # size and do not make use of `scale_factor=2`
-        if output_size is None:
-            hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
-        else:
-            hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
-        # If the input is bfloat16, we cast back to bfloat16
-        if dtype == torch.bfloat16:
-            hidden_states = hidden_states.to(dtype)
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if self.use_conv:
-            if self.name == "conv":
-                if isinstance(self.conv, LoRACompatibleConv) and not USE_PEFT_BACKEND:
-                    hidden_states = self.conv(hidden_states, scale)
-                else:
-                    hidden_states = self.conv(hidden_states)
-            else:
-                if isinstance(self.Conv2d_0, LoRACompatibleConv) and not USE_PEFT_BACKEND:
-                    hidden_states = self.Conv2d_0(hidden_states, scale)
-                else:
-                    hidden_states = self.Conv2d_0(hidden_states)
-        return hidden_states
-class Downsample2D(nn.Module):
-    """A 2D downsampling layer with an optional convolution.
-    Parameters:
-        channels (`int`):
-            number of channels in the inputs and outputs.
-        use_conv (`bool`, default `False`):
-            option to use a convolution.
-        out_channels (`int`, optional):
-            number of output channels. Defaults to `channels`.
-        padding (`int`, default `1`):
-            padding for the convolution.
-        name (`str`, default `conv`):
-            name of the downsampling 2D layer.
-    """
-    def __init__(
-        self,
-        channels: int,
-        use_conv: bool = False,
-        out_channels: Optional[int] = None,
-        padding: int = 1,
-        name: str = "conv",
-    ):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.padding = padding
-        stride = 2
-        self.name = name
-        conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
-        if use_conv:
-            conv = conv_cls(self.channels, self.out_channels, 3, stride=stride, padding=padding)
-        else:
-            assert self.channels == self.out_channels
-            conv = nn.AvgPool2d(kernel_size=stride, stride=stride)
-        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
-        if name == "conv":
-            self.Conv2d_0 = conv
-            self.conv = conv
-        elif name == "Conv2d_0":
-            self.conv = conv
-        else:
-            self.conv = conv
-    def forward(self, hidden_states, scale: float = 1.0):
-        assert hidden_states.shape[1] == self.channels
-        if self.use_conv and self.padding == 0:
-            pad = (0, 1, 0, 1)
-            hidden_states = F.pad(hidden_states, pad, mode="constant", value=0)
-        assert hidden_states.shape[1] == self.channels
-        if not USE_PEFT_BACKEND:
-            if isinstance(self.conv, LoRACompatibleConv):
-                hidden_states = self.conv(hidden_states, scale)
-            else:
-                hidden_states = self.conv(hidden_states)
-        else:
-            hidden_states = self.conv(hidden_states)
-        return hidden_states
-class FirUpsample2D(nn.Module):
-    """A 2D FIR upsampling layer with an optional convolution.
-    Parameters:
-        channels (`int`):
-            number of channels in the inputs and outputs.
-        use_conv (`bool`, default `False`):
-            option to use a convolution.
-        out_channels (`int`, optional):
-            number of output channels. Defaults to `channels`.
-        fir_kernel (`tuple`, default `(1, 3, 3, 1)`):
-            kernel for the FIR filter.
-    """
-    def __init__(
-        self,
-        channels: int = None,
-        out_channels: Optional[int] = None,
-        use_conv: bool = False,
-        fir_kernel: Tuple[int, int, int, int] = (1, 3, 3, 1),
-    ):
-        super().__init__()
-        out_channels = out_channels if out_channels else channels
-        if use_conv:
-            self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1)
-        self.use_conv = use_conv
-        self.fir_kernel = fir_kernel
-        self.out_channels = out_channels
-    def _upsample_2d(
-        self,
-        hidden_states: torch.Tensor,
-        weight: Optional[torch.Tensor] = None,
-        kernel: Optional[torch.FloatTensor] = None,
-        factor: int = 2,
-        gain: float = 1,
-    ) -> torch.Tensor:
-        """Fused `upsample_2d()` followed by `Conv2d()`.
-        Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
-        efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of
-        arbitrary order.
-        Args:
-            hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
-            weight: Weight tensor of the shape `[filterH, filterW, inChannels,
-                outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`.
-            kernel: FIR filter of the shape `[firH, firW]` or `[firN]`
-                (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
-            factor: Integer upsampling factor (default: 2).
-            gain: Scaling factor for signal magnitude (default: 1.0).
-        Returns:
-            output: Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same
-            datatype as `hidden_states`.
-        """
-        assert isinstance(factor, int) and factor >= 1
-        # Setup filter kernel.
-        if kernel is None:
-            kernel = [1] * factor
-        # setup kernel
-        kernel = torch.tensor(kernel, dtype=torch.float32)
-        if kernel.ndim == 1:
-            kernel = torch.outer(kernel, kernel)
-        kernel /= torch.sum(kernel)
-        kernel = kernel * (gain * (factor**2))
-        if self.use_conv:
-            convH = weight.shape[2]
-            convW = weight.shape[3]
-            inC = weight.shape[1]
-            pad_value = (kernel.shape[0] - factor) - (convW - 1)
-            stride = (factor, factor)
-            # Determine data dimensions.
-            output_shape = (
-                (hidden_states.shape[2] - 1) * factor + convH,
-                (hidden_states.shape[3] - 1) * factor + convW,
-            )
-            output_padding = (
-                output_shape[0] - (hidden_states.shape[2] - 1) * stride[0] - convH,
-                output_shape[1] - (hidden_states.shape[3] - 1) * stride[1] - convW,
-            )
-            assert output_padding[0] >= 0 and output_padding[1] >= 0
-            num_groups = hidden_states.shape[1] // inC
-            # Transpose weights.
-            weight = torch.reshape(weight, (num_groups, -1, inC, convH, convW))
-            weight = torch.flip(weight, dims=[3, 4]).permute(0, 2, 1, 3, 4)
-            weight = torch.reshape(weight, (num_groups * inC, -1, convH, convW))
-            inverse_conv = F.conv_transpose2d(
-                hidden_states, weight, stride=stride, output_padding=output_padding, padding=0
-            )
-            output = upfirdn2d_native(
-                inverse_conv,
-                torch.tensor(kernel, device=inverse_conv.device),
-                pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1),
-            )
-        else:
-            pad_value = kernel.shape[0] - factor
-            output = upfirdn2d_native(
-                hidden_states,
-                torch.tensor(kernel, device=hidden_states.device),
-                up=factor,
-                pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
-            )
-        return output
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.use_conv:
-            height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel)
-            height = height + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
-        else:
-            height = self._upsample_2d(hidden_states, kernel=self.fir_kernel, factor=2)
-        return height
-class FirDownsample2D(nn.Module):
-    """A 2D FIR downsampling layer with an optional convolution.
-    Parameters:
-        channels (`int`):
-            number of channels in the inputs and outputs.
-        use_conv (`bool`, default `False`):
-            option to use a convolution.
-        out_channels (`int`, optional):
-            number of output channels. Defaults to `channels`.
-        fir_kernel (`tuple`, default `(1, 3, 3, 1)`):
-            kernel for the FIR filter.
-    """
-    def __init__(
-        self,
-        channels: int = None,
-        out_channels: Optional[int] = None,
-        use_conv: bool = False,
-        fir_kernel: Tuple[int, int, int, int] = (1, 3, 3, 1),
-    ):
-        super().__init__()
-        out_channels = out_channels if out_channels else channels
-        if use_conv:
-            self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1)
-        self.fir_kernel = fir_kernel
-        self.use_conv = use_conv
-        self.out_channels = out_channels
-    def _downsample_2d(
-        self,
-        hidden_states: torch.Tensor,
-        weight: Optional[torch.Tensor] = None,
-        kernel: Optional[torch.FloatTensor] = None,
-        factor: int = 2,
-        gain: float = 1,
-    ) -> torch.Tensor:
-        """Fused `Conv2d()` followed by `downsample_2d()`.
-        Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
-        efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of
-        arbitrary order.
-        Args:
-            hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
-            weight:
-                Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be
-                performed by `inChannels = x.shape[0] // numGroups`.
-            kernel: FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] *
-            factor`, which corresponds to average pooling.
-            factor: Integer downsampling factor (default: 2).
-            gain: Scaling factor for signal magnitude (default: 1.0).
-        Returns:
-            output: Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and
-            same datatype as `x`.
-        """
-        assert isinstance(factor, int) and factor >= 1
-        if kernel is None:
-            kernel = [1] * factor
-        # setup kernel
-        kernel = torch.tensor(kernel, dtype=torch.float32)
-        if kernel.ndim == 1:
-            kernel = torch.outer(kernel, kernel)
-        kernel /= torch.sum(kernel)
-        kernel = kernel * gain
-        if self.use_conv:
-            _, _, convH, convW = weight.shape
-            pad_value = (kernel.shape[0] - factor) + (convW - 1)
-            stride_value = [factor, factor]
-            upfirdn_input = upfirdn2d_native(
-                hidden_states,
-                torch.tensor(kernel, device=hidden_states.device),
-                pad=((pad_value + 1) // 2, pad_value // 2),
-            )
-            output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0)
-        else:
-            pad_value = kernel.shape[0] - factor
-            output = upfirdn2d_native(
-                hidden_states,
-                torch.tensor(kernel, device=hidden_states.device),
-                down=factor,
-                pad=((pad_value + 1) // 2, pad_value // 2),
-            )
-        return output
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.use_conv:
-            downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel)
-            hidden_states = downsample_input + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
-        else:
-            hidden_states = self._downsample_2d(hidden_states, kernel=self.fir_kernel, factor=2)
-        return hidden_states
-# downsample/upsample layer used in k-upscaler, might be able to use FirDownsample2D/DirUpsample2D instead
-class KDownsample2D(nn.Module):
-    r"""A 2D K-downsampling layer.
-    Parameters:
-        pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use.
-    """
-    def __init__(self, pad_mode: str = "reflect"):
-        super().__init__()
-        self.pad_mode = pad_mode
-        kernel_1d = torch.tensor([[1 / 8, 3 / 8, 3 / 8, 1 / 8]])
-        self.pad = kernel_1d.shape[1] // 2 - 1
-        self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False)
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        inputs = F.pad(inputs, (self.pad,) * 4, self.pad_mode)
-        weight = inputs.new_zeros([inputs.shape[1], inputs.shape[1], self.kernel.shape[0], self.kernel.shape[1]])
-        indices = torch.arange(inputs.shape[1], device=inputs.device)
-        kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1)
-        weight[indices, indices] = kernel
-        return F.conv2d(inputs, weight, stride=2)
-class KUpsample2D(nn.Module):
-    r"""A 2D K-upsampling layer.
-    Parameters:
-        pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use.
-    """
-    def __init__(self, pad_mode: str = "reflect"):
-        super().__init__()
-        self.pad_mode = pad_mode
-        kernel_1d = torch.tensor([[1 / 8, 3 / 8, 3 / 8, 1 / 8]]) * 2
-        self.pad = kernel_1d.shape[1] // 2 - 1
-        self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False)
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        inputs = F.pad(inputs, ((self.pad + 1) // 2,) * 4, self.pad_mode)
-        weight = inputs.new_zeros([inputs.shape[1], inputs.shape[1], self.kernel.shape[0], self.kernel.shape[1]])
-        indices = torch.arange(inputs.shape[1], device=inputs.device)
-        kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1)
-        weight[indices, indices] = kernel
-        return F.conv_transpose2d(inputs, weight, stride=2, padding=self.pad * 2 + 1)
+from .upsampling import (  # noqa
+    FirUpsample2D,
+    KUpsample2D,
+    Upsample1D,
+    Upsample2D,
+    upfirdn2d_native,
+    upsample_2d,
+)
 class ResnetBlock2D(nn.Module):
@@ -679,10 +172,20 @@ class ResnetBlock2D(nn.Module):
         self.conv_shortcut = None
         if self.use_in_shortcut:
             self.conv_shortcut = conv_cls(
-                in_channels, conv_2d_out_channels, kernel_size=1, stride=1, padding=0, bias=conv_shortcut_bias
+                in_channels,
+                conv_2d_out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=conv_shortcut_bias,
             )
-    def forward(self, input_tensor, temb, scale: float = 1.0):
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
         hidden_states = input_tensor
         if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
@@ -778,7 +281,7 @@ class Conv1dBlock(nn.Module):
         out_channels (`int`): Number of output channels.
         kernel_size (`int` or `tuple`): Size of the convolving kernel.
         n_groups (`int`, default `8`): Number of groups to separate the channels into.
-        activation (`str`, defaults `mish`): Name of the activation function.
+        activation (`str`, defaults to `mish`): Name of the activation function.
     """
     def __init__(
@@ -852,128 +355,6 @@ class ResidualTemporalBlock1D(nn.Module):
         return out + self.residual_conv(inputs)
-def upsample_2d(
-    hidden_states: torch.Tensor, kernel: Optional[torch.FloatTensor] = None, factor: int = 2, gain: float = 1
-) -> torch.Tensor:
-    r"""Upsample2D a batch of 2D images with the given filter.
-    Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given
-    filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified
-    `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its shape is
-    a: multiple of the upsampling factor.
-    Args:
-        hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
-        kernel: FIR filter of the shape `[firH, firW]` or `[firN]`
-          (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
-        factor: Integer upsampling factor (default: 2).
-        gain: Scaling factor for signal magnitude (default: 1.0).
-    Returns:
-        output: Tensor of the shape `[N, C, H * factor, W * factor]`
-    """
-    assert isinstance(factor, int) and factor >= 1
-    if kernel is None:
-        kernel = [1] * factor
-    kernel = torch.tensor(kernel, dtype=torch.float32)
-    if kernel.ndim == 1:
-        kernel = torch.outer(kernel, kernel)
-    kernel /= torch.sum(kernel)
-    kernel = kernel * (gain * (factor**2))
-    pad_value = kernel.shape[0] - factor
-    output = upfirdn2d_native(
-        hidden_states,
-        kernel.to(device=hidden_states.device),
-        up=factor,
-        pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
-    )
-    return output
-def downsample_2d(
-    hidden_states: torch.Tensor, kernel: Optional[torch.FloatTensor] = None, factor: int = 2, gain: float = 1
-) -> torch.Tensor:
-    r"""Downsample2D a batch of 2D images with the given filter.
-    Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the
-    given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the
-    specified `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its
-    shape is a multiple of the downsampling factor.
-    Args:
-        hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
-        kernel: FIR filter of the shape `[firH, firW]` or `[firN]`
-          (separable). The default is `[1] * factor`, which corresponds to average pooling.
-        factor: Integer downsampling factor (default: 2).
-        gain: Scaling factor for signal magnitude (default: 1.0).
-    Returns:
-        output: Tensor of the shape `[N, C, H // factor, W // factor]`
-    """
-    assert isinstance(factor, int) and factor >= 1
-    if kernel is None:
-        kernel = [1] * factor
-    kernel = torch.tensor(kernel, dtype=torch.float32)
-    if kernel.ndim == 1:
-        kernel = torch.outer(kernel, kernel)
-    kernel /= torch.sum(kernel)
-    kernel = kernel * gain
-    pad_value = kernel.shape[0] - factor
-    output = upfirdn2d_native(
-        hidden_states, kernel.to(device=hidden_states.device), down=factor, pad=((pad_value + 1) // 2, pad_value // 2)
-    )
-    return output
-def upfirdn2d_native(
-    tensor: torch.Tensor, kernel: torch.Tensor, up: int = 1, down: int = 1, pad: Tuple[int, int] = (0, 0)
-) -> torch.Tensor:
-    up_x = up_y = up
-    down_x = down_y = down
-    pad_x0 = pad_y0 = pad[0]
-    pad_x1 = pad_y1 = pad[1]
-    _, channel, in_h, in_w = tensor.shape
-    tensor = tensor.reshape(-1, in_h, in_w, 1)
-    _, in_h, in_w, minor = tensor.shape
-    kernel_h, kernel_w = kernel.shape
-    out = tensor.view(-1, in_h, 1, in_w, 1, minor)
-    out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
-    out = out.view(-1, in_h * up_y, in_w * up_x, minor)
-    out = F.pad(out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)])
-    out = out.to(tensor.device)  # Move back to mps if necessary
-    out = out[
-        :,
-        max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0),
-        max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0),
-        :,
-    ]
-    out = out.permute(0, 3, 1, 2)
-    out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1])
-    w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
-    out = F.conv2d(out, w)
-    out = out.reshape(
-        -1,
-        minor,
-        in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
-        in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
-    )
-    out = out.permute(0, 2, 3, 1)
-    out = out[:, ::down_y, ::down_x, :]
-    out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
-    out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
-    return out.view(-1, channel, out_h, out_w)
 class TemporalConvLayer(nn.Module):
     """
     Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
@@ -985,7 +366,13 @@ class TemporalConvLayer(nn.Module):
         dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
     """
-    def __init__(self, in_dim: int, out_dim: Optional[int] = None, dropout: float = 0.0):
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: Optional[int] = None,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+    ):
         super().__init__()
         out_dim = out_dim or in_dim
         self.in_dim = in_dim
@@ -993,22 +380,24 @@ class TemporalConvLayer(nn.Module):
         # conv layers
         self.conv1 = nn.Sequential(
-            nn.GroupNorm(32, in_dim), nn.SiLU(), nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0))
+            nn.GroupNorm(norm_num_groups, in_dim),
+            nn.SiLU(),
+            nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0)),
         )
         self.conv2 = nn.Sequential(
-            nn.GroupNorm(32, out_dim),
+            nn.GroupNorm(norm_num_groups, out_dim),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
         )
         self.conv3 = nn.Sequential(
-            nn.GroupNorm(32, out_dim),
+            nn.GroupNorm(norm_num_groups, out_dim),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
         )
         self.conv4 = nn.Sequential(
-            nn.GroupNorm(32, out_dim),
+            nn.GroupNorm(norm_num_groups, out_dim),
             nn.SiLU(),
             nn.Dropout(dropout),
             nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
@@ -1035,3 +424,261 @@ class TemporalConvLayer(nn.Module):
             (hidden_states.shape[0] * hidden_states.shape[2], -1) + hidden_states.shape[3:]
         )
         return hidden_states
+class TemporalResnetBlock(nn.Module):
+    r"""
+    A temporal Resnet block.
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, defaults to `None`):
+            The number of output channels for the first conv3d layer. If `None`, same as `in_channels`.
+        temb_channels (`int`, *optional*, defaults to `512`): The number of channels in the timestep embedding.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        temb_channels: int = 512,
+        eps: float = 1e-6,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        kernel_size = (3, 1, 1)
+        padding = [k // 2 for k in kernel_size]
+        self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = nn.Conv3d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=padding,
+        )
+        if temb_channels is not None:
+            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+        else:
+            self.time_emb_proj = None
+        self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=eps, affine=True)
+        self.dropout = torch.nn.Dropout(0.0)
+        self.conv2 = nn.Conv3d(
+            out_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=padding,
+        )
+        self.nonlinearity = get_activation("silu")
+        self.use_in_shortcut = self.in_channels != out_channels
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = nn.Conv3d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+    def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor:
+        hidden_states = input_tensor
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+        if self.time_emb_proj is not None:
+            temb = self.nonlinearity(temb)
+            temb = self.time_emb_proj(temb)[:, :, :, None, None]
+            temb = temb.permute(0, 2, 1, 3, 4)
+            hidden_states = hidden_states + temb
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+        output_tensor = input_tensor + hidden_states
+        return output_tensor
+# VideoResBlock
+class SpatioTemporalResBlock(nn.Module):
+    r"""
+    A SpatioTemporal Resnet block.
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, defaults to `None`):
+            The number of output channels for the first conv2d layer. If `None`, same as `in_channels`.
+        temb_channels (`int`, *optional*, defaults to `512`): The number of channels in the timestep embedding.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resnet.
+        temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
+        merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
+        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
+            The merge strategy to use for the temporal mixing.
+        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
+            If `True`, switch the spatial and temporal mixing.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        temb_channels: int = 512,
+        eps: float = 1e-6,
+        temporal_eps: Optional[float] = None,
+        merge_factor: float = 0.5,
+        merge_strategy="learned_with_images",
+        switch_spatial_to_temporal_mix: bool = False,
+    ):
+        super().__init__()
+        self.spatial_res_block = ResnetBlock2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            eps=eps,
+        )
+        self.temporal_res_block = TemporalResnetBlock(
+            in_channels=out_channels if out_channels is not None else in_channels,
+            out_channels=out_channels if out_channels is not None else in_channels,
+            temb_channels=temb_channels,
+            eps=temporal_eps if temporal_eps is not None else eps,
+        )
+        self.time_mixer = AlphaBlender(
+            alpha=merge_factor,
+            merge_strategy=merge_strategy,
+            switch_spatial_to_temporal_mix=switch_spatial_to_temporal_mix,
+        )
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ):
+        num_frames = image_only_indicator.shape[-1]
+        hidden_states = self.spatial_res_block(hidden_states, temb)
+        batch_frames, channels, height, width = hidden_states.shape
+        batch_size = batch_frames // num_frames
+        hidden_states_mix = (
+            hidden_states[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4)
+        )
+        hidden_states = (
+            hidden_states[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4)
+        )
+        if temb is not None:
+            temb = temb.reshape(batch_size, num_frames, -1)
+        hidden_states = self.temporal_res_block(hidden_states, temb)
+        hidden_states = self.time_mixer(
+            x_spatial=hidden_states_mix,
+            x_temporal=hidden_states,
+            image_only_indicator=image_only_indicator,
+        )
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(batch_frames, channels, height, width)
+        return hidden_states
+class AlphaBlender(nn.Module):
+    r"""
+    A module to blend spatial and temporal features.
+    Parameters:
+        alpha (`float`): The initial value of the blending factor.
+        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
+            The merge strategy to use for the temporal mixing.
+        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
+            If `True`, switch the spatial and temporal mixing.
+    """
+    strategies = ["learned", "fixed", "learned_with_images"]
+    def __init__(
+        self,
+        alpha: float,
+        merge_strategy: str = "learned_with_images",
+        switch_spatial_to_temporal_mix: bool = False,
+    ):
+        super().__init__()
+        self.merge_strategy = merge_strategy
+        self.switch_spatial_to_temporal_mix = switch_spatial_to_temporal_mix  # For TemporalVAE
+        if merge_strategy not in self.strategies:
+            raise ValueError(f"merge_strategy needs to be in {self.strategies}")
+        if self.merge_strategy == "fixed":
+            self.register_buffer("mix_factor", torch.Tensor([alpha]))
+        elif self.merge_strategy == "learned" or self.merge_strategy == "learned_with_images":
+            self.register_parameter("mix_factor", torch.nn.Parameter(torch.Tensor([alpha])))
+        else:
+            raise ValueError(f"Unknown merge strategy {self.merge_strategy}")
+    def get_alpha(self, image_only_indicator: torch.Tensor, ndims: int) -> torch.Tensor:
+        if self.merge_strategy == "fixed":
+            alpha = self.mix_factor
+        elif self.merge_strategy == "learned":
+            alpha = torch.sigmoid(self.mix_factor)
+        elif self.merge_strategy == "learned_with_images":
+            if image_only_indicator is None:
+                raise ValueError("Please provide image_only_indicator to use learned_with_images merge strategy")
+            alpha = torch.where(
+                image_only_indicator.bool(),
+                torch.ones(1, 1, device=image_only_indicator.device),
+                torch.sigmoid(self.mix_factor)[..., None],
+            )
+            # (batch, channel, frames, height, width)
+            if ndims == 5:
+                alpha = alpha[:, None, :, None, None]
+            # (batch*frames, height*width, channels)
+            elif ndims == 3:
+                alpha = alpha.reshape(-1)[:, None, None]
+            else:
+                raise ValueError(f"Unexpected ndims {ndims}. Dimensions should be 3 or 5")
+        else:
+            raise NotImplementedError
+        return alpha
+    def forward(
+        self,
+        x_spatial: torch.Tensor,
+        x_temporal: torch.Tensor,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        alpha = self.get_alpha(image_only_indicator, x_spatial.ndim)
+        alpha = alpha.to(x_spatial.dtype)
+        if self.switch_spatial_to_temporal_mix:
+            alpha = 1.0 - alpha
+        x = alpha * x_spatial + (1.0 - alpha) * x_temporal
+        return x

diffusers 0.23.1__py3-none-any.whl → 0.25.0__py3-none-any.whl

diffusers 0.23.1py3-none-any.whl → 0.25.0py3-none-any.whl