diffusers 0.29.2__py3-none-any.whl → 0.30.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. diffusers/__init__.py +94 -3
  2. diffusers/commands/env.py +1 -5
  3. diffusers/configuration_utils.py +4 -9
  4. diffusers/dependency_versions_table.py +2 -2
  5. diffusers/image_processor.py +1 -2
  6. diffusers/loaders/__init__.py +17 -2
  7. diffusers/loaders/ip_adapter.py +10 -7
  8. diffusers/loaders/lora_base.py +752 -0
  9. diffusers/loaders/lora_pipeline.py +2252 -0
  10. diffusers/loaders/peft.py +213 -5
  11. diffusers/loaders/single_file.py +3 -14
  12. diffusers/loaders/single_file_model.py +31 -10
  13. diffusers/loaders/single_file_utils.py +293 -8
  14. diffusers/loaders/textual_inversion.py +1 -6
  15. diffusers/loaders/unet.py +23 -208
  16. diffusers/models/__init__.py +20 -0
  17. diffusers/models/activations.py +22 -0
  18. diffusers/models/attention.py +386 -7
  19. diffusers/models/attention_processor.py +1937 -629
  20. diffusers/models/autoencoders/__init__.py +2 -0
  21. diffusers/models/autoencoders/autoencoder_kl.py +14 -3
  22. diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1271 -0
  23. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  24. diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
  25. diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
  26. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  27. diffusers/models/autoencoders/vq_model.py +4 -4
  28. diffusers/models/controlnet.py +2 -3
  29. diffusers/models/controlnet_hunyuan.py +401 -0
  30. diffusers/models/controlnet_sd3.py +11 -11
  31. diffusers/models/controlnet_sparsectrl.py +789 -0
  32. diffusers/models/controlnet_xs.py +40 -10
  33. diffusers/models/downsampling.py +68 -0
  34. diffusers/models/embeddings.py +403 -36
  35. diffusers/models/model_loading_utils.py +1 -3
  36. diffusers/models/modeling_flax_utils.py +1 -6
  37. diffusers/models/modeling_utils.py +4 -16
  38. diffusers/models/normalization.py +203 -12
  39. diffusers/models/transformers/__init__.py +6 -0
  40. diffusers/models/transformers/auraflow_transformer_2d.py +543 -0
  41. diffusers/models/transformers/cogvideox_transformer_3d.py +485 -0
  42. diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
  43. diffusers/models/transformers/latte_transformer_3d.py +327 -0
  44. diffusers/models/transformers/lumina_nextdit2d.py +340 -0
  45. diffusers/models/transformers/pixart_transformer_2d.py +102 -1
  46. diffusers/models/transformers/prior_transformer.py +1 -1
  47. diffusers/models/transformers/stable_audio_transformer.py +458 -0
  48. diffusers/models/transformers/transformer_flux.py +455 -0
  49. diffusers/models/transformers/transformer_sd3.py +18 -4
  50. diffusers/models/unets/unet_1d_blocks.py +1 -1
  51. diffusers/models/unets/unet_2d_condition.py +8 -1
  52. diffusers/models/unets/unet_3d_blocks.py +51 -920
  53. diffusers/models/unets/unet_3d_condition.py +4 -1
  54. diffusers/models/unets/unet_i2vgen_xl.py +4 -1
  55. diffusers/models/unets/unet_kandinsky3.py +1 -1
  56. diffusers/models/unets/unet_motion_model.py +1330 -84
  57. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  58. diffusers/models/unets/unet_stable_cascade.py +1 -3
  59. diffusers/models/unets/uvit_2d.py +1 -1
  60. diffusers/models/upsampling.py +64 -0
  61. diffusers/models/vq_model.py +8 -4
  62. diffusers/optimization.py +1 -1
  63. diffusers/pipelines/__init__.py +100 -3
  64. diffusers/pipelines/animatediff/__init__.py +4 -0
  65. diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
  66. diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
  67. diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
  68. diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
  69. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
  70. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  71. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
  72. diffusers/pipelines/aura_flow/__init__.py +48 -0
  73. diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
  74. diffusers/pipelines/auto_pipeline.py +97 -19
  75. diffusers/pipelines/cogvideo/__init__.py +48 -0
  76. diffusers/pipelines/cogvideo/pipeline_cogvideox.py +746 -0
  77. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  78. diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
  79. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
  80. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
  81. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
  82. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
  83. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
  84. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  85. diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
  86. diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
  87. diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
  88. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
  89. diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
  90. diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
  91. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
  92. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
  93. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
  94. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
  95. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
  96. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
  97. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
  98. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
  99. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
  100. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
  101. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
  102. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
  103. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
  104. diffusers/pipelines/flux/__init__.py +47 -0
  105. diffusers/pipelines/flux/pipeline_flux.py +749 -0
  106. diffusers/pipelines/flux/pipeline_output.py +21 -0
  107. diffusers/pipelines/free_init_utils.py +2 -0
  108. diffusers/pipelines/free_noise_utils.py +236 -0
  109. diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
  110. diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
  111. diffusers/pipelines/kolors/__init__.py +54 -0
  112. diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
  113. diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
  114. diffusers/pipelines/kolors/pipeline_output.py +21 -0
  115. diffusers/pipelines/kolors/text_encoder.py +889 -0
  116. diffusers/pipelines/kolors/tokenizer.py +334 -0
  117. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
  118. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
  119. diffusers/pipelines/latte/__init__.py +48 -0
  120. diffusers/pipelines/latte/pipeline_latte.py +881 -0
  121. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
  122. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
  123. diffusers/pipelines/lumina/__init__.py +48 -0
  124. diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
  125. diffusers/pipelines/pag/__init__.py +67 -0
  126. diffusers/pipelines/pag/pag_utils.py +237 -0
  127. diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
  128. diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
  129. diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
  130. diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
  131. diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
  132. diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
  133. diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
  134. diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
  135. diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
  136. diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
  137. diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
  138. diffusers/pipelines/pia/pipeline_pia.py +30 -37
  139. diffusers/pipelines/pipeline_flax_utils.py +4 -9
  140. diffusers/pipelines/pipeline_loading_utils.py +0 -3
  141. diffusers/pipelines/pipeline_utils.py +2 -14
  142. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
  143. diffusers/pipelines/stable_audio/__init__.py +50 -0
  144. diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
  145. diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
  146. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
  147. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  148. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
  149. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
  150. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
  151. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
  152. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
  153. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
  154. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
  155. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
  156. diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
  157. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
  158. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
  159. diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
  160. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
  161. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
  162. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
  163. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
  164. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
  165. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
  166. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
  167. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
  168. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
  169. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
  170. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
  171. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
  172. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
  173. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
  174. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
  175. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
  176. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
  177. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
  178. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
  179. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
  180. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
  181. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
  182. diffusers/schedulers/__init__.py +8 -0
  183. diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
  184. diffusers/schedulers/scheduling_ddim.py +1 -1
  185. diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
  186. diffusers/schedulers/scheduling_ddpm.py +1 -1
  187. diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
  188. diffusers/schedulers/scheduling_deis_multistep.py +2 -2
  189. diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
  190. diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
  191. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
  192. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
  193. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
  194. diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
  195. diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
  196. diffusers/schedulers/scheduling_ipndm.py +1 -1
  197. diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
  198. diffusers/schedulers/scheduling_utils.py +1 -3
  199. diffusers/schedulers/scheduling_utils_flax.py +1 -3
  200. diffusers/training_utils.py +99 -14
  201. diffusers/utils/__init__.py +2 -2
  202. diffusers/utils/dummy_pt_objects.py +210 -0
  203. diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
  204. diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
  205. diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
  206. diffusers/utils/dynamic_modules_utils.py +1 -11
  207. diffusers/utils/export_utils.py +50 -6
  208. diffusers/utils/hub_utils.py +45 -42
  209. diffusers/utils/import_utils.py +37 -15
  210. diffusers/utils/loading_utils.py +80 -3
  211. diffusers/utils/testing_utils.py +11 -8
  212. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/METADATA +73 -83
  213. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/RECORD +217 -164
  214. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/WHEEL +1 -1
  215. diffusers/loaders/autoencoder.py +0 -146
  216. diffusers/loaders/controlnet.py +0 -136
  217. diffusers/loaders/lora.py +0 -1728
  218. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/LICENSE +0 -0
  219. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/entry_points.txt +0 -0
  220. {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/top_level.txt +0 -0
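Highlights from the list above: the monolithic `diffusers/loaders/lora.py` (-1728 lines) is split into `lora_base.py` and `lora_pipeline.py`, `loaders/autoencoder.py` and `loaders/controlnet.py` are removed, and a wave of new model families lands (Flux, CogVideoX, AuraFlow, Kolors, Latte, Lumina, Stable Audio, SparseCtrl, plus the PAG pipeline family). A quick smoke test of a few of the new names (a minimal sketch; it assumes these pipeline classes are re-exported from the package root, as diffusers conventionally does for new pipelines):

    import diffusers

    print(diffusers.__version__)  # expected: "0.30.1"

    # Class names taken from the new pipeline files listed above;
    # their availability at the package root is an assumption of this sketch.
    from diffusers import CogVideoXPipeline, FluxPipeline, StableAudioPipeline  # noqa: F401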
@@ -27,17 +27,58 @@ from ..resnet import (
     TemporalConvLayer,
     Upsample2D,
 )
-from ..transformers.dual_transformer_2d import DualTransformer2DModel
 from ..transformers.transformer_2d import Transformer2DModel
 from ..transformers.transformer_temporal import (
     TransformerSpatioTemporalModel,
     TransformerTemporalModel,
 )
+from .unet_motion_model import (
+    CrossAttnDownBlockMotion,
+    CrossAttnUpBlockMotion,
+    DownBlockMotion,
+    UNetMidBlockCrossAttnMotion,
+    UpBlockMotion,
+)


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


+class DownBlockMotion(DownBlockMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `DownBlockMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import DownBlockMotion` instead."
+        deprecate("DownBlockMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class CrossAttnDownBlockMotion(CrossAttnDownBlockMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `CrossAttnDownBlockMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import CrossAttnDownBlockMotion` instead."
+        deprecate("CrossAttnDownBlockMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class UpBlockMotion(UpBlockMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `UpBlockMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import UpBlockMotion` instead."
+        deprecate("UpBlockMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class CrossAttnUpBlockMotion(CrossAttnUpBlockMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `CrossAttnUpBlockMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import CrossAttnUpBlockMotion` instead."
+        deprecate("CrossAttnUpBlockMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
+class UNetMidBlockCrossAttnMotion(UNetMidBlockCrossAttnMotion):
+    def __init__(self, *args, **kwargs):
+        deprecation_message = "Importing `UNetMidBlockCrossAttnMotion` from `diffusers.models.unets.unet_3d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_motion_model import UNetMidBlockCrossAttnMotion` instead."
+        deprecate("UNetMidBlockCrossAttnMotion", "1.0.0", deprecation_message)
+        super().__init__(*args, **kwargs)
+
+
 def get_down_block(
     down_block_type: str,
     num_layers: int,
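The five shims above keep the old import location alive: each subclass shadows the name just imported from `unet_motion_model`, warns once at construction time via `deprecate(..., "1.0.0", ...)`, and otherwise behaves exactly like the relocated class. What downstream code observes, as a minimal sketch (the constructor arguments are illustrative, not a verified minimal set):

    import warnings

    # Old path: resolves to the deprecation shim defined in the hunk above.
    from diffusers.models.unets.unet_3d_blocks import DownBlockMotion as OldDownBlockMotion
    # New path: the relocated implementation, imported silently.
    from diffusers.models.unets.unet_motion_model import DownBlockMotion

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        OldDownBlockMotion(in_channels=32, out_channels=32, temb_channels=128)
    assert any("deprecated" in str(w.message) for w in caught)

    DownBlockMotion(in_channels=32, out_channels=32, temb_channels=128)  # no warning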
@@ -58,12 +99,12 @@ def get_down_block(
     resnet_time_scale_shift: str = "default",
     temporal_num_attention_heads: int = 8,
     temporal_max_seq_length: int = 32,
-    transformer_layers_per_block: int = 1,
+    transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    dropout: float = 0.0,
 ) -> Union[
     "DownBlock3D",
     "CrossAttnDownBlock3D",
-    "DownBlockMotion",
-    "CrossAttnDownBlockMotion",
     "DownBlockSpatioTemporal",
     "CrossAttnDownBlockSpatioTemporal",
 ]:
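Both `get_down_block` and (below) `get_up_block` now accept `transformer_layers_per_block` and the new `temporal_transformer_layers_per_block` as either an int or a per-layer tuple. The usual way such a `Union[int, Tuple[int]]` knob is consumed is to broadcast the scalar; a self-contained sketch of that normalization (the helper name is hypothetical, not part of diffusers):

    from typing import Tuple, Union

    def to_per_layer(value: Union[int, Tuple[int, ...]], num_layers: int) -> Tuple[int, ...]:
        # Broadcast a scalar to one entry per layer; validate explicit tuples.
        if isinstance(value, int):
            return (value,) * num_layers
        if len(value) != num_layers:
            raise ValueError(f"expected {num_layers} entries, got {len(value)}")
        return tuple(value)

    assert to_per_layer(2, 3) == (2, 2, 2)
    assert to_per_layer((1, 2, 4), 3) == (1, 2, 4)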
@@ -79,6 +120,7 @@ def get_down_block(
             resnet_groups=resnet_groups,
             downsample_padding=downsample_padding,
             resnet_time_scale_shift=resnet_time_scale_shift,
+            dropout=dropout,
         )
     elif down_block_type == "CrossAttnDownBlock3D":
         if cross_attention_dim is None:
@@ -100,45 +142,7 @@
             only_cross_attention=only_cross_attention,
             upcast_attention=upcast_attention,
             resnet_time_scale_shift=resnet_time_scale_shift,
-        )
-    if down_block_type == "DownBlockMotion":
-        return DownBlockMotion(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            resnet_groups=resnet_groups,
-            downsample_padding=downsample_padding,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            temporal_num_attention_heads=temporal_num_attention_heads,
-            temporal_max_seq_length=temporal_max_seq_length,
-        )
-    elif down_block_type == "CrossAttnDownBlockMotion":
-        if cross_attention_dim is None:
-            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockMotion")
-        return CrossAttnDownBlockMotion(
-            num_layers=num_layers,
-            transformer_layers_per_block=transformer_layers_per_block,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            temb_channels=temb_channels,
-            add_downsample=add_downsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            resnet_groups=resnet_groups,
-            downsample_padding=downsample_padding,
-            cross_attention_dim=cross_attention_dim,
-            num_attention_heads=num_attention_heads,
-            dual_cross_attention=dual_cross_attention,
-            use_linear_projection=use_linear_projection,
-            only_cross_attention=only_cross_attention,
-            upcast_attention=upcast_attention,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            temporal_num_attention_heads=temporal_num_attention_heads,
-            temporal_max_seq_length=temporal_max_seq_length,
+            dropout=dropout,
         )
     elif down_block_type == "DownBlockSpatioTemporal":
         # added for SDV
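With the two Motion branches deleted, `unet_3d_blocks.get_down_block` dispatches only the 3D and spatio-temporal block types; motion blocks are now constructed through `unet_motion_model`. Requesting a Motion type here should fall through to the factory's unknown-type error (a sketch; the argument set and the exact error message are assumptions, not taken from the diff):

    from diffusers.models.unets.unet_3d_blocks import get_down_block

    try:
        get_down_block(
            "DownBlockMotion",
            num_layers=2,
            in_channels=32,
            out_channels=32,
            temb_channels=128,
            add_downsample=True,
            resnet_eps=1e-6,
            resnet_act_fn="swish",
            num_attention_heads=1,
        )
    except ValueError as err:
        print(err)  # e.g. "DownBlockMotion does not exist."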
@@ -189,13 +193,12 @@ def get_up_block(
     temporal_num_attention_heads: int = 8,
     temporal_cross_attention_dim: Optional[int] = None,
     temporal_max_seq_length: int = 32,
-    transformer_layers_per_block: int = 1,
+    transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+    temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
     dropout: float = 0.0,
 ) -> Union[
     "UpBlock3D",
     "CrossAttnUpBlock3D",
-    "UpBlockMotion",
-    "CrossAttnUpBlockMotion",
     "UpBlockSpatioTemporal",
     "CrossAttnUpBlockSpatioTemporal",
 ]:
@@ -212,6 +215,7 @@
             resnet_groups=resnet_groups,
             resnet_time_scale_shift=resnet_time_scale_shift,
             resolution_idx=resolution_idx,
+            dropout=dropout,
         )
     elif up_block_type == "CrossAttnUpBlock3D":
         if cross_attention_dim is None:
@@ -234,47 +238,7 @@
             upcast_attention=upcast_attention,
             resnet_time_scale_shift=resnet_time_scale_shift,
             resolution_idx=resolution_idx,
-        )
-    if up_block_type == "UpBlockMotion":
-        return UpBlockMotion(
-            num_layers=num_layers,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            resnet_groups=resnet_groups,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            resolution_idx=resolution_idx,
-            temporal_num_attention_heads=temporal_num_attention_heads,
-            temporal_max_seq_length=temporal_max_seq_length,
-        )
-    elif up_block_type == "CrossAttnUpBlockMotion":
-        if cross_attention_dim is None:
-            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockMotion")
-        return CrossAttnUpBlockMotion(
-            num_layers=num_layers,
-            transformer_layers_per_block=transformer_layers_per_block,
-            in_channels=in_channels,
-            out_channels=out_channels,
-            prev_output_channel=prev_output_channel,
-            temb_channels=temb_channels,
-            add_upsample=add_upsample,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            resnet_groups=resnet_groups,
-            cross_attention_dim=cross_attention_dim,
-            num_attention_heads=num_attention_heads,
-            dual_cross_attention=dual_cross_attention,
-            use_linear_projection=use_linear_projection,
-            only_cross_attention=only_cross_attention,
-            upcast_attention=upcast_attention,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            resolution_idx=resolution_idx,
-            temporal_num_attention_heads=temporal_num_attention_heads,
-            temporal_max_seq_length=temporal_max_seq_length,
+            dropout=dropout,
         )
     elif up_block_type == "UpBlockSpatioTemporal":
         # added for SDV
@@ -932,839 +896,6 @@ class UpBlock3D(nn.Module):
         return hidden_states


-class DownBlockMotion(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor: float = 1.0,
-        add_downsample: bool = True,
-        downsample_padding: int = 1,
-        temporal_num_attention_heads: int = 1,
-        temporal_cross_attention_dim: Optional[int] = None,
-        temporal_max_seq_length: int = 32,
-    ):
-        super().__init__()
-        resnets = []
-        motion_modules = []
-
-        for i in range(num_layers):
-            in_channels = in_channels if i == 0 else out_channels
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-            motion_modules.append(
-                TransformerTemporalModel(
-                    num_attention_heads=temporal_num_attention_heads,
-                    in_channels=out_channels,
-                    norm_num_groups=resnet_groups,
-                    cross_attention_dim=temporal_cross_attention_dim,
-                    attention_bias=False,
-                    activation_fn="geglu",
-                    positional_embeddings="sinusoidal",
-                    num_positional_embeddings=temporal_max_seq_length,
-                    attention_head_dim=out_channels // temporal_num_attention_heads,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-        self.motion_modules = nn.ModuleList(motion_modules)
-
-        if add_downsample:
-            self.downsamplers = nn.ModuleList(
-                [
-                    Downsample2D(
-                        out_channels,
-                        use_conv=True,
-                        out_channels=out_channels,
-                        padding=downsample_padding,
-                        name="op",
-                    )
-                ]
-            )
-        else:
-            self.downsamplers = None
-
-        self.gradient_checkpointing = False
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        temb: Optional[torch.Tensor] = None,
-        num_frames: int = 1,
-        *args,
-        **kwargs,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
-        if len(args) > 0 or kwargs.get("scale", None) is not None:
-            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
-            deprecate("scale", "1.0.0", deprecation_message)
-
-        output_states = ()
-
-        blocks = zip(self.resnets, self.motion_modules)
-        for resnet, motion_module in blocks:
-            if self.training and self.gradient_checkpointing:
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        return module(*inputs)
-
-                    return custom_forward
-
-                if is_torch_version(">=", "1.11.0"):
-                    hidden_states = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(resnet),
-                        hidden_states,
-                        temb,
-                        use_reentrant=False,
-                    )
-                else:
-                    hidden_states = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(resnet), hidden_states, temb
-                    )
-
-            else:
-                hidden_states = resnet(hidden_states, temb)
-            hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
-
-            output_states = output_states + (hidden_states,)
-
-        if self.downsamplers is not None:
-            for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
-
-            output_states = output_states + (hidden_states,)
-
-        return hidden_states, output_states
-
-
-class CrossAttnDownBlockMotion(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        transformer_layers_per_block: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        num_attention_heads: int = 1,
-        cross_attention_dim: int = 1280,
-        output_scale_factor: float = 1.0,
-        downsample_padding: int = 1,
-        add_downsample: bool = True,
-        dual_cross_attention: bool = False,
-        use_linear_projection: bool = False,
-        only_cross_attention: bool = False,
-        upcast_attention: bool = False,
-        attention_type: str = "default",
-        temporal_cross_attention_dim: Optional[int] = None,
-        temporal_num_attention_heads: int = 8,
-        temporal_max_seq_length: int = 32,
-    ):
-        super().__init__()
-        resnets = []
-        attentions = []
-        motion_modules = []
-
-        self.has_cross_attention = True
-        self.num_attention_heads = num_attention_heads
-
-        for i in range(num_layers):
-            in_channels = in_channels if i == 0 else out_channels
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-            if not dual_cross_attention:
-                attentions.append(
-                    Transformer2DModel(
-                        num_attention_heads,
-                        out_channels // num_attention_heads,
-                        in_channels=out_channels,
-                        num_layers=transformer_layers_per_block,
-                        cross_attention_dim=cross_attention_dim,
-                        norm_num_groups=resnet_groups,
-                        use_linear_projection=use_linear_projection,
-                        only_cross_attention=only_cross_attention,
-                        upcast_attention=upcast_attention,
-                        attention_type=attention_type,
-                    )
-                )
-            else:
-                attentions.append(
-                    DualTransformer2DModel(
-                        num_attention_heads,
-                        out_channels // num_attention_heads,
-                        in_channels=out_channels,
-                        num_layers=1,
-                        cross_attention_dim=cross_attention_dim,
-                        norm_num_groups=resnet_groups,
-                    )
-                )
-
-            motion_modules.append(
-                TransformerTemporalModel(
-                    num_attention_heads=temporal_num_attention_heads,
-                    in_channels=out_channels,
-                    norm_num_groups=resnet_groups,
-                    cross_attention_dim=temporal_cross_attention_dim,
-                    attention_bias=False,
-                    activation_fn="geglu",
-                    positional_embeddings="sinusoidal",
-                    num_positional_embeddings=temporal_max_seq_length,
-                    attention_head_dim=out_channels // temporal_num_attention_heads,
-                )
-            )
-
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-        self.motion_modules = nn.ModuleList(motion_modules)
-
-        if add_downsample:
-            self.downsamplers = nn.ModuleList(
-                [
-                    Downsample2D(
-                        out_channels,
-                        use_conv=True,
-                        out_channels=out_channels,
-                        padding=downsample_padding,
-                        name="op",
-                    )
-                ]
-            )
-        else:
-            self.downsamplers = None
-
-        self.gradient_checkpointing = False
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        temb: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        num_frames: int = 1,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        additional_residuals: Optional[torch.Tensor] = None,
-    ):
-        if cross_attention_kwargs is not None:
-            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
-
-        output_states = ()
-
-        blocks = list(zip(self.resnets, self.attentions, self.motion_modules))
-        for i, (resnet, attn, motion_module) in enumerate(blocks):
-            if self.training and self.gradient_checkpointing:
-
-                def create_custom_forward(module, return_dict=None):
-                    def custom_forward(*inputs):
-                        if return_dict is not None:
-                            return module(*inputs, return_dict=return_dict)
-                        else:
-                            return module(*inputs)
-
-                    return custom_forward
-
-                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(resnet),
-                    hidden_states,
-                    temb,
-                    **ckpt_kwargs,
-                )
-                hidden_states = attn(
-                    hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    attention_mask=attention_mask,
-                    encoder_attention_mask=encoder_attention_mask,
-                    return_dict=False,
-                )[0]
-            else:
-                hidden_states = resnet(hidden_states, temb)
-                hidden_states = attn(
-                    hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    attention_mask=attention_mask,
-                    encoder_attention_mask=encoder_attention_mask,
-                    return_dict=False,
-                )[0]
-            hidden_states = motion_module(
-                hidden_states,
-                num_frames=num_frames,
-            )[0]
-
-            # apply additional residuals to the output of the last pair of resnet and attention blocks
-            if i == len(blocks) - 1 and additional_residuals is not None:
-                hidden_states = hidden_states + additional_residuals
-
-            output_states = output_states + (hidden_states,)
-
-        if self.downsamplers is not None:
-            for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
-
-            output_states = output_states + (hidden_states,)
-
-        return hidden_states, output_states
-
-
-class CrossAttnUpBlockMotion(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        prev_output_channel: int,
-        temb_channels: int,
-        resolution_idx: Optional[int] = None,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        transformer_layers_per_block: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        num_attention_heads: int = 1,
-        cross_attention_dim: int = 1280,
-        output_scale_factor: float = 1.0,
-        add_upsample: bool = True,
-        dual_cross_attention: bool = False,
-        use_linear_projection: bool = False,
-        only_cross_attention: bool = False,
-        upcast_attention: bool = False,
-        attention_type: str = "default",
-        temporal_cross_attention_dim: Optional[int] = None,
-        temporal_num_attention_heads: int = 8,
-        temporal_max_seq_length: int = 32,
-    ):
-        super().__init__()
-        resnets = []
-        attentions = []
-        motion_modules = []
-
-        self.has_cross_attention = True
-        self.num_attention_heads = num_attention_heads
-
-        for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
-            resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=resnet_in_channels + res_skip_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-            if not dual_cross_attention:
-                attentions.append(
-                    Transformer2DModel(
-                        num_attention_heads,
-                        out_channels // num_attention_heads,
-                        in_channels=out_channels,
-                        num_layers=transformer_layers_per_block,
-                        cross_attention_dim=cross_attention_dim,
-                        norm_num_groups=resnet_groups,
-                        use_linear_projection=use_linear_projection,
-                        only_cross_attention=only_cross_attention,
-                        upcast_attention=upcast_attention,
-                        attention_type=attention_type,
-                    )
-                )
-            else:
-                attentions.append(
-                    DualTransformer2DModel(
-                        num_attention_heads,
-                        out_channels // num_attention_heads,
-                        in_channels=out_channels,
-                        num_layers=1,
-                        cross_attention_dim=cross_attention_dim,
-                        norm_num_groups=resnet_groups,
-                    )
-                )
-            motion_modules.append(
-                TransformerTemporalModel(
-                    num_attention_heads=temporal_num_attention_heads,
-                    in_channels=out_channels,
-                    norm_num_groups=resnet_groups,
-                    cross_attention_dim=temporal_cross_attention_dim,
-                    attention_bias=False,
-                    activation_fn="geglu",
-                    positional_embeddings="sinusoidal",
-                    num_positional_embeddings=temporal_max_seq_length,
-                    attention_head_dim=out_channels // temporal_num_attention_heads,
-                )
-            )
-
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-        self.motion_modules = nn.ModuleList(motion_modules)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
-        else:
-            self.upsamplers = None
-
-        self.gradient_checkpointing = False
-        self.resolution_idx = resolution_idx
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
-        temb: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        upsample_size: Optional[int] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        num_frames: int = 1,
-    ) -> torch.Tensor:
-        if cross_attention_kwargs is not None:
-            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
-
-        is_freeu_enabled = (
-            getattr(self, "s1", None)
-            and getattr(self, "s2", None)
-            and getattr(self, "b1", None)
-            and getattr(self, "b2", None)
-        )
-
-        blocks = zip(self.resnets, self.attentions, self.motion_modules)
-        for resnet, attn, motion_module in blocks:
-            # pop res hidden states
-            res_hidden_states = res_hidden_states_tuple[-1]
-            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
-            # FreeU: Only operate on the first two stages
-            if is_freeu_enabled:
-                hidden_states, res_hidden_states = apply_freeu(
-                    self.resolution_idx,
-                    hidden_states,
-                    res_hidden_states,
-                    s1=self.s1,
-                    s2=self.s2,
-                    b1=self.b1,
-                    b2=self.b2,
-                )
-
-            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
-
-            if self.training and self.gradient_checkpointing:
-
-                def create_custom_forward(module, return_dict=None):
-                    def custom_forward(*inputs):
-                        if return_dict is not None:
-                            return module(*inputs, return_dict=return_dict)
-                        else:
-                            return module(*inputs)
-
-                    return custom_forward
-
-                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(resnet),
-                    hidden_states,
-                    temb,
-                    **ckpt_kwargs,
-                )
-                hidden_states = attn(
-                    hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    attention_mask=attention_mask,
-                    encoder_attention_mask=encoder_attention_mask,
-                    return_dict=False,
-                )[0]
-            else:
-                hidden_states = resnet(hidden_states, temb)
-                hidden_states = attn(
-                    hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    attention_mask=attention_mask,
-                    encoder_attention_mask=encoder_attention_mask,
-                    return_dict=False,
-                )[0]
-            hidden_states = motion_module(
-                hidden_states,
-                num_frames=num_frames,
-            )[0]
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states, upsample_size)
-
-        return hidden_states
-
-
-class UpBlockMotion(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        prev_output_channel: int,
-        out_channels: int,
-        temb_channels: int,
-        resolution_idx: Optional[int] = None,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        output_scale_factor: float = 1.0,
-        add_upsample: bool = True,
-        temporal_norm_num_groups: int = 32,
-        temporal_cross_attention_dim: Optional[int] = None,
-        temporal_num_attention_heads: int = 8,
-        temporal_max_seq_length: int = 32,
-    ):
-        super().__init__()
-        resnets = []
-        motion_modules = []
-
-        for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
-            resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=resnet_in_channels + res_skip_channels,
-                    out_channels=out_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-
-            motion_modules.append(
-                TransformerTemporalModel(
-                    num_attention_heads=temporal_num_attention_heads,
-                    in_channels=out_channels,
-                    norm_num_groups=temporal_norm_num_groups,
-                    cross_attention_dim=temporal_cross_attention_dim,
-                    attention_bias=False,
-                    activation_fn="geglu",
-                    positional_embeddings="sinusoidal",
-                    num_positional_embeddings=temporal_max_seq_length,
-                    attention_head_dim=out_channels // temporal_num_attention_heads,
-                )
-            )
-
-        self.resnets = nn.ModuleList(resnets)
-        self.motion_modules = nn.ModuleList(motion_modules)
-
-        if add_upsample:
-            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
-        else:
-            self.upsamplers = None
-
-        self.gradient_checkpointing = False
-        self.resolution_idx = resolution_idx
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        res_hidden_states_tuple: Tuple[torch.Tensor, ...],
-        temb: Optional[torch.Tensor] = None,
-        upsample_size=None,
-        num_frames: int = 1,
-        *args,
-        **kwargs,
-    ) -> torch.Tensor:
-        if len(args) > 0 or kwargs.get("scale", None) is not None:
-            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
-            deprecate("scale", "1.0.0", deprecation_message)
-
-        is_freeu_enabled = (
-            getattr(self, "s1", None)
-            and getattr(self, "s2", None)
-            and getattr(self, "b1", None)
-            and getattr(self, "b2", None)
-        )
-
-        blocks = zip(self.resnets, self.motion_modules)
-
-        for resnet, motion_module in blocks:
-            # pop res hidden states
-            res_hidden_states = res_hidden_states_tuple[-1]
-            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
-            # FreeU: Only operate on the first two stages
-            if is_freeu_enabled:
-                hidden_states, res_hidden_states = apply_freeu(
-                    self.resolution_idx,
-                    hidden_states,
-                    res_hidden_states,
-                    s1=self.s1,
-                    s2=self.s2,
-                    b1=self.b1,
-                    b2=self.b2,
-                )
-
-            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
-
-            if self.training and self.gradient_checkpointing:
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        return module(*inputs)
-
-                    return custom_forward
-
-                if is_torch_version(">=", "1.11.0"):
-                    hidden_states = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(resnet),
-                        hidden_states,
-                        temb,
-                        use_reentrant=False,
-                    )
-                else:
-                    hidden_states = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(resnet), hidden_states, temb
-                    )
-
-            else:
-                hidden_states = resnet(hidden_states, temb)
-            hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
-
-        if self.upsamplers is not None:
-            for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states, upsample_size)
-
-        return hidden_states
-
-
-class UNetMidBlockCrossAttnMotion(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        temb_channels: int,
-        dropout: float = 0.0,
-        num_layers: int = 1,
-        transformer_layers_per_block: int = 1,
-        resnet_eps: float = 1e-6,
-        resnet_time_scale_shift: str = "default",
-        resnet_act_fn: str = "swish",
-        resnet_groups: int = 32,
-        resnet_pre_norm: bool = True,
-        num_attention_heads: int = 1,
-        output_scale_factor: float = 1.0,
-        cross_attention_dim: int = 1280,
-        dual_cross_attention: float = False,
-        use_linear_projection: float = False,
-        upcast_attention: float = False,
-        attention_type: str = "default",
-        temporal_num_attention_heads: int = 1,
-        temporal_cross_attention_dim: Optional[int] = None,
-        temporal_max_seq_length: int = 32,
-    ):
-        super().__init__()
-
-        self.has_cross_attention = True
-        self.num_attention_heads = num_attention_heads
-        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
-
-        # there is always at least one resnet
-        resnets = [
-            ResnetBlock2D(
-                in_channels=in_channels,
-                out_channels=in_channels,
-                temb_channels=temb_channels,
-                eps=resnet_eps,
-                groups=resnet_groups,
-                dropout=dropout,
-                time_embedding_norm=resnet_time_scale_shift,
-                non_linearity=resnet_act_fn,
-                output_scale_factor=output_scale_factor,
-                pre_norm=resnet_pre_norm,
-            )
-        ]
-        attentions = []
-        motion_modules = []
-
-        for _ in range(num_layers):
-            if not dual_cross_attention:
-                attentions.append(
-                    Transformer2DModel(
-                        num_attention_heads,
-                        in_channels // num_attention_heads,
-                        in_channels=in_channels,
-                        num_layers=transformer_layers_per_block,
-                        cross_attention_dim=cross_attention_dim,
-                        norm_num_groups=resnet_groups,
-                        use_linear_projection=use_linear_projection,
-                        upcast_attention=upcast_attention,
-                        attention_type=attention_type,
-                    )
-                )
-            else:
-                attentions.append(
-                    DualTransformer2DModel(
-                        num_attention_heads,
-                        in_channels // num_attention_heads,
-                        in_channels=in_channels,
-                        num_layers=1,
-                        cross_attention_dim=cross_attention_dim,
-                        norm_num_groups=resnet_groups,
-                    )
-                )
-            resnets.append(
-                ResnetBlock2D(
-                    in_channels=in_channels,
-                    out_channels=in_channels,
-                    temb_channels=temb_channels,
-                    eps=resnet_eps,
-                    groups=resnet_groups,
-                    dropout=dropout,
-                    time_embedding_norm=resnet_time_scale_shift,
-                    non_linearity=resnet_act_fn,
-                    output_scale_factor=output_scale_factor,
-                    pre_norm=resnet_pre_norm,
-                )
-            )
-            motion_modules.append(
-                TransformerTemporalModel(
-                    num_attention_heads=temporal_num_attention_heads,
-                    attention_head_dim=in_channels // temporal_num_attention_heads,
-                    in_channels=in_channels,
-                    norm_num_groups=resnet_groups,
-                    cross_attention_dim=temporal_cross_attention_dim,
-                    attention_bias=False,
-                    positional_embeddings="sinusoidal",
-                    num_positional_embeddings=temporal_max_seq_length,
-                    activation_fn="geglu",
-                )
-            )
-
-        self.attentions = nn.ModuleList(attentions)
-        self.resnets = nn.ModuleList(resnets)
-        self.motion_modules = nn.ModuleList(motion_modules)
-
-        self.gradient_checkpointing = False
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        temb: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        num_frames: int = 1,
-    ) -> torch.Tensor:
-        if cross_attention_kwargs is not None:
-            if cross_attention_kwargs.get("scale", None) is not None:
-                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
-
-        hidden_states = self.resnets[0](hidden_states, temb)
-
-        blocks = zip(self.attentions, self.resnets[1:], self.motion_modules)
-        for attn, resnet, motion_module in blocks:
-            if self.training and self.gradient_checkpointing:
-
-                def create_custom_forward(module, return_dict=None):
-                    def custom_forward(*inputs):
-                        if return_dict is not None:
-                            return module(*inputs, return_dict=return_dict)
-                        else:
-                            return module(*inputs)
-
-                    return custom_forward
-
-                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-                hidden_states = attn(
-                    hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    attention_mask=attention_mask,
-                    encoder_attention_mask=encoder_attention_mask,
-                    return_dict=False,
-                )[0]
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(motion_module),
-                    hidden_states,
-                    temb,
-                    **ckpt_kwargs,
-                )
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(resnet),
-                    hidden_states,
-                    temb,
-                    **ckpt_kwargs,
-                )
-            else:
-                hidden_states = attn(
-                    hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    attention_mask=attention_mask,
-                    encoder_attention_mask=encoder_attention_mask,
-                    return_dict=False,
-                )[0]
-                hidden_states = motion_module(
-                    hidden_states,
-                    num_frames=num_frames,
-                )[0]
-                hidden_states = resnet(hidden_states, temb)
-
-        return hidden_states
-
-
 class MidBlockTemporalDecoder(nn.Module):
     def __init__(
         self,
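Every removed `forward()` above uses the same gradient-checkpointing idiom: wrap the submodule in a closure so positional tensors flow through `torch.utils.checkpoint`, and switch to non-reentrant checkpointing when the running torch supports it. The pattern in isolation (a simplified sketch; `checkpointed` is a hypothetical helper, not a diffusers API):

    import torch
    import torch.utils.checkpoint

    from diffusers.utils import is_torch_version

    def checkpointed(module, *inputs):
        # Recompute `module`'s activations during backward instead of storing them.
        def custom_forward(*args):
            return module(*args)

        if is_torch_version(">=", "1.11.0"):
            # Non-reentrant variant, available from torch 1.11 onward.
            return torch.utils.checkpoint.checkpoint(custom_forward, *inputs, use_reentrant=False)
        return torch.utils.checkpoint.checkpoint(custom_forward, *inputs)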