diffusers 0.26.2__py3-none-any.whl → 0.27.0__py3-none-any.whl

Files changed (299)
  1. diffusers/__init__.py +20 -1
  2. diffusers/commands/__init__.py +1 -1
  3. diffusers/commands/diffusers_cli.py +1 -1
  4. diffusers/commands/env.py +1 -1
  5. diffusers/commands/fp16_safetensors.py +1 -1
  6. diffusers/configuration_utils.py +7 -3
  7. diffusers/dependency_versions_check.py +1 -1
  8. diffusers/dependency_versions_table.py +2 -2
  9. diffusers/experimental/rl/value_guided_sampling.py +1 -1
  10. diffusers/image_processor.py +110 -4
  11. diffusers/loaders/autoencoder.py +28 -8
  12. diffusers/loaders/controlnet.py +17 -8
  13. diffusers/loaders/ip_adapter.py +86 -23
  14. diffusers/loaders/lora.py +105 -310
  15. diffusers/loaders/lora_conversion_utils.py +1 -1
  16. diffusers/loaders/peft.py +1 -1
  17. diffusers/loaders/single_file.py +51 -12
  18. diffusers/loaders/single_file_utils.py +278 -49
  19. diffusers/loaders/textual_inversion.py +23 -4
  20. diffusers/loaders/unet.py +195 -41
  21. diffusers/loaders/utils.py +1 -1
  22. diffusers/models/__init__.py +3 -1
  23. diffusers/models/activations.py +9 -9
  24. diffusers/models/attention.py +26 -36
  25. diffusers/models/attention_flax.py +1 -1
  26. diffusers/models/attention_processor.py +171 -114
  27. diffusers/models/autoencoders/autoencoder_asym_kl.py +1 -1
  28. diffusers/models/autoencoders/autoencoder_kl.py +3 -1
  29. diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
  30. diffusers/models/autoencoders/autoencoder_tiny.py +4 -2
  31. diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
  32. diffusers/models/autoencoders/vae.py +1 -1
  33. diffusers/models/controlnet.py +1 -1
  34. diffusers/models/controlnet_flax.py +1 -1
  35. diffusers/models/downsampling.py +8 -12
  36. diffusers/models/dual_transformer_2d.py +1 -1
  37. diffusers/models/embeddings.py +3 -4
  38. diffusers/models/embeddings_flax.py +1 -1
  39. diffusers/models/lora.py +33 -10
  40. diffusers/models/modeling_flax_pytorch_utils.py +1 -1
  41. diffusers/models/modeling_flax_utils.py +1 -1
  42. diffusers/models/modeling_pytorch_flax_utils.py +1 -1
  43. diffusers/models/modeling_utils.py +4 -6
  44. diffusers/models/normalization.py +1 -1
  45. diffusers/models/resnet.py +31 -58
  46. diffusers/models/resnet_flax.py +1 -1
  47. diffusers/models/t5_film_transformer.py +1 -1
  48. diffusers/models/transformer_2d.py +1 -1
  49. diffusers/models/transformer_temporal.py +1 -1
  50. diffusers/models/transformers/dual_transformer_2d.py +1 -1
  51. diffusers/models/transformers/t5_film_transformer.py +1 -1
  52. diffusers/models/transformers/transformer_2d.py +29 -31
  53. diffusers/models/transformers/transformer_temporal.py +1 -1
  54. diffusers/models/unet_1d.py +1 -1
  55. diffusers/models/unet_1d_blocks.py +1 -1
  56. diffusers/models/unet_2d.py +1 -1
  57. diffusers/models/unet_2d_blocks.py +1 -1
  58. diffusers/models/unet_2d_condition.py +1 -1
  59. diffusers/models/unets/__init__.py +1 -0
  60. diffusers/models/unets/unet_1d.py +1 -1
  61. diffusers/models/unets/unet_1d_blocks.py +1 -1
  62. diffusers/models/unets/unet_2d.py +4 -4
  63. diffusers/models/unets/unet_2d_blocks.py +238 -98
  64. diffusers/models/unets/unet_2d_blocks_flax.py +1 -1
  65. diffusers/models/unets/unet_2d_condition.py +420 -323
  66. diffusers/models/unets/unet_2d_condition_flax.py +21 -12
  67. diffusers/models/unets/unet_3d_blocks.py +50 -40
  68. diffusers/models/unets/unet_3d_condition.py +47 -8
  69. diffusers/models/unets/unet_i2vgen_xl.py +75 -30
  70. diffusers/models/unets/unet_kandinsky3.py +1 -1
  71. diffusers/models/unets/unet_motion_model.py +48 -8
  72. diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
  73. diffusers/models/unets/unet_stable_cascade.py +610 -0
  74. diffusers/models/unets/uvit_2d.py +1 -1
  75. diffusers/models/upsampling.py +10 -16
  76. diffusers/models/vae_flax.py +1 -1
  77. diffusers/models/vq_model.py +1 -1
  78. diffusers/optimization.py +1 -1
  79. diffusers/pipelines/__init__.py +26 -0
  80. diffusers/pipelines/amused/pipeline_amused.py +1 -1
  81. diffusers/pipelines/amused/pipeline_amused_img2img.py +1 -1
  82. diffusers/pipelines/amused/pipeline_amused_inpaint.py +1 -1
  83. diffusers/pipelines/animatediff/pipeline_animatediff.py +162 -417
  84. diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +165 -137
  85. diffusers/pipelines/animatediff/pipeline_output.py +7 -6
  86. diffusers/pipelines/audioldm/pipeline_audioldm.py +3 -19
  87. diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
  88. diffusers/pipelines/audioldm2/pipeline_audioldm2.py +3 -3
  89. diffusers/pipelines/auto_pipeline.py +7 -16
  90. diffusers/pipelines/blip_diffusion/blip_image_processing.py +1 -1
  91. diffusers/pipelines/blip_diffusion/modeling_blip2.py +1 -1
  92. diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +2 -2
  93. diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +2 -2
  94. diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
  95. diffusers/pipelines/controlnet/pipeline_controlnet.py +90 -90
  96. diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +2 -2
  97. diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +98 -90
  98. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +92 -90
  99. diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +145 -70
  100. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +126 -89
  101. diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +108 -96
  102. diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
  103. diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +1 -1
  104. diffusers/pipelines/ddim/pipeline_ddim.py +1 -1
  105. diffusers/pipelines/ddpm/pipeline_ddpm.py +1 -1
  106. diffusers/pipelines/deepfloyd_if/pipeline_if.py +4 -4
  107. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +4 -4
  108. diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +5 -5
  109. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +4 -4
  110. diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +5 -5
  111. diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +5 -5
  112. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +10 -120
  113. diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +10 -91
  114. diffusers/pipelines/deprecated/audio_diffusion/mel.py +1 -1
  115. diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py +1 -1
  116. diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +1 -1
  117. diffusers/pipelines/deprecated/pndm/pipeline_pndm.py +1 -1
  118. diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +1 -1
  119. diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py +1 -1
  120. diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py +1 -1
  121. diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py +1 -1
  122. diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py +1 -1
  123. diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +1 -1
  124. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +5 -4
  125. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +5 -4
  126. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +7 -22
  127. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +5 -39
  128. diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +5 -5
  129. diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +1 -1
  130. diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +31 -22
  131. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +1 -1
  132. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +1 -1
  133. diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +1 -2
  134. diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +1 -1
  135. diffusers/pipelines/dit/pipeline_dit.py +1 -1
  136. diffusers/pipelines/free_init_utils.py +184 -0
  137. diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +22 -104
  138. diffusers/pipelines/kandinsky/pipeline_kandinsky.py +1 -1
  139. diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +1 -1
  140. diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +1 -1
  141. diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +2 -2
  142. diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +1 -1
  143. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +1 -1
  144. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -1
  145. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +1 -1
  146. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +1 -1
  147. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +1 -1
  148. diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +2 -2
  149. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +104 -93
  150. diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +112 -74
  151. diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +1 -1
  152. diffusers/pipelines/ledits_pp/__init__.py +55 -0
  153. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +1505 -0
  154. diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +1797 -0
  155. diffusers/pipelines/ledits_pp/pipeline_output.py +43 -0
  156. diffusers/pipelines/musicldm/pipeline_musicldm.py +3 -19
  157. diffusers/pipelines/onnx_utils.py +1 -1
  158. diffusers/pipelines/paint_by_example/image_encoder.py +1 -1
  159. diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +3 -3
  160. diffusers/pipelines/pia/pipeline_pia.py +168 -327
  161. diffusers/pipelines/pipeline_flax_utils.py +1 -1
  162. diffusers/pipelines/pipeline_loading_utils.py +508 -0
  163. diffusers/pipelines/pipeline_utils.py +188 -534
  164. diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +56 -10
  165. diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +3 -3
  166. diffusers/pipelines/shap_e/camera.py +1 -1
  167. diffusers/pipelines/shap_e/pipeline_shap_e.py +1 -1
  168. diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +1 -1
  169. diffusers/pipelines/shap_e/renderer.py +1 -1
  170. diffusers/pipelines/stable_cascade/__init__.py +50 -0
  171. diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +482 -0
  172. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +311 -0
  173. diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +638 -0
  174. diffusers/pipelines/stable_diffusion/clip_image_project_model.py +1 -1
  175. diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +4 -1
  176. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
  177. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py +2 -2
  178. diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py +1 -1
  179. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +1 -1
  180. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +1 -1
  181. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +1 -1
  182. diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +1 -1
  183. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +90 -146
  184. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +5 -4
  185. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +4 -32
  186. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +92 -119
  187. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +92 -119
  188. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +13 -59
  189. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +3 -31
  190. diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +5 -33
  191. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +5 -21
  192. diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +7 -21
  193. diffusers/pipelines/stable_diffusion/safety_checker.py +1 -1
  194. diffusers/pipelines/stable_diffusion/safety_checker_flax.py +1 -1
  195. diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +1 -1
  196. diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +5 -21
  197. diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +9 -38
  198. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -34
  199. diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +6 -35
  200. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +7 -6
  201. diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +4 -124
  202. diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +282 -80
  203. diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +94 -46
  204. diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +3 -3
  205. diffusers/pipelines/stable_diffusion_safe/safety_checker.py +1 -1
  206. diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +6 -22
  207. diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py +1 -1
  208. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +96 -148
  209. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +98 -154
  210. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +98 -153
  211. diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +25 -87
  212. diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +89 -80
  213. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +5 -49
  214. diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +80 -88
  215. diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +8 -6
  216. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +15 -86
  217. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +20 -93
  218. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +5 -5
  219. diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +3 -19
  220. diffusers/pipelines/unclip/pipeline_unclip.py +1 -1
  221. diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +1 -1
  222. diffusers/pipelines/unclip/text_proj.py +1 -1
  223. diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +35 -35
  224. diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +1 -1
  225. diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py +4 -21
  226. diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py +2 -2
  227. diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +4 -5
  228. diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +8 -8
  229. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +1 -1
  230. diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +2 -2
  231. diffusers/schedulers/__init__.py +7 -1
  232. diffusers/schedulers/deprecated/scheduling_karras_ve.py +1 -1
  233. diffusers/schedulers/deprecated/scheduling_sde_vp.py +1 -1
  234. diffusers/schedulers/scheduling_consistency_models.py +42 -19
  235. diffusers/schedulers/scheduling_ddim.py +2 -4
  236. diffusers/schedulers/scheduling_ddim_flax.py +13 -5
  237. diffusers/schedulers/scheduling_ddim_inverse.py +2 -4
  238. diffusers/schedulers/scheduling_ddim_parallel.py +2 -4
  239. diffusers/schedulers/scheduling_ddpm.py +2 -4
  240. diffusers/schedulers/scheduling_ddpm_flax.py +1 -1
  241. diffusers/schedulers/scheduling_ddpm_parallel.py +2 -4
  242. diffusers/schedulers/scheduling_ddpm_wuerstchen.py +1 -1
  243. diffusers/schedulers/scheduling_deis_multistep.py +46 -19
  244. diffusers/schedulers/scheduling_dpmsolver_multistep.py +107 -21
  245. diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py +1 -1
  246. diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +9 -7
  247. diffusers/schedulers/scheduling_dpmsolver_sde.py +35 -35
  248. diffusers/schedulers/scheduling_dpmsolver_singlestep.py +52 -21
  249. diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +683 -0
  250. diffusers/schedulers/scheduling_edm_euler.py +381 -0
  251. diffusers/schedulers/scheduling_euler_ancestral_discrete.py +43 -15
  252. diffusers/schedulers/scheduling_euler_discrete.py +42 -17
  253. diffusers/schedulers/scheduling_euler_discrete_flax.py +1 -1
  254. diffusers/schedulers/scheduling_heun_discrete.py +35 -35
  255. diffusers/schedulers/scheduling_ipndm.py +37 -11
  256. diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +44 -44
  257. diffusers/schedulers/scheduling_k_dpm_2_discrete.py +44 -44
  258. diffusers/schedulers/scheduling_karras_ve_flax.py +1 -1
  259. diffusers/schedulers/scheduling_lcm.py +38 -14
  260. diffusers/schedulers/scheduling_lms_discrete.py +43 -15
  261. diffusers/schedulers/scheduling_lms_discrete_flax.py +1 -1
  262. diffusers/schedulers/scheduling_pndm.py +2 -4
  263. diffusers/schedulers/scheduling_pndm_flax.py +2 -4
  264. diffusers/schedulers/scheduling_repaint.py +1 -1
  265. diffusers/schedulers/scheduling_sasolver.py +41 -9
  266. diffusers/schedulers/scheduling_sde_ve.py +1 -1
  267. diffusers/schedulers/scheduling_sde_ve_flax.py +1 -1
  268. diffusers/schedulers/scheduling_tcd.py +686 -0
  269. diffusers/schedulers/scheduling_unclip.py +1 -1
  270. diffusers/schedulers/scheduling_unipc_multistep.py +46 -19
  271. diffusers/schedulers/scheduling_utils.py +2 -1
  272. diffusers/schedulers/scheduling_utils_flax.py +1 -1
  273. diffusers/schedulers/scheduling_vq_diffusion.py +1 -1
  274. diffusers/training_utils.py +9 -2
  275. diffusers/utils/__init__.py +2 -1
  276. diffusers/utils/accelerate_utils.py +1 -1
  277. diffusers/utils/constants.py +1 -1
  278. diffusers/utils/doc_utils.py +1 -1
  279. diffusers/utils/dummy_pt_objects.py +60 -0
  280. diffusers/utils/dummy_torch_and_transformers_objects.py +75 -0
  281. diffusers/utils/dynamic_modules_utils.py +1 -1
  282. diffusers/utils/export_utils.py +3 -3
  283. diffusers/utils/hub_utils.py +60 -16
  284. diffusers/utils/import_utils.py +15 -1
  285. diffusers/utils/loading_utils.py +2 -0
  286. diffusers/utils/logging.py +1 -1
  287. diffusers/utils/model_card_template.md +24 -0
  288. diffusers/utils/outputs.py +14 -7
  289. diffusers/utils/peft_utils.py +1 -1
  290. diffusers/utils/state_dict_utils.py +1 -1
  291. diffusers/utils/testing_utils.py +2 -0
  292. diffusers/utils/torch_utils.py +1 -1
  293. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/METADATA +5 -5
  294. diffusers-0.27.0.dist-info/RECORD +399 -0
  295. diffusers-0.26.2.dist-info/RECORD +0 -384
  296. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/LICENSE +0 -0
  297. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/WHEEL +0 -0
  298. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/entry_points.txt +0 -0
  299. {diffusers-0.26.2.dist-info → diffusers-0.27.0.dist-info}/top_level.txt +0 -0
diffusers/models/unets/unet_2d_condition_flax.py

@@ -1,4 +1,4 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -75,6 +75,8 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
             The tuple of downsample blocks to use.
         up_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D")`):
             The tuple of upsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer is skipped.
         block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
             The tuple of output channels for each block.
         layers_per_block (`int`, *optional*, defaults to 2):
@@ -107,6 +109,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         "DownBlock2D",
     )
     up_block_types: Tuple[str, ...] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")
+    mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn"
     only_cross_attention: Union[bool, Tuple[bool]] = False
     block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280)
     layers_per_block: int = 2
@@ -252,16 +255,21 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         self.down_blocks = down_blocks
 
         # mid
-        self.mid_block = FlaxUNetMidBlock2DCrossAttn(
-            in_channels=block_out_channels[-1],
-            dropout=self.dropout,
-            num_attention_heads=num_attention_heads[-1],
-            transformer_layers_per_block=transformer_layers_per_block[-1],
-            use_linear_projection=self.use_linear_projection,
-            use_memory_efficient_attention=self.use_memory_efficient_attention,
-            split_head_dim=self.split_head_dim,
-            dtype=self.dtype,
-        )
+        if self.config.mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = FlaxUNetMidBlock2DCrossAttn(
+                in_channels=block_out_channels[-1],
+                dropout=self.dropout,
+                num_attention_heads=num_attention_heads[-1],
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                use_linear_projection=self.use_linear_projection,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+                split_head_dim=self.split_head_dim,
+                dtype=self.dtype,
+            )
+        elif self.config.mid_block_type is None:
+            self.mid_block = None
+        else:
+            raise ValueError(f"Unexpected mid_block_type {self.config.mid_block_type}")
 
         # up
         up_blocks = []
@@ -412,7 +420,8 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
             down_block_res_samples = new_down_block_res_samples
 
         # 4. mid
-        sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
+        if self.mid_block is not None:
+            sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
 
         if mid_block_additional_residual is not None:
             sample += mid_block_additional_residual
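The hunks above make the Flax UNet's mid block optional via the new `mid_block_type` config field. A minimal sketch of what that enables, assuming a working JAX/Flax install; the tiny block/channel sizes below are illustrative choices, not library defaults:

import jax
import jax.numpy as jnp
from diffusers import FlaxUNet2DConditionModel

# Build a small UNet variant whose mid block is skipped entirely (mid_block_type=None).
unet = FlaxUNet2DConditionModel(
    sample_size=32,
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
    cross_attention_dim=64,
    layers_per_block=1,
    mid_block_type=None,  # new in 0.27.0: setup() sets self.mid_block to None
)

params = unet.init_weights(jax.random.PRNGKey(0))
sample = jnp.zeros((1, 4, 32, 32))              # (batch, channels, height, width)
timesteps = jnp.ones((1,), dtype=jnp.int32)
encoder_hidden_states = jnp.zeros((1, 77, 64))  # matches cross_attention_dim above
out = unet.apply({"params": params}, sample, timesteps, encoder_hidden_states).sample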
diffusers/models/unets/unet_3d_blocks.py

@@ -1,4 +1,4 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@ from typing import Any, Dict, Optional, Tuple, Union
 import torch
 from torch import nn
 
-from ...utils import is_torch_version
+from ...utils import deprecate, is_torch_version, logging
 from ...utils.torch_utils import apply_freeu
 from ..attention import Attention
 from ..resnet import (
@@ -35,6 +35,9 @@ from ..transformers.transformer_temporal import (
 )
 
 
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
 def get_down_block(
     down_block_type: str,
     num_layers: int,
@@ -1005,9 +1008,14 @@ class DownBlockMotion(nn.Module):
         self,
         hidden_states: torch.FloatTensor,
         temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,
         num_frames: int = 1,
+        *args,
+        **kwargs,
     ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
         output_states = ()
 
         blocks = zip(self.resnets, self.motion_modules)
@@ -1029,24 +1037,18 @@ class DownBlockMotion(nn.Module):
                     )
                 else:
                     hidden_states = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(resnet), hidden_states, temb, scale
+                        create_custom_forward(resnet), hidden_states, temb
                     )
-                    hidden_states = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(motion_module),
-                        hidden_states.requires_grad_(),
-                        temb,
-                        num_frames,
-                    )
 
             else:
-                hidden_states = resnet(hidden_states, temb, scale=scale)
-                hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
 
             output_states = output_states + (hidden_states,)
 
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states, scale=scale)
+                hidden_states = downsampler(hidden_states)
 
             output_states = output_states + (hidden_states,)
 
@@ -1179,9 +1181,11 @@ class CrossAttnDownBlockMotion(nn.Module):
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         additional_residuals: Optional[torch.FloatTensor] = None,
     ):
-        output_states = ()
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
 
-        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        output_states = ()
 
         blocks = list(zip(self.resnets, self.attentions, self.motion_modules))
         for i, (resnet, attn, motion_module) in enumerate(blocks):
@@ -1212,7 +1216,7 @@ class CrossAttnDownBlockMotion(nn.Module):
                     return_dict=False,
                 )[0]
             else:
-                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = resnet(hidden_states, temb)
                 hidden_states = attn(
                     hidden_states,
                     encoder_hidden_states=encoder_hidden_states,
@@ -1221,10 +1225,10 @@ class CrossAttnDownBlockMotion(nn.Module):
                     encoder_attention_mask=encoder_attention_mask,
                     return_dict=False,
                 )[0]
-            hidden_states = motion_module(
-                hidden_states,
-                num_frames=num_frames,
-            )[0]
+                hidden_states = motion_module(
+                    hidden_states,
+                    num_frames=num_frames,
+                )[0]
 
             # apply additional residuals to the output of the last pair of resnet and attention blocks
             if i == len(blocks) - 1 and additional_residuals is not None:
@@ -1234,7 +1238,7 @@ class CrossAttnDownBlockMotion(nn.Module):
 
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states, scale=lora_scale)
+                hidden_states = downsampler(hidden_states)
 
             output_states = output_states + (hidden_states,)
 
@@ -1361,7 +1365,10 @@ class CrossAttnUpBlockMotion(nn.Module):
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         num_frames: int = 1,
     ) -> torch.FloatTensor:
-        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+
         is_freeu_enabled = (
             getattr(self, "s1", None)
             and getattr(self, "s2", None)
@@ -1416,7 +1423,7 @@ class CrossAttnUpBlockMotion(nn.Module):
                     return_dict=False,
                 )[0]
             else:
-                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = resnet(hidden_states, temb)
                 hidden_states = attn(
                     hidden_states,
                     encoder_hidden_states=encoder_hidden_states,
@@ -1425,14 +1432,14 @@ class CrossAttnUpBlockMotion(nn.Module):
                     encoder_attention_mask=encoder_attention_mask,
                     return_dict=False,
                 )[0]
-            hidden_states = motion_module(
-                hidden_states,
-                num_frames=num_frames,
-            )[0]
+                hidden_states = motion_module(
+                    hidden_states,
+                    num_frames=num_frames,
+                )[0]
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
+                hidden_states = upsampler(hidden_states, upsample_size)
 
         return hidden_states
 
@@ -1513,9 +1520,14 @@ class UpBlockMotion(nn.Module):
         res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
         temb: Optional[torch.FloatTensor] = None,
         upsample_size=None,
-        scale: float = 1.0,
         num_frames: int = 1,
+        *args,
+        **kwargs,
     ) -> torch.FloatTensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
         is_freeu_enabled = (
             getattr(self, "s1", None)
            and getattr(self, "s2", None)
@@ -1563,19 +1575,14 @@ class UpBlockMotion(nn.Module):
                     hidden_states = torch.utils.checkpoint.checkpoint(
                         create_custom_forward(resnet), hidden_states, temb
                     )
-                    hidden_states = torch.utils.checkpoint.checkpoint(
-                        create_custom_forward(resnet),
-                        hidden_states,
-                        temb,
-                    )
 
             else:
-                hidden_states = resnet(hidden_states, temb, scale=scale)
-                hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
+                hidden_states = upsampler(hidden_states, upsample_size)
 
         return hidden_states
 
@@ -1698,8 +1705,11 @@ class UNetMidBlockCrossAttnMotion(nn.Module):
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         num_frames: int = 1,
     ) -> torch.FloatTensor:
-        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
-        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+        if cross_attention_kwargs is not None:
+            if cross_attention_kwargs.get("scale", None) is not None:
+                logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.")
+
+        hidden_states = self.resnets[0](hidden_states, temb)
 
         blocks = zip(self.attentions, self.resnets[1:], self.motion_modules)
         for attn, resnet, motion_module in blocks:
@@ -1748,7 +1758,7 @@ class UNetMidBlockCrossAttnMotion(nn.Module):
                     hidden_states,
                     num_frames=num_frames,
                 )[0]
-                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = resnet(hidden_states, temb)
 
         return hidden_states
 
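The motion-block hunks above drop the per-call `scale` argument: the blocks now warn and ignore it, and the deprecation message points callers at `cross_attention_kwargs` on the pipeline instead. A hedged sketch of that calling pattern, assuming an AnimateDiff setup; the motion-adapter and base-model ids are real public checkpoints, the LoRA repo id is a placeholder:

import torch
from diffusers import AnimateDiffPipeline, MotionAdapter

adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")
pipe.load_lora_weights("your-org/your-lora")  # placeholder repo id

# 0.27.0 style: the LoRA scale travels via cross_attention_kwargs at the pipeline level,
# not as a `scale=` argument on the UNet blocks.
output = pipe(
    "a rocket launching at dawn",
    num_inference_steps=25,
    cross_attention_kwargs={"scale": 0.8},
)
frames = output.frames[0]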
diffusers/models/unets/unet_3d_condition.py

@@ -1,5 +1,5 @@
-# Copyright 2023 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
-# Copyright 2023 The ModelScope Team.
+# Copyright 2024 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
+# Copyright 2024 The ModelScope Team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@ from ..activations import get_activation
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
+    Attention,
     AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
@@ -54,7 +55,7 @@ class UNet3DConditionOutput(BaseOutput):
     The output of [`UNet3DConditionModel`].
 
     Args:
-        sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
             The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
     """
 
@@ -74,9 +75,9 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
             Height and width of input/output sample.
         in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
         out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D")`):
             The tuple of downsample blocks to use.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D")`):
             The tuple of upsample blocks to use.
         block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
             The tuple of output channels for each block.
@@ -87,8 +88,8 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
             If `None`, normalization and activation layers is skipped in post-processing.
         norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
-        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
-        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        cross_attention_dim (`int`, *optional*, defaults to 1024): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads.
         num_attention_heads (`int`, *optional*): The number of attention heads.
     """
 
@@ -503,6 +504,44 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
             if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
                 setattr(upsample_block, k, None)
 
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unload_lora
     def unload_lora(self):
         """Unloads LoRA weights."""
@@ -533,7 +572,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
 
         Args:
             sample (`torch.FloatTensor`):
-                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
+                The noisy input tensor with the following shape `(batch, num_channels, num_frames, height, width`.
             timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
             encoder_hidden_states (`torch.FloatTensor`):
                 The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
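The new `fuse_qkv_projections` / `unfuse_qkv_projections` methods on `UNet3DConditionModel` are copies of the experimental 2D UNet API. A hedged sketch of how they are typically toggled around inference, assuming a text-to-video checkpoint (the repo id below is illustrative):

import torch
from diffusers import TextToVideoSDPipeline

pipe = TextToVideoSDPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16
).to("cuda")

# Experimental: fuse the QKV projections of the UNet's attention layers before sampling.
pipe.unet.fuse_qkv_projections()
video_frames = pipe("a panda surfing", num_inference_steps=25).frames

# Restore the original attention processors afterwards.
pipe.unet.unfuse_qkv_projections()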
diffusers/models/unets/unet_i2vgen_xl.py

@@ -1,4 +1,4 @@
-# Copyright 2023 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
+# Copyright 2024 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -48,29 +48,6 @@ from .unet_3d_condition import UNet3DConditionOutput
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-def _to_tensor(inputs, device):
-    if not torch.is_tensor(inputs):
-        # TODO: this requires sync between CPU and GPU. So try to pass `inputs` as tensors if you can
-        # This would be a good case for the `match` statement (Python 3.10+)
-        is_mps = device.type == "mps"
-        if isinstance(inputs, float):
-            dtype = torch.float32 if is_mps else torch.float64
-        else:
-            dtype = torch.int32 if is_mps else torch.int64
-        inputs = torch.tensor([inputs], dtype=dtype, device=device)
-    elif len(inputs.shape) == 0:
-        inputs = inputs[None].to(device)
-
-    return inputs
-
-
-def _collapse_frames_into_batch(sample: torch.Tensor) -> torch.Tensor:
-    batch_size, channels, num_frames, height, width = sample.shape
-    sample = sample.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
-
-    return sample
-
-
 class I2VGenXLTransformerTemporalEncoder(nn.Module):
     def __init__(
         self,
@@ -112,7 +89,7 @@ class I2VGenXLTransformerTemporalEncoder(nn.Module):
         if hidden_states.ndim == 4:
             hidden_states = hidden_states.squeeze(1)
 
-        ff_output = self.ff(hidden_states, scale=1.0)
+        ff_output = self.ff(hidden_states)
         hidden_states = ff_output + hidden_states
         if hidden_states.ndim == 4:
             hidden_states = hidden_states.squeeze(1)
@@ -143,6 +120,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
             If `None`, normalization and activation layers is skipped in post-processing.
         cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 64): Attention head dim.
         num_attention_heads (`int`, *optional*): The number of attention heads.
     """
 
@@ -170,11 +148,18 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         layers_per_block: int = 2,
         norm_num_groups: Optional[int] = 32,
         cross_attention_dim: int = 1024,
-        num_attention_heads: Optional[Union[int, Tuple[int]]] = 64,
+        attention_head_dim: Union[int, Tuple[int]] = 64,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
     ):
         super().__init__()
 
-        self.sample_size = sample_size
+        # When we first integrated the UNet into the library, we didn't have `attention_head_dim`. As a consequence
+        # of that, we used `num_attention_heads` for arguments that actually denote attention head dimension. This
+        # is why we ignore `num_attention_heads` and calculate it from `attention_head_dims` below.
+        # This is still an incorrect way of calculating `num_attention_heads` but we need to stick to it
+        # without running proper depcrecation cycles for the {down,mid,up} blocks which are a
+        # part of the public API.
+        num_attention_heads = attention_head_dim
 
         # Check inputs
         if len(down_block_types) != len(up_block_types):
@@ -489,6 +474,44 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
             if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
                 setattr(upsample_block, k, None)
 
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
     def forward(
         self,
         sample: torch.FloatTensor,
@@ -543,7 +566,18 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
             forward_upsample_size = True
 
         # 1. time
-        timesteps = _to_tensor(timestep, sample.device)
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass `timesteps` as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timesteps, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
 
         # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
         timesteps = timesteps.expand(sample.shape[0])
@@ -572,7 +606,13 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         context_emb = sample.new_zeros(batch_size, 0, self.config.cross_attention_dim)
         context_emb = torch.cat([context_emb, encoder_hidden_states], dim=1)
 
-        image_latents_context_embs = _collapse_frames_into_batch(image_latents[:, :, :1, :])
+        image_latents_for_context_embds = image_latents[:, :, :1, :]
+        image_latents_context_embs = image_latents_for_context_embds.permute(0, 2, 1, 3, 4).reshape(
+            image_latents_for_context_embds.shape[0] * image_latents_for_context_embds.shape[2],
+            image_latents_for_context_embds.shape[1],
+            image_latents_for_context_embds.shape[3],
+            image_latents_for_context_embds.shape[4],
+        )
         image_latents_context_embs = self.image_latents_context_embedding(image_latents_context_embs)
 
         _batch_size, _channels, _height, _width = image_latents_context_embs.shape
@@ -586,7 +626,12 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         context_emb = torch.cat([context_emb, image_emb], dim=1)
         context_emb = context_emb.repeat_interleave(repeats=num_frames, dim=0)
 
-        image_latents = _collapse_frames_into_batch(image_latents)
+        image_latents = image_latents.permute(0, 2, 1, 3, 4).reshape(
+            image_latents.shape[0] * image_latents.shape[2],
+            image_latents.shape[1],
+            image_latents.shape[3],
+            image_latents.shape[4],
+        )
         image_latents = self.image_latents_proj_in(image_latents)
         image_latents = (
             image_latents[None, :]
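The two reshape hunks above inline the removed `_collapse_frames_into_batch` helper: a video latent of shape `(batch, channels, frames, height, width)` is folded into `(batch * frames, channels, height, width)` so per-frame 2D layers can consume it. A standalone sketch of the same permute/reshape:

import torch

batch, channels, frames, height, width = 2, 4, 16, 32, 32
image_latents = torch.randn(batch, channels, frames, height, width)

# Same folding as the inlined helper: move frames next to batch, then merge the two dims.
collapsed = image_latents.permute(0, 2, 1, 3, 4).reshape(batch * frames, channels, height, width)
assert collapsed.shape == (batch * frames, channels, height, width)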
diffusers/models/unets/unet_kandinsky3.py

@@ -1,4 +1,4 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diffusers/models/unets/unet_motion_model.py

@@ -1,4 +1,4 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@ from ...utils import logging
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
+    Attention,
     AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
@@ -217,6 +218,7 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         use_motion_mid_block: int = True,
         encoder_hid_dim: Optional[int] = None,
         encoder_hid_dim_type: Optional[str] = None,
+        time_cond_proj_dim: Optional[int] = None,
     ):
         super().__init__()
 
@@ -252,9 +254,7 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         timestep_input_dim = block_out_channels[0]
 
         self.time_embedding = TimestepEmbedding(
-            timestep_input_dim,
-            time_embed_dim,
-            act_fn=act_fn,
+            timestep_input_dim, time_embed_dim, act_fn=act_fn, cond_proj_dim=time_cond_proj_dim
         )
 
         if encoder_hid_dim_type is None:
@@ -306,6 +306,7 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
             num_attention_heads=num_attention_heads[-1],
             resnet_groups=norm_num_groups,
             dual_cross_attention=False,
+            use_linear_projection=use_linear_projection,
             temporal_num_attention_heads=motion_num_attention_heads,
             temporal_max_seq_length=motion_max_seq_length,
         )
@@ -321,6 +322,7 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
             num_attention_heads=num_attention_heads[-1],
             resnet_groups=norm_num_groups,
             dual_cross_attention=False,
+            use_linear_projection=use_linear_projection,
         )
 
         # count how many layers upsample the images
@@ -700,6 +702,44 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
             if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
                 setattr(upsample_block, k, None)
 
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
+
     def forward(
         self,
         sample: torch.FloatTensor,
@@ -792,6 +832,7 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
 
         emb = self.time_embedding(t_emb, timestep_cond)
         emb = emb.repeat_interleave(repeats=num_frames, dim=0)
+        encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
 
         if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
             if "image_embeds" not in added_cond_kwargs:
@@ -799,10 +840,9 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
                     f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
                 )
             image_embeds = added_cond_kwargs.get("image_embeds")
-            image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype)
-            encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1)
-
-            encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0)
+            image_embeds = self.encoder_hid_proj(image_embeds)
+            image_embeds = [image_embed.repeat_interleave(repeats=num_frames, dim=0) for image_embed in image_embeds]
+            encoder_hidden_states = (encoder_hidden_states, image_embeds)
 
         # 2. pre-process
         sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:])
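The `time_cond_proj_dim` argument added above feeds straight into `TimestepEmbedding(cond_proj_dim=...)`, so the motion UNet can accept a `timestep_cond` guidance embedding the way LCM-style UNets do. A hedged sketch with a deliberately tiny, illustrative configuration (real checkpoints use the library defaults):

import torch
from diffusers import UNetMotionModel

unet = UNetMotionModel(
    sample_size=32,
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlockMotion", "DownBlockMotion"),
    up_block_types=("UpBlockMotion", "CrossAttnUpBlockMotion"),
    cross_attention_dim=64,
    num_attention_heads=2,
    layers_per_block=1,
    time_cond_proj_dim=256,  # new in 0.27.0, wired into TimestepEmbedding(cond_proj_dim=...)
)

num_frames = 4
sample = torch.randn(1, 4, num_frames, 32, 32)   # (batch, channels, frames, height, width)
timestep = torch.tensor([10])
encoder_hidden_states = torch.randn(1, 77, 64)   # repeated per frame inside forward()
timestep_cond = torch.randn(1, 256)              # e.g. an embedded guidance scale

out = unet(sample, timestep, encoder_hidden_states, timestep_cond=timestep_cond).sample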