diffusers-0.29.2-py3-none-any.whl → diffusers-0.30.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- diffusers/__init__.py +94 -3
- diffusers/commands/env.py +1 -5
- diffusers/configuration_utils.py +4 -9
- diffusers/dependency_versions_table.py +2 -2
- diffusers/image_processor.py +1 -2
- diffusers/loaders/__init__.py +17 -2
- diffusers/loaders/ip_adapter.py +10 -7
- diffusers/loaders/lora_base.py +752 -0
- diffusers/loaders/lora_pipeline.py +2252 -0
- diffusers/loaders/peft.py +213 -5
- diffusers/loaders/single_file.py +3 -14
- diffusers/loaders/single_file_model.py +31 -10
- diffusers/loaders/single_file_utils.py +293 -8
- diffusers/loaders/textual_inversion.py +1 -6
- diffusers/loaders/unet.py +23 -208
- diffusers/models/__init__.py +20 -0
- diffusers/models/activations.py +22 -0
- diffusers/models/attention.py +386 -7
- diffusers/models/attention_processor.py +1937 -629
- diffusers/models/autoencoders/__init__.py +2 -0
- diffusers/models/autoencoders/autoencoder_kl.py +14 -3
- diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +1271 -0
- diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +1 -1
- diffusers/models/autoencoders/autoencoder_oobleck.py +464 -0
- diffusers/models/autoencoders/autoencoder_tiny.py +1 -0
- diffusers/models/autoencoders/consistency_decoder_vae.py +1 -1
- diffusers/models/autoencoders/vq_model.py +4 -4
- diffusers/models/controlnet.py +2 -3
- diffusers/models/controlnet_hunyuan.py +401 -0
- diffusers/models/controlnet_sd3.py +11 -11
- diffusers/models/controlnet_sparsectrl.py +789 -0
- diffusers/models/controlnet_xs.py +40 -10
- diffusers/models/downsampling.py +68 -0
- diffusers/models/embeddings.py +403 -36
- diffusers/models/model_loading_utils.py +1 -3
- diffusers/models/modeling_flax_utils.py +1 -6
- diffusers/models/modeling_utils.py +4 -16
- diffusers/models/normalization.py +203 -12
- diffusers/models/transformers/__init__.py +6 -0
- diffusers/models/transformers/auraflow_transformer_2d.py +543 -0
- diffusers/models/transformers/cogvideox_transformer_3d.py +485 -0
- diffusers/models/transformers/hunyuan_transformer_2d.py +19 -15
- diffusers/models/transformers/latte_transformer_3d.py +327 -0
- diffusers/models/transformers/lumina_nextdit2d.py +340 -0
- diffusers/models/transformers/pixart_transformer_2d.py +102 -1
- diffusers/models/transformers/prior_transformer.py +1 -1
- diffusers/models/transformers/stable_audio_transformer.py +458 -0
- diffusers/models/transformers/transformer_flux.py +455 -0
- diffusers/models/transformers/transformer_sd3.py +18 -4
- diffusers/models/unets/unet_1d_blocks.py +1 -1
- diffusers/models/unets/unet_2d_condition.py +8 -1
- diffusers/models/unets/unet_3d_blocks.py +51 -920
- diffusers/models/unets/unet_3d_condition.py +4 -1
- diffusers/models/unets/unet_i2vgen_xl.py +4 -1
- diffusers/models/unets/unet_kandinsky3.py +1 -1
- diffusers/models/unets/unet_motion_model.py +1330 -84
- diffusers/models/unets/unet_spatio_temporal_condition.py +1 -1
- diffusers/models/unets/unet_stable_cascade.py +1 -3
- diffusers/models/unets/uvit_2d.py +1 -1
- diffusers/models/upsampling.py +64 -0
- diffusers/models/vq_model.py +8 -4
- diffusers/optimization.py +1 -1
- diffusers/pipelines/__init__.py +100 -3
- diffusers/pipelines/animatediff/__init__.py +4 -0
- diffusers/pipelines/animatediff/pipeline_animatediff.py +50 -40
- diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py +1076 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +17 -27
- diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py +1008 -0
- diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +51 -38
- diffusers/pipelines/audioldm2/modeling_audioldm2.py +1 -1
- diffusers/pipelines/audioldm2/pipeline_audioldm2.py +1 -0
- diffusers/pipelines/aura_flow/__init__.py +48 -0
- diffusers/pipelines/aura_flow/pipeline_aura_flow.py +591 -0
- diffusers/pipelines/auto_pipeline.py +97 -19
- diffusers/pipelines/cogvideo/__init__.py +48 -0
- diffusers/pipelines/cogvideo/pipeline_cogvideox.py +746 -0
- diffusers/pipelines/consistency_models/pipeline_consistency_models.py +1 -1
- diffusers/pipelines/controlnet/pipeline_controlnet.py +24 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +31 -30
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +24 -153
- diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +19 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -28
- diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +29 -32
- diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +2 -2
- diffusers/pipelines/controlnet_hunyuandit/__init__.py +48 -0
- diffusers/pipelines/controlnet_hunyuandit/pipeline_hunyuandit_controlnet.py +1042 -0
- diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +35 -0
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +10 -6
- diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +0 -4
- diffusers/pipelines/deepfloyd_if/pipeline_if.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +2 -2
- diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +2 -2
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +11 -6
- diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +11 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +6 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +10 -10
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +10 -6
- diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +3 -3
- diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +1 -1
- diffusers/pipelines/flux/__init__.py +47 -0
- diffusers/pipelines/flux/pipeline_flux.py +749 -0
- diffusers/pipelines/flux/pipeline_output.py +21 -0
- diffusers/pipelines/free_init_utils.py +2 -0
- diffusers/pipelines/free_noise_utils.py +236 -0
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +2 -2
- diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +2 -2
- diffusers/pipelines/kolors/__init__.py +54 -0
- diffusers/pipelines/kolors/pipeline_kolors.py +1070 -0
- diffusers/pipelines/kolors/pipeline_kolors_img2img.py +1247 -0
- diffusers/pipelines/kolors/pipeline_output.py +21 -0
- diffusers/pipelines/kolors/text_encoder.py +889 -0
- diffusers/pipelines/kolors/tokenizer.py +334 -0
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +30 -29
- diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +23 -29
- diffusers/pipelines/latte/__init__.py +48 -0
- diffusers/pipelines/latte/pipeline_latte.py +881 -0
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +4 -4
- diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +0 -4
- diffusers/pipelines/lumina/__init__.py +48 -0
- diffusers/pipelines/lumina/pipeline_lumina.py +897 -0
- diffusers/pipelines/pag/__init__.py +67 -0
- diffusers/pipelines/pag/pag_utils.py +237 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd.py +1329 -0
- diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py +1612 -0
- diffusers/pipelines/pag/pipeline_pag_hunyuandit.py +953 -0
- diffusers/pipelines/pag/pipeline_pag_kolors.py +1136 -0
- diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +872 -0
- diffusers/pipelines/pag/pipeline_pag_sd.py +1050 -0
- diffusers/pipelines/pag/pipeline_pag_sd_3.py +985 -0
- diffusers/pipelines/pag/pipeline_pag_sd_animatediff.py +862 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl.py +1333 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py +1529 -0
- diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py +1753 -0
- diffusers/pipelines/pia/pipeline_pia.py +30 -37
- diffusers/pipelines/pipeline_flax_utils.py +4 -9
- diffusers/pipelines/pipeline_loading_utils.py +0 -3
- diffusers/pipelines/pipeline_utils.py +2 -14
- diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +0 -1
- diffusers/pipelines/stable_audio/__init__.py +50 -0
- diffusers/pipelines/stable_audio/modeling_stable_audio.py +158 -0
- diffusers/pipelines/stable_audio/pipeline_stable_audio.py +745 -0
- diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +2 -0
- diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +1 -1
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +15 -8
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +30 -29
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +23 -152
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +8 -4
- diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +11 -11
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +8 -6
- diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +6 -6
- diffusers/pipelines/stable_diffusion_3/__init__.py +2 -0
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +34 -3
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +33 -7
- diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1201 -0
- diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +3 -3
- diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +6 -6
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +5 -5
- diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +5 -5
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +6 -6
- diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +0 -4
- diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +23 -29
- diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +27 -29
- diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +3 -3
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +17 -27
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +26 -29
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +17 -145
- diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +0 -4
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +6 -6
- diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +18 -28
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +8 -6
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +6 -4
- diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +0 -4
- diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +3 -3
- diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py +1 -1
- diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +5 -4
- diffusers/schedulers/__init__.py +8 -0
- diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py +572 -0
- diffusers/schedulers/scheduling_ddim.py +1 -1
- diffusers/schedulers/scheduling_ddim_cogvideox.py +449 -0
- diffusers/schedulers/scheduling_ddpm.py +1 -1
- diffusers/schedulers/scheduling_ddpm_parallel.py +1 -1
- diffusers/schedulers/scheduling_deis_multistep.py +2 -2
- diffusers/schedulers/scheduling_dpm_cogvideox.py +489 -0
- diffusers/schedulers/scheduling_dpmsolver_multistep.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +1 -1
- diffusers/schedulers/scheduling_dpmsolver_singlestep.py +64 -19
- diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +2 -2
- diffusers/schedulers/scheduling_flow_match_euler_discrete.py +63 -39
- diffusers/schedulers/scheduling_flow_match_heun_discrete.py +321 -0
- diffusers/schedulers/scheduling_ipndm.py +1 -1
- diffusers/schedulers/scheduling_unipc_multistep.py +1 -1
- diffusers/schedulers/scheduling_utils.py +1 -3
- diffusers/schedulers/scheduling_utils_flax.py +1 -3
- diffusers/training_utils.py +99 -14
- diffusers/utils/__init__.py +2 -2
- diffusers/utils/dummy_pt_objects.py +210 -0
- diffusers/utils/dummy_torch_and_torchsde_objects.py +15 -0
- diffusers/utils/dummy_torch_and_transformers_and_sentencepiece_objects.py +47 -0
- diffusers/utils/dummy_torch_and_transformers_objects.py +315 -0
- diffusers/utils/dynamic_modules_utils.py +1 -11
- diffusers/utils/export_utils.py +50 -6
- diffusers/utils/hub_utils.py +45 -42
- diffusers/utils/import_utils.py +37 -15
- diffusers/utils/loading_utils.py +80 -3
- diffusers/utils/testing_utils.py +11 -8
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/METADATA +73 -83
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/RECORD +217 -164
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/WHEEL +1 -1
- diffusers/loaders/autoencoder.py +0 -146
- diffusers/loaders/controlnet.py +0 -136
- diffusers/loaders/lora.py +0 -1728
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/LICENSE +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/entry_points.txt +0 -0
- {diffusers-0.29.2.dist-info → diffusers-0.30.1.dist-info}/top_level.txt +0 -0
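
The headline changes are the new model families: Flux, CogVideoX, AuraFlow, Kolors, Latte, Lumina, Stable Audio, and a large family of PAG (perturbed-attention guidance) pipelines all arrive as new modules, while the monolithic `diffusers/loaders/lora.py` is replaced by `lora_base.py` and `lora_pipeline.py`. As a rough sketch of one of the new entry points, the snippet below exercises the newly added Flux pipeline; the checkpoint id, step count, and guidance value are assumptions about the hosted weights rather than anything this diff pins down:

```python
import torch
from diffusers import FluxPipeline  # new in 0.30 (see diffusers/pipelines/flux/ above)

# Assumed checkpoint id; any Flux-compatible repository should work the same way.
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # optional: trades speed for lower peak VRAM

image = pipe(
    "a tiny robot watering a bonsai tree",
    num_inference_steps=4,  # the distilled "schnell" variant targets few-step sampling
    guidance_scale=0.0,     # assumed: this variant is trained without classifier-free guidance
).images[0]
image.save("flux_sample.png")
```

The per-file diffs below cover two of the touched files, `diffusers/models/controlnet_xs.py` and `diffusers/models/downsampling.py`.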
diffusers/models/controlnet_xs.py CHANGED
```diff
@@ -29,6 +29,7 @@ from .attention_processor import (
     AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
+    FusedAttnProcessor2_0,
 )
 from .controlnet import ControlNetConditioningEmbedding
 from .embeddings import TimestepEmbedding, Timesteps
@@ -114,6 +115,7 @@ def get_down_block_adapter(
     cross_attention_dim: Optional[int] = 1024,
     add_downsample: bool = True,
     upcast_attention: Optional[bool] = False,
+    use_linear_projection: Optional[bool] = True,
 ):
     num_layers = 2  # only support sd + sdxl
 
@@ -152,7 +154,7 @@ def get_down_block_adapter(
                     in_channels=ctrl_out_channels,
                     num_layers=transformer_layers_per_block[i],
                     cross_attention_dim=cross_attention_dim,
-                    use_linear_projection=True,
+                    use_linear_projection=use_linear_projection,
                     upcast_attention=upcast_attention,
                     norm_num_groups=find_largest_factor(ctrl_out_channels, max_factor=max_norm_num_groups),
                 )
@@ -200,6 +202,7 @@ def get_mid_block_adapter(
     num_attention_heads: Optional[int] = 1,
     cross_attention_dim: Optional[int] = 1024,
     upcast_attention: bool = False,
+    use_linear_projection: bool = True,
 ):
     # Before the midblock application, information is concatted from base to control.
     # Concat doesn't require change in number of channels
@@ -214,7 +217,7 @@ def get_mid_block_adapter(
         resnet_groups=find_largest_factor(gcd(ctrl_channels, ctrl_channels + base_channels), max_norm_num_groups),
         cross_attention_dim=cross_attention_dim,
         num_attention_heads=num_attention_heads,
-        use_linear_projection=True,
+        use_linear_projection=use_linear_projection,
         upcast_attention=upcast_attention,
     )
 
@@ -282,7 +285,7 @@ class ControlNetXSAdapter(ModelMixin, ConfigMixin):
         upcast_attention (`bool`, defaults to `True`):
             Whether the attention computation should always be upcasted.
         max_norm_num_groups (`int`, defaults to 32):
-            Maximum number of groups in group normal. The actual number will
+            Maximum number of groups in group normal. The actual number will be the largest divisor of the respective
             channels, that is <= max_norm_num_groups.
     """
 
@@ -308,6 +311,7 @@ class ControlNetXSAdapter(ModelMixin, ConfigMixin):
         transformer_layers_per_block: Union[int, Tuple[int]] = 1,
         upcast_attention: bool = True,
         max_norm_num_groups: int = 32,
+        use_linear_projection: bool = True,
     ):
         super().__init__()
 
@@ -381,6 +385,7 @@ class ControlNetXSAdapter(ModelMixin, ConfigMixin):
                     cross_attention_dim=cross_attention_dim[i],
                     add_downsample=not is_final_block,
                     upcast_attention=upcast_attention,
+                    use_linear_projection=use_linear_projection,
                 )
             )
 
@@ -393,6 +398,7 @@ class ControlNetXSAdapter(ModelMixin, ConfigMixin):
             num_attention_heads=num_attention_heads[-1],
             cross_attention_dim=cross_attention_dim[-1],
             upcast_attention=upcast_attention,
+            use_linear_projection=use_linear_projection,
         )
 
         # up
@@ -489,6 +495,7 @@ class ControlNetXSAdapter(ModelMixin, ConfigMixin):
             transformer_layers_per_block=unet.config.transformer_layers_per_block,
             upcast_attention=unet.config.upcast_attention,
             max_norm_num_groups=unet.config.norm_num_groups,
+            use_linear_projection=unet.config.use_linear_projection,
         )
 
         # ensure that the ControlNetXSAdapter is the same dtype as the UNet2DConditionModel
@@ -538,6 +545,7 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
         addition_embed_type: Optional[str] = None,
         addition_time_embed_dim: Optional[int] = None,
         upcast_attention: bool = True,
+        use_linear_projection: bool = True,
         time_cond_proj_dim: Optional[int] = None,
         projection_class_embeddings_input_dim: Optional[int] = None,
         # additional controlnet configs
@@ -595,7 +603,12 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
             time_embed_dim,
             cond_proj_dim=time_cond_proj_dim,
         )
-        self.ctrl_time_embedding = TimestepEmbedding(in_channels=time_embed_input_dim, time_embed_dim=time_embed_dim)
+        if ctrl_learn_time_embedding:
+            self.ctrl_time_embedding = TimestepEmbedding(
+                in_channels=time_embed_input_dim, time_embed_dim=time_embed_dim
+            )
+        else:
+            self.ctrl_time_embedding = None
 
         if addition_embed_type is None:
             self.base_add_time_proj = None
@@ -632,6 +645,7 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
                     cross_attention_dim=cross_attention_dim[i],
                     add_downsample=not is_final_block,
                     upcast_attention=upcast_attention,
+                    use_linear_projection=use_linear_projection,
                 )
             )
 
@@ -647,6 +661,7 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
             ctrl_num_attention_heads=ctrl_num_attention_heads[-1],
             cross_attention_dim=cross_attention_dim[-1],
             upcast_attention=upcast_attention,
+            use_linear_projection=use_linear_projection,
         )
 
         # # Create up blocks
@@ -690,6 +705,7 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
                     add_upsample=not is_final_block,
                     upcast_attention=upcast_attention,
                     norm_num_groups=norm_num_groups,
+                    use_linear_projection=use_linear_projection,
                 )
             )
 
@@ -754,6 +770,7 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
             "addition_embed_type",
             "addition_time_embed_dim",
             "upcast_attention",
+            "use_linear_projection",
             "time_cond_proj_dim",
             "projection_class_embeddings_input_dim",
         ]
@@ -864,7 +881,7 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
 
         def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
             if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+                processors[f"{name}.processor"] = module.get_processor()
 
             for sub_name, child in module.named_children():
                 fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -985,6 +1002,8 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
             if isinstance(module, Attention):
                 module.fuse_projections(fuse=True)
 
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
     def unfuse_qkv_projections(self):
         """Disables the fused QKV projection if enabled.
@@ -1219,6 +1238,7 @@ class ControlNetXSCrossAttnDownBlock2D(nn.Module):
         cross_attention_dim: Optional[int] = 1024,
         add_downsample: bool = True,
         upcast_attention: Optional[bool] = False,
+        use_linear_projection: Optional[bool] = True,
     ):
         super().__init__()
         base_resnets = []
@@ -1270,7 +1290,7 @@ class ControlNetXSCrossAttnDownBlock2D(nn.Module):
                         in_channels=base_out_channels,
                         num_layers=transformer_layers_per_block[i],
                         cross_attention_dim=cross_attention_dim,
-                        use_linear_projection=True,
+                        use_linear_projection=use_linear_projection,
                         upcast_attention=upcast_attention,
                         norm_num_groups=norm_num_groups,
                     )
@@ -1282,7 +1302,7 @@ class ControlNetXSCrossAttnDownBlock2D(nn.Module):
                         in_channels=ctrl_out_channels,
                         num_layers=transformer_layers_per_block[i],
                         cross_attention_dim=cross_attention_dim,
-                        use_linear_projection=True,
+                        use_linear_projection=use_linear_projection,
                         upcast_attention=upcast_attention,
                         norm_num_groups=find_largest_factor(ctrl_out_channels, max_factor=ctrl_max_norm_num_groups),
                     )
@@ -1342,6 +1362,7 @@ class ControlNetXSCrossAttnDownBlock2D(nn.Module):
            ctrl_num_attention_heads = get_first_cross_attention(ctrl_downblock).heads
            cross_attention_dim = get_first_cross_attention(base_downblock).cross_attention_dim
            upcast_attention = get_first_cross_attention(base_downblock).upcast_attention
+           use_linear_projection = base_downblock.attentions[0].use_linear_projection
        else:
            has_crossattn = False
            transformer_layers_per_block = None
@@ -1349,6 +1370,7 @@ class ControlNetXSCrossAttnDownBlock2D(nn.Module):
            ctrl_num_attention_heads = None
            cross_attention_dim = None
            upcast_attention = None
+           use_linear_projection = None
        add_downsample = base_downblock.downsamplers is not None
 
        # create model
@@ -1367,6 +1389,7 @@ class ControlNetXSCrossAttnDownBlock2D(nn.Module):
            cross_attention_dim=cross_attention_dim,
            add_downsample=add_downsample,
            upcast_attention=upcast_attention,
+           use_linear_projection=use_linear_projection,
        )
 
        # # load weights
@@ -1527,6 +1550,7 @@ class ControlNetXSCrossAttnMidBlock2D(nn.Module):
         ctrl_num_attention_heads: Optional[int] = 1,
         cross_attention_dim: Optional[int] = 1024,
         upcast_attention: bool = False,
+        use_linear_projection: Optional[bool] = True,
     ):
         super().__init__()
 
@@ -1541,7 +1565,7 @@ class ControlNetXSCrossAttnMidBlock2D(nn.Module):
             resnet_groups=norm_num_groups,
             cross_attention_dim=cross_attention_dim,
             num_attention_heads=base_num_attention_heads,
-            use_linear_projection=True,
+            use_linear_projection=use_linear_projection,
             upcast_attention=upcast_attention,
         )
 
@@ -1556,7 +1580,7 @@ class ControlNetXSCrossAttnMidBlock2D(nn.Module):
             ),
             cross_attention_dim=cross_attention_dim,
             num_attention_heads=ctrl_num_attention_heads,
-            use_linear_projection=True,
+            use_linear_projection=use_linear_projection,
             upcast_attention=upcast_attention,
         )
 
@@ -1590,6 +1614,7 @@ class ControlNetXSCrossAttnMidBlock2D(nn.Module):
         ctrl_num_attention_heads = get_first_cross_attention(ctrl_midblock).heads
         cross_attention_dim = get_first_cross_attention(base_midblock).cross_attention_dim
         upcast_attention = get_first_cross_attention(base_midblock).upcast_attention
+        use_linear_projection = base_midblock.attentions[0].use_linear_projection
 
         # create model
         model = cls(
@@ -1603,6 +1628,7 @@ class ControlNetXSCrossAttnMidBlock2D(nn.Module):
             ctrl_num_attention_heads=ctrl_num_attention_heads,
             cross_attention_dim=cross_attention_dim,
             upcast_attention=upcast_attention,
+            use_linear_projection=use_linear_projection,
         )
 
         # load weights
@@ -1677,6 +1703,7 @@ class ControlNetXSCrossAttnUpBlock2D(nn.Module):
         cross_attention_dim: int = 1024,
         add_upsample: bool = True,
         upcast_attention: bool = False,
+        use_linear_projection: Optional[bool] = True,
     ):
         super().__init__()
         resnets = []
@@ -1714,7 +1741,7 @@ class ControlNetXSCrossAttnUpBlock2D(nn.Module):
                     in_channels=out_channels,
                     num_layers=transformer_layers_per_block[i],
                     cross_attention_dim=cross_attention_dim,
-                    use_linear_projection=True,
+                    use_linear_projection=use_linear_projection,
                     upcast_attention=upcast_attention,
                     norm_num_groups=norm_num_groups,
                 )
@@ -1753,12 +1780,14 @@ class ControlNetXSCrossAttnUpBlock2D(nn.Module):
            num_attention_heads = get_first_cross_attention(base_upblock).heads
            cross_attention_dim = get_first_cross_attention(base_upblock).cross_attention_dim
            upcast_attention = get_first_cross_attention(base_upblock).upcast_attention
+           use_linear_projection = base_upblock.attentions[0].use_linear_projection
        else:
            has_crossattn = False
            transformer_layers_per_block = None
            num_attention_heads = None
            cross_attention_dim = None
            upcast_attention = None
+           use_linear_projection = None
        add_upsample = base_upblock.upsamplers is not None
 
        # create model
@@ -1776,6 +1805,7 @@ class ControlNetXSCrossAttnUpBlock2D(nn.Module):
            cross_attention_dim=cross_attention_dim,
            add_upsample=add_upsample,
            upcast_attention=upcast_attention,
+           use_linear_projection=use_linear_projection,
        )
 
        # load weights
```
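
Taken together, these hunks stop hardcoding `use_linear_projection=True` inside ControlNet-XS: the flag is threaded through every adapter and block constructor, read back from existing modules in the `from_modules` helpers, and mirrored from the base UNet's config in `from_unet`. The same file also makes `ctrl_time_embedding` conditional on `ctrl_learn_time_embedding` and has `fuse_qkv_projections` install the new `FusedAttnProcessor2_0`. A minimal sketch of what the config mirroring means in practice (the checkpoint id and `size_ratio` value are illustrative assumptions):

```python
import torch
from diffusers import ControlNetXSAdapter, UNet2DConditionModel

# SD 1.5's UNet is configured with use_linear_projection=False; after this change an
# adapter derived from it should mirror that setting instead of silently using True.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16
)
adapter = ControlNetXSAdapter.from_unet(unet, size_ratio=0.1)

print(unet.config.use_linear_projection)     # False for SD 1.5
print(adapter.config.use_linear_projection)  # expected to match the base UNet
```

Matching the projection type keeps the adapter's transformer blocks structurally consistent with the base UNet, which is what the `from_modules` hunks above rely on when copying weights over.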
diffusers/models/downsampling.py CHANGED
```diff
@@ -285,6 +285,74 @@ class KDownsample2D(nn.Module):
         return F.conv2d(inputs, weight, stride=2)
 
 
+class CogVideoXDownsample3D(nn.Module):
+    # Todo: Wait for paper relase.
+    r"""
+    A 3D Downsampling layer using in [CogVideoX]() by Tsinghua University & ZhipuAI
+
+    Args:
+        in_channels (`int`):
+            Number of channels in the input image.
+        out_channels (`int`):
+            Number of channels produced by the convolution.
+        kernel_size (`int`, defaults to `3`):
+            Size of the convolving kernel.
+        stride (`int`, defaults to `2`):
+            Stride of the convolution.
+        padding (`int`, defaults to `0`):
+            Padding added to all four sides of the input.
+        compress_time (`bool`, defaults to `False`):
+            Whether or not to compress the time dimension.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 2,
+        padding: int = 0,
+        compress_time: bool = False,
+    ):
+        super().__init__()
+
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.compress_time:
+            batch_size, channels, frames, height, width = x.shape
+
+            # (batch_size, channels, frames, height, width) -> (batch_size, height, width, channels, frames) -> (batch_size * height * width, channels, frames)
+            x = x.permute(0, 3, 4, 1, 2).reshape(batch_size * height * width, channels, frames)
+
+            if x.shape[-1] % 2 == 1:
+                x_first, x_rest = x[..., 0], x[..., 1:]
+                if x_rest.shape[-1] > 0:
+                    # (batch_size * height * width, channels, frames - 1) -> (batch_size * height * width, channels, (frames - 1) // 2)
+                    x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
+
+                x = torch.cat([x_first[..., None], x_rest], dim=-1)
+                # (batch_size * height * width, channels, (frames // 2) + 1) -> (batch_size, height, width, channels, (frames // 2) + 1) -> (batch_size, channels, (frames // 2) + 1, height, width)
+                x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2)
+            else:
+                # (batch_size * height * width, channels, frames) -> (batch_size * height * width, channels, frames // 2)
+                x = F.avg_pool1d(x, kernel_size=2, stride=2)
+                # (batch_size * height * width, channels, frames // 2) -> (batch_size, height, width, channels, frames // 2) -> (batch_size, channels, frames // 2, height, width)
+                x = x.reshape(batch_size, height, width, channels, x.shape[-1]).permute(0, 3, 4, 1, 2)
+
+        # Pad the tensor
+        pad = (0, 1, 0, 1)
+        x = F.pad(x, pad, mode="constant", value=0)
+        batch_size, channels, frames, height, width = x.shape
+        # (batch_size, channels, frames, height, width) -> (batch_size, frames, channels, height, width) -> (batch_size * frames, channels, height, width)
+        x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * frames, channels, height, width)
+        x = self.conv(x)
+        # (batch_size * frames, channels, height, width) -> (batch_size, frames, channels, height, width) -> (batch_size, channels, frames, height, width)
+        x = x.reshape(batch_size, frames, x.shape[1], x.shape[2], x.shape[3]).permute(0, 2, 1, 3, 4)
+        return x
+
+
 def downsample_2d(
     hidden_states: torch.Tensor,
     kernel: Optional[torch.Tensor] = None,
```
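
The new `CogVideoXDownsample3D` halves height and width with a strided 2D convolution applied frame by frame and, when `compress_time=True`, first halves the frame count with a 1D average pool (keeping the first frame intact when the count is odd). A quick shape check, assuming the class is imported from the module this diff adds it to:

```python
import torch
from diffusers.models.downsampling import CogVideoXDownsample3D  # added above

down = CogVideoXDownsample3D(in_channels=4, out_channels=8, compress_time=True)

x = torch.randn(1, 4, 8, 32, 32)  # (batch, channels, frames, height, width)
out = down(x)
print(out.shape)  # torch.Size([1, 8, 4, 16, 16]): frames, height, and width all halved

# With an odd frame count the first frame is passed through unpooled:
# 9 input frames -> 1 + avg_pool1d over the remaining 8 -> 5 output frames.
```

Height and width land on 16 because the forward pass right-pads each spatial dimension by one (32 -> 33) before the kernel-3, stride-2 convolution.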