broccoli-ml 0.29.1-py3-none-any.whl → 10.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
broccoli/transformer.py CHANGED
@@ -1,16 +1,72 @@
1
+ import warnings
1
2
  import math
2
- from collections import OrderedDict
3
- from typing import Optional
4
- from numpy import random
3
+ from typing import Optional, Tuple
5
4
 
6
5
  import torch
7
6
  import torch.nn as nn
8
7
  import torch.nn.functional as F
8
+ from torch.utils.checkpoint import checkpoint
9
9
 
10
10
  from einops import rearrange
11
11
 
12
12
  from .rope import RotaryEmbedding, apply_rotary_emb
13
- from .linear import AnchoredLinear, SpectralNormLinear
13
+
14
+ try:
15
+ from flash_attn import flash_attn_func
16
+
17
+ print("Using flash-attn.")
18
+ FLASH_ATTN = True
19
+ except ImportError:
20
+ pass
21
+ FLASH_ATTN = False
22
+
23
+
24
+ class LayerScale(nn.Module):
25
+ def __init__(self, dim, init_values=1e-4):
26
+ super().__init__()
27
+ self.nondecay_scale = nn.Parameter(init_values * torch.ones(dim))
28
+
29
+ def forward(self, x):
30
+ return x * self.nondecay_scale
31
+
32
+
33
+ def drop_path(
34
+ x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
35
+ ):
36
+ """
37
+ From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
38
+ Copyright 2019 Ross Wightman
39
+ See documentation and licence there.
40
+ """
41
+ if drop_prob == 0.0 or not training:
42
+ return x
43
+ keep_prob = 1 - drop_prob
44
+ shape = (x.shape[0],) + (1,) * (
45
+ x.ndim - 1
46
+ ) # work with diff dim tensors, not just 2D ConvNets
47
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
48
+ if keep_prob > 0.0 and scale_by_keep:
49
+ random_tensor.div_(keep_prob)
50
+ return x * random_tensor
51
+
52
+
53
+ class DropPath(nn.Module):
54
+ """
55
+ From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
56
+ Copyright 2019 Ross Wightman
57
+ See documentation and licence there.
58
+ """
59
+
60
+ def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
61
+ super(DropPath, self).__init__()
62
+ self.drop_prob = drop_prob
63
+ self.scale_by_keep = scale_by_keep
64
+
65
+ def forward(self, x):
66
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
67
+
68
+ def extra_repr(self):
69
+ return f"drop_prob={round(self.drop_prob, 3):0.3f}"
14
70
 
15
71
 
16
72
  class MHAttention(nn.Module):
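The `LayerScale` and `DropPath` helpers added above are the usual residual-branch regularisers: a learnable per-channel gain initialised to 1e-4, and stochastic depth that zeroes the residual branch for a random subset of samples during training (rescaling the survivors). A minimal sketch of how the two compose around a residual connection; `branch` and `drop_path_demo` are illustrative stand-ins, not names from this package:

```python
import torch
import torch.nn as nn

# Standalone demo of the pattern used later in TransformerBlock:
# y = x + DropPath(LayerScale(branch(x)))
dim = 8
branch = nn.Linear(dim, dim)                        # stand-in for attention or the MLP
layerscale = nn.Parameter(1e-4 * torch.ones(dim))   # LayerScale: tiny per-channel gain

def drop_path_demo(x, drop_prob=0.1, training=True):
    # Zero the residual branch for a random subset of samples, rescaling the rest
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    mask = x.new_empty((x.shape[0],) + (1,) * (x.ndim - 1)).bernoulli_(keep_prob)
    return x * mask / keep_prob

x = torch.randn(4, 16, dim)                          # (batch, tokens, features)
y = x + drop_path_demo(layerscale * branch(x))
print(y.shape)  # torch.Size([4, 16, 8])
```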
@@ -21,45 +77,6 @@ class MHAttention(nn.Module):
21
77
  are the same shape.
22
78
 
23
79
  Assumes bias=False and batch_first=True, as God intended.
24
-
25
- Optionally adds various bells and whistles suggested in the
26
- literature, including:
27
-
28
- Noam Shazeer's scaled attention per "Attention is All You Need"
29
- (https://arxiv.org/abs/1706.03762).
30
-
31
- Max subtract softmax as discussed in "Attention As An RNN"
32
- (https://arxiv.org/abs/2405.13956)
33
-
34
- Log-length scaled softmax per "Overcoming a Theoretical Limitation of
35
- Self-Attention" (https://arxiv.org/abs/2202.12172).
36
-
37
- Quiet softmax per
38
- https://www.evanmiller.org/attention-is-off-by-one.html
39
-
40
- Args:
41
- d_model: ...
42
- n_heads: ...
43
- dropout: ...
44
- causal: should a causal mask be applied to the logits before attention
45
- is applied? This is standard when using self-attention. Cannot be
46
- True if inputs won't be square (e.g. if sequence length for
47
- encoder and decoder are different)
48
- sequence_length: ...
49
- share_kv: ...
50
- linear_module: ...
51
- max_subtract: if True, the maximum logit value is subtracted from all
52
- logits before performing the softmax operation to create a more
53
- numerically stable softmax. This is discussed in "Attention As An
54
- RNN" (https://arxiv.org/abs/2405.13956).
55
- d_model_scale: ...
56
- log_length_scale: if True, multiplies logits by the log length of
57
- the decoder sequence before performing the softmax operation, as
58
- proposed in "Overcoming a Theoretical Limitation of Self-Attention"
59
- (https://arxiv.org/abs/2202.12172).
60
- quiet: if True, adds 1 to the denominator of the softmax operation,
61
- allowing some tokens to attend to no other tokens as described in
62
- https://www.evanmiller.org/attention-is-off-by-one.html.
63
80
  """
64
81
 
65
82
  def __init__(
@@ -70,10 +87,19 @@ class MHAttention(nn.Module):
70
87
  causal=False,
71
88
  seq_len=None,
72
89
  linear_module: nn.Module = nn.Linear,
73
- bos_tokens=0,
90
+ utility_tokens=0,
91
+ talking_heads=False,
74
92
  rotary_embedding=None,
75
93
  source_size=None,
94
+ scaling="d",
76
95
  ):
96
+ """
97
+ Args:
98
+ scaling: how should the attention logits be scaled? Can be "sqrtd"
99
+ to mimic the original "Attention Is All You Need" approach of
100
+ dividing by the square root of the head dimension, or "d" per
101
+ "Tensor Programs V...". Default: "d".
102
+ """
77
103
  super().__init__()
78
104
 
79
105
  if rotary_embedding is not None:
@@ -81,12 +107,31 @@ class MHAttention(nn.Module):
81
107
  if causal:
82
108
  assert seq_len is not None
83
109
 
110
+ self.talking_heads = talking_heads
111
+
112
+ if self.talking_heads:
113
+ self.head_projection = nn.Linear(n_heads, n_heads, bias=False)
114
+ self.sample_projection = nn.Linear(n_heads, n_heads, bias=False)
115
+ else:
116
+ self.head_projection = None
117
+ self.sample_projection = None
118
+
84
119
  self.embed_dim = embed_dim
85
120
  self.n_heads = n_heads
86
121
  assert embed_dim % n_heads == 0
122
+ self.scaling = scaling
87
123
 
88
124
  self.head_dim = self.embed_dim // self.n_heads
89
125
 
126
+ if self.scaling == "sqrtd":
127
+ self.scaling_factor = 1 / math.sqrt(self.head_dim)
128
+ elif self.scaling == "d":
129
+ # 8/head_dim for backwards compatibility,
130
+ # per https://github.com/microsoft/mup
131
+ self.scaling_factor = 8 / self.head_dim
132
+ else:
133
+ raise ValueError('`scaling` argument to MHAttention must be "d" or "sqrtd"')
134
+
90
135
  self.q_proj = linear_module(self.embed_dim, self.embed_dim, bias=False)
91
136
  self.k_proj = linear_module(self.embed_dim, self.embed_dim, bias=False)
92
137
  self.v_proj = linear_module(self.embed_dim, self.embed_dim, bias=False)
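The `scaling` option above chooses between the classic 1/sqrt(d_k) factor ("sqrtd") and a muP-style 1/d factor ("d"), implemented as 8/head_dim so that the two coincide at a head dimension of 64. A quick numeric check, assuming example head dimensions:

```python
import math

for head_dim in (64, 256):              # assumed example head dimensions
    sqrtd = 1 / math.sqrt(head_dim)     # "sqrtd": 1/sqrt(d_k), as in Attention Is All You Need
    mup = 8 / head_dim                  # "d": muP-style 1/d, with 8 making both match at d=64
    print(head_dim, round(sqrtd, 4), round(mup, 4))
# 64 0.125 0.125    <- identical at head_dim 64
# 256 0.0625 0.0312 <- the "d" variant shrinks faster as heads widen
```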
@@ -105,7 +150,9 @@ class MHAttention(nn.Module):
105
150
  )
106
151
  self.rotary_embedding = rotary_embedding
107
152
  self.source_size = source_size
108
- self.bos_tokens = bos_tokens
153
+ self.utility_tokens = utility_tokens
154
+
155
+ self.reset_parameters()
109
156
 
110
157
  @property
111
158
  def _kv_distance(self) -> float:
@@ -126,7 +173,71 @@ class MHAttention(nn.Module):
126
173
 
127
174
  return 1 - similarity
128
175
 
129
- def forward(self, q, k, v):
176
+ def add_axial_rope(
177
+ self, q: torch.Tensor, k: torch.Tensor
178
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
179
+ """
180
+ Apply Axial RoPE to all tokens except utility tokens
181
+ """
182
+
183
+ if len(self.source_size) == 1:
184
+ spatial_dimension_names = "D1"
185
+ spatial_dimension_values = {"D1": self.source_size[0]}
186
+ elif len(self.source_size) == 2:
187
+ spatial_dimension_names = "D1 D2"
188
+ spatial_dimension_values = {
189
+ "D1": self.source_size[0],
190
+ "D2": self.source_size[1],
191
+ }
192
+ elif len(self.source_size) == 3:
193
+ spatial_dimension_names = "D1 D2 D3"
194
+ spatial_dimension_values = {
195
+ "D1": self.source_size[0],
196
+ "D2": self.source_size[1],
197
+ "D3": self.source_size[2],
198
+ }
199
+ else:
200
+ raise NotImplementedError(
201
+ "`source_size` must be a tuple of 1, 2 or 3 integers"
202
+ )
203
+
204
+ q_util, q_img = q[:, : self.utility_tokens, :], q[:, self.utility_tokens :, :]
205
+ k_util, k_img = k[:, : self.utility_tokens, :], k[:, self.utility_tokens :, :]
206
+
207
+ q_img = rearrange(
208
+ q_img,
209
+ f"b ({spatial_dimension_names}) d -> b {spatial_dimension_names} d",
210
+ **spatial_dimension_values,
211
+ )
212
+ k_img = rearrange(
213
+ k_img,
214
+ f"b ({spatial_dimension_names}) d -> b {spatial_dimension_names} d",
215
+ **spatial_dimension_values,
216
+ )
217
+
218
+ freqs = self.rotary_embedding.get_axial_freqs(*self.source_size)
219
+
220
+ q_img = apply_rotary_emb(freqs, q_img)
221
+ k_img = apply_rotary_emb(freqs, k_img)
222
+
223
+ q_img = rearrange(
224
+ q_img,
225
+ f"b {spatial_dimension_names} d -> b ({spatial_dimension_names}) d",
226
+ )
227
+ k_img = rearrange(
228
+ k_img,
229
+ f"b {spatial_dimension_names} d -> b ({spatial_dimension_names}) d",
230
+ )
231
+
232
+ # Re-combine the utility tokens and the RoPE-enhanced sequence tokens
233
+ q = torch.cat([q_util, q_img], dim=1)
234
+ k = torch.cat([k_util, k_img], dim=1)
235
+
236
+ return q, k
237
+
238
+ def project_qkv(
239
+ self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor
240
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
130
241
  query_batch_size, query_tokens, query_features = q.size()
131
242
  key_batch_size, key_tokens, key_features = k.size()
132
243
 
@@ -139,66 +250,74 @@ class MHAttention(nn.Module):
139
250
 
140
251
  if self.causal:
141
252
  assert query_tokens == key_tokens
142
- assert query_tokens == self.sequence_length
253
+ assert query_tokens == self.seq_len
143
254
 
144
- # Project q, k and v
145
- q = self.q_proj(q)
146
- k = self.k_proj(k)
147
- v = self.v_proj(v)
255
+ q, k, v = self.q_proj(q), self.k_proj(k), self.v_proj(v)
148
256
 
149
- # Rearrange dimensions and add RoPE if needed
150
257
  if self.rotary_embedding is not None:
258
+ q, k = self.add_axial_rope(q, k)
151
259
 
152
- if len(self.source_size) == 1:
153
- spatial_dimension_names = "D1"
154
- spatial_dimension_values = {"D1": self.source_size[0]}
155
- elif len(self.source_size) == 2:
156
- spatial_dimension_names = "D1 D2"
157
- spatial_dimension_values = {
158
- "D1": self.source_size[0],
159
- "D2": self.source_size[1],
160
- }
161
- elif len(self.source_size) == 3:
162
- spatial_dimension_names = "D1 D2 D3"
163
- spatial_dimension_values = {
164
- "D1": self.source_size[0],
165
- "D2": self.source_size[1],
166
- "D3": self.source_size[2],
167
- }
168
- else:
169
- raise NotImplementedError(
170
- "`source_size` must be a tuple of 1, 2 or 3 integers"
171
- )
260
+ return q, k, v
172
261
 
173
- q_bos, q_img = q[:, : self.bos_tokens, :], q[:, self.bos_tokens :, :]
174
- k_bos, k_img = k[:, : self.bos_tokens, :], k[:, self.bos_tokens :, :]
262
+ def forward(self, q, k, v):
175
263
 
176
- q_img = rearrange(
177
- q_img,
178
- f"b ({spatial_dimension_names}) d -> b {spatial_dimension_names} d",
179
- **spatial_dimension_values,
180
- )
181
- k_img = rearrange(
182
- k_img,
183
- f"b ({spatial_dimension_names}) d -> b {spatial_dimension_names} d",
184
- **spatial_dimension_values,
264
+ q, k, v = self.project_qkv(q, k, v)
265
+
266
+ if FLASH_ATTN and not self.talking_heads:
267
+ # Divide Q/K/V into heads
268
+ q = rearrange(q, "b t (h d) -> b t h d", h=self.n_heads)
269
+ k = rearrange(k, "b t (h d) -> b t h d", h=self.n_heads)
270
+ v = rearrange(v, "b t (h d) -> b t h d", h=self.n_heads)
271
+
272
+ output_with_heads = flash_attn_func(
273
+ q,
274
+ k,
275
+ v,
276
+ dropout_p=self.dropout.p if self.training else 0.0,
277
+ softmax_scale=self.scaling_factor,
278
+ causal=self.causal,
185
279
  )
186
- freqs = self.rotary_embedding.get_axial_freqs(*self.source_size)
187
- q_img = apply_rotary_emb(freqs, q_img)
188
- k_img = apply_rotary_emb(freqs, k_img)
189
280
 
190
- q_img = rearrange(
191
- q_img,
192
- f"b {spatial_dimension_names} d -> b ({spatial_dimension_names}) d",
193
- )
194
- k_img = rearrange(
195
- k_img,
196
- f"b {spatial_dimension_names} d -> b ({spatial_dimension_names}) d",
197
- )
281
+ output_without_heads = rearrange(output_with_heads, "b t h d -> b t (h d)")
282
+
283
+ return self.out_proj(output_without_heads)
284
+ else:
285
+ # Divide Q/K/V into heads
286
+ q = rearrange(q, "b t (h d) -> b h t d", h=self.n_heads)
287
+ k = rearrange(k, "b t (h d) -> b h t d", h=self.n_heads)
288
+ v = rearrange(v, "b t (h d) -> b h t d", h=self.n_heads)
198
289
 
199
- # Re-combine the BOS tokens and the RoPE-enhanced image tokens
200
- q = torch.cat([q_bos, q_img], dim=1)
201
- k = torch.cat([k_bos, k_img], dim=1)
290
+ qk_scores = q @ k.transpose(-1, -2)
291
+
292
+ qk_scores *= self.scaling_factor
293
+
294
+ if self.talking_heads:
295
+ qk_scores = torch.einsum(
296
+ "b h i j, o h -> b o i j", qk_scores, self.head_projection.weight
297
+ )
298
+
299
+ # Apply mask if causal (must come before softmax)
300
+ if self.causal:
301
+ qk_scores.masked_fill_(self.mask, float("-inf"))
302
+
303
+ qk_scores = F.softmax(qk_scores, dim=-1)
304
+
305
+ if self.talking_heads:
306
+ qk_scores = torch.einsum(
307
+ "b h i j, o h -> b o i j", qk_scores, self.sample_projection.weight
308
+ )
309
+
310
+ qk_scores = self.dropout(qk_scores)
311
+
312
+ output_with_heads = qk_scores @ v
313
+
314
+ output_without_heads = rearrange(output_with_heads, "b h t d -> b t (h d)")
315
+
316
+ return self.out_proj(output_without_heads)
317
+
318
+ def attention_logits(self, q, k, v):
319
+
320
+ q, k, v = self.project_qkv(q, k, v)
202
321
 
203
322
  # Divide Q/K/V into heads
204
323
  q = rearrange(q, "b t (h d) -> b h t d", h=self.n_heads)
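With flash-attn installed (and `talking_heads` off), `forward` keeps tensors in (batch, tokens, heads, head_dim) layout for `flash_attn_func`; otherwise it moves heads to a leading dimension and computes softmax(QKᵀ · scale)V explicitly. A small sketch of that fallback arithmetic with assumed toy shapes; it does not require flash-attn:

```python
import torch
import torch.nn.functional as F
from einops import rearrange

b, t, h, d = 2, 5, 4, 8                 # assumed batch, tokens, heads, head_dim
q = torch.randn(b, t, h * d)
k = torch.randn(b, t, h * d)
v = torch.randn(b, t, h * d)

# Fallback layout: heads become a leading dimension
q = rearrange(q, "b t (h d) -> b h t d", h=h)
k = rearrange(k, "b t (h d) -> b h t d", h=h)
v = rearrange(v, "b t (h d) -> b h t d", h=h)

scores = (q @ k.transpose(-1, -2)) * (8 / d)   # muP-style "d" scaling from this module
attn = F.softmax(scores, dim=-1)
out = rearrange(attn @ v, "b h t d -> b t (h d)")
print(out.shape)  # torch.Size([2, 5, 32])
```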
@@ -207,19 +326,24 @@ class MHAttention(nn.Module):
207
326
 
208
327
  qk_scores = q @ k.transpose(-1, -2)
209
328
 
210
- qk_scores /= math.sqrt(self.head_dim)
329
+ qk_scores *= self.scaling_factor
211
330
 
212
331
  # Apply mask if causal (must come before softmax)
213
332
  if self.causal:
214
333
  qk_scores.masked_fill_(self.mask, float("-inf"))
215
334
 
216
- qk_scores = F.softmax(qk_scores, dim=-1)
217
-
218
- output_with_heads = qk_scores @ v
335
+ return qk_scores # (batch, head, seq_len, seq_len)
219
336
 
220
- output_without_heads = rearrange(output_with_heads, "b h t d -> b t (h d)")
221
-
222
- return self.out_proj(output_without_heads)
337
+ def reset_parameters(self):
338
+ # Default nn.Linear init is kaiming_uniform, which is fine
339
+ self.q_proj.reset_parameters()
340
+ self.k_proj.reset_parameters()
341
+ self.v_proj.reset_parameters()
342
+ self.out_proj.reset_parameters()
343
+ if self.talking_heads:
344
+ # Initialize close to identity
345
+ nn.init.eye_(self.head_projection.weight)
346
+ nn.init.eye_(self.sample_projection.weight)
223
347
 
224
348
 
225
349
  class FeedforwardBlock(nn.Module):
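`talking_heads` mixes information across heads with learned n_heads × n_heads projections, applied to the logits before the softmax and to the attention weights after it (per Shazeer et al., "Talking-Heads Attention"). Because `reset_parameters` uses `nn.init.eye_`, the projections start as the identity, so the block initially behaves like vanilla attention. A sketch of the pre-softmax mixing step with assumed toy shapes:

```python
import torch
import torch.nn as nn

n_heads, tokens = 4, 6
scores = torch.randn(2, n_heads, tokens, tokens)      # (batch, head, query, key) logits

head_projection = nn.Linear(n_heads, n_heads, bias=False)
nn.init.eye_(head_projection.weight)                  # identity init: starts as a no-op

# Mix the head dimension: output head o is a learned combination of input heads h
mixed = torch.einsum("b h i j, o h -> b o i j", scores, head_projection.weight)

print(torch.allclose(mixed, scores))  # True while the projection is still the identity
```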
@@ -235,44 +359,123 @@ class FeedforwardBlock(nn.Module):
235
359
  activation=nn.ReLU,
236
360
  activation_kwargs=None,
237
361
  dropout=0.0,
238
- linear_module=nn.Linear,
362
+ inner_dropout=None,
363
+ outer_dropout=None,
364
+ linear_module_up=nn.Linear,
365
+ linear_module_down=nn.Linear,
239
366
  pre_norm=True,
240
367
  normformer=False,
241
- raw_input=False,
368
+ post_norm=True,
369
+ residual_path=True,
370
+ checkpoint=True,
242
371
  ):
243
372
  super().__init__()
244
373
 
374
+ self.checkpoint = checkpoint
375
+ self.residual_path = residual_path
376
+ self.post_norm = post_norm
377
+ self.xglu = activation.__name__.endswith("GLU")
378
+
379
+ if self.residual_path and (output_features < input_features):
380
+ raise ValueError(
381
+ "If the number of output features will be less than "
382
+ "the number of input features, then `residual_path` "
383
+ "should be set to False."
384
+ )
385
+
386
+ if self.post_norm:
387
+ self.layernorm = nn.LayerNorm(output_features)
388
+
245
389
  if activation_kwargs is not None:
246
390
  self.activation = activation(**activation_kwargs)
247
391
  else:
248
392
  self.activation = activation()
249
393
 
250
- if raw_input:
251
- self.memory_type = SpectralNormLinear
252
- else:
253
- self.memory_type = nn.Linear
254
-
255
- self.dropout = nn.Dropout(dropout)
394
+ self.inner_dropout = nn.Dropout(
395
+ inner_dropout if inner_dropout is not None else dropout
396
+ )
397
+ self.outer_dropout = nn.Dropout(
398
+ outer_dropout if outer_dropout is not None else dropout
399
+ )
256
400
 
257
401
  self.max_features = (
258
- 2 * ratio * output_features
259
- if activation.__name__.endswith("GLU")
260
- else ratio * output_features
402
+ 2 * int(ratio * output_features)
403
+ if self.xglu
404
+ else int(ratio * output_features)
405
+ )
406
+
407
+ self.linear_in = linear_module_up(input_features, self.max_features)
408
+ self.linear_out = linear_module_down(
409
+ int(ratio * output_features), output_features
261
410
  )
262
411
 
263
412
  self.process = nn.Sequential(
264
413
  *[
265
414
  nn.LayerNorm(input_features) if pre_norm else nn.Identity(),
266
- linear_module(input_features, self.max_features),
415
+ self.linear_in,
267
416
  self.activation,
268
- nn.LayerNorm(ratio * output_features) if normformer else nn.Identity(),
269
- self.memory_type(ratio * output_features, output_features),
270
- self.dropout,
417
+ self.inner_dropout,
418
+ (
419
+ nn.LayerNorm(int(ratio * output_features))
420
+ if normformer
421
+ else nn.Identity()
422
+ ),
423
+ self.linear_out,
424
+ self.outer_dropout,
271
425
  ]
272
426
  )
273
427
 
428
+ self.recycling_enabled = False
429
+ if hasattr(self.linear_in, "row_recycling_rate") and hasattr(
430
+ self.linear_out, "column_recycling_rate"
431
+ ):
432
+ self.recycling_enabled = True
433
+ self.master_recycling_rate = self.linear_in.row_recycling_rate
434
+ self.linear_in.row_recycling_rate = 0.0
435
+ self.linear_out.column_recycling_rate = 0.0
436
+ if (
437
+ hasattr(self.linear_in, "column_recycling_rate")
438
+ and self.linear_in.column_recycling_rate > 0
439
+ ) or (
440
+ hasattr(self.linear_out, "row_recycling_rate")
441
+ and self.linear_out.row_recycling_rate > 0
442
+ ):
443
+ raise NotImplementedError(
444
+ "At the moment this layer can only support recycling linear "
445
+ "layers if the in layer resets only rows and the out layer "
446
+ "resets only columns."
447
+ )
448
+
449
+ self.reset_parameters()
450
+
274
451
  def forward(self, x):
275
- return self.process(x)
452
+
453
+ # Recycle weights if using recycling linear layers
454
+ if self.training and self.recycling_enabled:
455
+ indices = self.linear_out.get_reset_indices(1)
456
+ self.linear_in.reset_rows(indices, incoming_data=x)
457
+ self.linear_out.reset_columns(indices)
458
+
459
+ if self.checkpoint:
460
+ processed = checkpoint(self.process, x, use_reentrant=False)
461
+ else:
462
+ processed = self.process(x)
463
+
464
+ if self.residual_path and self.post_norm:
465
+ return self.layernorm(x + processed)
466
+ elif self.residual_path:
467
+ return x + processed
468
+ else:
469
+ return processed
470
+
471
+ def reset_parameters(self):
472
+ if self.post_norm:
473
+ self.layernorm.reset_parameters()
474
+
475
+ # Iterate over the sequential block to reset parameters
476
+ for module in self.process:
477
+ if hasattr(module, "reset_parameters"):
478
+ module.reset_parameters()
276
479
 
277
480
 
278
481
  class TransformerBlock(nn.Module):
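The rebuilt FeedforwardBlock doubles the up-projection width whenever the activation class name ends in "GLU" (gated activations split their input into value and gate halves) and can wrap the whole Sequential in gradient checkpointing to save activation memory. A sketch of both ideas; `GeGLU` here is a hypothetical stand-in activation, not an import from this package:

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class GeGLU(nn.Module):
    """Hypothetical gated activation: half the features gate the other half."""
    def forward(self, x):
        a, b = x.chunk(2, dim=-1)
        return a * torch.nn.functional.gelu(b)

d_model, ratio = 64, 4
hidden = int(ratio * d_model)                     # 256 features after the activation
is_glu = GeGLU.__name__.endswith("GLU")           # True -> up-projection width is doubled

process = nn.Sequential(
    nn.LayerNorm(d_model),
    nn.Linear(d_model, 2 * hidden if is_glu else hidden),
    GeGLU() if is_glu else nn.ReLU(),
    nn.Linear(hidden, d_model),
)

x = torch.randn(2, 10, d_model, requires_grad=True)
y = checkpoint(process, x, use_reentrant=False)   # recompute activations during backward
print(y.shape)  # torch.Size([2, 10, 64])
```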
@@ -289,30 +492,57 @@ class TransformerBlock(nn.Module):
289
492
  seq_len,
290
493
  d_model,
291
494
  n_heads,
292
- position_embedding_type="absolute", # absolute or relative
495
+ relative_position_embedding=False,
293
496
  source_size=None,
294
- bos_tokens=0,
497
+ utility_tokens=0,
498
+ talking_heads=False,
295
499
  mlp_ratio=4,
296
500
  activation: nn.Module = nn.ReLU,
297
501
  activation_kwargs: Optional[dict] = None,
298
- mlp_dropout=0.0,
502
+ ff_linear_module_up=None,
503
+ ff_linear_module_down=None,
504
+ msa_scaling="d",
505
+ ff_dropout=0.0,
506
+ ff_inner_dropout=0.0,
507
+ ff_outer_dropout=0.0,
299
508
  msa_dropout=0.0,
300
509
  identity_probability=0.0,
301
510
  causal=False,
302
511
  linear_module=nn.Linear,
303
512
  pre_norm=True,
513
+ post_norm=False,
304
514
  normformer=False,
515
+ checkpoint_ff=True,
516
+ layerscale=True,
305
517
  ):
518
+ """
519
+ Args:
520
+ msa_scaling: how should the attention logits be scaled? Can be "sqrtd"
521
+ to mimic the original Attention is All You Need approach of
522
+ dividing by the sqrt of the embedding Dimension or "d" per
523
+ "Tensor Programs V...". Default "d"
524
+ """
525
+
306
526
  super().__init__()
307
527
 
308
528
  self.pre_norm = pre_norm
529
+ self.post_norm = post_norm
530
+ self.normformer = normformer
309
531
 
310
- self.identity_probability = identity_probability
532
+ self.drop_path = DropPath(drop_prob=identity_probability, scale_by_keep=True)
311
533
 
312
534
  self.layer_norm_1 = nn.LayerNorm(d_model)
313
535
  self.layer_norm_2 = nn.LayerNorm(d_model)
536
+ self.layer_norm_3 = nn.LayerNorm(d_model)
314
537
 
315
- if position_embedding_type == "relative":
538
+ if layerscale:
539
+ self.layerscale1 = LayerScale(d_model)
540
+ self.layerscale2 = LayerScale(d_model)
541
+ else:
542
+ self.layerscale1 = nn.Identity()
543
+ self.layerscale2 = nn.Identity()
544
+
545
+ if relative_position_embedding:
316
546
  max_freq = int(max(source_size) / 2) # Suggested by Gemini!
317
547
  if d_model < 16:
318
548
  dim = d_model
@@ -333,63 +563,87 @@ class TransformerBlock(nn.Module):
333
563
  linear_module=linear_module,
334
564
  rotary_embedding=self.rotary_embedding,
335
565
  source_size=source_size,
336
- bos_tokens=bos_tokens,
566
+ utility_tokens=utility_tokens,
567
+ talking_heads=talking_heads,
568
+ scaling=msa_scaling,
337
569
  )
338
570
 
339
- # Submodules for the feedforward process
571
+ # Submodule for the feedforward process
340
572
  self.ff = FeedforwardBlock(
341
573
  d_model,
342
574
  mlp_ratio,
343
575
  d_model,
344
576
  activation=activation,
345
577
  activation_kwargs=activation_kwargs,
346
- dropout=mlp_dropout,
347
- linear_module=linear_module,
348
- pre_norm=pre_norm,
578
+ dropout=ff_dropout,
579
+ inner_dropout=ff_inner_dropout,
580
+ outer_dropout=ff_outer_dropout,
581
+ linear_module_up=(
582
+ ff_linear_module_up
583
+ if ff_linear_module_up is not None
584
+ else linear_module
585
+ ),
586
+ linear_module_down=(
587
+ ff_linear_module_down
588
+ if ff_linear_module_down is not None
589
+ else linear_module
590
+ ),
591
+ pre_norm=False, # Handled outside the block
349
592
  normformer=normformer,
593
+ post_norm=False, # Handled outside the block
594
+ residual_path=False, # Handled outside the block
595
+ checkpoint=checkpoint_ff,
350
596
  )
351
597
 
598
+ self.reset_parameters()
599
+
352
600
  @property
353
601
  def _kv_distance(self) -> float:
354
602
  return self.attn._kv_distance
355
603
 
356
604
  def forward(self, x):
357
- if not self.training:
358
- identity_probability = 0.0
359
- else:
360
- identity_probability = self.identity_probability
361
-
362
- # perform the identity operation for some rows in the batch
363
- identity_count = random.binomial(n=x.size(0), p=identity_probability)
364
- shuffle_indices = torch.randperm(x.size(0), device=x.device)
365
- unshuffle_indices = torch.argsort(shuffle_indices)
366
- shuffled = x[shuffle_indices, :, :]
367
- identity_x = shuffled[:identity_count, :, :]
368
- process_x = shuffled[identity_count:, :, :]
369
605
 
370
606
  if self.pre_norm:
371
- norm_process_x = self.layer_norm_1(process_x)
372
- process_x = process_x + self.attn(
373
- norm_process_x, norm_process_x, norm_process_x
374
- )
375
- process_x = process_x + self.ff(process_x)
376
- else: # post-norm
377
- process_x = process_x + self.attn(process_x, process_x, process_x)
378
- norm_process_x = self.layer_norm_1(process_x)
379
- process_x = process_x + self.ff(process_x)
380
-
381
- # Always post norm as eventually we reach the classification head!
382
- x = self.layer_norm_2(
383
- torch.cat([identity_x, process_x])[unshuffle_indices, :, :].contiguous()
384
- )
607
+ x = self.layer_norm_1(x)
608
+ x = x + self.drop_path(self.layerscale1(self.attn(x, x, x)))
609
+ x = self.layer_norm_2(x)
610
+ x = x + self.drop_path(self.layerscale2(self.ff(x)))
611
+ if self.post_norm: # i.e. in addition! Pre and post.
612
+ x = self.layer_norm_3(x)
613
+ elif self.post_norm: # i.e. only, not prenorm, just post
614
+ x = x + self.drop_path(self.layerscale1(self.attn(x, x, x)))
615
+ x = self.layer_norm_1(x)
616
+ x = x + self.drop_path(self.layerscale2(self.ff(x)))
617
+ x = self.layer_norm_2(x)
618
+ else: # Not pre or post norm. Stand well back.
619
+ x = x + self.drop_path(self.layerscale1(self.attn(x, x, x)))
620
+ x = x + self.drop_path(self.layerscale2(self.ff(x)))
385
621
 
386
622
  return x
387
623
 
624
+ def attention_logits(self, x):
625
+ """
626
+ Return the pre-softmax attention logits produced by this block's attention module.
627
+ """
628
+ if self.pre_norm:
629
+ x = self.layer_norm_1(x)
630
+ return self.attn.attention_logits(x, x, x)
631
+ else:
632
+ return self.attn.attention_logits(x, x, x)
633
+
634
+ def reset_parameters(self):
635
+ self.layer_norm_1.reset_parameters()
636
+ self.layer_norm_2.reset_parameters()
637
+ self.layer_norm_3.reset_parameters()
638
+
639
+ self.attn.reset_parameters()
640
+ self.ff.reset_parameters()
641
+
388
642
 
389
643
  class TransformerEncoder(nn.Module):
390
644
  """
391
645
  This assumes we already get a sequence of embeddings (e.g. word or image
392
- patch embeddings). It uses learned positional embeddings.
646
+ patch embeddings).
393
647
  """
394
648
 
395
649
  def __init__(
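The new TransformerBlock.forward supports three normalisation layouts: pre-norm (optionally with an extra post-norm applied after both residual branches), post-norm only, and no normalisation at all. A condensed sketch of that control flow, with `attn_branch` and `ff_branch` as stand-ins for the DropPath/LayerScale-wrapped attention and feedforward branches:

```python
import torch
import torch.nn as nn

d_model = 32
norm1, norm2, norm3 = nn.LayerNorm(d_model), nn.LayerNorm(d_model), nn.LayerNorm(d_model)
attn_branch = nn.Linear(d_model, d_model)   # stand-in for drop_path(layerscale1(attn(x, x, x)))
ff_branch = nn.Linear(d_model, d_model)     # stand-in for drop_path(layerscale2(ff(x)))

def block(x, pre_norm=True, post_norm=False):
    if pre_norm:
        x = norm1(x)
        x = x + attn_branch(x)
        x = norm2(x)
        x = x + ff_branch(x)
        if post_norm:                        # pre- and post-norm together
            x = norm3(x)
    elif post_norm:                          # post-norm only
        x = x + attn_branch(x)
        x = norm1(x)
        x = x + ff_branch(x)
        x = norm2(x)
    else:                                    # no normalisation at all
        x = x + attn_branch(x)
        x = x + ff_branch(x)
    return x

print(block(torch.randn(2, 7, d_model)).shape)  # torch.Size([2, 7, 32])
```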
@@ -398,53 +652,93 @@ class TransformerEncoder(nn.Module):
398
652
  d_model,
399
653
  n_layers,
400
654
  n_heads,
401
- position_embedding_type="absolute", # absolute or relative
655
+ absolute_position_embedding=True,
656
+ relative_position_embedding=False,
402
657
  source_size=None,
403
658
  mlp_ratio=4,
404
659
  activation: nn.Module = nn.ReLU,
405
660
  activation_kwargs: Optional[dict] = None,
406
- mlp_dropout=0.0,
661
+ ff_linear_module_up=None,
662
+ ff_linear_module_down=None,
663
+ ff_dropout=0.0,
664
+ ff_inner_dropout=0.0,
665
+ ff_outer_dropout=0.0,
407
666
  msa_dropout=0.0,
408
667
  stochastic_depth=0.0,
409
668
  causal=False,
410
669
  linear_module=nn.Linear,
411
- bos_tokens=0,
412
- return_bos_tokens=False,
670
+ utility_tokens=0,
671
+ talking_heads=False,
672
+ return_utility_tokens=False,
413
673
  pre_norm=True,
674
+ post_norm=False,
414
675
  normformer=False,
676
+ msa_scaling="d",
677
+ checkpoint_ff=True,
678
+ layerscale=True,
415
679
  ):
416
- if position_embedding_type == "relative":
417
- assert source_size is not None # TODO: make this a proper exception
680
+ """
681
+ Args:
682
+ msa_scaling: how should the attention logits be scaled? Can be "sqrtd"
683
+ to mimic the original "Attention Is All You Need" approach of
684
+ dividing by the square root of the head dimension, or "d" per
685
+ "Tensor Programs V...". Default: "d".
686
+ """
687
+
688
+ if relative_position_embedding and (source_size is None):
689
+ raise ValueError(
690
+ "`source_size` for TransformerEncoder cannot be None if"
691
+ " `relative_position_embedding` is True"
692
+ )
693
+
694
+ if absolute_position_embedding and (seq_len is None):
695
+ raise ValueError(
696
+ "`seq_len` for TransformerEncoder cannot be None if"
697
+ " `absolute_position_embedding` is True"
698
+ )
418
699
 
419
700
  super().__init__()
701
+
702
+ if FLASH_ATTN and talking_heads:
703
+ warnings.warn(
704
+ "Using talking heads currently prevents using flash attention.",
705
+ stacklevel=2,
706
+ )
707
+
420
708
  self.seq_len = seq_len
421
709
  self.n_heads = n_heads
422
- self._bos_tokens = bos_tokens
423
- self.return_bos_tokens = return_bos_tokens
424
-
425
- # Initialise BOS tokens with normal init, like usual Pytorch embeddings
426
- if self._bos_tokens:
427
- self._bos_embedding = nn.Parameter(torch.empty(self._bos_tokens, d_model))
428
- nn.init.normal_(self._bos_embedding, mean=0.0, std=1.0)
429
- self.full_sequence_length = self.seq_len + self._bos_tokens
710
+ self._utility_tokens = utility_tokens
711
+ self.return_utility_tokens = return_utility_tokens
712
+
713
+ # Initialise utility tokens with normal init, like usual Pytorch embeddings
714
+ if self._utility_tokens:
715
+ self._utility_token_embedding = nn.Parameter(
716
+ torch.empty(self._utility_tokens, d_model)
717
+ )
718
+ nn.init.normal_(self._utility_token_embedding, mean=0.0, std=1.0)
719
+ else:
720
+ self._utility_token_embedding = None
721
+
722
+ if self._utility_tokens and (self.seq_len is not None):
723
+ self.full_sequence_length = self.seq_len + self._utility_tokens
430
724
  else:
431
- self._bos_embedding = None
432
725
  self.full_sequence_length = self.seq_len
433
726
 
434
727
  self.d_model = d_model
435
728
 
436
- self.position_embedding_type = position_embedding_type
437
-
438
- if self.position_embedding_type == "absolute":
729
+ if absolute_position_embedding:
439
730
  self.absolute_position_embedding = nn.Embedding(
440
731
  self.full_sequence_length, d_model
441
732
  )
733
+ else:
734
+ self.absolute_position_embedding = None
442
735
 
443
- self.mlp_dropout = mlp_dropout
736
+ self.mlp_dropout = ff_dropout
444
737
  self.msa_dropout = msa_dropout
445
738
  self.stochastic_depth = stochastic_depth
446
739
 
447
- assert isinstance(n_layers, int) # XXX: make this a proper Exception
740
+ assert isinstance(n_layers, int)
741
+
448
742
  if n_layers == 1:
449
743
  self.stochastic_depth_probabilities = [0.0]
450
744
  else:
@@ -459,35 +753,48 @@ class TransformerEncoder(nn.Module):
459
753
  self.full_sequence_length,
460
754
  d_model,
461
755
  n_heads,
462
- position_embedding_type=position_embedding_type,
756
+ relative_position_embedding=relative_position_embedding,
463
757
  source_size=source_size,
464
- bos_tokens=bos_tokens,
758
+ utility_tokens=utility_tokens,
759
+ talking_heads=talking_heads,
465
760
  mlp_ratio=mlp_ratio,
466
761
  activation=activation,
467
762
  activation_kwargs=activation_kwargs,
468
- mlp_dropout=mlp_dropout,
763
+ ff_linear_module_up=ff_linear_module_up,
764
+ ff_linear_module_down=ff_linear_module_down,
765
+ msa_scaling=msa_scaling,
766
+ ff_dropout=ff_dropout,
767
+ ff_inner_dropout=ff_inner_dropout,
768
+ ff_outer_dropout=ff_outer_dropout,
469
769
  msa_dropout=msa_dropout,
470
770
  identity_probability=self.stochastic_depth_probabilities[i],
471
771
  causal=causal,
472
772
  linear_module=linear_module,
473
773
  pre_norm=pre_norm,
774
+ post_norm=post_norm,
474
775
  normformer=normformer,
776
+ checkpoint_ff=checkpoint_ff,
777
+ layerscale=layerscale,
475
778
  )
476
779
  for i in range(n_layers)
477
780
  ]
478
781
  )
479
782
 
783
+ self.reset_parameters()
784
+
480
785
  @property
481
786
  def _kv_distances(self) -> float:
482
787
  return ",".join([str(block._kv_distance) for block in self.blocks])
483
788
 
484
- def forward(self, x):
485
- if self._bos_tokens:
486
- x = torch.cat([self._bos_embedding.expand(x.size(0), -1, -1), x], dim=1)
789
+ def preprocess(self, x):
790
+ if self._utility_tokens:
791
+ x = torch.cat(
792
+ [self._utility_token_embedding.expand(x.size(0), -1, -1), x], dim=1
793
+ )
487
794
  else:
488
795
  x = x
489
796
 
490
- if self.position_embedding_type == "absolute":
797
+ if self.absolute_position_embedding is not None:
491
798
  x = x + self.absolute_position_embedding(
492
799
  torch.arange(
493
800
  0, self.full_sequence_length, dtype=torch.long, device=x.device
@@ -496,10 +803,40 @@ class TransformerEncoder(nn.Module):
496
803
  ) # to shape (1, seq_len) to broadcast over batch
497
804
  )
498
805
 
806
+ return x
807
+
808
+ def forward(self, x):
809
+
810
+ x = self.preprocess(x)
811
+
499
812
  for block in self.blocks:
500
813
  x = block(x)
501
814
 
502
- if self._bos_tokens and not self.return_bos_tokens:
503
- return x[:, self._bos_tokens :, :]
815
+ if self._utility_tokens and not self.return_utility_tokens:
816
+ return x[:, self._utility_tokens :, :]
504
817
  else:
505
818
  return x
819
+
820
+ def attention_logits(self, x):
821
+
822
+ x = self.preprocess(x)
823
+
824
+ layer_scores = []
825
+
826
+ for block in self.blocks:
827
+ # Get attention scores with shape (batch, 1, head, seq_len, seq_len)
828
+ layer_attention_logits = block.attention_logits(x).unsqueeze(1)
829
+ layer_scores.append(layer_attention_logits)
830
+ x = block(x)
831
+
832
+ return torch.cat(layer_scores, dim=1) # (batch, layer, head, seq_len, seq_len)
833
+
834
+ def reset_parameters(self):
835
+ if self._utility_token_embedding is not None:
836
+ nn.init.normal_(self._utility_token_embedding, mean=0.0, std=1.0)
837
+
838
+ if self.absolute_position_embedding is not None:
839
+ self.absolute_position_embedding.reset_parameters()
840
+
841
+ for block in self.blocks:
842
+ block.reset_parameters()
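Putting the pieces together, a hedged usage sketch of the new encoder API, assuming the constructor parameters shown in this diff, made-up hyperparameter values, and the pure-PyTorch attention path (i.e. flash-attn not installed): learned utility tokens are prepended internally and stripped from the output unless `return_utility_tokens=True`, and `attention_logits` stacks per-layer, per-head pre-softmax scores for inspection.

```python
import torch
from broccoli.transformer import TransformerEncoder

# Assumed hyperparameters; argument names follow the signature shown in this diff.
encoder = TransformerEncoder(
    seq_len=16,
    d_model=64,
    n_layers=2,
    n_heads=4,
    utility_tokens=2,
    absolute_position_embedding=True,
)

x = torch.randn(8, 16, 64)            # (batch, tokens, d_model)
out = encoder(x)                      # utility tokens stripped by default
print(out.shape)                      # expected: torch.Size([8, 16, 64])

logits = encoder.attention_logits(x)  # (batch, layer, head, seq, seq), seq = 16 + 2
print(logits.shape)                   # expected: torch.Size([8, 2, 4, 18, 18])
```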