broccoli-ml 0.1.41__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
broccoli/transformer.py CHANGED
@@ -68,12 +68,7 @@ class MHAttention(nn.Module):
         dropout=0.0,
         causal=False,
         seq_len=None,
-        share_kv=False,
         linear_module: nn.Module = nn.Linear,
-        max_subtract=False,
-        d_model_scale=True,
-        log_length_scale=False,
-        quiet=False,
         bos_tokens=0,
         rotary_embedding=None,
         source_size=None,
@@ -88,15 +83,15 @@ class MHAttention(nn.Module):
         self.embed_dim = embed_dim
         self.n_heads = n_heads
         assert embed_dim % n_heads == 0
+
         self.head_dim = self.embed_dim // self.n_heads
-        self.share_kv = share_kv
+
         self.q_proj = linear_module(self.embed_dim, self.embed_dim, bias=False)
         self.k_proj = linear_module(self.embed_dim, self.embed_dim, bias=False)
-        if self.share_kv:
-            self.v_proj = self.k_proj
-        else:
-            self.v_proj = linear_module(self.embed_dim, self.embed_dim, bias=False)
+        self.v_proj = linear_module(self.embed_dim, self.embed_dim, bias=False)
+
         self.out_proj = linear_module(self.embed_dim, self.embed_dim, bias=False)
+
         self.causal = causal
         self.seq_len = seq_len
         self.dropout = nn.Dropout(dropout)
@@ -107,10 +102,6 @@ class MHAttention(nn.Module):
             .unsqueeze(0)
             .unsqueeze(0),
         )
-        self.max_subtract = max_subtract
-        self.d_model_scale = d_model_scale
-        self.log_length_scale = log_length_scale
-        self.quiet = quiet
         self.rotary_embedding = rotary_embedding
         self.source_size = source_size
         self.bos_tokens = bos_tokens
@@ -152,37 +143,57 @@ class MHAttention(nn.Module):
         # Project q, k and v
         q = self.q_proj(q)
         k = self.k_proj(k)
-        if self.share_kv:
-            v = self.k_proj(v)
-        else:
-            v = self.v_proj(v)
+        v = self.v_proj(v)

         # Rearrange dimensions and add RoPE if needed
         if self.rotary_embedding is not None:

+            if len(self.source_size) == 1:
+                spatial_dimension_names = "D1"
+                spatial_dimension_values = {"D1": self.source_size[0]}
+            elif len(self.source_size) == 2:
+                spatial_dimension_names = "D1 D2"
+                spatial_dimension_values = {
+                    "D1": self.source_size[0],
+                    "D2": self.source_size[1],
+                }
+            elif len(self.source_size) == 3:
+                spatial_dimension_names = "D1 D2 D3"
+                spatial_dimension_values = {
+                    "D1": self.source_size[0],
+                    "D2": self.source_size[1],
+                    "D3": self.source_size[2],
+                }
+            else:
+                raise NotImplementedError(
+                    "`source_size` must be a tuple of 1, 2 or 3 integers"
+                )
+
             q_bos, q_img = q[:, : self.bos_tokens, :], q[:, self.bos_tokens :, :]
             k_bos, k_img = k[:, : self.bos_tokens, :], k[:, self.bos_tokens :, :]

             q_img = rearrange(
                 q_img,
-                "b (height width) d -> b height width d",
-                height=self.source_size[0],
-                width=self.source_size[1],
+                f"b ({spatial_dimension_names}) d -> b {spatial_dimension_names} d",
+                **spatial_dimension_values,
             )
             k_img = rearrange(
                 k_img,
-                "b (height width) d -> b height width d",
-                height=self.source_size[0],
-                width=self.source_size[1],
-            )
-            freqs = self.rotary_embedding.get_axial_freqs(
-                self.source_size[0], self.source_size[1]
+                f"b ({spatial_dimension_names}) d -> b {spatial_dimension_names} d",
+                **spatial_dimension_values,
             )
+            freqs = self.rotary_embedding.get_axial_freqs(*self.source_size)
             q_img = apply_rotary_emb(freqs, q_img)
             k_img = apply_rotary_emb(freqs, k_img)

-            q_img = rearrange(q_img, "b height width d -> b (height width) d")
-            k_img = rearrange(k_img, "b height width d -> b (height width) d")
+            q_img = rearrange(
+                q_img,
+                f"b {spatial_dimension_names} d -> b ({spatial_dimension_names}) d",
+            )
+            k_img = rearrange(
+                k_img,
+                f"b {spatial_dimension_names} d -> b ({spatial_dimension_names}) d",
+            )

             # Re-combine the BOS tokens and the RoPE-enhanced image tokens
             q = torch.cat([q_bos, q_img], dim=1)
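
The hunk above generalises the RoPE reshaping from a hard-coded 2-D (height, width) grid to 1-, 2- or 3-D source_size tuples by building the einops pattern at run time. A minimal sketch (not part of the package; shapes are illustrative) of what the dynamically built pattern does for a 2-D grid:

import torch
from einops import rearrange

source_size = (8, 8)                    # hypothetical 8x8 grid of image tokens
x = torch.randn(2, 8 * 8, 64)           # (batch, flattened tokens, embed dim)

spatial_dimension_names = "D1 D2"
spatial_dimension_values = {"D1": source_size[0], "D2": source_size[1]}

# Fold the flat token axis back into its spatial axes so axial RoPE can be applied
grid = rearrange(
    x,
    f"b ({spatial_dimension_names}) d -> b {spatial_dimension_names} d",
    **spatial_dimension_values,
)
assert grid.shape == (2, 8, 8, 64)

# Flatten again afterwards; the round trip preserves token order exactly
flat = rearrange(grid, f"b {spatial_dimension_names} d -> b ({spatial_dimension_names}) d")
assert torch.equal(flat, x)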
@@ -195,26 +206,13 @@ class MHAttention(nn.Module):

         qk_scores = q @ k.transpose(-1, -2)

-        if self.d_model_scale:
-            qk_scores /= math.sqrt(self.head_dim)  # scaling
-
-        if self.log_length_scale:
-            qk_scores *= math.log(qk_scores.size(0))
-
-        if self.max_subtract:
-            max_scores, _ = torch.max(qk_scores, dim=-1, keepdim=True)
-            qk_scores -= max_scores
+        qk_scores /= math.sqrt(self.head_dim)

         # Apply mask if causal (must come before softmax)
         if self.causal:
             qk_scores.masked_fill_(self.mask, float("-inf"))

-        # Apply softmax and dropout
-        denominator = torch.sum(torch.exp(qk_scores), dim=-1, keepdim=True)
-        if self.quiet:
-            denominator += 1
-        numerator = torch.exp(qk_scores)
-        qk_scores = self.dropout(numerator / denominator)
+        qk_scores = F.softmax(qk_scores, dim=-1)

         output_with_heads = qk_scores @ v

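The change above collapses the optional max_subtract / log_length_scale / quiet paths into the standard scaled dot-product softmax. Subtracting the per-row maximum never changes the softmax output, so dropping that branch loses nothing; a quick check (not part of the package) of the equivalence against F.softmax:

import torch
import torch.nn.functional as F

scores = torch.randn(2, 4, 10, 10)      # (batch, heads, queries, keys), illustrative

# Old-style manual softmax with max subtraction for numerical stability
max_scores, _ = torch.max(scores, dim=-1, keepdim=True)
shifted = torch.exp(scores - max_scores)
manual = shifted / torch.sum(shifted, dim=-1, keepdim=True)

# F.softmax applies the same stabilisation internally, so the results match
assert torch.allclose(manual, F.softmax(scores, dim=-1), atol=1e-6)
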
@@ -223,6 +221,50 @@ class MHAttention(nn.Module):
         return self.out_proj(output_without_heads)


+class DenoisingAutoEncoder(nn.Module):
+    """
+    A denoising autoencoder, of the type used in transformer blocks.
+    """
+
+    def __init__(
+        self,
+        input_features,
+        ratio,
+        output_features,
+        activation=nn.ReLU,
+        activation_kwargs=None,
+        dropout=0.0,
+        linear_module=nn.Linear,
+    ):
+        super().__init__()
+
+        if activation_kwargs is not None:
+            self.activation = activation(**activation_kwargs)
+        else:
+            self.activation = activation()
+
+        self.dropout = nn.Dropout(dropout)
+
+        self.process = nn.Sequential(
+            *[
+                linear_module(
+                    input_features,
+                    (
+                        2 * ratio * input_features
+                        if activation.__name__.endswith("GLU")
+                        else ratio * input_features
+                    ),
+                ),
+                self.activation,
+                self.dropout,
+                linear_module(ratio * input_features, output_features),
+            ]
+        )
+
+    def forward(self, x):
+        return self.process(x)
+
+
 class TransformerBlock(nn.Module):
     """
     Performs LayerNorms first (as in PyTorch Transformers when norm_first=True),
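
The new DenoisingAutoEncoder bundles the MLP's up-projection, activation and down-projection, widening the first linear layer by 2x for *GLU-style activations (which halve their input). A usage sketch, assuming the class is importable from broccoli.transformer and using nn.GLU purely as an example of a GLU-style activation:

import torch
import torch.nn as nn
from broccoli.transformer import DenoisingAutoEncoder  # import path assumed from this diff

mlp = DenoisingAutoEncoder(
    input_features=64,
    ratio=4,
    output_features=64,
    activation=nn.GLU,              # name ends in "GLU", so the first layer is 2 * 4 * 64 wide
    activation_kwargs={"dim": -1},  # GLU halves the last dimension back to 4 * 64
    dropout=0.1,
)

x = torch.randn(2, 16, 64)          # (batch, tokens, features)
assert mlp(x).shape == (2, 16, 64)
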
@@ -247,28 +289,18 @@ class TransformerBlock(nn.Module):
         msa_dropout=0.0,
         identity_probability=0.0,
         causal=False,
-        share_kv=False,
-        max_subtract=False,
-        d_model_scale=True,
-        log_length_scale=False,
-        quiet_attention=False,
         linear_module=nn.Linear,
     ):
         super().__init__()

         self.identity_probability = identity_probability

-        if activation_kwargs is not None:
-            self.activation = activation(**activation_kwargs)
-        else:
-            self.activation = activation()
-
         # Submodules for applying attention
         self.layer_norm = nn.LayerNorm(d_model)

         if position_embedding_type == "relative":
             max_freq = int(max(source_size) / 2)  # Suggested by Gemini!
-            if d_model < 48:
+            if d_model < 16:
                 dim = d_model
             else:
                 dim = 16
@@ -284,11 +316,6 @@ class TransformerBlock(nn.Module):
             dropout=msa_dropout,
             causal=causal,
             seq_len=seq_len,
-            share_kv=share_kv,
-            max_subtract=max_subtract,
-            d_model_scale=d_model_scale,
-            log_length_scale=log_length_scale,
-            quiet=quiet_attention,
             linear_module=linear_module,
             rotary_embedding=self.rotary_embedding,
             source_size=source_size,
@@ -301,20 +328,17 @@ class TransformerBlock(nn.Module):
                [
                    ("layer_norm", nn.LayerNorm(d_model)),
                    (
-                        # up_projection is appropriate to activation
-                        "up_projection",
-                        linear_module(
+                        "denoising_autoencoder",
+                        DenoisingAutoEncoder(
+                            d_model,
+                            mlp_ratio,
                             d_model,
-                            (
-                                2 * mlp_ratio * d_model
-                                if activation.__name__.endswith("GLU")
-                                else mlp_ratio * d_model
-                            ),
+                            activation=activation,
+                            activation_kwargs=activation_kwargs,
+                            dropout=0.0,
+                            linear_module=linear_module,
                         ),
                    ),
-                    # xGLU activations will halve embedding size
-                    ("activation", self.activation),
-                    ("down_projection", linear_module(mlp_ratio * d_model, d_model)),
                    ("dropout", nn.Dropout(mlp_dropout)),
                ]
            )
@@ -369,11 +393,6 @@ class TransformerEncoder(nn.Module):
         msa_dropout=0.0,
         stochastic_depth=0.0,
         causal=False,
-        share_kv=False,
-        max_subtract=False,
-        d_model_scale=True,
-        log_length_scale=False,
-        quiet_attention=False,
         linear_module=nn.Linear,
         bos_tokens=0,
     ):
@@ -419,7 +438,7 @@ class TransformerEncoder(nn.Module):
        self.blocks = nn.ModuleList(
            [
                TransformerBlock(
-                    seq_len,
+                    self.full_sequence_length,
                    d_model,
                    n_heads,
                    position_embedding_type=position_embedding_type,
@@ -432,11 +451,6 @@ class TransformerEncoder(nn.Module):
                    msa_dropout=msa_dropout,
                    identity_probability=self.stochastic_depth_probabilities[i],
                    causal=causal,
-                    share_kv=share_kv,
-                    max_subtract=max_subtract,
-                    d_model_scale=d_model_scale,
-                    log_length_scale=log_length_scale,
-                    quiet_attention=quiet_attention,
                    linear_module=linear_module,
                )
                for i in range(n_layers)