broccoli-ml 0.24.3__tar.gz → 0.26.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: broccoli-ml
-Version: 0.24.3
+Version: 0.26.0
 Summary: Some useful Pytorch models, circa 2025
 License: MIT
 Author: Nicholas Bailey
@@ -5,7 +5,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F
 
-from .tensor import SigmaReparamTensor, AnchoredReparamTensor
+from .tensor import SigmaReparamTensor, AnchoredReparamTensor, NormReparamTensor
 
 
 class SpectralNormLinear(nn.Module):
@@ -93,3 +93,46 @@ class AnchoredLinear(nn.Module):
             f"AnchoredLinear(in_features={self.in_features},"
             f"out_features={self.out_features}, bias={self.use_bias})"
         )
+
+
+class ReparamLinear(nn.Module):
+    """
+    ...
+    """
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.use_bias = bias
+
+        self.weights = None
+
+        # Define the bias vector as a learnable parameter if required.
+        if self.use_bias:
+            self.bias = nn.Parameter(torch.empty(out_features))
+        else:
+            # If no bias, register it as None.
+            # This is important so that PyTorch doesn't complain when saving/loading the model.
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        weights = torch.empty(self.out_features, self.in_features)
+        nn.init.kaiming_uniform_(weights, a=math.sqrt(5))
+        if self.use_bias:
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weights)
+            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+            nn.init.uniform_(self.bias, -bound, bound)
+        self.weights = NormReparamTensor(weights)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weights(), self.bias)
+
+    def __repr__(self) -> str:
+        # Optional: A nice representation for printing the module.
+        return (
+            f"AnchoredLinear(in_features={self.in_features},"
+            f"out_features={self.out_features}, bias={self.use_bias})"
+        )
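A quick usage sketch (not from the package itself) may help here: ReparamLinear is intended as a drop-in replacement for nn.Linear whose weight matrix is rebuilt by NormReparamTensor on every forward call. The import path below is an assumption; adjust it to wherever the class is actually exported.

    import torch

    # Hypothetical import path; the class is added alongside AnchoredLinear above.
    from broccoli_ml.linear import ReparamLinear

    layer = ReparamLinear(in_features=32, out_features=64, bias=True)
    x = torch.randn(8, 32)
    y = layer(x)    # weight comes from NormReparamTensor, bias is a plain Parameter
    print(y.shape)  # torch.Size([8, 64])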
@@ -100,3 +100,24 @@ class AnchoredReparamTensor(nn.Module):
 
         # Return the reparameterized tensor.
         return self.scale * (self.nondecay_weight / (norm + 1e-6))
+
+
+class NormReparamTensor(nn.Module):
+    """
+    Reparameterise a tensor as a normalised tensor of weights multiplied by a
+    learnable scaling factor.
+    """
+
+    def __init__(self, init_tensor: torch.Tensor):
+        assert init_tensor.ndim == 2, "Input tensor must be a 2D matrix."
+        super().__init__()
+
+        # Use the gradboard convention of calling something nondecay_* if we should
+        # exclude it from weight decay
+        self.nondecay_weight = nn.Parameter(init_tensor.clone(), requires_grad=True)
+        self.scale = nn.Parameter(
+            torch.linalg.norm(self.nondecay_weight).clone().detach(), requires_grad=True
+        )
+
+    def forward(self) -> torch.Tensor:
+        return self.scale * F.normalize(self.nondecay_weight)
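For orientation, the reparameterisation can be written out in plain PyTorch (the snippet below is an illustration, not the package's own API). The effective tensor is the learnable scalar `scale` times `F.normalize` of the stored weight; since `F.normalize` defaults to unit L2 norm along dim=1, every row of the effective weight ends up with the same norm, equal to `scale`, which is initialised to the Frobenius norm of the initial tensor.

    import torch
    from torch.nn import functional as F

    W = torch.randn(64, 32)
    scale = torch.linalg.norm(W)    # Frobenius norm; what `scale` is initialised to
    W_eff = scale * F.normalize(W)  # F.normalize defaults to p=2, dim=1 (unit-norm rows)
    print(W_eff.norm(dim=1))        # every row of the effective weight has norm == scale

Because only `scale` carries the overall magnitude, filtering parameters on the `nondecay_` prefix (the convention named in the comment) keeps weight decay from acting on the direction tensor.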
@@ -236,7 +236,8 @@ class FeedforwardBlock(nn.Module):
         activation_kwargs=None,
         dropout=0.0,
         linear_module=nn.Linear,
-        raw_input=False,
+        pre_norm=True,
+        normformer=False,
     ):
         super().__init__()
 
@@ -253,19 +254,13 @@ class FeedforwardBlock(nn.Module):
             else ratio * output_features
         )
 
-        if raw_input:
-            self.memory_type = AnchoredLinear
-
-        else:
-            self.memory_type = linear_module
-
         self.process = nn.Sequential(
             *[
-                nn.LayerNorm(input_features),
+                nn.LayerNorm(input_features) if pre_norm else nn.Identity(),
                 linear_module(input_features, self.max_features),
                 self.activation,
-                # nn.LayerNorm(ratio * output_features) if raw_input else nn.Identity(),
-                self.memory_type(ratio * output_features, output_features),
+                nn.LayerNorm(ratio * output_features) if normformer else nn.Identity(),
+                linear_module(ratio * output_features, output_features),
                 self.dropout,
             ]
        )
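The resulting block layout is easier to see spelled out with plain nn modules. The sketch below uses made-up sizes and a GELU stand-in for `self.activation`, and mirrors only the ordering the two new flags control: `pre_norm` gates the LayerNorm on the block input, `normformer` gates an extra LayerNorm after the activation, in the spirit of NormFormer.

    import torch
    from torch import nn

    d_in, ratio, d_out = 256, 4, 256
    pre_norm, normformer = True, True  # the FeedforwardBlock defaults are True / False

    ffn = nn.Sequential(
        nn.LayerNorm(d_in) if pre_norm else nn.Identity(),             # pre-norm on the input
        nn.Linear(d_in, ratio * d_out),                                 # expansion (self.max_features in the package)
        nn.GELU(),                                                      # stand-in for self.activation
        nn.LayerNorm(ratio * d_out) if normformer else nn.Identity(),   # NormFormer-style post-activation norm
        nn.Linear(ratio * d_out, d_out),                                # projection back down
        nn.Dropout(0.0),
    )
    print(ffn(torch.randn(8, d_in)).shape)  # torch.Size([8, 256])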
@@ -299,12 +294,17 @@ class TransformerBlock(nn.Module):
         identity_probability=0.0,
         causal=False,
         linear_module=nn.Linear,
+        pre_norm=True,
+        normformer=False,
     ):
         super().__init__()
 
+        self.pre_norm = pre_norm
+
         self.identity_probability = identity_probability
 
-        self.layer_norm = nn.LayerNorm(d_model)
+        self.layer_norm_1 = nn.LayerNorm(d_model)
+        self.layer_norm_2 = nn.LayerNorm(d_model)
 
         if position_embedding_type == "relative":
             max_freq = int(max(source_size) / 2)  # Suggested by Gemini!
@@ -339,6 +339,8 @@ class TransformerBlock(nn.Module):
             activation_kwargs=activation_kwargs,
             dropout=mlp_dropout,
             linear_module=linear_module,
+            pre_norm=pre_norm,
+            normformer=normformer,
         )
 
     @property
@@ -359,12 +361,19 @@ class TransformerBlock(nn.Module):
         identity_x = shuffled[:identity_count, :, :]
         process_x = shuffled[identity_count:, :, :]
 
-        norm_process_x = self.layer_norm(process_x)
-        process_x = process_x + self.attn(
-            norm_process_x, norm_process_x, norm_process_x
-        )
-        process_x = process_x + self.ff(process_x)
-        x = torch.cat([identity_x, process_x])[unshuffle_indices, :, :].contiguous()
+        if self.pre_norm:
+            norm_process_x = self.layer_norm_1(process_x)
+            process_x = process_x + self.attn(
+                norm_process_x, norm_process_x, norm_process_x
+            )
+            process_x = process_x + self.ff(process_x)
+        else:  # post-norm
+            process_x = process_x + self.attn(process_x, process_x, process_x)
+            norm_process_x = self.layer_norm_1(process_x)
+            process_x = process_x + self.ff(process_x)
+        x = self.layer_norm_2(
+            torch.cat([identity_x, process_x])[unshuffle_indices, :, :].contiguous()
+        )
 
         return x
 
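The branch above follows the usual pre-norm / post-norm split. A conventional minimal version of the two arrangements, with made-up module names and without the stochastic-depth handling of the forward pass above, looks like this:

    import torch
    from torch import nn

    d_model, pre_norm = 64, True
    attn = nn.MultiheadAttention(d_model, num_heads=4)
    ff = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(), nn.Linear(4 * d_model, d_model))
    norm_1, norm_2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)

    x = torch.randn(10, 2, d_model)  # (sequence, batch, features)
    if pre_norm:
        h = norm_1(x)
        x = x + attn(h, h, h)[0]     # normalise before the attention residual
        x = x + ff(x)
    else:                            # post-norm
        x = x + attn(x, x, x)[0]
        x = norm_1(x)                # normalise after the attention residual
        x = x + ff(x)
    x = norm_2(x)                    # final LayerNorm, applied in both cases here
    print(x.shape)                   # torch.Size([10, 2, 64])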
@@ -393,6 +402,8 @@ class TransformerEncoder(nn.Module):
         linear_module=nn.Linear,
         bos_tokens=0,
         return_bos_tokens=False,
+        pre_norm=True,
+        normformer=False,
     ):
         if position_embedding_type == "relative":
             assert source_size is not None  # TODO: make this a proper exception
@@ -451,6 +462,8 @@ class TransformerEncoder(nn.Module):
                     identity_probability=self.stochastic_depth_probabilities[i],
                     causal=causal,
                     linear_module=linear_module,
+                    pre_norm=pre_norm,
+                    normformer=normformer,
                 )
                 for i in range(n_layers)
             ]
@@ -117,6 +117,8 @@ class ViTEncoder(nn.Module):
         pooling_kernel_stride=2,
         pooling_padding=1,
         transformer_feedforward_first=True,
+        transformer_pre_norm=True,
+        transformer_normformer=False,
         transformer_position_embedding="relative",  # absolute or relative
         transformer_embedding_size=256,
         transformer_layers=7,
@@ -289,6 +291,8 @@ class ViTEncoder(nn.Module):
                 linear_module=linear_module,
                 bos_tokens=transformer_bos_tokens,
                 return_bos_tokens=transformer_return_bos_tokens,
+                pre_norm=transformer_pre_norm,
+                normformer=transformer_normformer,
             )
         else:
             self.transformer = nn.Identity()
@@ -302,7 +306,8 @@ class ViTEncoder(nn.Module):
                 activation_kwargs=transformer_activation_kwargs,
                 dropout=transformer_mlp_dropout,
                 linear_module=linear_module,
-                raw_input=not cnn,
+                pre_norm=transformer_pre_norm,
+                normformer=transformer_normformer,
             )
         else:
             self.initial_ff = nn.Identity()
@@ -356,6 +361,8 @@ class ViT(nn.Module):
         pooling_kernel_stride=2,
         pooling_padding=1,
         transformer_feedforward_first=True,
+        transformer_pre_norm=True,
+        transformer_normformer=False,
         transformer_position_embedding="relative",  # absolute or relative
         transformer_embedding_size=256,
         transformer_layers=7,
@@ -410,6 +417,8 @@ class ViT(nn.Module):
             pooling_kernel_stride=pooling_kernel_stride,
             pooling_padding=pooling_padding,
             transformer_feedforward_first=transformer_feedforward_first,
+            transformer_pre_norm=transformer_pre_norm,
+            transformer_normformer=transformer_normformer,
             transformer_position_embedding=transformer_position_embedding,
             transformer_embedding_size=transformer_embedding_size,
             transformer_layers=transformer_layers,
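Downstream, the two flags are simply threaded from ViT through ViTEncoder and TransformerEncoder into every TransformerBlock and FeedforwardBlock. A hedged usage sketch follows; the import path is a guess, and every other constructor argument is assumed to keep the defaults shown in the diff.

    from broccoli_ml.vit import ViT  # hypothetical import path

    model = ViT(
        transformer_pre_norm=True,    # pre-norm residual blocks (the default)
        transformer_normformer=True,  # extra LayerNorm after the feedforward activation
    )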
@@ -1,6 +1,6 @@
 [project]
 name = "broccoli-ml"
-version = "0.24.3"
+version = "0.26.0"
 description = "Some useful Pytorch models, circa 2025"
 authors = [
     {name = "Nicholas Bailey"}