broccoli-ml 0.23.1__py3-none-any.whl → 0.24.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- broccoli/activation.py +69 -33
- broccoli/linear.py +47 -41
- broccoli/tensor.py +40 -0
- broccoli/transformer.py +5 -5
- broccoli/vit.py +2 -2
- {broccoli_ml-0.23.1.dist-info → broccoli_ml-0.24.1.dist-info}/METADATA +1 -1
- {broccoli_ml-0.23.1.dist-info → broccoli_ml-0.24.1.dist-info}/RECORD +9 -9
- {broccoli_ml-0.23.1.dist-info → broccoli_ml-0.24.1.dist-info}/LICENSE +0 -0
- {broccoli_ml-0.23.1.dist-info → broccoli_ml-0.24.1.dist-info}/WHEEL +0 -0
broccoli/activation.py
CHANGED
@@ -1,10 +1,50 @@
 import torch
 from torch import nn
 from torch.nn import functional as F
-from einops import rearrange
 
 
-class
+class ReLU(nn.Module):
+    """
+    A ReLU activation function with optional clamp and leakiness.
+    """
+
+    def __init__(
+        self, clamp=True, leaky=True, negative_slope=0.01, clamp_max=6.0
+    ) -> None:
+        super().__init__()
+        self.clamp = clamp
+        self.leaky = leaky
+        self.negative_slope = negative_slope
+        self.clamp_max = clamp_max
+
+    def forward(self, x):
+        if self.leaky:
+            relu = F.leaky_relu(x, negative_slope=self.negative_slope)
+        else:
+            relu = F.relu(x)
+        if self.clamp:
+            relu = torch.clamp(relu, max=self.clamp_max)
+        return relu
+
+
+class GELU(nn.Module):
+    """
+    A GELU activation function with optional clamp.
+    """
+
+    def __init__(self, clamp=True) -> None:
+        super().__init__()
+        self.clamp = clamp
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        gelu = self.gelu(x)
+        if self.clamp:
+            gelu = torch.clamp(gelu, max=6)
+        return gelu
+
+
+class Swish(nn.Module):
     """
     Implementation of (beta) SwiGLU, as introduced in "GLU Variants Improve Transformer"
     (https://arxiv.org/abs/2002.05202v1) and used to great effect in LLaMa 2.0.
@@ -16,12 +56,10 @@ class SwiGLU(nn.Module):
         super().__init__()
         # Learnable parameter is called "swiglu beta" so that it is easy to find
         # and exclude from weight decay
-        self.
+        self.swish_beta = nn.Parameter(torch.tensor([1.0]))
 
     def forward(self, x):
-
-        beta_swish = gate * F.sigmoid(self.swiglu_beta * gate)
-        return beta_swish * value
+        return x * F.sigmoid(self.swish_beta * x)
 
 
 class SquaredReLU(nn.Module):
@@ -32,54 +70,52 @@ class SquaredReLU(nn.Module):
     https://azizbelaweid.substack.com/p/what-is-swiglu-how-to-implement-it
     """
 
-    def __init__(
+    def __init__(
+        self, clamp=True, leaky=True, negative_slope: float = 0.01, clamp_max=6
+    ) -> None:
         super().__init__()
         self.clamp = clamp
         self.leaky = leaky
+        self.negative_slope = negative_slope
+        self.clamp_max = clamp_max
 
     def forward(self, x):
         if self.leaky:
-            relu = F.leaky_relu(x)
+            relu = F.leaky_relu(x, negative_slope=self.negative_slope)
         else:
             relu = F.relu(x)
         relu_squared = relu**2
         if self.clamp:
-            relu_squared = torch.clamp(relu_squared, max=
+            relu_squared = torch.clamp(relu_squared, max=self.clamp_max)
         return relu_squared
 
 
-class
+class XGLU(nn.Module):
     """
-
+    Generic Gated Linear Unit
     """
 
-    def __init__(self,
+    def __init__(self, activation_module: nn.Module) -> None:
         super().__init__()
-        self.
-        self.leaky = leaky
+        self.activation = activation_module
 
-    def forward(self, x):
-
-
-        else:
-            relu = F.relu(x)
-        if self.clamp:
-            relu = torch.clamp(relu, max=6)
-        return relu
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate, value = x.chunk(2, dim=-1)
+        return self.activation(gate) * value
 
 
-
+def SquaredReGLU(clamp=True, leaky=True, negative_slope=0.01, clamp_max=6.0) -> XGLU:
     """
-
+    Factory function that creates a GLU with a SquaredReLU activation.
     """
+    activation_module = SquaredReLU(
+        clamp=clamp, leaky=leaky, negative_slope=negative_slope, clamp_max=clamp_max
+    )
+    return XGLU(activation_module)
 
-    def __init__(self, clamp=True) -> None:
-        super().__init__()
-        self.clamp = clamp
-        self.gelu = nn.GELU()
 
-
-
-
-
-
+def SwiGLU() -> XGLU:
+    """
+    Factory function that creates a GLU with a Swish activation.
+    """
+    return XGLU(Swish())
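
The net effect of this change is that SwiGLU is no longer a single nn.Module: gating now lives in the generic XGLU wrapper, and SwiGLU/SquaredReGLU become factories that wrap an elementwise activation. A minimal usage sketch, assuming the names are importable from broccoli.activation exactly as shown in the diff above (shapes are illustrative only):

```python
import torch
from broccoli.activation import XGLU, SquaredReGLU, SwiGLU

x = torch.randn(8, 128)       # XGLU splits the last dimension into gate/value halves

swiglu = SwiGLU()             # XGLU wrapping a Swish (learnable-beta sigmoid) gate
print(swiglu(x).shape)        # torch.Size([8, 64]) -> activation(gate) * value

reglu = SquaredReGLU(clamp=True, clamp_max=6.0)
print(reglu(x).shape)         # torch.Size([8, 64])

# Any elementwise activation module can serve as the gate nonlinearity.
gelu_glu = XGLU(torch.nn.GELU())
print(gelu_glu(x).shape)      # torch.Size([8, 64])
```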
broccoli/linear.py
CHANGED
@@ -5,7 +5,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F
 
-from .tensor import SigmaReparamTensor
+from .tensor import SigmaReparamTensor, AnchoredReparamTensor
 
 
 class SpectralNormLinear(nn.Module):
@@ -22,8 +22,6 @@ class SpectralNormLinear(nn.Module):
 
         self.weights = None
 
-        self.weight_init = nn.Parameter(torch.empty(out_features, in_features))
-
         # Define the bias vector as a learnable parameter if required.
         if self.use_bias:
             self.bias = nn.Parameter(torch.empty(out_features))
@@ -35,12 +33,13 @@ class SpectralNormLinear(nn.Module):
         self.reset_parameters()
 
     def reset_parameters(self) -> None:
-
+        weights = torch.empty(self.out_features, self.in_features)
+        nn.init.kaiming_uniform_(weights, a=math.sqrt(5))
         if self.use_bias:
-            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weights)
             bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
             nn.init.uniform_(self.bias, -bound, bound)
-        self.weights = SigmaReparamTensor(
+        self.weights = SigmaReparamTensor(weights)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return F.linear(x, self.weights(), self.bias)
@@ -48,42 +47,49 @@ class SpectralNormLinear(nn.Module):
     def __repr__(self) -> str:
         # Optional: A nice representation for printing the module.
         return (
-            f"SpectralNormFeedForward(in_features={self.in_features}"
-            f"out_features={self.out_features}, bias={self.use_bias})"
+            f"SpectralNormFeedForward(in_features={self.in_features},"
+            f"out_features={self.out_features}, bias={self.use_bias})"
         )
 
 
-class
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if not self.training:
-            return F.linear(inputs, self.weight)
+class AnchoredLinear(nn.Module):
+    """
+    ...
+    """
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.use_bias = bias
+
+        self.weights = None
+
+        # Define the bias vector as a learnable parameter if required.
+        if self.use_bias:
+            self.bias = nn.Parameter(torch.empty(out_features))
         else:
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # If no bias, register it as None.
+            # This is important so that PyTorch doesn't complain when saving/loading the model.
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        weights = torch.empty(self.out_features, self.in_features)
+        nn.init.kaiming_uniform_(weights, a=math.sqrt(5))
+        if self.use_bias:
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weights)
+            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+            nn.init.uniform_(self.bias, -bound, bound)
+        self.weights = AnchoredReparamTensor(weights)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weights(), self.bias)
+
+    def __repr__(self) -> str:
+        # Optional: A nice representation for printing the module.
+        return (
+            f"AnchoredLinear(in_features={self.in_features},"
+            f"out_features={self.out_features}, bias={self.use_bias})"
        )
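
The new AnchoredLinear keeps the constructor and call signature of nn.Linear while routing its weight through an AnchoredReparamTensor. A minimal sketch, assuming broccoli.linear exposes the class as in the diff above:

```python
import torch
from broccoli.linear import AnchoredLinear

layer = AnchoredLinear(in_features=16, out_features=32, bias=True)
x = torch.randn(4, 16)
y = layer(x)      # same calling convention as nn.Linear
print(y.shape)    # torch.Size([4, 32])

# The reparameterised weight lives in the `weights` submodule (an
# AnchoredReparamTensor), not in a plain `layer.weight` attribute.
print(layer)      # AnchoredLinear(in_features=16,out_features=32, bias=True)
```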
broccoli/tensor.py
CHANGED
@@ -54,3 +54,43 @@ class SigmaReparamTensor(nn.Module):
         return self.sigma_reparam_scale * (
             self.sigma_reparam_tensor / self.approx_spectral_norm
         )
+
+
+class AnchoredReparamTensor(nn.Module):
+    """
+    Reparameterise a tensor as a normalised tensor of weights multiplied by a
+    learnable scaling factor.
+
+    The tensor of weights is also reparameterised as the product of a learnable
+    weight tensor with the (fixed) dominant right-singular vector of the
+    weight tensor as it was initialised.
+
+    i.e this module represents a tensor reparameterised as:
+
+    W_reparam = scale * (W / ||W @ v_0||_2)
+
+    where v_0 is the dominant right-singular vector of the initial tensor W_init.
+    """
+
+    def __init__(self, init_tensor: torch.Tensor):
+        assert init_tensor.ndim == 2, "Input tensor must be a 2D matrix."
+        super().__init__()
+
+        self.weight = nn.Parameter(init_tensor.clone(), requires_grad=True)
+
+        # At initialization, compute the dominant right-singular vector (v_0)
+        # and store it in a non-trainable buffer.
+        with torch.no_grad():
+            _, _, v_transpose = torch.linalg.svd(self.weight, full_matrices=False)
+            # v_transpose[0] is the first row of V^T, which is the first right-singular vector.
+            self.register_buffer("anchor_vector", v_transpose[0])
+
+        initial_norm = torch.linalg.vector_norm(self.weight.mv(self.anchor_vector))
+        self.scale = nn.Parameter(initial_norm.clone().detach(), requires_grad=True)
+
+    def forward(self) -> torch.Tensor:
+        # Calculate the L2 norm of the matrix-vector product W @ v_0
+        norm = torch.linalg.vector_norm(self.weight.mv(self.anchor_vector))
+
+        # Return the reparameterized tensor.
+        return self.scale * (self.weight / (norm + 1e-6))
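
The docstring above defines the reparameterisation W_reparam = scale * (W / ||W @ v_0||_2), with scale initialised to ||W_init @ v_0||_2, so immediately after construction the module reproduces its initial tensor (up to the 1e-6 stabiliser). A small sketch of that property, assuming the class is importable as shown in the diff:

```python
import torch
from broccoli.tensor import AnchoredReparamTensor

w_init = torch.randn(32, 16)
reparam = AnchoredReparamTensor(w_init)

# At initialisation scale == ||W_init @ v_0||_2, so forward() returns ~W_init.
print(torch.allclose(reparam(), w_init, atol=1e-4))   # True

# weight and scale are trainable; the anchor vector v_0 is a fixed buffer.
print([n for n, _ in reparam.named_parameters()])      # ['weight', 'scale']
print([n for n, _ in reparam.named_buffers()])         # ['anchor_vector']
```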
broccoli/transformer.py
CHANGED
@@ -10,7 +10,7 @@ import torch.nn.functional as F
 from einops import rearrange
 
 from .rope import RotaryEmbedding, apply_rotary_emb
-from .linear import
+from .linear import AnchoredLinear
 
 
 class MHAttention(nn.Module):
@@ -236,7 +236,7 @@ class FeedforwardBlock(nn.Module):
         activation_kwargs=None,
         dropout=0.0,
         linear_module=nn.Linear,
-
+        reparam=False,
     ):
         super().__init__()
 
@@ -253,8 +253,8 @@ class FeedforwardBlock(nn.Module):
             else ratio * output_features
         )
 
-        if
-            self.memory_type =
+        if reparam:
+            self.memory_type = AnchoredLinear
         else:
             self.memory_type = linear_module
 
@@ -263,7 +263,7 @@ class FeedforwardBlock(nn.Module):
             nn.LayerNorm(input_features),
             linear_module(input_features, self.max_features),
             self.activation,
-            nn.LayerNorm(ratio * output_features),
+            # nn.LayerNorm(ratio * output_features),
             self.memory_type(ratio * output_features, output_features),
             self.dropout,
         ]
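
In FeedforwardBlock, the new reparam flag only changes which class builds the second ("memory") projection. A sketch that mirrors the selection logic in the diff above; the helper name select_memory_type is illustrative and not part of the package:

```python
from torch import nn
from broccoli.linear import AnchoredLinear

def select_memory_type(reparam: bool, linear_module=nn.Linear):
    # Mirrors the diff: AnchoredLinear when reparam=True, otherwise the
    # caller-supplied linear_module (nn.Linear by default).
    return AnchoredLinear if reparam else linear_module

print(select_memory_type(True) is AnchoredLinear)   # True
print(select_memory_type(False) is nn.Linear)       # True
```

The vit.py change in the next file passes reparam=not cnn to this block.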
broccoli/vit.py
CHANGED
@@ -295,14 +295,14 @@ class ViTEncoder(nn.Module):
 
         if transformer_feedforward_first:
             self.initial_ff = FeedforwardBlock(
-                transformer_embedding_size,
+                max(transformer_embedding_size, pooling_out_channels),
                 transformer_mlp_ratio,
                 transformer_embedding_size,
                 activation=transformer_activation,
                 activation_kwargs=transformer_activation_kwargs,
                 dropout=transformer_mlp_dropout,
                 linear_module=linear_module,
-
+                reparam=not cnn,
             )
         else:
             self.initial_ff = nn.Identity()
{broccoli_ml-0.23.1.dist-info → broccoli_ml-0.24.1.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
 broccoli/__init__.py,sha256=tmyspsVxqPZHRQCY_NRwpW4SMNBbtE8E_8z7l-SAzSo,127
-broccoli/activation.py,sha256
+broccoli/activation.py,sha256=-Jf30C6iGqWCorC9HEGn2oduWwjeaCAxGLUUYIy1zX8,3438
 broccoli/assets/2025_resnet_imagenet_1k_pretrained_state_dict.pkl,sha256=RZpPupWxFaVfgZrK-gBgfW1hj78oMEGhVWTbjRB3qMo,46835797
 broccoli/assets/cifar100_eigenvectors_size_2.pt,sha256=DjXDOXMeuMpIqNuGhX9z-OWYVqZwIMScSXZApRr9JjU,2501
 broccoli/assets/cifar100_eigenvectors_size_3.pt,sha256=gL6k0xtXYiYP6ZSvEiMBdJ7kIkT0AngTpDJHFQqwgxA,7173
 broccoli/cnn.py,sha256=jeRyKIAMWu1E3iyI14MGgSZuZivPMh12iqkqW9ilNjo,17785
 broccoli/eigenpatches.py,sha256=J6n2usN1oQuHEHYiBNyYpn_a9eQcHjOBiIlvSei520Y,2413
-broccoli/linear.py,sha256=
+broccoli/linear.py,sha256=4bxVDsO8E1d5-RZ23u160ZntazrT7Vt4AYTdAdCQU-w,3300
 broccoli/rope.py,sha256=hw7kBPNR9GQXj4GxyIAffsGKPfcTPOFh8Bc7oEHtaZY,12108
-broccoli/tensor.py,sha256=
-broccoli/transformer.py,sha256=
+broccoli/tensor.py,sha256=_YJP9tSFRkoKrR7cfnROSpWqfMyJLjgPmtFxEWRwgz8,3606
+broccoli/transformer.py,sha256=L1bVQZLUbtFtOy30yPVkjnqyELGhQoHJ_lFP_WPfYUA,16073
 broccoli/utils.py,sha256=htq_hOsdhUhL0nJi9WkKiEYOjEoWqFpK5X49PtgTf-0,299
-broccoli/vit.py,sha256=
-broccoli_ml-0.
-broccoli_ml-0.
-broccoli_ml-0.
-broccoli_ml-0.
+broccoli/vit.py,sha256=qGCx4cnpAkPpVHFrz6bFHdnPJXPaCxtTxKlI9YQJZWg,15649
+broccoli_ml-0.24.1.dist-info/LICENSE,sha256=0BAzJE5BqQ7Iixp_AFdB2W1uO-HCRX-Qfun8PHt6yVM,1073
+broccoli_ml-0.24.1.dist-info/METADATA,sha256=HOchT-ECPmQWjc0nQN7ohhOiKUbOqBVO_yKJLh_k9b8,1257
+broccoli_ml-0.24.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+broccoli_ml-0.24.1.dist-info/RECORD,,
{broccoli_ml-0.23.1.dist-info → broccoli_ml-0.24.1.dist-info}/LICENSE
File without changes
{broccoli_ml-0.23.1.dist-info → broccoli_ml-0.24.1.dist-info}/WHEEL
File without changes