broccoli-ml 0.23.1__tar.gz → 0.24.0__tar.gz

This diff shows the changes between two package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: broccoli-ml
-Version: 0.23.1
+Version: 0.24.0
 Summary: Some useful Pytorch models, circa 2025
 License: MIT
 Author: Nicholas Bailey
@@ -1,10 +1,48 @@
 import torch
 from torch import nn
 from torch.nn import functional as F
-from einops import rearrange
 
 
-class SwiGLU(nn.Module):
+class ReLU(nn.Module):
+    """
+    A ReLU activation function with optional clamp and leakiness.
+    """
+
+    def __init__(self, clamp=True, leaky=True, leaky_slope=0.01, clamp_max=6.0) -> None:
+        super().__init__()
+        self.clamp = clamp
+        self.leaky = leaky
+        self.leaky_slope = leaky_slope
+        self.clamp_max = clamp_max
+
+    def forward(self, x):
+        if self.leaky:
+            relu = F.leaky_relu(x, leaky_slope=self.leaky_slope)
+        else:
+            relu = F.relu(x)
+        if self.clamp:
+            relu = torch.clamp(relu, max=self.clamp_max)
+        return relu
+
+
+class GELU(nn.Module):
+    """
+    A GELU activation function with optional clamp.
+    """
+
+    def __init__(self, clamp=True) -> None:
+        super().__init__()
+        self.clamp = clamp
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        gelu = self.gelu(x)
+        if self.clamp:
+            gelu = torch.clamp(gelu, max=6)
+        return gelu
+
+
+class Swish(nn.Module):
     """
     Implementation of (beta) SwiGLU, as introduced in "GLU Variants Improve Transformer"
     (https://arxiv.org/abs/2002.05202v1) and used to great effect in LLaMa 2.0.
@@ -16,12 +54,10 @@ class SwiGLU(nn.Module):
         super().__init__()
         # Learnable parameter is called "swiglu beta" so that it is easy to find
         # and exclude from weight decay
-        self.swiglu_beta = nn.Parameter(torch.tensor([1.0]))
+        self.swish_beta = nn.Parameter(torch.tensor([1.0]))
 
     def forward(self, x):
-        gate, value = rearrange(x, "... (split c) -> split ... c", split=2)
-        beta_swish = gate * F.sigmoid(self.swiglu_beta * gate)
-        return beta_swish * value
+        return x * F.sigmoid(self.swish_beta * x)
 
 
 class SquaredReLU(nn.Module):
@@ -32,54 +68,52 @@ class SquaredReLU(nn.Module):
     https://azizbelaweid.substack.com/p/what-is-swiglu-how-to-implement-it
     """
 
-    def __init__(self, clamp=True, leaky=True) -> None:
+    def __init__(
+        self, clamp=True, leaky=True, leaky_slope: float = 0.01, clamp_max=6
+    ) -> None:
         super().__init__()
         self.clamp = clamp
         self.leaky = leaky
+        self.leaky_slope = leaky_slope
+        self.clamp_max = clamp_max
 
     def forward(self, x):
         if self.leaky:
-            relu = F.leaky_relu(x)
+            relu = F.leaky_relu(x, leaky_slope=self.leaky_slope)
         else:
             relu = F.relu(x)
         relu_squared = relu**2
         if self.clamp:
-            relu_squared = torch.clamp(relu_squared, max=6)
+            relu_squared = torch.clamp(relu_squared, max=self.clamp_max)
         return relu_squared
 
 
-class ReLU(nn.Module):
+class XGLU(nn.Module):
     """
-    A ReLU activation function with optional clamp and leakiness.
+    Generic Gated Linear Unit
     """
 
-    def __init__(self, clamp=True, leaky=True) -> None:
+    def __init__(self, activation_module: nn.Module) -> None:
         super().__init__()
-        self.clamp = clamp
-        self.leaky = leaky
+        self.activation = activation_module
 
-    def forward(self, x):
-        if self.leaky:
-            relu = F.leaky_relu(x)
-        else:
-            relu = F.relu(x)
-        if self.clamp:
-            relu = torch.clamp(relu, max=6)
-        return relu
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate, value = x.chunk(2, dim=-1)
+        return self.activation(gate) * value
 
 
-class GELU(nn.Module):
+def SquaredReGLU(clamp=True, leaky=True, leaky_slope=0.01, clamp_max=6.0) -> XGLU:
     """
-    A ReLU activation function with optional clamp and leakiness.
+    Factory function that creates a GLU with a SquaredReLU activation.
     """
+    activation_module = SquaredReLU(
+        clamp=clamp, leaky=leaky, leaky_slope=leaky_slope, clamp_max=clamp_max
+    )
+    return XGLU(activation_module)
 
-    def __init__(self, clamp=True) -> None:
-        super().__init__()
-        self.clamp = clamp
-        self.gelu = nn.GELU()
 
-    def forward(self, x):
-        gelu = self.gelu(x)
-        if self.clamp:
-            gelu = torch.clamp(gelu, max=6)
-        return gelu
+def SwiGLU() -> XGLU:
+    """
+    Factory function that creates a GLU with a Swish activation.
+    """
+    return XGLU(Swish())
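
The net effect of this refactor is that gating and activation are now composable: XGLU halves the last dimension into a gate and a value, and SwiGLU / SquaredReGLU are thin factories over it. A minimal usage sketch, assuming these classes are exported from the package's activations module (the import path below is illustrative, not confirmed by the diff):

    import torch
    from broccoli.activations import SwiGLU, SquaredReGLU, XGLU, GELU  # import path assumed

    x = torch.randn(4, 128)          # last dim must be even: XGLU chunks it into two halves

    swiglu = SwiGLU()                # XGLU wrapping Swish, with a learnable swish_beta
    y = swiglu(x)                    # shape (4, 64): value half gated by swish(gate half)

    sq_reglu = SquaredReGLU(clamp=True, leaky=True)   # XGLU wrapping SquaredReLU
    z = sq_reglu(x)                  # also shape (4, 64)

    geglu = XGLU(GELU(clamp=True))   # any nn.Module can serve as the gate activation
    w = geglu(x)                     # shape (4, 64)

Note that, unlike the removed SwiGLU class, the gate/value split now uses Tensor.chunk rather than einops.rearrange, which is why the einops import was dropped.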
@@ -0,0 +1,95 @@
+# UNDER CONSTRUCTION
+
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .tensor import SigmaReparamTensor, AnchoredReparamTensor
+
+
+class SpectralNormLinear(nn.Module):
+    """
+    Inspired by Apple's Spectral Normed Linear Layers
+    (https://github.com/apple/ml-sigma-reparam)
+    """
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.use_bias = bias
+
+        self.weights = None
+
+        # Define the bias vector as a learnable parameter if required.
+        if self.use_bias:
+            self.bias = nn.Parameter(torch.empty(out_features))
+        else:
+            # If no bias, register it as None.
+            # This is important so that PyTorch doesn't complain when saving/loading the model.
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        weights = torch.empty(self.out_features, self.in_features)
+        nn.init.kaiming_uniform_(weights, a=math.sqrt(5))
+        if self.use_bias:
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weights)
+            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+            nn.init.uniform_(self.bias, -bound, bound)
+        self.weights = SigmaReparamTensor(weights)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weights(), self.bias)
+
+    def __repr__(self) -> str:
+        # Optional: A nice representation for printing the module.
+        return (
+            f"SpectralNormFeedForward(in_features={self.in_features},"
+            f"out_features={self.out_features}, bias={self.use_bias})"
+        )
+
+
+class AnchoredLinear(nn.Module):
+    """
+    ...
+    """
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.use_bias = bias
+
+        self.weights = None
+
+        # Define the bias vector as a learnable parameter if required.
+        if self.use_bias:
+            self.bias = nn.Parameter(torch.empty(out_features))
+        else:
+            # If no bias, register it as None.
+            # This is important so that PyTorch doesn't complain when saving/loading the model.
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        weights = torch.empty(self.out_features, self.in_features)
+        nn.init.kaiming_uniform_(weights, a=math.sqrt(5))
+        if self.use_bias:
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weights)
+            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+            nn.init.uniform_(self.bias, -bound, bound)
+        self.weights = AnchoredReparamTensor(weights)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weights(), self.bias)
+
+    def __repr__(self) -> str:
+        # Optional: A nice representation for printing the module.
+        return (
+            f"AnchoredLinear(in_features={self.in_features},"
+            f"out_features={self.out_features}, bias={self.use_bias})"
+        )
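
AnchoredLinear mirrors SpectralNormLinear but holds its weight in an AnchoredReparamTensor instead. A minimal sketch of how it might be used as a drop-in replacement for nn.Linear, assuming the module path shown (not confirmed by the diff):

    import torch
    from broccoli.linear import AnchoredLinear  # import path assumed

    layer = AnchoredLinear(in_features=256, out_features=512, bias=True)
    x = torch.randn(8, 256)
    y = layer(x)                                 # shape (8, 512), same call pattern as nn.Linear

    # The effective weight matrix is rebuilt from the reparameterisation on every forward,
    # so the trainable parameters are the bias plus the inner weight and its scale.
    print([name for name, _ in layer.named_parameters()])
    # expected (order may vary): ['bias', 'weights.weight', 'weights.scale']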
@@ -54,3 +54,43 @@ class SigmaReparamTensor(nn.Module):
         return self.sigma_reparam_scale * (
             self.sigma_reparam_tensor / self.approx_spectral_norm
         )
+
+
+class AnchoredReparamTensor(nn.Module):
+    """
+    Reparameterise a tensor as a normalised tensor of weights multiplied by a
+    learnable scaling factor.
+
+    The tensor of weights is also reparameterised as the product of a learnable
+    weight tensor with the (fixed) dominant right-singular vector of the
+    weight tensor as it was initialised.
+
+    i.e. this module represents a tensor reparameterised as:
+
+        W_reparam = scale * (W / ||W @ v_0||_2)
+
+    where v_0 is the dominant right-singular vector of the initial tensor W_init.
+    """
+
+    def __init__(self, init_tensor: torch.Tensor):
+        assert init_tensor.ndim == 2, "Input tensor must be a 2D matrix."
+        super().__init__()
+
+        self.weight = nn.Parameter(init_tensor.clone(), requires_grad=True)
+
+        # At initialization, compute the dominant right-singular vector (v_0)
+        # and store it in a non-trainable buffer.
+        with torch.no_grad():
+            _, _, v_transpose = torch.linalg.svd(self.weight, full_matrices=False)
+            # v_transpose[0] is the first row of V^T, which is the first right-singular vector.
+            self.register_buffer("anchor_vector", v_transpose[0])
+
+        initial_norm = torch.linalg.vector_norm(self.weight.mv(self.anchor_vector))
+        self.scale = nn.Parameter(initial_norm.clone().detach(), requires_grad=True)
+
+    def forward(self) -> torch.Tensor:
+        # Calculate the L2 norm of the matrix-vector product W @ v_0
+        norm = torch.linalg.vector_norm(self.weight.mv(self.anchor_vector))
+
+        # Return the reparameterized tensor.
+        return self.scale * (self.weight / (norm + 1e-6))
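
Where SigmaReparamTensor normalises by an approximate spectral norm, AnchoredReparamTensor normalises by ||W @ v_0|| with the anchor direction v_0 frozen at initialisation (stored as a buffer, not a parameter). One consequence, sketched below, is that the module reproduces its initial weights almost exactly at initialisation, since scale starts equal to the same norm the forward pass divides by. The import path is assumed:

    import torch
    from broccoli.tensor import AnchoredReparamTensor  # import path assumed

    w = torch.randn(64, 32)
    reparam = AnchoredReparamTensor(w)

    # At init, scale == ||W @ v_0|| and forward divides by that same norm (plus 1e-6),
    # so the reparameterised tensor matches the original weights to high precision.
    print(torch.allclose(reparam(), w, atol=1e-4))   # expected: True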
@@ -10,7 +10,7 @@ import torch.nn.functional as F
 from einops import rearrange
 
 from .rope import RotaryEmbedding, apply_rotary_emb
-from .linear import SpectralNormLinear
+from .linear import AnchoredLinear
 
 
 class MHAttention(nn.Module):
@@ -236,7 +236,7 @@ class FeedforwardBlock(nn.Module):
         activation_kwargs=None,
         dropout=0.0,
         linear_module=nn.Linear,
-        sigma_reparam=False,
+        reparam=False,
     ):
         super().__init__()
 
@@ -253,8 +253,8 @@
             else ratio * output_features
         )
 
-        if sigma_reparam:
-            self.memory_type = SpectralNormLinear
+        if reparam:
+            self.memory_type = AnchoredLinear
         else:
             self.memory_type = linear_module
 
@@ -263,7 +263,7 @@
             nn.LayerNorm(input_features),
             linear_module(input_features, self.max_features),
             self.activation,
-            nn.LayerNorm(ratio * output_features),
+            # nn.LayerNorm(ratio * output_features),
             self.memory_type(ratio * output_features, output_features),
             self.dropout,
         ]
@@ -295,14 +295,14 @@ class ViTEncoder(nn.Module):
 
         if transformer_feedforward_first:
             self.initial_ff = FeedforwardBlock(
-                transformer_embedding_size,
+                max(transformer_embedding_size, pooling_out_channels),
                 transformer_mlp_ratio,
                 transformer_embedding_size,
                 activation=transformer_activation,
                 activation_kwargs=transformer_activation_kwargs,
                 dropout=transformer_mlp_dropout,
                 linear_module=linear_module,
-                sigma_reparam=not cnn,
+                reparam=not cnn,
             )
         else:
             self.initial_ff = nn.Identity()
@@ -1,6 +1,6 @@
 [project]
 name = "broccoli-ml"
-version = "0.23.1"
+version = "0.24.0"
 description = "Some useful Pytorch models, circa 2025"
 authors = [
     {name = "Nicholas Bailey"}
@@ -1,89 +0,0 @@
-# UNDER CONSTRUCTION
-
-import math
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-from .tensor import SigmaReparamTensor
-
-
-class SpectralNormLinear(nn.Module):
-    """
-    Inspired by Apple's Spectral Normed Linear Layers
-    (https://github.com/apple/ml-sigma-reparam)
-    """
-
-    def __init__(self, in_features: int, out_features: int, bias: bool = True):
-        super().__init__()
-        self.in_features = in_features
-        self.out_features = out_features
-        self.use_bias = bias
-
-        self.weights = None
-
-        self.weight_init = nn.Parameter(torch.empty(out_features, in_features))
-
-        # Define the bias vector as a learnable parameter if required.
-        if self.use_bias:
-            self.bias = nn.Parameter(torch.empty(out_features))
-        else:
-            # If no bias, register it as None.
-            # This is important so that PyTorch doesn't complain when saving/loading the model.
-            self.register_parameter("bias", None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self) -> None:
-        nn.init.kaiming_uniform_(self.weight_init, a=math.sqrt(5))
-        if self.use_bias:
-            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight_init)
-            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
-            nn.init.uniform_(self.bias, -bound, bound)
-        self.weights = SigmaReparamTensor(self.weight_init)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return F.linear(x, self.weights(), self.bias)
-
-    def __repr__(self) -> str:
-        # Optional: A nice representation for printing the module.
-        return (
-            f"SpectralNormFeedForward(in_features={self.in_features}",
-            f"out_features={self.out_features}, bias={self.use_bias})",
-        )
-
-
-class RandomLinear(nn.Linear):
-    """ """
-
-    def __init__(
-        self,
-        in_features: int,
-        out_features: int,
-        bias: bool = False,  # <---- TODO: explain this
-        beta=0.1,
-        forward_looks_random=True,
-    ):
-        super().__init__(in_features, out_features, bias=False)
-        self.beta = beta
-        self.forward_looks_random = forward_looks_random
-
-    def forward(self, inputs: torch.Tensor):
-        if not self.training:
-            return F.linear(inputs, self.weight)
-        else:
-            # Initialise self.random_weights
-            random_weights = torch.empty_like(self.weight)
-            nn.init.trunc_normal_(random_weights)
-            random_weights *= self.beta
-
-            if self.forward_looks_random:
-                # Forward using a reparameterisation trick
-                a = F.linear(inputs.detach(), self.weight, self.bias)
-                b = F.linear(inputs, random_weights, bias=None)
-            else:
-                # Forward as (W_actual * input + W_random * input) + bias
-                a = F.linear(inputs, self.weight, self.bias)
-                b = F.linear(inputs, random_weights, bias=None)
-
-            return a + b