PyPI - broccoli-ml - Versions diffs - 9.2.1__py3-none-any.whl → 9.3.0__py3-none-any.whl - Mend

broccoli-ml 9.2.1__py3-none-any.whl → 9.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

broccoli/linear.py CHANGED Viewed

@@ -151,8 +151,13 @@ class RecyclingLinear(nn.Module):
         row_recycling_rate: float = 0.0,
         column_recycling_rate: float = 0.0,
         adaptive=False,
+        xglu=False,
     ):
         super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.bias = bias
+        self.xglu = xglu
         self.linear = nn.Linear(in_features, out_features, bias=bias)
         self.row_recycling_rate = row_recycling_rate
         self.column_recycling_rate = column_recycling_rate
@@ -188,28 +193,50 @@ class RecyclingLinear(nn.Module):
             multipliers = [a / b for a, b in pairs if b != 0.0]
             return min(multipliers) if multipliers else 0.0
-    def forward(self, x):
-        multiplier = self._get_multiplier()
-        col_recycling_rate = self.column_recycling_rate * multiplier
-        row_recycling_rate = self.row_recycling_rate * multiplier
+    def reset_rows(self, indices):
+        if not torch.is_tensor(indices):
+            idx_tensor = torch.as_tensor(
+                list(indices), dtype=torch.long, device=self.linear.weight.device
+            )
+        else:
+            idx_tensor = indices
-        if self.training and self.optimisers:
+        if idx_tensor.size(0):
+            random_weights = self._random_weights(
+                indices.size(0), self.linear.weight.size(1)
+            )
+            if self.xglu:
+                gate_indices = indices
+                value_indices = indices + (self.linear.out_features // 2)
+                self._update_weights(value_indices, 0, random_weights, self.optimisers)
+                centred_gate_weights = self._mean_gate_weights()
+                centred_gate_weights = centred_gate_weights.expand(indices.size(0), -1)
+                self._update_weights(
+                    gate_indices, 0, centred_gate_weights, self.optimisers  # dim
+                )
+        else:
+            return
-            if row_recycling_rate > 0:
-                probs = torch.rand(self.linear.out_features, device=x.device)
-                mask = probs < row_recycling_rate
-                if mask.any():
-                    # nonzero returns [N, 1], squeeze to get [N]
-                    indices = torch.nonzero(mask).squeeze(-1)
-                    self.reset_rows(indices, self.optimisers)
-            if col_recycling_rate > 0:
-                probs = torch.rand(self.linear.in_features, device=x.device)
-                mask = probs < col_recycling_rate
-                if mask.any():
-                    indices = torch.nonzero(mask).squeeze(-1)
-                    self.reset_columns(indices, self.optimisers)
+    def reset_columns(self, indices):
+        if not torch.is_tensor(indices):
+            idx_tensor = torch.as_tensor(
+                list(indices), dtype=torch.long, device=self.linear.weight.device
+            )
+        else:
+            idx_tensor = indices
+        if idx_tensor.size(0):
+            random_weights = self._random_weights(
+                self.linear.weight.size(0), indices.size(0)
+            )
+            self._update_weights(indices, 1, random_weights, self.optimisers)  # dim
+        else:
+            return
+    def forward(self, x):
+        if self.training and self.optimisers:
+            self.reset_rows(self.get_reset_indices(0))
+            self.reset_columns(self.get_reset_indices(1))
         elif self.training and not self._warned_about_registration:
             warnings.warn(
                 "RecyclingLinear: No optimiser registered. Recycling disabled.",
@@ -219,82 +246,77 @@ class RecyclingLinear(nn.Module):
         return self.linear(x)
-    def reset_rows(
-        self,
-        indices: Iterable[int],
-        optimisers: Union[
-            List[torch.optim.Optimizer], torch.optim.Optimizer, None
-        ] = None,
-    ):
-        """
-        Update some of the weight rows to be equal to the mean of all weight rows.
-        """
-        if optimisers is None:
-            optimisers = []
-        if not isinstance(optimisers, list):
-            optimisers = [optimisers]
-        device = self.linear.weight.device
-        idx_tensor = torch.as_tensor(list(indices), dtype=torch.long, device=device)
-        if idx_tensor.numel() == 0:
-            return
-        with torch.no_grad():
-            # Calculate mean of all rows including the rows to be reset
-            mean_vector = self.linear.weight.data.mean(
-                dim=0, keepdim=True
-            )  # [1, in_features]
-            update_data = mean_vector.expand(idx_tensor.size(0), -1)
-            self.linear.weight.data[idx_tensor] = update_data
+    def get_reset_indices(self, dim):
+        base_rate = self.row_recycling_rate if dim == 0 else self.column_recycling_rate
+        p = base_rate * self._get_multiplier()
+        if dim == 0:
+            if self.xglu:
+                sample_space = self.linear.out_features // 2
+            else:
+                sample_space = self.linear.out_features
+        elif dim == 1:
+            sample_space = self.linear.in_features
+        else:
+            raise ValueError("`dim` must be 0 or 1")
-            if self.linear.bias is not None:
-                self.linear.bias.data[idx_tensor] = 0.0
+        # Sample the indices
+        probs = torch.rand(sample_space, device=self.linear.weight.device)
+        mask = probs < p
+        if mask.any():
+            return torch.nonzero(mask).squeeze(-1)
+        else:
+            return torch.tensor([], dtype=torch.long, device=self.linear.weight.device)
-            self._reset_optim_state(self.linear.weight, idx_tensor, optimisers, dim=0)
-            if self.linear.bias is not None:
-                self._reset_optim_state(self.linear.bias, idx_tensor, optimisers, dim=0)
+    def _random_weights(self, rows, columns):
+        device = self.linear.weight.device
+        weights = self.linear.weight.data
+        stdv = 1.0 / math.sqrt(weights.size(1))
+        random_weights = torch.rand(rows, columns, device=device)
+        random_weights -= 0.5  # Range [-0.5, +0.5]
+        random_weights *= 2.0 * stdv  # Range [-stdv, +stdv]
+        return random_weights
+    def _mean_gate_weights(self):
+        """
+        Only used when self.xglu
+        """
+        weights = self.linear.weight.data
+        rows = weights.size(0)
+        return self.linear.weight[: int(rows / 2)].data.mean(dim=0, keepdim=True)
-    def reset_columns(
+    def _update_weights(
         self,
         indices: Iterable[int],
+        dim: int,
+        data: torch.Tensor,
         optimisers: Union[
             List[torch.optim.Optimizer], torch.optim.Optimizer, None
         ] = None,
     ):
-        """
-        Update some of the weight columns to be random as though reinitialised.
-        """
         if optimisers is None:
             optimisers = []
         if not isinstance(optimisers, list):
             optimisers = [optimisers]
-        device = self.linear.weight.device
-        idx_tensor = torch.as_tensor(list(indices), dtype=torch.long, device=device)
+        if not torch.is_tensor(indices):
+            idx_tensor = torch.as_tensor(
+                list(indices), dtype=torch.long, device=self.linear.weight.device
+            )
+        else:
+            idx_tensor = indices
         if idx_tensor.numel() == 0:
             return
         with torch.no_grad():
-            # 1. Generate Random Columns
-            # Shape: [out_features, N_indices]
-            weights = self.linear.weight.data
-            stdv = 1.0 / math.sqrt(weights.size(1))
-            # Generate [Rows, N] block
-            random_weights = torch.rand(
-                weights.size(0), idx_tensor.size(0), device=device
-            )
-            random_weights = (random_weights - 0.5) * 2.0 * stdv
-            # 2. Update Weights (One-shot)
-            # We assign into the columns specified by idx_tensor
-            self.linear.weight.data[:, idx_tensor] = random_weights
-            # 3. Update Optimizers
-            # Bias is untouched by column resets (bias is shape [Out], cols are [In])
-            self._reset_optim_state(self.linear.weight, idx_tensor, optimisers, dim=1)
+            if dim == 0:
+                self.linear.weight.data[idx_tensor] = data
+            elif dim == 1:
+                self.linear.weight.data[:, idx_tensor] = data
+            else:
+                raise ValueError("`dim` must be 0 or 1")
+            self._reset_optim_state(self.linear.weight, idx_tensor, optimisers, dim=dim)
     def _reset_optim_state(self, param, idx_tensor, optimisers, dim):
         """

broccoli/transformer.py CHANGED Viewed

@@ -410,21 +410,9 @@ class FeedforwardBlock(nn.Module):
         # Recycle weights if using recycling linear layers
         if self.training and self.recycling_enabled:
-            multiplier = self.linear_in._get_multiplier()
-            rate = self.master_recycling_rate * multiplier
-            if rate > 0:
-                probs = torch.rand(self.linear_out.in_features, device=x.device)
-                mask = probs < rate
-                if mask.any():
-                    indices = torch.nonzero(mask).squeeze(-1)
-                    self.linear_out.reset_columns(indices, self.linear_out.optimisers)
-                    if self.xglu:
-                        indices_in = torch.cat(
-                            [indices, indices + self.linear_out.in_features]
-                        )
-                        self.linear_in.reset_rows(indices_in, self.linear_in.optimisers)
-                    else:
-                        self.linear_in.reset_rows(indices, self.linear_in.optimisers)
+            indices = self.linear_out.get_reset_indices(1)
+            self.linear_in.reset_rows(indices)
+            self.linear_out.reset_columns(indices)
         if self.checkpoint:
             processed = checkpoint(self.process, x, use_reentrant=False)

{broccoli_ml-9.2.1.dist-info → broccoli_ml-9.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: broccoli-ml
-Version: 9.2.1
+Version: 9.3.0
 Summary: Some useful Pytorch models, circa 2025
 License: MIT
 Author: Nicholas Bailey

{broccoli_ml-9.2.1.dist-info → broccoli_ml-9.3.0.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
 broccoli/__init__.py,sha256=tmyspsVxqPZHRQCY_NRwpW4SMNBbtE8E_8z7l-SAzSo,127
 broccoli/activation.py,sha256=-Jf30C6iGqWCorC9HEGn2oduWwjeaCAxGLUUYIy1zX8,3438
 broccoli/cnn.py,sha256=WjoPDSpe3ttwxCBNfCVRdaCHvbeZ7G-a5_i8fUsK_d8,4889
-broccoli/linear.py,sha256=7uN7zVPJ6Ptec31O8a-GvWT5nZk56Wf1RLJRvUAT0yo,11406
+broccoli/linear.py,sha256=Fn3eqgv1X2M5iXZmtP6jBzfUYuWMkiLlgkBDryv6Ho8,11999
 broccoli/rope.py,sha256=GRqApBNmYCFaDak0WL1xE_BC5CTTYKQU_PBdeTcQcjc,12557
 broccoli/tensor.py,sha256=um8mrxkYbvNDo-QvHlmJm8Aw6qcngOlUZPoAk_PMReA,4480
-broccoli/transformer.py,sha256=r-ggAeNDW5QpBi9As1U9sIfxITBOx0WHk_K4zWpyTM8,26233
+broccoli/transformer.py,sha256=ULk-QQX3hAI14-aCKhp9QSebzX4KUjlisEGup2Eycck,25565
 broccoli/utils.py,sha256=oOWzn6dJ5nC_9r4zq0emmfmaYACJXJNFS48AOpW2jqc,358
 broccoli/vit.py,sha256=sC6K3FK3a8ojOgvNWSWhuZHBtnFrrTQbsDdlagcKJH4,22224
-broccoli_ml-9.2.1.dist-info/LICENSE,sha256=0BAzJE5BqQ7Iixp_AFdB2W1uO-HCRX-Qfun8PHt6yVM,1073
-broccoli_ml-9.2.1.dist-info/METADATA,sha256=Nj7WnXKxlvSlrK8rQp9wizgPGs7ZMnhCi-KY5O6W-wc,1368
-broccoli_ml-9.2.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-broccoli_ml-9.2.1.dist-info/RECORD,,
+broccoli_ml-9.3.0.dist-info/LICENSE,sha256=0BAzJE5BqQ7Iixp_AFdB2W1uO-HCRX-Qfun8PHt6yVM,1073
+broccoli_ml-9.3.0.dist-info/METADATA,sha256=avjuGvDLh6q6v-7E3dCq0jCNC17-vag52vweC2W26QU,1368
+broccoli_ml-9.3.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+broccoli_ml-9.3.0.dist-info/RECORD,,

{broccoli_ml-9.2.1.dist-info → broccoli_ml-9.3.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{broccoli_ml-9.2.1.dist-info → broccoli_ml-9.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

broccoli-ml 9.2.1__py3-none-any.whl → 9.3.0__py3-none-any.whl

broccoli-ml 9.2.1__py3-none-any.whl → 9.3.0__py3-none-any.whl