broccoli-ml 0.35.0__py3-none-any.whl → 0.36.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- broccoli/tensor.py +13 -13
- broccoli/transformer.py +4 -41
- broccoli/vit.py +1 -7
- {broccoli_ml-0.35.0.dist-info → broccoli_ml-0.36.0.dist-info}/METADATA +1 -1
- {broccoli_ml-0.35.0.dist-info → broccoli_ml-0.36.0.dist-info}/RECORD +7 -7
- {broccoli_ml-0.35.0.dist-info → broccoli_ml-0.36.0.dist-info}/LICENSE +0 -0
- {broccoli_ml-0.35.0.dist-info → broccoli_ml-0.36.0.dist-info}/WHEEL +0 -0
broccoli/tensor.py
CHANGED
@@ -77,25 +77,25 @@ class AnchoredReparamTensor(nn.Module):
 
         super().__init__()
 
-        self.
+        self.weight = nn.Parameter(init_tensor, requires_grad=True)
 
         with torch.no_grad():
-            _, sigma, v_transpose = torch.linalg.svd(
-                self.nondecay_weight, full_matrices=False
-            )
+            _, sigma, v_transpose = torch.linalg.svd(self.weight, full_matrices=False)
 
         self.register_buffer("rayleigh_norm", sigma[:1])
         self.register_buffer("initial_right_singular", v_transpose[0])
-        self.
+        self.nondecay_scale = nn.Parameter(
+            sigma[:1].clone().detach(), requires_grad=True
+        )
 
     def _update_rayleigh_norm(self):
         with torch.no_grad():
-            product = self.
+            product = self.weight.mv(self.initial_right_singular)
             normed_product = F.normalize(product, dim=0)
             rayleigh_norm = torch.einsum(
                 "m,mn,n->",
                 normed_product,
-                self.
+                self.weight,
                 self.initial_right_singular,
             )
             self.rayleigh_norm.data.copy_(rayleigh_norm)
@@ -103,7 +103,7 @@ class AnchoredReparamTensor(nn.Module):
     def forward(self):
         if self.training:
            self._update_rayleigh_norm()
-        return self.
+        return self.nondecay_scale * (self.weight / (self.rayleigh_norm + 1e-6))
 
 
 class NormReparamTensor(nn.Module):
@@ -118,11 +118,11 @@ class NormReparamTensor(nn.Module):
 
         # Use the gradboard convention of calling something nondecay_* if we should
         # exclude it from weight decay
-        self.
-        self.
-            torch.linalg.norm(self.
+        self.weight = nn.Parameter(init_tensor.clone(), requires_grad=True)
+        self.nondecay_scale = nn.Parameter(
+            torch.linalg.norm(self.weight).clone().detach(), requires_grad=True
         )
 
     def forward(self) -> torch.Tensor:
-        norm = torch.linalg.norm(self.
-        return self.
+        norm = torch.linalg.norm(self.weight)
+        return self.nondecay_scale * (self.weight / (norm + 1e-6))
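The tensor.py hunks rename the stored parameter to weight and move the learnable magnitude into a separate nondecay_scale parameter that an optimizer group can exclude from weight decay. Below is a minimal, self-contained sketch of that pattern, mirroring the new NormReparamTensor.forward; it is illustrative only, not the package's actual class.

# Sketch of the scale/direction split introduced in this release.
# Names mirror the diff, but this is not broccoli's implementation.
import torch
import torch.nn as nn


class NormReparam(nn.Module):
    def __init__(self, init_tensor: torch.Tensor):
        super().__init__()
        # Direction parameter (subject to weight decay).
        self.weight = nn.Parameter(init_tensor.clone(), requires_grad=True)
        # Learnable scale, named nondecay_* so an optimizer-group filter
        # can exclude it from weight decay.
        self.nondecay_scale = nn.Parameter(
            torch.linalg.norm(self.weight).clone().detach(), requires_grad=True
        )

    def forward(self) -> torch.Tensor:
        # Normalize the stored tensor (eps avoids division by zero),
        # then rescale by the learnable scalar.
        norm = torch.linalg.norm(self.weight)
        return self.nondecay_scale * (self.weight / (norm + 1e-6))


w = NormReparam(torch.randn(8, 4))()
print(w.shape, torch.linalg.norm(w))  # norm is approximately the learnable scale

The AnchoredReparamTensor variant in the hunk above divides by a running Rayleigh-quotient estimate of the top singular value (rayleigh_norm) instead of the full norm, but the scale/direction split is the same.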
broccoli/transformer.py
CHANGED
@@ -21,45 +21,6 @@ class MHAttention(nn.Module):
     are the same shape.
 
     Assumes bias=False and batch_first=True, as God intended.
-
-    Optionally adds various bells and whistles suggested in the
-    literature, including:
-
-    Noam Shazeer's scaled attention per "Attention is All You Need"
-    (https://arxiv.org/abs/1706.03762).
-
-    Max subtract softmax as discussed in "Attention As An RNN"
-    (https://arxiv.org/abs/2405.13956)
-
-    Log-length scaled softmax per "Overcoming a Theoretical Limitation of
-    Self-Attention" (https://arxiv.org/abs/2202.12172).
-
-    Quiet softmax per
-    https://www.evanmiller.org/attention-is-off-by-one.html
-
-    Args:
-        d_model: ...
-        n_heads: ...
-        dropout: ...
-        causal: should a causal mask be applied to the logits before attention
-            is applied? This is standard when using self-attention. Cannot be
-            True if inputs won't be square (e.g. if sequence length for
-            encoder and decoder are different)
-        sequence_length: ...
-        share_kv: ...
-        linear_module: ...
-        max_subtract: if True, the maximum logit value is subtracted from all
-            logits before performing the softmax operation to create a more
-            numerically stable softmax. This is discussed in "Attention As An
-            RNN" (https://arxiv.org/abs/2405.13956).
-        d_model_scale: ...
-        log_length_scale: if True, multiplies logits by the log length of
-            the decoder sequence before performing the softmax operation, as
-            proposed in "Overcoming a Theoretical Limitation of Self-Attention"
-            (https://arxiv.org/abs/2202.12172).
-        quiet: if True, adds 1 to the denominator of the softmax operation,
-            allowing some tokens to attend to no other tokens as described in
-            https://www.evanmiller.org/attention-is-off-by-one.html.
     """
 
     def __init__(
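The removed docstring described optional softmax variants: max-subtract, log-length scaling, and quiet softmax. As an illustration of what those options mean, and not MHAttention's actual code path, the logit tweaks can be sketched like this:

# Hedged sketch of the softmax variants the removed docstring described.
import math
import torch


def attention_softmax(logits: torch.Tensor,
                      max_subtract: bool = True,
                      log_length_scale: bool = False,
                      quiet: bool = False) -> torch.Tensor:
    # logits: (..., query_len, key_len); for self-attention the two lengths coincide.
    if log_length_scale:
        # Scale logits by the log of the sequence length
        # ("Overcoming a Theoretical Limitation of Self-Attention").
        logits = logits * math.log(logits.size(-1))
    if max_subtract:
        # Subtract the row-wise max for numerical stability ("Attention As An RNN").
        logits = logits - logits.max(dim=-1, keepdim=True).values
    exp = torch.exp(logits)
    denom = exp.sum(dim=-1, keepdim=True)
    if quiet:
        # "Quiet" softmax: add 1 to the denominator so a query may attend to nothing
        # (approximate when combined with max subtraction).
        denom = denom + 1.0
    return exp / denom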
@@ -280,7 +241,7 @@ class FeedforwardBlock(nn.Module):
         elif self.residual_path:
             return x + self.process(x)
         else:
-            return x
+            return self.process(x)
 
 
 class TransformerBlock(nn.Module):
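This FeedforwardBlock fix means the non-residual branch now applies the feedforward stack instead of silently returning its input. A sketch of the corrected branching, where the first branch and the process stack are hypothetical stand-ins for whatever the real class uses:

import torch
import torch.nn as nn


class FFBlock(nn.Module):
    def __init__(self, d_model: int, residual_path: bool = True, identity: bool = False):
        super().__init__()
        # Hypothetical feedforward stack; only the branch structure mirrors the diff.
        self.process = nn.Sequential(
            nn.Linear(d_model, 4 * d_model), nn.GELU(), nn.Linear(4 * d_model, d_model)
        )
        self.residual_path = residual_path
        self.identity = identity

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.identity:
            return x
        elif self.residual_path:
            return x + self.process(x)
        else:
            # Previously this branch returned x unchanged; after the fix the
            # feedforward is applied even without the residual connection.
            return self.process(x)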
@@ -374,7 +335,9 @@ class TransformerBlock(nn.Module):
         identity_probability = self.identity_probability
 
         # perform the identity operation for some rows in the batch
-
+        dist = torch.distributions.Binomial(x.size(0), identity_probability)
+        identity_count = int(dist.sample().item())
+
         shuffle_indices = torch.randperm(x.size(0), device=x.device)
         unshuffle_indices = torch.argsort(shuffle_indices)
         shuffled = x[shuffle_indices, :, :]
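The new TransformerBlock lines draw the number of identity rows from a Binomial distribution before the existing shuffle. A self-contained sketch of the overall pattern follows; the stochastic_identity helper and the block argument are hypothetical, and only the Binomial/randperm lines mirror the diff.

import torch


def stochastic_identity(x: torch.Tensor, block, identity_probability: float) -> torch.Tensor:
    # Number of rows that take the identity path this step.
    dist = torch.distributions.Binomial(x.size(0), identity_probability)
    identity_count = int(dist.sample().item())

    # Shuffle rows so the identity rows are a random subset, then restore order.
    shuffle_indices = torch.randperm(x.size(0), device=x.device)
    unshuffle_indices = torch.argsort(shuffle_indices)
    shuffled = x[shuffle_indices, :, :]

    skipped = shuffled[:identity_count]           # pass through unchanged
    processed = block(shuffled[identity_count:])  # run the block on the rest
    return torch.cat([skipped, processed], dim=0)[unshuffle_indices]


out = stochastic_identity(torch.randn(16, 10, 32), torch.nn.Identity(), 0.25)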
broccoli/vit.py
CHANGED
@@ -236,13 +236,7 @@ class ViTEncoder(nn.Module):
 
         if pooling_type is None:
             pooling_out_channels = cnn_activation_out_channels
-            self.pool = nn.
-                *[
-                    Rearrange(
-                        f"N C {spatial_dim_names} -> N ({spatial_dim_names}) C"
-                    ), # for transformer
-                ]
-            )
+            self.pool = nn.Identity()
 
         elif pooling_type == "max":
             pooling_out_channels = cnn_activation_out_channels
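With pooling_type=None the pool is now a plain nn.Identity(), a no-op module that returns its input unchanged, so self.pool exists for every pooling_type. A minimal illustration (arbitrary tensor shape, not the ViT's real feature map):

import torch
import torch.nn as nn

pool = nn.Identity()
feats = torch.randn(2, 64, 7, 7)
assert pool(feats) is feats  # nothing is copied or reshaped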
{broccoli_ml-0.35.0.dist-info → broccoli_ml-0.36.0.dist-info}/RECORD
CHANGED
@@ -7,11 +7,11 @@ broccoli/cnn.py,sha256=jeRyKIAMWu1E3iyI14MGgSZuZivPMh12iqkqW9ilNjo,17785
 broccoli/eigenpatches.py,sha256=J6n2usN1oQuHEHYiBNyYpn_a9eQcHjOBiIlvSei520Y,2413
 broccoli/linear.py,sha256=8Y9vD85ZEgNZsIQgO3uRQ3lOQR-JjwvabY8liCrfNCk,4831
 broccoli/rope.py,sha256=hw7kBPNR9GQXj4GxyIAffsGKPfcTPOFh8Bc7oEHtaZY,12108
-broccoli/tensor.py,sha256=
-broccoli/transformer.py,sha256=
+broccoli/tensor.py,sha256=um8mrxkYbvNDo-QvHlmJm8Aw6qcngOlUZPoAk_PMReA,4480
+broccoli/transformer.py,sha256=NH94U6lxHzmDGDHTTtJV2kUs7IcS2iNmFJl44_6KtQ0,15456
 broccoli/utils.py,sha256=htq_hOsdhUhL0nJi9WkKiEYOjEoWqFpK5X49PtgTf-0,299
-broccoli/vit.py,sha256=
-broccoli_ml-0.
-broccoli_ml-0.
-broccoli_ml-0.
-broccoli_ml-0.
+broccoli/vit.py,sha256=05xqIw9xvE5easXcp4wIA1jQ0xUyRIq6h0ZDtbitXi4,17184
+broccoli_ml-0.36.0.dist-info/LICENSE,sha256=0BAzJE5BqQ7Iixp_AFdB2W1uO-HCRX-Qfun8PHt6yVM,1073
+broccoli_ml-0.36.0.dist-info/METADATA,sha256=csog4ZG1PGeRuFO5QnHdVPgmDYXsGQQJ621JgU0D83w,1257
+broccoli_ml-0.36.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+broccoli_ml-0.36.0.dist-info/RECORD,,
{broccoli_ml-0.35.0.dist-info → broccoli_ml-0.36.0.dist-info}/LICENSE
File without changes
{broccoli_ml-0.35.0.dist-info → broccoli_ml-0.36.0.dist-info}/WHEEL
File without changes