broccoli-ml 0.35.1__py3-none-any.whl → 0.37.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- broccoli/transformer.py +100 -63
- broccoli/vit.py +1 -7
- {broccoli_ml-0.35.1.dist-info → broccoli_ml-0.37.0.dist-info}/METADATA +1 -1
- {broccoli_ml-0.35.1.dist-info → broccoli_ml-0.37.0.dist-info}/RECORD +6 -6
- {broccoli_ml-0.35.1.dist-info → broccoli_ml-0.37.0.dist-info}/LICENSE +0 -0
- {broccoli_ml-0.35.1.dist-info → broccoli_ml-0.37.0.dist-info}/WHEEL +0 -0
broccoli/transformer.py
CHANGED
@@ -13,6 +13,45 @@ from .rope import RotaryEmbedding, apply_rotary_emb
 from .linear import AnchoredLinear, SpectralNormLinear


+def drop_path(
+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+):
+    """
+    From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+    Copyright 2019 Ross Wightman
+    See documentation and licence there.
+    """
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+
+class DropPath(nn.Module):
+    """
+    From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+    Copyright 2019 Ross Wightman
+    See documentation and licence there.
+    """
+
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
+
+
 class MHAttention(nn.Module):
     """
     Multi-head self-attention using einops and optionally a custom linear layer.
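Note: the `drop_path` / `DropPath` code vendored above (from timm) implements stochastic depth: during training, each sample in the batch is zeroed with probability `drop_prob` and the survivors are rescaled by `1 / keep_prob`; in eval mode it is a no-op. A minimal usage sketch (tensor shapes are illustrative, not taken from the package):

```python
import torch
from broccoli.transformer import DropPath  # class added in 0.37.0, shown above

dp = DropPath(drop_prob=0.25, scale_by_keep=True)
branch_output = torch.randn(8, 16, 64)  # (batch, tokens, d_model) -- assumed shapes

dp.train()
out = dp(branch_output)   # roughly 25% of the samples are zeroed out,
                          # the rest are scaled by 1 / 0.75

dp.eval()
assert torch.equal(dp(branch_output), branch_output)  # identity at inference
```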
@@ -21,45 +60,6 @@ class MHAttention(nn.Module):
     are the same shape.

     Assumes bias=False and batch_first=True, as God intended.
-
-    Optionally adds various bells and whistles suggested in the
-    literature, including:
-
-    Noam Shazeer's scaled attention per "Attention is All You Need"
-    (https://arxiv.org/abs/1706.03762).
-
-    Max subtract softmax as discussed in "Attention As An RNN"
-    (https://arxiv.org/abs/2405.13956)
-
-    Log-length scaled softmax per "Overcoming a Theoretical Limitation of
-    Self-Attention" (https://arxiv.org/abs/2202.12172).
-
-    Quiet softmax per
-    https://www.evanmiller.org/attention-is-off-by-one.html
-
-    Args:
-        d_model: ...
-        n_heads: ...
-        dropout: ...
-        causal: should a causal mask be applied to the logits before attention
-            is applied? This is standard when using self-attention. Cannot be
-            True if inputs won't be square (e.g. if sequence length for
-            encoder and decoder are different)
-        sequence_length: ...
-        share_kv: ...
-        linear_module: ...
-        max_subtract: if True, the maximum logit value is subtracted from all
-            logits before performing the softmax operation to create a more
-            numerically stable softmax. This is discussed in "Attention As An
-            RNN" (https://arxiv.org/abs/2405.13956).
-        d_model_scale: ...
-        log_length_scale: if True, multiplies logits by the log length of
-            the decoder sequence before performing the softmax operation, as
-            proposed in "Overcoming a Theoretical Limitation of Self-Attention"
-            (https://arxiv.org/abs/2202.12172).
-        quiet: if True, adds 1 to the denominator of the softmax operation,
-            allowing some tokens to attend to no other tokens as described in
-            https://www.evanmiller.org/attention-is-off-by-one.html.
     """

     def __init__(
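Note: the docstring removed above described optional softmax variants (max-subtract, log-length scaling, "quiet" softmax) applied to the attention logits. For reference, those variants can be sketched as a standalone function; this is an illustrative sketch, not the package's actual MHAttention implementation:

```python
import math
import torch

def softmax_variants(logits, max_subtract=False, log_length_scale=False, quiet=False):
    """Sketch of the attention-softmax tweaks described in the removed docstring.

    logits: (..., query_len, key_len) attention scores before softmax.
    """
    if log_length_scale:
        # Scale logits by the log of the key sequence length, per arXiv:2202.12172.
        logits = logits * math.log(logits.size(-1))
    if max_subtract:
        # Subtract the per-row maximum for numerical stability (arXiv:2405.13956).
        logits = logits - logits.max(dim=-1, keepdim=True).values
    exp = torch.exp(logits)
    denom = exp.sum(dim=-1, keepdim=True)
    if quiet:
        # "Quiet" softmax: +1 in the denominator lets a query attend to nothing
        # (https://www.evanmiller.org/attention-is-off-by-one.html).
        denom = denom + 1
    return exp / denom
```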
@@ -280,7 +280,7 @@ class FeedforwardBlock(nn.Module):
         elif self.residual_path:
             return x + self.process(x)
         else:
-            return x
+            return self.process(x)


 class TransformerBlock(nn.Module):
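Note: this one-line change fixes the fall-through branch of `FeedforwardBlock.forward`: with `residual_path=False` (and no pre/post-norm wiring) the block previously returned its input untouched, silently skipping the MLP; it now applies the projection without a residual connection. A hypothetical stand-in for the block's internal `process` MLP illustrates the behavioural difference:

```python
import torch
from torch import nn

# Hypothetical stand-in for FeedforwardBlock's `process` MLP, purely to
# illustrate the changed branch; not the package's own module.
process = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64))
x = torch.randn(2, 16, 64)

out_0_35_1 = x           # old behaviour: `return x`, the MLP never ran
out_0_37_0 = process(x)  # new behaviour: `return self.process(x)`, no residual added
```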
@@ -318,10 +318,11 @@ class TransformerBlock(nn.Module):
         self.post_norm = post_norm
         self.normformer = normformer

-        self.
+        self.drop_path = DropPath(drop_prob=identity_probability, scale_by_keep=True)

         self.layer_norm_1 = nn.LayerNorm(d_model)
         self.layer_norm_2 = nn.LayerNorm(d_model)
+        self.layer_norm_3 = nn.LayerNorm(d_model)

         if position_embedding_type == "relative":
             max_freq = int(max(source_size) / 2) # Suggested by Gemini!
@@ -357,10 +358,10 @@ class TransformerBlock(nn.Module):
             dropout=mlp_dropout,
             linear_module_up=linear_module,
             linear_module_down=linear_module,
-            pre_norm=
+            pre_norm=False, # Handled outside the block
             normformer=normformer,
-            post_norm=
-            residual_path=
+            post_norm=False, # Handled outside the block
+            residual_path=False, # Handled outside the block
         )

     @property
@@ -368,34 +369,70 @@ class TransformerBlock(nn.Module):
         return self.attn._kv_distance

     def forward(self, x):
-
-
+
+        if self.pre_norm:
+            normx = self.layer_norm_1(x)
+            x = x + self.drop_path(self.attn(normx, normx, normx))
+            normx = self.layer_norm_2(x)
+            x = x + self.drop_path(self.ff(normx))
+        elif self.post_norm:
+            x = x + self.drop_path(self.attn(x, x, x))
+            x = self.layer_norm_1(x)
+            x = x + self.drop_path(self.ff(x))
+            x = self.layer_norm_2(x)
         else:
-
+            x = x + self.drop_path(self.attn(x, x, x))
+            x = x + self.drop_path(self.ff(x))

-
-
-        shuffle_indices = torch.randperm(x.size(0), device=x.device)
-        unshuffle_indices = torch.argsort(shuffle_indices)
-        shuffled = x[shuffle_indices, :, :]
-        identity_x = shuffled[:identity_count, :, :]
-        process_x = shuffled[identity_count:, :, :]
+        if self.pre_norm and self.post_norm:
+            x = self.layer_norm_3(x)

-
+        return x

-
-
+        # if not self.training:
+        #     identity_probability = 0.0
+        # else:
+        #     identity_probability = self.identity_probability

-
+        # if random.random() < identity_probability:
+        #     return x
+        # else:
+        #     ...

-
-
+        # # perform the identity operation for some rows in the batch
+        # dist = torch.distributions.Binomial(x.size(0), identity_probability)
+        # identity_count = int(dist.sample().item())

-
+        # shuffle_indices = torch.randperm(x.size(0), device=x.device)
+        # unshuffle_indices = torch.argsort(shuffle_indices)
+        # shuffled = x[shuffle_indices, :, :]
+        # norm_shuffled = self.layer_norm_1(shuffled)
+        # identity_x = shuffled[:identity_count, :, :]
+        # process_x = shuffled[identity_count:, :, :]
+        # residual = process_x

-
+        # if self.pre_norm:
+        #     process_x = norm_shuffled[identity_count:, :, :]

-
+        # process_x = residual + self.attn(process_x, process_x, process_x)
+        # residual = process_x
+
+        # shuffled = torch.cat([identity_x, process_x])
+        # norm_shuffled = self.layer_norm_2(shuffled)
+
+        # if self.pre_norm:
+        #     residual = process_x # residual NOT normed
+        #     process_x = norm_shuffled[identity_count:, :, :]
+
+        # if self.post_norm:
+        #     process_x = norm_shuffled[identity_count:, :, :]
+        #     residual = process_x # residual normed
+
+        # process_x = residual + self.ff(process_x) # handles residual connection
+
+        # x = torch.cat([identity_x, process_x])[unshuffle_indices, :, :].contiguous()
+
+        # return x if not self.post_norm else self.layer_norm_3(x)


 class TransformerEncoder(nn.Module):
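Note: taken together, these hunks move all residual, normalisation and stochastic-depth handling into `TransformerBlock.forward` (the inner `FeedforwardBlock` is now constructed with `pre_norm`, `post_norm` and `residual_path` disabled, and the old batch-shuffling identity trick is left commented out). Stripped of the class, the three arrangements in the new forward pass look roughly like this sketch, where `attn`, `ff`, the layer norms and `drop_path` stand for the block's submodules (the function name is illustrative):

```python
def transformer_block_forward(x, attn, ff, norm1, norm2, norm3, drop_path,
                              pre_norm=True, post_norm=False):
    # Mirrors the forward pass added in 0.37.0 (illustrative sketch only).
    if pre_norm:
        normx = norm1(x)
        x = x + drop_path(attn(normx, normx, normx))  # pre-norm attention branch
        normx = norm2(x)
        x = x + drop_path(ff(normx))                  # pre-norm feed-forward branch
    elif post_norm:
        x = x + drop_path(attn(x, x, x))              # post-norm attention branch
        x = norm1(x)
        x = x + drop_path(ff(x))                      # post-norm feed-forward branch
        x = norm2(x)
    else:
        x = x + drop_path(attn(x, x, x))              # plain residual branches
        x = x + drop_path(ff(x))

    if pre_norm and post_norm:
        x = norm3(x)  # extra final LayerNorm when both flags are set
    return x
```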
broccoli/vit.py
CHANGED
@@ -236,13 +236,7 @@ class ViTEncoder(nn.Module):

         if pooling_type is None:
             pooling_out_channels = cnn_activation_out_channels
-            self.pool = nn.
-                *[
-                    Rearrange(
-                        f"N C {spatial_dim_names} -> N ({spatial_dim_names}) C"
-                    ), # for transformer
-                ]
-            )
+            self.pool = nn.Identity()

         elif pooling_type == "max":
             pooling_out_channels = cnn_activation_out_channels
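Note: with `pooling_type=None`, the pooling stage of `ViTEncoder` is now a plain pass-through: `nn.Identity()` returns its input unchanged, replacing a `Sequential` whose only member was an einops `Rearrange` that flattened the spatial dimensions for the transformer. For example (shapes illustrative):

```python
import torch
from torch import nn

pool = nn.Identity()
feature_map = torch.randn(2, 32, 14, 14)  # (N, C, H, W) -- assumed shapes
assert pool(feature_map) is feature_map   # nn.Identity is a pure pass-through
```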
{broccoli_ml-0.35.1.dist-info → broccoli_ml-0.37.0.dist-info}/RECORD
CHANGED
@@ -8,10 +8,10 @@ broccoli/eigenpatches.py,sha256=J6n2usN1oQuHEHYiBNyYpn_a9eQcHjOBiIlvSei520Y,2413
 broccoli/linear.py,sha256=8Y9vD85ZEgNZsIQgO3uRQ3lOQR-JjwvabY8liCrfNCk,4831
 broccoli/rope.py,sha256=hw7kBPNR9GQXj4GxyIAffsGKPfcTPOFh8Bc7oEHtaZY,12108
 broccoli/tensor.py,sha256=um8mrxkYbvNDo-QvHlmJm8Aw6qcngOlUZPoAk_PMReA,4480
-broccoli/transformer.py,sha256=
+broccoli/transformer.py,sha256=hhembQe9tEVNZMRtgbdGEsHWaBXSl95h_RpDhFde030,18171
 broccoli/utils.py,sha256=htq_hOsdhUhL0nJi9WkKiEYOjEoWqFpK5X49PtgTf-0,299
-broccoli/vit.py,sha256=
-broccoli_ml-0.
-broccoli_ml-0.
-broccoli_ml-0.
-broccoli_ml-0.
+broccoli/vit.py,sha256=05xqIw9xvE5easXcp4wIA1jQ0xUyRIq6h0ZDtbitXi4,17184
+broccoli_ml-0.37.0.dist-info/LICENSE,sha256=0BAzJE5BqQ7Iixp_AFdB2W1uO-HCRX-Qfun8PHt6yVM,1073
+broccoli_ml-0.37.0.dist-info/METADATA,sha256=jUDSeLfYphtaOGvJn64v3deZw1nmKn4VYc7PO69BSPk,1257
+broccoli_ml-0.37.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+broccoli_ml-0.37.0.dist-info/RECORD,,
{broccoli_ml-0.35.1.dist-info → broccoli_ml-0.37.0.dist-info}/LICENSE
File without changes
{broccoli_ml-0.35.1.dist-info → broccoli_ml-0.37.0.dist-info}/WHEEL
File without changes