broccoli-ml 0.35.1__tar.gz → 0.37.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: broccoli-ml
-Version: 0.35.1
+Version: 0.37.0
 Summary: Some useful Pytorch models, circa 2025
 License: MIT
 Author: Nicholas Bailey
@@ -13,6 +13,45 @@ from .rope import RotaryEmbedding, apply_rotary_emb
 from .linear import AnchoredLinear, SpectralNormLinear
 
 
+def drop_path(
+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+):
+    """
+    From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+    Copyright 2019 Ross Wightman
+    See documentation and licence there.
+    """
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+
+class DropPath(nn.Module):
+    """
+    From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+    Copyright 2019 Ross Wightman
+    See documentation and licence there.
+    """
+
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
+
+
 class MHAttention(nn.Module):
     """
     Multi-head self-attention using einops and optionally a custom linear layer.
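
The drop_path/DropPath code added above is the standard stochastic-depth helper vendored from timm: during training it zeroes the whole residual branch for a random subset of samples and rescales the surviving samples by 1/keep_prob, and in eval mode (or with drop_prob=0.0) it is a no-op. A minimal usage sketch, assuming the DropPath class above is in scope (the linear layer and tensor shapes here are illustrative only, not from the package):

import torch
import torch.nn as nn
# DropPath as defined in the diff above must be importable; the module path is not shown here.

branch = nn.Linear(64, 64)        # stands in for an attention or feedforward branch
drop = DropPath(drop_prob=0.1)    # roughly 10% of samples skip the branch during training

x = torch.randn(8, 16, 64)        # (batch, sequence, d_model)

drop.train()
y_train = x + drop(branch(x))     # per sample: branch output kept (scaled by 1/0.9) or zeroed

drop.eval()
y_eval = x + drop(branch(x))      # DropPath is the identity at eval time
assert torch.allclose(y_eval, x + branch(x))
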
@@ -21,45 +60,6 @@ class MHAttention(nn.Module):
     are the same shape.
 
     Assumes bias=False and batch_first=True, as God intended.
-
-    Optionally adds various bells and whistles suggested in the
-    literature, including:
-
-    Noam Shazeer's scaled attention per "Attention is All You Need"
-    (https://arxiv.org/abs/1706.03762).
-
-    Max subtract softmax as discussed in "Attention As An RNN"
-    (https://arxiv.org/abs/2405.13956)
-
-    Log-length scaled softmax per "Overcoming a Theoretical Limitation of
-    Self-Attention" (https://arxiv.org/abs/2202.12172).
-
-    Quiet softmax per
-    https://www.evanmiller.org/attention-is-off-by-one.html
-
-    Args:
-        d_model: ...
-        n_heads: ...
-        dropout: ...
-        causal: should a causal mask be applied to the logits before attention
-            is applied? This is standard when using self-attention. Cannot be
-            True if inputs won't be square (e.g. if sequence length for
-            encoder and decoder are different)
-        sequence_length: ...
-        share_kv: ...
-        linear_module: ...
-        max_subtract: if True, the maximum logit value is subtracted from all
-            logits before performing the softmax operation to create a more
-            numerically stable softmax. This is discussed in "Attention As An
-            RNN" (https://arxiv.org/abs/2405.13956).
-        d_model_scale: ...
-        log_length_scale: if True, multiplies logits by the log length of
-            the decoder sequence before performing the softmax operation, as
-            proposed in "Overcoming a Theoretical Limitation of Self-Attention"
-            (https://arxiv.org/abs/2202.12172).
-        quiet: if True, adds 1 to the denominator of the softmax operation,
-            allowing some tokens to attend to no other tokens as described in
-            https://www.evanmiller.org/attention-is-off-by-one.html.
     """
 
     def __init__(
@@ -280,7 +280,7 @@ class FeedforwardBlock(nn.Module):
         elif self.residual_path:
             return x + self.process(x)
         else:
-            return x
+            return self.process(x)
 
 
 class TransformerBlock(nn.Module):
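
The FeedforwardBlock change above is a behaviour fix: with residual_path disabled, forward previously returned its input untouched and silently skipped the feedforward computation; it now returns the processed output. A minimal sketch of that final dispatch, using a hypothetical stand-in class (TinyFFBlock is not part of the package; only the two branches shown in the diff are taken from the source):

import torch
import torch.nn as nn

class TinyFFBlock(nn.Module):
    """Hypothetical stand-in for FeedforwardBlock's last two branches."""

    def __init__(self, residual_path: bool):
        super().__init__()
        self.residual_path = residual_path
        self.process = nn.Linear(4, 4)  # stands in for the up/down projection stack

    def forward(self, x):
        if self.residual_path:
            return x + self.process(x)
        # Fixed branch from the diff: previously `return x`, now the processed output.
        return self.process(x)

x = torch.randn(2, 4)
y = TinyFFBlock(residual_path=False)(x)
assert not torch.equal(y, x)  # the block no longer degenerates to the identity
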
@@ -318,10 +318,11 @@ class TransformerBlock(nn.Module):
         self.post_norm = post_norm
         self.normformer = normformer
 
-        self.identity_probability = identity_probability
+        self.drop_path = DropPath(drop_prob=identity_probability, scale_by_keep=True)
 
         self.layer_norm_1 = nn.LayerNorm(d_model)
         self.layer_norm_2 = nn.LayerNorm(d_model)
+        self.layer_norm_3 = nn.LayerNorm(d_model)
 
         if position_embedding_type == "relative":
             max_freq = int(max(source_size) / 2)  # Suggested by Gemini!
@@ -357,10 +358,10 @@ class TransformerBlock(nn.Module):
             dropout=mlp_dropout,
             linear_module_up=linear_module,
             linear_module_down=linear_module,
-            pre_norm=pre_norm,
+            pre_norm=False,  # Handled outside the block
             normformer=normformer,
-            post_norm=post_norm,
-            residual_path=True,
+            post_norm=False,  # Handled outside the block
+            residual_path=False,  # Handled outside the block
         )
 
     @property
@@ -368,34 +369,70 @@ class TransformerBlock(nn.Module):
         return self.attn._kv_distance
 
     def forward(self, x):
-        if not self.training:
-            identity_probability = 0.0
+
+        if self.pre_norm:
+            normx = self.layer_norm_1(x)
+            x = x + self.drop_path(self.attn(normx, normx, normx))
+            normx = self.layer_norm_2(x)
+            x = x + self.drop_path(self.ff(normx))
+        elif self.post_norm:
+            x = x + self.drop_path(self.attn(x, x, x))
+            x = self.layer_norm_1(x)
+            x = x + self.drop_path(self.ff(x))
+            x = self.layer_norm_2(x)
         else:
-            identity_probability = self.identity_probability
+            x = x + self.drop_path(self.attn(x, x, x))
+            x = x + self.drop_path(self.ff(x))
 
-        # perform the identity operation for some rows in the batch
-        identity_count = random.binomial(n=x.size(0), p=identity_probability)
-        shuffle_indices = torch.randperm(x.size(0), device=x.device)
-        unshuffle_indices = torch.argsort(shuffle_indices)
-        shuffled = x[shuffle_indices, :, :]
-        identity_x = shuffled[:identity_count, :, :]
-        process_x = shuffled[identity_count:, :, :]
+        if self.pre_norm and self.post_norm:
+            x = self.layer_norm_3(x)
 
-        residual_x = process_x
+        return x
 
-        if self.pre_norm:
-            process_x = self.layer_norm_1(process_x)
+        # if not self.training:
+        #     identity_probability = 0.0
+        # else:
+        #     identity_probability = self.identity_probability
 
-        process_x = residual_x + self.attn(process_x, process_x, process_x)
+        # if random.random() < identity_probability:
+        #     return x
+        # else:
+        #     ...
 
-        if self.post_norm:
-            process_x = self.layer_norm_2(process_x)
+        # # perform the identity operation for some rows in the batch
+        # dist = torch.distributions.Binomial(x.size(0), identity_probability)
+        # identity_count = int(dist.sample().item())
 
-        process_x = self.ff(process_x)
+        # shuffle_indices = torch.randperm(x.size(0), device=x.device)
+        # unshuffle_indices = torch.argsort(shuffle_indices)
+        # shuffled = x[shuffle_indices, :, :]
+        # norm_shuffled = self.layer_norm_1(shuffled)
+        # identity_x = shuffled[:identity_count, :, :]
+        # process_x = shuffled[identity_count:, :, :]
+        # residual = process_x
 
-        x = torch.cat([identity_x, process_x])[unshuffle_indices, :, :].contiguous()
+        # if self.pre_norm:
+        #     process_x = norm_shuffled[identity_count:, :, :]
 
-        return x
+        # process_x = residual + self.attn(process_x, process_x, process_x)
+        # residual = process_x
+
+        # shuffled = torch.cat([identity_x, process_x])
+        # norm_shuffled = self.layer_norm_2(shuffled)
+
+        # if self.pre_norm:
+        #     residual = process_x  # residual NOT normed
+        #     process_x = norm_shuffled[identity_count:, :, :]
+
+        # if self.post_norm:
+        #     process_x = norm_shuffled[identity_count:, :, :]
+        #     residual = process_x  # residual normed
+
+        # process_x = residual + self.ff(process_x)  # handles residual connection
+
+        # x = torch.cat([identity_x, process_x])[unshuffle_indices, :, :].contiguous()
+
+        # return x if not self.post_norm else self.layer_norm_3(x)
 
 
 class TransformerEncoder(nn.Module):
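
One property of the rewritten forward worth noting: because drop_path rescales the surviving samples by 1/keep_prob, each residual branch keeps the same expected value in training as in eval, and whole samples, not individual features, are dropped. A quick numerical sketch of that behaviour, assuming only the drop_path function added earlier in this diff is in scope:

import torch
# drop_path as added in the diff above must be in scope.

torch.manual_seed(0)

x = torch.ones(10_000, 4)                        # many "samples" of a constant branch output
y = drop_path(x, drop_prob=0.2, training=True)   # ~20% of rows zeroed, the rest scaled by 1/0.8

dropped = (y == 0).all(dim=1).float().mean()
print(f"fraction of rows dropped: {dropped.item():.3f}")  # close to 0.2
print(f"overall mean preserved:   {y.mean().item():.3f}")  # close to 1.0 thanks to the rescaling

# With training=False the input passes through untouched.
assert torch.equal(drop_path(x, drop_prob=0.2, training=False), x)
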
@@ -236,13 +236,7 @@ class ViTEncoder(nn.Module):
 
         if pooling_type is None:
             pooling_out_channels = cnn_activation_out_channels
-            self.pool = nn.Sequential(
-                *[
-                    Rearrange(
-                        f"N C {spatial_dim_names} -> N ({spatial_dim_names}) C"
-                    ),  # for transformer
-                ]
-            )
+            self.pool = nn.Identity()
 
         elif pooling_type == "max":
             pooling_out_channels = cnn_activation_out_channels
@@ -1,6 +1,6 @@
 [project]
 name = "broccoli-ml"
-version = "0.35.1"
+version = "0.37.0"
 description = "Some useful Pytorch models, circa 2025"
 authors = [
     {name = "Nicholas Bailey"}
File without changes
File without changes