broccoli-ml 0.36.0-py3-none-any.whl → 0.37.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- broccoli/transformer.py +98 -24
- {broccoli_ml-0.36.0.dist-info → broccoli_ml-0.37.0.dist-info}/METADATA +1 -1
- {broccoli_ml-0.36.0.dist-info → broccoli_ml-0.37.0.dist-info}/RECORD +5 -5
- {broccoli_ml-0.36.0.dist-info → broccoli_ml-0.37.0.dist-info}/LICENSE +0 -0
- {broccoli_ml-0.36.0.dist-info → broccoli_ml-0.37.0.dist-info}/WHEEL +0 -0
broccoli/transformer.py
CHANGED
@@ -13,6 +13,45 @@ from .rope import RotaryEmbedding, apply_rotary_emb
 from .linear import AnchoredLinear, SpectralNormLinear
 
 
+def drop_path(
+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+):
+    """
+    From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+    Copyright 2019 Ross Wightman
+    See documentation and licence there.
+    """
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+
+class DropPath(nn.Module):
+    """
+    From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+    Copyright 2019 Ross Wightman
+    See documentation and licence there.
+    """
+
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
+
+
 class MHAttention(nn.Module):
     """
     Multi-head self-attention using einops and optionally a custom linear layer.
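For context on the hunk above: `drop_path` implements stochastic depth, zeroing an entire residual branch for a random subset of samples during training and rescaling the survivors by 1 / keep_prob so the expected activation is unchanged. A minimal usage sketch (assumes torch is installed and the 0.37.0 module layout; shapes and values are illustrative):

import torch

from broccoli.transformer import DropPath  # added in this release

torch.manual_seed(0)

dp = DropPath(drop_prob=0.25, scale_by_keep=True)
dp.train()  # drop_path is a no-op in eval mode or when drop_prob == 0.0

x = torch.ones(8, 16, 64)  # (batch, tokens, d_model)
out = dp(x)

# Each sample is either zeroed entirely or scaled by 1 / keep_prob = 4/3,
# so E[out] == x; out[:, 0, 0] is a mix of 0.0000 and 1.3333 values.
print(out[:, 0, 0])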
@@ -279,10 +318,11 @@ class TransformerBlock(nn.Module):
         self.post_norm = post_norm
         self.normformer = normformer
 
-        self.
+        self.drop_path = DropPath(drop_prob=identity_probability, scale_by_keep=True)
 
         self.layer_norm_1 = nn.LayerNorm(d_model)
         self.layer_norm_2 = nn.LayerNorm(d_model)
+        self.layer_norm_3 = nn.LayerNorm(d_model)
 
         if position_embedding_type == "relative":
             max_freq = int(max(source_size) / 2)  # Suggested by Gemini!
@@ -318,10 +358,10 @@ class TransformerBlock(nn.Module):
             dropout=mlp_dropout,
             linear_module_up=linear_module,
             linear_module_down=linear_module,
-            pre_norm=
+            pre_norm=False,  # Handled outside the block
             normformer=normformer,
-            post_norm=
-            residual_path=
+            post_norm=False,  # Handled outside the block
+            residual_path=False,  # Handled outside the block
         )
 
     @property
@@ -329,36 +369,70 @@ class TransformerBlock(nn.Module):
         return self.attn._kv_distance
 
     def forward(self, x):
-
-
+
+        if self.pre_norm:
+            normx = self.layer_norm_1(x)
+            x = x + self.drop_path(self.attn(normx, normx, normx))
+            normx = self.layer_norm_2(x)
+            x = x + self.drop_path(self.ff(normx))
+        elif self.post_norm:
+            x = x + self.drop_path(self.attn(x, x, x))
+            x = self.layer_norm_1(x)
+            x = x + self.drop_path(self.ff(x))
+            x = self.layer_norm_2(x)
         else:
-
+            x = x + self.drop_path(self.attn(x, x, x))
+            x = x + self.drop_path(self.ff(x))
 
-
-
-        identity_count = int(dist.sample().item())
+        if self.pre_norm and self.post_norm:
+            x = self.layer_norm_3(x)
 
-
-        unshuffle_indices = torch.argsort(shuffle_indices)
-        shuffled = x[shuffle_indices, :, :]
-        identity_x = shuffled[:identity_count, :, :]
-        process_x = shuffled[identity_count:, :, :]
+        return x
 
-
+        # if not self.training:
+        #     identity_probability = 0.0
+        # else:
+        #     identity_probability = self.identity_probability
 
-
-
+        # if random.random() < identity_probability:
+        #     return x
+        # else:
+        #     ...
 
-
+        # # perform the identity operation for some rows in the batch
+        # dist = torch.distributions.Binomial(x.size(0), identity_probability)
+        # identity_count = int(dist.sample().item())
 
-
-
+        # shuffle_indices = torch.randperm(x.size(0), device=x.device)
+        # unshuffle_indices = torch.argsort(shuffle_indices)
+        # shuffled = x[shuffle_indices, :, :]
+        # norm_shuffled = self.layer_norm_1(shuffled)
+        # identity_x = shuffled[:identity_count, :, :]
+        # process_x = shuffled[identity_count:, :, :]
+        # residual = process_x
 
-
+        # if self.pre_norm:
+        #     process_x = norm_shuffled[identity_count:, :, :]
 
-
+        # process_x = residual + self.attn(process_x, process_x, process_x)
+        # residual = process_x
 
-
+        # shuffled = torch.cat([identity_x, process_x])
+        # norm_shuffled = self.layer_norm_2(shuffled)
+
+        # if self.pre_norm:
+        #     residual = process_x  # residual NOT normed
+        #     process_x = norm_shuffled[identity_count:, :, :]
+
+        # if self.post_norm:
+        #     process_x = norm_shuffled[identity_count:, :, :]
+        #     residual = process_x  # residual normed
+
+        # process_x = residual + self.ff(process_x)  # handles residual connection
+
+        # x = torch.cat([identity_x, process_x])[unshuffle_indices, :, :].contiguous()
+
+        # return x if not self.post_norm else self.layer_norm_3(x)
 
 
 class TransformerEncoder(nn.Module):
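The rewritten forward replaces the shuffle-and-split Binomial scheme (kept above as comments) with DropPath around each residual branch, so identity_probability now acts per sample and per branch: a sample whose branches are all dropped passes through the block unchanged. A self-contained sketch of the pre-norm arm, with plain torch modules standing in for broccoli's attention and feed-forward (the class name and submodule choices here are illustrative, not the package's API):

import torch
from torch import nn

from broccoli.transformer import DropPath  # added in this release


class PreNormSketch(nn.Module):
    def __init__(self, d_model: int = 64, identity_probability: float = 0.1):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(d_model)
        self.layer_norm_2 = nn.LayerNorm(d_model)
        self.attn = nn.MultiheadAttention(d_model, num_heads=4, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(d_model, 4 * d_model), nn.GELU(), nn.Linear(4 * d_model, d_model)
        )
        self.drop_path = DropPath(drop_prob=identity_probability, scale_by_keep=True)

    def forward(self, x):
        # Same residual pattern as the new TransformerBlock.forward pre-norm arm:
        # norm -> branch -> stochastic depth -> add.
        normx = self.layer_norm_1(x)
        x = x + self.drop_path(self.attn(normx, normx, normx, need_weights=False)[0])
        normx = self.layer_norm_2(x)
        return x + self.drop_path(self.ff(normx))


out = PreNormSketch()(torch.randn(8, 16, 64))  # (batch, tokens, d_model)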
{broccoli_ml-0.36.0.dist-info → broccoli_ml-0.37.0.dist-info}/RECORD
CHANGED
@@ -8,10 +8,10 @@ broccoli/eigenpatches.py,sha256=J6n2usN1oQuHEHYiBNyYpn_a9eQcHjOBiIlvSei520Y,2413
 broccoli/linear.py,sha256=8Y9vD85ZEgNZsIQgO3uRQ3lOQR-JjwvabY8liCrfNCk,4831
 broccoli/rope.py,sha256=hw7kBPNR9GQXj4GxyIAffsGKPfcTPOFh8Bc7oEHtaZY,12108
 broccoli/tensor.py,sha256=um8mrxkYbvNDo-QvHlmJm8Aw6qcngOlUZPoAk_PMReA,4480
-broccoli/transformer.py,sha256=
+broccoli/transformer.py,sha256=hhembQe9tEVNZMRtgbdGEsHWaBXSl95h_RpDhFde030,18171
 broccoli/utils.py,sha256=htq_hOsdhUhL0nJi9WkKiEYOjEoWqFpK5X49PtgTf-0,299
 broccoli/vit.py,sha256=05xqIw9xvE5easXcp4wIA1jQ0xUyRIq6h0ZDtbitXi4,17184
-broccoli_ml-0.
-broccoli_ml-0.
-broccoli_ml-0.
-broccoli_ml-0.
+broccoli_ml-0.37.0.dist-info/LICENSE,sha256=0BAzJE5BqQ7Iixp_AFdB2W1uO-HCRX-Qfun8PHt6yVM,1073
+broccoli_ml-0.37.0.dist-info/METADATA,sha256=jUDSeLfYphtaOGvJn64v3deZw1nmKn4VYc7PO69BSPk,1257
+broccoli_ml-0.37.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+broccoli_ml-0.37.0.dist-info/RECORD,,
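Each RECORD row has the form path,sha256=<digest>,size, where the digest is the file's SHA-256 hash, urlsafe-base64 encoded with the trailing '=' padding stripped (per the wheel spec). A sketch for recomputing a digest, e.g. to check the new transformer.py entry against an extracted wheel (the path is an assumption about where you unpacked it):

import base64
import hashlib


def record_hash(path: str) -> str:
    # SHA-256 digest, urlsafe-base64 encoded with '=' padding stripped,
    # matching the sha256= fields in RECORD.
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# Expected for the 0.37.0 file: hhembQe9tEVNZMRtgbdGEsHWaBXSl95h_RpDhFde030
print(record_hash("broccoli/transformer.py"))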
{broccoli_ml-0.36.0.dist-info → broccoli_ml-0.37.0.dist-info}/LICENSE
File without changes
{broccoli_ml-0.36.0.dist-info → broccoli_ml-0.37.0.dist-info}/WHEEL
File without changes