broccoli-ml 15.0.0__tar.gz → 15.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: broccoli-ml
-Version: 15.0.0
+Version: 15.1.0
 Summary: Some useful Pytorch models, circa 2025
 License: MIT
 Author: Nicholas Bailey
@@ -122,6 +122,9 @@ class MHAttention(nn.Module):
 
         self.head_dim = self.embed_dim // self.n_heads
 
+        self.query_norm = nn.RMSNorm(self.head_dim)
+        self.key_norm = nn.RMSNorm(self.head_dim)
+
         if self.scaling == "sqrtd":
             self.scaling_factor = 1 / math.sqrt(self.head_dim)
         elif self.scaling == "d":
@@ -225,8 +228,9 @@ class MHAttention(nn.Module):
 
         freqs = self.rotary_embedding.get_axial_freqs(*self.source_size)
 
-        q_img = apply_rotary_emb(freqs, q_img)
-        k_img = apply_rotary_emb(freqs, k_img)
+        # norm Qs/Ks to protect axial rope, like https://arxiv.org/abs/2302.05442
+        q_img = apply_rotary_emb(freqs, self.query_norm(q_img))
+        k_img = apply_rotary_emb(freqs, self.key_norm(k_img))
 
         q_img = rearrange(
             q_img,
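The two hunks above work together: the first registers per-head RMSNorms, and the second applies them to queries and keys before the rotary embedding, following the query/key normalisation used in ViT-22B (https://arxiv.org/abs/2302.05442). A minimal sketch of the pattern, with assumed (batch, heads, tokens, head_dim) shapes rather than this package's exact tensor layout:

import torch
from torch import nn

# Assumed shapes for illustration; not necessarily this package's layout.
batch, heads, tokens, head_dim = 2, 8, 196, 64
q = torch.randn(batch, heads, tokens, head_dim)
k = torch.randn(batch, heads, tokens, head_dim)

# One RMSNorm over the head_dim axis each for queries and keys,
# mirroring nn.RMSNorm(self.head_dim) in the diff above (PyTorch >= 2.4).
query_norm = nn.RMSNorm(head_dim)
key_norm = nn.RMSNorm(head_dim)

# QK-norm: normalise before the rotary embedding is applied, so the rotation
# acts on unit-scale vectors and attention logits stay bounded.
q, k = query_norm(q), key_norm(k)

RMSNorm's elementwise gain is initialised to ones, so at initialisation this is a pure rescaling of each head's query and key vectors.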
@@ -416,7 +420,7 @@ class FeedforwardBlock(nn.Module):
             self.activation,
             self.inner_dropout,
             (
-                nn.LayerNorm(int(ratio * output_features))
+                nn.RMSNorm(int(ratio * output_features))
                 if normformer
                 else nn.Identity()
             ),
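Every LayerNorm→RMSNorm swap in this release changes behaviour slightly: RMSNorm drops LayerNorm's mean-centring and bias term, keeping only the root-mean-square rescaling and a learned per-channel gain, which also makes it a little cheaper. A standalone comparison (plain PyTorch, independent of this package):

import torch
from torch import nn

x = torch.randn(4, 512)

ln = nn.LayerNorm(512)  # (x - mean) / std, then affine weight and bias
rms = nn.RMSNorm(512)   # x / rms(x), then weight only: no centring, no bias

# A hand-rolled RMSNorm agrees with nn.RMSNorm at its default eps:
manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + torch.finfo(x.dtype).eps)
print(torch.allclose(rms(x), manual * rms.weight))  # True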
@@ -474,13 +478,7 @@ class FeedforwardBlock(nn.Module):
 
 
 class EncoderBlock(nn.Module):
-    """
-    Performs LayerNorms first (as in PyTorch Transformers when norm_first=True),
-    which is also what is seen in e.g.
-    https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
-    and is recommended by https://arxiv.org/abs/2002.04745
-
-    """
+    """ """
 
     def __init__(
         self,
@@ -534,16 +532,16 @@ class EncoderBlock(nn.Module):
         self.drop_path = DropPath(drop_prob=identity_probability, scale_by_keep=True)
 
         if self.pre_norm:
-            self.pre_attention_norm = nn.LayerNorm(d_model)
-            self.pre_mlp_norm = nn.LayerNorm(d_model)
+            self.pre_attention_norm = nn.RMSNorm(d_model)
+            self.pre_mlp_norm = nn.RMSNorm(d_model)
 
         if normformer:
-            self.normformer_norm = nn.LayerNorm(d_model)
+            self.normformer_norm = nn.RMSNorm(d_model)
 
         if self.post_norm:
-            self.input_norm = nn.LayerNorm(d_model)
-            self.post_attention_norm = nn.LayerNorm(d_model)
-            self.post_mlp_norm = nn.LayerNorm(d_model)
+            self.input_norm = nn.RMSNorm(d_model)
+            self.post_attention_norm = nn.RMSNorm(d_model)
+            self.post_mlp_norm = nn.RMSNorm(d_model)
 
         if relative_position_embedding:
             max_freq = int(max(source_size) / 2) # Suggested by Gemini!
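EncoderBlock keeps its pre-norm / post-norm / NormFormer switches; only the norm type changes. For orientation, here is where those norms typically sit in a pre-norm block with the NormFormer extra norm (a generic sketch with hypothetical names, not this package's exact forward()):

# Generic pre-norm residual step with an optional NormFormer-style norm
# on the attention output (https://arxiv.org/abs/2110.09456).
def encoder_step(x, attn, mlp, pre_attention_norm, normformer_norm, pre_mlp_norm):
    h = attn(pre_attention_norm(x))  # pre-norm: normalise the residual input
    h = normformer_norm(h)           # NormFormer: extra norm after attention
    x = x + h
    x = x + mlp(pre_mlp_norm(x))     # same pre-norm pattern around the MLP
    return x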
@@ -403,7 +403,7 @@ class ViTEncoder(nn.Module):
                 checkpoint=transformer_checkpoint_ff,
                 beta=self.beta,
             )
-            self.layer_norm = nn.LayerNorm(transformer_embedding_size)
+            self.layer_norm = nn.RMSNorm(transformer_embedding_size)
         else:
             self.initial_ff = None
 
@@ -417,7 +417,7 @@ class ViTEncoder(nn.Module):
                 f"N C {spatial_dim_names} -> N ({spatial_dim_names}) C"
             ),
             self.pooling_channels_padding,
-            nn.LayerNorm(transformer_embedding_size),
+            nn.RMSNorm(transformer_embedding_size),
         ]
     )
 
@@ -522,7 +522,7 @@ class ViT(nn.Module):
 
         # Set alpha and beta according to Microsoft's DeepNorm
         self.alpha = (2 * transformer_layers) ** 0.25
-        self.beta = (8 * transformer_layers) ** 0.25
+        self.beta = (8 * transformer_layers) ** -0.25
 
         self.encoder = ViTEncoder(
             input_size=input_size,
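The last hunk fixes a sign error in the DeepNorm initialisation gain. DeepNet (https://arxiv.org/abs/2203.00555) prescribes, for an N-layer encoder-only model, a residual scale alpha = (2N)^(1/4) and an initialisation gain beta = (8N)^(-1/4); with the old positive exponent, beta grew with depth instead of shrinking. Concretely, for N = 12:

N = 12  # transformer_layers

alpha = (2 * N) ** 0.25      # ~= 2.21: scales residual branches up with depth
beta_old = (8 * N) ** 0.25   # ~= 3.13: the bug, a gain growing with depth
beta_new = (8 * N) ** -0.25  # ~= 0.32: DeepNorm's shrinking init gain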
@@ -1,6 +1,6 @@
 [project]
 name = "broccoli-ml"
-version = "15.0.0"
+version = "15.1.0"
 description = "Some useful Pytorch models, circa 2025"
 authors = [
     {name = "Nicholas Bailey"}
File without changes