broccoli-ml 15.4.2__tar.gz → 15.6.0__tar.gz

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: broccoli-ml
3
- Version: 15.4.2
3
+ Version: 15.6.0
4
4
  Summary: Some useful Pytorch models, circa 2025
5
5
  License: MIT
6
6
  Author: Nicholas Bailey
@@ -122,9 +122,6 @@ class MHAttention(nn.Module):
122
122
 
123
123
  self.head_dim = self.embed_dim // self.n_heads
124
124
 
125
- self.query_norm = nn.RMSNorm(self.head_dim)
126
- self.key_norm = nn.RMSNorm(self.head_dim)
127
-
128
125
  if self.scaling == "sqrtd":
129
126
  self.scaling_factor = 1 / math.sqrt(self.head_dim)
130
127
  elif self.scaling == "d":
@@ -229,8 +226,8 @@ class MHAttention(nn.Module):
229
226
  freqs = self.rotary_embedding.get_axial_freqs(*self.source_size)
230
227
 
231
228
  # norm Qs/Ks to protect axial rope, like https://arxiv.org/abs/2302.05442
232
- q_img = apply_rotary_emb(freqs, self.query_norm(q_img))
233
- k_img = apply_rotary_emb(freqs, self.key_norm(k_img))
229
+ q_img = apply_rotary_emb(freqs, q_img)
230
+ k_img = apply_rotary_emb(freqs, k_img)
234
231
 
235
232
  q_img = rearrange(
236
233
  q_img,
@@ -354,17 +351,9 @@ class MHAttention(nn.Module):
354
351
  self.q_proj.reset_parameters()
355
352
  self.k_proj.reset_parameters()
356
353
  self.v_proj.reset_parameters()
357
- scale_parameters(
358
- self.v_proj,
359
- math.sqrt(6)
360
- * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
361
- )
354
+ scale_parameters(self.v_proj, math.sqrt(6) * self.beta)
362
355
  self.out_proj.reset_parameters()
363
- scale_parameters(
364
- self.out_proj,
365
- math.sqrt(6)
366
- * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
367
- )
356
+ scale_parameters(self.out_proj, math.sqrt(6) * self.beta)
368
357
 
369
358
  if self.talking_heads:
370
359
  # Initialize close to identity
@@ -481,16 +470,8 @@ class FeedforwardBlock(nn.Module):
481
470
  if hasattr(module, "reset_parameters"):
482
471
  module.reset_parameters()
483
472
 
484
- scale_parameters(
485
- self.linear_in,
486
- math.sqrt(6)
487
- * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
488
- )
489
- scale_parameters(
490
- self.linear_out,
491
- math.sqrt(6)
492
- * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
493
- )
473
+ scale_parameters(self.linear_in, math.sqrt(6) * self.beta)
474
+ scale_parameters(self.linear_out, math.sqrt(6) * self.beta)
494
475
 
495
476
 
496
477
  class EncoderBlock(nn.Module):
@@ -521,7 +521,7 @@ class ViT(nn.Module):
521
521
  }[transformer_activation]
522
522
 
523
523
  self.alpha = (2 * transformer_layers) ** 0.25
524
- self.beta = 1.0 # beta is not needed if we norm the Q and K vectors in MSA!
524
+ self.beta = (8 * transformer_layers) ** -0.25
525
525
 
526
526
  self.encoder = ViTEncoder(
527
527
  input_size=input_size,
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "broccoli-ml"
3
- version = "15.4.2"
3
+ version = "15.6.0"
4
4
  description = "Some useful Pytorch models, circa 2025"
5
5
  authors = [
6
6
  {name = "Nicholas Bailey"}
File without changes
File without changes