broccoli-ml 15.2.0__tar.gz → 15.4.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: broccoli-ml
3
- Version: 15.2.0
3
+ Version: 15.4.0
4
4
  Summary: Some useful Pytorch models, circa 2025
5
5
  License: MIT
6
6
  Author: Nicholas Bailey
@@ -354,9 +354,17 @@ class MHAttention(nn.Module):
354
354
  self.q_proj.reset_parameters()
355
355
  self.k_proj.reset_parameters()
356
356
  self.v_proj.reset_parameters()
357
- scale_parameters(self.v_proj, self.beta) # per Microsoft DeepNet
357
+ scale_parameters(
358
+ self.v_proj,
359
+ math.sqrt(6)
360
+ * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
361
+ )
358
362
  self.out_proj.reset_parameters()
359
- scale_parameters(self.out_proj, self.beta) # per Microsoft DeepNet
363
+ scale_parameters(
364
+ self.out_proj,
365
+ math.sqrt(6)
366
+ * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
367
+ )
360
368
 
361
369
  if self.talking_heads:
362
370
  # Initialize close to identity
@@ -473,8 +481,16 @@ class FeedforwardBlock(nn.Module):
473
481
  if hasattr(module, "reset_parameters"):
474
482
  module.reset_parameters()
475
483
 
476
- scale_parameters(self.linear_in, self.beta) # per Microsoft DeepNet
477
- scale_parameters(self.linear_out, self.beta)
484
+ scale_parameters(
485
+ self.linear_in,
486
+ math.sqrt(6)
487
+ * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
488
+ )
489
+ scale_parameters(
490
+ self.linear_out,
491
+ math.sqrt(6)
492
+ * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
493
+ )
478
494
 
479
495
 
480
496
  class EncoderBlock(nn.Module):
@@ -520,10 +520,10 @@ class ViT(nn.Module):
520
520
  "SwiGLU": SwiGLU,
521
521
  }[transformer_activation]
522
522
 
523
- # Set alpha and beta according to Microsoft's DeepNorm
524
- self.alpha = (2 * transformer_layers) ** 0.25
525
- # beta is only needed for very deep models
526
- self.beta = 1 if transformer_layers < 50 else (8 * transformer_layers) ** -0.25
523
+ # Set alpha according to Microsoft's DeepNorm if layers > 50
524
+ if transformer_layers > 50:
525
+ self.alpha = (2 * transformer_layers) ** 0.25
526
+ # beta is not needed as we norm the Q and K vectors in MSA!
527
527
 
528
528
  self.encoder = ViTEncoder(
529
529
  input_size=input_size,
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "broccoli-ml"
3
- version = "15.2.0"
3
+ version = "15.4.0"
4
4
  description = "Some useful Pytorch models, circa 2025"
5
5
  authors = [
6
6
  {name = "Nicholas Bailey"}
File without changes
File without changes