broccoli-ml 15.2.0__tar.gz → 15.3.0__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: broccoli-ml
3
- Version: 15.2.0
3
+ Version: 15.3.0
4
4
  Summary: Some useful Pytorch models, circa 2025
5
5
  License: MIT
6
6
  Author: Nicholas Bailey
@@ -354,9 +354,17 @@ class MHAttention(nn.Module):
354
354
  self.q_proj.reset_parameters()
355
355
  self.k_proj.reset_parameters()
356
356
  self.v_proj.reset_parameters()
357
- scale_parameters(self.v_proj, self.beta) # per Microsoft DeepNet
357
+ scale_parameters(
358
+ self.v_proj,
359
+ math.sqrt(6)
360
+ * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
361
+ )
358
362
  self.out_proj.reset_parameters()
359
- scale_parameters(self.out_proj, self.beta) # per Microsoft DeepNet
363
+ scale_parameters(
364
+ self.out_proj,
365
+ math.sqrt(6)
366
+ * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
367
+ )
360
368
 
361
369
  if self.talking_heads:
362
370
  # Initialize close to identity
@@ -473,8 +481,16 @@ class FeedforwardBlock(nn.Module):
473
481
  if hasattr(module, "reset_parameters"):
474
482
  module.reset_parameters()
475
483
 
476
- scale_parameters(self.linear_in, self.beta) # per Microsoft DeepNet
477
- scale_parameters(self.linear_out, self.beta)
484
+ scale_parameters(
485
+ self.linear_in,
486
+ math.sqrt(6)
487
+ * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
488
+ )
489
+ scale_parameters(
490
+ self.linear_out,
491
+ math.sqrt(6)
492
+ * self.beta, # sqrt(6) to compensate for PyTorch tiny default init
493
+ )
478
494
 
479
495
 
480
496
  class EncoderBlock(nn.Module):
@@ -522,8 +522,7 @@ class ViT(nn.Module):
522
522
 
523
523
  # Set alpha and beta according to Microsoft's DeepNorm
524
524
  self.alpha = (2 * transformer_layers) ** 0.25
525
- # beta is only needed for very deep models
526
- self.beta = 1 if transformer_layers < 50 else (8 * transformer_layers) ** -0.25
525
+ self.beta = (8 * transformer_layers) ** -0.25
527
526
 
528
527
  self.encoder = ViTEncoder(
529
528
  input_size=input_size,
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "broccoli-ml"
3
- version = "15.2.0"
3
+ version = "15.3.0"
4
4
  description = "Some useful Pytorch models, circa 2025"
5
5
  authors = [
6
6
  {name = "Nicholas Bailey"}
File without changes
File without changes