broccoli-ml 11.0.0__tar.gz → 12.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: broccoli-ml
- Version: 11.0.0
+ Version: 12.0.0
  Summary: Some useful Pytorch models, circa 2025
  License: MIT
  Author: Nicholas Bailey
@@ -21,29 +21,6 @@ except ImportError:
      FLASH_ATTN = False


- class LayerScale(nn.Module):
-     def __init__(self, dim, decay=False, init_values=1e-4):
-         super().__init__()
-         self.dim = dim
-         self.decay = decay
-         self.init_values = init_values
-         self.reset_parameters()
-
-     def forward(self, x):
-         if self.decay:
-             return x * self.scale
-         else:
-             return x * self.nondecay_scale
-
-     def reset_parameters(self):
-         if self.decay:
-             self.scale = nn.Parameter(self.init_values * torch.ones(self.dim))
-             self.nondecay_scale = None
-         else:
-             self.nondecay_scale = nn.Parameter(self.init_values * torch.ones(self.dim))
-             self.scale = None
-
-
  def drop_path(
      x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
  ):
@@ -527,7 +504,6 @@ class TransformerBlock(nn.Module):
          post_norm=False,
          normformer=False,
          checkpoint_ff=True,
-         layerscale=True,
      ):
          """
          Args:
@@ -556,14 +532,6 @@ class TransformerBlock(nn.Module):
          self.post_attention_norm = nn.LayerNorm(d_model)
          self.post_mlp_norm = nn.LayerNorm(d_model)

-         self.layerscale = layerscale
-         if layerscale:
-             self.layerscale1 = LayerScale(d_model)
-             self.layerscale2 = LayerScale(d_model)
-         else:
-             self.layerscale1 = nn.Identity()
-             self.layerscale2 = nn.Identity()
-
          if relative_position_embedding:
              max_freq = int(max(source_size) / 2) # Suggested by Gemini!
              if d_model < 16:
@@ -630,9 +598,7 @@ class TransformerBlock(nn.Module):
          else:
              process_x = x

-         processed = self.drop_path(
-             self.layerscale1(self.attn(process_x, process_x, process_x))
-         )
+         processed = self.drop_path(self.attn(process_x, process_x, process_x))

          if self.normformer:
              processed = self.normformer_norm(processed)
@@ -647,7 +613,7 @@ class TransformerBlock(nn.Module):
          else:
              process_x = x

-         x = x + self.drop_path(self.layerscale2(self.ff(process_x)))
+         x = x + self.drop_path(self.ff(process_x))

          if self.post_norm:
              x = self.post_mlp_norm(x)
@@ -682,10 +648,6 @@ class TransformerBlock(nn.Module):
          self.attn.reset_parameters()
          self.ff.reset_parameters()

-         if self.layerscale:
-             self.layerscale1.reset_parameters()
-             self.layerscale2.reset_parameters()
-

  class TransformerEncoder(nn.Module):
      """
@@ -722,7 +684,6 @@ class TransformerEncoder(nn.Module):
          normformer=False,
          msa_scaling="d",
          checkpoint_ff=True,
-         layerscale=True,
      ):
          """
          Args:
@@ -757,12 +718,6 @@ class TransformerEncoder(nn.Module):
          self._utility_tokens = utility_tokens
          self.return_utility_tokens = return_utility_tokens

-         if layerscale:
-             rope_and_ape = absolute_position_embedding and relative_position_embedding
-             self.position_layerscale = LayerScale(d_model, decay=rope_and_ape)
-         else:
-             self.position_layerscale = None
-
          # Initialise utility tokens with normal init, like usual Pytorch embeddings
          if self._utility_tokens:
              self._utility_token_embedding = nn.Parameter(
@@ -827,7 +782,6 @@ class TransformerEncoder(nn.Module):
                      post_norm=post_norm,
                      normformer=normformer,
                      checkpoint_ff=checkpoint_ff,
-                     layerscale=layerscale,
                  )
                  for i in range(n_layers)
              ]
@@ -855,8 +809,6 @@ class TransformerEncoder(nn.Module):
                  0
              ) # to shape (1, seq_len) to broadcast over batch
          )
-         if self.position_layerscale is not None:
-             position_embedding = self.position_layerscale(position_embedding)
          x += position_embedding

          return x
@@ -187,7 +187,6 @@ class ViTEncoder(nn.Module):
          transformer_msa_dropout=0.1,
          transformer_stochastic_depth=0.1,
          transformer_checkpoint_ff=True,
-         transformer_layerscale=True,
          linear_module=nn.Linear,
      ):
          super().__init__()
@@ -353,7 +352,6 @@ class ViTEncoder(nn.Module):
                  normformer=transformer_normformer,
                  post_norm=transformer_post_norm,
                  checkpoint_ff=transformer_checkpoint_ff,
-                 layerscale=transformer_layerscale,
              )
          else:
              self.transformer = nn.Identity()
@@ -489,7 +487,6 @@ class ViT(nn.Module):
          transformer_msa_dropout=0.1,
          transformer_stochastic_depth=0.1,
          transformer_checkpoint_ff=True,
-         transformer_layerscale=True,
          head=SequencePoolClassificationHead,
          batch_norm_logits=True,
          logit_projection_layer=nn.Linear,
@@ -562,7 +559,6 @@ class ViT(nn.Module):
              transformer_msa_dropout=transformer_msa_dropout,
              transformer_stochastic_depth=transformer_stochastic_depth,
              transformer_checkpoint_ff=transformer_checkpoint_ff,
-             transformer_layerscale=transformer_layerscale,
              linear_module=linear_module,
          )

@@ -1,6 +1,6 @@
  [project]
  name = "broccoli-ml"
- version = "11.0.0"
+ version = "12.0.0"
  description = "Some useful Pytorch models, circa 2025"
  authors = [
      {name = "Nicholas Bailey"}
File without changes
File without changes
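
The substantive change in 12.0.0 is the removal of the LayerScale module and of the layerscale / transformer_layerscale options from TransformerBlock, TransformerEncoder, ViTEncoder, and ViT. For reference, below is a minimal standalone sketch of the removed class, reconstructed from the 11.0.0 lines shown in the diff above, with imports and explanatory comments added; it is not part of the 12.0.0 package, and the comment about the purpose of the decay flag is an inference, not a statement from the package author.

import torch
import torch.nn as nn


class LayerScale(nn.Module):
    """Per-channel learnable scaling, as removed in broccoli-ml 12.0.0."""

    def __init__(self, dim, decay=False, init_values=1e-4):
        super().__init__()
        self.dim = dim
        self.decay = decay
        self.init_values = init_values
        self.reset_parameters()

    def forward(self, x):
        # Multiply each channel by its learned gain (initialised to init_values).
        if self.decay:
            return x * self.scale
        else:
            return x * self.nondecay_scale

    def reset_parameters(self):
        # `decay` only selects which attribute holds the parameter, presumably so
        # that an optimizer can route it into a weight-decayed or non-decayed
        # parameter group by attribute name.
        if self.decay:
            self.scale = nn.Parameter(self.init_values * torch.ones(self.dim))
            self.nondecay_scale = None
        else:
            self.nondecay_scale = nn.Parameter(self.init_values * torch.ones(self.dim))
            self.scale = None

Usage is simply LayerScale(d_model)(x) on a tensor whose last dimension is d_model; the (dim,) scale vector broadcasts over the leading batch and sequence dimensions.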