broccoli-ml 14.0.1__tar.gz → 15.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: broccoli-ml
-Version: 14.0.1
+Version: 15.0.0
 Summary: Some useful Pytorch models, circa 2025
 License: MIT
 Author: Nicholas Bailey
@@ -604,8 +604,6 @@ class EncoderBlock(nn.Module):
         return self.attn._kv_distance

     def forward(self, x):
-        if self.post_norm:
-            x = self.input_norm(x)

         if self.pre_norm:
             process_x = self.pre_attention_norm(x)
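
The two removed lines dropped the extra input normalization that ran at the start of forward() when post_norm was enabled. For orientation, a generic sketch of how pre-norm and post-norm residual blocks differ (illustrative names only, not the package's exact EncoderBlock code):

    import torch.nn as nn

    class ResidualBlock(nn.Module):
        # Generic contrast between pre-norm and post-norm orderings.
        def __init__(self, dim, sublayer, pre_norm=True):
            super().__init__()
            self.sublayer = sublayer          # e.g. attention or feed-forward
            self.norm = nn.LayerNorm(dim)
            self.pre_norm = pre_norm

        def forward(self, x):
            if self.pre_norm:
                # Pre-norm: normalize the branch input, keep the raw skip path.
                return x + self.sublayer(self.norm(x))
            # Post-norm: add first, then normalize the sum.
            return self.norm(x + self.sublayer(x))
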
@@ -403,8 +403,9 @@ class ViTEncoder(nn.Module):
                 checkpoint=transformer_checkpoint_ff,
                 beta=self.beta,
             )
+            self.layer_norm = nn.LayerNorm(transformer_embedding_size)
         else:
-            self.initial_ff = nn.Identity()
+            self.initial_ff = None

         self.preprocess = nn.Sequential(
             *[
@@ -424,7 +425,8 @@ class ViTEncoder(nn.Module):

     def forward(self, x):
         x = self.preprocess(x)
-        x = x + self.initial_ff(x)
+        if self.initial_ff is not None:
+            x = self.layer_norm(x + self.initial_ff(x))
         return self.transformer(x)

     def attention_logits(self, x):
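
Before this change, initial_ff defaulted to nn.Identity() when no initial feed-forward block was requested, so the old residual line x = x + self.initial_ff(x) doubled the activations in that case; with initial_ff = None the branch is now skipped, and when the block is present the residual sum is passed through the newly added layer_norm. A small sketch of the old identity path (illustrative only):

    import torch
    import torch.nn as nn

    x = torch.ones(1, 3)
    initial_ff = nn.Identity()    # old fallback when no initial FF was configured
    print(x + initial_ff(x))      # tensor([[2., 2., 2.]]) -- the input was doubled
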
@@ -498,8 +500,6 @@ class ViT(nn.Module):
         batch_norm_logits=True,
         logit_projection_layer=nn.Linear,
         linear_module=nn.Linear,
-        alpha=1.0,
-        beta=1.0,
     ):

         super().__init__()
@@ -520,8 +520,9 @@ class ViT(nn.Module):
             "SwiGLU": SwiGLU,
         }[transformer_activation]

-        self.alpha = alpha
-        self.beta = beta
+        # Set alpha and beta according to Microsoft's DeepNorm
+        self.alpha = (2 * transformer_layers) ** 0.25
+        self.beta = (8 * transformer_layers) ** 0.25

         self.encoder = ViTEncoder(
             input_size=input_size,
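
The hard-coded alpha and beta replace the former constructor arguments and now scale with depth following Microsoft's DeepNorm (DeepNet) scheme, in which alpha scales the residual branch and beta scales the initialization of certain sublayer weights. A minimal sketch of DeepNorm-style residual scaling (generic usage, not the package's exact encoder code):

    import torch.nn as nn

    def deepnorm_residual(x, sublayer, norm, alpha):
        # DeepNorm-style residual: scale the skip connection by alpha,
        # then apply LayerNorm to the sum.
        return norm(alpha * x + sublayer(x))

    transformer_layers = 12                    # example depth
    alpha = (2 * transformer_layers) ** 0.25   # as computed in ViT.__init__ above
    beta = (8 * transformer_layers) ** 0.25    # as computed in ViT.__init__ above
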
@@ -571,8 +572,8 @@ class ViT(nn.Module):
             transformer_stochastic_depth=transformer_stochastic_depth,
             transformer_checkpoint_ff=transformer_checkpoint_ff,
             linear_module=linear_module,
-            alpha=alpha,
-            beta=beta,
+            alpha=self.alpha,
+            beta=self.beta,
         )

         self.pool = head(
@@ -1,6 +1,6 @@
 [project]
 name = "broccoli-ml"
-version = "14.0.1"
+version = "15.0.0"
 description = "Some useful Pytorch models, circa 2025"
 authors = [
     {name = "Nicholas Bailey"}