broccoli-ml 0.11.0__tar.gz → 0.13.0__tar.gz

This diff compares the contents of two publicly available versions of the package as released to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: broccoli-ml
- Version: 0.11.0
+ Version: 0.13.0
  Summary: Some useful Pytorch models, circa 2025
  License: MIT
  Author: Nicholas Bailey
@@ -0,0 +1,88 @@
+ # UNDER CONSTRUCTION
+
+ import math
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ from .tensor import SigmaReparamTensor
+
+
+ class SpectralNormLinear(nn.Module):
+     """
+     ...
+     """
+
+     def __init__(self, in_features: int, out_features: int, bias: bool = True):
+         super().__init__()
+         self.in_features = in_features
+         self.out_features = out_features
+         self.use_bias = bias
+
+         self.weights = None
+
+         self.weight_init = nn.Parameter(torch.empty(out_features, in_features))
+
+         # Define the bias vector as a learnable parameter if required.
+         if self.use_bias:
+             self.bias = nn.Parameter(torch.empty(out_features))
+         else:
+             # If no bias, register it as None.
+             # This is important so that PyTorch doesn't complain when saving/loading the model.
+             self.register_parameter("bias", None)
+
+         self.reset_parameters()
+
+     def reset_parameters(self) -> None:
+         nn.init.kaiming_uniform_(self.weight_init, a=math.sqrt(5))
+         if self.use_bias:
+             fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight_init)
+             bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+             nn.init.uniform_(self.bias, -bound, bound)
+         self.weights = SigmaReparamTensor(self.weight_init)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return F.linear(x, self.weights(), self.bias)
+
+     def __repr__(self) -> str:
+         # Optional: a nice representation for printing the module.
+         return (
+             f"SpectralNormLinear(in_features={self.in_features}, "
+             f"out_features={self.out_features}, bias={self.use_bias})"
+         )
+
+
+ class RandomLinear(nn.Linear):
+     """ """
+
+     def __init__(
+         self,
+         in_features: int,
+         out_features: int,
+         bias: bool = False,  # <---- TODO: explain this
+         beta=0.1,
+         forward_looks_random=True,
+     ):
+         super().__init__(in_features, out_features, bias=False)
+         self.beta = beta
+         self.forward_looks_random = forward_looks_random
+
+     def forward(self, inputs: torch.Tensor):
+         if not self.training:
+             return F.linear(inputs, self.weight)
+         else:
+             # Initialise self.random_weights
+             random_weights = torch.empty_like(self.weight)
+             nn.init.trunc_normal_(random_weights)
+             random_weights *= self.beta
+
+             if self.forward_looks_random:
+                 # Forward using a reparameterisation trick
+                 a = F.linear(inputs.detach(), self.weight, self.bias)
+                 b = F.linear(inputs, random_weights, bias=None)
+             else:
+                 # Forward as (W_actual * input + W_random * input) + bias
+                 a = F.linear(inputs, self.weight, self.bias)
+                 b = F.linear(inputs, random_weights, bias=None)
+
+             return a + b
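
For orientation, here is a minimal usage sketch of the two layers added in this new module. The `broccoli.linear` import path and the behaviour of `SigmaReparamTensor` (called as `self.weights()` to produce the effective weight matrix) are assumptions drawn from the code above, not something this diff guarantees.

```python
# Hypothetical usage sketch; assumes the package installs as `broccoli` and that
# SigmaReparamTensor yields a reparameterised weight tensor when called.
import torch
from broccoli.linear import SpectralNormLinear, RandomLinear

x = torch.randn(8, 64)  # (batch, in_features)

# Drop-in replacement for nn.Linear whose effective weight is produced by
# SigmaReparamTensor at call time (self.weights()) rather than used directly.
sn = SpectralNormLinear(64, 32)
print(sn(x).shape)  # torch.Size([8, 32])

# During training, RandomLinear adds a freshly sampled truncated-normal weight
# (scaled by beta) to each forward pass; in eval mode it is a plain linear map.
rl = RandomLinear(64, 32, beta=0.1, forward_looks_random=True)
rl.train()
print(rl(x).shape)  # torch.Size([8, 32])
rl.eval()
print(rl(x).shape)  # torch.Size([8, 32])
```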
@@ -10,6 +10,7 @@ import torch.nn.functional as F
  from einops import rearrange
 
  from .rope import RotaryEmbedding, apply_rotary_emb
+ from .linear import SpectralNormLinear
 
 
  class MHAttention(nn.Module):
@@ -245,19 +246,19 @@ class FeedforwardLayer(nn.Module):
 
          self.dropout = nn.Dropout(dropout)
 
+         self.max_features = (
+             2 * ratio * output_features
+             if activation.__name__.endswith("GLU")
+             else ratio * output_features
+         )
+
          self.process = nn.Sequential(
              *[
                  nn.LayerNorm(input_features),
-                 linear_module(
-                     input_features,
-                     (
-                         2 * ratio * output_features
-                         if activation.__name__.endswith("GLU")
-                         else ratio * output_features
-                     ),
-                 ),
+                 linear_module(input_features, self.max_features),
                  self.activation,
-                 linear_module(ratio * output_features, output_features),
+                 nn.LayerNorm(self.max_features),
+                 linear_module(ratio * output_features, output_features, bias=False),
                  self.dropout,
              ]
          )
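
A small worked example of the hidden-width computation factored out into `self.max_features` above. That a *GLU-style activation consumes twice the width it emits (the reason for the doubling) is the usual GLU convention and an assumption here, not something stated in the diff.

```python
# Illustrative arithmetic only; mirrors the expression assigned to self.max_features.
def max_features(output_features: int, ratio: int, activation_name: str) -> int:
    return (
        2 * ratio * output_features
        if activation_name.endswith("GLU")
        else ratio * output_features
    )

print(max_features(256, 2, "SwiGLU"))       # 1024: doubled so a GLU can gate down to 512
print(max_features(256, 2, "SquaredReLU"))  # 512
```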
@@ -296,7 +297,6 @@ class TransformerBlock(nn.Module):
 
          self.identity_probability = identity_probability
 
-         # Submodules for applying attention
          self.layer_norm = nn.LayerNorm(d_model)
 
          if position_embedding_type == "relative":
@@ -20,37 +20,74 @@ class PadTensor(nn.Module):
          return F.pad(x, *self.args, **self.kwargs)
 
 
+ class GetCLSToken(nn.Module):
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, x):
+         return x[:, 0, :]
+
+
  class SequencePool(nn.Module):
+     def __init__(self, d_model, linear_module):
+         super().__init__()
+         self.attention = nn.Sequential(
+             *[
+                 linear_module(d_model, 1),
+                 Rearrange("batch seq 1 -> batch seq"),
+                 nn.Softmax(dim=-1),
+             ]
+         )
+
+     def forward(self, x):
+         weights = self.attention(x)
+         return einsum(weights, x, "batch seq, batch seq d_model -> batch d_model")
+
+
+ class ClassificationHead(nn.Module):
      """
-     As described in [Hasani et al. (2021) *''Escaping the Big Data Paradigm with
-     Compact Transformers''*](https://arxiv.org/abs/2104.05704). It can be viewed
-     as a generalisation of average pooling.
+     A general classification head for a ViT
      """
 
-     def __init__(self, d_model, linear_module, out_dim, batch_norm=True):
+     def __init__(self, d_model, linear_module, n_classes, batch_norm=True):
          super().__init__()
          self.d_model = d_model
-         self.attention = nn.Sequential(
+         self.summarize = GetCLSToken()
+         self.process = nn.Sequential(
              *[
                  linear_module(d_model, 1),
                  Rearrange("batch seq 1 -> batch seq"),
                  nn.Softmax(dim=-1),
              ]
          )
-         self.projection = nn.Linear(d_model, out_dim)
-         self.batch_norm = batch_norm
+         self.projection = nn.Linear(d_model, n_classes)
          if batch_norm:
-             self.norm = nn.BatchNorm1d(out_dim, affine=False)
+             self.batch_norm = nn.BatchNorm1d(n_classes, affine=False)
          else:
-             self.norm = None
+             self.batch_norm = nn.Identity()
 
-     def forward(self, x):
-         weights = self.attention(x)
-         weighted_embedding = einsum(
-             weights, x, "batch seq, batch seq d_model -> batch d_model"
+         self.classification_process = nn.Sequential(
+             *[
+                 self.summarize,
+                 self.projection,
+                 self.batch_norm,
+             ]
          )
-         projection = self.projection(weighted_embedding)
-         return self.norm(projection) if self.batch_norm else projection
+
+     def forward(self, x):
+         return self.classification_process(x)
+
+
+ class SequencePoolClassificationHead(ClassificationHead):
+     """
+     As described in [Hasani et al. (2021) *''Escaping the Big Data Paradigm with
+     Compact Transformers''*](https://arxiv.org/abs/2104.05704). It can be viewed
+     as a generalisation of average pooling.
+     """
+
+     def __init__(self, d_model, linear_module, out_dim, batch_norm=True):
+         super().__init__(d_model, linear_module, out_dim, batch_norm=True)
+         self.summarize = SequencePool()
 
 
  class ViTEncoder(nn.Module):
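
The restructuring above separates "summarise the sequence" from "project and normalise". A shape-only sketch of the two summarisation strategies, written against plain tensors rather than the broccoli classes:

```python
# Shape-only sketch of the two sequence summaries used by the heads above.
import torch

x = torch.randn(4, 10, 64)  # (batch, seq, d_model)

# GetCLSToken: keep the embedding at sequence position 0.
cls_summary = x[:, 0, :]                         # (4, 64)

# SequencePool: softmax over per-token scores, then a weighted sum of token
# embeddings -- a learned generalisation of average pooling.
scores = torch.randn(4, 10)                      # stand-in for linear_module(d_model, 1) output
weights = scores.softmax(dim=-1)                 # (batch, seq)
pooled = torch.einsum("bs,bsd->bd", weights, x)  # (4, 64)
```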
@@ -88,6 +125,7 @@ class ViTEncoder(nn.Module):
          transformer_heads=4,
          transformer_mlp_ratio=2,
          transformer_bos_tokens=0,
+         transformer_return_bos_tokens=False,
          transformer_activation: nn.Module = SquaredReLU,
          transformer_activation_kwargs: Optional[dict] = None,
          transformer_mlp_dropout=0.0,
@@ -249,6 +287,7 @@ class ViTEncoder(nn.Module):
                  causal=False,
                  linear_module=linear_module,
                  bos_tokens=transformer_bos_tokens,
+                 return_bos_tokens=transformer_return_bos_tokens,
              )
          else:
              self.transformer = nn.Identity()
@@ -298,7 +337,7 @@ class ViTEncoder(nn.Module):
          return self.encoder(x)
 
 
- class CCT(nn.Module):
+ class ViT(nn.Module):
      """
      Denoising convolutional transformer
      Based on the Compact Convolutional Transformer (CCT) of [Hasani et al. (2021)
@@ -332,6 +371,7 @@ class CCT(nn.Module):
          transformer_heads=4,
          transformer_mlp_ratio=2,
          transformer_bos_tokens=0,
+         transformer_return_bos_tokens=False,
          transformer_activation: nn.Module = SquaredReLU,
          transformer_activation_kwargs: Optional[dict] = None,
          transformer_mlp_dropout=0.0,
@@ -341,6 +381,7 @@ class CCT(nn.Module):
          initial_batch_norm=True,
          linear_module=nn.Linear,
          image_classes=100,
+         head=SequencePoolClassificationHead,
      ):
 
          super().__init__()
@@ -385,6 +426,7 @@ class CCT(nn.Module):
              transformer_heads=transformer_heads,
              transformer_mlp_ratio=transformer_mlp_ratio,
              transformer_bos_tokens=transformer_bos_tokens,
+             transformer_return_bos_tokens=transformer_return_bos_tokens,
              transformer_activation=transformer_activation,
              transformer_activation_kwargs=transformer_activation_kwargs,
              transformer_mlp_dropout=transformer_mlp_dropout,
@@ -393,7 +435,8 @@ class CCT(nn.Module):
              linear_module=linear_module,
              initial_batch_norm=initial_batch_norm,
          )
-         self.pool = SequencePool(
+
+         self.pool = head(
              transformer_embedding_size,
              linear_module,
              image_classes,
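
The `head=` hook above is called as `head(transformer_embedding_size, linear_module, image_classes, ...)`, so any module whose constructor accepts that signature can serve as the classification head. A hypothetical custom head illustrating the interface (not part of broccoli; the `batch_norm` keyword is inferred from `ClassificationHead` and may not match the full call):

```python
# Hypothetical example head; the (d_model, linear_module, n_classes, batch_norm)
# signature is inferred from how `head(...)` and ClassificationHead appear above.
import torch
from torch import nn

class MeanPoolHead(nn.Module):
    def __init__(self, d_model, linear_module, n_classes, batch_norm=True):
        super().__init__()
        self.projection = linear_module(d_model, n_classes)

    def forward(self, x):  # x: (batch, seq, d_model)
        return self.projection(x.mean(dim=1))

head = MeanPoolHead(64, nn.Linear, 10)
print(head(torch.randn(4, 12, 64)).shape)  # torch.Size([4, 10])
```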
@@ -1,6 +1,6 @@
  [project]
  name = "broccoli-ml"
- version = "0.11.0"
+ version = "0.13.0"
  description = "Some useful Pytorch models, circa 2025"
  authors = [
      {name = "Nicholas Bailey"}
@@ -1,41 +0,0 @@
- # UNDER CONSTRUCTION
-
- import torch
- from torch import nn
- from torch.nn import functional as F
-
-
- class RandomLinear(nn.Linear):
-     """ """
-
-     def __init__(
-         self,
-         in_features: int,
-         out_features: int,
-         bias: bool = False,  # <---- TODO: explain this
-         beta=0.1,
-         forward_looks_random=True,
-     ):
-         super().__init__(in_features, out_features, bias=False)
-         self.beta = beta
-         self.forward_looks_random = forward_looks_random
-
-     def forward(self, inputs: torch.Tensor):
-         if not self.training:
-             return F.linear(inputs, self.weight)
-         else:
-             # Initialise self.random_weights
-             random_weights = torch.empty_like(self.weight)
-             nn.init.trunc_normal_(random_weights)
-             random_weights *= self.beta
-
-             if self.forward_looks_random:
-                 # Forward using a reparameterisation trick
-                 a = F.linear(inputs.detach(), self.weight, self.bias)
-                 b = F.linear(inputs, random_weights, bias=None)
-             else:
-                 # Forward as (W_actual * input + W_random * input) + bias
-                 a = F.linear(inputs, self.weight, self.bias)
-                 b = F.linear(inputs, random_weights, bias=None)
-
-             return a + b