broccoli-ml 0.11.0__tar.gz → 0.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/PKG-INFO +1 -1
- broccoli_ml-0.13.0/broccoli/linear.py +88 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/transformer.py +10 -10
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/vit.py +60 -17
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/pyproject.toml +1 -1
- broccoli_ml-0.11.0/broccoli/linear.py +0 -41
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/LICENSE +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/README.md +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/__init__.py +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/activation.py +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/assets/2025_resnet_imagenet_1k_pretrained_state_dict.pkl +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/assets/cifar100_eigenvectors_size_2.pt +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/assets/cifar100_eigenvectors_size_3.pt +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/cnn.py +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/eigenpatches.py +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/rope.py +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/tensor.py +0 -0
- {broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/utils.py +0 -0
broccoli_ml-0.13.0/broccoli/linear.py (new file, +88):

```diff
@@ -0,0 +1,88 @@
+# UNDER CONSTRUCTION
+
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .tensor import SigmaReparamTensor
+
+
+class SpectralNormLinear(nn.Module):
+    """
+    ...
+    """
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.use_bias = bias
+
+        self.weights = None
+
+        self.weight_init = nn.Parameter(torch.empty(out_features, in_features))
+
+        # Define the bias vector as a learnable parameter if required.
+        if self.use_bias:
+            self.bias = nn.Parameter(torch.empty(out_features))
+        else:
+            # If no bias, register it as None.
+            # This is important so that PyTorch doesn't complain when saving/loading the model.
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        nn.init.kaiming_uniform_(self.weight_init, a=math.sqrt(5))
+        if self.use_bias:
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weights)
+            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+            nn.init.uniform_(self.bias, -bound, bound)
+        self.weights = SigmaReparamTensor(self.weight_init)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weights(), self.bias)
+
+    def __repr__(self) -> str:
+        # Optional: A nice representation for printing the module.
+        return (
+            f"SpectralNormFeedForward(in_features={self.in_features}",
+            f"out_features={self.out_features}, bias={self.use_bias})",
+        )
+
+
+class RandomLinear(nn.Linear):
+    """ """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = False,  # <---- TODO: explain this
+        beta=0.1,
+        forward_looks_random=True,
+    ):
+        super().__init__(in_features, out_features, bias=False)
+        self.beta = beta
+        self.forward_looks_random = forward_looks_random
+
+    def forward(self, inputs: torch.Tensor):
+        if not self.training:
+            return F.linear(inputs, self.weight)
+        else:
+            # Initialise self.random_weights
+            random_weights = torch.empty_like(self.weight)
+            nn.init.trunc_normal_(random_weights)
+            random_weights *= self.beta
+
+            if self.forward_looks_random:
+                # Forward using a reparameterisation trick
+                a = F.linear(inputs.detach(), self.weight, self.bias)
+                b = F.linear(inputs, random_weights, bias=None)
+            else:
+                # Forward as (W_actual * input + W_random * input) + bias
+                a = F.linear(inputs, self.weight, self.bias)
+                b = F.linear(inputs, random_weights, bias=None)
+
+            return a + b
```
{broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/transformer.py (+10 -10):

```diff
@@ -10,6 +10,7 @@ import torch.nn.functional as F
 from einops import rearrange
 
 from .rope import RotaryEmbedding, apply_rotary_emb
+from .linear import SpectralNormLinear
 
 
 class MHAttention(nn.Module):
@@ -245,19 +246,19 @@ class FeedforwardLayer(nn.Module):
 
         self.dropout = nn.Dropout(dropout)
 
+        self.max_features = (
+            2 * ratio * output_features
+            if activation.__name__.endswith("GLU")
+            else ratio * output_features
+        )
+
         self.process = nn.Sequential(
             *[
                 nn.LayerNorm(input_features),
-                linear_module(
-                    input_features,
-                    (
-                        2 * ratio * output_features
-                        if activation.__name__.endswith("GLU")
-                        else ratio * output_features
-                    ),
-                ),
+                linear_module(input_features, self.max_features),
                 self.activation,
-
+                nn.LayerNorm(self.max_features),
+                linear_module(ratio * output_features, output_features, bias=False),
                 self.dropout,
             ]
         )
```
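The `FeedforwardLayer` change above names the pre-activation width (`self.max_features`) and derives it from the activation class name: GLU-style activations consume twice the width because they gate one half of the projection with the other half. A small standalone sketch of that rule (the function name here is ours, not the package's):

```python
# Illustrative sketch of the hidden-width rule used above: GLU-style
# activations gate one half of the projection with the other half, so the
# first linear layer must be twice as wide.
def feedforward_hidden_width(activation_name: str, ratio: int, output_features: int) -> int:
    multiplier = 2 if activation_name.endswith("GLU") else 1
    return multiplier * ratio * output_features

assert feedforward_hidden_width("SwiGLU", 2, 128) == 512
assert feedforward_hidden_width("SquaredReLU", 2, 128) == 256
```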
{broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/transformer.py (continued):

```diff
@@ -296,7 +297,6 @@ class TransformerBlock(nn.Module):
 
         self.identity_probability = identity_probability
 
-        # Submodules for applying attention
         self.layer_norm = nn.LayerNorm(d_model)
 
         if position_embedding_type == "relative":
```
{broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/vit.py (+60 -17):

```diff
@@ -20,37 +20,74 @@ class PadTensor(nn.Module):
         return F.pad(x, *self.args, **self.kwargs)
 
 
+class GetCLSToken(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return x[:, 0, :]
+
+
 class SequencePool(nn.Module):
+    def __init__(self, d_model, linear_module):
+        super().__init__()
+        self.attention = nn.Sequential(
+            *[
+                linear_module(d_model, 1),
+                Rearrange("batch seq 1 -> batch seq"),
+                nn.Softmax(dim=-1),
+            ]
+        )
+
+    def forward(self, x):
+        weights = self.attention(x)
+        return einsum(weights, x, "batch seq, batch seq d_model -> batch d_model")
+
+
+class ClassificationHead(nn.Module):
     """
-
-    Compact Transformers''*](https://arxiv.org/abs/2104.05704). It can be viewed
-    as a generalisation of average pooling.
+    A general classification head for a ViT
     """
 
-    def __init__(self, d_model, linear_module,
+    def __init__(self, d_model, linear_module, n_classes, batch_norm=True):
         super().__init__()
         self.d_model = d_model
-        self.
+        self.summarize = GetCLSToken()
+        self.process = nn.Sequential(
             *[
                 linear_module(d_model, 1),
                 Rearrange("batch seq 1 -> batch seq"),
                 nn.Softmax(dim=-1),
             ]
         )
-        self.projection = nn.Linear(d_model,
-        self.batch_norm = batch_norm
+        self.projection = nn.Linear(d_model, n_classes)
         if batch_norm:
-            self.
+            self.batch_norm = nn.BatchNorm1d(n_classes, affine=False)
         else:
-            self.
+            self.batch_norm = nn.Identity()
 
-
-
-
-
+        self.classification_process = nn.Sequential(
+            *[
+                self.summarize,
+                self.projection,
+                self.batch_norm,
+            ]
         )
-
-
+
+    def forward(self, x):
+        return self.classification_process(x)
+
+
+class SequencePoolClassificationHead(ClassificationHead):
+    """
+    As described in [Hasani et al. (2021) *''Escaping the Big Data Paradigm with
+    Compact Transformers''*](https://arxiv.org/abs/2104.05704). It can be viewed
+    as a generalisation of average pooling.
+    """
+
+    def __init__(self, d_model, linear_module, out_dim, batch_norm=True):
+        super().__init__(d_model, linear_module, out_dim, batch_norm=True)
+        self.summarize = SequencePool()
 
 
 class ViTEncoder(nn.Module):
```
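To make the refactor concrete, here is a standalone sketch of the two summarisation strategies the new head classes choose between: taking the CLS token versus attention-weighted sequence pooling. This is our own re-implementation of the idea for illustration, not a call into broccoli itself.

```python
# Standalone sketch: score each token, softmax over the sequence, then take
# the weighted sum (a learnable generalisation of average pooling).
import torch
from torch import nn
from einops import einsum, rearrange

def sequence_pool(x: torch.Tensor, scorer: nn.Module) -> torch.Tensor:
    # x: (batch, seq, d_model); scorer maps d_model -> 1
    weights = torch.softmax(rearrange(scorer(x), "b s 1 -> b s"), dim=-1)
    return einsum(weights, x, "b s, b s d -> b d")

x = torch.randn(2, 10, 16)
pooled = sequence_pool(x, nn.Linear(16, 1))   # (2, 16), attention-weighted average
cls_token = x[:, 0, :]                        # (2, 16), what GetCLSToken returns instead
```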
{broccoli_ml-0.11.0 → broccoli_ml-0.13.0}/broccoli/vit.py (continued):

```diff
@@ -88,6 +125,7 @@ class ViTEncoder(nn.Module):
         transformer_heads=4,
         transformer_mlp_ratio=2,
         transformer_bos_tokens=0,
+        transformer_return_bos_tokens=False,
         transformer_activation: nn.Module = SquaredReLU,
         transformer_activation_kwargs: Optional[dict] = None,
         transformer_mlp_dropout=0.0,
@@ -249,6 +287,7 @@ class ViTEncoder(nn.Module):
                 causal=False,
                 linear_module=linear_module,
                 bos_tokens=transformer_bos_tokens,
+                return_bos_tokens=transformer_return_bos_tokens,
             )
         else:
             self.transformer = nn.Identity()
@@ -298,7 +337,7 @@ class ViTEncoder(nn.Module):
         return self.encoder(x)
 
 
-class
+class ViT(nn.Module):
     """
     Denoising convolutional transformer
     Based on the Compact Convolutional Transformer (CCT) of [Hasani et al. (2021)
@@ -332,6 +371,7 @@ class CCT(nn.Module):
         transformer_heads=4,
         transformer_mlp_ratio=2,
         transformer_bos_tokens=0,
+        transformer_return_bos_tokens=False,
         transformer_activation: nn.Module = SquaredReLU,
         transformer_activation_kwargs: Optional[dict] = None,
         transformer_mlp_dropout=0.0,
@@ -341,6 +381,7 @@ class CCT(nn.Module):
         initial_batch_norm=True,
         linear_module=nn.Linear,
         image_classes=100,
+        head=SequencePoolClassificationHead,
     ):
 
         super().__init__()
@@ -385,6 +426,7 @@ class CCT(nn.Module):
             transformer_heads=transformer_heads,
             transformer_mlp_ratio=transformer_mlp_ratio,
             transformer_bos_tokens=transformer_bos_tokens,
+            transformer_return_bos_tokens=transformer_return_bos_tokens,
             transformer_activation=transformer_activation,
             transformer_activation_kwargs=transformer_activation_kwargs,
             transformer_mlp_dropout=transformer_mlp_dropout,
@@ -393,7 +435,8 @@ class CCT(nn.Module):
             linear_module=linear_module,
             initial_batch_norm=initial_batch_norm,
         )
-
+
+        self.pool = head(
             transformer_embedding_size,
             linear_module,
             image_classes,
```
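Finally, the new `head=` argument makes the classification stage injectable: the constructor now builds `self.pool = head(transformer_embedding_size, linear_module, image_classes)`. A hedged sketch of what that call produces for the CLS-token head defined in the earlier hunk (dimensions are illustrative, and `nn.Linear` stands in for `linear_module`):

```python
# Hedged sketch of the head= plumbing above, using the ClassificationHead
# class from the earlier vit.py hunk.  Dimensions are illustrative.
import torch
from torch import nn
from broccoli.vit import ClassificationHead

d_model, n_classes = 128, 100
pool = ClassificationHead(d_model, nn.Linear, n_classes)  # what head(...) builds

tokens = torch.randn(4, 65, d_model)   # (batch, seq, d_model) encoder output
logits = pool(tokens)                  # (4, 100): CLS token -> linear projection -> batch norm
print(logits.shape)
```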
broccoli_ml-0.11.0/broccoli/linear.py (removed, -41; its `RandomLinear` class reappears unchanged in the new broccoli/linear.py above):

```diff
@@ -1,41 +0,0 @@
-# UNDER CONSTRUCTION
-
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-
-class RandomLinear(nn.Linear):
-    """ """
-
-    def __init__(
-        self,
-        in_features: int,
-        out_features: int,
-        bias: bool = False,  # <---- TODO: explain this
-        beta=0.1,
-        forward_looks_random=True,
-    ):
-        super().__init__(in_features, out_features, bias=False)
-        self.beta = beta
-        self.forward_looks_random = forward_looks_random
-
-    def forward(self, inputs: torch.Tensor):
-        if not self.training:
-            return F.linear(inputs, self.weight)
-        else:
-            # Initialise self.random_weights
-            random_weights = torch.empty_like(self.weight)
-            nn.init.trunc_normal_(random_weights)
-            random_weights *= self.beta
-
-            if self.forward_looks_random:
-                # Forward using a reparameterisation trick
-                a = F.linear(inputs.detach(), self.weight, self.bias)
-                b = F.linear(inputs, random_weights, bias=None)
-            else:
-                # Forward as (W_actual * input + W_random * input) + bias
-                a = F.linear(inputs, self.weight, self.bias)
-                b = F.linear(inputs, random_weights, bias=None)
-
-            return a + b
```
All other files listed above are unchanged between 0.11.0 and 0.13.0.