broccoli-ml 0.36.0__py3-none-any.whl → 10.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- broccoli/activation.py +1 -4
- broccoli/cnn.py +1 -289
- broccoli/linear.py +237 -7
- broccoli/rope.py +19 -4
- broccoli/transformer.py +502 -142
- broccoli/utils.py +13 -7
- broccoli/vit.py +192 -51
- {broccoli_ml-0.36.0.dist-info → broccoli_ml-10.0.1.dist-info}/METADATA +5 -3
- broccoli_ml-10.0.1.dist-info/RECORD +13 -0
- broccoli/assets/2025_resnet_imagenet_1k_pretrained_state_dict.pkl +0 -0
- broccoli/assets/cifar100_eigenvectors_size_2.pt +0 -0
- broccoli/assets/cifar100_eigenvectors_size_3.pt +0 -0
- broccoli/eigenpatches.py +0 -49
- broccoli_ml-0.36.0.dist-info/RECORD +0 -17
- {broccoli_ml-0.36.0.dist-info → broccoli_ml-10.0.1.dist-info}/LICENSE +0 -0
- {broccoli_ml-0.36.0.dist-info → broccoli_ml-10.0.1.dist-info}/WHEEL +0 -0
broccoli/utils.py
CHANGED
@@ -1,9 +1,15 @@
-import …
-import torch
+import torch.nn as nn
+import torch.nn.functional as F
 
 
-… (old lines 5-9 are not rendered in the diff view)
+class PadTensor(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.args = args
+        self.kwargs = kwargs
+
+    def forward(self, x):
+        if sum(self.args[0]) == 0:
+            return x
+        else:
+            return F.pad(x, *self.args, **self.kwargs)
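The new PadTensor module is a thin nn.Module wrapper around F.pad, so that padding can live inside an nn.Sequential, and it short-circuits to a no-op when every pad width is zero. A minimal sketch of how it behaves (the shapes and pad widths below are illustrative, not taken from the package):

    import torch
    from broccoli.utils import PadTensor

    pad = PadTensor((1, 1))    # pad the last dimension by one element on each side
    noop = PadTensor((0, 0))   # all-zero padding: forward() returns the input untouched

    x = torch.randn(2, 4, 8)
    print(pad(x).shape)        # torch.Size([2, 4, 10])
    print(noop(x) is x)        # True: the same tensor object comes back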
broccoli/vit.py
CHANGED
@@ -4,24 +4,13 @@ from typing import Optional
 from .transformer import TransformerEncoder, FeedforwardBlock
 from .cnn import SpaceToDepth, calculate_output_spatial_size, spatial_tuple
 from .activation import ReLU, SquaredReLU, GELU, SwiGLU
-from .…
+from .utils import PadTensor
+
 from einops import einsum
 from einops.layers.torch import Rearrange
-import torch.nn as nn
-import torch.nn.functional as F
-
 
-class PadTensor(nn.Module):
-    def __init__(self, *args, **kwargs):
-        super().__init__()
-        self.args = args
-        self.kwargs = kwargs
-
-    def forward(self, x):
-        if sum(self.args[0]) == 0:
-            return x
-        else:
-            return F.pad(x, *self.args, **self.kwargs)
+import torch
+import torch.nn as nn
 
 
 class GetCLSToken(nn.Module):
@@ -43,22 +32,45 @@ class SequencePool(nn.Module):
             ]
         )
 
+        self.reset_parameters()
+
     def forward(self, x):
         weights = self.attention(x)
         return einsum(weights, x, "batch seq, batch seq d_model -> batch d_model")
 
+    def attention_scores(self, x):
+        return self.attention(x)
+
+    def reset_parameters(self):
+        # Iterate over modules in the sequential block
+        for module in self.attention:
+            if hasattr(module, "reset_parameters"):
+                module.reset_parameters()
+
 
 class ClassificationHead(nn.Module):
     """
     A general classification head for a ViT
     """
 
-    def __init__(…
+    def __init__(
+        self,
+        d_model,
+        n_classes,
+        logit_projection_layer=nn.Linear,
+        batch_norm_logits=True,
+    ):
         super().__init__()
         self.d_model = d_model
         self.summarize = GetCLSToken()
-
-        if …
+
+        if d_model == n_classes:
+            # No need to project
+            self.projection = nn.Identity()
+        else:
+            self.projection = logit_projection_layer(d_model, n_classes)
+
+        if batch_norm_logits:
             self.batch_norm = nn.BatchNorm1d(n_classes, affine=False)
         else:
             self.batch_norm = nn.Identity()
@@ -71,9 +83,16 @@ class ClassificationHead(nn.Module):
             ]
         )
 
+        self.reset_parameters()
+
     def forward(self, x):
         return self.classification_process(x)
 
+    def reset_parameters(self):
+        for module in self.classification_process:
+            if hasattr(module, "reset_parameters"):
+                module.reset_parameters()
+
 
 class SequencePoolClassificationHead(ClassificationHead):
     """
@@ -82,9 +101,31 @@ class SequencePoolClassificationHead(ClassificationHead):
     as a generalisation of average pooling.
     """
 
-    def __init__(…
-… (old lines 86-87 are not rendered in the diff view)
+    def __init__(
+        self,
+        d_model,
+        n_classes,
+        logit_projection_layer=nn.Linear,
+        batch_norm_logits=True,
+    ):
+        super().__init__(
+            d_model,
+            n_classes,
+            logit_projection_layer=logit_projection_layer,
+            batch_norm_logits=batch_norm_logits,
+        )
+
+        self.summarize = SequencePool(d_model, logit_projection_layer)
+        # Rebuild the classification process with the correct summary module:
+        self.classification_process = nn.Sequential(
+            *[
+                self.summarize,
+                self.projection,
+                self.batch_norm,
+            ]
+        )
+
+        self.reset_parameters()
 
 
 class ViTEncoder(nn.Module):
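Taken together, the reworked head pools the sequence, projects to class logits (skipping the projection entirely when d_model == n_classes) and optionally batch-normalises the logits, while reset_parameters() re-initialises every sub-module that supports it. A rough usage sketch under the signatures shown above (the sizes are made up for illustration):

    import torch
    from broccoli.vit import SequencePoolClassificationHead

    head = SequencePoolClassificationHead(d_model=256, n_classes=100)

    tokens = torch.randn(8, 65, 256)   # (batch, sequence, d_model) from an encoder
    logits = head(tokens)              # (8, 100): pooled, projected, batch-normed
    head.reset_parameters()            # start from fresh weights without rebuilding the module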
@@ -120,21 +161,33 @@ class ViTEncoder(nn.Module):
         transformer_initial_ff_residual_path=True,
         transformer_initial_ff_linear_module_up=None,
         transformer_initial_ff_linear_module_down=None,
+        transformer_initial_ff_dropout=None,
+        transformer_initial_ff_inner_dropout=None,
+        transformer_initial_ff_outer_dropout=None,
         transformer_pre_norm=True,
         transformer_normformer=False,
         transformer_post_norm=False,
-        …
+        transformer_absolute_position_embedding=False,
+        transformer_relative_position_embedding=True,
         transformer_embedding_size=256,
         transformer_layers=7,
         transformer_heads=4,
         transformer_mlp_ratio=2,
-        …
-        …
+        transformer_utility_tokens=0,
+        transformer_talking_heads=False,
+        transformer_return_utility_tokens=False,
         transformer_activation: nn.Module = SquaredReLU,
         transformer_activation_kwargs: Optional[dict] = None,
-        …
+        transformer_ff_linear_module_up=None,
+        transformer_ff_linear_module_down=None,
+        transformer_msa_scaling="d",
+        transformer_ff_dropout=0.0,
+        transformer_ff_inner_dropout=0.0,
+        transformer_ff_outer_dropout=0.0,
         transformer_msa_dropout=0.1,
         transformer_stochastic_depth=0.1,
+        transformer_checkpoint_ff=True,
+        transformer_layerscale=True,
         linear_module=nn.Linear,
     ):
         super().__init__()
@@ -277,21 +330,30 @@ class ViTEncoder(nn.Module):
                 transformer_embedding_size,
                 transformer_layers,
                 transformer_heads,
-                …
+                absolute_position_embedding=transformer_absolute_position_embedding,
+                relative_position_embedding=transformer_relative_position_embedding,
                 source_size=pooling_output_size,
                 mlp_ratio=transformer_mlp_ratio,
                 activation=transformer_activation,
                 activation_kwargs=transformer_activation_kwargs,
-                …
+                ff_linear_module_up=transformer_ff_linear_module_up,
+                ff_linear_module_down=transformer_ff_linear_module_down,
+                msa_scaling=transformer_msa_scaling,
+                ff_dropout=transformer_ff_dropout,
+                ff_inner_dropout=transformer_ff_inner_dropout,
+                ff_outer_dropout=transformer_ff_outer_dropout,
                 msa_dropout=transformer_msa_dropout,
                 stochastic_depth=transformer_stochastic_depth,
                 causal=False,
                 linear_module=linear_module,
-                …
-                …
+                utility_tokens=transformer_utility_tokens,
+                talking_heads=transformer_talking_heads,
+                return_utility_tokens=transformer_return_utility_tokens,
                 pre_norm=transformer_pre_norm,
                 normformer=transformer_normformer,
                 post_norm=transformer_post_norm,
+                checkpoint_ff=transformer_checkpoint_ff,
+                layerscale=transformer_layerscale,
             )
         else:
             self.transformer = nn.Identity()
@@ -303,21 +365,41 @@ class ViTEncoder(nn.Module):
                 transformer_embedding_size,
                 activation=transformer_activation,
                 activation_kwargs=transformer_activation_kwargs,
-                dropout=…
+                dropout=(
+                    # First truthy assigned value
+                    transformer_initial_ff_dropout
+                    if transformer_initial_ff_dropout is not None
+                    else transformer_ff_dropout
+                ),
+                inner_dropout=(
+                    # First truthy assigned value
+                    transformer_initial_ff_inner_dropout
+                    if transformer_initial_ff_inner_dropout is not None
+                    else transformer_ff_inner_dropout
+                ),
+                outer_dropout=(
+                    # First truthy assigned value
+                    transformer_initial_ff_outer_dropout
+                    if transformer_initial_ff_outer_dropout is not None
+                    else transformer_ff_outer_dropout
+                ),
                 linear_module_up=(
+                    # First truthy assigned value
                     transformer_initial_ff_linear_module_up
-                    …
-                    …
+                    or transformer_ff_linear_module_up
+                    or linear_module
                 ),
                 linear_module_down=(
+                    # First truthy assigned value
                     transformer_initial_ff_linear_module_down
-                    …
-                    …
+                    or transformer_ff_linear_module_down
+                    or linear_module
                 ),
                 pre_norm=transformer_pre_norm,
                 normformer=transformer_normformer,
                 post_norm=transformer_post_norm,
                 residual_path=transformer_initial_ff_residual_path,
+                checkpoint=transformer_checkpoint_ff,
             )
         else:
             self.initial_ff = nn.Identity()
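Note the two different fallback rules in this hunk: the initial-feedforward dropout arguments fall back to the shared transformer values only when they are None (so an explicit 0.0 is respected, despite the "first truthy" comments), whereas the linear-module arguments use an `or` chain, so any falsy value falls through to the next candidate. A small standalone sketch of the two resolution rules (the helper names are hypothetical, not package code):

    def resolve_dropout(initial, shared):
        # Explicit values win, including an explicit 0.0; only None falls through.
        return initial if initial is not None else shared

    def resolve_linear_module(initial, shared, default):
        # The first truthy value wins; None (or anything falsy) falls through.
        return initial or shared or default

    assert resolve_dropout(0.0, 0.3) == 0.0      # an explicit 0.0 is kept
    assert resolve_dropout(None, 0.3) == 0.3     # None falls back to the shared value
    assert resolve_linear_module(None, None, "Linear") == "Linear"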
@@ -337,17 +419,24 @@ class ViTEncoder(nn.Module):
             ]
         )
 
+        self.reset_parameters()
+
     def forward(self, x):
         return self.encoder(x)
 
+    def attention_logits(self, x):
+        x = self.encoder[:-1](x)
+        return self.encoder[-1].attention_logits(x)
+
+    def reset_parameters(self):
+        for module in self.encoder:
+            if hasattr(module, "reset_parameters"):
+                module.reset_parameters()
+
 
 class ViT(nn.Module):
     """
-
-    Based on the Compact Convolutional Transformer (CCT) of [Hasani et al. (2021)
-    *''Escaping the Big Data Paradigm with Compact Transformers''*](
-    https://arxiv.org/abs/2104.05704). It's a convolutional neural network
-    leading into a transformer encoder, followed by a sequence pooling layer.
+    ...
     """
 
     def __init__(
@@ -374,24 +463,37 @@ class ViT(nn.Module):
         transformer_initial_ff_residual_path=True,
         transformer_initial_ff_linear_module_up=None,
         transformer_initial_ff_linear_module_down=None,
+        transformer_initial_ff_dropout=None,
+        transformer_initial_ff_inner_dropout=None,
+        transformer_initial_ff_outer_dropout=None,
         transformer_pre_norm=True,
         transformer_normformer=False,
         transformer_post_norm=False,
-        …
+        transformer_absolute_position_embedding=False,
+        transformer_relative_position_embedding=True,
         transformer_embedding_size=256,
         transformer_layers=7,
         transformer_heads=4,
         transformer_mlp_ratio=2,
-        …
-        …
+        transformer_utility_tokens=0,
+        transformer_talking_heads=False,
+        transformer_return_utility_tokens=False,
         transformer_activation: nn.Module = SquaredReLU,
         transformer_activation_kwargs: Optional[dict] = None,
-        …
+        transformer_ff_linear_module_up=None,
+        transformer_ff_linear_module_down=None,
+        transformer_msa_scaling="d",
+        transformer_ff_dropout=0.0,
+        transformer_ff_inner_dropout=0.0,
+        transformer_ff_outer_dropout=0.0,
         transformer_msa_dropout=0.1,
         transformer_stochastic_depth=0.1,
-        …
-        …
+        transformer_checkpoint_ff=True,
+        transformer_layerscale=True,
         head=SequencePoolClassificationHead,
+        batch_norm_logits=True,
+        logit_projection_layer=nn.Linear,
+        linear_module=nn.Linear,
     ):
 
         super().__init__()
@@ -434,34 +536,73 @@ class ViT(nn.Module):
             transformer_initial_ff_residual_path=transformer_initial_ff_residual_path,
             transformer_initial_ff_linear_module_up=transformer_initial_ff_linear_module_up,
             transformer_initial_ff_linear_module_down=transformer_initial_ff_linear_module_down,
+            transformer_initial_ff_dropout=transformer_initial_ff_dropout,
+            transformer_initial_ff_inner_dropout=transformer_initial_ff_inner_dropout,
+            transformer_initial_ff_outer_dropout=transformer_initial_ff_outer_dropout,
             transformer_pre_norm=transformer_pre_norm,
             transformer_normformer=transformer_normformer,
             transformer_post_norm=transformer_post_norm,
-            …
+            transformer_absolute_position_embedding=transformer_absolute_position_embedding,
+            transformer_relative_position_embedding=transformer_relative_position_embedding,
             transformer_embedding_size=transformer_embedding_size,
             transformer_layers=transformer_layers,
             transformer_heads=transformer_heads,
             transformer_mlp_ratio=transformer_mlp_ratio,
-            …
-            …
+            transformer_utility_tokens=transformer_utility_tokens,
+            transformer_talking_heads=transformer_talking_heads,
+            transformer_return_utility_tokens=transformer_return_utility_tokens,
             transformer_activation=transformer_activation,
             transformer_activation_kwargs=transformer_activation_kwargs,
-            …
+            transformer_ff_linear_module_up=transformer_ff_linear_module_up,
+            transformer_ff_linear_module_down=transformer_ff_linear_module_down,
+            transformer_msa_scaling=transformer_msa_scaling,
+            transformer_ff_dropout=transformer_ff_dropout,
+            transformer_ff_inner_dropout=transformer_ff_inner_dropout,
+            transformer_ff_outer_dropout=transformer_ff_outer_dropout,
             transformer_msa_dropout=transformer_msa_dropout,
             transformer_stochastic_depth=transformer_stochastic_depth,
+            transformer_checkpoint_ff=transformer_checkpoint_ff,
+            transformer_layerscale=transformer_layerscale,
             linear_module=linear_module,
         )
 
         self.pool = head(
             transformer_embedding_size,
-            linear_module,
             image_classes,
-            …
+            logit_projection_layer=logit_projection_layer,
+            batch_norm_logits=batch_norm_logits,
         )
 
+        self.reset_parameters()
+
     @property
     def sequence_length(self):
         return self.encoder.sequence_length
 
     def forward(self, x):
         return self.pool(self.encoder(x))
+
+    def attention_logits(self, x):
+        return self.encoder.attention_logits(x)
+
+    def pool_attention(self, x):
+        if hasattr(self.pool.summarize, "attention"):
+            return self.pool.summarize.attention(self.encoder(x))
+        else:
+            raise NotImplementedError(
+                "`pool_attention` is currently only implemented where"
+                " head class is SequencePoolClassificationHead"
+            )
+
+    def head_to_utility_token_attention_logits(self, x):
+        all_attention = self.attention_logits(x)
+        batch_averages = torch.mean(all_attention, dim=0, keepdim=False)
+        sequence_averages = torch.mean(batch_averages, dim=-1, keepdim=False)
+        n_utility_tokens = self.encoder.encoder[-1]._utility_tokens
+        return sequence_averages[
+            :, :, :n_utility_tokens
+        ]  # (layer, head, utility_tokens)
+
+    def reset_parameters(self):
+        self.encoder.reset_parameters()
+        self.pool.reset_parameters()
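The new introspection hooks can be called straight off a trained model. A hedged sketch of how they might be used, assuming `model` is a broccoli.vit.ViT built elsewhere with transformer_utility_tokens > 0 and a SequencePoolClassificationHead (the input size is illustrative):

    import torch

    images = torch.randn(4, 3, 32, 32)

    logits = model(images)                       # classification logits, as before
    attn = model.attention_logits(images)        # raw attention logits from the final encoder stage
    pool_weights = model.pool_attention(images)  # SequencePool attention over the output sequence
    util = model.head_to_utility_token_attention_logits(images)  # averaged to (layer, head, utility_tokens)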
{broccoli_ml-0.36.0.dist-info → broccoli_ml-10.0.1.dist-info}/METADATA
CHANGED
@@ -1,17 +1,19 @@
 Metadata-Version: 2.3
 Name: broccoli-ml
-Version: 0.36.0
+Version: 10.0.1
 Summary: Some useful Pytorch models, circa 2025
 License: MIT
 Author: Nicholas Bailey
-Requires-Python: >=3.…
+Requires-Python: >=3.8
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: einops (>=0.8.1,<0.9.0)
-Requires-Dist: numpy (>=2.0.2,<2.1.0)
 Description-Content-Type: text/markdown
 
 # broccoli
broccoli_ml-10.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+broccoli/__init__.py,sha256=tmyspsVxqPZHRQCY_NRwpW4SMNBbtE8E_8z7l-SAzSo,127
+broccoli/activation.py,sha256=nrpTOrpg9k23_E4AJWy7VlXXAJCtCJCOR-TonEWJr04,3218
+broccoli/cnn.py,sha256=WjoPDSpe3ttwxCBNfCVRdaCHvbeZ7G-a5_i8fUsK_d8,4889
+broccoli/linear.py,sha256=W-3aNpBjd_0xRyzbCKkmg4H1qmslQOIQhB-WDDay2nM,13125
+broccoli/rope.py,sha256=GRqApBNmYCFaDak0WL1xE_BC5CTTYKQU_PBdeTcQcjc,12557
+broccoli/tensor.py,sha256=um8mrxkYbvNDo-QvHlmJm8Aw6qcngOlUZPoAk_PMReA,4480
+broccoli/transformer.py,sha256=lnfiv7UIYbABiClIluy6CefGxaiYMrvBcj2Ul0uU6xE,27693
+broccoli/utils.py,sha256=oOWzn6dJ5nC_9r4zq0emmfmaYACJXJNFS48AOpW2jqc,358
+broccoli/vit.py,sha256=EGbQb-atuzG3JAx7kdTaJEbWvQR-4XgyYvwjKkN5C38,22612
+broccoli_ml-10.0.1.dist-info/LICENSE,sha256=0BAzJE5BqQ7Iixp_AFdB2W1uO-HCRX-Qfun8PHt6yVM,1073
+broccoli_ml-10.0.1.dist-info/METADATA,sha256=65GKe2Jor5jgUZ8zxROntJ_t0XwAlaukrvpT7nxS0lQ,1369
+broccoli_ml-10.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+broccoli_ml-10.0.1.dist-info/RECORD,,
broccoli/assets/2025_resnet_imagenet_1k_pretrained_state_dict.pkl, broccoli/assets/cifar100_eigenvectors_size_2.pt, broccoli/assets/cifar100_eigenvectors_size_3.pt
Binary files; no diff shown.
broccoli/eigenpatches.py
DELETED
@@ -1,49 +0,0 @@
-"""
-Jordan (2024) was able to train a CNN to 94% accuracy on CIFAR-10 in 3.29 seconds
-on a single A100 GPU by using carefully-tuned hyperparameters and a number of
-techniques to increase learning efficiency. The author notes that applying fixed
-weights to the first layer of the network that approximate a whitening
-transformation on image patches, following tsyam-code, (2023), was "the single
-most impactful feature... [and] more than doubles training speed".
-
-The usefulness of a fixed layer that whitens image patches can be justified
-according to the work of Chowers & Weiss (2022), who find that the first layer
-weights of a convolutional neural network will asymptotically approach a whitening
-transformation regardless of the details of the rest of the network architecture
-or the training data. This effectively functions as a bandpass filter layer,
-reminiscent of the way neurons in the human primary visual cortex work (Kristensen
-& Sandberg, 2021).
-
-The `eigenvectors` function here is adapted from
-https://github.com/KellerJordan/cifar10-airbench/blob/master/airbench96_faster.py
-using https://datascienceplus.com/understanding-the-covariance-matrix/
-"""
-
-import torch
-import torch.nn as nn
-from einops import rearrange
-
-
-def eigenvectors(images: torch.Tensor, patch_size: int, eps=5e-4) -> torch.Tensor:
-    """
-    Adapted from
-    github.com/KellerJordan/cifar10-airbench/blob/master/airbench96_faster.py
-    using https://datascienceplus.com/understanding-the-covariance-matrix/
-
-    Args:
-        images: a batch of training images (the bigger and more representative the better!)
-        patch_size: the size of the eigenvectors we want to create (i.e. the patch/kernel
-            size of the model we will initialise with the eigenvectors)
-        eps: a small number to avoid division by zero
-    """
-    with torch.no_grad():
-        unfolder = nn.Unfold(kernel_size=patch_size, stride=1)
-        patches = unfolder(images)  # (N, patch_elements, patches_per_image)
-        patches = rearrange(patches, "N elements patches -> (N patches) elements")
-        n = patches.size(0)
-        centred = patches - patches.mean(dim=1, keepdim=True)
-        covariance_matrix = (
-            centred.T @ centred
-        ) / n  # https://datascienceplus.com/understanding-the-covariance-matrix/
-        _, eigenvectors = torch.linalg.eigh(covariance_matrix)
-        return eigenvectors
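For reference, the removed helper returned the eigenvectors of the covariance matrix of image patches, intended (per its docstring) to seed a fixed, whitening-style first layer. A hypothetical sketch of that kind of use, not taken from the package; the batch, channel count and kernel size are illustrative assumptions:

    import torch
    import torch.nn as nn
    from einops import rearrange

    images = torch.rand(512, 3, 32, 32)        # a representative batch of training images
    patch_size = 2
    evecs = eigenvectors(images, patch_size)   # (3*2*2, 3*2*2) = (12, 12), eigenvectors as columns

    conv = nn.Conv2d(3, 12, kernel_size=patch_size, bias=False)
    with torch.no_grad():
        # One filter per eigenvector, reshaped back to (out, channels, height, width).
        conv.weight.copy_(
            rearrange(evecs.T, "out (c h w) -> out c h w", c=3, h=patch_size, w=patch_size)
        )
    conv.weight.requires_grad_(False)          # keep the whitening-style filters fixed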
broccoli_ml-0.36.0.dist-info/RECORD
DELETED
@@ -1,17 +0,0 @@
-broccoli/__init__.py,sha256=tmyspsVxqPZHRQCY_NRwpW4SMNBbtE8E_8z7l-SAzSo,127
-broccoli/activation.py,sha256=-Jf30C6iGqWCorC9HEGn2oduWwjeaCAxGLUUYIy1zX8,3438
-broccoli/assets/2025_resnet_imagenet_1k_pretrained_state_dict.pkl,sha256=RZpPupWxFaVfgZrK-gBgfW1hj78oMEGhVWTbjRB3qMo,46835797
-broccoli/assets/cifar100_eigenvectors_size_2.pt,sha256=DjXDOXMeuMpIqNuGhX9z-OWYVqZwIMScSXZApRr9JjU,2501
-broccoli/assets/cifar100_eigenvectors_size_3.pt,sha256=gL6k0xtXYiYP6ZSvEiMBdJ7kIkT0AngTpDJHFQqwgxA,7173
-broccoli/cnn.py,sha256=jeRyKIAMWu1E3iyI14MGgSZuZivPMh12iqkqW9ilNjo,17785
-broccoli/eigenpatches.py,sha256=J6n2usN1oQuHEHYiBNyYpn_a9eQcHjOBiIlvSei520Y,2413
-broccoli/linear.py,sha256=8Y9vD85ZEgNZsIQgO3uRQ3lOQR-JjwvabY8liCrfNCk,4831
-broccoli/rope.py,sha256=hw7kBPNR9GQXj4GxyIAffsGKPfcTPOFh8Bc7oEHtaZY,12108
-broccoli/tensor.py,sha256=um8mrxkYbvNDo-QvHlmJm8Aw6qcngOlUZPoAk_PMReA,4480
-broccoli/transformer.py,sha256=NH94U6lxHzmDGDHTTtJV2kUs7IcS2iNmFJl44_6KtQ0,15456
-broccoli/utils.py,sha256=htq_hOsdhUhL0nJi9WkKiEYOjEoWqFpK5X49PtgTf-0,299
-broccoli/vit.py,sha256=05xqIw9xvE5easXcp4wIA1jQ0xUyRIq6h0ZDtbitXi4,17184
-broccoli_ml-0.36.0.dist-info/LICENSE,sha256=0BAzJE5BqQ7Iixp_AFdB2W1uO-HCRX-Qfun8PHt6yVM,1073
-broccoli_ml-0.36.0.dist-info/METADATA,sha256=csog4ZG1PGeRuFO5QnHdVPgmDYXsGQQJ621JgU0D83w,1257
-broccoli_ml-0.36.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-broccoli_ml-0.36.0.dist-info/RECORD,,
{broccoli_ml-0.36.0.dist-info → broccoli_ml-10.0.1.dist-info}/LICENSE
File without changes
{broccoli_ml-0.36.0.dist-info → broccoli_ml-10.0.1.dist-info}/WHEEL
File without changes