birder 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/common/fs_ops.py +2 -2
- birder/introspection/attention_rollout.py +1 -1
- birder/introspection/transformer_attribution.py +1 -1
- birder/layers/layer_scale.py +1 -1
- birder/net/__init__.py +2 -10
- birder/net/_rope_vit_configs.py +430 -0
- birder/net/_vit_configs.py +479 -0
- birder/net/biformer.py +1 -0
- birder/net/cait.py +5 -5
- birder/net/coat.py +12 -12
- birder/net/conv2former.py +3 -3
- birder/net/convmixer.py +1 -1
- birder/net/convnext_v1.py +1 -1
- birder/net/crossvit.py +5 -5
- birder/net/davit.py +1 -1
- birder/net/deit.py +12 -26
- birder/net/deit3.py +42 -189
- birder/net/densenet.py +9 -8
- birder/net/detection/deformable_detr.py +5 -2
- birder/net/detection/detr.py +5 -2
- birder/net/detection/efficientdet.py +1 -1
- birder/net/dpn.py +1 -2
- birder/net/edgenext.py +2 -1
- birder/net/edgevit.py +3 -0
- birder/net/efficientformer_v1.py +2 -1
- birder/net/efficientformer_v2.py +18 -31
- birder/net/efficientnet_v2.py +3 -0
- birder/net/efficientvit_mit.py +5 -5
- birder/net/fasternet.py +2 -2
- birder/net/flexivit.py +22 -43
- birder/net/groupmixformer.py +1 -1
- birder/net/hgnet_v1.py +5 -5
- birder/net/inception_next.py +1 -1
- birder/net/inception_resnet_v1.py +3 -3
- birder/net/inception_resnet_v2.py +7 -4
- birder/net/inception_v3.py +3 -0
- birder/net/inception_v4.py +3 -0
- birder/net/maxvit.py +1 -1
- birder/net/metaformer.py +3 -3
- birder/net/mim/crossmae.py +1 -1
- birder/net/mim/mae_vit.py +1 -1
- birder/net/mim/simmim.py +1 -1
- birder/net/mobilenet_v1.py +0 -9
- birder/net/mobilenet_v2.py +38 -44
- birder/net/{mobilenet_v3_large.py → mobilenet_v3.py} +37 -10
- birder/net/mobilevit_v1.py +5 -32
- birder/net/mobilevit_v2.py +1 -45
- birder/net/moganet.py +8 -5
- birder/net/mvit_v2.py +6 -6
- birder/net/nfnet.py +4 -0
- birder/net/pit.py +1 -1
- birder/net/pvt_v1.py +5 -5
- birder/net/pvt_v2.py +5 -5
- birder/net/repghost.py +1 -30
- birder/net/resmlp.py +2 -2
- birder/net/resnest.py +3 -0
- birder/net/resnet_v1.py +125 -1
- birder/net/resnet_v2.py +75 -1
- birder/net/resnext.py +35 -1
- birder/net/rope_deit3.py +33 -136
- birder/net/rope_flexivit.py +18 -18
- birder/net/rope_vit.py +3 -735
- birder/net/simple_vit.py +22 -16
- birder/net/smt.py +1 -1
- birder/net/squeezenet.py +5 -12
- birder/net/squeezenext.py +0 -24
- birder/net/ssl/capi.py +1 -1
- birder/net/ssl/data2vec.py +1 -1
- birder/net/ssl/dino_v2.py +2 -2
- birder/net/ssl/franca.py +2 -2
- birder/net/ssl/i_jepa.py +1 -1
- birder/net/ssl/ibot.py +1 -1
- birder/net/swiftformer.py +12 -2
- birder/net/swin_transformer_v2.py +1 -1
- birder/net/tiny_vit.py +3 -16
- birder/net/van.py +2 -2
- birder/net/vit.py +35 -963
- birder/net/vit_sam.py +13 -38
- birder/net/xcit.py +7 -6
- birder/tools/introspection.py +1 -1
- birder/tools/model_info.py +3 -1
- birder/version.py +1 -1
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/METADATA +1 -1
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/RECORD +88 -90
- birder/net/mobilenet_v3_small.py +0 -43
- birder/net/se_resnet_v1.py +0 -105
- birder/net/se_resnet_v2.py +0 -59
- birder/net/se_resnext.py +0 -30
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/WHEEL +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/entry_points.txt +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/top_level.txt +0 -0
birder/net/vit.py
CHANGED
@@ -10,8 +10,6 @@ and
 Paper "Vision Transformers Need Registers", https://arxiv.org/abs/2309.16588
 and
 Paper "Getting ViT in Shape: Scaling Laws for Compute-Optimal Model Design", https://arxiv.org/abs/2305.13035
-and
-Paper "Scaling Vision Transformers", https://arxiv.org/abs/2106.04560
 """
 
 # Reference license: BSD 3-Clause and Apache-2.0
@@ -35,6 +33,7 @@ from birder.layers import MultiHeadAttentionPool
 from birder.layers import SwiGLU_FFN
 from birder.layers.activations import get_activation_module
 from birder.model_registry import registry
+from birder.net._vit_configs import register_vit_configs
 from birder.net.base import DetectorBackbone
 from birder.net.base import MaskedTokenOmissionMixin
 from birder.net.base import MaskedTokenRetentionMixin
@@ -122,14 +121,10 @@ class Attention(nn.Module):
         self.proj = nn.Linear(dim, dim)
         self.proj_drop = nn.Dropout(proj_drop)
 
-    # Make the same interface as nn.MultiheadAttention forward for TorchScript compatibility
     def forward(
         self,
         x: torch.Tensor,
-        key: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
-        value: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
         need_weights: bool = False,
-        attn_mask: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
         average_attn_weights: bool = False,
         is_causal: bool = False,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
@@ -137,30 +132,16 @@
         Apply multi-head self-attention to the input sequence
 
         This module implements scaled dot-product attention over x and returns the
-        projected output.
-        torch.nn.MultiheadAttention.forward for TorchScript compatibility.
-
-        Compatibility notes
-        -------------------
-        The following parameters are accepted for API compatibility but are ignored by this implementation:
-        - key: ignored (keys are computed from x)
-        - value: ignored (values are computed from x)
-        - attn_mask: ignored (no external attention mask is applied)
+        projected output.
 
         Parameters
         ----------
         x
             Input tensor of shape (B, N, C) where B is batch size, N is sequence length,
             and C is embedding dimension.
-        key
-            Unused. Present for nn.MultiheadAttention-compatible signature.
-        value
-            Unused. Present for nn.MultiheadAttention-compatible signature.
         need_weights
             If True, also return attention weights computed explicitly. If False, uses
             torch.nn.functional.scaled_dot_product_attention and returns None for attention weights.
-        attn_mask
-            Unused. Present for nn.MultiheadAttention-compatible signature.
         average_attn_weights
             If True and need_weights is True, average attention weights across heads
             to shape (B, N, N). If False, return per-head weights of shape (B, num_heads, N, N).
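For reference, a minimal usage sketch of the slimmed-down forward interface. This is hypothetical: the constructor keywords are copied from the EncoderBlock call site in the hunk below, and defaults for norm_layer/norm_layer_eps are not shown in this diff, so they are passed explicitly.

import torch
from torch import nn

from birder.net.vit import Attention

attn = Attention(
    384,
    num_heads=6,
    attn_drop=0.0,
    proj_drop=0.0,
    qkv_bias=True,
    qk_norm=False,
    norm_layer=nn.LayerNorm,
    norm_layer_eps=1e-6,
)
x = torch.randn(2, 197, 384)  # (B, N, C)
(out, attn_weights) = attn(x, need_weights=True, average_attn_weights=False)
# out: (2, 197, 384); attn_weights: (2, 6, 197, 197), per the docstring above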
@@ -231,41 +212,32 @@ class EncoderBlock(nn.Module):
         super().__init__()
         self.need_attn = False
         self.is_causal = False
-        self.use_custom_attn = qk_norm is True
 
         if mlp_dim is None:
             mlp_dim = hidden_dim * 4
 
         # Attention block
-        self.
-
-
-
-
-
-
-
-
-
-
-            attn_drop=attention_dropout,
-            proj_drop=0.0,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            norm_layer=norm_layer,
-            norm_layer_eps=norm_layer_eps,
-        )
+        self.norm1 = norm_layer(hidden_dim, eps=norm_layer_eps)
+        self.attn = Attention(
+            hidden_dim,
+            num_heads=num_heads,
+            attn_drop=attention_dropout,
+            proj_drop=0.0,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm,
+            norm_layer=norm_layer,
+            norm_layer_eps=norm_layer_eps,
+        )
 
-        self.
+        self.drop_path = StochasticDepth(drop_path, mode="row")
         if layer_scale_init_value is not None:
             self.layer_scale_1 = LayerScale(hidden_dim, layer_scale_init_value)
         else:
             self.layer_scale_1 = nn.Identity()
 
         # MLP block
-        self.
+        self.norm2 = norm_layer(hidden_dim, eps=norm_layer_eps)
         self.mlp = mlp_layer(hidden_dim, mlp_dim, act_layer=activation_layer, dropout=dropout)
-        self.drop_path2 = StochasticDepth(drop_path, mode="row")
         if layer_scale_init_value is not None:
             self.layer_scale_2 = LayerScale(hidden_dim, layer_scale_init_value)
         else:
@@ -273,34 +245,14 @@
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # torch._assert(x.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {x.size()}")
-
-
-            seq_len = x.size(1)
-            attn_mask = torch.triu(
-                torch.full((seq_len, seq_len), float("-inf"), dtype=x.dtype, device=x.device),
-                diagonal=1,
-            )
-        else:
-            attn_mask = None
-
-        (branch1, _) = self.self_attention(
-            branch1,
-            branch1,
-            branch1,
+        (attn_out, _) = self.attn(
+            self.norm1(x),
             need_weights=self.need_attn,
-            attn_mask=attn_mask,  # Ignored on the custom attention
             average_attn_weights=False,
             is_causal=self.is_causal,
         )
-
-
-        branch1 = self.drop_path1(branch1) + x
-
-        branch2 = self.ln2(branch1)
-        branch2 = self.mlp(branch2)
-        branch2 = self.layer_scale_2(branch2)
-
-        x = self.drop_path2(branch2) + branch1
+        x = x + self.drop_path(self.layer_scale_1(attn_out))
+        x = x + self.drop_path(self.layer_scale_2(self.mlp(self.norm2(x))))
 
         return x
 
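The rewrite collapses the old branch1/branch2 bookkeeping into the standard pre-norm residual form, x = x + DropPath(LayerScale(f(Norm(x)))), with one shared drop_path instead of drop_path1/drop_path2. Below is a self-contained sketch of that pattern: a toy block built from stock torch modules, not birder's actual EncoderBlock (which additionally supports qk_norm, configurable norm layers, and configurable MLP types).

import torch
from torch import nn
from torchvision.ops import StochasticDepth


class PreNormBlock(nn.Module):
    """Toy pre-norm transformer block with the same residual structure."""

    def __init__(self, dim: int, num_heads: int, drop_path: float = 0.0, ls_init: float = 1e-5) -> None:
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.layer_scale_1 = nn.Parameter(ls_init * torch.ones(dim))
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))
        self.layer_scale_2 = nn.Parameter(ls_init * torch.ones(dim))
        self.drop_path = StochasticDepth(drop_path, mode="row")  # single shared instance, as in the new code

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y = self.norm1(x)
        (attn_out, _) = self.attn(y, y, y, need_weights=False)
        x = x + self.drop_path(self.layer_scale_1 * attn_out)
        x = x + self.drop_path(self.layer_scale_2 * self.mlp(self.norm2(x)))
        return x


blk = PreNormBlock(384, num_heads=6, drop_path=0.1)
print(blk(torch.randn(2, 197, 384)).size())  # torch.Size([2, 197, 384])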
@@ -834,888 +786,8 @@ class ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, MaskedTok
         self.pos_embedding = nn.Parameter(pos_embedding)
 
 
-#
-
-registry.register_model_config(
-    "vit_t32",
-    ViT,
-    config={
-        "patch_size": 32,
-        "num_layers": 12,
-        "num_heads": 3,
-        "hidden_dim": 192,
-        "mlp_dim": 768,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_t16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 3,
-        "hidden_dim": 192,
-        "mlp_dim": 768,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_s32",
-    ViT,
-    config={
-        "patch_size": 32,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_s16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_s16_ls",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "layer_scale_init_value": 1e-5,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_s16_pn",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "pre_norm": True,
-        "norm_layer_eps": 1e-5,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_s14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_m32",
-    ViT,
-    config={
-        "patch_size": 32,
-        "num_layers": 12,
-        "num_heads": 8,
-        "hidden_dim": 512,
-        "mlp_dim": 2048,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_m16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 8,
-        "hidden_dim": 512,
-        "mlp_dim": 2048,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_m14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 12,
-        "num_heads": 8,
-        "hidden_dim": 512,
-        "mlp_dim": 2048,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_b32",
-    ViT,
-    config={
-        "patch_size": 32,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_b16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_b16_ls",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "layer_scale_init_value": 1e-5,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_b16_qkn_ls",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "layer_scale_init_value": 1e-5,
-        "qk_norm": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_b16_pn_quick_gelu",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "pre_norm": True,
-        "norm_layer_eps": 1e-5,
-        "act_layer_type": "quick_gelu",
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_b14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_l32",
-    ViT,
-    config={
-        "patch_size": 32,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_l16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_l14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_l14_pn",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "pre_norm": True,
-        "norm_layer_eps": 1e-5,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_l14_pn_quick_gelu",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "pre_norm": True,
-        "norm_layer_eps": 1e-5,
-        "act_layer_type": "quick_gelu",
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_h16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 32,
-        "num_heads": 16,
-        "hidden_dim": 1280,
-        "mlp_dim": 5120,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_h14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 32,
-        "num_heads": 16,
-        "hidden_dim": 1280,
-        "mlp_dim": 5120,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(  # From "Scaling Vision Transformers"
-    "vit_g14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 40,
-        "num_heads": 16,
-        "hidden_dim": 1408,
-        "mlp_dim": 6144,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(  # From "Scaling Vision Transformers"
-    "vit_gigantic14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 48,
-        "num_heads": 16,
-        "hidden_dim": 1664,
-        "mlp_dim": 8192,
-        "drop_path_rate": 0.1,
-    },
-)
-
-# With registers
-registry.register_model_config(
-    "vit_reg1_t16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 3,
-        "hidden_dim": 192,
-        "mlp_dim": 768,
-        "num_reg_tokens": 1,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg1_s32",
-    ViT,
-    config={
-        "patch_size": 32,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "num_reg_tokens": 1,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg1_s16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "num_reg_tokens": 1,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg1_s16_ls",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "layer_scale_init_value": 1e-5,
-        "num_reg_tokens": 1,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg1_s16_rms_ls",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "layer_scale_init_value": 1e-5,
-        "num_reg_tokens": 1,
-        "norm_layer_type": "RMSNorm",
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg1_s14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "num_reg_tokens": 1,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_m32",
-    ViT,
-    config={
-        "patch_size": 32,
-        "num_layers": 12,
-        "num_heads": 8,
-        "hidden_dim": 512,
-        "mlp_dim": 2048,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_m16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 8,
-        "hidden_dim": 512,
-        "mlp_dim": 2048,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_m16_rms_avg",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 8,
-        "hidden_dim": 512,
-        "mlp_dim": 2048,
-        "num_reg_tokens": 4,
-        "class_token": False,
-        "norm_layer_type": "RMSNorm",
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_m14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 12,
-        "num_heads": 8,
-        "hidden_dim": 512,
-        "mlp_dim": 2048,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_b32",
-    ViT,
-    config={
-        "patch_size": 32,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.0,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_b16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_b16_avg",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "num_reg_tokens": 4,
-        "class_token": False,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_b14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_b14_ap",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "attn_pool_head": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_l32",
-    ViT,
-    config={
-        "patch_size": 32,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_l16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_l16_avg",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_l16_aps",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "attn_pool_head": True,
-        "attn_pool_special_tokens": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_l14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(  # DeiT III style
-    "vit_reg4_l14_nps_ls",
-    ViT,
-    config={
-        "pos_embed_special_tokens": False,
-        "patch_size": 14,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "layer_scale_init_value": 1e-5,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_l14_ap",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "attn_pool_head": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_l14_rms_ap",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 24,
-        "num_heads": 16,
-        "hidden_dim": 1024,
-        "mlp_dim": 4096,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "attn_pool_head": True,
-        "norm_layer_type": "RMSNorm",
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_h16",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 32,
-        "num_heads": 16,
-        "hidden_dim": 1280,
-        "mlp_dim": 5120,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_h14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 32,
-        "num_heads": 16,
-        "hidden_dim": 1280,
-        "mlp_dim": 5120,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(  # From "Scaling Vision Transformers"
-    "vit_reg4_g14",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 40,
-        "num_heads": 16,
-        "hidden_dim": 1408,
-        "mlp_dim": 6144,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-
-# Shape-optimized vision transformer (SoViT)
-registry.register_model_config(
-    "vit_so150m_p14_ap",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "class_token": False,
-        "attn_pool_head": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_so400m_p14_ap",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 27,
-        "num_heads": 16,
-        "hidden_dim": 1152,
-        "mlp_dim": 4304,
-        "class_token": False,
-        "attn_pool_head": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_so150m_p16_avg",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "num_reg_tokens": 4,
-        "class_token": False,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_so150m_p16_swiglu_ap",
-    ViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "attn_pool_head": True,
-        "mlp_layer_type": "SwiGLU_FFN",
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_so150m_p14_avg",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "num_reg_tokens": 4,
-        "class_token": False,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_so150m_p14_ls",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "layer_scale_init_value": 1e-5,
-        "num_reg_tokens": 4,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_so150m_p14_ap",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "num_reg_tokens": 4,
-        "class_token": False,
-        "attn_pool_head": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_so150m_p14_aps",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "num_reg_tokens": 4,
-        "class_token": False,
-        "attn_pool_head": True,
-        "attn_pool_special_tokens": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_so150m_p14_avg",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_so150m_p14_swiglu",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "num_reg_tokens": 8,
-        "mlp_layer_type": "SwiGLU_FFN",
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_so150m_p14_swiglu_avg",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "mlp_layer_type": "SwiGLU_FFN",
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_so150m_p14_ap",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 18,
-        "num_heads": 16,
-        "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-        "mlp_dim": 2320,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "attn_pool_head": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg4_so400m_p14_ap",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 27,
-        "num_heads": 16,
-        "hidden_dim": 1152,
-        "mlp_dim": 4304,
-        "num_reg_tokens": 4,
-        "class_token": False,
-        "attn_pool_head": True,
-        "drop_path_rate": 0.1,
-    },
-)
-registry.register_model_config(
-    "vit_reg8_so400m_p14_ap",
-    ViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 27,
-        "num_heads": 16,
-        "hidden_dim": 1152,
-        "mlp_dim": 4304,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "attn_pool_head": True,
-        "drop_path_rate": 0.1,
-    },
-)
+# Register model configs (side effects)
+register_vit_configs(ViT)
 
 registry.register_weights(
     "vit_l16_mim_200",
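All of the deleted registrations move into the new birder/net/_vit_configs.py (479 added lines, per the file list above), leaving vit.py with only the import and the register_vit_configs(ViT) call. A sketch of the shape that module plausibly takes, inferred from the import and the call alone; the file's actual contents are not shown in this section and may organize the configs differently:

from typing import Any

from birder.model_registry import registry


def register_vit_configs(vit: type) -> None:
    # Passing the class in (rather than importing ViT here) presumably avoids a
    # circular import between birder.net.vit and this configuration module.
    configs: dict[str, dict[str, Any]] = {
        "vit_t32": {
            "patch_size": 32,
            "num_layers": 12,
            "num_heads": 3,
            "hidden_dim": 192,
            "mlp_dim": 768,
            "drop_path_rate": 0.0,
        },
        # ... the remaining configs removed from vit.py above
    }
    for name, config in configs.items():
        registry.register_model_config(name, vit, config=config)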
@@ -1729,7 +801,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 1157.1,
-                "sha256": "
+                "sha256": "7fc5b342347d8349aaf5f069a47efd441b646f8542821ed2e30b47a7da72917a",
             },
         },
         "net": {"network": "vit_l16", "tag": "mim"},
@@ -1747,7 +819,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 1157.1,
-                "sha256": "
+                "sha256": "9b5c4e2538ea40edd60d8831d3807b543290dc2db44d537e60e44a341b47e54e",
             },
         },
         "net": {"network": "vit_l16", "tag": "mim"},
@@ -1765,7 +837,7 @@ registry.register_weights(  # BioCLIP v2: https://arxiv.org/abs/2505.23883
         "formats": {
             "pt": {
                 "file_size": 1156.6,
-                "sha256": "
+                "sha256": "6cd7bd6993762590891fe2b41db1649cde5a0c4de5a7f341672f8856ed529d07",
             },
         },
         "net": {"network": "vit_l14_pn", "tag": "bioclip-v2"},
@@ -1783,7 +855,7 @@ registry.register_weights(  # OpenAI CLIP: https://arxiv.org/abs/2103.00020
         "formats": {
             "pt": {
                 "file_size": 1159.7,
-                "sha256": "
+                "sha256": "2c7462390956d8942de0df21d9d1a43cf53fdbe3a3570a1add64d859313a0bee",
             },
         },
         "net": {"network": "vit_l14_pn_quick_gelu", "tag": "openai-clip"},
@@ -1801,7 +873,7 @@ registry.register_weights(  # SigLIP 2: https://arxiv.org/abs/2502.14786
         "formats": {
             "pt": {
                 "file_size": 1631.6,
-                "sha256": "
+                "sha256": "f8ac3bdf028d17a2ee2673f58b51cffa5c696edef44c92092299d970607c7be6",
             },
         },
         "net": {"network": "vit_so400m_p14_ap", "tag": "siglip-v2-webli"},
@@ -1821,7 +893,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 146.2,
-                "sha256": "
+                "sha256": "0f5cd4e0acb44d1e429bbed342c60bf22087ecd1d7112363c3ceb909dcd9d547",
             },
         },
         "net": {"network": "vit_reg4_m16_rms_avg", "tag": "i-jepa"},
@@ -1839,7 +911,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 166.8,
-                "sha256": "
+                "sha256": "e9b83e90c284877c859e92a05a35ff25884a06d3fd006d90ee576d58f71d3251",
             },
         },
         "net": {"network": "vit_reg4_m16_rms_avg", "tag": "i-jepa-inat21-256px"},
@@ -1857,7 +929,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 167.4,
-                "sha256": "
+                "sha256": "7fde7375f5f9165114561f6288cdf086ba8b6635251304de08bd01883bb7a2da",
             },
         },
         "net": {"network": "vit_reg4_m16_rms_avg", "tag": "i-jepa-inat21"},
@@ -1874,7 +946,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 184.2,
-                "sha256": "
+                "sha256": "da47dc6bd4f41c347235beba92657b66148180141d0bd629169e84449b629fbb",
             },
         },
         "net": {"network": "vit_reg4_m16_rms_avg", "tag": "i-jepa-imagenet21k"},
@@ -1892,7 +964,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 327.4,
-                "sha256": "
+                "sha256": "c7ec433c01e1dc0d6100cafc29fa88155a0d65f4b42afa9cc252b77485a566a7",
             },
         },
         "net": {"network": "vit_reg4_b16", "tag": "mim"},
@@ -1910,7 +982,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 327.4,
-                "sha256": "
+                "sha256": "b0e5e2b24ea7a8d2be246df43c9d8092354f6ee81e88c6cdd7c52d8e38ed44a4",
             },
         },
         "net": {"network": "vit_reg4_b16", "tag": "mim"},
@@ -1928,7 +1000,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 328.7,
-                "sha256": "
+                "sha256": "3a15b95285cd4435b601ef058839f422cdce8f68cca50de9353e1ac2bcb65f9a",
             },
         },
         "net": {"network": "vit_reg4_b16", "tag": "mim-intermediate-il-common"},
@@ -1946,7 +1018,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 330.7,
-                "sha256": "
+                "sha256": "78dbf578ebe7d5761705231e16fef280b14905a94f18879167c96df3e59d13a5",
             },
         },
         "net": {"network": "vit_reg4_b16", "tag": "mim-intermediate-arabian-peninsula"},
@@ -1964,7 +1036,7 @@ registry.register_weights(  # DINO v2: https://arxiv.org/abs/2304.07193
         "formats": {
             "pt": {
                 "file_size": 1161.2,
-                "sha256": "
+                "sha256": "441721029ca0ef85582bc8822ec91d780ee442eb3d06b04fb5e4662c9317b52d",
             },
         },
         "net": {"network": "vit_reg4_l14_nps_ls", "tag": "dino-v2-lvd142m"},