birder 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/common/fs_ops.py +2 -2
- birder/introspection/attention_rollout.py +1 -1
- birder/introspection/transformer_attribution.py +1 -1
- birder/layers/layer_scale.py +1 -1
- birder/net/__init__.py +2 -10
- birder/net/_rope_vit_configs.py +430 -0
- birder/net/_vit_configs.py +479 -0
- birder/net/biformer.py +1 -0
- birder/net/cait.py +5 -5
- birder/net/coat.py +12 -12
- birder/net/conv2former.py +3 -3
- birder/net/convmixer.py +1 -1
- birder/net/convnext_v1.py +1 -1
- birder/net/crossvit.py +5 -5
- birder/net/davit.py +1 -1
- birder/net/deit.py +12 -26
- birder/net/deit3.py +42 -189
- birder/net/densenet.py +9 -8
- birder/net/detection/deformable_detr.py +5 -2
- birder/net/detection/detr.py +5 -2
- birder/net/detection/efficientdet.py +1 -1
- birder/net/dpn.py +1 -2
- birder/net/edgenext.py +2 -1
- birder/net/edgevit.py +3 -0
- birder/net/efficientformer_v1.py +2 -1
- birder/net/efficientformer_v2.py +18 -31
- birder/net/efficientnet_v2.py +3 -0
- birder/net/efficientvit_mit.py +5 -5
- birder/net/fasternet.py +2 -2
- birder/net/flexivit.py +22 -43
- birder/net/groupmixformer.py +1 -1
- birder/net/hgnet_v1.py +5 -5
- birder/net/inception_next.py +1 -1
- birder/net/inception_resnet_v1.py +3 -3
- birder/net/inception_resnet_v2.py +7 -4
- birder/net/inception_v3.py +3 -0
- birder/net/inception_v4.py +3 -0
- birder/net/maxvit.py +1 -1
- birder/net/metaformer.py +3 -3
- birder/net/mim/crossmae.py +1 -1
- birder/net/mim/mae_vit.py +1 -1
- birder/net/mim/simmim.py +1 -1
- birder/net/mobilenet_v1.py +0 -9
- birder/net/mobilenet_v2.py +38 -44
- birder/net/{mobilenet_v3_large.py → mobilenet_v3.py} +37 -10
- birder/net/mobilevit_v1.py +5 -32
- birder/net/mobilevit_v2.py +1 -45
- birder/net/moganet.py +8 -5
- birder/net/mvit_v2.py +6 -6
- birder/net/nfnet.py +4 -0
- birder/net/pit.py +1 -1
- birder/net/pvt_v1.py +5 -5
- birder/net/pvt_v2.py +5 -5
- birder/net/repghost.py +1 -30
- birder/net/resmlp.py +2 -2
- birder/net/resnest.py +3 -0
- birder/net/resnet_v1.py +125 -1
- birder/net/resnet_v2.py +75 -1
- birder/net/resnext.py +35 -1
- birder/net/rope_deit3.py +33 -136
- birder/net/rope_flexivit.py +18 -18
- birder/net/rope_vit.py +3 -735
- birder/net/simple_vit.py +22 -16
- birder/net/smt.py +1 -1
- birder/net/squeezenet.py +5 -12
- birder/net/squeezenext.py +0 -24
- birder/net/ssl/capi.py +1 -1
- birder/net/ssl/data2vec.py +1 -1
- birder/net/ssl/dino_v2.py +2 -2
- birder/net/ssl/franca.py +2 -2
- birder/net/ssl/i_jepa.py +1 -1
- birder/net/ssl/ibot.py +1 -1
- birder/net/swiftformer.py +12 -2
- birder/net/swin_transformer_v2.py +1 -1
- birder/net/tiny_vit.py +3 -16
- birder/net/van.py +2 -2
- birder/net/vit.py +35 -963
- birder/net/vit_sam.py +13 -38
- birder/net/xcit.py +7 -6
- birder/tools/introspection.py +1 -1
- birder/tools/model_info.py +3 -1
- birder/version.py +1 -1
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/METADATA +1 -1
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/RECORD +88 -90
- birder/net/mobilenet_v3_small.py +0 -43
- birder/net/se_resnet_v1.py +0 -105
- birder/net/se_resnet_v2.py +0 -59
- birder/net/se_resnext.py +0 -30
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/WHEEL +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/entry_points.txt +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/top_level.txt +0 -0
birder/net/rope_vit.py
CHANGED
|
@@ -31,6 +31,7 @@ from birder.layers import MultiHeadAttentionPool
|
|
|
31
31
|
from birder.layers import SwiGLU_FFN
|
|
32
32
|
from birder.layers.activations import get_activation_module
|
|
33
33
|
from birder.model_registry import registry
|
|
34
|
+
from birder.net._rope_vit_configs import register_rope_vit_configs
|
|
34
35
|
from birder.net.base import DetectorBackbone
|
|
35
36
|
from birder.net.base import MaskedTokenOmissionMixin
|
|
36
37
|
from birder.net.base import MaskedTokenRetentionMixin
|
|
@@ -939,741 +940,8 @@ class RoPE_ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Mask
|
|
|
939
940
|
)
|
|
940
941
|
|
|
941
942
|
|
|
942
|
-
#
|
|
943
|
-
|
|
944
|
-
#
|
|
945
|
-
# Model names follow a structured pattern to encode architectural choices:
|
|
946
|
-
# [rope_]vit_[reg{N}_][size][patch_size][_components][_pooling][_c{N}]
|
|
947
|
-
#
|
|
948
|
-
# Core Components:
|
|
949
|
-
# - rope_ : Rotary Position Embedding (RoPE) enabled
|
|
950
|
-
# - rope_i_ : Rotary Position Embedding (RoPE) enabled with interleaved rotation - implies different temp, indexing
|
|
951
|
-
# - vit_ : Vision Transformer base architecture
|
|
952
|
-
# - reg{N}_ : Register tokens (N = number of register tokens, e.g., reg4, reg8)
|
|
953
|
-
# - size : Model size (s=small, b=base, l=large, or specific like so150m)
|
|
954
|
-
# - patch_size : Patch size (e.g., 14, 16, 32 for 14x14, 16x16, 32x32 patches)
|
|
955
|
-
#
|
|
956
|
-
# Optional Components:
|
|
957
|
-
# Position Embeddings:
|
|
958
|
-
# - nps : No Position embedding on Special tokens
|
|
959
|
-
#
|
|
960
|
-
# Normalization:
|
|
961
|
-
# - rms : RMSNorm (instead of LayerNorm)
|
|
962
|
-
# - pn : Pre-Norm (layer norm before the encoder) - implies different norm eps
|
|
963
|
-
# - npn : No Post Norm (disables post-normalization layer)
|
|
964
|
-
# - qkn : QK Norm
|
|
965
|
-
#
|
|
966
|
-
# Feed-Forward Network:
|
|
967
|
-
# - swiglu : SwiGLU FFN layer type (instead of standard FFN)
|
|
968
|
-
#
|
|
969
|
-
# Activation:
|
|
970
|
-
# - quick_gelu : QuickGELU activation type
|
|
971
|
-
# - ...
|
|
972
|
-
#
|
|
973
|
-
# Regularization:
|
|
974
|
-
# - ls : Layer Scaling applied
|
|
975
|
-
#
|
|
976
|
-
# Pooling/Reduction:
|
|
977
|
-
# - avg : Average pooling for sequence reduction
|
|
978
|
-
# - ap : Attention Pooling for sequence reduction
|
|
979
|
-
# - aps : Attention Pooling inc. Special tokens for sequence reduction
|
|
980
|
-
#
|
|
981
|
-
# Custom Variants:
|
|
982
|
-
# - c{N} : Custom variant (N = version number) for models with fine-grained or non-standard
|
|
983
|
-
# modifications not fully reflected in the name
|
|
984
|
-
|
|
985
|
-
registry.register_model_config(
|
|
986
|
-
"rope_vit_s32",
|
|
987
|
-
RoPE_ViT,
|
|
988
|
-
config={
|
|
989
|
-
"patch_size": 32,
|
|
990
|
-
"num_layers": 12,
|
|
991
|
-
"num_heads": 6,
|
|
992
|
-
"hidden_dim": 384,
|
|
993
|
-
"mlp_dim": 1536,
|
|
994
|
-
"drop_path_rate": 0.0,
|
|
995
|
-
},
|
|
996
|
-
)
|
|
997
|
-
registry.register_model_config(
|
|
998
|
-
"rope_vit_s16",
|
|
999
|
-
RoPE_ViT,
|
|
1000
|
-
config={
|
|
1001
|
-
"patch_size": 16,
|
|
1002
|
-
"num_layers": 12,
|
|
1003
|
-
"num_heads": 6,
|
|
1004
|
-
"hidden_dim": 384,
|
|
1005
|
-
"mlp_dim": 1536,
|
|
1006
|
-
"drop_path_rate": 0.0,
|
|
1007
|
-
},
|
|
1008
|
-
)
|
|
1009
|
-
registry.register_model_config(
|
|
1010
|
-
"rope_i_vit_s16_pn_aps_c1", # For PE Core - https://arxiv.org/abs/2504.13181
|
|
1011
|
-
RoPE_ViT,
|
|
1012
|
-
config={
|
|
1013
|
-
"patch_size": 16,
|
|
1014
|
-
"num_layers": 12,
|
|
1015
|
-
"num_heads": 6,
|
|
1016
|
-
"hidden_dim": 384,
|
|
1017
|
-
"mlp_dim": 1536,
|
|
1018
|
-
"pre_norm": True,
|
|
1019
|
-
"attn_pool_head": True,
|
|
1020
|
-
"attn_pool_num_heads": 8,
|
|
1021
|
-
"attn_pool_special_tokens": True,
|
|
1022
|
-
"norm_layer_eps": 1e-5,
|
|
1023
|
-
"rope_rot_type": "interleaved",
|
|
1024
|
-
"rope_grid_indexing": "xy",
|
|
1025
|
-
"rope_grid_offset": 1,
|
|
1026
|
-
"rope_temperature": 10000.0,
|
|
1027
|
-
"drop_path_rate": 0.0,
|
|
1028
|
-
},
|
|
1029
|
-
)
|
|
1030
|
-
registry.register_model_config(
|
|
1031
|
-
"rope_vit_s14",
|
|
1032
|
-
RoPE_ViT,
|
|
1033
|
-
config={
|
|
1034
|
-
"patch_size": 14,
|
|
1035
|
-
"num_layers": 12,
|
|
1036
|
-
"num_heads": 6,
|
|
1037
|
-
"hidden_dim": 384,
|
|
1038
|
-
"mlp_dim": 1536,
|
|
1039
|
-
"drop_path_rate": 0.0,
|
|
1040
|
-
},
|
|
1041
|
-
)
|
|
1042
|
-
registry.register_model_config(
|
|
1043
|
-
"rope_vit_m32",
|
|
1044
|
-
RoPE_ViT,
|
|
1045
|
-
config={
|
|
1046
|
-
"patch_size": 32,
|
|
1047
|
-
"num_layers": 12,
|
|
1048
|
-
"num_heads": 8,
|
|
1049
|
-
"hidden_dim": 512,
|
|
1050
|
-
"mlp_dim": 2048,
|
|
1051
|
-
"drop_path_rate": 0.0,
|
|
1052
|
-
},
|
|
1053
|
-
)
|
|
1054
|
-
registry.register_model_config(
|
|
1055
|
-
"rope_vit_m16",
|
|
1056
|
-
RoPE_ViT,
|
|
1057
|
-
config={
|
|
1058
|
-
"patch_size": 16,
|
|
1059
|
-
"num_layers": 12,
|
|
1060
|
-
"num_heads": 8,
|
|
1061
|
-
"hidden_dim": 512,
|
|
1062
|
-
"mlp_dim": 2048,
|
|
1063
|
-
"drop_path_rate": 0.0,
|
|
1064
|
-
},
|
|
1065
|
-
)
|
|
1066
|
-
registry.register_model_config(
|
|
1067
|
-
"rope_vit_m14",
|
|
1068
|
-
RoPE_ViT,
|
|
1069
|
-
config={
|
|
1070
|
-
"patch_size": 14,
|
|
1071
|
-
"num_layers": 12,
|
|
1072
|
-
"num_heads": 8,
|
|
1073
|
-
"hidden_dim": 512,
|
|
1074
|
-
"mlp_dim": 2048,
|
|
1075
|
-
"drop_path_rate": 0.0,
|
|
1076
|
-
},
|
|
1077
|
-
)
|
|
1078
|
-
registry.register_model_config(
|
|
1079
|
-
"rope_vit_b32",
|
|
1080
|
-
RoPE_ViT,
|
|
1081
|
-
config={
|
|
1082
|
-
"patch_size": 32,
|
|
1083
|
-
"num_layers": 12,
|
|
1084
|
-
"num_heads": 12,
|
|
1085
|
-
"hidden_dim": 768,
|
|
1086
|
-
"mlp_dim": 3072,
|
|
1087
|
-
"drop_path_rate": 0.0,
|
|
1088
|
-
},
|
|
1089
|
-
)
|
|
1090
|
-
registry.register_model_config(
|
|
1091
|
-
"rope_vit_b16",
|
|
1092
|
-
RoPE_ViT,
|
|
1093
|
-
config={
|
|
1094
|
-
"patch_size": 16,
|
|
1095
|
-
"num_layers": 12,
|
|
1096
|
-
"num_heads": 12,
|
|
1097
|
-
"hidden_dim": 768,
|
|
1098
|
-
"mlp_dim": 3072,
|
|
1099
|
-
"drop_path_rate": 0.1,
|
|
1100
|
-
},
|
|
1101
|
-
)
|
|
1102
|
-
registry.register_model_config(
|
|
1103
|
-
"rope_vit_b16_qkn_ls",
|
|
1104
|
-
RoPE_ViT,
|
|
1105
|
-
config={
|
|
1106
|
-
"patch_size": 16,
|
|
1107
|
-
"num_layers": 12,
|
|
1108
|
-
"num_heads": 12,
|
|
1109
|
-
"hidden_dim": 768,
|
|
1110
|
-
"mlp_dim": 3072,
|
|
1111
|
-
"layer_scale_init_value": 1e-5,
|
|
1112
|
-
"qk_norm": True,
|
|
1113
|
-
"drop_path_rate": 0.1,
|
|
1114
|
-
},
|
|
1115
|
-
)
|
|
1116
|
-
registry.register_model_config(
|
|
1117
|
-
"rope_i_vit_b16_pn_aps_c1", # For PE Core - https://arxiv.org/abs/2504.13181
|
|
1118
|
-
RoPE_ViT,
|
|
1119
|
-
config={
|
|
1120
|
-
"patch_size": 16,
|
|
1121
|
-
"num_layers": 12,
|
|
1122
|
-
"num_heads": 12,
|
|
1123
|
-
"hidden_dim": 768,
|
|
1124
|
-
"mlp_dim": 3072,
|
|
1125
|
-
"pre_norm": True,
|
|
1126
|
-
"attn_pool_head": True,
|
|
1127
|
-
"attn_pool_num_heads": 8,
|
|
1128
|
-
"attn_pool_special_tokens": True,
|
|
1129
|
-
"norm_layer_eps": 1e-5,
|
|
1130
|
-
"rope_rot_type": "interleaved",
|
|
1131
|
-
"rope_grid_indexing": "xy",
|
|
1132
|
-
"rope_grid_offset": 1,
|
|
1133
|
-
"rope_temperature": 10000.0,
|
|
1134
|
-
"drop_path_rate": 0.1,
|
|
1135
|
-
},
|
|
1136
|
-
)
|
|
1137
|
-
registry.register_model_config(
|
|
1138
|
-
"rope_vit_b14",
|
|
1139
|
-
RoPE_ViT,
|
|
1140
|
-
config={
|
|
1141
|
-
"patch_size": 14,
|
|
1142
|
-
"num_layers": 12,
|
|
1143
|
-
"num_heads": 12,
|
|
1144
|
-
"hidden_dim": 768,
|
|
1145
|
-
"mlp_dim": 3072,
|
|
1146
|
-
"drop_path_rate": 0.1,
|
|
1147
|
-
},
|
|
1148
|
-
)
|
|
1149
|
-
registry.register_model_config(
|
|
1150
|
-
"rope_vit_l32",
|
|
1151
|
-
RoPE_ViT,
|
|
1152
|
-
config={
|
|
1153
|
-
"patch_size": 32,
|
|
1154
|
-
"num_layers": 24,
|
|
1155
|
-
"num_heads": 16,
|
|
1156
|
-
"hidden_dim": 1024,
|
|
1157
|
-
"mlp_dim": 4096,
|
|
1158
|
-
"drop_path_rate": 0.1,
|
|
1159
|
-
},
|
|
1160
|
-
)
|
|
1161
|
-
registry.register_model_config(
|
|
1162
|
-
"rope_vit_l16",
|
|
1163
|
-
RoPE_ViT,
|
|
1164
|
-
config={
|
|
1165
|
-
"patch_size": 16,
|
|
1166
|
-
"num_layers": 24,
|
|
1167
|
-
"num_heads": 16,
|
|
1168
|
-
"hidden_dim": 1024,
|
|
1169
|
-
"mlp_dim": 4096,
|
|
1170
|
-
"drop_path_rate": 0.1,
|
|
1171
|
-
},
|
|
1172
|
-
)
|
|
1173
|
-
registry.register_model_config(
|
|
1174
|
-
"rope_vit_l14",
|
|
1175
|
-
RoPE_ViT,
|
|
1176
|
-
config={
|
|
1177
|
-
"patch_size": 14,
|
|
1178
|
-
"num_layers": 24,
|
|
1179
|
-
"num_heads": 16,
|
|
1180
|
-
"hidden_dim": 1024,
|
|
1181
|
-
"mlp_dim": 4096,
|
|
1182
|
-
"drop_path_rate": 0.1,
|
|
1183
|
-
},
|
|
1184
|
-
)
|
|
1185
|
-
registry.register_model_config(
|
|
1186
|
-
"rope_i_vit_l14_pn_aps_c1", # For PE Core - https://arxiv.org/abs/2504.13181
|
|
1187
|
-
RoPE_ViT,
|
|
1188
|
-
config={
|
|
1189
|
-
"patch_size": 14,
|
|
1190
|
-
"num_layers": 24,
|
|
1191
|
-
"num_heads": 16,
|
|
1192
|
-
"hidden_dim": 1024,
|
|
1193
|
-
"mlp_dim": 4096,
|
|
1194
|
-
"pre_norm": True,
|
|
1195
|
-
"attn_pool_head": True,
|
|
1196
|
-
"attn_pool_num_heads": 8,
|
|
1197
|
-
"attn_pool_special_tokens": True,
|
|
1198
|
-
"norm_layer_eps": 1e-5,
|
|
1199
|
-
"rope_rot_type": "interleaved",
|
|
1200
|
-
"rope_grid_indexing": "xy",
|
|
1201
|
-
"rope_grid_offset": 1,
|
|
1202
|
-
"rope_temperature": 10000.0,
|
|
1203
|
-
"drop_path_rate": 0.1,
|
|
1204
|
-
},
|
|
1205
|
-
)
|
|
1206
|
-
registry.register_model_config(
|
|
1207
|
-
"rope_vit_h16",
|
|
1208
|
-
RoPE_ViT,
|
|
1209
|
-
config={
|
|
1210
|
-
"patch_size": 16,
|
|
1211
|
-
"num_layers": 32,
|
|
1212
|
-
"num_heads": 16,
|
|
1213
|
-
"hidden_dim": 1280,
|
|
1214
|
-
"mlp_dim": 5120,
|
|
1215
|
-
"drop_path_rate": 0.1,
|
|
1216
|
-
},
|
|
1217
|
-
)
|
|
1218
|
-
registry.register_model_config(
|
|
1219
|
-
"rope_vit_h14",
|
|
1220
|
-
RoPE_ViT,
|
|
1221
|
-
config={
|
|
1222
|
-
"patch_size": 14,
|
|
1223
|
-
"num_layers": 32,
|
|
1224
|
-
"num_heads": 16,
|
|
1225
|
-
"hidden_dim": 1280,
|
|
1226
|
-
"mlp_dim": 5120,
|
|
1227
|
-
"drop_path_rate": 0.1,
|
|
1228
|
-
},
|
|
1229
|
-
)
|
|
1230
|
-
registry.register_model_config( # From "Scaling Vision Transformers"
|
|
1231
|
-
"rope_vit_g14",
|
|
1232
|
-
RoPE_ViT,
|
|
1233
|
-
config={
|
|
1234
|
-
"patch_size": 14,
|
|
1235
|
-
"num_layers": 40,
|
|
1236
|
-
"num_heads": 16,
|
|
1237
|
-
"hidden_dim": 1408,
|
|
1238
|
-
"mlp_dim": 6144,
|
|
1239
|
-
"drop_path_rate": 0.1,
|
|
1240
|
-
},
|
|
1241
|
-
)
|
|
1242
|
-
|
|
1243
|
-
# With registers
|
|
1244
|
-
registry.register_model_config(
|
|
1245
|
-
"rope_vit_reg1_s32",
|
|
1246
|
-
RoPE_ViT,
|
|
1247
|
-
config={
|
|
1248
|
-
"patch_size": 32,
|
|
1249
|
-
"num_layers": 12,
|
|
1250
|
-
"num_heads": 6,
|
|
1251
|
-
"hidden_dim": 384,
|
|
1252
|
-
"mlp_dim": 1536,
|
|
1253
|
-
"num_reg_tokens": 1,
|
|
1254
|
-
"drop_path_rate": 0.0,
|
|
1255
|
-
},
|
|
1256
|
-
)
|
|
1257
|
-
registry.register_model_config(
|
|
1258
|
-
"rope_vit_reg1_s16",
|
|
1259
|
-
RoPE_ViT,
|
|
1260
|
-
config={
|
|
1261
|
-
"patch_size": 16,
|
|
1262
|
-
"num_layers": 12,
|
|
1263
|
-
"num_heads": 6,
|
|
1264
|
-
"hidden_dim": 384,
|
|
1265
|
-
"mlp_dim": 1536,
|
|
1266
|
-
"num_reg_tokens": 1,
|
|
1267
|
-
"drop_path_rate": 0.0,
|
|
1268
|
-
},
|
|
1269
|
-
)
|
|
1270
|
-
registry.register_model_config(
|
|
1271
|
-
"rope_i_vit_reg1_s16_pn_npn_avg_c1", # For PE Spatial - https://arxiv.org/abs/2504.13181
|
|
1272
|
-
RoPE_ViT,
|
|
1273
|
-
config={
|
|
1274
|
-
"patch_size": 16,
|
|
1275
|
-
"num_layers": 12,
|
|
1276
|
-
"num_heads": 6,
|
|
1277
|
-
"hidden_dim": 384,
|
|
1278
|
-
"mlp_dim": 1536,
|
|
1279
|
-
"num_reg_tokens": 1,
|
|
1280
|
-
"class_token": False,
|
|
1281
|
-
"pre_norm": True,
|
|
1282
|
-
"post_norm": False,
|
|
1283
|
-
"norm_layer_eps": 1e-5,
|
|
1284
|
-
"rope_rot_type": "interleaved",
|
|
1285
|
-
"rope_grid_indexing": "xy",
|
|
1286
|
-
"rope_grid_offset": 1,
|
|
1287
|
-
"rope_temperature": 10000.0,
|
|
1288
|
-
"drop_path_rate": 0.0,
|
|
1289
|
-
},
|
|
1290
|
-
)
|
|
1291
|
-
registry.register_model_config(
|
|
1292
|
-
"rope_vit_reg1_s14",
|
|
1293
|
-
RoPE_ViT,
|
|
1294
|
-
config={
|
|
1295
|
-
"patch_size": 14,
|
|
1296
|
-
"num_layers": 12,
|
|
1297
|
-
"num_heads": 6,
|
|
1298
|
-
"hidden_dim": 384,
|
|
1299
|
-
"mlp_dim": 1536,
|
|
1300
|
-
"num_reg_tokens": 1,
|
|
1301
|
-
"drop_path_rate": 0.0,
|
|
1302
|
-
},
|
|
1303
|
-
)
|
|
1304
|
-
registry.register_model_config(
|
|
1305
|
-
"rope_vit_reg4_m32",
|
|
1306
|
-
RoPE_ViT,
|
|
1307
|
-
config={
|
|
1308
|
-
"patch_size": 32,
|
|
1309
|
-
"num_layers": 12,
|
|
1310
|
-
"num_heads": 8,
|
|
1311
|
-
"hidden_dim": 512,
|
|
1312
|
-
"mlp_dim": 2048,
|
|
1313
|
-
"num_reg_tokens": 4,
|
|
1314
|
-
"drop_path_rate": 0.0,
|
|
1315
|
-
},
|
|
1316
|
-
)
|
|
1317
|
-
registry.register_model_config(
|
|
1318
|
-
"rope_vit_reg4_m16",
|
|
1319
|
-
RoPE_ViT,
|
|
1320
|
-
config={
|
|
1321
|
-
"patch_size": 16,
|
|
1322
|
-
"num_layers": 12,
|
|
1323
|
-
"num_heads": 8,
|
|
1324
|
-
"hidden_dim": 512,
|
|
1325
|
-
"mlp_dim": 2048,
|
|
1326
|
-
"num_reg_tokens": 4,
|
|
1327
|
-
"drop_path_rate": 0.0,
|
|
1328
|
-
},
|
|
1329
|
-
)
|
|
1330
|
-
registry.register_model_config(
|
|
1331
|
-
"rope_vit_reg4_m16_rms_avg",
|
|
1332
|
-
RoPE_ViT,
|
|
1333
|
-
config={
|
|
1334
|
-
"patch_size": 16,
|
|
1335
|
-
"num_layers": 12,
|
|
1336
|
-
"num_heads": 8,
|
|
1337
|
-
"hidden_dim": 512,
|
|
1338
|
-
"mlp_dim": 2048,
|
|
1339
|
-
"num_reg_tokens": 4,
|
|
1340
|
-
"class_token": False,
|
|
1341
|
-
"norm_layer_type": "RMSNorm",
|
|
1342
|
-
"drop_path_rate": 0.0,
|
|
1343
|
-
},
|
|
1344
|
-
)
|
|
1345
|
-
registry.register_model_config(
|
|
1346
|
-
"rope_vit_reg4_m14",
|
|
1347
|
-
RoPE_ViT,
|
|
1348
|
-
config={
|
|
1349
|
-
"patch_size": 14,
|
|
1350
|
-
"num_layers": 12,
|
|
1351
|
-
"num_heads": 8,
|
|
1352
|
-
"hidden_dim": 512,
|
|
1353
|
-
"mlp_dim": 2048,
|
|
1354
|
-
"num_reg_tokens": 4,
|
|
1355
|
-
"drop_path_rate": 0.0,
|
|
1356
|
-
},
|
|
1357
|
-
)
|
|
1358
|
-
registry.register_model_config(
|
|
1359
|
-
"rope_vit_reg4_m14_avg",
|
|
1360
|
-
RoPE_ViT,
|
|
1361
|
-
config={
|
|
1362
|
-
"patch_size": 14,
|
|
1363
|
-
"num_layers": 12,
|
|
1364
|
-
"num_heads": 8,
|
|
1365
|
-
"hidden_dim": 512,
|
|
1366
|
-
"mlp_dim": 2048,
|
|
1367
|
-
"num_reg_tokens": 4,
|
|
1368
|
-
"class_token": False,
|
|
1369
|
-
"drop_path_rate": 0.0,
|
|
1370
|
-
},
|
|
1371
|
-
)
|
|
1372
|
-
registry.register_model_config(
|
|
1373
|
-
"rope_vit_reg4_b32",
|
|
1374
|
-
RoPE_ViT,
|
|
1375
|
-
config={
|
|
1376
|
-
"patch_size": 32,
|
|
1377
|
-
"num_layers": 12,
|
|
1378
|
-
"num_heads": 12,
|
|
1379
|
-
"hidden_dim": 768,
|
|
1380
|
-
"mlp_dim": 3072,
|
|
1381
|
-
"num_reg_tokens": 4,
|
|
1382
|
-
"drop_path_rate": 0.0,
|
|
1383
|
-
},
|
|
1384
|
-
)
|
|
1385
|
-
registry.register_model_config(
|
|
1386
|
-
"rope_vit_reg4_b16",
|
|
1387
|
-
RoPE_ViT,
|
|
1388
|
-
config={
|
|
1389
|
-
"patch_size": 16,
|
|
1390
|
-
"num_layers": 12,
|
|
1391
|
-
"num_heads": 12,
|
|
1392
|
-
"hidden_dim": 768,
|
|
1393
|
-
"mlp_dim": 3072,
|
|
1394
|
-
"num_reg_tokens": 4,
|
|
1395
|
-
"drop_path_rate": 0.1,
|
|
1396
|
-
},
|
|
1397
|
-
)
|
|
1398
|
-
registry.register_model_config(
|
|
1399
|
-
"rope_vit_reg4_b14",
|
|
1400
|
-
RoPE_ViT,
|
|
1401
|
-
config={
|
|
1402
|
-
"patch_size": 14,
|
|
1403
|
-
"num_layers": 12,
|
|
1404
|
-
"num_heads": 12,
|
|
1405
|
-
"hidden_dim": 768,
|
|
1406
|
-
"mlp_dim": 3072,
|
|
1407
|
-
"num_reg_tokens": 4,
|
|
1408
|
-
"drop_path_rate": 0.1,
|
|
1409
|
-
},
|
|
1410
|
-
)
|
|
1411
|
-
registry.register_model_config(
|
|
1412
|
-
"rope_vit_reg8_nps_b14_ap",
|
|
1413
|
-
RoPE_ViT,
|
|
1414
|
-
config={
|
|
1415
|
-
"pos_embed_special_tokens": False,
|
|
1416
|
-
"patch_size": 14,
|
|
1417
|
-
"num_layers": 12,
|
|
1418
|
-
"num_heads": 12,
|
|
1419
|
-
"hidden_dim": 768,
|
|
1420
|
-
"mlp_dim": 3072,
|
|
1421
|
-
"num_reg_tokens": 8,
|
|
1422
|
-
"class_token": False,
|
|
1423
|
-
"attn_pool_head": True,
|
|
1424
|
-
"drop_path_rate": 0.1,
|
|
1425
|
-
},
|
|
1426
|
-
)
|
|
1427
|
-
registry.register_model_config(
|
|
1428
|
-
"rope_vit_reg4_l32",
|
|
1429
|
-
RoPE_ViT,
|
|
1430
|
-
config={
|
|
1431
|
-
"patch_size": 32,
|
|
1432
|
-
"num_layers": 24,
|
|
1433
|
-
"num_heads": 16,
|
|
1434
|
-
"hidden_dim": 1024,
|
|
1435
|
-
"mlp_dim": 4096,
|
|
1436
|
-
"num_reg_tokens": 4,
|
|
1437
|
-
"drop_path_rate": 0.1,
|
|
1438
|
-
},
|
|
1439
|
-
)
|
|
1440
|
-
registry.register_model_config(
|
|
1441
|
-
"rope_vit_reg4_l16",
|
|
1442
|
-
RoPE_ViT,
|
|
1443
|
-
config={
|
|
1444
|
-
"patch_size": 16,
|
|
1445
|
-
"num_layers": 24,
|
|
1446
|
-
"num_heads": 16,
|
|
1447
|
-
"hidden_dim": 1024,
|
|
1448
|
-
"mlp_dim": 4096,
|
|
1449
|
-
"num_reg_tokens": 4,
|
|
1450
|
-
"drop_path_rate": 0.1,
|
|
1451
|
-
},
|
|
1452
|
-
)
|
|
1453
|
-
registry.register_model_config(
|
|
1454
|
-
"rope_vit_reg4_l14",
|
|
1455
|
-
RoPE_ViT,
|
|
1456
|
-
config={
|
|
1457
|
-
"patch_size": 14,
|
|
1458
|
-
"num_layers": 24,
|
|
1459
|
-
"num_heads": 16,
|
|
1460
|
-
"hidden_dim": 1024,
|
|
1461
|
-
"mlp_dim": 4096,
|
|
1462
|
-
"num_reg_tokens": 4,
|
|
1463
|
-
"drop_path_rate": 0.1,
|
|
1464
|
-
},
|
|
1465
|
-
)
|
|
1466
|
-
registry.register_model_config(
|
|
1467
|
-
"rope_vit_reg8_l14_ap",
|
|
1468
|
-
RoPE_ViT,
|
|
1469
|
-
config={
|
|
1470
|
-
"patch_size": 14,
|
|
1471
|
-
"num_layers": 24,
|
|
1472
|
-
"num_heads": 16,
|
|
1473
|
-
"hidden_dim": 1024,
|
|
1474
|
-
"mlp_dim": 4096,
|
|
1475
|
-
"num_reg_tokens": 8,
|
|
1476
|
-
"class_token": False,
|
|
1477
|
-
"attn_pool_head": True,
|
|
1478
|
-
"drop_path_rate": 0.1,
|
|
1479
|
-
},
|
|
1480
|
-
)
|
|
1481
|
-
registry.register_model_config(
|
|
1482
|
-
"rope_vit_reg8_l14_rms_ap",
|
|
1483
|
-
RoPE_ViT,
|
|
1484
|
-
config={
|
|
1485
|
-
"patch_size": 14,
|
|
1486
|
-
"num_layers": 24,
|
|
1487
|
-
"num_heads": 16,
|
|
1488
|
-
"hidden_dim": 1024,
|
|
1489
|
-
"mlp_dim": 4096,
|
|
1490
|
-
"num_reg_tokens": 8,
|
|
1491
|
-
"class_token": False,
|
|
1492
|
-
"attn_pool_head": True,
|
|
1493
|
-
"norm_layer_type": "RMSNorm",
|
|
1494
|
-
"drop_path_rate": 0.1,
|
|
1495
|
-
},
|
|
1496
|
-
)
|
|
1497
|
-
registry.register_model_config(
|
|
1498
|
-
"rope_vit_reg4_h16",
|
|
1499
|
-
RoPE_ViT,
|
|
1500
|
-
config={
|
|
1501
|
-
"patch_size": 16,
|
|
1502
|
-
"num_layers": 32,
|
|
1503
|
-
"num_heads": 16,
|
|
1504
|
-
"hidden_dim": 1280,
|
|
1505
|
-
"mlp_dim": 5120,
|
|
1506
|
-
"num_reg_tokens": 4,
|
|
1507
|
-
"drop_path_rate": 0.1,
|
|
1508
|
-
},
|
|
1509
|
-
)
|
|
1510
|
-
registry.register_model_config(
|
|
1511
|
-
"rope_vit_reg4_h14",
|
|
1512
|
-
RoPE_ViT,
|
|
1513
|
-
config={
|
|
1514
|
-
"patch_size": 14,
|
|
1515
|
-
"num_layers": 32,
|
|
1516
|
-
"num_heads": 16,
|
|
1517
|
-
"hidden_dim": 1280,
|
|
1518
|
-
"mlp_dim": 5120,
|
|
1519
|
-
"num_reg_tokens": 4,
|
|
1520
|
-
"drop_path_rate": 0.1,
|
|
1521
|
-
},
|
|
1522
|
-
)
|
|
1523
|
-
registry.register_model_config( # From "Scaling Vision Transformers"
|
|
1524
|
-
"rope_vit_reg4_g14",
|
|
1525
|
-
RoPE_ViT,
|
|
1526
|
-
config={
|
|
1527
|
-
"patch_size": 14,
|
|
1528
|
-
"num_layers": 40,
|
|
1529
|
-
"num_heads": 16,
|
|
1530
|
-
"hidden_dim": 1408,
|
|
1531
|
-
"mlp_dim": 6144,
|
|
1532
|
-
"num_reg_tokens": 4,
|
|
1533
|
-
"drop_path_rate": 0.1,
|
|
1534
|
-
},
|
|
1535
|
-
)
|
|
1536
|
-
|
|
1537
|
-
# Shape-optimized vision transformer (SoViT)
|
|
1538
|
-
registry.register_model_config(
|
|
1539
|
-
"rope_vit_so150m_p14_ap",
|
|
1540
|
-
RoPE_ViT,
|
|
1541
|
-
config={
|
|
1542
|
-
"patch_size": 14,
|
|
1543
|
-
"num_layers": 18,
|
|
1544
|
-
"num_heads": 16,
|
|
1545
|
-
"hidden_dim": 896, # Changed from 880 for RoPE divisibility
|
|
1546
|
-
"mlp_dim": 2320,
|
|
1547
|
-
"class_token": False,
|
|
1548
|
-
"attn_pool_head": True,
|
|
1549
|
-
"drop_path_rate": 0.1,
|
|
1550
|
-
},
|
|
1551
|
-
)
|
|
1552
|
-
registry.register_model_config(
|
|
1553
|
-
"rope_vit_so400m_p14_ap",
|
|
1554
|
-
RoPE_ViT,
|
|
1555
|
-
config={
|
|
1556
|
-
"patch_size": 14,
|
|
1557
|
-
"num_layers": 27,
|
|
1558
|
-
"num_heads": 16,
|
|
1559
|
-
"hidden_dim": 1152,
|
|
1560
|
-
"mlp_dim": 4304,
|
|
1561
|
-
"class_token": False,
|
|
1562
|
-
"attn_pool_head": True,
|
|
1563
|
-
"drop_path_rate": 0.1,
|
|
1564
|
-
},
|
|
1565
|
-
)
|
|
1566
|
-
registry.register_model_config(
|
|
1567
|
-
"rope_vit_reg4_so150m_p14_ap",
|
|
1568
|
-
RoPE_ViT,
|
|
1569
|
-
config={
|
|
1570
|
-
"patch_size": 14,
|
|
1571
|
-
"num_layers": 18,
|
|
1572
|
-
"num_heads": 16,
|
|
1573
|
-
"hidden_dim": 896, # Changed from 880 for RoPE divisibility
|
|
1574
|
-
"mlp_dim": 2320,
|
|
1575
|
-
"num_reg_tokens": 4,
|
|
1576
|
-
"class_token": False,
|
|
1577
|
-
"attn_pool_head": True,
|
|
1578
|
-
"drop_path_rate": 0.1,
|
|
1579
|
-
},
|
|
1580
|
-
)
|
|
1581
|
-
registry.register_model_config(
|
|
1582
|
-
"rope_vit_reg8_so150m_p14_swiglu_rms_avg",
|
|
1583
|
-
RoPE_ViT,
|
|
1584
|
-
config={
|
|
1585
|
-
"patch_size": 14,
|
|
1586
|
-
"num_layers": 18,
|
|
1587
|
-
"num_heads": 16,
|
|
1588
|
-
"hidden_dim": 896, # Changed from 880 for RoPE divisibility
|
|
1589
|
-
"mlp_dim": 2320,
|
|
1590
|
-
"num_reg_tokens": 8,
|
|
1591
|
-
"class_token": False,
|
|
1592
|
-
"norm_layer_type": "RMSNorm",
|
|
1593
|
-
"mlp_layer_type": "SwiGLU_FFN",
|
|
1594
|
-
"drop_path_rate": 0.1,
|
|
1595
|
-
},
|
|
1596
|
-
)
|
|
1597
|
-
registry.register_model_config(
|
|
1598
|
-
"rope_vit_reg8_so150m_p14_swiglu_rms_ap",
|
|
1599
|
-
RoPE_ViT,
|
|
1600
|
-
config={
|
|
1601
|
-
"patch_size": 14,
|
|
1602
|
-
"num_layers": 18,
|
|
1603
|
-
"num_heads": 16,
|
|
1604
|
-
"hidden_dim": 896, # Changed from 880 for RoPE divisibility
|
|
1605
|
-
"mlp_dim": 2320,
|
|
1606
|
-
"num_reg_tokens": 8,
|
|
1607
|
-
"class_token": False,
|
|
1608
|
-
"attn_pool_head": True,
|
|
1609
|
-
"norm_layer_type": "RMSNorm",
|
|
1610
|
-
"mlp_layer_type": "SwiGLU_FFN",
|
|
1611
|
-
"drop_path_rate": 0.1,
|
|
1612
|
-
},
|
|
1613
|
-
)
|
|
1614
|
-
registry.register_model_config(
|
|
1615
|
-
"rope_vit_reg8_so150m_p14_swiglu_rms_aps",
|
|
1616
|
-
RoPE_ViT,
|
|
1617
|
-
config={
|
|
1618
|
-
"patch_size": 14,
|
|
1619
|
-
"num_layers": 18,
|
|
1620
|
-
"num_heads": 16,
|
|
1621
|
-
"hidden_dim": 896, # Changed from 880 for RoPE divisibility
|
|
1622
|
-
"mlp_dim": 2320,
|
|
1623
|
-
"num_reg_tokens": 8,
|
|
1624
|
-
"class_token": False,
|
|
1625
|
-
"attn_pool_head": True,
|
|
1626
|
-
"attn_pool_special_tokens": True,
|
|
1627
|
-
"norm_layer_type": "RMSNorm",
|
|
1628
|
-
"mlp_layer_type": "SwiGLU_FFN",
|
|
1629
|
-
"drop_path_rate": 0.1,
|
|
1630
|
-
},
|
|
1631
|
-
)
|
|
1632
|
-
registry.register_model_config(
|
|
1633
|
-
"rope_vit_reg8_so150m_p14_ap",
|
|
1634
|
-
RoPE_ViT,
|
|
1635
|
-
config={
|
|
1636
|
-
"patch_size": 14,
|
|
1637
|
-
"num_layers": 18,
|
|
1638
|
-
"num_heads": 16,
|
|
1639
|
-
"hidden_dim": 896, # Changed from 880 for RoPE divisibility
|
|
1640
|
-
"mlp_dim": 2320,
|
|
1641
|
-
"num_reg_tokens": 8,
|
|
1642
|
-
"class_token": False,
|
|
1643
|
-
"attn_pool_head": True,
|
|
1644
|
-
"drop_path_rate": 0.1,
|
|
1645
|
-
},
|
|
1646
|
-
)
|
|
1647
|
-
registry.register_model_config(
|
|
1648
|
-
"rope_vit_reg4_so400m_p14_ap",
|
|
1649
|
-
RoPE_ViT,
|
|
1650
|
-
config={
|
|
1651
|
-
"patch_size": 14,
|
|
1652
|
-
"num_layers": 27,
|
|
1653
|
-
"num_heads": 16,
|
|
1654
|
-
"hidden_dim": 1152,
|
|
1655
|
-
"mlp_dim": 4304,
|
|
1656
|
-
"num_reg_tokens": 4,
|
|
1657
|
-
"class_token": False,
|
|
1658
|
-
"attn_pool_head": True,
|
|
1659
|
-
"drop_path_rate": 0.1,
|
|
1660
|
-
},
|
|
1661
|
-
)
|
|
1662
|
-
registry.register_model_config(
|
|
1663
|
-
"rope_vit_reg8_so400m_p14_ap",
|
|
1664
|
-
RoPE_ViT,
|
|
1665
|
-
config={
|
|
1666
|
-
"patch_size": 14,
|
|
1667
|
-
"num_layers": 27,
|
|
1668
|
-
"num_heads": 16,
|
|
1669
|
-
"hidden_dim": 1152,
|
|
1670
|
-
"mlp_dim": 4304,
|
|
1671
|
-
"num_reg_tokens": 8,
|
|
1672
|
-
"class_token": False,
|
|
1673
|
-
"attn_pool_head": True,
|
|
1674
|
-
"drop_path_rate": 0.1,
|
|
1675
|
-
},
|
|
1676
|
-
)
|
|
943
|
+
# Register model configs (side effects)
|
|
944
|
+
register_rope_vit_configs(RoPE_ViT)
|
|
1677
945
|
|
|
1678
946
|
registry.register_weights(
|
|
1679
947
|
"rope_vit_reg4_b14_capi",
|