birder 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. birder/common/fs_ops.py +2 -2
  2. birder/introspection/attention_rollout.py +1 -1
  3. birder/introspection/transformer_attribution.py +1 -1
  4. birder/layers/layer_scale.py +1 -1
  5. birder/net/__init__.py +2 -10
  6. birder/net/_rope_vit_configs.py +430 -0
  7. birder/net/_vit_configs.py +479 -0
  8. birder/net/biformer.py +1 -0
  9. birder/net/cait.py +5 -5
  10. birder/net/coat.py +12 -12
  11. birder/net/conv2former.py +3 -3
  12. birder/net/convmixer.py +1 -1
  13. birder/net/convnext_v1.py +1 -1
  14. birder/net/crossvit.py +5 -5
  15. birder/net/davit.py +1 -1
  16. birder/net/deit.py +12 -26
  17. birder/net/deit3.py +42 -189
  18. birder/net/densenet.py +9 -8
  19. birder/net/detection/deformable_detr.py +5 -2
  20. birder/net/detection/detr.py +5 -2
  21. birder/net/detection/efficientdet.py +1 -1
  22. birder/net/dpn.py +1 -2
  23. birder/net/edgenext.py +2 -1
  24. birder/net/edgevit.py +3 -0
  25. birder/net/efficientformer_v1.py +2 -1
  26. birder/net/efficientformer_v2.py +18 -31
  27. birder/net/efficientnet_v2.py +3 -0
  28. birder/net/efficientvit_mit.py +5 -5
  29. birder/net/fasternet.py +2 -2
  30. birder/net/flexivit.py +22 -43
  31. birder/net/groupmixformer.py +1 -1
  32. birder/net/hgnet_v1.py +5 -5
  33. birder/net/inception_next.py +1 -1
  34. birder/net/inception_resnet_v1.py +3 -3
  35. birder/net/inception_resnet_v2.py +7 -4
  36. birder/net/inception_v3.py +3 -0
  37. birder/net/inception_v4.py +3 -0
  38. birder/net/maxvit.py +1 -1
  39. birder/net/metaformer.py +3 -3
  40. birder/net/mim/crossmae.py +1 -1
  41. birder/net/mim/mae_vit.py +1 -1
  42. birder/net/mim/simmim.py +1 -1
  43. birder/net/mobilenet_v1.py +0 -9
  44. birder/net/mobilenet_v2.py +38 -44
  45. birder/net/{mobilenet_v3_large.py → mobilenet_v3.py} +37 -10
  46. birder/net/mobilevit_v1.py +5 -32
  47. birder/net/mobilevit_v2.py +1 -45
  48. birder/net/moganet.py +8 -5
  49. birder/net/mvit_v2.py +6 -6
  50. birder/net/nfnet.py +4 -0
  51. birder/net/pit.py +1 -1
  52. birder/net/pvt_v1.py +5 -5
  53. birder/net/pvt_v2.py +5 -5
  54. birder/net/repghost.py +1 -30
  55. birder/net/resmlp.py +2 -2
  56. birder/net/resnest.py +3 -0
  57. birder/net/resnet_v1.py +125 -1
  58. birder/net/resnet_v2.py +75 -1
  59. birder/net/resnext.py +35 -1
  60. birder/net/rope_deit3.py +33 -136
  61. birder/net/rope_flexivit.py +18 -18
  62. birder/net/rope_vit.py +3 -735
  63. birder/net/simple_vit.py +22 -16
  64. birder/net/smt.py +1 -1
  65. birder/net/squeezenet.py +5 -12
  66. birder/net/squeezenext.py +0 -24
  67. birder/net/ssl/capi.py +1 -1
  68. birder/net/ssl/data2vec.py +1 -1
  69. birder/net/ssl/dino_v2.py +2 -2
  70. birder/net/ssl/franca.py +2 -2
  71. birder/net/ssl/i_jepa.py +1 -1
  72. birder/net/ssl/ibot.py +1 -1
  73. birder/net/swiftformer.py +12 -2
  74. birder/net/swin_transformer_v2.py +1 -1
  75. birder/net/tiny_vit.py +3 -16
  76. birder/net/van.py +2 -2
  77. birder/net/vit.py +35 -963
  78. birder/net/vit_sam.py +13 -38
  79. birder/net/xcit.py +7 -6
  80. birder/tools/introspection.py +1 -1
  81. birder/tools/model_info.py +3 -1
  82. birder/version.py +1 -1
  83. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/METADATA +1 -1
  84. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/RECORD +88 -90
  85. birder/net/mobilenet_v3_small.py +0 -43
  86. birder/net/se_resnet_v1.py +0 -105
  87. birder/net/se_resnet_v2.py +0 -59
  88. birder/net/se_resnext.py +0 -30
  89. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/WHEEL +0 -0
  90. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/entry_points.txt +0 -0
  91. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/licenses/LICENSE +0 -0
  92. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/top_level.txt +0 -0
birder/net/rope_vit.py CHANGED
@@ -31,6 +31,7 @@ from birder.layers import MultiHeadAttentionPool
  from birder.layers import SwiGLU_FFN
  from birder.layers.activations import get_activation_module
  from birder.model_registry import registry
+ from birder.net._rope_vit_configs import register_rope_vit_configs
  from birder.net.base import DetectorBackbone
  from birder.net.base import MaskedTokenOmissionMixin
  from birder.net.base import MaskedTokenRetentionMixin
@@ -939,741 +940,8 @@ class RoPE_ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Mask
  )


- # Vision Transformer Model Naming Convention
- # ==========================================
- #
- # Model names follow a structured pattern to encode architectural choices:
- # [rope_]vit_[reg{N}_][size][patch_size][_components][_pooling][_c{N}]
- #
- # Core Components:
- # - rope_      : Rotary Position Embedding (RoPE) enabled
- # - rope_i_    : Rotary Position Embedding (RoPE) enabled with interleaved rotation - implies different temp, indexing
- # - vit_       : Vision Transformer base architecture
- # - reg{N}_    : Register tokens (N = number of register tokens, e.g., reg4, reg8)
- # - size       : Model size (s=small, b=base, l=large, or specific like so150m)
- # - patch_size : Patch size (e.g., 14, 16, 32 for 14x14, 16x16, 32x32 patches)
- #
- # Optional Components:
- # Position Embeddings:
- # - nps : No Position embedding on Special tokens
- #
- # Normalization:
- # - rms : RMSNorm (instead of LayerNorm)
- # - pn  : Pre-Norm (layer norm before the encoder) - implies different norm eps
- # - npn : No Post Norm (disables post-normalization layer)
- # - qkn : QK Norm
- #
- # Feed-Forward Network:
- # - swiglu : SwiGLU FFN layer type (instead of standard FFN)
- #
- # Activation:
- # - quick_gelu : QuickGELU activation type
- # - ...
- #
- # Regularization:
- # - ls : Layer Scaling applied
- #
- # Pooling/Reduction:
- # - avg : Average pooling for sequence reduction
- # - ap  : Attention Pooling for sequence reduction
- # - aps : Attention Pooling inc. Special tokens for sequence reduction
- #
- # Custom Variants:
- # - c{N} : Custom variant (N = version number) for models with fine-grained or non-standard
- #          modifications not fully reflected in the name
-
- registry.register_model_config(
-     "rope_vit_s32",
-     RoPE_ViT,
-     config={
-         "patch_size": 32,
-         "num_layers": 12,
-         "num_heads": 6,
-         "hidden_dim": 384,
-         "mlp_dim": 1536,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_s16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 6,
-         "hidden_dim": 384,
-         "mlp_dim": 1536,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_i_vit_s16_pn_aps_c1",  # For PE Core - https://arxiv.org/abs/2504.13181
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 6,
-         "hidden_dim": 384,
-         "mlp_dim": 1536,
-         "pre_norm": True,
-         "attn_pool_head": True,
-         "attn_pool_num_heads": 8,
-         "attn_pool_special_tokens": True,
-         "norm_layer_eps": 1e-5,
-         "rope_rot_type": "interleaved",
-         "rope_grid_indexing": "xy",
-         "rope_grid_offset": 1,
-         "rope_temperature": 10000.0,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_s14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 12,
-         "num_heads": 6,
-         "hidden_dim": 384,
-         "mlp_dim": 1536,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_m32",
-     RoPE_ViT,
-     config={
-         "patch_size": 32,
-         "num_layers": 12,
-         "num_heads": 8,
-         "hidden_dim": 512,
-         "mlp_dim": 2048,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_m16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 8,
-         "hidden_dim": 512,
-         "mlp_dim": 2048,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_m14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 12,
-         "num_heads": 8,
-         "hidden_dim": 512,
-         "mlp_dim": 2048,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_b32",
-     RoPE_ViT,
-     config={
-         "patch_size": 32,
-         "num_layers": 12,
-         "num_heads": 12,
-         "hidden_dim": 768,
-         "mlp_dim": 3072,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_b16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 12,
-         "hidden_dim": 768,
-         "mlp_dim": 3072,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_b16_qkn_ls",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 12,
-         "hidden_dim": 768,
-         "mlp_dim": 3072,
-         "layer_scale_init_value": 1e-5,
-         "qk_norm": True,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_i_vit_b16_pn_aps_c1",  # For PE Core - https://arxiv.org/abs/2504.13181
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 12,
-         "hidden_dim": 768,
-         "mlp_dim": 3072,
-         "pre_norm": True,
-         "attn_pool_head": True,
-         "attn_pool_num_heads": 8,
-         "attn_pool_special_tokens": True,
-         "norm_layer_eps": 1e-5,
-         "rope_rot_type": "interleaved",
-         "rope_grid_indexing": "xy",
-         "rope_grid_offset": 1,
-         "rope_temperature": 10000.0,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_b14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 12,
-         "num_heads": 12,
-         "hidden_dim": 768,
-         "mlp_dim": 3072,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_l32",
-     RoPE_ViT,
-     config={
-         "patch_size": 32,
-         "num_layers": 24,
-         "num_heads": 16,
-         "hidden_dim": 1024,
-         "mlp_dim": 4096,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_l16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 24,
-         "num_heads": 16,
-         "hidden_dim": 1024,
-         "mlp_dim": 4096,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_l14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 24,
-         "num_heads": 16,
-         "hidden_dim": 1024,
-         "mlp_dim": 4096,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_i_vit_l14_pn_aps_c1",  # For PE Core - https://arxiv.org/abs/2504.13181
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 24,
-         "num_heads": 16,
-         "hidden_dim": 1024,
-         "mlp_dim": 4096,
-         "pre_norm": True,
-         "attn_pool_head": True,
-         "attn_pool_num_heads": 8,
-         "attn_pool_special_tokens": True,
-         "norm_layer_eps": 1e-5,
-         "rope_rot_type": "interleaved",
-         "rope_grid_indexing": "xy",
-         "rope_grid_offset": 1,
-         "rope_temperature": 10000.0,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_h16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 32,
-         "num_heads": 16,
-         "hidden_dim": 1280,
-         "mlp_dim": 5120,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_h14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 32,
-         "num_heads": 16,
-         "hidden_dim": 1280,
-         "mlp_dim": 5120,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(  # From "Scaling Vision Transformers"
-     "rope_vit_g14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 40,
-         "num_heads": 16,
-         "hidden_dim": 1408,
-         "mlp_dim": 6144,
-         "drop_path_rate": 0.1,
-     },
- )
-
- # With registers
- registry.register_model_config(
-     "rope_vit_reg1_s32",
-     RoPE_ViT,
-     config={
-         "patch_size": 32,
-         "num_layers": 12,
-         "num_heads": 6,
-         "hidden_dim": 384,
-         "mlp_dim": 1536,
-         "num_reg_tokens": 1,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg1_s16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 6,
-         "hidden_dim": 384,
-         "mlp_dim": 1536,
-         "num_reg_tokens": 1,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_i_vit_reg1_s16_pn_npn_avg_c1",  # For PE Spatial - https://arxiv.org/abs/2504.13181
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 6,
-         "hidden_dim": 384,
-         "mlp_dim": 1536,
-         "num_reg_tokens": 1,
-         "class_token": False,
-         "pre_norm": True,
-         "post_norm": False,
-         "norm_layer_eps": 1e-5,
-         "rope_rot_type": "interleaved",
-         "rope_grid_indexing": "xy",
-         "rope_grid_offset": 1,
-         "rope_temperature": 10000.0,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg1_s14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 12,
-         "num_heads": 6,
-         "hidden_dim": 384,
-         "mlp_dim": 1536,
-         "num_reg_tokens": 1,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_m32",
-     RoPE_ViT,
-     config={
-         "patch_size": 32,
-         "num_layers": 12,
-         "num_heads": 8,
-         "hidden_dim": 512,
-         "mlp_dim": 2048,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_m16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 8,
-         "hidden_dim": 512,
-         "mlp_dim": 2048,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_m16_rms_avg",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 8,
-         "hidden_dim": 512,
-         "mlp_dim": 2048,
-         "num_reg_tokens": 4,
-         "class_token": False,
-         "norm_layer_type": "RMSNorm",
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_m14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 12,
-         "num_heads": 8,
-         "hidden_dim": 512,
-         "mlp_dim": 2048,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_m14_avg",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 12,
-         "num_heads": 8,
-         "hidden_dim": 512,
-         "mlp_dim": 2048,
-         "num_reg_tokens": 4,
-         "class_token": False,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_b32",
-     RoPE_ViT,
-     config={
-         "patch_size": 32,
-         "num_layers": 12,
-         "num_heads": 12,
-         "hidden_dim": 768,
-         "mlp_dim": 3072,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.0,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_b16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 12,
-         "num_heads": 12,
-         "hidden_dim": 768,
-         "mlp_dim": 3072,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_b14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 12,
-         "num_heads": 12,
-         "hidden_dim": 768,
-         "mlp_dim": 3072,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg8_nps_b14_ap",
-     RoPE_ViT,
-     config={
-         "pos_embed_special_tokens": False,
-         "patch_size": 14,
-         "num_layers": 12,
-         "num_heads": 12,
-         "hidden_dim": 768,
-         "mlp_dim": 3072,
-         "num_reg_tokens": 8,
-         "class_token": False,
-         "attn_pool_head": True,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_l32",
-     RoPE_ViT,
-     config={
-         "patch_size": 32,
-         "num_layers": 24,
-         "num_heads": 16,
-         "hidden_dim": 1024,
-         "mlp_dim": 4096,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_l16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 24,
-         "num_heads": 16,
-         "hidden_dim": 1024,
-         "mlp_dim": 4096,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_l14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 24,
-         "num_heads": 16,
-         "hidden_dim": 1024,
-         "mlp_dim": 4096,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg8_l14_ap",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 24,
-         "num_heads": 16,
-         "hidden_dim": 1024,
-         "mlp_dim": 4096,
-         "num_reg_tokens": 8,
-         "class_token": False,
-         "attn_pool_head": True,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg8_l14_rms_ap",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 24,
-         "num_heads": 16,
-         "hidden_dim": 1024,
-         "mlp_dim": 4096,
-         "num_reg_tokens": 8,
-         "class_token": False,
-         "attn_pool_head": True,
-         "norm_layer_type": "RMSNorm",
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_h16",
-     RoPE_ViT,
-     config={
-         "patch_size": 16,
-         "num_layers": 32,
-         "num_heads": 16,
-         "hidden_dim": 1280,
-         "mlp_dim": 5120,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_h14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 32,
-         "num_heads": 16,
-         "hidden_dim": 1280,
-         "mlp_dim": 5120,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(  # From "Scaling Vision Transformers"
-     "rope_vit_reg4_g14",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 40,
-         "num_heads": 16,
-         "hidden_dim": 1408,
-         "mlp_dim": 6144,
-         "num_reg_tokens": 4,
-         "drop_path_rate": 0.1,
-     },
- )
-
- # Shape-optimized vision transformer (SoViT)
- registry.register_model_config(
-     "rope_vit_so150m_p14_ap",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 18,
-         "num_heads": 16,
-         "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-         "mlp_dim": 2320,
-         "class_token": False,
-         "attn_pool_head": True,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_so400m_p14_ap",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 27,
-         "num_heads": 16,
-         "hidden_dim": 1152,
-         "mlp_dim": 4304,
-         "class_token": False,
-         "attn_pool_head": True,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_so150m_p14_ap",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 18,
-         "num_heads": 16,
-         "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-         "mlp_dim": 2320,
-         "num_reg_tokens": 4,
-         "class_token": False,
-         "attn_pool_head": True,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg8_so150m_p14_swiglu_rms_avg",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 18,
-         "num_heads": 16,
-         "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-         "mlp_dim": 2320,
-         "num_reg_tokens": 8,
-         "class_token": False,
-         "norm_layer_type": "RMSNorm",
-         "mlp_layer_type": "SwiGLU_FFN",
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg8_so150m_p14_swiglu_rms_ap",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 18,
-         "num_heads": 16,
-         "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-         "mlp_dim": 2320,
-         "num_reg_tokens": 8,
-         "class_token": False,
-         "attn_pool_head": True,
-         "norm_layer_type": "RMSNorm",
-         "mlp_layer_type": "SwiGLU_FFN",
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg8_so150m_p14_swiglu_rms_aps",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 18,
-         "num_heads": 16,
-         "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-         "mlp_dim": 2320,
-         "num_reg_tokens": 8,
-         "class_token": False,
-         "attn_pool_head": True,
-         "attn_pool_special_tokens": True,
-         "norm_layer_type": "RMSNorm",
-         "mlp_layer_type": "SwiGLU_FFN",
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg8_so150m_p14_ap",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 18,
-         "num_heads": 16,
-         "hidden_dim": 896,  # Changed from 880 for RoPE divisibility
-         "mlp_dim": 2320,
-         "num_reg_tokens": 8,
-         "class_token": False,
-         "attn_pool_head": True,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg4_so400m_p14_ap",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 27,
-         "num_heads": 16,
-         "hidden_dim": 1152,
-         "mlp_dim": 4304,
-         "num_reg_tokens": 4,
-         "class_token": False,
-         "attn_pool_head": True,
-         "drop_path_rate": 0.1,
-     },
- )
- registry.register_model_config(
-     "rope_vit_reg8_so400m_p14_ap",
-     RoPE_ViT,
-     config={
-         "patch_size": 14,
-         "num_layers": 27,
-         "num_heads": 16,
-         "hidden_dim": 1152,
-         "mlp_dim": 4304,
-         "num_reg_tokens": 8,
-         "class_token": False,
-         "attn_pool_head": True,
-         "drop_path_rate": 0.1,
-     },
- )
+ # Register model configs (side effects)
+ register_rope_vit_configs(RoPE_ViT)

  registry.register_weights(
      "rope_vit_reg4_b14_capi",