birder 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/common/fs_ops.py +2 -2
- birder/introspection/attention_rollout.py +1 -1
- birder/introspection/transformer_attribution.py +1 -1
- birder/layers/layer_scale.py +1 -1
- birder/net/__init__.py +2 -10
- birder/net/_rope_vit_configs.py +430 -0
- birder/net/_vit_configs.py +479 -0
- birder/net/biformer.py +1 -0
- birder/net/cait.py +5 -5
- birder/net/coat.py +12 -12
- birder/net/conv2former.py +3 -3
- birder/net/convmixer.py +1 -1
- birder/net/convnext_v1.py +1 -1
- birder/net/crossvit.py +5 -5
- birder/net/davit.py +1 -1
- birder/net/deit.py +12 -26
- birder/net/deit3.py +42 -189
- birder/net/densenet.py +9 -8
- birder/net/detection/deformable_detr.py +5 -2
- birder/net/detection/detr.py +5 -2
- birder/net/detection/efficientdet.py +1 -1
- birder/net/dpn.py +1 -2
- birder/net/edgenext.py +2 -1
- birder/net/edgevit.py +3 -0
- birder/net/efficientformer_v1.py +2 -1
- birder/net/efficientformer_v2.py +18 -31
- birder/net/efficientnet_v2.py +3 -0
- birder/net/efficientvit_mit.py +5 -5
- birder/net/fasternet.py +2 -2
- birder/net/flexivit.py +22 -43
- birder/net/groupmixformer.py +1 -1
- birder/net/hgnet_v1.py +5 -5
- birder/net/inception_next.py +1 -1
- birder/net/inception_resnet_v1.py +3 -3
- birder/net/inception_resnet_v2.py +7 -4
- birder/net/inception_v3.py +3 -0
- birder/net/inception_v4.py +3 -0
- birder/net/maxvit.py +1 -1
- birder/net/metaformer.py +3 -3
- birder/net/mim/crossmae.py +1 -1
- birder/net/mim/mae_vit.py +1 -1
- birder/net/mim/simmim.py +1 -1
- birder/net/mobilenet_v1.py +0 -9
- birder/net/mobilenet_v2.py +38 -44
- birder/net/{mobilenet_v3_large.py → mobilenet_v3.py} +37 -10
- birder/net/mobilevit_v1.py +5 -32
- birder/net/mobilevit_v2.py +1 -45
- birder/net/moganet.py +8 -5
- birder/net/mvit_v2.py +6 -6
- birder/net/nfnet.py +4 -0
- birder/net/pit.py +1 -1
- birder/net/pvt_v1.py +5 -5
- birder/net/pvt_v2.py +5 -5
- birder/net/repghost.py +1 -30
- birder/net/resmlp.py +2 -2
- birder/net/resnest.py +3 -0
- birder/net/resnet_v1.py +125 -1
- birder/net/resnet_v2.py +75 -1
- birder/net/resnext.py +35 -1
- birder/net/rope_deit3.py +33 -136
- birder/net/rope_flexivit.py +18 -18
- birder/net/rope_vit.py +3 -735
- birder/net/simple_vit.py +22 -16
- birder/net/smt.py +1 -1
- birder/net/squeezenet.py +5 -12
- birder/net/squeezenext.py +0 -24
- birder/net/ssl/capi.py +1 -1
- birder/net/ssl/data2vec.py +1 -1
- birder/net/ssl/dino_v2.py +2 -2
- birder/net/ssl/franca.py +2 -2
- birder/net/ssl/i_jepa.py +1 -1
- birder/net/ssl/ibot.py +1 -1
- birder/net/swiftformer.py +12 -2
- birder/net/swin_transformer_v2.py +1 -1
- birder/net/tiny_vit.py +3 -16
- birder/net/van.py +2 -2
- birder/net/vit.py +35 -963
- birder/net/vit_sam.py +13 -38
- birder/net/xcit.py +7 -6
- birder/tools/introspection.py +1 -1
- birder/tools/model_info.py +3 -1
- birder/version.py +1 -1
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/METADATA +1 -1
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/RECORD +88 -90
- birder/net/mobilenet_v3_small.py +0 -43
- birder/net/se_resnet_v1.py +0 -105
- birder/net/se_resnet_v2.py +0 -59
- birder/net/se_resnext.py +0 -30
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/WHEEL +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/entry_points.txt +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/top_level.txt +0 -0
birder/net/mobilevit_v2.py
CHANGED
@@ -159,12 +159,7 @@ class MobileVitBlock(nn.Module):
         self.norm = nn.GroupNorm(num_groups=1, num_channels=transformer_dim)

         self.conv_proj = Conv2dNormActivation(
-            transformer_dim,
-            channels,
-            kernel_size=(1, 1),
-            stride=(1, 1),
-            padding=(0, 0),
-            activation_layer=nn.SiLU,
+            transformer_dim, channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), activation_layer=None
         )

         self.patch_size = patch_size
@@ -236,7 +231,6 @@ class MobileViT_v2(DetectorBackbone):
             stride=(2, 2),
             padding=(1, 1),
             activation_layer=nn.SiLU,
-            bias=True,
         )

         stages: OrderedDict[str, nn.Module] = OrderedDict()
@@ -340,15 +334,6 @@
         x = self.forward_features(x)
         return self.features(x)

-    def create_classifier(self, embed_dim: Optional[int] = None) -> nn.Module:
-        if self.num_classes == 0:
-            return nn.Identity()
-
-        if embed_dim is None:
-            embed_dim = self.embedding_size
-
-        return nn.Linear(embed_dim, self.num_classes, bias=False)
-

 registry.register_model_config("mobilevit_v2_0_25", MobileViT_v2, config={"width_factor": 0.25})
 registry.register_model_config("mobilevit_v2_0_5", MobileViT_v2, config={"width_factor": 0.5})
@@ -358,32 +343,3 @@ registry.register_model_config("mobilevit_v2_1_25", MobileViT_v2, config={"width
 registry.register_model_config("mobilevit_v2_1_5", MobileViT_v2, config={"width_factor": 1.5})
 registry.register_model_config("mobilevit_v2_1_75", MobileViT_v2, config={"width_factor": 1.75})
 registry.register_model_config("mobilevit_v2_2_0", MobileViT_v2, config={"width_factor": 2.0})
-
-registry.register_weights(
-    "mobilevit_v2_1_0_il-common",
-    {
-        "description": "MobileViT v2 with width multiplier of 1.0 trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 17.6,
-                "sha256": "2b45b7f2ffe3dd129d9a7e9690d2dfd0f93ac60f24d118b920a51bcb950fd95e",
-            }
-        },
-        "net": {"network": "mobilevit_v2_1_0", "tag": "il-common"},
-    },
-)
-registry.register_weights(
-    "mobilevit_v2_1_5_il-common",
-    {
-        "description": "MobileViT v2 with width multiplier of 1.5 trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 38.8,
-                "sha256": "acd28c3ee653b62c69ad765c1d99827cea5051deb6dbdd7b9c8d7612782c86a3",
-            }
-        },
-        "net": {"network": "mobilevit_v2_1_5", "tag": "il-common"},
-    },
-)
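For context on the conv_proj change above: torchvision's Conv2dNormActivation builds conv, norm, and activation in sequence, and passing activation_layer=None omits the activation, so the block's output projection now ends at the normalization layer instead of applying SiLU. A minimal standalone sketch (not birder code; the channel sizes are placeholders):

    import torch.nn as nn
    from torchvision.ops import Conv2dNormActivation

    # Previous behaviour: conv -> norm -> SiLU
    proj_silu = Conv2dNormActivation(
        192, 96, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), activation_layer=nn.SiLU
    )
    # New behaviour: conv -> norm only (no activation appended)
    proj_plain = Conv2dNormActivation(
        192, 96, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), activation_layer=None
    )

    assert any(isinstance(m, nn.SiLU) for m in proj_silu.modules())
    assert not any(isinstance(m, nn.SiLU) for m in proj_plain.modules())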
birder/net/moganet.py
CHANGED
@@ -4,6 +4,9 @@ https://github.com/Westlake-AI/MogaNet/blob/main/models/moganet.py

 Paper "MogaNet: Multi-order Gated Aggregation Network",
 https://arxiv.org/abs/2211.03295
+
+Changes from original:
+* Removed biases before norms
 """

 # Reference license: Apache-2.0
@@ -30,7 +33,7 @@ from birder.net.base import TokenRetentionResultType
 class ElementScale(nn.Module):
     def __init__(self, embed_dims: int, init_value: float) -> None:
         super().__init__()
-        self.scale = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1))
+        self.scale = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1)))

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x * self.scale
@@ -179,14 +182,14 @@ class MogaBlock(nn.Module):
         super().__init__()

         # Spatial attention
-        self.norm1 = nn.BatchNorm2d(embed_dims
+        self.norm1 = nn.BatchNorm2d(embed_dims)
         self.attn = MultiOrderGatedAggregation(
             embed_dims, attn_dw_dilation=attn_dw_dilation, attn_channel_split=attn_channel_split
         )
-        self.layer_scale_1 = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1))
+        self.layer_scale_1 = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1)))

         # Channel MLP
-        self.norm2 = nn.BatchNorm2d(embed_dims
+        self.norm2 = nn.BatchNorm2d(embed_dims)
         mlp_hidden_dim = int(embed_dims * ffn_ratio)
         self.mlp = ChannelAggregationFFN(
             embed_dims=embed_dims,
@@ -194,7 +197,7 @@ class MogaBlock(nn.Module):
             kernel_size=3,
             ffn_drop=drop_rate,
         )
-        self.layer_scale_2 = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1))
+        self.layer_scale_2 = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1)))

         self.drop_path = StochasticDepth(drop_path_rate, mode="row")

birder/net/mvit_v2.py
CHANGED
@@ -178,7 +178,7 @@ class MultiScaleAttention(nn.Module):
                 groups=dim_conv,
                 bias=False,
             )
-            self.norm_q = nn.LayerNorm(dim_conv)
+            self.norm_q = nn.LayerNorm(dim_conv, eps=1e-6)
         else:
             self.pool_q = None
             self.norm_q = None
@@ -193,7 +193,7 @@ class MultiScaleAttention(nn.Module):
                 groups=dim_conv,
                 bias=False,
             )
-            self.norm_k = nn.LayerNorm(dim_conv)
+            self.norm_k = nn.LayerNorm(dim_conv, eps=1e-6)

             self.pool_v = nn.Conv2d(
                 dim_conv,
@@ -204,7 +204,7 @@
                 groups=dim_conv,
                 bias=False,
             )
-            self.norm_v = nn.LayerNorm(dim_conv)
+            self.norm_v = nn.LayerNorm(dim_conv, eps=1e-6)
         else:
             self.pool_k = None
             self.norm_k = None
@@ -291,7 +291,7 @@ class MultiScaleBlock(nn.Module):
         self.dim = dim
         self.dim_out = dim_out
         self.num_heads = num_heads
-        self.norm1 = nn.LayerNorm(dim)
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
         self.has_cls_token = has_cls_token
         self.dim_mul_in_att = dim_mul_in_att

@@ -309,7 +309,7 @@
             has_cls_token=has_cls_token,
         )
         self.drop_path = StochasticDepth(drop_path, mode="row")
-        self.norm2 = nn.LayerNorm(att_dim)
+        self.norm2 = nn.LayerNorm(att_dim, eps=1e-6)
         self.mlp = MLP(att_dim, [int(att_dim * mlp_ratio), dim_out], activation_layer=nn.GELU, inplace=None)

         if self.dim_mul_in_att is True and self.dim != self.dim_out:
@@ -506,7 +506,7 @@ class MViT_v2(DetectorBackbone, PreTrainEncoder, MaskedTokenRetentionMixin):
             input_size = (input_size[0] // stride_q[i][0], input_size[1] // stride_q[i][1])

         self.body = SequentialWithShape(stages)
-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
         self.return_channels = return_channels
         self.embedding_size = embed_dim
         self.classifier = self.create_classifier()
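The LayerNorm edits in this file (and the matching ones in pvt_v1.py and pvt_v2.py below) only pin the epsilon: PyTorch's nn.LayerNorm defaults to eps=1e-5, and these layers now set 1e-6 explicitly. A quick illustration with a placeholder dimension:

    import torch.nn as nn

    print(nn.LayerNorm(96).eps)            # 1e-05, the PyTorch default
    print(nn.LayerNorm(96, eps=1e-6).eps)  # 1e-06, the value now set explicitly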
birder/net/nfnet.py
CHANGED
@@ -3,6 +3,9 @@ Normalizer-Free Networks, adapted from
 https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/nfnet.py

 Paper "High-Performance Large-Scale Image Recognition Without Normalization", https://arxiv.org/abs/2102.06171
+
+Changes from original:
+* Removed dynamic padding
 """

 # Reference license: Apache-2.0
@@ -262,6 +265,7 @@ class NFNet(DetectorBackbone):
         self.body = nn.Sequential(stages)
         self.features = nn.Sequential(
             ScaledStdConv2d(prev_channels, prev_channels * 2, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)),
+            act_layer(),
             nn.AdaptiveAvgPool2d(output_size=(1, 1)),
             nn.Flatten(1),
         )
birder/net/pit.py
CHANGED
@@ -321,7 +321,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 18.4,
-                "sha256": "
+                "sha256": "5f6bd74b09c1ee541ee2ddae4844ce501b4b3218201ea6381fce0b8fc30257f2",
             }
         },
         "net": {"network": "pit_t", "tag": "il-common"},
birder/net/pvt_v1.py
CHANGED
@@ -50,7 +50,7 @@ class Attention(nn.Module):

         if sr_ratio > 1:
             self.sr = nn.Conv2d(dim, dim, kernel_size=(sr_ratio, sr_ratio), stride=(sr_ratio, sr_ratio), padding=(0, 0))
-            self.norm = nn.LayerNorm(dim)
+            self.norm = nn.LayerNorm(dim, eps=1e-6)
         else:
             self.sr = None
             self.norm = None
@@ -90,7 +90,7 @@ class PyramidVisionTransformerBlock(nn.Module):
         drop_path: float,
     ) -> None:
         super().__init__()
-        self.norm1 = nn.LayerNorm(dim)
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
         self.attn = Attention(
             dim,
             num_heads=num_heads,
@@ -100,7 +100,7 @@
             proj_drop=proj_drop,
         )

-        self.norm2 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
         self.mlp = MLP(dim, [int(dim * mlp_ratio), dim], activation_layer=nn.GELU, dropout=proj_drop)
         self.drop_path = StochasticDepth(drop_path, mode="row")

@@ -115,7 +115,7 @@ class PatchEmbed(nn.Module):
     def __init__(self, patch_size: tuple[int, int], in_channels: int, embed_dim: int) -> None:
         super().__init__()
         self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, padding=(0, 0))
-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
@@ -169,7 +169,7 @@ class PyramidVisionTransformerStage(nn.Module):
             ]
         )

-        self.norm = nn.LayerNorm(dim_out)
+        self.norm = nn.LayerNorm(dim_out, eps=1e-6)
         if cls_token is True:
             self.cls_token = nn.Parameter(torch.zeros(1, 1, dim_out))
         else:
birder/net/pvt_v2.py
CHANGED
@@ -85,7 +85,7 @@ class Attention(nn.Module):
             self.sr = nn.Conv2d(
                 dim, dim, kernel_size=(sr_ratio, sr_ratio), stride=(sr_ratio, sr_ratio), padding=(0, 0)
             )
-            self.norm = nn.LayerNorm(dim)
+            self.norm = nn.LayerNorm(dim, eps=1e-6)
         else:
             self.sr = None
             self.norm = None
@@ -93,7 +93,7 @@
             self.pool = nn.AdaptiveAvgPool2d(7)
             self.act = nn.GELU()
             self.sr = nn.Conv2d(dim, dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
-            self.norm = nn.LayerNorm(dim)
+            self.norm = nn.LayerNorm(dim, eps=1e-6)

         assert (self.pool is None and self.act is None) or (self.pool is not None and self.act is not None)

@@ -140,7 +140,7 @@ class PyramidVisionTransformerBlock(nn.Module):
         drop_path: float,
     ) -> None:
         super().__init__()
-        self.norm1 = nn.LayerNorm(dim)
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
         self.attn = Attention(
             dim,
             num_heads=num_heads,
@@ -151,7 +151,7 @@
             proj_drop=proj_drop,
         )

-        self.norm2 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
         self.mlp = MLP(
             in_features=dim,
             hidden_features=int(dim * mlp_ratio),
@@ -179,7 +179,7 @@ class OverlapPatchEmbed(nn.Module):
             stride=stride,
             padding=(patch_size[0] // 2, patch_size[1] // 2),
         )
-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
birder/net/repghost.py
CHANGED
@@ -169,7 +169,7 @@ class RepGhostBottleneck(nn.Module):
         # Squeeze-and-excitation
         if se_ratio > 0:
             self.se = SqueezeExcitation(
-                mid_channels, make_divisible(int(mid_channels * se_ratio), 4),
+                mid_channels, make_divisible(int(mid_channels * se_ratio), 4), scale_activation=nn.Hardsigmoid
             )
         else:
             self.se = nn.Identity()
@@ -351,32 +351,3 @@ registry.register_model_config("repghost_0_5", RepGhost, config={"width": 0.5})
 registry.register_model_config("repghost_1_0", RepGhost, config={"width": 1.0})
 registry.register_model_config("repghost_1_3", RepGhost, config={"width": 1.3})
 registry.register_model_config("repghost_1_5", RepGhost, config={"width": 1.5})
-
-registry.register_weights(
-    "repghost_1_0_il-common",
-    {
-        "description": "RepGhost 1.0x model trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 12.8,
-                "sha256": "37e211ec65c752ad79bbbaacea277f7d683d0b0f69d954a7ca7af46b9a1260e6",
-            }
-        },
-        "net": {"network": "repghost_1_0", "tag": "il-common"},
-    },
-)
-registry.register_weights(
-    "repghost_1_0_il-common_reparameterized",
-    {
-        "description": "RepGhost 1.0x (reparameterized) model trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 12.6,
-                "sha256": "e003e0498d63428305c10f879a0e2b999604795d417f07ea0da35ea925f794f5",
-            }
-        },
-        "net": {"network": "repghost_1_0", "tag": "il-common_reparameterized", "reparameterized": True},
-    },
-)
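The SqueezeExcitation change above only touches the gating activation: torchvision's SqueezeExcitation defaults to a Sigmoid gate, and the block now requests nn.Hardsigmoid explicitly. A small standalone sketch with placeholder channel counts:

    import torch
    import torch.nn as nn
    from torchvision.ops import SqueezeExcitation

    # scale_activation controls the gate applied to the pooled channel descriptor
    se = SqueezeExcitation(input_channels=64, squeeze_channels=16, scale_activation=nn.Hardsigmoid)
    out = se(torch.randn(1, 64, 32, 32))  # same shape as the input, channels rescaled by the gate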
birder/net/resmlp.py
CHANGED
@@ -40,8 +40,8 @@ class LayerScaleMLP(nn.Module):
         self.drop_path = StochasticDepth(drop_path, mode="row")
         self.norm2 = Affine(dim)
         self.mlp = MLP(dim, [int(dim * 4.0), dim], activation_layer=nn.GELU, dropout=drop)
-        self.gamma_1 = nn.Parameter(init_value * torch.ones((dim))
-        self.gamma_2 = nn.Parameter(init_value * torch.ones((dim))
+        self.gamma_1 = nn.Parameter(init_value * torch.ones((dim)))
+        self.gamma_2 = nn.Parameter(init_value * torch.ones((dim)))

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2))
birder/net/resnest.py
CHANGED
@@ -3,6 +3,9 @@ ResNeSt, adapted from
 https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/resnest.py

 Paper "ResNeSt: Split-Attention Networks", https://arxiv.org/abs/2004.08955
+
+Changes from original:
+* Removed bias from fc1 (SplitAttn)
 """

 # Reference license: Apache-2.0
birder/net/resnet_v1.py
CHANGED
@@ -4,6 +4,8 @@ https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py

 Paper "Deep Residual Learning for Image Recognition", https://arxiv.org/abs/1512.03385
 and
+Paper "Squeeze-and-Excitation Networks", https://arxiv.org/abs/1709.01507
+and
 Paper "Bag of Tricks for Image Classification with Convolutional Neural Networks",
 https://arxiv.org/abs/1812.01187
 """
@@ -101,7 +103,6 @@ class ResNet_v1(DetectorBackbone):
         *,
         config: Optional[dict[str, Any]] = None,
         size: Optional[tuple[int, int]] = None,
-        squeeze_excitation: bool = False,
     ) -> None:
         super().__init__(input_channels, num_classes, config=config, size=size)
         assert self.config is not None, "must set config"
@@ -110,6 +111,7 @@
         filter_list: list[int] = self.config["filter_list"]
         units: list[int] = self.config["units"]
         pooling_param: Optional[float] = self.config.get("pooling_param", None)
+        squeeze_excitation: bool = self.config.get("squeeze_excitation", False)
         deep_stem: bool = self.config.get("deep_stem", False)
         avg_down: bool = self.config.get("avg_down", False)

@@ -302,6 +304,128 @@ registry.register_model_config(
     },
 )

+# Squeeze-and-Excitation Networks
+registry.register_model_config(
+    "se_resnet_v1_18",
+    ResNet_v1,
+    config={
+        "bottle_neck": False,
+        "filter_list": [64, 64, 128, 256, 512],
+        "units": [2, 2, 2, 2],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_34",
+    ResNet_v1,
+    config={
+        "bottle_neck": False,
+        "filter_list": [64, 64, 128, 256, 512],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_50",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_101",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 23, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_152",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 8, 36, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_200",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 24, 36, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_269",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 30, 48, 8],
+        "squeeze_excitation": True,
+    },
+)
+
+# SE-ResNet-D variants with SE
+registry.register_model_config(
+    "se_resnet_d_50",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_d_101",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 23, 3],
+        "squeeze_excitation": True,
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_d_152",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 8, 36, 3],
+        "squeeze_excitation": True,
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_d_200",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 24, 36, 3],
+        "squeeze_excitation": True,
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+
 registry.register_weights(
     "resnet_v1_50_arabian-peninsula",
     {
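The net effect of the changes in this file (and the parallel ones in resnet_v2.py and resnext.py) is that squeeze-and-excitation is no longer a constructor keyword; it is read from the config dict, which is how the new se_* names are registered above. A hedged usage sketch, assuming the positional input_channels/num_classes parameters implied by the super().__init__ call shown in the diff:

    from birder.net.resnet_v1 import ResNet_v1

    # Roughly equivalent to the registered "se_resnet_v1_50" configuration
    net = ResNet_v1(
        3,     # input_channels (assumed positional parameter)
        1000,  # num_classes (assumed positional parameter)
        config={
            "bottle_neck": True,
            "filter_list": [64, 256, 512, 1024, 2048],
            "units": [3, 4, 6, 3],
            "squeeze_excitation": True,  # previously squeeze_excitation=True on the constructor
        },
    )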
birder/net/resnet_v2.py
CHANGED
@@ -3,6 +3,8 @@ ResNet v2, adapted from
 https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/resnetv2.py

 Paper "Identity Mappings in Deep Residual Networks", https://arxiv.org/abs/1603.05027
+and
+Paper "Squeeze-and-Excitation Networks", https://arxiv.org/abs/1709.01507
 """

 # Reference license: Apache-2.0
@@ -98,7 +100,6 @@ class ResNet_v2(DetectorBackbone):
         *,
         config: Optional[dict[str, Any]] = None,
         size: Optional[tuple[int, int]] = None,
-        squeeze_excitation: bool = False,
     ) -> None:
         super().__init__(input_channels, num_classes, config=config, size=size)
         assert self.config is not None, "must set config"
@@ -106,6 +107,7 @@
         bottle_neck: bool = self.config["bottle_neck"]
         filter_list: list[int] = self.config["filter_list"]
         units: list[int] = self.config["units"]
+        squeeze_excitation: bool = self.config.get("squeeze_excitation", False)

         assert len(units) + 1 == len(filter_list)
         num_unit = len(units)
@@ -231,3 +233,75 @@ registry.register_model_config(
     ResNet_v2,
     config={"bottle_neck": True, "filter_list": [64, 256, 512, 1024, 2048], "units": [3, 30, 48, 8]},
 )
+
+# Squeeze-and-Excitation Networks
+registry.register_model_config(
+    "se_resnet_v2_18",
+    ResNet_v2,
+    config={
+        "bottle_neck": False,
+        "filter_list": [64, 64, 128, 256, 512],
+        "units": [2, 2, 2, 2],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_34",
+    ResNet_v2,
+    config={
+        "bottle_neck": False,
+        "filter_list": [64, 64, 128, 256, 512],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_50",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_101",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 23, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_152",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 8, 36, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_200",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 24, 36, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_269",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 30, 48, 8],
+        "squeeze_excitation": True,
+    },
+)
birder/net/resnext.py
CHANGED
@@ -3,6 +3,11 @@ ResNeXt, adapted from
 https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py

 Paper "Aggregated Residual Transformations for Deep Neural Networks", https://arxiv.org/abs/1611.05431
+and
+Paper "Squeeze-and-Excitation Networks", https://arxiv.org/abs/1709.01507
+and
+Paper "Bag of Tricks for Image Classification with Convolutional Neural Networks",
+https://arxiv.org/abs/1812.01187
 """

 # Reference license: BSD 3-Clause
@@ -117,7 +122,6 @@ class ResNeXt(DetectorBackbone):
         *,
         config: Optional[dict[str, Any]] = None,
         size: Optional[tuple[int, int]] = None,
-        squeeze_excitation: bool = False,
     ) -> None:
         super().__init__(input_channels, num_classes, config=config, size=size)
         assert self.config is not None, "must set config"
@@ -127,6 +131,7 @@
         base_width: int = self.config.get("base_width", 4)
         filter_list = [64, 128, 256, 512]
         units: list[int] = self.config["units"]
+        squeeze_excitation: bool = self.config.get("squeeze_excitation", False)
         deep_stem: bool = self.config.get("deep_stem", False)
         avg_down: bool = self.config.get("avg_down", False)

@@ -251,3 +256,32 @@ registry.register_model_config(
 registry.register_model_config(
     "resnext_d_152", ResNeXt, config={"units": [3, 8, 36, 3], "deep_stem": True, "avg_down": True}
 )
+
+# Squeeze-and-Excitation Networks
+registry.register_model_config("se_resnext_50", ResNeXt, config={"units": [3, 4, 6, 3], "squeeze_excitation": True})
+registry.register_model_config("se_resnext_101", ResNeXt, config={"units": [3, 4, 23, 3], "squeeze_excitation": True})
+registry.register_model_config("se_resnext_152", ResNeXt, config={"units": [3, 8, 36, 3], "squeeze_excitation": True})
+
+registry.register_model_config(
+    "se_resnext_101_32x8", ResNeXt, config={"units": [3, 4, 23, 3], "base_width": 8, "squeeze_excitation": True}
+)
+registry.register_model_config(
+    "se_resnext_101_64x4", ResNeXt, config={"units": [3, 4, 23, 3], "groups": 64, "squeeze_excitation": True}
+)
+
+# SE-ResNeXt-D variants with SE
+registry.register_model_config(
+    "se_resnext_d_50",
+    ResNeXt,
+    config={"units": [3, 4, 6, 3], "squeeze_excitation": True, "deep_stem": True, "avg_down": True},
+)
+registry.register_model_config(
+    "se_resnext_d_101",
+    ResNeXt,
+    config={"units": [3, 4, 23, 3], "squeeze_excitation": True, "deep_stem": True, "avg_down": True},
+)
+registry.register_model_config(
+    "se_resnext_d_152",
+    ResNeXt,
+    config={"units": [3, 8, 36, 3], "squeeze_excitation": True, "deep_stem": True, "avg_down": True},
+)