birder 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/common/fs_ops.py +2 -2
- birder/introspection/attention_rollout.py +1 -1
- birder/introspection/transformer_attribution.py +1 -1
- birder/layers/layer_scale.py +1 -1
- birder/net/__init__.py +2 -10
- birder/net/_rope_vit_configs.py +430 -0
- birder/net/_vit_configs.py +479 -0
- birder/net/biformer.py +1 -0
- birder/net/cait.py +5 -5
- birder/net/coat.py +12 -12
- birder/net/conv2former.py +3 -3
- birder/net/convmixer.py +1 -1
- birder/net/convnext_v1.py +1 -1
- birder/net/crossvit.py +5 -5
- birder/net/davit.py +1 -1
- birder/net/deit.py +12 -26
- birder/net/deit3.py +42 -189
- birder/net/densenet.py +9 -8
- birder/net/detection/deformable_detr.py +5 -2
- birder/net/detection/detr.py +5 -2
- birder/net/detection/efficientdet.py +1 -1
- birder/net/dpn.py +1 -2
- birder/net/edgenext.py +2 -1
- birder/net/edgevit.py +3 -0
- birder/net/efficientformer_v1.py +2 -1
- birder/net/efficientformer_v2.py +18 -31
- birder/net/efficientnet_v2.py +3 -0
- birder/net/efficientvit_mit.py +5 -5
- birder/net/fasternet.py +2 -2
- birder/net/flexivit.py +22 -43
- birder/net/groupmixformer.py +1 -1
- birder/net/hgnet_v1.py +5 -5
- birder/net/inception_next.py +1 -1
- birder/net/inception_resnet_v1.py +3 -3
- birder/net/inception_resnet_v2.py +7 -4
- birder/net/inception_v3.py +3 -0
- birder/net/inception_v4.py +3 -0
- birder/net/maxvit.py +1 -1
- birder/net/metaformer.py +3 -3
- birder/net/mim/crossmae.py +1 -1
- birder/net/mim/mae_vit.py +1 -1
- birder/net/mim/simmim.py +1 -1
- birder/net/mobilenet_v1.py +0 -9
- birder/net/mobilenet_v2.py +38 -44
- birder/net/{mobilenet_v3_large.py → mobilenet_v3.py} +37 -10
- birder/net/mobilevit_v1.py +5 -32
- birder/net/mobilevit_v2.py +1 -45
- birder/net/moganet.py +8 -5
- birder/net/mvit_v2.py +6 -6
- birder/net/nfnet.py +4 -0
- birder/net/pit.py +1 -1
- birder/net/pvt_v1.py +5 -5
- birder/net/pvt_v2.py +5 -5
- birder/net/repghost.py +1 -30
- birder/net/resmlp.py +2 -2
- birder/net/resnest.py +3 -0
- birder/net/resnet_v1.py +125 -1
- birder/net/resnet_v2.py +75 -1
- birder/net/resnext.py +35 -1
- birder/net/rope_deit3.py +33 -136
- birder/net/rope_flexivit.py +18 -18
- birder/net/rope_vit.py +3 -735
- birder/net/simple_vit.py +22 -16
- birder/net/smt.py +1 -1
- birder/net/squeezenet.py +5 -12
- birder/net/squeezenext.py +0 -24
- birder/net/ssl/capi.py +1 -1
- birder/net/ssl/data2vec.py +1 -1
- birder/net/ssl/dino_v2.py +2 -2
- birder/net/ssl/franca.py +2 -2
- birder/net/ssl/i_jepa.py +1 -1
- birder/net/ssl/ibot.py +1 -1
- birder/net/swiftformer.py +12 -2
- birder/net/swin_transformer_v2.py +1 -1
- birder/net/tiny_vit.py +3 -16
- birder/net/van.py +2 -2
- birder/net/vit.py +35 -963
- birder/net/vit_sam.py +13 -38
- birder/net/xcit.py +7 -6
- birder/tools/introspection.py +1 -1
- birder/tools/model_info.py +3 -1
- birder/version.py +1 -1
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/METADATA +1 -1
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/RECORD +88 -90
- birder/net/mobilenet_v3_small.py +0 -43
- birder/net/se_resnet_v1.py +0 -105
- birder/net/se_resnet_v2.py +0 -59
- birder/net/se_resnext.py +0 -30
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/WHEEL +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/entry_points.txt +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/top_level.txt +0 -0
birder/net/efficientvit_mit.py
CHANGED
```diff
@@ -54,7 +54,7 @@ class DSConv(nn.Module):
             in_channels,
             kernel_size=kernel_size,
             stride=stride,
-            padding=(kernel_size[0] // 2, kernel_size[1] // 2),
+            padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
             groups=in_channels,
             norm_layer=norm_layer[0],
             activation_layer=act_layer[0],
@@ -96,7 +96,7 @@ class ConvBlock(nn.Module):
             mid_channels,
             kernel_size=kernel_size,
             stride=stride,
-            padding=(kernel_size[0] // 2, kernel_size[1] // 2),
+            padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
             norm_layer=norm_layer[0],
             activation_layer=act_layer[0],
             inplace=None,
@@ -106,7 +106,7 @@ class ConvBlock(nn.Module):
             out_channels,
             kernel_size=kernel_size,
             stride=(1, 1),
-            padding=(kernel_size[0] // 2, kernel_size[1] // 2),
+            padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
             norm_layer=norm_layer[1],
             activation_layer=act_layer[1],
             inplace=None,
@@ -148,7 +148,7 @@ class MBConv(nn.Module):
             mid_channels,
             kernel_size=kernel_size,
             stride=stride,
-            padding=(kernel_size[0] // 2, kernel_size[1] // 2),
+            padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
             groups=mid_channels,
             norm_layer=norm_layer[1],
             activation_layer=act_layer[1],
@@ -192,7 +192,7 @@ class FusedMBConv(nn.Module):
             mid_channels,
             kernel_size=kernel_size,
             stride=stride,
-            padding=(kernel_size[0] // 2, kernel_size[1] // 2),
+            padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
             norm_layer=norm_layer[0],
             activation_layer=act_layer[0],
             inplace=None,
```
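The recurring padding change above only matters for even kernel sizes: for odd kernels both formulas give identical "same"-style padding, while `k // 2` over-pads even kernels. A quick standalone check (illustration only, not birder code):

```python
import torch
import torch.nn as nn

x = torch.randn(1, 8, 32, 32)
for k in (3, 4, 5):
    old = nn.Conv2d(8, 8, kernel_size=k, stride=1, padding=k // 2)
    new = nn.Conv2d(8, 8, kernel_size=k, stride=1, padding=(k - 1) // 2)
    # Odd k: both preserve the 32x32 map. Even k: k // 2 grows it, (k - 1) // 2 shrinks it.
    print(k, old(x).shape[-1], new(x).shape[-1])  # 3: 32 32 | 4: 33 31 | 5: 32 32
```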
birder/net/fasternet.py
CHANGED
```diff
@@ -50,10 +50,10 @@ class MLPBlock(nn.Module):
         mlp_hidden_dim = int(dim * mlp_ratio)
         self.spatial_mixing = PartialConv(dim, n_div)
         self.mlp = nn.Sequential(
-            nn.Conv2d(dim, mlp_hidden_dim, 1, bias=False),
+            nn.Conv2d(dim, mlp_hidden_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
             nn.BatchNorm2d(mlp_hidden_dim),
             act_layer(),
-            nn.Conv2d(mlp_hidden_dim, dim, 1, bias=False),
+            nn.Conv2d(mlp_hidden_dim, dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
         )
         self.drop_path = StochasticDepth(drop_path, mode="row")
```
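This change is purely cosmetic: a positional `1` and the explicit keyword form construct the same 1x1 convolution, since stride 1 and padding 0 are the `nn.Conv2d` defaults. A quick equivalence check (illustration only):

```python
import torch.nn as nn

a = nn.Conv2d(32, 64, 1, bias=False)
b = nn.Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False)
# nn.Conv2d normalizes scalar arguments to tuples, so the two modules match.
assert (a.kernel_size, a.stride, a.padding) == (b.kernel_size, b.stride, b.padding)
```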
birder/net/flexivit.py
CHANGED
```diff
@@ -22,6 +22,8 @@ from birder.layers import MultiHeadAttentionPool
 from birder.layers import SwiGLU_FFN
 from birder.layers.activations import get_activation_module
 from birder.model_registry import registry
+from birder.net._vit_configs import BASE
+from birder.net._vit_configs import SMALL
 from birder.net.base import DetectorBackbone
 from birder.net.base import MaskedTokenOmissionMixin
 from birder.net.base import MaskedTokenRetentionMixin
@@ -583,70 +585,47 @@ class FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Mask
 registry.register_model_config(
     "flexivit_s16",
     FlexiViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "drop_path_rate": 0.0,
-    },
+    config={"patch_size": 16, **SMALL},
 )
 registry.register_model_config(
     "flexivit_s16_ls",
     FlexiViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "layer_scale_init_value": 1e-5,
-        "drop_path_rate": 0.0,
-    },
+    config={"patch_size": 16, **SMALL, "layer_scale_init_value": 1e-5},
 )
+registry.register_model_config(
+    "flexivit_b16",
+    FlexiViT,
+    config={"patch_size": 16, **BASE},
+)
+
+# With registers
+####################
+
 registry.register_model_config(
     "flexivit_reg1_s16",
     FlexiViT,
-    config={
-        "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
-        "num_reg_tokens": 1,
-        "drop_path_rate": 0.0,
-    },
+    config={"patch_size": 16, **SMALL, "num_reg_tokens": 1},
 )
 registry.register_model_config(
     "flexivit_reg1_s16_rms_ls",
     FlexiViT,
     config={
         "patch_size": 16,
-        "num_layers": 12,
-        "num_heads": 6,
-        "hidden_dim": 384,
-        "mlp_dim": 1536,
+        **SMALL,
         "layer_scale_init_value": 1e-5,
         "num_reg_tokens": 1,
         "norm_layer_type": "RMSNorm",
-        "drop_path_rate": 0.0,
     },
 )
+registry.register_model_config(
+    "flexivit_reg4_b16",
+    FlexiViT,
+    config={"patch_size": 16, **BASE, "num_reg_tokens": 4},
+)
 registry.register_model_config(
     "flexivit_reg8_b14_ap",
     FlexiViT,
-    config={
-        "patch_size": 14,
-        "num_layers": 12,
-        "num_heads": 12,
-        "hidden_dim": 768,
-        "mlp_dim": 3072,
-        "num_reg_tokens": 8,
-        "class_token": False,
-        "attn_pool_head": True,
-        "drop_path_rate": 0.1,
-    },
+    config={"patch_size": 14, **BASE, "num_reg_tokens": 8, "class_token": False, "attn_pool_head": True},
 )

 registry.register_weights(
@@ -661,7 +640,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 83.6,
-                "sha256": "
+                "sha256": "8285f4fe56401f169491cb2399d2a7c82f3a0cfbe8a5a8d3c27163024a274800",
             },
         },
         "net": {"network": "flexivit_reg1_s16_rms_ls", "tag": "dino-v2-il-all"},
```
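The deleted inline dicts move into the new `birder/net/_vit_configs.py` presets, and `**SMALL` unpacking keeps the usual override semantics: later keys win. A sketch using the values from the dicts this diff removes (the exact contents of `_vit_configs.SMALL` are assumed to match them):

```python
# Assumed to mirror _vit_configs.SMALL, based on the inline dicts removed above
SMALL = {"num_layers": 12, "num_heads": 6, "hidden_dim": 384, "mlp_dim": 1536, "drop_path_rate": 0.0}

config = {"patch_size": 16, **SMALL, "layer_scale_init_value": 1e-5}
assert config["hidden_dim"] == 384
assert config["layer_scale_init_value"] == 1e-5
# Keys written after the unpack override the preset:
assert {**SMALL, "drop_path_rate": 0.1}["drop_path_rate"] == 0.1
```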
birder/net/groupmixformer.py
CHANGED
birder/net/hgnet_v1.py
CHANGED
```diff
@@ -23,8 +23,8 @@ from birder.net.base import DetectorBackbone
 class LearnableAffineBlock(nn.Module):
     def __init__(self, scale_value: float, bias_value: float) -> None:
         super().__init__()
-        self.scale = nn.Parameter(torch.tensor([scale_value])
-        self.bias = nn.Parameter(torch.tensor([bias_value])
+        self.scale = nn.Parameter(torch.tensor([scale_value]))
+        self.bias = nn.Parameter(torch.tensor([bias_value]))

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.scale * x + self.bias
@@ -90,7 +90,7 @@ class LightConvBNAct(nn.Module):
             out_channels,
             kernel_size=kernel_size,
             stride=(1, 1),
-            padding=(kernel_size[0] // 2, kernel_size[1] // 2),
+            padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
             groups=out_channels,
             use_act=True,
             use_lab=use_lab,
@@ -180,7 +180,7 @@ class HighPerfGPUBlock(nn.Module):
             mid_channels,
             kernel_size=kernel_size,
             stride=(1, 1),
-            padding=(kernel_size[0] // 2, kernel_size[1] // 2),
+            padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
             groups=1,
             use_act=True,
             use_lab=use_lab,
@@ -267,7 +267,7 @@ class HighPerfGPUStage(nn.Module):
             in_channels,
             kernel_size=(3, 3),
             stride=stride,
-            padding=(
+            padding=(1, 1),
             groups=in_channels,
             use_act=False,
             use_lab=False,
```
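For reference, the `LearnableAffineBlock` touched above is a tiny module: two shape-`[1]` parameters applying `y = scale * x + bias`, broadcast over any input. A standalone version assembled from the hunk shown:

```python
import torch
import torch.nn as nn

class LearnableAffineBlock(nn.Module):
    def __init__(self, scale_value: float, bias_value: float) -> None:
        super().__init__()
        self.scale = nn.Parameter(torch.tensor([scale_value]))
        self.bias = nn.Parameter(torch.tensor([bias_value]))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.scale * x + self.bias

lab = LearnableAffineBlock(1.0, 0.0)
print(lab(torch.randn(2, 16, 8, 8)).shape)  # torch.Size([2, 16, 8, 8])
```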
birder/net/inception_next.py
CHANGED
```diff
@@ -110,7 +110,7 @@ class InceptionNeXtBlock(nn.Module):
             nn.BatchNorm2d(channels),
             ConvMLP(channels, hidden_features=int(mlp_ratio * channels), out_features=channels),
         )
-        self.layer_scale = nn.Parameter(torch.ones(channels, 1, 1) * layer_scale
+        self.layer_scale = nn.Parameter(torch.ones(channels, 1, 1) * layer_scale)
         self.stochastic_depth = StochasticDepth(stochastic_depth_prob, mode="row")

     def forward(self, x: torch.Tensor) -> torch.Tensor:
```
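The `layer_scale` parameter fixed here has shape `(C, 1, 1)`, so it scales each channel of an NCHW tensor independently via broadcasting (illustration only):

```python
import torch

layer_scale = torch.ones(64, 1, 1) * 1e-6  # one scale per channel
x = torch.randn(2, 64, 14, 14)
print((x * layer_scale).shape)  # torch.Size([2, 64, 14, 14])
```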
birder/net/inception_resnet_v1.py
CHANGED
```diff
@@ -33,7 +33,7 @@ class InceptionBlockA(nn.Module):
             Conv2dNormActivation(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
         )

-        self.conv2d = nn.Conv2d(96, 256, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)
+        self.conv2d = nn.Conv2d(96, 256, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
         self.relu = nn.ReLU(inplace=True)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -84,7 +84,7 @@ class InceptionBlockB(nn.Module):
             Conv2dNormActivation(128, 128, kernel_size=(7, 1), stride=(1, 1), padding=(3, 0), bias=False),
         )

-        self.conv2d = nn.Conv2d(256, 896, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)
+        self.conv2d = nn.Conv2d(256, 896, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
         self.relu = nn.ReLU(inplace=True)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -140,7 +140,7 @@ class InceptionBlockC(nn.Module):
             Conv2dNormActivation(192, 192, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0), bias=False),
         )

-        self.conv2d = nn.Conv2d(384, 1792, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)
+        self.conv2d = nn.Conv2d(384, 1792, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
         self.relu = nn.ReLU(inplace=True)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
```
birder/net/inception_resnet_v2.py
CHANGED
```diff
@@ -4,6 +4,9 @@ https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/incept

 Paper "Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning",
 https://arxiv.org/abs/1602.07261
+
+Changes from original:
+* Using nn.BatchNorm2d with eps 1e-5 instead of 1e-3
 """

 # Reference license: Apache-2.0
@@ -35,7 +38,7 @@ class StemBlock(nn.Module):
             Conv2dNormActivation(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
         )
         self.branch_pool = nn.Sequential(
-            nn.AvgPool2d(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
+            nn.AvgPool2d(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), count_include_pad=False),
             Conv2dNormActivation(in_channels, 64, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
         )

@@ -66,7 +69,7 @@ class InceptionBlockA(nn.Module):
             Conv2dNormActivation(48, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
         )

-        self.conv2d = nn.Conv2d(128, 320, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)
+        self.conv2d = nn.Conv2d(128, 320, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
         self.relu = nn.ReLU(inplace=True)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -117,7 +120,7 @@ class InceptionBlockB(nn.Module):
             Conv2dNormActivation(160, 192, kernel_size=(7, 1), stride=(1, 1), padding=(3, 0), bias=False),
         )

-        self.conv2d = nn.Conv2d(384, 1088, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)
+        self.conv2d = nn.Conv2d(384, 1088, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
         self.relu = nn.ReLU(inplace=True)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -174,7 +177,7 @@ class InceptionBlockC(nn.Module):
             Conv2dNormActivation(224, 256, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0), bias=False),
         )

-        self.conv2d = nn.Conv2d(448, 2080, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)
+        self.conv2d = nn.Conv2d(448, 2080, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
         self.relu = nn.ReLU(inplace=True)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
```
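`count_include_pad=False`, added to the StemBlock's average pool, changes border behavior: padded zeros are excluded from the averaging denominator. A small demonstration (illustration only):

```python
import torch
import torch.nn as nn

x = torch.ones(1, 1, 4, 4)
incl = nn.AvgPool2d(kernel_size=3, stride=1, padding=1, count_include_pad=True)
excl = nn.AvgPool2d(kernel_size=3, stride=1, padding=1, count_include_pad=False)
# At a corner, only 4 of the 9 window positions are real pixels:
print(incl(x)[0, 0, 0, 0].item())  # 0.444... (sum 4 divided by 9)
print(excl(x)[0, 0, 0, 0].item())  # 1.0     (sum 4 divided by 4)
```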
birder/net/inception_v3.py
CHANGED
```diff
@@ -3,6 +3,9 @@ Inception v3, adapted from
 https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py

 Paper "Rethinking the Inception Architecture for Computer Vision", https://arxiv.org/abs/1512.00567
+
+Changes from original:
+* Using nn.BatchNorm2d with eps 1e-5 instead of 1e-3
 """

 # Reference license: BSD 3-Clause
```
birder/net/inception_v4.py
CHANGED
```diff
@@ -4,6 +4,9 @@ https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/incept

 Paper "Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning",
 https://arxiv.org/abs/1602.07261
+
+Changes from original:
+* Using nn.BatchNorm2d with eps 1e-5 instead of 1e-3
 """

 # Reference license: Apache-2.0
```
birder/net/maxvit.py
CHANGED
```diff
@@ -82,7 +82,7 @@ class MBConv(nn.Module):

         if stride[0] != 1 or stride[1] != 1 or in_channels != out_channels:
             self.proj = nn.Sequential(
-                nn.AvgPool2d(kernel_size=(
+                nn.AvgPool2d(kernel_size=(3, 3), stride=stride, padding=(1, 1)),
                 nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=True),
             )
         else:
```
birder/net/metaformer.py
CHANGED
```diff
@@ -81,7 +81,7 @@ class ConvMLP(nn.Module):
 class Scale(nn.Module):
     def __init__(self, dim: int, init_value: float) -> None:
         super().__init__()
-        self.scale = nn.Parameter(init_value * torch.ones(dim, 1, 1)
+        self.scale = nn.Parameter(init_value * torch.ones(dim, 1, 1))

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x * self.scale
@@ -104,8 +104,8 @@ class StarReLU(nn.Module):
     def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0, inplace: bool = False) -> None:
         super().__init__()
         self.relu = nn.ReLU(inplace=inplace)
-        self.scale = nn.Parameter(scale_value * torch.ones(1)
-        self.bias = nn.Parameter(bias_value * torch.ones(1)
+        self.scale = nn.Parameter(scale_value * torch.ones(1))
+        self.bias = nn.Parameter(bias_value * torch.ones(1))

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.scale * self.relu(x) ** 2 + self.bias
```
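The `StarReLU` being repaired computes `s * relu(x)**2 + b` with learnable scalar scale and bias; a standalone version assembled from the hunk above:

```python
import torch
import torch.nn as nn

class StarReLU(nn.Module):
    def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0) -> None:
        super().__init__()
        self.relu = nn.ReLU()
        self.scale = nn.Parameter(scale_value * torch.ones(1))
        self.bias = nn.Parameter(bias_value * torch.ones(1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.scale * self.relu(x) ** 2 + self.bias

print(StarReLU()(torch.tensor([-1.0, 2.0])))  # tensor([0., 4.], grad_fn=<AddBackward0>)
```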
birder/net/mim/crossmae.py
CHANGED
```diff
@@ -113,7 +113,7 @@ class CrossMAE(MIMBaseNet):
             dim=decoder_embed_dim,
             num_special_tokens=0,
         ).unsqueeze(0)
-        self.decoder_pos_embed = nn.
+        self.decoder_pos_embed = nn.Buffer(pos_embedding)

         self.decoder_layers = nn.ModuleList()
         for _ in range(decoder_depth):
```
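The MIM decoders now store the positional table through `nn.Buffer`. In PyTorch 2.5+, assigning an `nn.Buffer` to a module attribute registers it exactly like `register_buffer`: saved in the `state_dict` and moved with `.to()`, but never trained. A small illustration (not birder code):

```python
import torch
import torch.nn as nn

class Decoder(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # Equivalent to self.register_buffer("pos_embed", torch.zeros(1, 16, 32))
        self.pos_embed = nn.Buffer(torch.zeros(1, 16, 32))

m = Decoder()
print([name for name, _ in m.named_buffers()])  # ['pos_embed']
print(sum(p.numel() for p in m.parameters()))   # 0 (not a trainable parameter)
```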
birder/net/mim/mae_vit.py
CHANGED
```diff
@@ -67,7 +67,7 @@ class MAE_ViT(MIMBaseNet):
             dim=decoder_embed_dim,
             num_special_tokens=self.encoder.num_special_tokens,
         )
-        self.decoder_pos_embed = nn.
+        self.decoder_pos_embed = nn.Buffer(pos_embedding)

         layers = []
         for _ in range(decoder_depth):
```
birder/net/mim/simmim.py
CHANGED
```diff
@@ -83,7 +83,7 @@ class SimMIM(MIMBaseNet):
             bias=True,
         )

-        self.mask_token = nn.Parameter(torch.zeros(1, 1, 1, self.encoder.stem_width)
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, 1, self.encoder.stem_width))

         # Weights initialization
         nn.init.trunc_normal_(self.mask_token, mean=0.0, std=0.02)
```
CHANGED
|
@@ -107,15 +107,6 @@ class MobileNet_v1(DetectorBackbone):
|
|
|
107
107
|
|
|
108
108
|
self.body = nn.Sequential(stages)
|
|
109
109
|
self.features = nn.Sequential(
|
|
110
|
-
Conv2dNormActivation(
|
|
111
|
-
base * 32,
|
|
112
|
-
base * 32,
|
|
113
|
-
kernel_size=(1, 1),
|
|
114
|
-
stride=(1, 1),
|
|
115
|
-
padding=(0, 0),
|
|
116
|
-
bias=False,
|
|
117
|
-
activation_layer=None,
|
|
118
|
-
),
|
|
119
110
|
nn.AdaptiveAvgPool2d(output_size=(1, 1)),
|
|
120
111
|
nn.Flatten(1),
|
|
121
112
|
)
|
birder/net/mobilenet_v2.py
CHANGED
```diff
@@ -37,36 +37,44 @@ class InvertedResidual(nn.Module):
         num_expfilter = int(round(in_channels * expansion_factor))

         self.shortcut = shortcut
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        layers = []
+        if expansion_factor != 1.0:
+            layers.append(
+                Conv2dNormActivation(
+                    in_channels,
+                    num_expfilter,
+                    kernel_size=(1, 1),
+                    stride=(1, 1),
+                    padding=(0, 0),
+                    bias=False,
+                    activation_layer=activation_layer,
+                )
+            )
+
+        layers.extend(
+            [
+                Conv2dNormActivation(
+                    num_expfilter,
+                    num_expfilter,
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    groups=num_expfilter,
+                    bias=False,
+                    activation_layer=activation_layer,
+                ),
+                Conv2dNormActivation(
+                    num_expfilter,
+                    out_channels,
+                    kernel_size=(1, 1),
+                    stride=(1, 1),
+                    padding=(0, 0),
+                    bias=False,
+                    activation_layer=None,
+                ),
+            ]
         )
+        self.block = nn.Sequential(*layers)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         if self.shortcut is True:
@@ -171,6 +179,7 @@ class MobileNet_v2(DetectorBackbone):
             ),
             nn.AdaptiveAvgPool2d(output_size=(1, 1)),
             nn.Flatten(1),
+            nn.Dropout(0.2),
         )
         self.return_channels = return_channels[1:5]
         self.embedding_size = last_channels
@@ -230,18 +239,3 @@ registry.register_model_config("mobilenet_v2_1_25", MobileNet_v2, config={"alpha
 registry.register_model_config("mobilenet_v2_1_5", MobileNet_v2, config={"alpha": 1.5})
 registry.register_model_config("mobilenet_v2_1_75", MobileNet_v2, config={"alpha": 1.75})
 registry.register_model_config("mobilenet_v2_2_0", MobileNet_v2, config={"alpha": 2.0})
-
-registry.register_weights(
-    "mobilenet_v2_1_0_il-common",
-    {
-        "description": "MobileNet v2 (1.0 multiplier) model trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 10.6,
-                "sha256": "d6182293e98c102026f7cdc0d446aaf0e511232173c4b98c1a882c9f147be6e7",
-            }
-        },
-        "net": {"network": "mobilenet_v2_1_0", "tag": "il-common"},
-    },
-)
```
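The rebuilt `InvertedResidual` now skips the 1x1 expansion convolution when `expansion_factor` is 1.0, matching the MobileNet v2 paper's first bottleneck stage. A rough standalone sketch of the same structure, using torchvision's `Conv2dNormActivation` in place of birder's helper (assumed to behave similarly):

```python
import torch
import torch.nn as nn
from torchvision.ops import Conv2dNormActivation

def inverted_residual(in_ch: int, out_ch: int, expansion: float) -> nn.Sequential:
    mid = int(round(in_ch * expansion))
    layers = []
    if expansion != 1.0:  # expansion 1 means no pointwise expansion at all
        layers.append(Conv2dNormActivation(in_ch, mid, kernel_size=1, bias=False))
    layers.extend(
        [
            Conv2dNormActivation(mid, mid, kernel_size=3, groups=mid, bias=False),  # depthwise
            Conv2dNormActivation(mid, out_ch, kernel_size=1, bias=False, activation_layer=None),  # linear
        ]
    )
    return nn.Sequential(*layers)

print(len(inverted_residual(16, 16, 1.0)), len(inverted_residual(16, 24, 6.0)))  # 2 3
print(inverted_residual(32, 32, 6.0)(torch.randn(1, 32, 8, 8)).shape)  # torch.Size([1, 32, 8, 8])
```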
birder/net/{mobilenet_v3_large.py → mobilenet_v3.py}
RENAMED
```diff
@@ -3,6 +3,9 @@ MobileNet v3, adapted from
 https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py

 Paper "Searching for MobileNetV3", https://arxiv.org/abs/1905.02244
+
+Changes from original:
+* Using nn.BatchNorm2d with eps 1e-5 instead of 1e-3
 """

 # Reference license: BSD 3-Clause
@@ -113,7 +116,7 @@ class InvertedResidual(nn.Module):


 # pylint: disable=invalid-name
-class MobileNet_v3_Large(DetectorBackbone):
+class MobileNet_v3(DetectorBackbone):
     def __init__(
         self,
         input_channels: int,
@@ -121,12 +124,12 @@ class MobileNet_v3_Large(DetectorBackbone):
         *,
         config: Optional[dict[str, Any]] = None,
         size: Optional[tuple[int, int]] = None,
-        large: bool = True,
     ) -> None:
         super().__init__(input_channels, num_classes, config=config, size=size)
         assert self.config is not None, "must set config"

         alpha: float = self.config["alpha"]
+        large: bool = self.config["large"]

         if large is True:
             last_channels = int(round(1280 * max(1.0, alpha)))
@@ -268,15 +271,39 @@ class MobileNet_v3_Large(DetectorBackbone):
 )


-registry.register_model_config("
-registry.register_model_config("
-registry.register_model_config("
-registry.register_model_config("
-registry.register_model_config("
-registry.register_model_config("
-registry.register_model_config("
-registry.register_model_config("
+registry.register_model_config("mobilenet_v3_small_0_25", MobileNet_v3, config={"alpha": 0.25, "large": False})
+registry.register_model_config("mobilenet_v3_small_0_5", MobileNet_v3, config={"alpha": 0.5, "large": False})
+registry.register_model_config("mobilenet_v3_small_0_75", MobileNet_v3, config={"alpha": 0.75, "large": False})
+registry.register_model_config("mobilenet_v3_small_1_0", MobileNet_v3, config={"alpha": 1.0, "large": False})
+registry.register_model_config("mobilenet_v3_small_1_25", MobileNet_v3, config={"alpha": 1.25, "large": False})
+registry.register_model_config("mobilenet_v3_small_1_5", MobileNet_v3, config={"alpha": 1.5, "large": False})
+registry.register_model_config("mobilenet_v3_small_1_75", MobileNet_v3, config={"alpha": 1.75, "large": False})
+registry.register_model_config("mobilenet_v3_small_2_0", MobileNet_v3, config={"alpha": 2.0, "large": False})
+
+registry.register_model_config("mobilenet_v3_large_0_25", MobileNet_v3, config={"alpha": 0.25, "large": True})
+registry.register_model_config("mobilenet_v3_large_0_5", MobileNet_v3, config={"alpha": 0.5, "large": True})
+registry.register_model_config("mobilenet_v3_large_0_75", MobileNet_v3, config={"alpha": 0.75, "large": True})
+registry.register_model_config("mobilenet_v3_large_1_0", MobileNet_v3, config={"alpha": 1.0, "large": True})
+registry.register_model_config("mobilenet_v3_large_1_25", MobileNet_v3, config={"alpha": 1.25, "large": True})
+registry.register_model_config("mobilenet_v3_large_1_5", MobileNet_v3, config={"alpha": 1.5, "large": True})
+registry.register_model_config("mobilenet_v3_large_1_75", MobileNet_v3, config={"alpha": 1.75, "large": True})
+registry.register_model_config("mobilenet_v3_large_2_0", MobileNet_v3, config={"alpha": 2.0, "large": True})

+registry.register_weights(
+    "mobilenet_v3_small_1_0_il-common",
+    {
+        "description": "MobileNet v3 small (1.0 multiplier) model trained on the il-common dataset",
+        "resolution": (256, 256),
+        "formats": {
+            "pt": {
+                "file_size": 7.4,
+                "sha256": "ac53227f7513fd0c0b5204ee57403de2ab6c74c4e4d1061b9168596c6b5cea48",
+            }
+        },
+        "net": {"network": "mobilenet_v3_small_1_0", "tag": "il-common"},
+    },
+)
 registry.register_weights(
     "mobilenet_v3_large_0_75_il-common",
     {
```
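With the rename, the large/small split moves from separate classes into the registry config, so `"alpha"` and `"large"` together pick the variant. A minimal sketch of that config-driven selection (simplified; birder's actual constructor plumbing is omitted):

```python
from typing import Any

def resolve_variant(config: dict[str, Any]) -> str:
    alpha: float = config["alpha"]
    large: bool = config["large"]
    family = "large" if large else "small"
    return f"mobilenet_v3_{family} (alpha={alpha})"

print(resolve_variant({"alpha": 1.0, "large": False}))  # mobilenet_v3_small (alpha=1.0)
```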
birder/net/mobilevit_v1.py
CHANGED
```diff
@@ -1,11 +1,14 @@
 """
-MobileViT, adapted from
+MobileViT v1, adapted from
 https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/mobilevit.py
 and
 https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/mobile_vit.py

 Paper "MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer",
 https://arxiv.org/abs/2110.02178
+
+Changes from original:
+* Removed classifier bias
 """

 # Reference license: Apache-2.0 and MIT
@@ -63,6 +66,7 @@ class MobileVitBlock(nn.Module):
             attention_dropout=attn_drop,
             drop_path=drop_path_rate,
             activation_layer=nn.SiLU,
+            norm_layer_eps=1e-5,
         )
         for _ in range(transformer_depth)
     ]
@@ -166,7 +170,6 @@ class MobileViT_v1(BaseNet):
             stride=(2, 2),
             padding=(1, 1),
             activation_layer=nn.SiLU,
-            bias=True,
         )

         layers = []
@@ -231,7 +234,6 @@ class MobileViT_v1(BaseNet):
             stride=(1, 1),
             padding=(0, 0),
             activation_layer=nn.SiLU,
-            bias=True,
         ),
         nn.AdaptiveAvgPool2d(output_size=(1, 1)),
         nn.Flatten(1),
@@ -290,32 +292,3 @@ registry.register_model_config(
         "expansion": 4,
     },
 )
-
-registry.register_weights(
-    "mobilevit_v1_xxs_il-common",
-    {
-        "description": "MobileViT v1 XXS model trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 4.2,
-                "sha256": "2b565a768ca21fd72d5ef5090ff0f8b725f3e1165cd8e56749815041e5254d26",
-            }
-        },
-        "net": {"network": "mobilevit_v1_xxs", "tag": "il-common"},
-    },
-)
-registry.register_weights(
-    "mobilevit_v1_xs_il-common",
-    {
-        "description": "MobileViT v1 XS model trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 8.1,
-                "sha256": "193bcede7f0b9f4574673e95c23c6ca3b8eeb30254a32a85e93342f1d67db31b",
-            }
-        },
-        "net": {"network": "mobilevit_v1_xs", "tag": "il-common"},
-    },
-)
```