birder 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. birder/common/fs_ops.py +2 -2
  2. birder/introspection/attention_rollout.py +1 -1
  3. birder/introspection/transformer_attribution.py +1 -1
  4. birder/layers/layer_scale.py +1 -1
  5. birder/net/__init__.py +2 -10
  6. birder/net/_rope_vit_configs.py +430 -0
  7. birder/net/_vit_configs.py +479 -0
  8. birder/net/biformer.py +1 -0
  9. birder/net/cait.py +5 -5
  10. birder/net/coat.py +12 -12
  11. birder/net/conv2former.py +3 -3
  12. birder/net/convmixer.py +1 -1
  13. birder/net/convnext_v1.py +1 -1
  14. birder/net/crossvit.py +5 -5
  15. birder/net/davit.py +1 -1
  16. birder/net/deit.py +12 -26
  17. birder/net/deit3.py +42 -189
  18. birder/net/densenet.py +9 -8
  19. birder/net/detection/deformable_detr.py +5 -2
  20. birder/net/detection/detr.py +5 -2
  21. birder/net/detection/efficientdet.py +1 -1
  22. birder/net/dpn.py +1 -2
  23. birder/net/edgenext.py +2 -1
  24. birder/net/edgevit.py +3 -0
  25. birder/net/efficientformer_v1.py +2 -1
  26. birder/net/efficientformer_v2.py +18 -31
  27. birder/net/efficientnet_v2.py +3 -0
  28. birder/net/efficientvit_mit.py +5 -5
  29. birder/net/fasternet.py +2 -2
  30. birder/net/flexivit.py +22 -43
  31. birder/net/groupmixformer.py +1 -1
  32. birder/net/hgnet_v1.py +5 -5
  33. birder/net/inception_next.py +1 -1
  34. birder/net/inception_resnet_v1.py +3 -3
  35. birder/net/inception_resnet_v2.py +7 -4
  36. birder/net/inception_v3.py +3 -0
  37. birder/net/inception_v4.py +3 -0
  38. birder/net/maxvit.py +1 -1
  39. birder/net/metaformer.py +3 -3
  40. birder/net/mim/crossmae.py +1 -1
  41. birder/net/mim/mae_vit.py +1 -1
  42. birder/net/mim/simmim.py +1 -1
  43. birder/net/mobilenet_v1.py +0 -9
  44. birder/net/mobilenet_v2.py +38 -44
  45. birder/net/{mobilenet_v3_large.py → mobilenet_v3.py} +37 -10
  46. birder/net/mobilevit_v1.py +5 -32
  47. birder/net/mobilevit_v2.py +1 -45
  48. birder/net/moganet.py +8 -5
  49. birder/net/mvit_v2.py +6 -6
  50. birder/net/nfnet.py +4 -0
  51. birder/net/pit.py +1 -1
  52. birder/net/pvt_v1.py +5 -5
  53. birder/net/pvt_v2.py +5 -5
  54. birder/net/repghost.py +1 -30
  55. birder/net/resmlp.py +2 -2
  56. birder/net/resnest.py +3 -0
  57. birder/net/resnet_v1.py +125 -1
  58. birder/net/resnet_v2.py +75 -1
  59. birder/net/resnext.py +35 -1
  60. birder/net/rope_deit3.py +33 -136
  61. birder/net/rope_flexivit.py +18 -18
  62. birder/net/rope_vit.py +3 -735
  63. birder/net/simple_vit.py +22 -16
  64. birder/net/smt.py +1 -1
  65. birder/net/squeezenet.py +5 -12
  66. birder/net/squeezenext.py +0 -24
  67. birder/net/ssl/capi.py +1 -1
  68. birder/net/ssl/data2vec.py +1 -1
  69. birder/net/ssl/dino_v2.py +2 -2
  70. birder/net/ssl/franca.py +2 -2
  71. birder/net/ssl/i_jepa.py +1 -1
  72. birder/net/ssl/ibot.py +1 -1
  73. birder/net/swiftformer.py +12 -2
  74. birder/net/swin_transformer_v2.py +1 -1
  75. birder/net/tiny_vit.py +3 -16
  76. birder/net/van.py +2 -2
  77. birder/net/vit.py +35 -963
  78. birder/net/vit_sam.py +13 -38
  79. birder/net/xcit.py +7 -6
  80. birder/tools/introspection.py +1 -1
  81. birder/tools/model_info.py +3 -1
  82. birder/version.py +1 -1
  83. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/METADATA +1 -1
  84. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/RECORD +88 -90
  85. birder/net/mobilenet_v3_small.py +0 -43
  86. birder/net/se_resnet_v1.py +0 -105
  87. birder/net/se_resnet_v2.py +0 -59
  88. birder/net/se_resnext.py +0 -30
  89. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/WHEEL +0 -0
  90. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/entry_points.txt +0 -0
  91. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/licenses/LICENSE +0 -0
  92. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/top_level.txt +0 -0
@@ -54,7 +54,7 @@ class DSConv(nn.Module):
54
54
  in_channels,
55
55
  kernel_size=kernel_size,
56
56
  stride=stride,
57
- padding=(kernel_size[0] // 2, kernel_size[1] // 2),
57
+ padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
58
58
  groups=in_channels,
59
59
  norm_layer=norm_layer[0],
60
60
  activation_layer=act_layer[0],
@@ -96,7 +96,7 @@ class ConvBlock(nn.Module):
96
96
  mid_channels,
97
97
  kernel_size=kernel_size,
98
98
  stride=stride,
99
- padding=(kernel_size[0] // 2, kernel_size[1] // 2),
99
+ padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
100
100
  norm_layer=norm_layer[0],
101
101
  activation_layer=act_layer[0],
102
102
  inplace=None,
@@ -106,7 +106,7 @@ class ConvBlock(nn.Module):
106
106
  out_channels,
107
107
  kernel_size=kernel_size,
108
108
  stride=(1, 1),
109
- padding=(kernel_size[0] // 2, kernel_size[1] // 2),
109
+ padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
110
110
  norm_layer=norm_layer[1],
111
111
  activation_layer=act_layer[1],
112
112
  inplace=None,
@@ -148,7 +148,7 @@ class MBConv(nn.Module):
148
148
  mid_channels,
149
149
  kernel_size=kernel_size,
150
150
  stride=stride,
151
- padding=(kernel_size[0] // 2, kernel_size[1] // 2),
151
+ padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
152
152
  groups=mid_channels,
153
153
  norm_layer=norm_layer[1],
154
154
  activation_layer=act_layer[1],
@@ -192,7 +192,7 @@ class FusedMBConv(nn.Module):
192
192
  mid_channels,
193
193
  kernel_size=kernel_size,
194
194
  stride=stride,
195
- padding=(kernel_size[0] // 2, kernel_size[1] // 2),
195
+ padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
196
196
  norm_layer=norm_layer[0],
197
197
  activation_layer=act_layer[0],
198
198
  inplace=None,
birder/net/fasternet.py CHANGED
@@ -50,10 +50,10 @@ class MLPBlock(nn.Module):
50
50
  mlp_hidden_dim = int(dim * mlp_ratio)
51
51
  self.spatial_mixing = PartialConv(dim, n_div)
52
52
  self.mlp = nn.Sequential(
53
- nn.Conv2d(dim, mlp_hidden_dim, 1, bias=False),
53
+ nn.Conv2d(dim, mlp_hidden_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
54
54
  nn.BatchNorm2d(mlp_hidden_dim),
55
55
  act_layer(),
56
- nn.Conv2d(mlp_hidden_dim, dim, 1, bias=False),
56
+ nn.Conv2d(mlp_hidden_dim, dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
57
57
  )
58
58
  self.drop_path = StochasticDepth(drop_path, mode="row")
59
59
 
birder/net/flexivit.py CHANGED
@@ -22,6 +22,8 @@ from birder.layers import MultiHeadAttentionPool
22
22
  from birder.layers import SwiGLU_FFN
23
23
  from birder.layers.activations import get_activation_module
24
24
  from birder.model_registry import registry
25
+ from birder.net._vit_configs import BASE
26
+ from birder.net._vit_configs import SMALL
25
27
  from birder.net.base import DetectorBackbone
26
28
  from birder.net.base import MaskedTokenOmissionMixin
27
29
  from birder.net.base import MaskedTokenRetentionMixin
@@ -583,70 +585,47 @@ class FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Mask
583
585
  registry.register_model_config(
584
586
  "flexivit_s16",
585
587
  FlexiViT,
586
- config={
587
- "patch_size": 16,
588
- "num_layers": 12,
589
- "num_heads": 6,
590
- "hidden_dim": 384,
591
- "mlp_dim": 1536,
592
- "drop_path_rate": 0.0,
593
- },
588
+ config={"patch_size": 16, **SMALL},
594
589
  )
595
590
  registry.register_model_config(
596
591
  "flexivit_s16_ls",
597
592
  FlexiViT,
598
- config={
599
- "patch_size": 16,
600
- "num_layers": 12,
601
- "num_heads": 6,
602
- "hidden_dim": 384,
603
- "mlp_dim": 1536,
604
- "layer_scale_init_value": 1e-5,
605
- "drop_path_rate": 0.0,
606
- },
593
+ config={"patch_size": 16, **SMALL, "layer_scale_init_value": 1e-5},
607
594
  )
595
+ registry.register_model_config(
596
+ "flexivit_b16",
597
+ FlexiViT,
598
+ config={"patch_size": 16, **BASE},
599
+ )
600
+
601
+ # With registers
602
+ ####################
603
+
608
604
  registry.register_model_config(
609
605
  "flexivit_reg1_s16",
610
606
  FlexiViT,
611
- config={
612
- "patch_size": 16,
613
- "num_layers": 12,
614
- "num_heads": 6,
615
- "hidden_dim": 384,
616
- "mlp_dim": 1536,
617
- "num_reg_tokens": 1,
618
- "drop_path_rate": 0.0,
619
- },
607
+ config={"patch_size": 16, **SMALL, "num_reg_tokens": 1},
620
608
  )
621
609
  registry.register_model_config(
622
610
  "flexivit_reg1_s16_rms_ls",
623
611
  FlexiViT,
624
612
  config={
625
613
  "patch_size": 16,
626
- "num_layers": 12,
627
- "num_heads": 6,
628
- "hidden_dim": 384,
629
- "mlp_dim": 1536,
614
+ **SMALL,
630
615
  "layer_scale_init_value": 1e-5,
631
616
  "num_reg_tokens": 1,
632
617
  "norm_layer_type": "RMSNorm",
633
- "drop_path_rate": 0.0,
634
618
  },
635
619
  )
620
+ registry.register_model_config(
621
+ "flexivit_reg4_b16",
622
+ FlexiViT,
623
+ config={"patch_size": 16, **BASE, "num_reg_tokens": 4},
624
+ )
636
625
  registry.register_model_config(
637
626
  "flexivit_reg8_b14_ap",
638
627
  FlexiViT,
639
- config={
640
- "patch_size": 14,
641
- "num_layers": 12,
642
- "num_heads": 12,
643
- "hidden_dim": 768,
644
- "mlp_dim": 3072,
645
- "num_reg_tokens": 8,
646
- "class_token": False,
647
- "attn_pool_head": True,
648
- "drop_path_rate": 0.1,
649
- },
628
+ config={"patch_size": 14, **BASE, "num_reg_tokens": 8, "class_token": False, "attn_pool_head": True},
650
629
  )
651
630
 
652
631
  registry.register_weights(
@@ -661,7 +640,7 @@ registry.register_weights(
661
640
  "formats": {
662
641
  "pt": {
663
642
  "file_size": 83.6,
664
- "sha256": "8d11fb14630f2a54632aeebd09c5a9c2b3b7de1099e09de5e91f433ed915b784",
643
+ "sha256": "8285f4fe56401f169491cb2399d2a7c82f3a0cfbe8a5a8d3c27163024a274800",
665
644
  },
666
645
  },
667
646
  "net": {"network": "flexivit_reg1_s16_rms_ls", "tag": "dino-v2-il-all"},
@@ -160,7 +160,7 @@ class ConvPosEnc(nn.Module):
160
160
  dim,
161
161
  kernel_size=kernel_size,
162
162
  stride=(1, 1),
163
- padding=(kernel_size[0] // 2, kernel_size[1] // 2),
163
+ padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
164
164
  groups=dim,
165
165
  )
166
166
 
birder/net/hgnet_v1.py CHANGED
@@ -23,8 +23,8 @@ from birder.net.base import DetectorBackbone
23
23
  class LearnableAffineBlock(nn.Module):
24
24
  def __init__(self, scale_value: float, bias_value: float) -> None:
25
25
  super().__init__()
26
- self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True)
27
- self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True)
26
+ self.scale = nn.Parameter(torch.tensor([scale_value]))
27
+ self.bias = nn.Parameter(torch.tensor([bias_value]))
28
28
 
29
29
  def forward(self, x: torch.Tensor) -> torch.Tensor:
30
30
  return self.scale * x + self.bias
@@ -90,7 +90,7 @@ class LightConvBNAct(nn.Module):
90
90
  out_channels,
91
91
  kernel_size=kernel_size,
92
92
  stride=(1, 1),
93
- padding=(kernel_size[0] // 2, kernel_size[1] // 2),
93
+ padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
94
94
  groups=out_channels,
95
95
  use_act=True,
96
96
  use_lab=use_lab,
@@ -180,7 +180,7 @@ class HighPerfGPUBlock(nn.Module):
180
180
  mid_channels,
181
181
  kernel_size=kernel_size,
182
182
  stride=(1, 1),
183
- padding=(kernel_size[0] // 2, kernel_size[1] // 2),
183
+ padding=((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2),
184
184
  groups=1,
185
185
  use_act=True,
186
186
  use_lab=use_lab,
@@ -267,7 +267,7 @@ class HighPerfGPUStage(nn.Module):
267
267
  in_channels,
268
268
  kernel_size=(3, 3),
269
269
  stride=stride,
270
- padding=(kernel_size[0] // 2, kernel_size[1] // 2),
270
+ padding=(1, 1),
271
271
  groups=in_channels,
272
272
  use_act=False,
273
273
  use_lab=False,
@@ -110,7 +110,7 @@ class InceptionNeXtBlock(nn.Module):
110
110
  nn.BatchNorm2d(channels),
111
111
  ConvMLP(channels, hidden_features=int(mlp_ratio * channels), out_features=channels),
112
112
  )
113
- self.layer_scale = nn.Parameter(torch.ones(channels, 1, 1) * layer_scale, requires_grad=True)
113
+ self.layer_scale = nn.Parameter(torch.ones(channels, 1, 1) * layer_scale)
114
114
  self.stochastic_depth = StochasticDepth(stochastic_depth_prob, mode="row")
115
115
 
116
116
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -33,7 +33,7 @@ class InceptionBlockA(nn.Module):
33
33
  Conv2dNormActivation(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
34
34
  )
35
35
 
36
- self.conv2d = nn.Conv2d(96, 256, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False)
36
+ self.conv2d = nn.Conv2d(96, 256, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
37
37
  self.relu = nn.ReLU(inplace=True)
38
38
 
39
39
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -84,7 +84,7 @@ class InceptionBlockB(nn.Module):
84
84
  Conv2dNormActivation(128, 128, kernel_size=(7, 1), stride=(1, 1), padding=(3, 0), bias=False),
85
85
  )
86
86
 
87
- self.conv2d = nn.Conv2d(256, 896, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False)
87
+ self.conv2d = nn.Conv2d(256, 896, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
88
88
  self.relu = nn.ReLU(inplace=True)
89
89
 
90
90
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -140,7 +140,7 @@ class InceptionBlockC(nn.Module):
140
140
  Conv2dNormActivation(192, 192, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0), bias=False),
141
141
  )
142
142
 
143
- self.conv2d = nn.Conv2d(384, 1792, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False)
143
+ self.conv2d = nn.Conv2d(384, 1792, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
144
144
  self.relu = nn.ReLU(inplace=True)
145
145
 
146
146
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -4,6 +4,9 @@ https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/incept
4
4
 
5
5
  Paper "Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning",
6
6
  https://arxiv.org/abs/1602.07261
7
+
8
+ Changes from original:
9
+ * Using nn.BatchNorm2d with eps 1e-5 instead of 1e-3
7
10
  """
8
11
 
9
12
  # Reference license: Apache-2.0
@@ -35,7 +38,7 @@ class StemBlock(nn.Module):
35
38
  Conv2dNormActivation(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
36
39
  )
37
40
  self.branch_pool = nn.Sequential(
38
- nn.AvgPool2d(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
41
+ nn.AvgPool2d(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), count_include_pad=False),
39
42
  Conv2dNormActivation(in_channels, 64, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
40
43
  )
41
44
 
@@ -66,7 +69,7 @@ class InceptionBlockA(nn.Module):
66
69
  Conv2dNormActivation(48, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
67
70
  )
68
71
 
69
- self.conv2d = nn.Conv2d(128, 320, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False)
72
+ self.conv2d = nn.Conv2d(128, 320, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
70
73
  self.relu = nn.ReLU(inplace=True)
71
74
 
72
75
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -117,7 +120,7 @@ class InceptionBlockB(nn.Module):
117
120
  Conv2dNormActivation(160, 192, kernel_size=(7, 1), stride=(1, 1), padding=(3, 0), bias=False),
118
121
  )
119
122
 
120
- self.conv2d = nn.Conv2d(384, 1088, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False)
123
+ self.conv2d = nn.Conv2d(384, 1088, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
121
124
  self.relu = nn.ReLU(inplace=True)
122
125
 
123
126
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -174,7 +177,7 @@ class InceptionBlockC(nn.Module):
174
177
  Conv2dNormActivation(224, 256, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0), bias=False),
175
178
  )
176
179
 
177
- self.conv2d = nn.Conv2d(448, 2080, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False)
180
+ self.conv2d = nn.Conv2d(448, 2080, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
178
181
  self.relu = nn.ReLU(inplace=True)
179
182
 
180
183
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -3,6 +3,9 @@ Inception v3, adapted from
3
3
  https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py
4
4
 
5
5
  Paper "Rethinking the Inception Architecture for Computer Vision", https://arxiv.org/abs/1512.00567
6
+
7
+ Changes from original:
8
+ * Using nn.BatchNorm2d with eps 1e-5 instead of 1e-3
6
9
  """
7
10
 
8
11
  # Reference license: BSD 3-Clause
@@ -4,6 +4,9 @@ https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/incept
4
4
 
5
5
  Paper "Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning",
6
6
  https://arxiv.org/abs/1602.07261
7
+
8
+ Changes from original:
9
+ * Using nn.BatchNorm2d with eps 1e-5 instead of 1e-3
7
10
  """
8
11
 
9
12
  # Reference license: Apache-2.0
birder/net/maxvit.py CHANGED
@@ -82,7 +82,7 @@ class MBConv(nn.Module):
82
82
 
83
83
  if stride[0] != 1 or stride[1] != 1 or in_channels != out_channels:
84
84
  self.proj = nn.Sequential(
85
- nn.AvgPool2d(kernel_size=(2, 2), stride=stride, padding=(0, 0)),
85
+ nn.AvgPool2d(kernel_size=(3, 3), stride=stride, padding=(1, 1)),
86
86
  nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=True),
87
87
  )
88
88
  else:
birder/net/metaformer.py CHANGED
@@ -81,7 +81,7 @@ class ConvMLP(nn.Module):
81
81
  class Scale(nn.Module):
82
82
  def __init__(self, dim: int, init_value: float) -> None:
83
83
  super().__init__()
84
- self.scale = nn.Parameter(init_value * torch.ones(dim, 1, 1), requires_grad=True)
84
+ self.scale = nn.Parameter(init_value * torch.ones(dim, 1, 1))
85
85
 
86
86
  def forward(self, x: torch.Tensor) -> torch.Tensor:
87
87
  return x * self.scale
@@ -104,8 +104,8 @@ class StarReLU(nn.Module):
104
104
  def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0, inplace: bool = False) -> None:
105
105
  super().__init__()
106
106
  self.relu = nn.ReLU(inplace=inplace)
107
- self.scale = nn.Parameter(scale_value * torch.ones(1), requires_grad=True)
108
- self.bias = nn.Parameter(bias_value * torch.ones(1), requires_grad=True)
107
+ self.scale = nn.Parameter(scale_value * torch.ones(1))
108
+ self.bias = nn.Parameter(bias_value * torch.ones(1))
109
109
 
110
110
  def forward(self, x: torch.Tensor) -> torch.Tensor:
111
111
  return self.scale * self.relu(x) ** 2 + self.bias
@@ -113,7 +113,7 @@ class CrossMAE(MIMBaseNet):
113
113
  dim=decoder_embed_dim,
114
114
  num_special_tokens=0,
115
115
  ).unsqueeze(0)
116
- self.decoder_pos_embed = nn.Parameter(pos_embedding, requires_grad=False)
116
+ self.decoder_pos_embed = nn.Buffer(pos_embedding)
117
117
 
118
118
  self.decoder_layers = nn.ModuleList()
119
119
  for _ in range(decoder_depth):
birder/net/mim/mae_vit.py CHANGED
@@ -67,7 +67,7 @@ class MAE_ViT(MIMBaseNet):
67
67
  dim=decoder_embed_dim,
68
68
  num_special_tokens=self.encoder.num_special_tokens,
69
69
  )
70
- self.decoder_pos_embed = nn.Parameter(pos_embedding, requires_grad=False)
70
+ self.decoder_pos_embed = nn.Buffer(pos_embedding)
71
71
 
72
72
  layers = []
73
73
  for _ in range(decoder_depth):
birder/net/mim/simmim.py CHANGED
@@ -83,7 +83,7 @@ class SimMIM(MIMBaseNet):
83
83
  bias=True,
84
84
  )
85
85
 
86
- self.mask_token = nn.Parameter(torch.zeros(1, 1, 1, self.encoder.stem_width), requires_grad=True)
86
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, 1, self.encoder.stem_width))
87
87
 
88
88
  # Weights initialization
89
89
  nn.init.trunc_normal_(self.mask_token, mean=0.0, std=0.02)
@@ -107,15 +107,6 @@ class MobileNet_v1(DetectorBackbone):
107
107
 
108
108
  self.body = nn.Sequential(stages)
109
109
  self.features = nn.Sequential(
110
- Conv2dNormActivation(
111
- base * 32,
112
- base * 32,
113
- kernel_size=(1, 1),
114
- stride=(1, 1),
115
- padding=(0, 0),
116
- bias=False,
117
- activation_layer=None,
118
- ),
119
110
  nn.AdaptiveAvgPool2d(output_size=(1, 1)),
120
111
  nn.Flatten(1),
121
112
  )
@@ -37,36 +37,44 @@ class InvertedResidual(nn.Module):
37
37
  num_expfilter = int(round(in_channels * expansion_factor))
38
38
 
39
39
  self.shortcut = shortcut
40
- self.block = nn.Sequential(
41
- Conv2dNormActivation(
42
- in_channels,
43
- num_expfilter,
44
- kernel_size=(1, 1),
45
- stride=(1, 1),
46
- padding=(0, 0),
47
- bias=False,
48
- activation_layer=activation_layer,
49
- ),
50
- Conv2dNormActivation(
51
- num_expfilter,
52
- num_expfilter,
53
- kernel_size=kernel_size,
54
- stride=stride,
55
- padding=padding,
56
- groups=num_expfilter,
57
- bias=False,
58
- activation_layer=activation_layer,
59
- ),
60
- Conv2dNormActivation(
61
- num_expfilter,
62
- out_channels,
63
- kernel_size=(1, 1),
64
- stride=(1, 1),
65
- padding=(0, 0),
66
- bias=False,
67
- activation_layer=None,
68
- ),
40
+ layers = []
41
+ if expansion_factor != 1.0:
42
+ layers.append(
43
+ Conv2dNormActivation(
44
+ in_channels,
45
+ num_expfilter,
46
+ kernel_size=(1, 1),
47
+ stride=(1, 1),
48
+ padding=(0, 0),
49
+ bias=False,
50
+ activation_layer=activation_layer,
51
+ )
52
+ )
53
+
54
+ layers.extend(
55
+ [
56
+ Conv2dNormActivation(
57
+ num_expfilter,
58
+ num_expfilter,
59
+ kernel_size=kernel_size,
60
+ stride=stride,
61
+ padding=padding,
62
+ groups=num_expfilter,
63
+ bias=False,
64
+ activation_layer=activation_layer,
65
+ ),
66
+ Conv2dNormActivation(
67
+ num_expfilter,
68
+ out_channels,
69
+ kernel_size=(1, 1),
70
+ stride=(1, 1),
71
+ padding=(0, 0),
72
+ bias=False,
73
+ activation_layer=None,
74
+ ),
75
+ ]
69
76
  )
77
+ self.block = nn.Sequential(*layers)
70
78
 
71
79
  def forward(self, x: torch.Tensor) -> torch.Tensor:
72
80
  if self.shortcut is True:
@@ -171,6 +179,7 @@ class MobileNet_v2(DetectorBackbone):
171
179
  ),
172
180
  nn.AdaptiveAvgPool2d(output_size=(1, 1)),
173
181
  nn.Flatten(1),
182
+ nn.Dropout(0.2),
174
183
  )
175
184
  self.return_channels = return_channels[1:5]
176
185
  self.embedding_size = last_channels
@@ -230,18 +239,3 @@ registry.register_model_config("mobilenet_v2_1_25", MobileNet_v2, config={"alpha
230
239
  registry.register_model_config("mobilenet_v2_1_5", MobileNet_v2, config={"alpha": 1.5})
231
240
  registry.register_model_config("mobilenet_v2_1_75", MobileNet_v2, config={"alpha": 1.75})
232
241
  registry.register_model_config("mobilenet_v2_2_0", MobileNet_v2, config={"alpha": 2.0})
233
-
234
- registry.register_weights(
235
- "mobilenet_v2_1_0_il-common",
236
- {
237
- "description": "MobileNet v2 (1.0 multiplier) model trained on the il-common dataset",
238
- "resolution": (256, 256),
239
- "formats": {
240
- "pt": {
241
- "file_size": 10.6,
242
- "sha256": "d6182293e98c102026f7cdc0d446aaf0e511232173c4b98c1a882c9f147be6e7",
243
- }
244
- },
245
- "net": {"network": "mobilenet_v2_1_0", "tag": "il-common"},
246
- },
247
- )
@@ -3,6 +3,9 @@ MobileNet v3, adapted from
3
3
  https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py
4
4
 
5
5
  Paper "Searching for MobileNetV3", https://arxiv.org/abs/1905.02244
6
+
7
+ Changes from original:
8
+ * Using nn.BatchNorm2d with eps 1e-5 instead of 1e-3
6
9
  """
7
10
 
8
11
  # Reference license: BSD 3-Clause
@@ -113,7 +116,7 @@ class InvertedResidual(nn.Module):
113
116
 
114
117
 
115
118
  # pylint: disable=invalid-name
116
- class MobileNet_v3_Large(DetectorBackbone):
119
+ class MobileNet_v3(DetectorBackbone):
117
120
  def __init__(
118
121
  self,
119
122
  input_channels: int,
@@ -121,12 +124,12 @@ class MobileNet_v3_Large(DetectorBackbone):
121
124
  *,
122
125
  config: Optional[dict[str, Any]] = None,
123
126
  size: Optional[tuple[int, int]] = None,
124
- large: bool = True,
125
127
  ) -> None:
126
128
  super().__init__(input_channels, num_classes, config=config, size=size)
127
129
  assert self.config is not None, "must set config"
128
130
 
129
131
  alpha: float = self.config["alpha"]
132
+ large: bool = self.config["large"]
130
133
 
131
134
  if large is True:
132
135
  last_channels = int(round(1280 * max(1.0, alpha)))
@@ -268,15 +271,39 @@ class MobileNet_v3_Large(DetectorBackbone):
268
271
  )
269
272
 
270
273
 
271
- registry.register_model_config("mobilenet_v3_large_0_25", MobileNet_v3_Large, config={"alpha": 0.25})
272
- registry.register_model_config("mobilenet_v3_large_0_5", MobileNet_v3_Large, config={"alpha": 0.5})
273
- registry.register_model_config("mobilenet_v3_large_0_75", MobileNet_v3_Large, config={"alpha": 0.75})
274
- registry.register_model_config("mobilenet_v3_large_1_0", MobileNet_v3_Large, config={"alpha": 1.0})
275
- registry.register_model_config("mobilenet_v3_large_1_25", MobileNet_v3_Large, config={"alpha": 1.25})
276
- registry.register_model_config("mobilenet_v3_large_1_5", MobileNet_v3_Large, config={"alpha": 1.5})
277
- registry.register_model_config("mobilenet_v3_large_1_75", MobileNet_v3_Large, config={"alpha": 1.75})
278
- registry.register_model_config("mobilenet_v3_large_2_0", MobileNet_v3_Large, config={"alpha": 2.0})
274
+ registry.register_model_config("mobilenet_v3_small_0_25", MobileNet_v3, config={"alpha": 0.25, "large": False})
275
+ registry.register_model_config("mobilenet_v3_small_0_5", MobileNet_v3, config={"alpha": 0.5, "large": False})
276
+ registry.register_model_config("mobilenet_v3_small_0_75", MobileNet_v3, config={"alpha": 0.75, "large": False})
277
+ registry.register_model_config("mobilenet_v3_small_1_0", MobileNet_v3, config={"alpha": 1.0, "large": False})
278
+ registry.register_model_config("mobilenet_v3_small_1_25", MobileNet_v3, config={"alpha": 1.25, "large": False})
279
+ registry.register_model_config("mobilenet_v3_small_1_5", MobileNet_v3, config={"alpha": 1.5, "large": False})
280
+ registry.register_model_config("mobilenet_v3_small_1_75", MobileNet_v3, config={"alpha": 1.75, "large": False})
281
+ registry.register_model_config("mobilenet_v3_small_2_0", MobileNet_v3, config={"alpha": 2.0, "large": False})
282
+
283
+ registry.register_model_config("mobilenet_v3_large_0_25", MobileNet_v3, config={"alpha": 0.25, "large": True})
284
+ registry.register_model_config("mobilenet_v3_large_0_5", MobileNet_v3, config={"alpha": 0.5, "large": True})
285
+ registry.register_model_config("mobilenet_v3_large_0_75", MobileNet_v3, config={"alpha": 0.75, "large": True})
286
+ registry.register_model_config("mobilenet_v3_large_1_0", MobileNet_v3, config={"alpha": 1.0, "large": True})
287
+ registry.register_model_config("mobilenet_v3_large_1_25", MobileNet_v3, config={"alpha": 1.25, "large": True})
288
+ registry.register_model_config("mobilenet_v3_large_1_5", MobileNet_v3, config={"alpha": 1.5, "large": True})
289
+ registry.register_model_config("mobilenet_v3_large_1_75", MobileNet_v3, config={"alpha": 1.75, "large": True})
290
+ registry.register_model_config("mobilenet_v3_large_2_0", MobileNet_v3, config={"alpha": 2.0, "large": True})
291
+
279
292
 
293
+ registry.register_weights(
294
+ "mobilenet_v3_small_1_0_il-common",
295
+ {
296
+ "description": "MobileNet v3 small (1.0 multiplier) model trained on the il-common dataset",
297
+ "resolution": (256, 256),
298
+ "formats": {
299
+ "pt": {
300
+ "file_size": 7.4,
301
+ "sha256": "ac53227f7513fd0c0b5204ee57403de2ab6c74c4e4d1061b9168596c6b5cea48",
302
+ }
303
+ },
304
+ "net": {"network": "mobilenet_v3_small_1_0", "tag": "il-common"},
305
+ },
306
+ )
280
307
  registry.register_weights(
281
308
  "mobilenet_v3_large_0_75_il-common",
282
309
  {
@@ -1,11 +1,14 @@
1
1
  """
2
- MobileViT, adapted from
2
+ MobileViT v1, adapted from
3
3
  https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/mobilevit.py
4
4
  and
5
5
  https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/mobile_vit.py
6
6
 
7
7
  Paper "MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer",
8
8
  https://arxiv.org/abs/2110.02178
9
+
10
+ Changes from original:
11
+ * Removed classifier bias
9
12
  """
10
13
 
11
14
  # Reference license: Apache-2.0 and MIT
@@ -63,6 +66,7 @@ class MobileVitBlock(nn.Module):
63
66
  attention_dropout=attn_drop,
64
67
  drop_path=drop_path_rate,
65
68
  activation_layer=nn.SiLU,
69
+ norm_layer_eps=1e-5,
66
70
  )
67
71
  for _ in range(transformer_depth)
68
72
  ]
@@ -166,7 +170,6 @@ class MobileViT_v1(BaseNet):
166
170
  stride=(2, 2),
167
171
  padding=(1, 1),
168
172
  activation_layer=nn.SiLU,
169
- bias=True,
170
173
  )
171
174
 
172
175
  layers = []
@@ -231,7 +234,6 @@ class MobileViT_v1(BaseNet):
231
234
  stride=(1, 1),
232
235
  padding=(0, 0),
233
236
  activation_layer=nn.SiLU,
234
- bias=True,
235
237
  ),
236
238
  nn.AdaptiveAvgPool2d(output_size=(1, 1)),
237
239
  nn.Flatten(1),
@@ -290,32 +292,3 @@ registry.register_model_config(
290
292
  "expansion": 4,
291
293
  },
292
294
  )
293
-
294
- registry.register_weights(
295
- "mobilevit_v1_xxs_il-common",
296
- {
297
- "description": "MobileViT v1 XXS model trained on the il-common dataset",
298
- "resolution": (256, 256),
299
- "formats": {
300
- "pt": {
301
- "file_size": 4.2,
302
- "sha256": "2b565a768ca21fd72d5ef5090ff0f8b725f3e1165cd8e56749815041e5254d26",
303
- }
304
- },
305
- "net": {"network": "mobilevit_v1_xxs", "tag": "il-common"},
306
- },
307
- )
308
- registry.register_weights(
309
- "mobilevit_v1_xs_il-common",
310
- {
311
- "description": "MobileViT v1 XS model trained on the il-common dataset",
312
- "resolution": (256, 256),
313
- "formats": {
314
- "pt": {
315
- "file_size": 8.1,
316
- "sha256": "193bcede7f0b9f4574673e95c23c6ca3b8eeb30254a32a85e93342f1d67db31b",
317
- }
318
- },
319
- "net": {"network": "mobilevit_v1_xs", "tag": "il-common"},
320
- },
321
- )