birder 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. birder/common/fs_ops.py +2 -2
  2. birder/introspection/attention_rollout.py +1 -1
  3. birder/introspection/transformer_attribution.py +1 -1
  4. birder/layers/layer_scale.py +1 -1
  5. birder/net/__init__.py +2 -10
  6. birder/net/_rope_vit_configs.py +430 -0
  7. birder/net/_vit_configs.py +479 -0
  8. birder/net/biformer.py +1 -0
  9. birder/net/cait.py +5 -5
  10. birder/net/coat.py +12 -12
  11. birder/net/conv2former.py +3 -3
  12. birder/net/convmixer.py +1 -1
  13. birder/net/convnext_v1.py +1 -1
  14. birder/net/crossvit.py +5 -5
  15. birder/net/davit.py +1 -1
  16. birder/net/deit.py +12 -26
  17. birder/net/deit3.py +42 -189
  18. birder/net/densenet.py +9 -8
  19. birder/net/detection/deformable_detr.py +5 -2
  20. birder/net/detection/detr.py +5 -2
  21. birder/net/detection/efficientdet.py +1 -1
  22. birder/net/dpn.py +1 -2
  23. birder/net/edgenext.py +2 -1
  24. birder/net/edgevit.py +3 -0
  25. birder/net/efficientformer_v1.py +2 -1
  26. birder/net/efficientformer_v2.py +18 -31
  27. birder/net/efficientnet_v2.py +3 -0
  28. birder/net/efficientvit_mit.py +5 -5
  29. birder/net/fasternet.py +2 -2
  30. birder/net/flexivit.py +22 -43
  31. birder/net/groupmixformer.py +1 -1
  32. birder/net/hgnet_v1.py +5 -5
  33. birder/net/inception_next.py +1 -1
  34. birder/net/inception_resnet_v1.py +3 -3
  35. birder/net/inception_resnet_v2.py +7 -4
  36. birder/net/inception_v3.py +3 -0
  37. birder/net/inception_v4.py +3 -0
  38. birder/net/maxvit.py +1 -1
  39. birder/net/metaformer.py +3 -3
  40. birder/net/mim/crossmae.py +1 -1
  41. birder/net/mim/mae_vit.py +1 -1
  42. birder/net/mim/simmim.py +1 -1
  43. birder/net/mobilenet_v1.py +0 -9
  44. birder/net/mobilenet_v2.py +38 -44
  45. birder/net/{mobilenet_v3_large.py → mobilenet_v3.py} +37 -10
  46. birder/net/mobilevit_v1.py +5 -32
  47. birder/net/mobilevit_v2.py +1 -45
  48. birder/net/moganet.py +8 -5
  49. birder/net/mvit_v2.py +6 -6
  50. birder/net/nfnet.py +4 -0
  51. birder/net/pit.py +1 -1
  52. birder/net/pvt_v1.py +5 -5
  53. birder/net/pvt_v2.py +5 -5
  54. birder/net/repghost.py +1 -30
  55. birder/net/resmlp.py +2 -2
  56. birder/net/resnest.py +3 -0
  57. birder/net/resnet_v1.py +125 -1
  58. birder/net/resnet_v2.py +75 -1
  59. birder/net/resnext.py +35 -1
  60. birder/net/rope_deit3.py +33 -136
  61. birder/net/rope_flexivit.py +18 -18
  62. birder/net/rope_vit.py +3 -735
  63. birder/net/simple_vit.py +22 -16
  64. birder/net/smt.py +1 -1
  65. birder/net/squeezenet.py +5 -12
  66. birder/net/squeezenext.py +0 -24
  67. birder/net/ssl/capi.py +1 -1
  68. birder/net/ssl/data2vec.py +1 -1
  69. birder/net/ssl/dino_v2.py +2 -2
  70. birder/net/ssl/franca.py +2 -2
  71. birder/net/ssl/i_jepa.py +1 -1
  72. birder/net/ssl/ibot.py +1 -1
  73. birder/net/swiftformer.py +12 -2
  74. birder/net/swin_transformer_v2.py +1 -1
  75. birder/net/tiny_vit.py +3 -16
  76. birder/net/van.py +2 -2
  77. birder/net/vit.py +35 -963
  78. birder/net/vit_sam.py +13 -38
  79. birder/net/xcit.py +7 -6
  80. birder/tools/introspection.py +1 -1
  81. birder/tools/model_info.py +3 -1
  82. birder/version.py +1 -1
  83. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/METADATA +1 -1
  84. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/RECORD +88 -90
  85. birder/net/mobilenet_v3_small.py +0 -43
  86. birder/net/se_resnet_v1.py +0 -105
  87. birder/net/se_resnet_v2.py +0 -59
  88. birder/net/se_resnext.py +0 -30
  89. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/WHEEL +0 -0
  90. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/entry_points.txt +0 -0
  91. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/licenses/LICENSE +0 -0
  92. {birder-0.3.3.dist-info → birder-0.4.0.dist-info}/top_level.txt +0 -0
@@ -159,12 +159,7 @@ class MobileVitBlock(nn.Module):
         self.norm = nn.GroupNorm(num_groups=1, num_channels=transformer_dim)
 
         self.conv_proj = Conv2dNormActivation(
-            transformer_dim,
-            channels,
-            kernel_size=(1, 1),
-            stride=(1, 1),
-            padding=(0, 0),
-            activation_layer=nn.SiLU,
+            transformer_dim, channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), activation_layer=None
         )
 
         self.patch_size = patch_size
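Note: Conv2dNormActivation here appears to be torchvision's conv + norm + activation building block; with activation_layer=None it reduces to conv + norm, so the projection back to channels is now linear rather than SiLU-activated. A minimal sketch of the new behavior (channel sizes are illustrative, not taken from the model):

    import torch
    from torchvision.ops import Conv2dNormActivation

    # activation_layer=None -> Conv2d + BatchNorm2d with no non-linearity on top
    proj = Conv2dNormActivation(96, 64, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), activation_layer=None)
    print(proj(torch.randn(1, 96, 32, 32)).shape)  # torch.Size([1, 64, 32, 32])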
@@ -236,7 +231,6 @@ class MobileViT_v2(DetectorBackbone):
             stride=(2, 2),
             padding=(1, 1),
             activation_layer=nn.SiLU,
-            bias=True,
         )
 
         stages: OrderedDict[str, nn.Module] = OrderedDict()
@@ -340,15 +334,6 @@ class MobileViT_v2(DetectorBackbone):
         x = self.forward_features(x)
         return self.features(x)
 
-    def create_classifier(self, embed_dim: Optional[int] = None) -> nn.Module:
-        if self.num_classes == 0:
-            return nn.Identity()
-
-        if embed_dim is None:
-            embed_dim = self.embedding_size
-
-        return nn.Linear(embed_dim, self.num_classes, bias=False)
-
 
 registry.register_model_config("mobilevit_v2_0_25", MobileViT_v2, config={"width_factor": 0.25})
 registry.register_model_config("mobilevit_v2_0_5", MobileViT_v2, config={"width_factor": 0.5})
@@ -358,32 +343,3 @@ registry.register_model_config("mobilevit_v2_1_25", MobileViT_v2, config={"width
 registry.register_model_config("mobilevit_v2_1_5", MobileViT_v2, config={"width_factor": 1.5})
 registry.register_model_config("mobilevit_v2_1_75", MobileViT_v2, config={"width_factor": 1.75})
 registry.register_model_config("mobilevit_v2_2_0", MobileViT_v2, config={"width_factor": 2.0})
-
-registry.register_weights(
-    "mobilevit_v2_1_0_il-common",
-    {
-        "description": "MobileViT v2 with width multiplier of 1.0 trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 17.6,
-                "sha256": "2b45b7f2ffe3dd129d9a7e9690d2dfd0f93ac60f24d118b920a51bcb950fd95e",
-            }
-        },
-        "net": {"network": "mobilevit_v2_1_0", "tag": "il-common"},
-    },
-)
-registry.register_weights(
-    "mobilevit_v2_1_5_il-common",
-    {
-        "description": "MobileViT v2 with width multiplier of 1.5 trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 38.8,
-                "sha256": "acd28c3ee653b62c69ad765c1d99827cea5051deb6dbdd7b9c8d7612782c86a3",
-            }
-        },
-        "net": {"network": "mobilevit_v2_1_5", "tag": "il-common"},
-    },
-)
birder/net/moganet.py CHANGED
@@ -4,6 +4,9 @@ https://github.com/Westlake-AI/MogaNet/blob/main/models/moganet.py
 
 Paper "MogaNet: Multi-order Gated Aggregation Network",
 https://arxiv.org/abs/2211.03295
+
+Changes from original:
+* Removed biases before norms
 """
 
 # Reference license: Apache-2.0
@@ -30,7 +33,7 @@ from birder.net.base import TokenRetentionResultType
 class ElementScale(nn.Module):
     def __init__(self, embed_dims: int, init_value: float) -> None:
         super().__init__()
-        self.scale = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1)), requires_grad=True)
+        self.scale = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1)))
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x * self.scale
@@ -179,14 +182,14 @@ class MogaBlock(nn.Module):
         super().__init__()
 
         # Spatial attention
-        self.norm1 = nn.BatchNorm2d(embed_dims, eps=1e-5)
+        self.norm1 = nn.BatchNorm2d(embed_dims)
         self.attn = MultiOrderGatedAggregation(
             embed_dims, attn_dw_dilation=attn_dw_dilation, attn_channel_split=attn_channel_split
         )
-        self.layer_scale_1 = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1)), requires_grad=True)
+        self.layer_scale_1 = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1)))
 
         # Channel MLP
-        self.norm2 = nn.BatchNorm2d(embed_dims, eps=1e-5)
+        self.norm2 = nn.BatchNorm2d(embed_dims)
         mlp_hidden_dim = int(embed_dims * ffn_ratio)
         self.mlp = ChannelAggregationFFN(
             embed_dims=embed_dims,
@@ -194,7 +197,7 @@
             kernel_size=3,
             ffn_drop=drop_rate,
         )
-        self.layer_scale_2 = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1)), requires_grad=True)
+        self.layer_scale_2 = nn.Parameter(init_value * torch.ones((1, embed_dims, 1, 1)))
 
         self.drop_path = StochasticDepth(drop_path_rate, mode="row")
 
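Note: dropping requires_grad=True from these nn.Parameter calls is purely cosmetic; PyTorch parameters require gradients by default, so behavior is unchanged. The same cleanup appears in resmlp.py below. A quick check:

    import torch
    from torch import nn

    # nn.Parameter(data, requires_grad=True) is the default, so both spellings are equivalent
    scale = nn.Parameter(1e-5 * torch.ones((1, 64, 1, 1)))
    print(scale.requires_grad)  # True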
birder/net/mvit_v2.py CHANGED
@@ -178,7 +178,7 @@ class MultiScaleAttention(nn.Module):
                 groups=dim_conv,
                 bias=False,
             )
-            self.norm_q = nn.LayerNorm(dim_conv)
+            self.norm_q = nn.LayerNorm(dim_conv, eps=1e-6)
         else:
             self.pool_q = None
             self.norm_q = None
@@ -193,7 +193,7 @@
                 groups=dim_conv,
                 bias=False,
             )
-            self.norm_k = nn.LayerNorm(dim_conv)
+            self.norm_k = nn.LayerNorm(dim_conv, eps=1e-6)
 
             self.pool_v = nn.Conv2d(
                 dim_conv,
@@ -204,7 +204,7 @@
                 groups=dim_conv,
                 bias=False,
             )
-            self.norm_v = nn.LayerNorm(dim_conv)
+            self.norm_v = nn.LayerNorm(dim_conv, eps=1e-6)
         else:
             self.pool_k = None
             self.norm_k = None
@@ -291,7 +291,7 @@ class MultiScaleBlock(nn.Module):
         self.dim = dim
         self.dim_out = dim_out
         self.num_heads = num_heads
-        self.norm1 = nn.LayerNorm(dim)
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
         self.has_cls_token = has_cls_token
         self.dim_mul_in_att = dim_mul_in_att
 
@@ -309,7 +309,7 @@
             has_cls_token=has_cls_token,
         )
         self.drop_path = StochasticDepth(drop_path, mode="row")
-        self.norm2 = nn.LayerNorm(att_dim)
+        self.norm2 = nn.LayerNorm(att_dim, eps=1e-6)
         self.mlp = MLP(att_dim, [int(att_dim * mlp_ratio), dim_out], activation_layer=nn.GELU, inplace=None)
 
         if self.dim_mul_in_att is True and self.dim != self.dim_out:
@@ -506,7 +506,7 @@ class MViT_v2(DetectorBackbone, PreTrainEncoder, MaskedTokenRetentionMixin):
             input_size = (input_size[0] // stride_q[i][0], input_size[1] // stride_q[i][1])
 
         self.body = SequentialWithShape(stages)
-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
         self.return_channels = return_channels
         self.embedding_size = embed_dim
         self.classifier = self.create_classifier()
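Note: the recurring eps=1e-6 additions in this file (and in pvt_v1.py and pvt_v2.py below) pin LayerNorm's epsilon rather than relying on PyTorch's default of 1e-5, presumably to match the reference implementations of these models. The numerical effect is small but nonzero:

    import torch
    from torch import nn

    x = torch.randn(2, 16, 96)
    ln_default = nn.LayerNorm(96)  # eps=1e-5, the PyTorch default
    ln_pinned = nn.LayerNorm(96, eps=1e-6)  # freshly initialized, so weights and biases match
    print((ln_default(x) - ln_pinned(x)).abs().max())  # small but nonzero difference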
birder/net/nfnet.py CHANGED
@@ -3,6 +3,9 @@ Normalizer-Free Networks, adapted from
 https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/nfnet.py
 
 Paper "High-Performance Large-Scale Image Recognition Without Normalization", https://arxiv.org/abs/2102.06171
+
+Changes from original:
+* Removed dynamic padding
 """
 
 # Reference license: Apache-2.0
@@ -262,6 +265,7 @@ class NFNet(DetectorBackbone):
         self.body = nn.Sequential(stages)
         self.features = nn.Sequential(
             ScaledStdConv2d(prev_channels, prev_channels * 2, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)),
+            act_layer(),
             nn.AdaptiveAvgPool2d(output_size=(1, 1)),
             nn.Flatten(1),
         )
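Note: the classification head previously went straight from the 1x1 expansion conv into global average pooling; 0.4.0 inserts the network's activation in between. A rough sketch of the revised ordering (a plain Conv2d stands in for birder's ScaledStdConv2d, and the channel count is illustrative):

    import torch
    from torch import nn

    prev_channels, act_layer = 1536, nn.GELU
    features = nn.Sequential(
        nn.Conv2d(prev_channels, prev_channels * 2, kernel_size=(1, 1)),  # stand-in for ScaledStdConv2d
        act_layer(),  # newly inserted activation
        nn.AdaptiveAvgPool2d(output_size=(1, 1)),
        nn.Flatten(1),
    )
    print(features(torch.randn(1, prev_channels, 8, 8)).shape)  # torch.Size([1, 3072])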
birder/net/pit.py CHANGED
@@ -321,7 +321,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 18.4,
-                "sha256": "8a1d98a2b7388e1efa14bbee89fc182b588a35d239544de521769c2d850ee5fe",
+                "sha256": "5f6bd74b09c1ee541ee2ddae4844ce501b4b3218201ea6381fce0b8fc30257f2",
             }
         },
         "net": {"network": "pit_t", "tag": "il-common"},
birder/net/pvt_v1.py CHANGED
@@ -50,7 +50,7 @@ class Attention(nn.Module):
 
         if sr_ratio > 1:
             self.sr = nn.Conv2d(dim, dim, kernel_size=(sr_ratio, sr_ratio), stride=(sr_ratio, sr_ratio), padding=(0, 0))
-            self.norm = nn.LayerNorm(dim)
+            self.norm = nn.LayerNorm(dim, eps=1e-6)
         else:
             self.sr = None
             self.norm = None
@@ -90,7 +90,7 @@ class PyramidVisionTransformerBlock(nn.Module):
         drop_path: float,
     ) -> None:
         super().__init__()
-        self.norm1 = nn.LayerNorm(dim)
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
         self.attn = Attention(
             dim,
             num_heads=num_heads,
@@ -100,7 +100,7 @@
             proj_drop=proj_drop,
         )
 
-        self.norm2 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
         self.mlp = MLP(dim, [int(dim * mlp_ratio), dim], activation_layer=nn.GELU, dropout=proj_drop)
         self.drop_path = StochasticDepth(drop_path, mode="row")
 
@@ -115,7 +115,7 @@ class PatchEmbed(nn.Module):
     def __init__(self, patch_size: tuple[int, int], in_channels: int, embed_dim: int) -> None:
         super().__init__()
         self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, padding=(0, 0))
-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
@@ -169,7 +169,7 @@ class PyramidVisionTransformerStage(nn.Module):
             ]
         )
 
-        self.norm = nn.LayerNorm(dim_out)
+        self.norm = nn.LayerNorm(dim_out, eps=1e-6)
         if cls_token is True:
             self.cls_token = nn.Parameter(torch.zeros(1, 1, dim_out))
         else:
birder/net/pvt_v2.py CHANGED
@@ -85,7 +85,7 @@ class Attention(nn.Module):
             self.sr = nn.Conv2d(
                 dim, dim, kernel_size=(sr_ratio, sr_ratio), stride=(sr_ratio, sr_ratio), padding=(0, 0)
             )
-            self.norm = nn.LayerNorm(dim)
+            self.norm = nn.LayerNorm(dim, eps=1e-6)
         else:
             self.sr = None
             self.norm = None
@@ -93,7 +93,7 @@
             self.pool = nn.AdaptiveAvgPool2d(7)
             self.act = nn.GELU()
             self.sr = nn.Conv2d(dim, dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
-            self.norm = nn.LayerNorm(dim)
+            self.norm = nn.LayerNorm(dim, eps=1e-6)
 
         assert (self.pool is None and self.act is None) or (self.pool is not None and self.act is not None)
 
@@ -140,7 +140,7 @@ class PyramidVisionTransformerBlock(nn.Module):
         drop_path: float,
     ) -> None:
         super().__init__()
-        self.norm1 = nn.LayerNorm(dim)
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
         self.attn = Attention(
             dim,
             num_heads=num_heads,
@@ -151,7 +151,7 @@
             proj_drop=proj_drop,
         )
 
-        self.norm2 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
         self.mlp = MLP(
             in_features=dim,
             hidden_features=int(dim * mlp_ratio),
@@ -179,7 +179,7 @@ class OverlapPatchEmbed(nn.Module):
             stride=stride,
             padding=(patch_size[0] // 2, patch_size[1] // 2),
         )
-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
birder/net/repghost.py CHANGED
@@ -169,7 +169,7 @@ class RepGhostBottleneck(nn.Module):
         # Squeeze-and-excitation
         if se_ratio > 0:
             self.se = SqueezeExcitation(
-                mid_channels, make_divisible(int(mid_channels * se_ratio), 4), activation=nn.Hardsigmoid
+                mid_channels, make_divisible(int(mid_channels * se_ratio), 4), scale_activation=nn.Hardsigmoid
             )
         else:
             self.se = nn.Identity()
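Note: this reads as a keyword fix against torchvision's SqueezeExcitation, whose signature is SqueezeExcitation(input_channels, squeeze_channels, activation=nn.ReLU, scale_activation=nn.Sigmoid). The scale_activation argument is the gating non-linearity applied to the per-channel weights, which is where Hardsigmoid belongs; the old activation=nn.Hardsigmoid replaced the squeeze branch's ReLU instead:

    import torch
    from torch import nn
    from torchvision.ops import SqueezeExcitation

    # Hardsigmoid now gates the channel scales; the squeeze branch keeps its default ReLU
    se = SqueezeExcitation(64, 16, scale_activation=nn.Hardsigmoid)
    print(se(torch.randn(1, 64, 32, 32)).shape)  # torch.Size([1, 64, 32, 32])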
@@ -351,32 +351,3 @@ registry.register_model_config("repghost_0_5", RepGhost, config={"width": 0.5})
 registry.register_model_config("repghost_1_0", RepGhost, config={"width": 1.0})
 registry.register_model_config("repghost_1_3", RepGhost, config={"width": 1.3})
 registry.register_model_config("repghost_1_5", RepGhost, config={"width": 1.5})
-
-registry.register_weights(
-    "repghost_1_0_il-common",
-    {
-        "description": "RepGhost 1.0x model trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 12.8,
-                "sha256": "37e211ec65c752ad79bbbaacea277f7d683d0b0f69d954a7ca7af46b9a1260e6",
-            }
-        },
-        "net": {"network": "repghost_1_0", "tag": "il-common"},
-    },
-)
-registry.register_weights(
-    "repghost_1_0_il-common_reparameterized",
-    {
-        "description": "RepGhost 1.0x (reparameterized) model trained on the il-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 12.6,
-                "sha256": "e003e0498d63428305c10f879a0e2b999604795d417f07ea0da35ea925f794f5",
-            }
-        },
-        "net": {"network": "repghost_1_0", "tag": "il-common_reparameterized", "reparameterized": True},
-    },
-)
birder/net/resmlp.py CHANGED
@@ -40,8 +40,8 @@ class LayerScaleMLP(nn.Module):
         self.drop_path = StochasticDepth(drop_path, mode="row")
         self.norm2 = Affine(dim)
         self.mlp = MLP(dim, [int(dim * 4.0), dim], activation_layer=nn.GELU, dropout=drop)
-        self.gamma_1 = nn.Parameter(init_value * torch.ones((dim)), requires_grad=True)
-        self.gamma_2 = nn.Parameter(init_value * torch.ones((dim)), requires_grad=True)
+        self.gamma_1 = nn.Parameter(init_value * torch.ones((dim)))
+        self.gamma_2 = nn.Parameter(init_value * torch.ones((dim)))
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2))
birder/net/resnest.py CHANGED
@@ -3,6 +3,9 @@ ResNeSt, adapted from
 https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/resnest.py
 
 Paper "ResNeSt: Split-Attention Networks", https://arxiv.org/abs/2004.08955
+
+Changes from original:
+* Removed bias from fc1 (SplitAttn)
 """
 
 # Reference license: Apache-2.0
birder/net/resnet_v1.py CHANGED
@@ -4,6 +4,8 @@ https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py
 
 Paper "Deep Residual Learning for Image Recognition", https://arxiv.org/abs/1512.03385
 and
+Paper "Squeeze-and-Excitation Networks", https://arxiv.org/abs/1709.01507
+and
 Paper "Bag of Tricks for Image Classification with Convolutional Neural Networks",
 https://arxiv.org/abs/1812.01187
 """
@@ -101,7 +103,6 @@ class ResNet_v1(DetectorBackbone):
         *,
         config: Optional[dict[str, Any]] = None,
         size: Optional[tuple[int, int]] = None,
-        squeeze_excitation: bool = False,
     ) -> None:
         super().__init__(input_channels, num_classes, config=config, size=size)
         assert self.config is not None, "must set config"
@@ -110,6 +111,7 @@
         filter_list: list[int] = self.config["filter_list"]
         units: list[int] = self.config["units"]
         pooling_param: Optional[float] = self.config.get("pooling_param", None)
+        squeeze_excitation: bool = self.config.get("squeeze_excitation", False)
         deep_stem: bool = self.config.get("deep_stem", False)
         avg_down: bool = self.config.get("avg_down", False)
 
@@ -302,6 +304,128 @@ registry.register_model_config(
     },
 )
 
+# Squeeze-and-Excitation Networks
+registry.register_model_config(
+    "se_resnet_v1_18",
+    ResNet_v1,
+    config={
+        "bottle_neck": False,
+        "filter_list": [64, 64, 128, 256, 512],
+        "units": [2, 2, 2, 2],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_34",
+    ResNet_v1,
+    config={
+        "bottle_neck": False,
+        "filter_list": [64, 64, 128, 256, 512],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_50",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_101",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 23, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_152",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 8, 36, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_200",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 24, 36, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v1_269",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 30, 48, 8],
+        "squeeze_excitation": True,
+    },
+)
+
+# SE-ResNet-D variants with SE
+registry.register_model_config(
+    "se_resnet_d_50",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_d_101",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 23, 3],
+        "squeeze_excitation": True,
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_d_152",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 8, 36, 3],
+        "squeeze_excitation": True,
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_d_200",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 24, 36, 3],
+        "squeeze_excitation": True,
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+
 registry.register_weights(
     "resnet_v1_50_arabian-peninsula",
     {
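Note: with squeeze_excitation now read from the config dict rather than a constructor keyword, the former se_resnet_v1.py module collapses into plain ResNet_v1 configs; resnet_v2.py and resnext.py below follow the same pattern. A minimal construction sketch, assuming the positional (input_channels, num_classes) signature visible in the hunk above:

    from birder.net.resnet_v1 import ResNet_v1

    # Matches the registered "se_resnet_v1_50" config
    net = ResNet_v1(
        3,
        10,
        config={
            "bottle_neck": True,
            "filter_list": [64, 256, 512, 1024, 2048],
            "units": [3, 4, 6, 3],
            "squeeze_excitation": True,
        },
    )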
birder/net/resnet_v2.py CHANGED
@@ -3,6 +3,8 @@ ResNet v2, adapted from
 https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/resnetv2.py
 
 Paper "Identity Mappings in Deep Residual Networks", https://arxiv.org/abs/1603.05027
+and
+Paper "Squeeze-and-Excitation Networks", https://arxiv.org/abs/1709.01507
 """
 
 # Reference license: Apache-2.0
@@ -98,7 +100,6 @@ class ResNet_v2(DetectorBackbone):
         *,
         config: Optional[dict[str, Any]] = None,
         size: Optional[tuple[int, int]] = None,
-        squeeze_excitation: bool = False,
     ) -> None:
         super().__init__(input_channels, num_classes, config=config, size=size)
         assert self.config is not None, "must set config"
@@ -106,6 +107,7 @@
         bottle_neck: bool = self.config["bottle_neck"]
         filter_list: list[int] = self.config["filter_list"]
         units: list[int] = self.config["units"]
+        squeeze_excitation: bool = self.config.get("squeeze_excitation", False)
 
         assert len(units) + 1 == len(filter_list)
         num_unit = len(units)
@@ -231,3 +233,75 @@ registry.register_model_config(
     ResNet_v2,
     config={"bottle_neck": True, "filter_list": [64, 256, 512, 1024, 2048], "units": [3, 30, 48, 8]},
 )
+
+# Squeeze-and-Excitation Networks
+registry.register_model_config(
+    "se_resnet_v2_18",
+    ResNet_v2,
+    config={
+        "bottle_neck": False,
+        "filter_list": [64, 64, 128, 256, 512],
+        "units": [2, 2, 2, 2],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_34",
+    ResNet_v2,
+    config={
+        "bottle_neck": False,
+        "filter_list": [64, 64, 128, 256, 512],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_50",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 6, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_101",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 23, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_152",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 8, 36, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_200",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 24, 36, 3],
+        "squeeze_excitation": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_v2_269",
+    ResNet_v2,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 30, 48, 8],
+        "squeeze_excitation": True,
+    },
+)
birder/net/resnext.py CHANGED
@@ -3,6 +3,11 @@ ResNeXt, adapted from
 https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py
 
 Paper "Aggregated Residual Transformations for Deep Neural Networks", https://arxiv.org/abs/1611.05431
+and
+Paper "Squeeze-and-Excitation Networks", https://arxiv.org/abs/1709.01507
+and
+Paper "Bag of Tricks for Image Classification with Convolutional Neural Networks",
+https://arxiv.org/abs/1812.01187
 """
 
 # Reference license: BSD 3-Clause
@@ -117,7 +122,6 @@ class ResNeXt(DetectorBackbone):
         *,
         config: Optional[dict[str, Any]] = None,
         size: Optional[tuple[int, int]] = None,
-        squeeze_excitation: bool = False,
     ) -> None:
         super().__init__(input_channels, num_classes, config=config, size=size)
         assert self.config is not None, "must set config"
@@ -127,6 +131,7 @@
         base_width: int = self.config.get("base_width", 4)
         filter_list = [64, 128, 256, 512]
         units: list[int] = self.config["units"]
+        squeeze_excitation: bool = self.config.get("squeeze_excitation", False)
         deep_stem: bool = self.config.get("deep_stem", False)
         avg_down: bool = self.config.get("avg_down", False)
 
@@ -251,3 +256,32 @@ registry.register_model_config(
 registry.register_model_config(
     "resnext_d_152", ResNeXt, config={"units": [3, 8, 36, 3], "deep_stem": True, "avg_down": True}
 )
+
+# Squeeze-and-Excitation Networks
+registry.register_model_config("se_resnext_50", ResNeXt, config={"units": [3, 4, 6, 3], "squeeze_excitation": True})
+registry.register_model_config("se_resnext_101", ResNeXt, config={"units": [3, 4, 23, 3], "squeeze_excitation": True})
+registry.register_model_config("se_resnext_152", ResNeXt, config={"units": [3, 8, 36, 3], "squeeze_excitation": True})
+
+registry.register_model_config(
+    "se_resnext_101_32x8", ResNeXt, config={"units": [3, 4, 23, 3], "base_width": 8, "squeeze_excitation": True}
+)
+registry.register_model_config(
+    "se_resnext_101_64x4", ResNeXt, config={"units": [3, 4, 23, 3], "groups": 64, "squeeze_excitation": True}
+)
+
+# SE-ResNeXt-D variants with SE
+registry.register_model_config(
+    "se_resnext_d_50",
+    ResNeXt,
+    config={"units": [3, 4, 6, 3], "squeeze_excitation": True, "deep_stem": True, "avg_down": True},
+)
+registry.register_model_config(
+    "se_resnext_d_101",
+    ResNeXt,
+    config={"units": [3, 4, 23, 3], "squeeze_excitation": True, "deep_stem": True, "avg_down": True},
+)
+registry.register_model_config(
+    "se_resnext_d_152",
+    ResNeXt,
+    config={"units": [3, 8, 36, 3], "squeeze_excitation": True, "deep_stem": True, "avg_down": True},
+)