birder-0.2.1-py3-none-any.whl → birder-0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. birder/adversarial/__init__.py +13 -0
  2. birder/adversarial/base.py +101 -0
  3. birder/adversarial/deepfool.py +173 -0
  4. birder/adversarial/fgsm.py +51 -18
  5. birder/adversarial/pgd.py +79 -28
  6. birder/adversarial/simba.py +172 -0
  7. birder/common/training_cli.py +11 -3
  8. birder/common/training_utils.py +18 -1
  9. birder/inference/data_parallel.py +1 -2
  10. birder/introspection/__init__.py +10 -6
  11. birder/introspection/attention_rollout.py +122 -54
  12. birder/introspection/base.py +73 -29
  13. birder/introspection/gradcam.py +71 -100
  14. birder/introspection/guided_backprop.py +146 -72
  15. birder/introspection/transformer_attribution.py +182 -0
  16. birder/net/detection/deformable_detr.py +14 -12
  17. birder/net/detection/detr.py +7 -3
  18. birder/net/detection/rt_detr_v1.py +3 -3
  19. birder/net/detection/yolo_v3.py +6 -11
  20. birder/net/detection/yolo_v4.py +7 -18
  21. birder/net/detection/yolo_v4_tiny.py +3 -3
  22. birder/net/fastvit.py +1 -1
  23. birder/net/mim/mae_vit.py +7 -8
  24. birder/net/pit.py +1 -1
  25. birder/net/resnet_v1.py +94 -34
  26. birder/net/ssl/data2vec.py +1 -1
  27. birder/net/ssl/data2vec2.py +4 -2
  28. birder/results/gui.py +15 -2
  29. birder/scripts/predict_detection.py +33 -1
  30. birder/scripts/train.py +24 -17
  31. birder/scripts/train_barlow_twins.py +10 -7
  32. birder/scripts/train_byol.py +10 -7
  33. birder/scripts/train_capi.py +12 -9
  34. birder/scripts/train_data2vec.py +10 -7
  35. birder/scripts/train_data2vec2.py +10 -7
  36. birder/scripts/train_detection.py +42 -18
  37. birder/scripts/train_dino_v1.py +10 -7
  38. birder/scripts/train_dino_v2.py +10 -7
  39. birder/scripts/train_dino_v2_dist.py +17 -7
  40. birder/scripts/train_franca.py +10 -7
  41. birder/scripts/train_i_jepa.py +17 -13
  42. birder/scripts/train_ibot.py +10 -7
  43. birder/scripts/train_kd.py +24 -18
  44. birder/scripts/train_mim.py +11 -10
  45. birder/scripts/train_mmcr.py +10 -7
  46. birder/scripts/train_rotnet.py +10 -7
  47. birder/scripts/train_simclr.py +10 -7
  48. birder/scripts/train_vicreg.py +10 -7
  49. birder/tools/__main__.py +6 -2
  50. birder/tools/adversarial.py +147 -96
  51. birder/tools/auto_anchors.py +361 -0
  52. birder/tools/ensemble_model.py +1 -1
  53. birder/tools/introspection.py +58 -31
  54. birder/version.py +1 -1
  55. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/METADATA +2 -1
  56. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/RECORD +60 -55
  57. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/WHEEL +0 -0
  58. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/entry_points.txt +0 -0
  59. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/licenses/LICENSE +0 -0
  60. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/top_level.txt +0 -0
birder/net/detection/deformable_detr.py CHANGED
@@ -133,7 +133,7 @@ class MultiScaleDeformableAttention(nn.Module):
         self._reset_parameters()

     def _reset_parameters(self) -> None:
-        nn.init.constant_(self.sampling_offsets.weight.data, 0.0)
+        nn.init.constant_(self.sampling_offsets.weight, 0.0)
         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
         grid_init = (
@@ -147,12 +147,12 @@ class MultiScaleDeformableAttention(nn.Module):
         with torch.no_grad():
             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))

-        nn.init.constant_(self.attention_weights.weight.data, 0.0)
-        nn.init.constant_(self.attention_weights.bias.data, 0.0)
-        nn.init.xavier_uniform_(self.value_proj.weight.data)
-        nn.init.constant_(self.value_proj.bias.data, 0.0)
-        nn.init.xavier_uniform_(self.output_proj.weight.data)
-        nn.init.constant_(self.output_proj.bias.data, 0.0)
+        nn.init.constant_(self.attention_weights.weight, 0.0)
+        nn.init.constant_(self.attention_weights.bias, 0.0)
+        nn.init.xavier_uniform_(self.value_proj.weight)
+        nn.init.constant_(self.value_proj.bias, 0.0)
+        nn.init.xavier_uniform_(self.output_proj.weight)
+        nn.init.constant_(self.output_proj.bias, 0.0)

     def forward(
         self,
@@ -280,8 +280,10 @@ class DeformableTransformerDecoderLayer(nn.Module):
         q = tgt + query_pos
         k = tgt + query_pos

-        tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1), attn_mask=self_attn_mask)
-        tgt2 = tgt2[0].transpose(0, 1)
+        (tgt2, _) = self.self_attn(
+            q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1), need_weights=False, attn_mask=self_attn_mask
+        )
+        tgt2 = tgt2.transpose(0, 1)
         tgt = tgt + self.dropout(tgt2)
         tgt = self.norm1(tgt)

@@ -451,8 +453,8 @@ class DeformableTransformer(nn.Module):
            if isinstance(m, MultiScaleDeformableAttention):
                m._reset_parameters()

-        nn.init.xavier_uniform_(self.reference_points.weight.data, gain=1.0)
-        nn.init.zeros_(self.reference_points.bias.data)
+        nn.init.xavier_uniform_(self.reference_points.weight, gain=1.0)
+        nn.init.zeros_(self.reference_points.bias)

         nn.init.normal_(self.level_embed)

@@ -613,7 +615,7 @@ class Deformable_DETR(DetectionBaseNet):
            nn.init.zeros_(bbox_embed[-2].weight)
            nn.init.zeros_(bbox_embed[-2].bias)

-        nn.init.constant_(self.bbox_embed[0][-2].bias.data[2:], -2.0)
+        nn.init.constant_(self.bbox_embed[0][-2].bias[2:], -2.0)

     def reset_classifier(self, num_classes: int) -> None:
         self.num_classes = num_classes
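A note on the pattern in the initializer changes above: torch.nn.init functions already run under torch.no_grad(), so passing the nn.Parameter itself instead of its .data tensor is the form recommended by current PyTorch documentation and avoids mutating storage behind autograd's back. A minimal standalone sketch of the two forms, using a plain nn.Linear as a stand-in rather than any birder module:

    import torch
    from torch import nn

    linear = nn.Linear(8, 4)

    # Old style: mutate the underlying .data tensor (bypasses autograd bookkeeping)
    nn.init.constant_(linear.bias.data, 0.0)

    # New style: initialize the Parameter directly; nn.init functions already
    # operate under torch.no_grad(), so the .data indirection is unnecessary
    nn.init.constant_(linear.bias, 0.0)
    nn.init.xavier_uniform_(linear.weight)

    print(linear.bias)  # Parameter containing: tensor([0., 0., 0., 0.], requires_grad=True)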
birder/net/detection/detr.py CHANGED
@@ -108,7 +108,7 @@ class TransformerEncoderLayer(nn.Module):
         q = src + pos
         k = src + pos

-        (src2, _) = self.self_attn(q, k, value=src, key_padding_mask=src_key_padding_mask)
+        (src2, _) = self.self_attn(q, k, value=src, key_padding_mask=src_key_padding_mask, need_weights=False)
         src = src + self.dropout1(src2)
         src = self.norm1(src)
         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
@@ -148,11 +148,15 @@ class TransformerDecoderLayer(nn.Module):
         q = tgt + query_pos
         k = tgt + query_pos

-        (tgt2, _) = self.self_attn(q, k, value=tgt)
+        (tgt2, _) = self.self_attn(q, k, value=tgt, need_weights=False)
         tgt = tgt + self.dropout1(tgt2)
         tgt = self.norm1(tgt)
         (tgt2, _) = self.multihead_attn(
-            query=tgt + query_pos, key=memory + pos, value=memory, key_padding_mask=memory_key_padding_mask
+            query=tgt + query_pos,
+            key=memory + pos,
+            value=memory,
+            key_padding_mask=memory_key_padding_mask,
+            need_weights=False,
         )
         tgt = tgt + self.dropout2(tgt2)
         tgt = self.norm2(tgt)
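The need_weights=False argument added to these attention calls tells nn.MultiheadAttention not to materialize and average the attention-weight matrix; the second element of the returned tuple is then simply None. A small standalone demonstration of the PyTorch behaviour (not birder code):

    import torch
    from torch import nn

    attn = nn.MultiheadAttention(embed_dim=32, num_heads=4)
    x = torch.randn(10, 2, 32)  # (sequence, batch, embed_dim); batch_first defaults to False

    (out, weights) = attn(x, x, x, need_weights=False)
    print(out.shape)  # torch.Size([10, 2, 32])
    print(weights)    # None, the weights were never computed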
birder/net/detection/rt_detr_v1.py CHANGED
@@ -234,7 +234,7 @@ class TransformerEncoderLayer(nn.Module):
         q = src + pos
         k = src + pos

-        (src2, _) = self.self_attn(q, k, value=src, key_padding_mask=key_padding_mask)
+        (src2, _) = self.self_attn(q, k, value=src, key_padding_mask=key_padding_mask, need_weights=False)
         src = src + self.dropout1(src2)
         src = self.norm1(src)

@@ -465,8 +465,8 @@ class RT_DETRDecoder(nn.Module):
            nn.init.constant_(class_embed.bias, bias_value)

         for bbox_embed in self.bbox_embed:
-            nn.init.zeros_(bbox_embed[-2].weight.data)
-            nn.init.zeros_(bbox_embed[-2].bias.data)
+            nn.init.zeros_(bbox_embed[-2].weight)
+            nn.init.zeros_(bbox_embed[-2].bias)

     def set_cache_enabled(self, enabled: bool) -> None:
         self.use_cache = enabled
birder/net/detection/yolo_v3.py CHANGED
@@ -37,7 +37,7 @@ def scale_anchors(
     to_size: tuple[int, int],
 ) -> list[list[tuple[float, float]]]:
     if from_size == to_size:
-        # Avoid aliasing default anchors in case they are mutated later.
+        # Avoid aliasing default anchors in case they are mutated later
         return [list(scale) for scale in anchors]

     scale_h = to_size[0] / from_size[0]
@@ -368,14 +368,16 @@ class YOLO_v3(DetectionBaseNet):
         num_anchors = self.anchor_generator.num_anchors_per_location()
         self.head = YOLOHead(self.neck.out_channels, num_anchors, self.num_classes)

-    def adjust_size(self, new_size: tuple[int, int]) -> None:
+    def adjust_size(self, new_size: tuple[int, int], adjust_anchors: bool = False) -> None:
         if new_size == self.size:
             return

         old_size = self.size
         super().adjust_size(new_size)
-        self.anchors = scale_anchors(self.anchors, old_size, new_size)
-        self.anchor_generator.anchors = self.anchors
+
+        if adjust_anchors is True:
+            self.anchors = scale_anchors(self.anchors, old_size, new_size)
+            self.anchor_generator.anchors = self.anchors

     def freeze(self, freeze_classifier: bool = True) -> None:
         for param in self.parameters():
@@ -705,13 +707,6 @@ class YOLO_v3(DetectionBaseNet):
         neck_features = self.neck(features)
         predictions = self.head(neck_features)
         (anchors, grids, strides) = self.anchor_generator(images, neck_features)
-        if self.dynamic_size is True:
-            image_size = (images.tensors.shape[-2], images.tensors.shape[-1])
-            if image_size[0] != self.size[0] or image_size[1] != self.size[1]:
-                scale_w = image_size[1] / self.size[1]
-                scale_h = image_size[0] / self.size[0]
-                scale_tensor = torch.tensor([scale_w, scale_h], device=anchors[0].device, dtype=anchors[0].dtype)
-                anchors = [anchor * scale_tensor for anchor in anchors]

         losses: dict[str, torch.Tensor] = {}
         detections: list[dict[str, torch.Tensor]] = []
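With the new adjust_anchors flag defaulting to False, adjust_size() no longer rescales the anchor priors implicitly; callers have to opt in. The full body of scale_anchors() is not part of this diff, but the scale_h line and the aliasing comment suggest plain proportional rescaling of (width, height) pairs, roughly as in this hedged sketch (an assumption, not the actual implementation):

    def scale_anchors_sketch(
        anchors: list[list[tuple[float, float]]],
        from_size: tuple[int, int],
        to_size: tuple[int, int],
    ) -> list[list[tuple[float, float]]]:
        if from_size == to_size:
            # Copy to avoid aliasing the default anchors
            return [list(scale) for scale in anchors]

        scale_h = to_size[0] / from_size[0]
        scale_w = to_size[1] / from_size[1]

        # Assumes anchors are stored as (width, height) pairs per detection scale
        return [[(w * scale_w, h * scale_h) for (w, h) in scale] for scale in anchors]

    print(scale_anchors_sketch([[(10.0, 13.0), (16.0, 30.0)]], (416, 416), (608, 608)))
    # [[(~14.6, ~19.0), (~23.4, ~43.8)]]: anchors grow in proportion to the input size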
birder/net/detection/yolo_v4.py CHANGED
@@ -400,14 +400,8 @@ class YOLO_v4(DetectionBaseNet):
         self.ignore_thresh = 0.7

         # Loss coefficients
-        # Note: coord_coeff=0.07 matches darknet's iou_normalizer for CIoU loss.
-        # However, darknet uses squared deltas (loss = sum(delta^2) / batch) while we compute
-        # CIoU loss directly (loss = coeff * sum(ciou) / num_obj). This different formulation
-        # means darknet's obj_normalizer=1.0 overweights background loss relative to box
-        # regression in our implementation. We use a lower noobj_coeff (vs darknet's 1.0) to
-        # restore a better balance, similar to YOLOv3's noobj_coeff=0.2.
-        self.noobj_coeff = 0.3
-        self.coord_coeff = 0.07
+        self.noobj_coeff = 0.25
+        self.coord_coeff = 3.0
         self.obj_coeff = 1.0
         self.cls_coeff = 1.0

@@ -439,14 +433,16 @@ class YOLO_v4(DetectionBaseNet):
         num_anchors = self.anchor_generator.num_anchors_per_location()
         self.head = YOLOHead(self.neck.out_channels, num_anchors, self.num_classes)

-    def adjust_size(self, new_size: tuple[int, int]) -> None:
+    def adjust_size(self, new_size: tuple[int, int], adjust_anchors: bool = False) -> None:
         if new_size == self.size:
             return

         old_size = self.size
         super().adjust_size(new_size)
-        self.anchors = scale_anchors(self.anchors, old_size, new_size)
-        self.anchor_generator = YOLOAnchorGenerator(self.anchors)
+
+        if adjust_anchors is True:
+            self.anchors = scale_anchors(self.anchors, old_size, new_size)
+            self.anchor_generator = YOLOAnchorGenerator(self.anchors)

     def freeze(self, freeze_classifier: bool = True) -> None:
         for param in self.parameters():
@@ -809,13 +805,6 @@ class YOLO_v4(DetectionBaseNet):
         neck_features = self.neck(features)
         predictions = self.head(neck_features)
         (anchors, grids, strides) = self.anchor_generator(images, neck_features)
-        if self.dynamic_size is True:
-            image_size = (images.tensors.shape[-2], images.tensors.shape[-1])
-            if image_size[0] != self.size[0] or image_size[1] != self.size[1]:
-                scale_w = image_size[1] / self.size[1]
-                scale_h = image_size[0] / self.size[0]
-                scale_tensor = torch.tensor([scale_w, scale_h], device=anchors[0].device, dtype=anchors[0].dtype)
-                anchors = [anchor * scale_tensor for anchor in anchors]

         losses: dict[str, torch.Tensor] = {}
         detections: list[dict[str, torch.Tensor]] = []
birder/net/detection/yolo_v4_tiny.py CHANGED
@@ -113,9 +113,9 @@ class YOLO_v4_Tiny(YOLO_v4):
         detections_per_img = 300
         self.ignore_thresh = 0.7

-        # Loss coefficients - see YOLO v4 for detailed explanation
-        self.noobj_coeff = 0.3
-        self.coord_coeff = 0.07
+        # Loss coefficients
+        self.noobj_coeff = 0.25
+        self.coord_coeff = 3.0
         self.obj_coeff = 1.0
         self.cls_coeff = 1.0

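For context on what these coefficients weight: YOLO-style losses are conventionally a weighted sum of box-regression, objectness, no-object (background) and classification terms, so raising coord_coeff from 0.07 to 3.0 and trimming noobj_coeff shifts the balance toward box regression. A hedged sketch of that convention only; the actual birder loss code is not shown in this diff:

    import torch

    def combine_yolo_losses(
        box_loss: torch.Tensor,
        obj_loss: torch.Tensor,
        noobj_loss: torch.Tensor,
        cls_loss: torch.Tensor,
        coord_coeff: float = 3.0,
        obj_coeff: float = 1.0,
        noobj_coeff: float = 0.25,
        cls_coeff: float = 1.0,
    ) -> torch.Tensor:
        # Weighted sum of the individual terms; the coefficients trade off
        # localization accuracy against background/objectness pressure
        return coord_coeff * box_loss + obj_coeff * obj_loss + noobj_coeff * noobj_loss + cls_coeff * cls_loss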
birder/net/fastvit.py CHANGED
@@ -818,10 +818,10 @@ class FastViT(DetectorBackbone, PreTrainEncoder, MaskedTokenRetentionMixin):
         self.embedding_size = int(embed_dims[-1] * cls_ratio)
         self.classifier = self.create_classifier()

+        self.max_stride = 2 ** (len(layers) + 1)
         self.stem_stride = 4
         self.stem_width = embed_dims[0]
         self.encoding_size = int(embed_dims[-1] * cls_ratio)
-        self.max_stride = 2 ** (len(layers) + 1)

         # Weights initialization
         for m in self.modules():
birder/net/mim/mae_vit.py CHANGED
@@ -2,13 +2,12 @@
 MAE ViT, adapted from
 https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/mae.py
 and
-https://github.com/facebookresearch/mae/blob/main/models_mae.py
+https://github.com/huggingface/transformers/blob/main/src/transformers/models/vit_mae/modeling_vit_mae.py

-Paper "Masked Autoencoders Are Scalable Vision Learners",
-https://arxiv.org/abs/2111.06377
+Paper "Masked Autoencoders Are Scalable Vision Learners", https://arxiv.org/abs/2111.06377
 """

-# Reference license: MIT and Attribution-NonCommercial 4.0 International
+# Reference license: MIT and Apache-2.0

 from typing import Any
 from typing import Optional
@@ -61,7 +60,7 @@ class MAE_ViT(MIMBaseNet):
             seq_len += self.encoder.num_special_tokens
             self.decoder_pos_embed = nn.Parameter(torch.empty(1, seq_len, decoder_embed_dim).normal_(std=0.02))
         else:
-            # Fixed sin-cos embedding
+            # Fixed sin-cos embeddings
             pos_embedding = pos_embedding_sin_cos_2d(
                 h=self.size[0] // self.patch_size,
                 w=self.size[1] // self.patch_size,
@@ -124,12 +123,12 @@ class MAE_ViT(MIMBaseNet):
         mask_tokens = self.mask_token.repeat(x.size(0), ids_restore.size(1) + special_token_len - x.size(1), 1)
         x_ = torch.concat([x[:, special_token_len:, :], mask_tokens], dim=1)  # No special tokens
         x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.size(2)))  # Un-shuffle
-        x = torch.concat([x[:, :special_token_len, :], x_], dim=1)  # Append special tokens
+        x = torch.concat([x[:, :special_token_len, :], x_], dim=1)  # Re-append special tokens

-        # Add pos embed
+        # Add positional embeddings
         x = x + self.decoder_pos_embed

-        # Apply transformer
+        # Apply decoder transformer
         x = self.decoder(x)

         # Remove special tokens
birder/net/pit.py CHANGED
@@ -259,7 +259,7 @@ class PiT(DetectorBackbone):
         width = (new_size[1] - self.patch_size[1]) // self.patch_stride[1] + 1

         self.pos_embed = nn.Parameter(
-            F.interpolate(self.pos_embed.data, (height, width), mode="bicubic"), requires_grad=True
+            F.interpolate(self.pos_embed, (height, width), mode="bicubic"), requires_grad=True
         )

birder/net/resnet_v1.py CHANGED
@@ -3,6 +3,9 @@ ResNet v1, adapted from
 https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py

 Paper "Deep Residual Learning for Image Recognition", https://arxiv.org/abs/1512.03385
+and
+Paper "Bag of Tricks for Image Classification with Convolutional Neural Networks",
+https://arxiv.org/abs/1812.01187
 """

 # Reference license: BSD 3-Clause
@@ -23,34 +26,25 @@ from birder.net.base import DetectorBackbone

 class ResidualBlock(nn.Module):
     def __init__(
-        self, in_channels: int, out_channels: int, stride: tuple[int, int], bottle_neck: bool, squeeze_excitation: bool
+        self,
+        in_channels: int,
+        out_channels: int,
+        stride: tuple[int, int],
+        bottle_neck: bool,
+        squeeze_excitation: bool,
+        avg_down: bool,
     ) -> None:
         super().__init__()
         if bottle_neck is True:
             self.block1 = nn.Sequential(
                 Conv2dNormActivation(
-                    in_channels,
-                    out_channels // 4,
-                    kernel_size=(1, 1),
-                    stride=(1, 1),
-                    padding=(0, 0),
-                    bias=False,
+                    in_channels, out_channels // 4, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False
                 ),
                 Conv2dNormActivation(
-                    out_channels // 4,
-                    out_channels // 4,
-                    kernel_size=(3, 3),
-                    stride=stride,
-                    padding=(1, 1),
-                    bias=False,
+                    out_channels // 4, out_channels // 4, kernel_size=(3, 3), stride=stride, padding=(1, 1), bias=False
                 ),
                 nn.Conv2d(
-                    out_channels // 4,
-                    out_channels,
-                    kernel_size=(1, 1),
-                    stride=(1, 1),
-                    padding=(0, 0),
-                    bias=False,
+                    out_channels // 4, out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False
                 ),
                 nn.BatchNorm2d(out_channels),
             )
@@ -67,10 +61,19 @@ class ResidualBlock(nn.Module):
         if in_channels == out_channels:
             self.block2 = nn.Identity()
         else:
-            self.block2 = nn.Sequential(
-                nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=stride, padding=(0, 0), bias=False),
-                nn.BatchNorm2d(out_channels),
-            )
+            if avg_down is True and stride != (1, 1):
+                # ResNet-D: Apply average pooling before 1x1 conv for downsampling
+                self.block2 = nn.Sequential(
+                    nn.AvgPool2d(kernel_size=2, stride=stride, ceil_mode=True, count_include_pad=False),
+                    nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
+                    nn.BatchNorm2d(out_channels),
+                )
+            else:
+                # Standard ResNet: Use strided 1x1 conv
+                self.block2 = nn.Sequential(
+                    nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=stride, padding=(0, 0), bias=False),
+                    nn.BatchNorm2d(out_channels),
+                )

         self.relu = nn.ReLU(inplace=True)
         if squeeze_excitation is True:
@@ -107,21 +110,30 @@ class ResNet_v1(DetectorBackbone):
         filter_list: list[int] = self.config["filter_list"]
         units: list[int] = self.config["units"]
         pooling_param: Optional[float] = self.config.get("pooling_param", None)
+        deep_stem: bool = self.config.get("deep_stem", False)
+        avg_down: bool = self.config.get("avg_down", False)

         assert len(units) + 1 == len(filter_list)
         num_unit = len(units)

-        self.stem = nn.Sequential(
-            Conv2dNormActivation(
-                self.input_channels,
-                filter_list[0],
-                kernel_size=(7, 7),
-                stride=(2, 2),
-                padding=(3, 3),
-                bias=False,
-            ),
-            nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
-        )
+        if deep_stem is True:
+            # ResNet-D
+            self.stem = nn.Sequential(
+                Conv2dNormActivation(
+                    self.input_channels, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False
+                ),
+                Conv2dNormActivation(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
+                Conv2dNormActivation(32, filter_list[0], kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
+                nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+            )
+        else:
+            # Standard ResNet stem: 7x7 conv
+            self.stem = nn.Sequential(
+                Conv2dNormActivation(
+                    self.input_channels, filter_list[0], kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
+                ),
+                nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+            )

         # Generate body layers
         stages: OrderedDict[str, nn.Module] = OrderedDict()
@@ -140,6 +152,7 @@ class ResNet_v1(DetectorBackbone):
                     stride=stride,
                     bottle_neck=bottle_neck,
                     squeeze_excitation=squeeze_excitation,
+                    avg_down=avg_down,
                 )
             )
             for _ in range(1, units[i]):
@@ -150,6 +163,7 @@ class ResNet_v1(DetectorBackbone):
                         stride=(1, 1),
                         bottle_neck=bottle_neck,
                         squeeze_excitation=squeeze_excitation,
+                        avg_down=avg_down,
                     )
                 )

@@ -242,6 +256,52 @@ registry.register_model_config(
     config={"bottle_neck": True, "filter_list": [64, 256, 512, 1024, 2048], "units": [3, 30, 48, 8]},
 )

+# ResNet-D variants (From: Bag of Tricks for Image Classification with Convolutional Neural Networks)
+registry.register_model_config(
+    "resnet_d_50",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 6, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "resnet_d_101",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 23, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "resnet_d_152",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 8, 36, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "resnet_d_200",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 24, 36, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+
 registry.register_weights(
     "resnet_v1_50_arabian-peninsula",
     {
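The ResNet-D additions touch two places: a deep stem (three 3x3 convolutions in place of the single 7x7) and an average-pool downsampling shortcut. A standalone shape check for the shortcut variant, sketched with plain torch.nn modules rather than the birder classes: both paths halve the spatial resolution, but the ResNet-D form no longer discards three of every four positions in a strided 1x1 projection.

    import torch
    from torch import nn

    in_channels, out_channels, stride = 64, 128, (2, 2)

    standard_shortcut = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=stride, bias=False),
        nn.BatchNorm2d(out_channels),
    )
    resnet_d_shortcut = nn.Sequential(
        nn.AvgPool2d(kernel_size=2, stride=stride, ceil_mode=True, count_include_pad=False),
        nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=(1, 1), bias=False),
        nn.BatchNorm2d(out_channels),
    )

    x = torch.randn(1, in_channels, 56, 56)
    print(standard_shortcut(x).shape)  # torch.Size([1, 128, 28, 28])
    print(resnet_d_shortcut(x).shape)  # torch.Size([1, 128, 28, 28])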
birder/net/ssl/data2vec.py CHANGED
@@ -78,7 +78,7 @@ class Data2Vec(SSLBaseNet):
         if self.normalize_targets is True:
             y = F.layer_norm(y.float(), y.shape[-1:])

-        mask = ~mask.bool()
+        mask = mask.bool()
         x = x[mask]
         y = y[mask]

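Regarding the mask flip above: assuming the usual masked-image-modeling convention that a value of 1 marks a masked-out token, indexing with mask.bool() keeps exactly the masked positions, so the regression loss is now computed on masked tokens rather than on the visible ones that ~mask.bool() selected. A tiny standalone illustration of the indexing:

    import torch

    x = torch.arange(12.0).reshape(2, 3, 2)      # (batch, tokens, dim)
    mask = torch.tensor([[1, 0, 1], [0, 1, 1]])  # assumption: 1 marks a masked token

    print(x[mask.bool()].shape)   # torch.Size([4, 2]), the 4 masked tokens
    print(x[~mask.bool()].shape)  # torch.Size([2, 2]), the 2 visible tokens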
birder/net/ssl/data2vec2.py CHANGED
@@ -7,7 +7,6 @@ https://arxiv.org/abs/2212.07525

 Changes from original:
 * Target CLS is taken just from the last layer
-* Replaced instance norm (1st of the IN -> AVG -> LM) with layer norm
 """

 # Reference license: MIT
@@ -140,7 +139,10 @@ class Data2Vec2(SSLBaseNet):
         y = y[..., -self.average_top_k_layers :]  # Take the last k layers
         y = y.permute(3, 0, 1, 2)

-        y = [F.layer_norm(t.float(), t.shape[-1:]) for t in y[:-1]] + [y[-1]]
+        # Note: the backbone already LN-normalizes the final layer (per-token),
+        # but data2vec2 uses per-layer instance norm across tokens (per-channel)
+        # before averaging (IN -> AVG -> LN), so we keep IN for all K layers.
+        y = [F.instance_norm(t.float().transpose(1, 2)).transpose(1, 2) for t in y]
         y = sum(y) / len(y)
         y = F.layer_norm(y.float(), y.shape[-1:])

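A shapes-only sketch of the target pipeline described by the new comment (standalone, not birder code): per-layer instance norm across tokens, averaging over the K layers, then a final layer norm. F.instance_norm expects (N, C, L), hence the transpose around the call.

    import torch
    import torch.nn.functional as F

    batch, tokens, channels, k = 2, 16, 8, 3
    layer_outputs = [torch.randn(batch, tokens, channels) for _ in range(k)]

    # IN: normalize each channel over the token dimension, per layer
    y = [F.instance_norm(t.float().transpose(1, 2)).transpose(1, 2) for t in layer_outputs]
    y = sum(y) / len(y)                # AVG over the K layers
    y = F.layer_norm(y, y.shape[-1:])  # final LN per token
    print(y.shape)  # torch.Size([2, 16, 8])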
birder/results/gui.py CHANGED
@@ -31,6 +31,7 @@ def show_detections(
     detection: dict[str, torch.Tensor],
     class_to_idx: dict[str, int],
     score_threshold: float = 0.5,
+    class_min_scores: Optional[dict[str, float]] = None,
     color_list: Optional[list[tuple[int, ...]]] = None,
     show: bool = True,
 ) -> tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]:
@@ -38,10 +39,22 @@ def show_detections(
     idx_to_class = dict(zip(class_to_idx.values(), class_to_idx.keys()))

     scores = detection["scores"]
-    idxs = torch.where(scores > score_threshold)
+    labels_all = detection["labels"]
+
+    # Apply per-class minimum scores if provided, otherwise use global threshold
+    if class_min_scores is not None and len(class_min_scores) > 0:
+        mask = torch.zeros(len(scores), dtype=torch.bool)
+        for i, (score, label) in enumerate(zip(scores, labels_all)):
+            class_name = idx_to_class[label.item()]
+            min_score = class_min_scores.get(class_name, score_threshold)
+            mask[i] = score > min_score
+        idxs = torch.where(mask)[0]
+    else:
+        idxs = torch.where(scores > score_threshold)
+
     scores = scores[idxs]
     boxes = detection["boxes"][idxs]
-    labels = detection["labels"][idxs]
+    labels = labels_all[idxs]
     label_names = [f"{idx_to_class[i.item()]}: {s:.4f}" for i, s in zip(labels, scores)]
     if color_list is not None:
         colors = [color_list[label] for label in labels]
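A hedged sketch of the per-class filtering added above, applied to a dummy detection dict (standalone; it does not call show_detections itself). Classes without an entry in class_min_scores fall back to the global threshold:

    import torch

    class_to_idx = {"person": 0, "car": 1}
    idx_to_class = {v: k for k, v in class_to_idx.items()}
    detection = {
        "scores": torch.tensor([0.90, 0.60, 0.45]),
        "labels": torch.tensor([0, 1, 1]),
    }

    score_threshold = 0.5
    class_min_scores = {"person": 0.75, "car": 0.5}

    scores = detection["scores"]
    labels_all = detection["labels"]
    mask = torch.zeros(len(scores), dtype=torch.bool)
    for i, (score, label) in enumerate(zip(scores, labels_all)):
        min_score = class_min_scores.get(idx_to_class[label.item()], score_threshold)
        mask[i] = score > min_score

    print(torch.where(mask)[0])  # tensor([0, 1]): the 0.45 car falls below its 0.5 class threshold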
birder/scripts/predict_detection.py CHANGED
@@ -101,6 +101,17 @@ def predict(args: argparse.Namespace) -> None:

     score_threshold = args.min_score

+    # Process per-class minimum scores
+    class_min_scores: dict[str, float] = {}
+    if args.class_min_score is not None:
+        for class_name, score_str in args.class_min_score:
+            score = float(score_str)
+            if class_name not in class_to_idx:
+                logger.warning(f"Class '{class_name}' from --class-min-score not found in model classes")
+            else:
+                class_min_scores[class_name] = score
+                logger.info(f"Using minimum score {score} for class '{class_name}'")
+
     # Set label colors
     cmap = plt.get_cmap("jet")
     color_list = []
@@ -157,6 +168,7 @@ def predict(args: argparse.Namespace) -> None:
             detection,
             class_to_idx=class_to_idx,
             score_threshold=score_threshold,
+            class_min_scores=class_min_scores,
             color_list=color_list,
         )

@@ -224,7 +236,10 @@ def get_args_parser() -> argparse.ArgumentParser:
             "-e 0 --min-score 0.25 --gpu --show --shuffle data/detection_data/validation\n"
             "python predict_detection.py --network faster_rcnn -t coco --backbone csp_resnet_50 "
             "--backbone-tag imagenet1k -e 0 --batch-size 1 --gpu --gpu-id 1 "
-            "--coco-json-path data/detection_data/validation_annotations_coco.json data/detection_data"
+            "--coco-json-path data/detection_data/validation_annotations_coco.json data/detection_data\n"
+            "python predict_detection.py -n yolo_v4 --backbone csp_resnet_50 --backbone-tag imagenet1k -t coco "
+            " --min-score 0.4 --class-min-score person 0.75 --class-min-score car 0.3 --batch-size 1 --show "
+            "--shuffle ~/Datasets/cocodataset/val2017\n"
         ),
         formatter_class=cli.ArgumentHelpFormatter,
     )
@@ -284,6 +299,13 @@ def get_args_parser() -> argparse.ArgumentParser:
         "--fast-matmul", default=False, action="store_true", help="use fast matrix multiplication (affects precision)"
     )
     parser.add_argument("--min-score", type=float, default=0.5, help="prediction score threshold")
+    parser.add_argument(
+        "--class-min-score",
+        action="append",
+        nargs=2,
+        metavar=("CLASS", "SCORE"),
+        help="set custom minimum score for specific class (can be used multiple times)",
+    )
     parser.add_argument(
         "--size",
         type=int,
@@ -342,6 +364,16 @@ def validate_args(args: argparse.Namespace) -> None:
         )
     if args.min_score >= 1 or args.min_score <= 0.0:
         raise cli.ValidationError(f"--min-score must be in range of (0, 1.0), got {args.min_score}")
+    if args.class_min_score is not None:
+        for class_name, score_str in args.class_min_score:
+            try:
+                score = float(score_str)
+                if score >= 1.0 or score <= 0.0:
+                    raise cli.ValidationError(
+                        f"--class-min-score for '{class_name}' must be in range of (0, 1.0), got {score}"
+                    )
+            except ValueError as e:
+                raise cli.ValidationError(f"--class-min-score value must be a valid float, got '{score_str}'") from e
     if args.parallel is True and args.gpu is False:
         raise cli.ValidationError("--parallel requires --gpu to be set")
     if args.parallel is True and args.compile is True:
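A standalone argparse demonstration of how the new flag parses (not the birder parser itself): action="append" combined with nargs=2 yields a list of [CLASS, SCORE] string pairs, which is why both predict() and validate_args() convert SCORE with float():

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--class-min-score", action="append", nargs=2, metavar=("CLASS", "SCORE"))

    args = parser.parse_args(["--class-min-score", "person", "0.75", "--class-min-score", "car", "0.3"])
    print(args.class_min_score)  # [['person', '0.75'], ['car', '0.3']]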