birder 0.2.1-py3-none-any.whl → 0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/adversarial/__init__.py +13 -0
- birder/adversarial/base.py +101 -0
- birder/adversarial/deepfool.py +173 -0
- birder/adversarial/fgsm.py +51 -18
- birder/adversarial/pgd.py +79 -28
- birder/adversarial/simba.py +172 -0
- birder/common/training_cli.py +11 -3
- birder/common/training_utils.py +18 -1
- birder/inference/data_parallel.py +1 -2
- birder/introspection/__init__.py +10 -6
- birder/introspection/attention_rollout.py +122 -54
- birder/introspection/base.py +73 -29
- birder/introspection/gradcam.py +71 -100
- birder/introspection/guided_backprop.py +146 -72
- birder/introspection/transformer_attribution.py +182 -0
- birder/net/detection/deformable_detr.py +14 -12
- birder/net/detection/detr.py +7 -3
- birder/net/detection/rt_detr_v1.py +3 -3
- birder/net/detection/yolo_v3.py +6 -11
- birder/net/detection/yolo_v4.py +7 -18
- birder/net/detection/yolo_v4_tiny.py +3 -3
- birder/net/fastvit.py +1 -1
- birder/net/mim/mae_vit.py +7 -8
- birder/net/pit.py +1 -1
- birder/net/resnet_v1.py +94 -34
- birder/net/ssl/data2vec.py +1 -1
- birder/net/ssl/data2vec2.py +4 -2
- birder/results/gui.py +15 -2
- birder/scripts/predict_detection.py +33 -1
- birder/scripts/train.py +24 -17
- birder/scripts/train_barlow_twins.py +10 -7
- birder/scripts/train_byol.py +10 -7
- birder/scripts/train_capi.py +12 -9
- birder/scripts/train_data2vec.py +10 -7
- birder/scripts/train_data2vec2.py +10 -7
- birder/scripts/train_detection.py +42 -18
- birder/scripts/train_dino_v1.py +10 -7
- birder/scripts/train_dino_v2.py +10 -7
- birder/scripts/train_dino_v2_dist.py +17 -7
- birder/scripts/train_franca.py +10 -7
- birder/scripts/train_i_jepa.py +17 -13
- birder/scripts/train_ibot.py +10 -7
- birder/scripts/train_kd.py +24 -18
- birder/scripts/train_mim.py +11 -10
- birder/scripts/train_mmcr.py +10 -7
- birder/scripts/train_rotnet.py +10 -7
- birder/scripts/train_simclr.py +10 -7
- birder/scripts/train_vicreg.py +10 -7
- birder/tools/__main__.py +6 -2
- birder/tools/adversarial.py +147 -96
- birder/tools/auto_anchors.py +361 -0
- birder/tools/ensemble_model.py +1 -1
- birder/tools/introspection.py +58 -31
- birder/version.py +1 -1
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/METADATA +2 -1
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/RECORD +60 -55
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/WHEEL +0 -0
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/entry_points.txt +0 -0
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/top_level.txt +0 -0
birder/net/detection/deformable_detr.py
CHANGED

@@ -133,7 +133,7 @@ class MultiScaleDeformableAttention(nn.Module):
         self._reset_parameters()

     def _reset_parameters(self) -> None:
-        nn.init.constant_(self.sampling_offsets.weight
+        nn.init.constant_(self.sampling_offsets.weight, 0.0)
         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
         grid_init = (
@@ -147,12 +147,12 @@ class MultiScaleDeformableAttention(nn.Module):
         with torch.no_grad():
             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))

-        nn.init.constant_(self.attention_weights.weight
-        nn.init.constant_(self.attention_weights.bias
-        nn.init.xavier_uniform_(self.value_proj.weight
-        nn.init.constant_(self.value_proj.bias
-        nn.init.xavier_uniform_(self.output_proj.weight
-        nn.init.constant_(self.output_proj.bias
+        nn.init.constant_(self.attention_weights.weight, 0.0)
+        nn.init.constant_(self.attention_weights.bias, 0.0)
+        nn.init.xavier_uniform_(self.value_proj.weight)
+        nn.init.constant_(self.value_proj.bias, 0.0)
+        nn.init.xavier_uniform_(self.output_proj.weight)
+        nn.init.constant_(self.output_proj.bias, 0.0)

     def forward(
         self,
@@ -280,8 +280,10 @@ class DeformableTransformerDecoderLayer(nn.Module):
         q = tgt + query_pos
         k = tgt + query_pos

-        tgt2 = self.self_attn(
-
+        (tgt2, _) = self.self_attn(
+            q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1), need_weights=False, attn_mask=self_attn_mask
+        )
+        tgt2 = tgt2.transpose(0, 1)
         tgt = tgt + self.dropout(tgt2)
         tgt = self.norm1(tgt)

@@ -451,8 +453,8 @@ class DeformableTransformer(nn.Module):
             if isinstance(m, MultiScaleDeformableAttention):
                 m._reset_parameters()

-        nn.init.xavier_uniform_(self.reference_points.weight
-        nn.init.zeros_(self.reference_points.bias
+        nn.init.xavier_uniform_(self.reference_points.weight, gain=1.0)
+        nn.init.zeros_(self.reference_points.bias)

         nn.init.normal_(self.level_embed)

@@ -613,7 +615,7 @@ class Deformable_DETR(DetectionBaseNet):
             nn.init.zeros_(bbox_embed[-2].weight)
             nn.init.zeros_(bbox_embed[-2].bias)

-        nn.init.constant_(self.bbox_embed[0][-2].bias
+        nn.init.constant_(self.bbox_embed[0][-2].bias[2:], -2.0)

     def reset_classifier(self, num_classes: int) -> None:
        self.num_classes = num_classes
birder/net/detection/detr.py
CHANGED

@@ -108,7 +108,7 @@ class TransformerEncoderLayer(nn.Module):
         q = src + pos
         k = src + pos

-        (src2, _) = self.self_attn(q, k, value=src, key_padding_mask=src_key_padding_mask)
+        (src2, _) = self.self_attn(q, k, value=src, key_padding_mask=src_key_padding_mask, need_weights=False)
         src = src + self.dropout1(src2)
         src = self.norm1(src)
         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
@@ -148,11 +148,15 @@ class TransformerDecoderLayer(nn.Module):
         q = tgt + query_pos
         k = tgt + query_pos

-        (tgt2, _) = self.self_attn(q, k, value=tgt)
+        (tgt2, _) = self.self_attn(q, k, value=tgt, need_weights=False)
         tgt = tgt + self.dropout1(tgt2)
         tgt = self.norm1(tgt)
         (tgt2, _) = self.multihead_attn(
-            query=tgt + query_pos,
+            query=tgt + query_pos,
+            key=memory + pos,
+            value=memory,
+            key_padding_mask=memory_key_padding_mask,
+            need_weights=False,
         )
         tgt = tgt + self.dropout2(tgt2)
         tgt = self.norm2(tgt)

birder/net/detection/rt_detr_v1.py
CHANGED

@@ -234,7 +234,7 @@ class TransformerEncoderLayer(nn.Module):
         q = src + pos
         k = src + pos

-        (src2, _) = self.self_attn(q, k, value=src, key_padding_mask=key_padding_mask)
+        (src2, _) = self.self_attn(q, k, value=src, key_padding_mask=key_padding_mask, need_weights=False)
         src = src + self.dropout1(src2)
         src = self.norm1(src)

@@ -465,8 +465,8 @@ class RT_DETRDecoder(nn.Module):
             nn.init.constant_(class_embed.bias, bias_value)

         for bbox_embed in self.bbox_embed:
-            nn.init.zeros_(bbox_embed[-2].weight
-            nn.init.zeros_(bbox_embed[-2].bias
+            nn.init.zeros_(bbox_embed[-2].weight)
+            nn.init.zeros_(bbox_embed[-2].bias)

     def set_cache_enabled(self, enabled: bool) -> None:
         self.use_cache = enabled
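The DETR decoder cross-attention shown above adds positional information to queries and keys but leaves the values as the raw encoder features, and `need_weights=False` again avoids computing the attention map. A small sketch of that call with hypothetical shapes (the padding mask marks padded encoder positions with `True`):

```python
import torch
from torch import nn

# Illustrative DETR-style cross-attention call (sequence-first layout, toy shapes).
embed_dim, n_heads, num_queries, hw, batch = 256, 8, 100, 950, 2
cross_attn = nn.MultiheadAttention(embed_dim, n_heads)

tgt = torch.randn(num_queries, batch, embed_dim)       # decoder queries
query_pos = torch.randn(num_queries, batch, embed_dim)
memory = torch.randn(hw, batch, embed_dim)             # flattened encoder features
pos = torch.randn(hw, batch, embed_dim)                # spatial positional encoding
padding = torch.zeros(batch, hw, dtype=torch.bool)     # True = ignore that position

# Positional embeddings go into queries and keys only; the output is a
# weighted sum of the un-modified `memory` values.
(tgt2, _) = cross_attn(
    query=tgt + query_pos,
    key=memory + pos,
    value=memory,
    key_padding_mask=padding,
    need_weights=False,
)
```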
birder/net/detection/yolo_v3.py
CHANGED

@@ -37,7 +37,7 @@ def scale_anchors(
     to_size: tuple[int, int],
 ) -> list[list[tuple[float, float]]]:
     if from_size == to_size:
-        # Avoid aliasing default anchors in case they are mutated later
+        # Avoid aliasing default anchors in case they are mutated later
         return [list(scale) for scale in anchors]

     scale_h = to_size[0] / from_size[0]
@@ -368,14 +368,16 @@ class YOLO_v3(DetectionBaseNet):
         num_anchors = self.anchor_generator.num_anchors_per_location()
         self.head = YOLOHead(self.neck.out_channels, num_anchors, self.num_classes)

-    def adjust_size(self, new_size: tuple[int, int]) -> None:
+    def adjust_size(self, new_size: tuple[int, int], adjust_anchors: bool = False) -> None:
         if new_size == self.size:
             return

         old_size = self.size
         super().adjust_size(new_size)
-
-
+
+        if adjust_anchors is True:
+            self.anchors = scale_anchors(self.anchors, old_size, new_size)
+            self.anchor_generator.anchors = self.anchors

     def freeze(self, freeze_classifier: bool = True) -> None:
         for param in self.parameters():
@@ -705,13 +707,6 @@ class YOLO_v3(DetectionBaseNet):
         neck_features = self.neck(features)
         predictions = self.head(neck_features)
         (anchors, grids, strides) = self.anchor_generator(images, neck_features)
-        if self.dynamic_size is True:
-            image_size = (images.tensors.shape[-2], images.tensors.shape[-1])
-            if image_size[0] != self.size[0] or image_size[1] != self.size[1]:
-                scale_w = image_size[1] / self.size[1]
-                scale_h = image_size[0] / self.size[0]
-                scale_tensor = torch.tensor([scale_w, scale_h], device=anchors[0].device, dtype=anchors[0].dtype)
-                anchors = [anchor * scale_tensor for anchor in anchors]

         losses: dict[str, torch.Tensor] = {}
         detections: list[dict[str, torch.Tensor]] = []
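For context, a standalone sketch of the anchor rescaling that `scale_anchors` performs and that `adjust_size(..., adjust_anchors=True)` now triggers. Only part of the real helper is visible in this diff, so the body below is illustrative and assumes each anchor is a `(width, height)` pair:

```python
# Illustrative re-implementation of the visible parts of scale_anchors (not birder code).
def rescale_anchors(
    anchors: list[list[tuple[float, float]]],
    from_size: tuple[int, int],
    to_size: tuple[int, int],
) -> list[list[tuple[float, float]]]:
    if from_size == to_size:
        # Copy to avoid aliasing the default anchors
        return [list(scale) for scale in anchors]

    scale_h = to_size[0] / from_size[0]
    scale_w = to_size[1] / from_size[1]
    return [[(w * scale_w, h * scale_h) for (w, h) in scale] for scale in anchors]


# Example: anchors tuned for 608x608 input, network resized to 416x416
base = [[(10.0, 13.0), (16.0, 30.0)], [(116.0, 90.0), (156.0, 198.0)]]
print(rescale_anchors(base, (608, 608), (416, 416)))
```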
birder/net/detection/yolo_v4.py
CHANGED

@@ -400,14 +400,8 @@ class YOLO_v4(DetectionBaseNet):
         self.ignore_thresh = 0.7

         # Loss coefficients
-
-
-        # CIoU loss directly (loss = coeff * sum(ciou) / num_obj). This different formulation
-        # means darknet's obj_normalizer=1.0 overweights background loss relative to box
-        # regression in our implementation. We use a lower noobj_coeff (vs darknet's 1.0) to
-        # restore a better balance, similar to YOLOv3's noobj_coeff=0.2.
-        self.noobj_coeff = 0.3
-        self.coord_coeff = 0.07
+        self.noobj_coeff = 0.25
+        self.coord_coeff = 3.0
         self.obj_coeff = 1.0
         self.cls_coeff = 1.0

@@ -439,14 +433,16 @@ class YOLO_v4(DetectionBaseNet):
         num_anchors = self.anchor_generator.num_anchors_per_location()
         self.head = YOLOHead(self.neck.out_channels, num_anchors, self.num_classes)

-    def adjust_size(self, new_size: tuple[int, int]) -> None:
+    def adjust_size(self, new_size: tuple[int, int], adjust_anchors: bool = False) -> None:
         if new_size == self.size:
             return

         old_size = self.size
         super().adjust_size(new_size)
-
-
+
+        if adjust_anchors is True:
+            self.anchors = scale_anchors(self.anchors, old_size, new_size)
+            self.anchor_generator = YOLOAnchorGenerator(self.anchors)

     def freeze(self, freeze_classifier: bool = True) -> None:
         for param in self.parameters():
@@ -809,13 +805,6 @@ class YOLO_v4(DetectionBaseNet):
         neck_features = self.neck(features)
         predictions = self.head(neck_features)
         (anchors, grids, strides) = self.anchor_generator(images, neck_features)
-        if self.dynamic_size is True:
-            image_size = (images.tensors.shape[-2], images.tensors.shape[-1])
-            if image_size[0] != self.size[0] or image_size[1] != self.size[1]:
-                scale_w = image_size[1] / self.size[1]
-                scale_h = image_size[0] / self.size[0]
-                scale_tensor = torch.tensor([scale_w, scale_h], device=anchors[0].device, dtype=anchors[0].dtype)
-                anchors = [anchor * scale_tensor for anchor in anchors]

         losses: dict[str, torch.Tensor] = {}
         detections: list[dict[str, torch.Tensor]] = []

birder/net/detection/yolo_v4_tiny.py
CHANGED

@@ -113,9 +113,9 @@ class YOLO_v4_Tiny(YOLO_v4):
         detections_per_img = 300
         self.ignore_thresh = 0.7

-        # Loss coefficients
-        self.noobj_coeff = 0.
-        self.coord_coeff = 0
+        # Loss coefficients
+        self.noobj_coeff = 0.25
+        self.coord_coeff = 3.0
         self.obj_coeff = 1.0
         self.cls_coeff = 1.0
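Both YOLO variants retune the same four loss coefficients. How they are applied inside birder's loss is not shown in this diff, but the usual role of such weights is a weighted sum of the per-component terms, roughly as in this illustrative sketch (the individual loss values below are made up):

```python
import torch

# Illustrative only: how coefficients like these typically combine in a YOLO-style loss.
noobj_coeff, coord_coeff, obj_coeff, cls_coeff = 0.25, 3.0, 1.0, 1.0

loss_box = torch.tensor(0.8)    # box regression (e.g. CIoU) term
loss_obj = torch.tensor(0.5)    # objectness on positive anchors
loss_noobj = torch.tensor(0.2)  # objectness on background anchors
loss_cls = torch.tensor(0.3)    # classification term

total = (
    coord_coeff * loss_box
    + obj_coeff * loss_obj
    + noobj_coeff * loss_noobj
    + cls_coeff * loss_cls
)
```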
birder/net/fastvit.py
CHANGED

@@ -818,10 +818,10 @@ class FastViT(DetectorBackbone, PreTrainEncoder, MaskedTokenRetentionMixin):
         self.embedding_size = int(embed_dims[-1] * cls_ratio)
         self.classifier = self.create_classifier()

+        self.max_stride = 2 ** (len(layers) + 1)
         self.stem_stride = 4
         self.stem_width = embed_dims[0]
         self.encoding_size = int(embed_dims[-1] * cls_ratio)
-        self.max_stride = 2 ** (len(layers) + 1)

         # Weights initialization
         for m in self.modules():
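As a sanity check on the formula: for a four-stage configuration (`len(layers) == 4`) this gives `max_stride = 2 ** (4 + 1) = 32`, consistent with a stride-4 stem followed by three 2x inter-stage downsamples (4 · 2 · 2 · 2 = 32). The hunk above only moves where the attribute is assigned; its value is unchanged.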
birder/net/mim/mae_vit.py
CHANGED

@@ -2,13 +2,12 @@
 MAE ViT, adapted from
 https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/mae.py
 and
-https://github.com/
+https://github.com/huggingface/transformers/blob/main/src/transformers/models/vit_mae/modeling_vit_mae.py

-Paper "Masked Autoencoders Are Scalable Vision Learners",
-https://arxiv.org/abs/2111.06377
+Paper "Masked Autoencoders Are Scalable Vision Learners", https://arxiv.org/abs/2111.06377
 """

-# Reference license: MIT and
+# Reference license: MIT and Apache-2.0

 from typing import Any
 from typing import Optional
@@ -61,7 +60,7 @@ class MAE_ViT(MIMBaseNet):
             seq_len += self.encoder.num_special_tokens
             self.decoder_pos_embed = nn.Parameter(torch.empty(1, seq_len, decoder_embed_dim).normal_(std=0.02))
         else:
-            # Fixed sin-cos
+            # Fixed sin-cos embeddings
             pos_embedding = pos_embedding_sin_cos_2d(
                 h=self.size[0] // self.patch_size,
                 w=self.size[1] // self.patch_size,
@@ -124,12 +123,12 @@ class MAE_ViT(MIMBaseNet):
         mask_tokens = self.mask_token.repeat(x.size(0), ids_restore.size(1) + special_token_len - x.size(1), 1)
         x_ = torch.concat([x[:, special_token_len:, :], mask_tokens], dim=1)  # No special tokens
         x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.size(2)))  # Un-shuffle
-        x = torch.concat([x[:, :special_token_len, :], x_], dim=1)  #
+        x = torch.concat([x[:, :special_token_len, :], x_], dim=1)  # Re-append special tokens

-        # Add
+        # Add positional embeddings
         x = x + self.decoder_pos_embed

-        # Apply transformer
+        # Apply decoder transformer
         x = self.decoder(x)

         # Remove special tokens
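The decoder path touched above relies on `torch.gather` with `ids_restore` to undo the random shuffling of patch tokens before adding positional embeddings. A self-contained toy version of that un-shuffle step (random data, no special tokens, shapes are illustrative):

```python
import torch

# Toy illustration of the MAE decoder un-shuffle: visible tokens plus mask
# tokens are reordered back to the original patch order via ids_restore.
(B, N, D, num_visible) = (1, 8, 4, 2)

ids_shuffle = torch.randperm(N).unsqueeze(0)     # random patch order used for masking
ids_restore = torch.argsort(ids_shuffle, dim=1)  # inverse permutation

visible = torch.randn(B, num_visible, D)         # encoder output for the kept tokens
mask_token = torch.zeros(1, 1, D)
mask_tokens = mask_token.repeat(B, N - num_visible, 1)

x_ = torch.concat([visible, mask_tokens], dim=1)                                 # shuffled order
x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, D))    # un-shuffle
assert x_.shape == (B, N, D)                                                     # original order
```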
birder/net/pit.py
CHANGED

@@ -259,7 +259,7 @@ class PiT(DetectorBackbone):
         width = (new_size[1] - self.patch_size[1]) // self.patch_stride[1] + 1

         self.pos_embed = nn.Parameter(
-            F.interpolate(self.pos_embed
+            F.interpolate(self.pos_embed, (height, width), mode="bicubic"), requires_grad=True
         )

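The completed call resizes PiT's grid-shaped positional embedding to the new token grid with bicubic interpolation. A minimal sketch of the same operation on a toy `(1, C, H, W)` tensor (the real module resizes its existing parameter in place):

```python
import torch
import torch.nn.functional as F
from torch import nn

# Illustrative resize of a (1, C, H, W) positional-embedding grid,
# mirroring the completed F.interpolate call above (toy values).
pos_embed = torch.zeros(1, 64, 14, 14)

new_hw = (16, 12)  # e.g. derived from the new input size, patch size and stride
resized = nn.Parameter(F.interpolate(pos_embed, new_hw, mode="bicubic"), requires_grad=True)
print(resized.shape)  # torch.Size([1, 64, 16, 12])
```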
birder/net/resnet_v1.py
CHANGED

@@ -3,6 +3,9 @@ ResNet v1, adapted from
 https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py

 Paper "Deep Residual Learning for Image Recognition", https://arxiv.org/abs/1512.03385
+and
+Paper "Bag of Tricks for Image Classification with Convolutional Neural Networks",
+https://arxiv.org/abs/1812.01187
 """

 # Reference license: BSD 3-Clause
@@ -23,34 +26,25 @@ from birder.net.base import DetectorBackbone

 class ResidualBlock(nn.Module):
     def __init__(
-        self,
+        self,
+        in_channels: int,
+        out_channels: int,
+        stride: tuple[int, int],
+        bottle_neck: bool,
+        squeeze_excitation: bool,
+        avg_down: bool,
     ) -> None:
         super().__init__()
         if bottle_neck is True:
             self.block1 = nn.Sequential(
                 Conv2dNormActivation(
-                    in_channels,
-                    out_channels // 4,
-                    kernel_size=(1, 1),
-                    stride=(1, 1),
-                    padding=(0, 0),
-                    bias=False,
+                    in_channels, out_channels // 4, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False
                 ),
                 Conv2dNormActivation(
-                    out_channels // 4,
-                    out_channels // 4,
-                    kernel_size=(3, 3),
-                    stride=stride,
-                    padding=(1, 1),
-                    bias=False,
+                    out_channels // 4, out_channels // 4, kernel_size=(3, 3), stride=stride, padding=(1, 1), bias=False
                 ),
                 nn.Conv2d(
-                    out_channels // 4,
-                    out_channels,
-                    kernel_size=(1, 1),
-                    stride=(1, 1),
-                    padding=(0, 0),
-                    bias=False,
+                    out_channels // 4, out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False
                 ),
                 nn.BatchNorm2d(out_channels),
             )
@@ -67,10 +61,19 @@ class ResidualBlock(nn.Module):
         if in_channels == out_channels:
             self.block2 = nn.Identity()
         else:
-
-
-            nn.
-
+            if avg_down is True and stride != (1, 1):
+                # ResNet-D: Apply average pooling before 1x1 conv for downsampling
+                self.block2 = nn.Sequential(
+                    nn.AvgPool2d(kernel_size=2, stride=stride, ceil_mode=True, count_include_pad=False),
+                    nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
+                    nn.BatchNorm2d(out_channels),
+                )
+            else:
+                # Standard ResNet: Use strided 1x1 conv
+                self.block2 = nn.Sequential(
+                    nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=stride, padding=(0, 0), bias=False),
+                    nn.BatchNorm2d(out_channels),
+                )

         self.relu = nn.ReLU(inplace=True)
         if squeeze_excitation is True:
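The new `avg_down` branch is the ResNet-D shortcut from the "Bag of Tricks" paper: pooling handles the downsampling so the 1x1 projection no longer discards three out of four input positions. A self-contained shape comparison of the two `block2` variants introduced above (channel sizes are arbitrary):

```python
import torch
from torch import nn

in_channels, out_channels, stride = 256, 512, (2, 2)

# ResNet-D shortcut: average pooling downsamples, then a stride-1 projection.
shortcut_d = nn.Sequential(
    nn.AvgPool2d(kernel_size=2, stride=stride, ceil_mode=True, count_include_pad=False),
    nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False),
    nn.BatchNorm2d(out_channels),
)

# Classic ResNet shortcut: a strided 1x1 convolution does both jobs at once.
shortcut_plain = nn.Sequential(
    nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1), stride=stride, padding=(0, 0), bias=False),
    nn.BatchNorm2d(out_channels),
)

x = torch.randn(1, in_channels, 56, 56)
print(shortcut_d(x).shape, shortcut_plain(x).shape)  # both torch.Size([1, 512, 28, 28])
```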
@@ -107,21 +110,30 @@ class ResNet_v1(DetectorBackbone):
         filter_list: list[int] = self.config["filter_list"]
         units: list[int] = self.config["units"]
         pooling_param: Optional[float] = self.config.get("pooling_param", None)
+        deep_stem: bool = self.config.get("deep_stem", False)
+        avg_down: bool = self.config.get("avg_down", False)

         assert len(units) + 1 == len(filter_list)
         num_unit = len(units)

-
-
-
-
-
-
-
-                bias=False,
-
-
-
+        if deep_stem is True:
+            # ResNet-D
+            self.stem = nn.Sequential(
+                Conv2dNormActivation(
+                    self.input_channels, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False
+                ),
+                Conv2dNormActivation(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
+                Conv2dNormActivation(32, filter_list[0], kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
+                nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+            )
+        else:
+            # Standard ResNet stem: 7x7 conv
+            self.stem = nn.Sequential(
+                Conv2dNormActivation(
+                    self.input_channels, filter_list[0], kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
+                ),
+                nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+            )

         # Generate body layers
         stages: OrderedDict[str, nn.Module] = OrderedDict()
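The other ResNet-D trick above is the deep stem: the single 7x7 stride-2 convolution is replaced by three 3x3 convolutions (the first stride-2), which keeps the same overall stride-4 output after the max pool. A shape check of the two stem variants, using plain `Conv2d + BatchNorm + ReLU` in place of birder's `Conv2dNormActivation` helper (illustrative only):

```python
import torch
from torch import nn

def conv_bn_relu(cin: int, cout: int, k: int, s: int, p: int) -> nn.Sequential:
    # Stand-in for Conv2dNormActivation: conv + batch norm + ReLU
    return nn.Sequential(
        nn.Conv2d(cin, cout, kernel_size=k, stride=s, padding=p, bias=False),
        nn.BatchNorm2d(cout),
        nn.ReLU(inplace=True),
    )

deep_stem = nn.Sequential(   # ResNet-D: three 3x3 convs
    conv_bn_relu(3, 32, 3, 2, 1),
    conv_bn_relu(32, 32, 3, 1, 1),
    conv_bn_relu(32, 64, 3, 1, 1),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
plain_stem = nn.Sequential(  # standard ResNet: single 7x7 conv
    conv_bn_relu(3, 64, 7, 2, 3),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

x = torch.randn(1, 3, 224, 224)
print(deep_stem(x).shape, plain_stem(x).shape)  # both torch.Size([1, 64, 56, 56])
```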
@@ -140,6 +152,7 @@ class ResNet_v1(DetectorBackbone):
                     stride=stride,
                     bottle_neck=bottle_neck,
                     squeeze_excitation=squeeze_excitation,
+                    avg_down=avg_down,
                 )
             )
         for _ in range(1, units[i]):
@@ -150,6 +163,7 @@ class ResNet_v1(DetectorBackbone):
                     stride=(1, 1),
                     bottle_neck=bottle_neck,
                     squeeze_excitation=squeeze_excitation,
+                    avg_down=avg_down,
                 )
             )

@@ -242,6 +256,52 @@ registry.register_model_config(
     config={"bottle_neck": True, "filter_list": [64, 256, 512, 1024, 2048], "units": [3, 30, 48, 8]},
 )

+# ResNet-D variants (From: Bag of Tricks for Image Classification with Convolutional Neural Networks)
+registry.register_model_config(
+    "resnet_d_50",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 6, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "resnet_d_101",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 23, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "resnet_d_152",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 8, 36, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "resnet_d_200",
+    ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 24, 36, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+
 registry.register_weights(
     "resnet_v1_50_arabian-peninsula",
     {
birder/net/ssl/data2vec.py
CHANGED

birder/net/ssl/data2vec2.py
CHANGED

@@ -7,7 +7,6 @@ https://arxiv.org/abs/2212.07525

 Changes from original:
 * Target CLS is taken just from the last layer
-* Replaced instance norm (1st of the IN -> AVG -> LM) with layer norm
 """

 # Reference license: MIT
@@ -140,7 +139,10 @@ class Data2Vec2(SSLBaseNet):
         y = y[..., -self.average_top_k_layers :]  # Take the last k layers
         y = y.permute(3, 0, 1, 2)

-
+        # Note: the backbone already LN-normalizes the final layer (per-token),
+        # but data2vec2 uses per-layer instance norm across tokens (per-channel)
+        # before averaging (IN -> AVG -> LN), so we keep IN for all K layers.
+        y = [F.instance_norm(t.float().transpose(1, 2)).transpose(1, 2) for t in y]
         y = sum(y) / len(y)
         y = F.layer_norm(y.float(), y.shape[-1:])
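The restored target pipeline is instance norm per layer, then averaging, then a final layer norm (IN -> AVG -> LN). A toy, self-contained version of that sequence with illustrative shapes, where the list stands in for the last K backbone layers:

```python
import torch
import torch.nn.functional as F

# Toy version of the IN -> AVG -> LN target construction above.
(K, B, N, D) = (4, 2, 16, 32)                      # layers, batch, tokens, dim
layers = [torch.randn(B, N, D) for _ in range(K)]  # last K transformer layers

# Instance norm operates per channel across the token axis, so each layer is
# transposed to (B, D, N) for the call and back afterwards.
normed = [F.instance_norm(t.float().transpose(1, 2)).transpose(1, 2) for t in layers]

y = sum(normed) / len(normed)      # average the K normalized layers
y = F.layer_norm(y, y.shape[-1:])  # final per-token layer norm
print(y.shape)                     # torch.Size([2, 16, 32])
```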
birder/results/gui.py
CHANGED

@@ -31,6 +31,7 @@ def show_detections(
     detection: dict[str, torch.Tensor],
     class_to_idx: dict[str, int],
     score_threshold: float = 0.5,
+    class_min_scores: Optional[dict[str, float]] = None,
     color_list: Optional[list[tuple[int, ...]]] = None,
     show: bool = True,
 ) -> tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]:
@@ -38,10 +39,22 @@ def show_detections(
     idx_to_class = dict(zip(class_to_idx.values(), class_to_idx.keys()))

     scores = detection["scores"]
-
+    labels_all = detection["labels"]
+
+    # Apply per-class minimum scores if provided, otherwise use global threshold
+    if class_min_scores is not None and len(class_min_scores) > 0:
+        mask = torch.zeros(len(scores), dtype=torch.bool)
+        for i, (score, label) in enumerate(zip(scores, labels_all)):
+            class_name = idx_to_class[label.item()]
+            min_score = class_min_scores.get(class_name, score_threshold)
+            mask[i] = score > min_score
+        idxs = torch.where(mask)[0]
+    else:
+        idxs = torch.where(scores > score_threshold)
+
     scores = scores[idxs]
     boxes = detection["boxes"][idxs]
-    labels =
+    labels = labels_all[idxs]
     label_names = [f"{idx_to_class[i.item()]}: {s:.4f}" for i, s in zip(labels, scores)]
     if color_list is not None:
         colors = [color_list[label] for label in labels]
birder/scripts/predict_detection.py
CHANGED

@@ -101,6 +101,17 @@ def predict(args: argparse.Namespace) -> None:

     score_threshold = args.min_score

+    # Process per-class minimum scores
+    class_min_scores: dict[str, float] = {}
+    if args.class_min_score is not None:
+        for class_name, score_str in args.class_min_score:
+            score = float(score_str)
+            if class_name not in class_to_idx:
+                logger.warning(f"Class '{class_name}' from --class-min-score not found in model classes")
+            else:
+                class_min_scores[class_name] = score
+                logger.info(f"Using minimum score {score} for class '{class_name}'")
+
     # Set label colors
     cmap = plt.get_cmap("jet")
     color_list = []
@@ -157,6 +168,7 @@ def predict(args: argparse.Namespace) -> None:
             detection,
             class_to_idx=class_to_idx,
             score_threshold=score_threshold,
+            class_min_scores=class_min_scores,
             color_list=color_list,
         )

@@ -224,7 +236,10 @@ def get_args_parser() -> argparse.ArgumentParser:
            "-e 0 --min-score 0.25 --gpu --show --shuffle data/detection_data/validation\n"
            "python predict_detection.py --network faster_rcnn -t coco --backbone csp_resnet_50 "
            "--backbone-tag imagenet1k -e 0 --batch-size 1 --gpu --gpu-id 1 "
-            "--coco-json-path data/detection_data/validation_annotations_coco.json data/detection_data"
+            "--coco-json-path data/detection_data/validation_annotations_coco.json data/detection_data\n"
+            "python predict_detection.py -n yolo_v4 --backbone csp_resnet_50 --backbone-tag imagenet1k -t coco "
+            " --min-score 0.4 --class-min-score person 0.75 --class-min-score car 0.3 --batch-size 1 --show "
+            "--shuffle ~/Datasets/cocodataset/val2017\n"
         ),
         formatter_class=cli.ArgumentHelpFormatter,
     )
@@ -284,6 +299,13 @@ def get_args_parser() -> argparse.ArgumentParser:
         "--fast-matmul", default=False, action="store_true", help="use fast matrix multiplication (affects precision)"
     )
     parser.add_argument("--min-score", type=float, default=0.5, help="prediction score threshold")
+    parser.add_argument(
+        "--class-min-score",
+        action="append",
+        nargs=2,
+        metavar=("CLASS", "SCORE"),
+        help="set custom minimum score for specific class (can be used multiple times)",
+    )
     parser.add_argument(
         "--size",
         type=int,
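The new flag uses `action="append"` with `nargs=2`, so every occurrence of `--class-min-score` contributes one `[CLASS, SCORE]` pair and the values arrive as strings. A minimal standalone sketch of how those pairs parse and convert to the per-class dict used in `predict()`:

```python
import argparse

# Standalone sketch of the --class-min-score parsing behaviour above.
parser = argparse.ArgumentParser()
parser.add_argument("--min-score", type=float, default=0.5)
parser.add_argument(
    "--class-min-score",
    action="append",
    nargs=2,
    metavar=("CLASS", "SCORE"),
    help="set custom minimum score for specific class (can be used multiple times)",
)

args = parser.parse_args(
    ["--min-score", "0.4", "--class-min-score", "person", "0.75", "--class-min-score", "car", "0.3"]
)
print(args.class_min_score)  # [['person', '0.75'], ['car', '0.3']]

# Scores arrive as strings, hence the float() conversion and the (0, 1.0)
# range check performed in predict() and validate_args().
class_min_scores = {name: float(score) for name, score in args.class_min_score}
print(class_min_scores)      # {'person': 0.75, 'car': 0.3}
```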
@@ -342,6 +364,16 @@ def validate_args(args: argparse.Namespace) -> None:
         )
     if args.min_score >= 1 or args.min_score <= 0.0:
         raise cli.ValidationError(f"--min-score must be in range of (0, 1.0), got {args.min_score}")
+    if args.class_min_score is not None:
+        for class_name, score_str in args.class_min_score:
+            try:
+                score = float(score_str)
+                if score >= 1.0 or score <= 0.0:
+                    raise cli.ValidationError(
+                        f"--class-min-score for '{class_name}' must be in range of (0, 1.0), got {score}"
+                    )
+            except ValueError as e:
+                raise cli.ValidationError(f"--class-min-score value must be a valid float, got '{score_str}'") from e
     if args.parallel is True and args.gpu is False:
         raise cli.ValidationError("--parallel requires --gpu to be set")
     if args.parallel is True and args.compile is True: