birder 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/__init__.py +2 -0
- birder/common/fs_ops.py +81 -1
- birder/common/training_cli.py +12 -2
- birder/common/training_utils.py +73 -12
- birder/data/collators/detection.py +3 -1
- birder/datahub/_lib.py +15 -6
- birder/datahub/evaluation.py +591 -0
- birder/eval/__init__.py +0 -0
- birder/eval/__main__.py +74 -0
- birder/eval/_embeddings.py +50 -0
- birder/eval/adversarial.py +315 -0
- birder/eval/benchmarks/__init__.py +0 -0
- birder/eval/benchmarks/awa2.py +357 -0
- birder/eval/benchmarks/bioscan5m.py +198 -0
- birder/eval/benchmarks/fishnet.py +318 -0
- birder/eval/benchmarks/flowers102.py +210 -0
- birder/eval/benchmarks/fungiclef.py +261 -0
- birder/eval/benchmarks/nabirds.py +202 -0
- birder/eval/benchmarks/newt.py +262 -0
- birder/eval/benchmarks/plankton.py +255 -0
- birder/eval/benchmarks/plantdoc.py +259 -0
- birder/eval/benchmarks/plantnet.py +252 -0
- birder/eval/classification.py +235 -0
- birder/eval/methods/__init__.py +0 -0
- birder/eval/methods/ami.py +78 -0
- birder/eval/methods/knn.py +71 -0
- birder/eval/methods/linear.py +152 -0
- birder/eval/methods/mlp.py +178 -0
- birder/eval/methods/simpleshot.py +100 -0
- birder/eval/methods/svm.py +92 -0
- birder/inference/classification.py +23 -2
- birder/inference/detection.py +35 -15
- birder/net/_vit_configs.py +5 -0
- birder/net/cait.py +3 -3
- birder/net/coat.py +3 -3
- birder/net/cswin_transformer.py +2 -1
- birder/net/deit.py +1 -1
- birder/net/deit3.py +1 -1
- birder/net/detection/__init__.py +2 -0
- birder/net/detection/base.py +41 -18
- birder/net/detection/deformable_detr.py +74 -50
- birder/net/detection/detr.py +29 -26
- birder/net/detection/efficientdet.py +42 -25
- birder/net/detection/faster_rcnn.py +53 -21
- birder/net/detection/fcos.py +42 -23
- birder/net/detection/lw_detr.py +1204 -0
- birder/net/detection/plain_detr.py +60 -47
- birder/net/detection/retinanet.py +47 -35
- birder/net/detection/rt_detr_v1.py +49 -46
- birder/net/detection/rt_detr_v2.py +95 -102
- birder/net/detection/ssd.py +47 -31
- birder/net/detection/ssdlite.py +2 -2
- birder/net/detection/yolo_v2.py +33 -18
- birder/net/detection/yolo_v3.py +35 -33
- birder/net/detection/yolo_v4.py +35 -20
- birder/net/detection/yolo_v4_tiny.py +1 -2
- birder/net/edgevit.py +3 -3
- birder/net/efficientvit_msft.py +1 -1
- birder/net/flexivit.py +1 -1
- birder/net/hiera.py +44 -67
- birder/net/hieradet.py +2 -2
- birder/net/maxvit.py +2 -2
- birder/net/mim/fcmae.py +2 -2
- birder/net/mim/mae_hiera.py +9 -16
- birder/net/mnasnet.py +2 -2
- birder/net/nextvit.py +4 -4
- birder/net/resnext.py +2 -2
- birder/net/rope_deit3.py +2 -2
- birder/net/rope_flexivit.py +2 -2
- birder/net/rope_vit.py +2 -2
- birder/net/simple_vit.py +1 -1
- birder/net/squeezenet.py +1 -1
- birder/net/ssl/capi.py +32 -25
- birder/net/ssl/dino_v2.py +12 -15
- birder/net/ssl/franca.py +26 -19
- birder/net/van.py +2 -2
- birder/net/vit.py +21 -3
- birder/net/vit_parallel.py +1 -1
- birder/net/vit_sam.py +62 -16
- birder/net/xcit.py +1 -1
- birder/ops/msda.py +46 -16
- birder/scripts/benchmark.py +35 -8
- birder/scripts/predict.py +14 -1
- birder/scripts/predict_detection.py +7 -1
- birder/scripts/train.py +27 -11
- birder/scripts/train_capi.py +13 -10
- birder/scripts/train_detection.py +18 -7
- birder/scripts/train_franca.py +10 -2
- birder/scripts/train_kd.py +28 -11
- birder/tools/adversarial.py +5 -0
- birder/tools/convert_model.py +101 -43
- birder/tools/quantize_model.py +33 -16
- birder/version.py +1 -1
- {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/METADATA +17 -10
- {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/RECORD +99 -75
- {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/WHEEL +1 -1
- birder/scripts/evaluate.py +0 -176
- {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/entry_points.txt +0 -0
- {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/top_level.txt +0 -0
@@ -147,7 +147,7 @@ class MultiScaleDeformableAttention(nn.Module):
             param.requires_grad_(False)

     def reset_parameters(self) -> None:
-        nn.init.
+        nn.init.zeros_(self.sampling_offsets.weight)
         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
         grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)[0]
@@ -158,25 +158,27 @@ class MultiScaleDeformableAttention(nn.Module):
         with torch.no_grad():
             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))

-        nn.init.
-        nn.init.
+        nn.init.zeros_(self.attention_weights.weight)
+        nn.init.zeros_(self.attention_weights.bias)
         nn.init.xavier_uniform_(self.value_proj.weight)
-        nn.init.
+        nn.init.zeros_(self.value_proj.bias)
         nn.init.xavier_uniform_(self.output_proj.weight)
-        nn.init.
+        nn.init.zeros_(self.output_proj.bias)

+    # pylint: disable=too-many-locals
     def forward(
         self,
         query: torch.Tensor,
         reference_points: torch.Tensor,
         input_flatten: torch.Tensor,
         input_spatial_shapes: torch.Tensor,
+        src_shapes: list[list[int]],
         input_level_start_index: torch.Tensor,
         input_padding_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-
+        num_queries = query.size(1)
         N, sequence_length, _ = input_flatten.size()
-        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == sequence_length
+        # assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == sequence_length

         value = self.value_proj(input_flatten)
         if input_padding_mask is not None:
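Note: the truncated removed lines above were `nn.init.` calls that are now written as explicit `nn.init.zeros_` initialization next to the existing Xavier initialization. A minimal sketch of the resulting pattern, with illustrative layer sizes rather than the module's actual dimensions:

```python
import torch
from torch import nn

# Illustrative projections (sizes are made up, not birder's real configuration)
value_proj = nn.Linear(256, 256)
output_proj = nn.Linear(256, 256)

# Weights get Xavier init, biases are zeroed explicitly
nn.init.xavier_uniform_(value_proj.weight)
nn.init.zeros_(value_proj.bias)
nn.init.xavier_uniform_(output_proj.weight)
nn.init.zeros_(output_proj.bias)
```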
@@ -231,7 +233,7 @@ class MultiScaleDeformableAttention(nn.Module):

         if self.method == "discrete":
             output = self._forward_fallback(
-                value, input_spatial_shapes, sampling_locations, attention_weights, method="discrete"
+                value, input_spatial_shapes, src_shapes, sampling_locations, attention_weights, method="discrete"
             )
         else:
             if self.uniform_points is True:

@@ -245,10 +247,11 @@ class MultiScaleDeformableAttention(nn.Module):
                     sampling_locations,
                     attention_weights,
                     self.im2col_step,
+                    src_shapes,
                 )
             else:
                 output = self._forward_fallback(
-                    value, input_spatial_shapes, sampling_locations, attention_weights, method="default"
+                    value, input_spatial_shapes, src_shapes, sampling_locations, attention_weights, method="default"
                 )

         output = self.output_proj(output)

@@ -258,6 +261,7 @@ class MultiScaleDeformableAttention(nn.Module):
         self,
         value: torch.Tensor,
         spatial_shapes: torch.Tensor,
+        src_shapes: list[list[int]],
         sampling_locations: torch.Tensor,
         attention_weights: torch.Tensor,
         method: str = "default",

@@ -272,8 +276,7 @@ class MultiScaleDeformableAttention(nn.Module):
         sampling_locations_list = sampling_grids.split(self.num_points, dim=-2)

         sampling_value_list = []
-
-        for level, (H, W) in enumerate(spatial_shapes_list):
+        for level, (H, W) in enumerate(src_shapes):
             value_l = value_list[level].reshape(B * n_heads, head_dim, H, W)
             sampling_grid_l = sampling_locations_list[level]
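The fallback path now iterates over `src_shapes`, a list of plain Python `[H, W]` pairs, instead of values read out of the `spatial_shapes` tensor, so per-level reshaping does not depend on tensor data. A rough, self-contained sketch of that split/reshape/sample pattern with made-up sizes (not the network's real layout):

```python
import torch
import torch.nn.functional as F

# Two feature levels; src_shapes holds plain Python ints
src_shapes = [[8, 10], [4, 5]]
value = torch.randn(2, 8, sum(h * w for h, w in src_shapes))  # (batch*heads, head_dim, sum(H*W))
value_list = value.split([h * w for h, w in src_shapes], dim=-1)

sampled = []
for level, (h, w) in enumerate(src_shapes):
    value_l = value_list[level].reshape(2, 8, h, w)
    # Normalized sampling grid in [-1, 1]: (batch*heads, num_queries, num_points, 2)
    grid_l = torch.rand(2, 3, 4, 2) * 2 - 1
    sampled.append(F.grid_sample(value_l, grid_l, mode="bilinear", align_corners=False))

print([s.shape for s in sampled])  # each (2, 8, 3, 4)
```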
@@ -361,21 +364,21 @@ class TransformerDecoderLayer(nn.Module):
         reference_points: torch.Tensor,
         src: torch.Tensor,
         src_spatial_shapes: torch.Tensor,
+        src_shapes: list[list[int]],
         level_start_index: torch.Tensor,
         src_padding_mask: Optional[torch.Tensor],
         self_attn_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         # Self attention
-
-        k = tgt + query_pos
+        q_k = tgt + query_pos

-        tgt2 = self.self_attn(
+        tgt2 = self.self_attn(q_k, q_k, tgt, attn_mask=self_attn_mask)
         tgt = tgt + self.dropout(tgt2)
         tgt = self.norm1(tgt)

         # Cross attention
         tgt2 = self.cross_attn(
-            tgt + query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask
+            tgt + query_pos, reference_points, src, src_spatial_shapes, src_shapes, level_start_index, src_padding_mask
         )
         tgt = tgt + self.dropout(tgt2)
         tgt = self.norm2(tgt)
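The self-attention call collapses to a single line in which the positional embedding is added to the query and key but not to the value, the usual DETR-style convention. A sketch of the idea using `nn.MultiheadAttention`; the actual attention module and dimensions in birder may differ:

```python
import torch
from torch import nn

# Illustrative sizes only
self_attn = nn.MultiheadAttention(embed_dim=256, num_heads=8, batch_first=True)

tgt = torch.randn(2, 300, 256)        # decoder queries
query_pos = torch.randn(2, 300, 256)  # positional embedding for the queries

q_k = tgt + query_pos                 # query and key carry position information
tgt2, _ = self_attn(q_k, q_k, tgt)    # the value stays position-free
tgt = tgt + tgt2                      # residual connection
```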
@@ -526,18 +529,18 @@ class RT_DETRDecoder(nn.Module):

         # Gather reference points
         reference_points_unact = enc_outputs_coord_unact.gather(
-            dim=1, index=topk_ind.unsqueeze(-1).
+            dim=1, index=topk_ind.unsqueeze(-1).expand(-1, -1, enc_outputs_coord_unact.shape[-1])
         )

         enc_topk_bboxes = reference_points_unact.sigmoid()

         # Gather encoder logits for loss computation
         enc_topk_logits = enc_outputs_class.gather(
-            dim=1, index=topk_ind.unsqueeze(-1).
+            dim=1, index=topk_ind.unsqueeze(-1).expand(-1, -1, enc_outputs_class.shape[-1])
         )

         # Extract region features
-        target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).
+        target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).expand(-1, -1, output_memory.shape[-1]))
         target = target.detach()

         return (target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits)
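The truncated `gather` lines are completed with an explicit `expand` of the top-k index to the channel dimension, since `gather` requires an index tensor with the same number of dimensions as the source. A small standalone illustration with arbitrary sizes:

```python
import torch

output_memory = torch.randn(2, 100, 256)              # (batch, queries, channels)
scores = torch.randn(2, 100)
topk_ind = scores.topk(10, dim=1).indices             # (2, 10)

# Expand the index across channels, then gather the selected query embeddings
index = topk_ind.unsqueeze(-1).expand(-1, -1, output_memory.shape[-1])  # (2, 10, 256)
target = output_memory.gather(dim=1, index=index)     # (2, 10, 256)
```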
@@ -551,6 +554,7 @@ class RT_DETRDecoder(nn.Module):
         denoising_bbox_unact: Optional[torch.Tensor] = None,
         attn_mask: Optional[torch.Tensor] = None,
         padding_mask: Optional[list[torch.Tensor]] = None,
+        return_intermediates: bool = True,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         memory = []
         mask_flatten = []

@@ -578,18 +582,19 @@
         level_start_index_tensor = torch.tensor(level_start_index, dtype=torch.long, device=memory.device)

         # Decoder forward
-
-
+        bboxes_list: list[torch.Tensor] = []
+        logits_list: list[torch.Tensor] = []
         reference_points = init_ref_points_unact.sigmoid()
         for decoder_layer, bbox_head, class_head in zip(self.layers, self.bbox_embed, self.class_embed):
             query_pos = self.query_pos_head(reference_points)
-            reference_points_input = reference_points.unsqueeze(2).
+            reference_points_input = reference_points.unsqueeze(2).expand(-1, -1, len(spatial_shapes), -1)
             target = decoder_layer(
                 target,
                 query_pos,
                 reference_points_input,
                 memory,
                 spatial_shapes_tensor,
+                spatial_shapes,
                 level_start_index_tensor,
                 memory_padding_mask,
                 attn_mask,

@@ -602,14 +607,19 @@
             # Classification
             class_logits = class_head(target)

-
-
+            if return_intermediates is True:
+                bboxes_list.append(new_reference_points)
+                logits_list.append(class_logits)

             # Update reference points for next layer
             reference_points = new_reference_points.detach()

-
-
+        if return_intermediates is True:
+            out_bboxes = torch.stack(bboxes_list)
+            out_logits = torch.stack(logits_list)
+        else:
+            out_bboxes = new_reference_points
+            out_logits = class_logits

         return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits)

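With the new `return_intermediates` flag the decoder either stacks every layer's predictions (needed for the auxiliary losses during training) or returns only the last layer's tensors, which is enough for inference. Illustrative shapes only:

```python
import torch

layers, batch, queries = 6, 2, 300
bboxes_list = [torch.rand(batch, queries, 4) for _ in range(layers)]

out_bboxes = torch.stack(bboxes_list)  # (layers, batch, queries, 4) -> per-layer aux losses
final_only = bboxes_list[-1]           # (batch, queries, 4) -> inference path
```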
@@ -675,7 +685,7 @@ class RT_DETR_v2(DetectionBaseNet):
         self.decoder = RT_DETRDecoder(
             hidden_dim=hidden_dim,
             num_classes=self.num_classes,
-            num_queries=num_queries,
+            num_queries=self.num_queries,
             num_decoder_layers=num_decoder_layers,
             num_levels=self.num_levels,
             num_heads=num_heads,

@@ -744,20 +754,32 @@ class RT_DETR_v2(DetectionBaseNet):
         for param in self.denoising_class_embed.parameters():
             param.requires_grad_(True)

-
+    @staticmethod
+    def _get_src_permutation_idx(indices: list[tuple[torch.Tensor, torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]:
         batch_idx = torch.concat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
         src_idx = torch.concat([src for (src, _) in indices])
         return (batch_idx, src_idx)

-    def
+    def _compute_layer_losses(
         self,
         cls_logits: torch.Tensor,
         box_output: torch.Tensor,
         targets: list[dict[str, torch.Tensor]],
-        indices: list[torch.Tensor],
+        indices: list[tuple[torch.Tensor, torch.Tensor]],
         num_boxes: float,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         idx = self._get_src_permutation_idx(indices)
+
+        src_boxes = box_output[idx]
+        target_boxes = torch.concat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        src_boxes_xyxy = box_ops.box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy")
+        target_boxes_xyxy = box_ops.box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy")
+
+        # IoU for varifocal loss (class loss)
+        ious = torch.diag(box_ops.box_iou(src_boxes_xyxy, target_boxes_xyxy)).detach()
+
+        # Classification loss
         target_classes_o = torch.concat([t["labels"][J] for t, (_, J) in zip(targets, indices)], dim=0)
         target_classes = torch.full(cls_logits.shape[:2], self.num_classes, dtype=torch.int64, device=cls_logits.device)
         target_classes[idx] = target_classes_o
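The new `_compute_layer_losses` folds the former `_box_loss` into a single method: matched predictions and ground truths are converted from cxcywh to xyxy once, and the matched-pair IoU feeds the varifocal classification targets. A self-contained sketch of that IoU term with random boxes (assuming `box_ops` is torchvision's box utilities, as the surrounding code suggests; not birder's matcher output):

```python
import torch
from torchvision.ops import boxes as box_ops

# Matched predictions and targets in normalized cxcywh (purely illustrative)
src_boxes = torch.rand(5, 4) * 0.5 + 0.25
target_boxes = torch.rand(5, 4) * 0.5 + 0.25

src_xyxy = box_ops.box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy")
tgt_xyxy = box_ops.box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy")

# Only the diagonal of the pairwise IoU matrix (i.e. matched pairs) is needed
ious = torch.diag(box_ops.box_iou(src_xyxy, tgt_xyxy)).detach()  # shape (5,)
```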
@@ -771,15 +793,6 @@ class RT_DETR_v2(DetectionBaseNet):
         target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
         target_classes_onehot = target_classes_onehot[:, :, :-1]

-        src_boxes = box_output[idx]
-        target_boxes = torch.concat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
-        ious = torch.diag(
-            box_ops.box_iou(
-                box_ops.box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
-                box_ops.box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
-            )
-        ).detach()
-
         target_score_o = torch.zeros(cls_logits.shape[:2], dtype=cls_logits.dtype, device=cls_logits.device)
         target_score_o[idx] = ious.to(cls_logits.dtype)
         target_score = target_score_o.unsqueeze(-1) * target_classes_onehot

@@ -787,31 +800,13 @@ class RT_DETR_v2(DetectionBaseNet):
         loss = varifocal_loss(cls_logits, target_score, target_classes_onehot, alpha=0.75, gamma=2.0)
         loss_ce = (loss.mean(1).sum() / num_boxes) * cls_logits.shape[1]

-
-
-    def _box_loss(
-        self,
-        box_output: torch.Tensor,
-        targets: list[dict[str, torch.Tensor]],
-        indices: list[torch.Tensor],
-        num_boxes: float,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        idx = self._get_src_permutation_idx(indices)
-        src_boxes = box_output[idx]
-        target_boxes = torch.concat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
-
-        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
-        loss_bbox = loss_bbox.sum() / num_boxes
+        # Box L1 loss
+        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none").sum() / num_boxes

-
-
-                box_ops.box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
-                box_ops.box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
-            )
-        )
-        loss_giou = loss_giou.sum() / num_boxes
+        # GIoU loss
+        loss_giou = (1 - torch.diag(box_ops.generalized_box_iou(src_boxes_xyxy, target_boxes_xyxy))).sum() / num_boxes

-        return (loss_bbox, loss_giou)
+        return (loss_ce, loss_bbox, loss_giou)

     def _compute_denoising_loss(
         self,
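The box losses now reuse the already-converted boxes: an L1 term and a GIoU term, both normalized by the number of matched boxes. A toy example with hand-picked boxes (matching and loss weights omitted; `box_ops` again assumed to be torchvision's box utilities):

```python
import torch
import torch.nn.functional as F
from torchvision.ops import boxes as box_ops

num_boxes = 2.0
src_xyxy = torch.tensor([[0.1, 0.1, 0.4, 0.4], [0.5, 0.5, 0.9, 0.9]])
tgt_xyxy = torch.tensor([[0.1, 0.2, 0.4, 0.5], [0.4, 0.5, 0.8, 0.9]])

# L1 regression term over matched pairs
loss_bbox = F.l1_loss(src_xyxy, tgt_xyxy, reduction="none").sum() / num_boxes

# GIoU term: 1 - diag of the pairwise generalized IoU matrix
loss_giou = (1 - torch.diag(box_ops.generalized_box_iou(src_xyxy, tgt_xyxy))).sum() / num_boxes
```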
@@ -846,11 +841,9 @@ class RT_DETR_v2(DetectionBaseNet):
                 )
             )

-            loss_ce = self.
+            loss_ce, loss_bbox, loss_giou = self._compute_layer_losses(
                 dn_out_logits[layer_idx], dn_out_bboxes[layer_idx], targets, indices, dn_num_boxes
             )
-            loss_bbox, loss_giou = self._box_loss(dn_out_bboxes[layer_idx], targets, indices, dn_num_boxes)
-
             loss_ce_list.append(loss_ce)
             loss_bbox_list.append(loss_bbox)
             loss_giou_list.append(loss_giou)

@@ -861,9 +854,7 @@ class RT_DETR_v2(DetectionBaseNet):

         return (loss_ce_dn, loss_bbox_dn, loss_giou_dn)

-
-    @torch.compiler.disable()  # type: ignore[untyped-decorator]
-    def _compute_loss_from_outputs(  # pylint: disable=too-many-locals
+    def _compute_loss_from_outputs(
         self,
         targets: list[dict[str, torch.Tensor]],
         out_bboxes: torch.Tensor,

@@ -880,7 +871,7 @@ class RT_DETR_v2(DetectionBaseNet):
         if training_utils.is_dist_available_and_initialized() is True:
             torch.distributed.all_reduce(num_boxes)

-        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)
+        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)

         loss_ce_list = []
         loss_bbox_list = []

@@ -889,19 +880,21 @@ class RT_DETR_v2(DetectionBaseNet):
         # Decoder losses (all layers)
         for layer_idx in range(out_logits.shape[0]):
             indices = self.matcher(out_logits[layer_idx], out_bboxes[layer_idx], targets)
-            loss_ce = self.
-
+            loss_ce, loss_bbox, loss_giou = self._compute_layer_losses(
+                out_logits[layer_idx], out_bboxes[layer_idx], targets, indices, num_boxes
+            )
             loss_ce_list.append(loss_ce)
             loss_bbox_list.append(loss_bbox)
             loss_giou_list.append(loss_giou)

         # Encoder auxiliary loss
         enc_indices = self.matcher(enc_topk_logits, enc_topk_bboxes, targets)
-
-
-
-
-
+        loss_ce, loss_bbox, loss_giou = self._compute_layer_losses(
+            enc_topk_logits, enc_topk_bboxes, targets, enc_indices, num_boxes
+        )
+        loss_ce_list.append(loss_ce)
+        loss_bbox_list.append(loss_bbox)
+        loss_giou_list.append(loss_giou)

         loss_ce = torch.stack(loss_ce_list).sum()  # VFL weight is 1
         loss_bbox = torch.stack(loss_bbox_list).sum() * 5

@@ -935,11 +928,11 @@ class RT_DETR_v2(DetectionBaseNet):
         images: Any,
         masks: Optional[list[torch.Tensor]] = None,
     ) -> dict[str, torch.Tensor]:
-        device = encoder_features[0].device
         for idx, target in enumerate(targets):
             boxes = target["boxes"]
             boxes = box_ops.box_convert(boxes, in_fmt="xyxy", out_fmt="cxcywh")
-
+            scale = images.image_sizes[idx].flip(0).repeat(2).float()  # flip to [W, H], repeat to [W, H, W, H]
+            boxes = boxes / scale
             targets[idx]["boxes"] = boxes
             targets[idx]["labels"] = target["labels"] - 1  # No background

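Target boxes are now normalized with the per-image size taken from `images.image_sizes`, which stores `(H, W)`: flipping gives `(W, H)` and repeating gives the `(W, H, W, H)` divisor for cxcywh boxes. For example:

```python
import torch

image_size = torch.tensor([480, 640])          # (H, W)
scale = image_size.flip(0).repeat(2).float()   # tensor([640., 480., 640., 480.])

boxes_cxcywh = torch.tensor([[320.0, 240.0, 100.0, 50.0]])
normalized = boxes_cxcywh / scale              # every coordinate now lies in [0, 1]
```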
@@ -972,7 +965,7 @@ class RT_DETR_v2(DetectionBaseNet):
         return losses

     def postprocess_detections(
-        self, class_logits: torch.Tensor, box_regression: torch.Tensor,
+        self, class_logits: torch.Tensor, box_regression: torch.Tensor, image_sizes: torch.Tensor
     ) -> list[dict[str, torch.Tensor]]:
         prob = class_logits.sigmoid()
         topk_values, topk_indexes = torch.topk(prob.view(class_logits.shape[0], -1), k=self.decoder.num_queries, dim=1)

@@ -981,14 +974,12 @@ class RT_DETR_v2(DetectionBaseNet):
         labels = topk_indexes % class_logits.shape[2]
         labels += 1  # Background offset

-        target_sizes = torch.tensor(image_shapes, device=class_logits.device)
-
         # Convert to [x0, y0, x1, y1] format
         boxes = box_ops.box_convert(box_regression, in_fmt="cxcywh", out_fmt="xyxy")
-        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).
+        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).expand(-1, -1, 4))

         # Convert from relative [0, 1] to absolute [0, height] coordinates
-        img_h, img_w =
+        img_h, img_w = image_sizes.unbind(1)
         scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
         boxes = boxes * scale_fct[:, None, :]

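`postprocess_detections` now receives the image sizes as a tensor and scales the normalized predictions back to pixels. A standalone sketch with made-up predictions, assuming rows of `image_sizes` are `(height, width)`:

```python
import torch
from torchvision.ops import boxes as box_ops

box_regression = torch.rand(2, 300, 4)                  # normalized cxcywh predictions
image_sizes = torch.tensor([[480, 640], [512, 512]])    # (H, W) per image

boxes = box_ops.box_convert(box_regression, in_fmt="cxcywh", out_fmt="xyxy")
img_h, img_w = image_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)  # (2, 4)
boxes = boxes * scale_fct[:, None, :]                          # broadcast over queries
```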
@@ -1024,32 +1015,34 @@ class RT_DETR_v2(DetectionBaseNet):

         return (None, None, None, None)

+    def forward_net(
+        self, x: torch.Tensor, masks: Optional[torch.Tensor]
+    ) -> tuple[list[torch.Tensor], Optional[list[torch.Tensor]]]:
+        features: dict[str, torch.Tensor] = self.backbone.detection_features(x)
+        feature_list = list(features.values())
+
+        mask_list: Optional[list[torch.Tensor]] = None
+        if masks is not None:
+            mask_list = []
+            for feat in feature_list:
+                m = F.interpolate(masks[None].float(), size=feat.shape[-2:], mode="nearest").to(torch.bool)[0]
+                mask_list.append(m)
+
+        encoder_features = self.encoder(feature_list, masks=mask_list)
+
+        return (encoder_features, mask_list)
+
     def forward(
         self,
         x: torch.Tensor,
         targets: Optional[list[dict[str, torch.Tensor]]] = None,
         masks: Optional[torch.Tensor] = None,
-        image_sizes: Optional[list[
+        image_sizes: Optional[list[tuple[int, int]]] = None,
     ) -> tuple[list[dict[str, torch.Tensor]], dict[str, torch.Tensor]]:
         self._input_check(targets)
         images = self._to_img_list(x, image_sizes)

-
-        features: dict[str, torch.Tensor] = self.backbone.detection_features(x)
-        feature_list = list(features.values())
-
-        # Hybrid encoder
-        mask_list: list[torch.Tensor] = []
-        for feat in feature_list:
-            if masks is not None:
-                mask_size = feat.shape[-2:]
-                m = F.interpolate(masks[None].float(), size=mask_size, mode="nearest").to(torch.bool)[0]
-            else:
-                B, _, H, W = feat.size()
-                m = torch.zeros(B, H, W, dtype=torch.bool, device=x.device)
-            mask_list.append(m)
-
-        encoder_features = self.encoder(feature_list, masks=mask_list)
+        encoder_features, mask_list = self.forward_net(x, masks)

         # Prepare spatial shapes and level start index
         spatial_shapes: list[list[int]] = []
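The backbone/encoder call moves into `forward_net`, and masks are only built when provided: each level's mask comes from nearest-neighbor resizing of the boolean padding mask to that feature map's spatial size. Minimal illustration with arbitrary shapes:

```python
import torch
import torch.nn.functional as F

masks = torch.zeros(2, 512, 512, dtype=torch.bool)
masks[:, :, 400:] = True                  # padded region on the right
feat = torch.randn(2, 256, 32, 32)        # one feature level

m = F.interpolate(masks[None].float(), size=feat.shape[-2:], mode="nearest").to(torch.bool)[0]
print(m.shape)  # torch.Size([2, 32, 32])
```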
@@ -1070,9 +1063,9 @@ class RT_DETR_v2(DetectionBaseNet):
         else:
             # Inference path - no CDN
             out_bboxes, out_logits, _, _ = self.decoder(
-                encoder_features, spatial_shapes, level_start_index, padding_mask=mask_list
+                encoder_features, spatial_shapes, level_start_index, padding_mask=mask_list, return_intermediates=False
             )
-            detections = self.postprocess_detections(out_logits
+            detections = self.postprocess_detections(out_logits, out_bboxes, images.image_sizes)

         return (detections, losses)

birder/net/detection/ssd.py CHANGED

@@ -30,6 +30,7 @@ from birder.net.detection.base import BoxCoder
 from birder.net.detection.base import DetectionBaseNet
 from birder.net.detection.base import ImageList
 from birder.net.detection.base import Matcher
+from birder.net.detection.base import clip_boxes_to_image


 class SSDMatcher(Matcher):

@@ -303,6 +304,12 @@ class SSD(DetectionBaseNet):
         topk_candidates = 400
         positive_fraction = 0.25

+        self.score_thresh = score_thresh
+        self.nms_thresh = nms_thresh
+        self.detections_per_img = detections_per_img
+        self.topk_candidates = topk_candidates
+        self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction
+
         self.backbone.return_channels = self.backbone.return_channels[-2:]
         self.backbone.return_stages = self.backbone.return_stages[-2:]
         self.extra_blocks = nn.ModuleList(

@@ -325,11 +332,8 @@ class SSD(DetectionBaseNet):
         self.head = SSDHead(self.backbone.return_channels + [512, 256, 256, 256], num_anchors, self.num_classes)
         self.proposal_matcher = SSDMatcher(iou_thresh)

-        self.
-
-        self.detections_per_img = detections_per_img
-        self.topk_candidates = topk_candidates
-        self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction
+        if self.export_mode is False:
+            self.forward = torch.compiler.disable(recursive=False)(self.forward)  # type: ignore[method-assign]

     def reset_classifier(self, num_classes: int) -> None:
         self.num_classes = num_classes + 1
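Instead of keeping the attribute assignments at this point in `__init__`, SSD now wraps its own `forward` with `torch.compiler.disable(recursive=False)` outside export mode, so only that frame is excluded from compilation while code it calls can still be compiled. A sketch of the pattern on a toy module (assuming PyTorch 2.1+, where `torch.compiler.disable` accepts `recursive`):

```python
import torch
from torch import nn

class Detector(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(8, 8)
        # Skip only the forward frame itself; recursive=False leaves callees compilable
        self.forward = torch.compiler.disable(recursive=False)(self.forward)  # type: ignore[method-assign]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)

model = Detector()
out = model(torch.randn(2, 8))
```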
@@ -348,6 +352,8 @@ class SSD(DetectionBaseNet):
             param.requires_grad_(True)

     # pylint: disable=too-many-locals
+    @torch.jit.unused  # type: ignore[untyped-decorator]
+    @torch.compiler.disable()  # type: ignore[untyped-decorator]
     def compute_loss(
         self,
         targets: list[dict[str, torch.Tensor]],

@@ -423,7 +429,7 @@ class SSD(DetectionBaseNet):
         self,
         head_outputs: dict[str, torch.Tensor],
         image_anchors: list[torch.Tensor],
-
+        image_sizes: torch.Tensor,
     ) -> list[dict[str, torch.Tensor]]:
         bbox_regression = head_outputs["bbox_regression"]
         pred_scores = F.softmax(head_outputs["cls_logits"], dim=-1)

@@ -431,11 +437,10 @@ class SSD(DetectionBaseNet):
         num_classes = pred_scores.size(-1)
         device = pred_scores.device
         detections: list[dict[str, torch.Tensor]] = []
-        for boxes, scores, anchors, image_shape in zip(bbox_regression, pred_scores, image_anchors,
+        for boxes, scores, anchors, image_shape in zip(bbox_regression, pred_scores, image_anchors, image_sizes):
             boxes = self.box_coder.decode_single(boxes, anchors)
-            boxes =
+            boxes = clip_boxes_to_image(boxes, image_shape)

-            list_empty = True
             image_boxes_list = []
             image_scores_list = []
             image_labels_list = []

@@ -447,51 +452,62 @@ class SSD(DetectionBaseNet):
                 box = boxes[keep_idxs]

                 # Keep only topk scoring predictions
-                num_topk = min(self.topk_candidates,
+                num_topk = min(self.topk_candidates, score.size(0))
                 score, idxs = score.topk(num_topk)
                 box = box[idxs]
-                if len(box) == 0 and list_empty is False:
-                    continue

                 image_boxes_list.append(box)
                 image_scores_list.append(score)
                 image_labels_list.append(torch.full_like(score, fill_value=label, dtype=torch.int64, device=device))
-                list_empty = False

             image_boxes = torch.concat(image_boxes_list, dim=0)
             image_scores = torch.concat(image_scores_list, dim=0)
             image_labels = torch.concat(image_labels_list, dim=0)

-
-
-
-
-
-
-
-
-
-
-
+            if self.export_mode is False:
+                # Non-maximum suppression
+                keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
+                keep = keep[: self.detections_per_img]
+
+                detections.append(
+                    {
+                        "boxes": image_boxes[keep],
+                        "scores": image_scores[keep],
+                        "labels": image_labels[keep],
+                    }
+                )
+            else:
+                detections.append(
+                    {
+                        "boxes": image_boxes,
+                        "scores": image_scores,
+                        "labels": image_labels,
+                    }
+                )

         return detections

+    def forward_net(self, x: torch.Tensor) -> tuple[list[torch.Tensor], dict[str, torch.Tensor]]:
+        features = self.backbone.detection_features(x)
+        feature_list = list(features.values())
+        for extra_block in self.extra_blocks:
+            feature_list.append(extra_block(feature_list[-1]))
+
+        head_outputs = self.head(feature_list)
+
+        return (feature_list, head_outputs)
+
     def forward(
         self,
         x: torch.Tensor,
         targets: Optional[list[dict[str, torch.Tensor]]] = None,
         masks: Optional[torch.Tensor] = None,
-        image_sizes: Optional[list[
+        image_sizes: Optional[list[tuple[int, int]]] = None,
     ) -> tuple[list[dict[str, torch.Tensor]], dict[str, torch.Tensor]]:
         self._input_check(targets)
         images = self._to_img_list(x, image_sizes)

-
-        feature_list = list(features.values())
-        for extra_block in self.extra_blocks:
-            feature_list.append(extra_block(feature_list[-1]))
-
-        head_outputs = self.head(feature_list)
+        feature_list, head_outputs = self.forward_net(x)
         anchors = self.anchor_generator(images, feature_list)

         losses = {}
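The per-image post-processing now always builds the full candidate lists and only applies class-aware NMS plus the detections-per-image cap when not in export mode. A standalone sketch of that final step with illustrative boxes and thresholds (again assuming torchvision's box utilities):

```python
import torch
from torchvision.ops import boxes as box_ops

image_boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [1.0, 1.0, 11.0, 11.0], [20.0, 20.0, 30.0, 30.0]])
image_scores = torch.tensor([0.9, 0.8, 0.7])
image_labels = torch.tensor([1, 1, 2])

# Class-aware NMS, then cap the number of detections per image
keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, iou_threshold=0.45)
keep = keep[:100]
result = {"boxes": image_boxes[keep], "scores": image_scores[keep], "labels": image_labels[keep]}
```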
birder/net/detection/ssdlite.py CHANGED

@@ -50,7 +50,7 @@ class SSDLiteClassificationHead(SSDScoringHead):
             if isinstance(layer, nn.Conv2d):
                 nn.init.xavier_uniform_(layer.weight)
                 if layer.bias is not None:
-                    nn.init.
+                    nn.init.zeros_(layer.bias)

         super().__init__(cls_logits, num_classes)

@@ -79,7 +79,7 @@ class SSDLiteRegressionHead(SSDScoringHead):
             if isinstance(layer, nn.Conv2d):
                 nn.init.xavier_uniform_(layer.weight)
                 if layer.bias is not None:
-                    nn.init.
+                    nn.init.zeros_(layer.bias)

         super().__init__(bbox_reg, 4)
