birder 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. birder/__init__.py +2 -0
  2. birder/common/fs_ops.py +81 -1
  3. birder/common/training_cli.py +12 -2
  4. birder/common/training_utils.py +73 -12
  5. birder/data/collators/detection.py +3 -1
  6. birder/datahub/_lib.py +15 -6
  7. birder/datahub/evaluation.py +591 -0
  8. birder/eval/__init__.py +0 -0
  9. birder/eval/__main__.py +74 -0
  10. birder/eval/_embeddings.py +50 -0
  11. birder/eval/adversarial.py +315 -0
  12. birder/eval/benchmarks/__init__.py +0 -0
  13. birder/eval/benchmarks/awa2.py +357 -0
  14. birder/eval/benchmarks/bioscan5m.py +198 -0
  15. birder/eval/benchmarks/fishnet.py +318 -0
  16. birder/eval/benchmarks/flowers102.py +210 -0
  17. birder/eval/benchmarks/fungiclef.py +261 -0
  18. birder/eval/benchmarks/nabirds.py +202 -0
  19. birder/eval/benchmarks/newt.py +262 -0
  20. birder/eval/benchmarks/plankton.py +255 -0
  21. birder/eval/benchmarks/plantdoc.py +259 -0
  22. birder/eval/benchmarks/plantnet.py +252 -0
  23. birder/eval/classification.py +235 -0
  24. birder/eval/methods/__init__.py +0 -0
  25. birder/eval/methods/ami.py +78 -0
  26. birder/eval/methods/knn.py +71 -0
  27. birder/eval/methods/linear.py +152 -0
  28. birder/eval/methods/mlp.py +178 -0
  29. birder/eval/methods/simpleshot.py +100 -0
  30. birder/eval/methods/svm.py +92 -0
  31. birder/inference/classification.py +23 -2
  32. birder/inference/detection.py +35 -15
  33. birder/net/_vit_configs.py +5 -0
  34. birder/net/cait.py +3 -3
  35. birder/net/coat.py +3 -3
  36. birder/net/cswin_transformer.py +2 -1
  37. birder/net/deit.py +1 -1
  38. birder/net/deit3.py +1 -1
  39. birder/net/detection/__init__.py +2 -0
  40. birder/net/detection/base.py +41 -18
  41. birder/net/detection/deformable_detr.py +74 -50
  42. birder/net/detection/detr.py +29 -26
  43. birder/net/detection/efficientdet.py +42 -25
  44. birder/net/detection/faster_rcnn.py +53 -21
  45. birder/net/detection/fcos.py +42 -23
  46. birder/net/detection/lw_detr.py +1204 -0
  47. birder/net/detection/plain_detr.py +60 -47
  48. birder/net/detection/retinanet.py +47 -35
  49. birder/net/detection/rt_detr_v1.py +49 -46
  50. birder/net/detection/rt_detr_v2.py +95 -102
  51. birder/net/detection/ssd.py +47 -31
  52. birder/net/detection/ssdlite.py +2 -2
  53. birder/net/detection/yolo_v2.py +33 -18
  54. birder/net/detection/yolo_v3.py +35 -33
  55. birder/net/detection/yolo_v4.py +35 -20
  56. birder/net/detection/yolo_v4_tiny.py +1 -2
  57. birder/net/edgevit.py +3 -3
  58. birder/net/efficientvit_msft.py +1 -1
  59. birder/net/flexivit.py +1 -1
  60. birder/net/hiera.py +44 -67
  61. birder/net/hieradet.py +2 -2
  62. birder/net/maxvit.py +2 -2
  63. birder/net/mim/fcmae.py +2 -2
  64. birder/net/mim/mae_hiera.py +9 -16
  65. birder/net/mnasnet.py +2 -2
  66. birder/net/nextvit.py +4 -4
  67. birder/net/resnext.py +2 -2
  68. birder/net/rope_deit3.py +2 -2
  69. birder/net/rope_flexivit.py +2 -2
  70. birder/net/rope_vit.py +2 -2
  71. birder/net/simple_vit.py +1 -1
  72. birder/net/squeezenet.py +1 -1
  73. birder/net/ssl/capi.py +32 -25
  74. birder/net/ssl/dino_v2.py +12 -15
  75. birder/net/ssl/franca.py +26 -19
  76. birder/net/van.py +2 -2
  77. birder/net/vit.py +21 -3
  78. birder/net/vit_parallel.py +1 -1
  79. birder/net/vit_sam.py +62 -16
  80. birder/net/xcit.py +1 -1
  81. birder/ops/msda.py +46 -16
  82. birder/scripts/benchmark.py +35 -8
  83. birder/scripts/predict.py +14 -1
  84. birder/scripts/predict_detection.py +7 -1
  85. birder/scripts/train.py +27 -11
  86. birder/scripts/train_capi.py +13 -10
  87. birder/scripts/train_detection.py +18 -7
  88. birder/scripts/train_franca.py +10 -2
  89. birder/scripts/train_kd.py +28 -11
  90. birder/tools/adversarial.py +5 -0
  91. birder/tools/convert_model.py +101 -43
  92. birder/tools/quantize_model.py +33 -16
  93. birder/version.py +1 -1
  94. {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/METADATA +17 -10
  95. {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/RECORD +99 -75
  96. {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/WHEEL +1 -1
  97. birder/scripts/evaluate.py +0 -176
  98. {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/entry_points.txt +0 -0
  99. {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/licenses/LICENSE +0 -0
  100. {birder-0.4.1.dist-info → birder-0.4.4.dist-info}/top_level.txt +0 -0
birder/net/deit3.py CHANGED
@@ -185,7 +185,7 @@ class DeiT3(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, MaskedT
         xs = self.encoder.forward_features(x, out_indices=self.out_indices)

         out: dict[str, torch.Tensor] = {}
-        for stage_name, stage_x in zip(self.return_stages, xs):
+        for stage_name, stage_x in zip(self.return_stages, xs, strict=True):
             stage_x = stage_x[:, self.num_special_tokens :]
             stage_x = stage_x.permute(0, 2, 1)
             B, C, _ = stage_x.size()
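Note (illustration, not part of the diff): zip(strict=True) turns a silent truncation into an explicit error when the two sequences get out of sync; a minimal sketch with made-up values:

stages = ["stage1", "stage2", "stage3"]
features = ["f1", "f2"]
list(zip(stages, features))                # [("stage1", "f1"), ("stage2", "f2")] - stage3 silently dropped
list(zip(stages, features, strict=True))   # raises ValueError (Python 3.10+)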
birder/net/detection/__init__.py CHANGED
@@ -3,6 +3,7 @@ from birder.net.detection.detr import DETR
 from birder.net.detection.efficientdet import EfficientDet
 from birder.net.detection.faster_rcnn import Faster_RCNN
 from birder.net.detection.fcos import FCOS
+from birder.net.detection.lw_detr import LW_DETR
 from birder.net.detection.plain_detr import Plain_DETR
 from birder.net.detection.retinanet import RetinaNet
 from birder.net.detection.rt_detr_v1 import RT_DETR_v1
@@ -21,6 +22,7 @@ __all__ = [
     "EfficientDet",
     "Faster_RCNN",
     "FCOS",
+    "LW_DETR",
     "Plain_DETR",
     "RetinaNet",
     "RT_DETR_v1",
birder/net/detection/base.py CHANGED
@@ -44,6 +44,7 @@ class DetectionBaseNet(nn.Module):
     block_group_regex: Optional[str]
     auto_register = False
     scriptable = True
+    exportable = True
     task = str(Task.OBJECT_DETECTION)

     def __init_subclass__(cls) -> None:
@@ -134,40 +135,62 @@ class DetectionBaseNet(nn.Module):
                     f" Found invalid box {degenerate_bb} for target at index {target_idx}.",
                 )

-    # pylint: disable=protected-access
-    def _to_img_list(self, x: torch.Tensor, image_sizes: Optional[list[list[int]]] = None) -> "ImageList":
+    def _to_img_list(self, x: torch.Tensor, image_sizes: Optional[list[tuple[int, int]]] = None) -> "ImageList":
+        B = x.size(0)
         if image_sizes is None:
-            image_sizes = [img.shape[-2:] for img in x]
-
-        image_sizes_list: list[tuple[int, int]] = []
-        for image_size in image_sizes:
-            torch._assert(
-                len(image_size) == 2,
-                f"Input tensors expected to have in the last two elements H and W, instead got {image_size}",
-            )
-            image_sizes_list.append((image_size[0], image_size[1]))
+            H = x.size(2)
+            W = x.size(3)
+            h_tensor = torch.full((B,), H, dtype=torch.int64, device=x.device)
+            w_tensor = torch.full((B,), W, dtype=torch.int64, device=x.device)
+            image_sizes_tensor = torch.stack([h_tensor, w_tensor], dim=1)
+        else:
+            image_sizes_tensor = torch.tensor(image_sizes, dtype=torch.int64, device=x.device)

-        return ImageList(x, image_sizes_list)
+        return ImageList(x, image_sizes_tensor)

     def forward(
         self,
         x: torch.Tensor,
         targets: Optional[list[dict[str, torch.Tensor]]] = None,
         masks: Optional[torch.Tensor] = None,
-        image_sizes: Optional[list[list[int]]] = None,
+        image_sizes: Optional[list[tuple[int, int]]] = None,
     ) -> tuple[list[dict[str, torch.Tensor]], dict[str, torch.Tensor]]:
         # TypedDict not supported for TorchScript - avoid returning DetectorResultType
         raise NotImplementedError


 class ImageList:
-    def __init__(self, tensors: torch.Tensor, image_sizes: list[tuple[int, int]]) -> None:
+    def __init__(self, tensors: torch.Tensor, image_sizes: torch.Tensor) -> None:
         self.tensors = tensors
-        self.image_sizes = image_sizes
+        self.image_sizes = image_sizes  # Shape: (B, 2) with [H, W] format

     def to(self, device: torch.device) -> "ImageList":
         cast_tensor = self.tensors.to(device)
-        return ImageList(cast_tensor, self.image_sizes)
+        cast_sizes = self.image_sizes.to(device)
+        return ImageList(cast_tensor, cast_sizes)
+
+
+def clip_boxes_to_image(boxes: torch.Tensor, image_size: torch.Tensor) -> torch.Tensor:
+    """
+    Clip boxes to image boundaries
+
+    Parameters
+    ----------
+    boxes
+        Boxes in (x1, y1, x2, y2) format, shape (..., 4)
+    image_size
+        Tensor of [height, width]
+
+    Returns
+    -------
+    Clipped boxes
+    """
+
+    boxes_x = boxes[..., 0::2].clamp(min=0, max=image_size[1])
+    boxes_y = boxes[..., 1::2].clamp(min=0, max=image_size[0])
+    clipped_boxes = torch.stack([boxes_x[..., 0], boxes_y[..., 0], boxes_x[..., 1], boxes_y[..., 1]], dim=-1)
+
+    return clipped_boxes


 ###############################################################################
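Note (illustration, not part of the diff): a minimal usage sketch of the new module-level clip_boxes_to_image helper, with made-up values:

import torch

boxes = torch.tensor([[-5.0, 10.0, 700.0, 300.0]])  # (x1, y1, x2, y2)
image_size = torch.tensor([480, 640])                # [H, W]
clip_boxes_to_image(boxes, image_size)               # -> tensor([[0., 10., 640., 300.]])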
@@ -325,7 +348,7 @@ class SimpleFeaturePyramidNetwork(nn.Module):


 # pylint: disable=protected-access,too-many-locals
-@torch.jit._script_if_tracing  # type: ignore
+@torch.jit._script_if_tracing  # type: ignore[untyped-decorator]
 def encode_boxes(reference_boxes: torch.Tensor, proposals: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
     """
     Encode a set of proposals with respect to some reference boxes
@@ -609,7 +632,7 @@ class Matcher(nn.Module):

         if match_quality_matrix.numel() == 0:
             # Empty targets or proposals not supported during training
-            if match_quality_matrix.shape[0] == 0:
+            if match_quality_matrix.size(0) == 0:
                 raise ValueError("No ground-truth boxes available for one of the images during training")

             raise ValueError("No proposal boxes available for one of the images during training")
birder/net/detection/deformable_detr.py CHANGED
@@ -56,7 +56,7 @@ class HungarianMatcher(nn.Module):
    @torch.jit.unused  # type: ignore[untyped-decorator]
    def forward(
        self, class_logits: torch.Tensor, box_regression: torch.Tensor, targets: list[dict[str, torch.Tensor]]
-    ) -> list[torch.Tensor]:
+    ) -> list[tuple[torch.Tensor, torch.Tensor]]:
        with torch.no_grad():
            B, num_queries = class_logits.shape[:2]

@@ -135,7 +135,7 @@ class MultiScaleDeformableAttention(nn.Module):
         self.reset_parameters()

     def reset_parameters(self) -> None:
-        nn.init.constant_(self.sampling_offsets.weight, 0.0)
+        nn.init.zeros_(self.sampling_offsets.weight)
         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
         grid_init = (
@@ -149,12 +149,12 @@ class MultiScaleDeformableAttention(nn.Module):
         with torch.no_grad():
             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))

-        nn.init.constant_(self.attention_weights.weight, 0.0)
-        nn.init.constant_(self.attention_weights.bias, 0.0)
+        nn.init.zeros_(self.attention_weights.weight)
+        nn.init.zeros_(self.attention_weights.bias)
         nn.init.xavier_uniform_(self.value_proj.weight)
-        nn.init.constant_(self.value_proj.bias, 0.0)
+        nn.init.zeros_(self.value_proj.bias)
         nn.init.xavier_uniform_(self.output_proj.weight)
-        nn.init.constant_(self.output_proj.bias, 0.0)
+        nn.init.zeros_(self.output_proj.bias)

     def forward(
         self,
@@ -164,10 +164,11 @@ class MultiScaleDeformableAttention(nn.Module):
         input_spatial_shapes: torch.Tensor,
         input_level_start_index: torch.Tensor,
         input_padding_mask: Optional[torch.Tensor] = None,
+        src_shapes: Optional[list[list[int]]] = None,
     ) -> torch.Tensor:
         N, num_queries, _ = query.size()
         N, sequence_length, _ = input_flatten.size()
-        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == sequence_length
+        # assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == sequence_length

         value = self.value_proj(input_flatten)
         if input_padding_mask is not None:
@@ -208,6 +209,7 @@ class MultiScaleDeformableAttention(nn.Module):
             sampling_locations,
             attention_weights,
             self.im2col_step,
+            src_shapes,
         )

         output = self.output_proj(output)
@@ -235,8 +237,9 @@ class DeformableTransformerEncoderLayer(nn.Module):
         spatial_shapes: torch.Tensor,
         level_start_index: torch.Tensor,
         mask: Optional[torch.Tensor],
+        src_shapes: Optional[list[list[int]]] = None,
     ) -> torch.Tensor:
-        src2 = self.self_attn(src + pos, reference_points, src, spatial_shapes, level_start_index, mask)
+        src2 = self.self_attn(src + pos, reference_points, src, spatial_shapes, level_start_index, mask, src_shapes)
         src = src + self.dropout(src2)
         src = self.norm1(src)

@@ -277,13 +280,13 @@ class DeformableTransformerDecoderLayer(nn.Module):
         level_start_index: torch.Tensor,
         src_padding_mask: Optional[torch.Tensor],
         self_attn_mask: Optional[torch.Tensor] = None,
+        src_shapes: Optional[list[list[int]]] = None,
     ) -> torch.Tensor:
         # Self attention
-        q = tgt + query_pos
-        k = tgt + query_pos
+        q_k = tgt + query_pos

         tgt2, _ = self.self_attn(
-            q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1), need_weights=False, attn_mask=self_attn_mask
+            q_k.transpose(0, 1), q_k.transpose(0, 1), tgt.transpose(0, 1), need_weights=False, attn_mask=self_attn_mask
         )
         tgt2 = tgt2.transpose(0, 1)
         tgt = tgt + self.dropout(tgt2)
@@ -291,7 +294,7 @@ class DeformableTransformerDecoderLayer(nn.Module):

         # Cross attention
         tgt2 = self.cross_attn(
-            tgt + query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask
+            tgt + query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask, src_shapes
         )
         tgt = tgt + self.dropout(tgt2)
         tgt = self.norm2(tgt)
@@ -311,17 +314,15 @@ class DeformableTransformerEncoder(nn.Module):

     @staticmethod
     def get_reference_points(
-        spatial_shapes: torch.Tensor, valid_ratios: torch.Tensor, device: torch.device
+        src_shapes: list[list[int]], valid_ratios: torch.Tensor, device: torch.device
     ) -> torch.Tensor:
         reference_points_list = []
-        for lvl, spatial_shape in enumerate(spatial_shapes):
-            H = spatial_shape[0]
-            W = spatial_shape[1]
-            ref_y, ref_x = torch.meshgrid(
-                torch.linspace(0.5, H - 0.5, H, dtype=torch.float32, device=device),
-                torch.linspace(0.5, W - 0.5, W, dtype=torch.float32, device=device),
-                indexing="ij",
-            )
+        for lvl, (H, W) in enumerate(src_shapes):
+            # Use arange instead of linspace - works with symbolic sizes
+            # linspace(0.5, H-0.5, H) is equivalent to arange(H) + 0.5
+            ref_y = (torch.arange(H, dtype=torch.float32, device=device) + 0.5).view(-1, 1).expand(-1, W)
+            ref_x = (torch.arange(W, dtype=torch.float32, device=device) + 0.5).view(1, -1).expand(H, -1)
+
             ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H)
             ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W)
             ref = torch.stack((ref_x, ref_y), dim=-1)
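Note (illustration, not part of the diff): for an integer H the two formulations build the same grid; arange avoids a data-dependent step size, which is what the in-diff comment means by working with symbolic sizes:

import torch

H = 7
a = torch.arange(H, dtype=torch.float32) + 0.5
b = torch.linspace(0.5, H - 0.5, H, dtype=torch.float32)
assert torch.allclose(a, b)  # both are [0.5, 1.5, ..., 6.5]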
@@ -336,15 +337,16 @@ class DeformableTransformerEncoder(nn.Module):
         self,
         src: torch.Tensor,
         spatial_shapes: torch.Tensor,
+        src_shapes: list[list[int]],
         level_start_index: torch.Tensor,
         pos: torch.Tensor,
         valid_ratios: torch.Tensor,
         mask: torch.Tensor,
     ) -> torch.Tensor:
         out = src
-        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
+        reference_points = self.get_reference_points(src_shapes, valid_ratios, device=src.device)
         for layer in self.layers:
-            out = layer(out, pos, reference_points, spatial_shapes, level_start_index, mask)
+            out = layer(out, pos, reference_points, spatial_shapes, level_start_index, mask, src_shapes)

         return out

@@ -369,6 +371,7 @@ class DeformableTransformerDecoder(nn.Module):
         query_pos: torch.Tensor,
         src_valid_ratios: torch.Tensor,
         src_padding_mask: torch.Tensor,
+        src_shapes: Optional[list[list[int]]] = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         output = tgt

@@ -391,6 +394,7 @@ class DeformableTransformerDecoder(nn.Module):
                 src_spatial_shapes,
                 src_level_start_index,
                 src_padding_mask,
+                src_shapes=src_shapes,
             )

             if self.box_refine is True:
@@ -482,10 +486,11 @@ class DeformableTransformer(nn.Module):
         src_list = []
         lvl_pos_embed_list = []
         mask_list = []
-        spatial_shape_list: list[list[int]] = []  # list[tuple[int, int]] not supported on TorchScript
+        src_shapes: list[list[int]] = []  # list[tuple[int, int]] not supported on TorchScript
+
         for lvl, (src, pos_embed, mask) in enumerate(zip(srcs, pos_embeds, masks)):
-            _, _, H, W = src.size()
-            spatial_shape_list.append([H, W])
+            H, W = src.shape[-2], src.shape[-1]
+            src_shapes.append([H, W])
             src = src.flatten(2).transpose(1, 2)
             pos_embed = pos_embed.flatten(2).transpose(1, 2)
             mask = mask.flatten(1)
@@ -497,13 +502,19 @@ class DeformableTransformer(nn.Module):
         src_flatten = torch.concat(src_list, dim=1)
         mask_flatten = torch.concat(mask_list, dim=1)
         lvl_pos_embed_flatten = torch.concat(lvl_pos_embed_list, dim=1)
-        spatial_shapes = torch.as_tensor(spatial_shape_list, dtype=torch.long, device=src_flatten.device)
+        spatial_shapes = torch.tensor(src_shapes, dtype=torch.long, device=src_flatten.device)
         level_start_index = torch.concat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]), dim=0)
         valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], dim=1)

         # Encoder
         memory = self.encoder(
-            src_flatten, spatial_shapes, level_start_index, lvl_pos_embed_flatten, valid_ratios, mask_flatten
+            src_flatten,
+            spatial_shapes,
+            src_shapes,
+            level_start_index,
+            lvl_pos_embed_flatten,
+            valid_ratios,
+            mask_flatten,
         )

         # Prepare input for decoder
@@ -515,7 +526,15 @@ class DeformableTransformer(nn.Module):

         # Decoder
         hs, inter_references = self.decoder(
-            tgt, reference_points, memory, spatial_shapes, level_start_index, query_embed, valid_ratios, mask_flatten
+            tgt,
+            reference_points,
+            memory,
+            spatial_shapes,
+            level_start_index,
+            query_embed,
+            valid_ratios,
+            mask_flatten,
+            src_shapes,
         )

         return (hs, reference_points, inter_references)
@@ -587,7 +606,7 @@ class Deformable_DETR(DetectionBaseNet):

         self.query_embed = nn.Embedding(num_queries, hidden_dim * 2)
         self.pos_enc = PositionEmbeddingSine(hidden_dim // 2, normalize=True)
-        self.matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2)
+        self.matcher = HungarianMatcher(cost_class=2.0, cost_bbox=5.0, cost_giou=2.0)

         class_embed = nn.Linear(hidden_dim, self.num_classes)
         bbox_embed = MLP(hidden_dim, [hidden_dim, hidden_dim, 4], activation_layer=nn.ReLU)
@@ -641,7 +660,8 @@ class Deformable_DETR(DetectionBaseNet):
         for param in self.class_embed.parameters():
             param.requires_grad_(True)

-    def _get_src_permutation_idx(self, indices: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
+    @staticmethod
+    def _get_src_permutation_idx(indices: list[tuple[torch.Tensor, torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]:
         batch_idx = torch.concat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
         src_idx = torch.concat([src for (src, _) in indices])
         return (batch_idx, src_idx)
@@ -650,7 +670,7 @@ class Deformable_DETR(DetectionBaseNet):
         self,
         cls_logits: torch.Tensor,
         targets: list[dict[str, torch.Tensor]],
-        indices: list[torch.Tensor],
+        indices: list[tuple[torch.Tensor, torch.Tensor]],
         num_boxes: int,
     ) -> torch.Tensor:
         idx = self._get_src_permutation_idx(indices)
@@ -675,7 +695,7 @@ class Deformable_DETR(DetectionBaseNet):
         self,
         box_output: torch.Tensor,
         targets: list[dict[str, torch.Tensor]],
-        indices: list[torch.Tensor],
+        indices: list[tuple[torch.Tensor, torch.Tensor]],
         num_boxes: int,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         idx = self._get_src_permutation_idx(indices)
@@ -709,7 +729,7 @@ class Deformable_DETR(DetectionBaseNet):
         if training_utils.is_dist_available_and_initialized() is True:
             torch.distributed.all_reduce(num_boxes)

-        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1).item()
+        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)

         loss_ce_list = []
         loss_bbox_list = []
@@ -734,7 +754,7 @@ class Deformable_DETR(DetectionBaseNet):
         return losses

     def postprocess_detections(
-        self, class_logits: torch.Tensor, box_regression: torch.Tensor, image_shapes: list[tuple[int, int]]
+        self, class_logits: torch.Tensor, box_regression: torch.Tensor, image_sizes: torch.Tensor
     ) -> list[dict[str, torch.Tensor]]:
         prob = class_logits.sigmoid()
         topk_values, topk_indexes = torch.topk(prob.view(class_logits.shape[0], -1), k=100, dim=1)
@@ -743,14 +763,12 @@ class Deformable_DETR(DetectionBaseNet):
         labels = topk_indexes % class_logits.shape[2]
         labels += 1  # Background offset

-        target_sizes = torch.tensor(image_shapes, device=class_logits.device)
-
         # Convert to [x0, y0, x1, y1] format
         boxes = box_ops.box_convert(box_regression, in_fmt="cxcywh", out_fmt="xyxy")
         boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

         # Convert from relative [0, 1] to absolute [0, height] coordinates
-        img_h, img_w = target_sizes.unbind(1)
+        img_h, img_w = image_sizes.unbind(1)
         scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
         boxes = boxes * scale_fct[:, None, :]

@@ -776,16 +794,7 @@ class Deformable_DETR(DetectionBaseNet):
         return detections

     # pylint: disable=too-many-locals
-    def forward(
-        self,
-        x: torch.Tensor,
-        targets: Optional[list[dict[str, torch.Tensor]]] = None,
-        masks: Optional[torch.Tensor] = None,
-        image_sizes: Optional[list[list[int]]] = None,
-    ) -> tuple[list[dict[str, torch.Tensor]], dict[str, torch.Tensor]]:
-        self._input_check(targets)
-        images = self._to_img_list(x, image_sizes)
-
+    def forward_net(self, x: torch.Tensor, masks: Optional[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
         features: dict[str, torch.Tensor] = self.backbone.detection_features(x)
         feature_list = list(features.values())
         mask_list = []
@@ -829,6 +838,20 @@ class Deformable_DETR(DetectionBaseNet):
         outputs_class = torch.stack(outputs_classes)
         outputs_coord = torch.stack(outputs_coords)

+        return (outputs_class, outputs_coord)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        targets: Optional[list[dict[str, torch.Tensor]]] = None,
+        masks: Optional[torch.Tensor] = None,
+        image_sizes: Optional[list[tuple[int, int]]] = None,
+    ) -> tuple[list[dict[str, torch.Tensor]], dict[str, torch.Tensor]]:
+        self._input_check(targets)
+        image_sizes_tensor = self._to_img_list(x, image_sizes).image_sizes
+
+        outputs_class, outputs_coord = self.forward_net(x, masks)
+
         losses = {}
         detections: list[dict[str, torch.Tensor]] = []
         if self.training is True:
@@ -838,14 +861,15 @@ class Deformable_DETR(DetectionBaseNet):
             for idx, target in enumerate(targets):
                 boxes = target["boxes"]
                 boxes = box_ops.box_convert(boxes, in_fmt="xyxy", out_fmt="cxcywh")
-                boxes = boxes / torch.tensor(images.image_sizes[idx][::-1] * 2, dtype=torch.float32, device=x.device)
+                scale = image_sizes_tensor[idx].flip(0).repeat(2).float()  # flip to [W, H], repeat to [W, H, W, H]
+                boxes = boxes / scale
                 targets[idx]["boxes"] = boxes
                 targets[idx]["labels"] = target["labels"] - 1  # No background

             losses = self.compute_loss(targets, outputs_class, outputs_coord)

         else:
-            detections = self.postprocess_detections(outputs_class[-1], outputs_coord[-1], images.image_sizes)
+            detections = self.postprocess_detections(outputs_class[-1], outputs_coord[-1], image_sizes_tensor)

         return (detections, losses)

birder/net/detection/detr.py CHANGED
@@ -49,7 +49,7 @@ class HungarianMatcher(nn.Module):
    @torch.jit.unused  # type: ignore[untyped-decorator]
    def forward(
        self, class_logits: torch.Tensor, box_regression: torch.Tensor, targets: list[dict[str, torch.Tensor]]
-    ) -> list[torch.Tensor]:
+    ) -> list[tuple[torch.Tensor, torch.Tensor]]:
        with torch.no_grad():
            B, num_queries = class_logits.shape[:2]

@@ -148,10 +148,9 @@ class TransformerDecoderLayer(nn.Module):
         query_pos: torch.Tensor,
         memory_key_padding_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        q = tgt + query_pos
-        k = tgt + query_pos
+        q_k = tgt + query_pos

-        tgt2, _ = self.self_attn(q, k, value=tgt, need_weights=False)
+        tgt2, _ = self.self_attn(q_k, q_k, value=tgt, need_weights=False)
         tgt = tgt + self.dropout1(tgt2)
         tgt = self.norm1(tgt)
         tgt2, _ = self.multihead_attn(
@@ -341,7 +340,7 @@ class DETR(DetectionBaseNet):
         )
         self.pos_enc = PositionEmbeddingSine(hidden_dim // 2, normalize=True)

-        self.matcher = HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)
+        self.matcher = HungarianMatcher(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)
         empty_weight = torch.ones(self.num_classes)
         empty_weight[0] = 0.1
         self.empty_weight = nn.Buffer(empty_weight)
@@ -365,7 +364,8 @@ class DETR(DetectionBaseNet):
         for param in self.class_embed.parameters():
             param.requires_grad_(True)

-    def _get_src_permutation_idx(self, indices: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
+    @staticmethod
+    def _get_src_permutation_idx(indices: list[tuple[torch.Tensor, torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]:
         batch_idx = torch.concat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
         src_idx = torch.concat([src for (src, _) in indices])
         return (batch_idx, src_idx)
@@ -374,7 +374,7 @@ class DETR(DetectionBaseNet):
         self,
         cls_logits: torch.Tensor,
         targets: list[dict[str, torch.Tensor]],
-        indices: list[torch.Tensor],
+        indices: list[tuple[torch.Tensor, torch.Tensor]],
     ) -> torch.Tensor:
         idx = self._get_src_permutation_idx(indices)
         target_classes_o = torch.concat([t["labels"][J] for t, (_, J) in zip(targets, indices)], dim=0)
@@ -388,7 +388,7 @@ class DETR(DetectionBaseNet):
         self,
         box_output: torch.Tensor,
         targets: list[dict[str, torch.Tensor]],
-        indices: list[torch.Tensor],
+        indices: list[tuple[torch.Tensor, torch.Tensor]],
         num_boxes: int,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         idx = self._get_src_permutation_idx(indices)
@@ -422,7 +422,7 @@ class DETR(DetectionBaseNet):
         if training_utils.is_dist_available_and_initialized() is True:
             torch.distributed.all_reduce(num_boxes)

-        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1).item()
+        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)

         loss_ce_list = []
         loss_bbox_list = []
@@ -447,20 +447,17 @@ class DETR(DetectionBaseNet):
         return losses

     def postprocess_detections(
-        self, class_logits: torch.Tensor, box_regression: torch.Tensor, image_shapes: list[tuple[int, int]]
+        self, class_logits: torch.Tensor, box_regression: torch.Tensor, image_sizes: torch.Tensor
     ) -> list[dict[str, torch.Tensor]]:
         prob = F.softmax(class_logits, -1)
         scores, labels = prob[..., 1:].max(-1)
         labels = labels + 1

-        # TorchScript doesn't support creating tensor from tuples, convert everything to lists
-        target_sizes = torch.tensor([list(s) for s in image_shapes], device=class_logits.device)
-
         # Convert to [x0, y0, x1, y1] format
         boxes = box_ops.box_convert(box_regression, in_fmt="cxcywh", out_fmt="xyxy")

         # Convert from relative [0, 1] to absolute [0, height] coordinates
-        img_h, img_w = target_sizes.unbind(1)
+        img_h, img_w = image_sizes.unbind(1)
         scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
         boxes = boxes * scale_fct[:, None, :]

@@ -485,16 +482,7 @@ class DETR(DetectionBaseNet):

         return detections

-    def forward(
-        self,
-        x: torch.Tensor,
-        targets: Optional[list[dict[str, torch.Tensor]]] = None,
-        masks: Optional[torch.Tensor] = None,
-        image_sizes: Optional[list[list[int]]] = None,
-    ) -> tuple[list[dict[str, torch.Tensor]], dict[str, torch.Tensor]]:
-        self._input_check(targets)
-        images = self._to_img_list(x, image_sizes)
-
+    def forward_net(self, x: torch.Tensor, masks: Optional[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
         features: dict[str, torch.Tensor] = self.backbone.detection_features(x)
         x = features[self.backbone.return_stages[-1]]
         if masks is not None:
@@ -505,6 +493,20 @@ class DETR(DetectionBaseNet):
         outputs_class = self.class_embed(hs)
         outputs_coord = self.bbox_embed(hs).sigmoid()

+        return (outputs_class, outputs_coord)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        targets: Optional[list[dict[str, torch.Tensor]]] = None,
+        masks: Optional[torch.Tensor] = None,
+        image_sizes: Optional[list[tuple[int, int]]] = None,
+    ) -> tuple[list[dict[str, torch.Tensor]], dict[str, torch.Tensor]]:
+        self._input_check(targets)
+        image_sizes_tensor = self._to_img_list(x, image_sizes).image_sizes
+
+        outputs_class, outputs_coord = self.forward_net(x, masks)
+
         losses = {}
         detections: list[dict[str, torch.Tensor]] = []
         if self.training is True:
@@ -514,13 +516,14 @@ class DETR(DetectionBaseNet):
             for idx, target in enumerate(targets):
                 boxes = target["boxes"]
                 boxes = box_ops.box_convert(boxes, in_fmt="xyxy", out_fmt="cxcywh")
-                boxes = boxes / torch.tensor(images.image_sizes[idx][::-1] * 2, dtype=torch.float32, device=x.device)
+                scale = image_sizes_tensor[idx].flip(0).repeat(2).float()  # flip to [W, H], repeat to [W, H, W, H]
+                boxes = boxes / scale
                 targets[idx]["boxes"] = boxes

             losses = self.compute_loss(targets, outputs_class, outputs_coord)

         else:
-            detections = self.postprocess_detections(outputs_class[-1], outputs_coord[-1], images.image_sizes)
+            detections = self.postprocess_detections(outputs_class[-1], outputs_coord[-1], image_sizes_tensor)

         return (detections, losses)

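Note (illustration, not part of the diff): with the forward/forward_net split shown above, the raw per-query outputs and the decoded per-image detections can be obtained separately; a hedged usage sketch, assuming model is a constructed DETR-style detection net from this package:

model.eval()
x = torch.randn(1, 3, 640, 640)
outputs_class, outputs_coord = model.forward_net(x, None)  # raw class logits and cxcywh boxes per query
detections, _ = model(x)                                   # per-image detection dicts via postprocess_detections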