birder 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/common/training_cli.py +6 -1
- birder/common/training_utils.py +69 -12
- birder/net/_vit_configs.py +5 -0
- birder/net/cait.py +3 -3
- birder/net/coat.py +3 -3
- birder/net/deit.py +1 -1
- birder/net/deit3.py +1 -1
- birder/net/detection/__init__.py +2 -0
- birder/net/detection/deformable_detr.py +12 -12
- birder/net/detection/detr.py +7 -7
- birder/net/detection/lw_detr.py +1181 -0
- birder/net/detection/plain_detr.py +7 -5
- birder/net/detection/retinanet.py +1 -1
- birder/net/detection/rt_detr_v1.py +10 -10
- birder/net/detection/rt_detr_v2.py +47 -64
- birder/net/detection/ssdlite.py +2 -2
- birder/net/edgevit.py +3 -3
- birder/net/efficientvit_msft.py +1 -1
- birder/net/flexivit.py +1 -1
- birder/net/hieradet.py +2 -2
- birder/net/mnasnet.py +2 -2
- birder/net/resnext.py +2 -2
- birder/net/rope_deit3.py +1 -1
- birder/net/rope_flexivit.py +1 -1
- birder/net/rope_vit.py +1 -1
- birder/net/simple_vit.py +1 -1
- birder/net/vit.py +21 -3
- birder/net/vit_parallel.py +1 -1
- birder/net/vit_sam.py +62 -16
- birder/scripts/train.py +12 -8
- birder/scripts/train_capi.py +13 -10
- birder/scripts/train_detection.py +2 -1
- birder/scripts/train_kd.py +12 -8
- birder/version.py +1 -1
- {birder-0.4.1.dist-info → birder-0.4.2.dist-info}/METADATA +3 -3
- {birder-0.4.1.dist-info → birder-0.4.2.dist-info}/RECORD +40 -39
- {birder-0.4.1.dist-info → birder-0.4.2.dist-info}/WHEEL +1 -1
- {birder-0.4.1.dist-info → birder-0.4.2.dist-info}/entry_points.txt +0 -0
- {birder-0.4.1.dist-info → birder-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {birder-0.4.1.dist-info → birder-0.4.2.dist-info}/top_level.txt +0 -0
@@ -522,13 +522,13 @@ class Plain_DETR(DetectionBaseNet):
 
         self.class_embed = nn.Linear(hidden_dim, self.num_classes)
         self.bbox_embed = MLP(hidden_dim, [hidden_dim, hidden_dim, 4], activation_layer=nn.ReLU)
-        self.query_embed = nn.
+        self.query_embed = nn.Parameter(torch.empty(self.num_queries, hidden_dim * 2))
         self.reference_point_head = MLP(hidden_dim, [hidden_dim, hidden_dim, 4], activation_layer=nn.ReLU)
         self.input_proj = nn.Conv2d(
             self.backbone.return_channels[-1], hidden_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)
         )
         self.pos_enc = PositionEmbeddingSine(hidden_dim // 2, normalize=True)
-        self.matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2)
+        self.matcher = HungarianMatcher(cost_class=2.0, cost_bbox=5.0, cost_giou=2.0)
 
         if box_refine is True:
             self.class_embed = _get_clones(self.class_embed, num_decoder_layers)

@@ -554,6 +554,7 @@ class Plain_DETR(DetectionBaseNet):
             if idx == 0:
                 nn.init.constant_(last_linear.bias[2:], -2.0) # Small initial wh
 
+        nn.init.normal_(self.query_embed)
         ref_last_linear = [m for m in self.reference_point_head.modules() if isinstance(m, nn.Linear)][-1]
         nn.init.zeros_(ref_last_linear.weight)
         nn.init.zeros_(ref_last_linear.bias)

@@ -576,7 +577,8 @@ class Plain_DETR(DetectionBaseNet):
         for param in self.class_embed.parameters():
             param.requires_grad_(True)
 
-
+    @staticmethod
+    def _get_src_permutation_idx(indices: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
         batch_idx = torch.concat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
         src_idx = torch.concat([src for (src, _) in indices])
         return (batch_idx, src_idx)

@@ -646,7 +648,7 @@ class Plain_DETR(DetectionBaseNet):
         if training_utils.is_dist_available_and_initialized() is True:
             torch.distributed.all_reduce(num_boxes)
 
-        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)
+        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)
 
         loss_ce_list = []
         loss_bbox_list = []

@@ -772,7 +774,7 @@ class Plain_DETR(DetectionBaseNet):
         else:
             num_queries_to_use = self.num_queries_one2one
 
-        query_embed = self.query_embed
+        query_embed = self.query_embed[:num_queries_to_use]
        query_embed, query_pos = torch.split(query_embed, self.hidden_dim, dim=1)
        query_embed = query_embed.unsqueeze(0).expand(B, -1, -1)
        query_pos = query_pos.unsqueeze(0).expand(B, -1, -1)
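For context, a minimal standalone sketch (shapes assumed, not birder code) of the query-embedding layout used above: a single (num_queries, 2 * hidden_dim) parameter is sliced to the number of active queries and then split into content and positional halves.

import torch

num_queries, hidden_dim, batch_size = 300, 256, 2
query_embed_param = torch.empty(num_queries, hidden_dim * 2)
torch.nn.init.normal_(query_embed_param)

num_queries_to_use = 100  # e.g. one-to-one queries only at inference
selected = query_embed_param[:num_queries_to_use]           # (100, 512)
content, pos = torch.split(selected, hidden_dim, dim=1)     # two (100, 256) halves
content = content.unsqueeze(0).expand(batch_size, -1, -1)   # (2, 100, 256)
pos = pos.unsqueeze(0).expand(batch_size, -1, -1)           # (2, 100, 256)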
@@ -63,7 +63,7 @@ class RetinaNetClassificationHead(nn.Module):
             if isinstance(layer, nn.Conv2d):
                 nn.init.normal_(layer.weight, std=0.01)
                 if layer.bias is not None:
-                    nn.init.
+                    nn.init.zeros_(layer.bias)
 
         self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
 
@@ -596,18 +596,18 @@ class RT_DETRDecoder(nn.Module):
 
         # Gather reference points
         reference_points_unact = enc_outputs_coord_unact.gather(
-            dim=1, index=topk_ind.unsqueeze(-1).
+            dim=1, index=topk_ind.unsqueeze(-1).expand(-1, -1, enc_outputs_coord_unact.shape[-1])
         )
 
         enc_topk_bboxes = reference_points_unact.sigmoid()
 
         # Gather encoder logits for loss computation
         enc_topk_logits = enc_outputs_class.gather(
-            dim=1, index=topk_ind.unsqueeze(-1).
+            dim=1, index=topk_ind.unsqueeze(-1).expand(-1, -1, enc_outputs_class.shape[-1])
         )
 
         # Extract region features
-        target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).
+        target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).expand(-1, -1, output_memory.shape[-1]))
         target = target.detach()
 
         return (target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits)
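The recurring pattern above selects the top-k entries along the token dimension with torch.gather, expanding the index tensor to the feature width first. A minimal standalone sketch (shapes assumed, not birder code):

import torch

memory = torch.randn(2, 100, 256)          # (batch, tokens, channels)
scores = torch.randn(2, 100)
topk_ind = scores.topk(10, dim=1).indices   # (2, 10)

# gather needs an index with the same rank as the source tensor,
# so the (2, 10) indices are expanded across the channel dimension
index = topk_ind.unsqueeze(-1).expand(-1, -1, memory.shape[-1])  # (2, 10, 256)
selected = memory.gather(dim=1, index=index)                     # (2, 10, 256)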
@@ -653,7 +653,7 @@ class RT_DETRDecoder(nn.Module):
         reference_points = init_ref_points_unact.sigmoid()
         for decoder_layer, bbox_head, class_head in zip(self.layers, self.bbox_embed, self.class_embed):
             query_pos = self.query_pos_head(reference_points)
-            reference_points_input = reference_points.unsqueeze(2).
+            reference_points_input = reference_points.unsqueeze(2).expand(-1, -1, len(spatial_shapes), -1)
             target = decoder_layer(
                 target,
                 query_pos,

@@ -743,7 +743,7 @@ class RT_DETR_v1(DetectionBaseNet):
         self.decoder = RT_DETRDecoder(
             hidden_dim=hidden_dim,
             num_classes=self.num_classes,
-            num_queries=num_queries,
+            num_queries=self.num_queries,
             num_decoder_layers=num_decoder_layers,
             num_levels=self.num_levels,
             num_heads=num_heads,

@@ -810,7 +810,8 @@ class RT_DETR_v1(DetectionBaseNet):
         for param in self.denoising_class_embed.parameters():
             param.requires_grad_(True)
 
-
+    @staticmethod
+    def _get_src_permutation_idx(indices: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
         batch_idx = torch.concat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
         src_idx = torch.concat([src for (src, _) in indices])
         return (batch_idx, src_idx)
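The _get_src_permutation_idx helper, now a @staticmethod in several detection heads, turns the matcher output into a pair of flat index tensors for advanced indexing. A small illustration with made-up matcher indices (not birder code):

import torch

# matcher output: one (src_idx, tgt_idx) pair per image in the batch
indices = [
    (torch.tensor([3, 7]), torch.tensor([0, 1])),  # image 0: queries 3 and 7 matched
    (torch.tensor([1]), torch.tensor([0])),        # image 1: query 1 matched
]

batch_idx = torch.concat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])  # [0, 0, 1]
src_idx = torch.concat([src for (src, _) in indices])                                      # [3, 7, 1]

# the pair indexes the matched predictions out of a (batch, num_queries, ...) tensor
pred_boxes = torch.randn(2, 10, 4)
matched = pred_boxes[(batch_idx, src_idx)]  # shape (3, 4)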
@@ -927,8 +928,6 @@ class RT_DETR_v1(DetectionBaseNet):
 
         return (loss_ce_dn, loss_bbox_dn, loss_giou_dn)
 
-    @torch.jit.unused # type: ignore[untyped-decorator]
-    @torch.compiler.disable() # type: ignore[untyped-decorator]
     def _compute_loss_from_outputs(  # pylint: disable=too-many-locals
         self,
         targets: list[dict[str, torch.Tensor]],

@@ -946,7 +945,7 @@ class RT_DETR_v1(DetectionBaseNet):
         if training_utils.is_dist_available_and_initialized() is True:
             torch.distributed.all_reduce(num_boxes)
 
-        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)
+        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)
 
         loss_ce_list = []
         loss_bbox_list = []

@@ -1051,7 +1050,7 @@ class RT_DETR_v1(DetectionBaseNet):
 
         # Convert to [x0, y0, x1, y1] format
         boxes = box_ops.box_convert(box_regression, in_fmt="cxcywh", out_fmt="xyxy")
-        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).
+        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).expand(-1, -1, 4))
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
         img_h, img_w = target_sizes.unbind(1)

@@ -1113,6 +1112,7 @@ class RT_DETR_v1(DetectionBaseNet):
             else:
                 B, _, H, W = feat.size()
                 m = torch.zeros(B, H, W, dtype=torch.bool, device=x.device)
+
             mask_list.append(m)
 
         encoder_features = self.encoder(feature_list, masks=mask_list)
@@ -147,7 +147,7 @@ class MultiScaleDeformableAttention(nn.Module):
             param.requires_grad_(False)
 
     def reset_parameters(self) -> None:
-        nn.init.
+        nn.init.zeros_(self.sampling_offsets.weight)
         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
         grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)[0]

@@ -158,12 +158,12 @@ class MultiScaleDeformableAttention(nn.Module):
         with torch.no_grad():
             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
 
-        nn.init.
-        nn.init.
+        nn.init.zeros_(self.attention_weights.weight)
+        nn.init.zeros_(self.attention_weights.bias)
         nn.init.xavier_uniform_(self.value_proj.weight)
-        nn.init.
+        nn.init.zeros_(self.value_proj.bias)
         nn.init.xavier_uniform_(self.output_proj.weight)
-        nn.init.
+        nn.init.zeros_(self.output_proj.bias)
 
     def forward(
         self,
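reset_parameters above zero-initializes the offset and attention-weight projections and seeds the sampling-offset bias with one direction per attention head. A standalone sketch of that directional grid (head count assumed, tiling across levels and points omitted, not birder code):

import math
import torch

n_heads = 8
thetas = torch.arange(n_heads, dtype=torch.float32) * (2.0 * math.pi / n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)         # (8, 2) unit-circle directions
grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)[0]  # scale so the largest coordinate is 1
# each head starts by sampling along its own direction; the flattened grid
# becomes the bias of the sampling-offset linear layer
print(grid_init.view(-1).shape)  # torch.Size([16])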
@@ -174,7 +174,7 @@ class MultiScaleDeformableAttention(nn.Module):
         input_level_start_index: torch.Tensor,
         input_padding_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-
+        num_queries = query.size(1)
         N, sequence_length, _ = input_flatten.size()
         assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == sequence_length
 
@@ -366,10 +366,9 @@ class TransformerDecoderLayer(nn.Module):
         self_attn_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         # Self attention
-
-        k = tgt + query_pos
+        q_k = tgt + query_pos
 
-        tgt2 = self.self_attn(
+        tgt2 = self.self_attn(q_k, q_k, tgt, attn_mask=self_attn_mask)
         tgt = tgt + self.dropout(tgt2)
         tgt = self.norm1(tgt)
 
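The decoder self-attention above adds the positional query embedding to both query and key while leaving the value position-free, the usual DETR-style scheme. A minimal sketch with nn.MultiheadAttention (batch_first layout assumed, dropout omitted, not birder code):

import torch
from torch import nn

hidden_dim, num_heads = 256, 8
self_attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)

tgt = torch.randn(2, 100, hidden_dim)        # decoder queries (content)
query_pos = torch.randn(2, 100, hidden_dim)  # learned positional part

q_k = tgt + query_pos          # position goes into Q and K only
tgt2, _ = self_attn(q_k, q_k, tgt)
tgt = tgt + tgt2               # residual connection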
@@ -526,18 +525,18 @@ class RT_DETRDecoder(nn.Module):
 
         # Gather reference points
         reference_points_unact = enc_outputs_coord_unact.gather(
-            dim=1, index=topk_ind.unsqueeze(-1).
+            dim=1, index=topk_ind.unsqueeze(-1).expand(-1, -1, enc_outputs_coord_unact.shape[-1])
         )
 
         enc_topk_bboxes = reference_points_unact.sigmoid()
 
         # Gather encoder logits for loss computation
         enc_topk_logits = enc_outputs_class.gather(
-            dim=1, index=topk_ind.unsqueeze(-1).
+            dim=1, index=topk_ind.unsqueeze(-1).expand(-1, -1, enc_outputs_class.shape[-1])
         )
 
         # Extract region features
-        target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).
+        target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).expand(-1, -1, output_memory.shape[-1]))
         target = target.detach()
 
         return (target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits)
@@ -583,7 +582,7 @@ class RT_DETRDecoder(nn.Module):
         reference_points = init_ref_points_unact.sigmoid()
         for decoder_layer, bbox_head, class_head in zip(self.layers, self.bbox_embed, self.class_embed):
             query_pos = self.query_pos_head(reference_points)
-            reference_points_input = reference_points.unsqueeze(2).
+            reference_points_input = reference_points.unsqueeze(2).expand(-1, -1, len(spatial_shapes), -1)
             target = decoder_layer(
                 target,
                 query_pos,

@@ -675,7 +674,7 @@ class RT_DETR_v2(DetectionBaseNet):
         self.decoder = RT_DETRDecoder(
             hidden_dim=hidden_dim,
             num_classes=self.num_classes,
-            num_queries=num_queries,
+            num_queries=self.num_queries,
             num_decoder_layers=num_decoder_layers,
             num_levels=self.num_levels,
             num_heads=num_heads,
@@ -744,20 +743,32 @@ class RT_DETR_v2(DetectionBaseNet):
         for param in self.denoising_class_embed.parameters():
             param.requires_grad_(True)
 
-
+    @staticmethod
+    def _get_src_permutation_idx(indices: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
         batch_idx = torch.concat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
         src_idx = torch.concat([src for (src, _) in indices])
         return (batch_idx, src_idx)
 
-    def
+    def _compute_layer_losses(
         self,
         cls_logits: torch.Tensor,
         box_output: torch.Tensor,
         targets: list[dict[str, torch.Tensor]],
         indices: list[torch.Tensor],
         num_boxes: float,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         idx = self._get_src_permutation_idx(indices)
+
+        src_boxes = box_output[idx]
+        target_boxes = torch.concat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        src_boxes_xyxy = box_ops.box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy")
+        target_boxes_xyxy = box_ops.box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy")
+
+        # IoU for varifocal loss (class loss)
+        ious = torch.diag(box_ops.box_iou(src_boxes_xyxy, target_boxes_xyxy)).detach()
+
+        # Classification loss
         target_classes_o = torch.concat([t["labels"][J] for t, (_, J) in zip(targets, indices)], dim=0)
         target_classes = torch.full(cls_logits.shape[:2], self.num_classes, dtype=torch.int64, device=cls_logits.device)
         target_classes[idx] = target_classes_o

@@ -771,15 +782,6 @@ class RT_DETR_v2(DetectionBaseNet):
         target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
         target_classes_onehot = target_classes_onehot[:, :, :-1]
 
-        src_boxes = box_output[idx]
-        target_boxes = torch.concat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
-        ious = torch.diag(
-            box_ops.box_iou(
-                box_ops.box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
-                box_ops.box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
-            )
-        ).detach()
-
         target_score_o = torch.zeros(cls_logits.shape[:2], dtype=cls_logits.dtype, device=cls_logits.device)
         target_score_o[idx] = ious.to(cls_logits.dtype)
         target_score = target_score_o.unsqueeze(-1) * target_classes_onehot

@@ -787,31 +789,13 @@ class RT_DETR_v2(DetectionBaseNet):
         loss = varifocal_loss(cls_logits, target_score, target_classes_onehot, alpha=0.75, gamma=2.0)
         loss_ce = (loss.mean(1).sum() / num_boxes) * cls_logits.shape[1]
 
-
+        # Box L1 loss
+        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none").sum() / num_boxes
 
-
-
-        box_output: torch.Tensor,
-        targets: list[dict[str, torch.Tensor]],
-        indices: list[torch.Tensor],
-        num_boxes: float,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        idx = self._get_src_permutation_idx(indices)
-        src_boxes = box_output[idx]
-        target_boxes = torch.concat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
-
-        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none")
-        loss_bbox = loss_bbox.sum() / num_boxes
+        # GIoU loss
+        loss_giou = (1 - torch.diag(box_ops.generalized_box_iou(src_boxes_xyxy, target_boxes_xyxy))).sum() / num_boxes
 
-
-            box_ops.generalized_box_iou(
-                box_ops.box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
-                box_ops.box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy"),
-            )
-        )
-        loss_giou = loss_giou.sum() / num_boxes
-
-        return (loss_bbox, loss_giou)
+        return (loss_ce, loss_bbox, loss_giou)
 
     def _compute_denoising_loss(
         self,
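The refactor above folds the separate box-loss helper into a single _compute_layer_losses that returns (loss_ce, loss_bbox, loss_giou). A self-contained sketch of the box-related part of that computation using torchvision's box ops (simplified, matched pairs assumed, not birder code):

import torch
import torch.nn.functional as F
from torchvision.ops import box_convert, box_iou, generalized_box_iou

# matched predictions and targets in cxcywh format, already aligned one-to-one
src_boxes = torch.tensor([[0.5, 0.5, 0.2, 0.2], [0.3, 0.3, 0.1, 0.4]])
target_boxes = torch.tensor([[0.5, 0.5, 0.25, 0.2], [0.35, 0.3, 0.1, 0.4]])
num_boxes = float(len(target_boxes))

src_xyxy = box_convert(src_boxes, in_fmt="cxcywh", out_fmt="xyxy")
target_xyxy = box_convert(target_boxes, in_fmt="cxcywh", out_fmt="xyxy")

ious = torch.diag(box_iou(src_xyxy, target_xyxy))  # per-pair IoU, feeds the varifocal targets
loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none").sum() / num_boxes
loss_giou = (1 - torch.diag(generalized_box_iou(src_xyxy, target_xyxy))).sum() / num_boxes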
@@ -846,11 +830,9 @@ class RT_DETR_v2(DetectionBaseNet):
                 )
             )
 
-            loss_ce = self.
+            loss_ce, loss_bbox, loss_giou = self._compute_layer_losses(
                 dn_out_logits[layer_idx], dn_out_bboxes[layer_idx], targets, indices, dn_num_boxes
             )
-            loss_bbox, loss_giou = self._box_loss(dn_out_bboxes[layer_idx], targets, indices, dn_num_boxes)
-
             loss_ce_list.append(loss_ce)
             loss_bbox_list.append(loss_bbox)
             loss_giou_list.append(loss_giou)

@@ -861,9 +843,7 @@ class RT_DETR_v2(DetectionBaseNet):
 
         return (loss_ce_dn, loss_bbox_dn, loss_giou_dn)
 
-
-    @torch.compiler.disable() # type: ignore[untyped-decorator]
-    def _compute_loss_from_outputs( # pylint: disable=too-many-locals
+    def _compute_loss_from_outputs(
         self,
         targets: list[dict[str, torch.Tensor]],
         out_bboxes: torch.Tensor,

@@ -880,7 +860,7 @@ class RT_DETR_v2(DetectionBaseNet):
         if training_utils.is_dist_available_and_initialized() is True:
             torch.distributed.all_reduce(num_boxes)
 
-        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)
+        num_boxes = torch.clamp(num_boxes / training_utils.get_world_size(), min=1)
 
         loss_ce_list = []
         loss_bbox_list = []

@@ -889,19 +869,21 @@ class RT_DETR_v2(DetectionBaseNet):
         # Decoder losses (all layers)
         for layer_idx in range(out_logits.shape[0]):
             indices = self.matcher(out_logits[layer_idx], out_bboxes[layer_idx], targets)
-            loss_ce = self.
-
+            loss_ce, loss_bbox, loss_giou = self._compute_layer_losses(
+                out_logits[layer_idx], out_bboxes[layer_idx], targets, indices, num_boxes
+            )
             loss_ce_list.append(loss_ce)
             loss_bbox_list.append(loss_bbox)
             loss_giou_list.append(loss_giou)
 
         # Encoder auxiliary loss
         enc_indices = self.matcher(enc_topk_logits, enc_topk_bboxes, targets)
-
-
-
-
-
+        loss_ce, loss_bbox, loss_giou = self._compute_layer_losses(
+            enc_topk_logits, enc_topk_bboxes, targets, enc_indices, num_boxes
+        )
+        loss_ce_list.append(loss_ce)
+        loss_bbox_list.append(loss_bbox)
+        loss_giou_list.append(loss_giou)
 
         loss_ce = torch.stack(loss_ce_list).sum()  # VFL weight is 1
         loss_bbox = torch.stack(loss_bbox_list).sum() * 5

@@ -985,7 +967,7 @@ class RT_DETR_v2(DetectionBaseNet):
 
         # Convert to [x0, y0, x1, y1] format
         boxes = box_ops.box_convert(box_regression, in_fmt="cxcywh", out_fmt="xyxy")
-        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).
+        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).expand(-1, -1, 4))
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
         img_h, img_w = target_sizes.unbind(1)

@@ -1047,6 +1029,7 @@ class RT_DETR_v2(DetectionBaseNet):
             else:
                 B, _, H, W = feat.size()
                 m = torch.zeros(B, H, W, dtype=torch.bool, device=x.device)
+
             mask_list.append(m)
 
         encoder_features = self.encoder(feature_list, masks=mask_list)
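Per-layer decoder losses, plus the encoder auxiliary term, are stacked and weighted once at the end; the hunk above shows the VFL term kept at weight 1 and the L1 term scaled by 5. A toy sketch of that aggregation (the GIoU weight is assumed, as it is not visible in the hunk; not birder code):

import torch

# one entry per decoder layer (plus the encoder auxiliary term)
loss_ce_list = [torch.tensor(0.9), torch.tensor(0.7), torch.tensor(0.6)]
loss_bbox_list = [torch.tensor(0.20), torch.tensor(0.15), torch.tensor(0.12)]
loss_giou_list = [torch.tensor(0.30), torch.tensor(0.25), torch.tensor(0.22)]

loss_ce = torch.stack(loss_ce_list).sum()          # VFL weight is 1
loss_bbox = torch.stack(loss_bbox_list).sum() * 5  # L1 weight from the hunk above
loss_giou = torch.stack(loss_giou_list).sum() * 2  # GIoU weight assumed
total = loss_ce + loss_bbox + loss_giou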
birder/net/detection/ssdlite.py
CHANGED
@@ -50,7 +50,7 @@ class SSDLiteClassificationHead(SSDScoringHead):
             if isinstance(layer, nn.Conv2d):
                 nn.init.xavier_uniform_(layer.weight)
                 if layer.bias is not None:
-                    nn.init.
+                    nn.init.zeros_(layer.bias)
 
         super().__init__(cls_logits, num_classes)
 

@@ -79,7 +79,7 @@ class SSDLiteRegressionHead(SSDScoringHead):
             if isinstance(layer, nn.Conv2d):
                 nn.init.xavier_uniform_(layer.weight)
                 if layer.bias is not None:
-                    nn.init.
+                    nn.init.zeros_(layer.bias)
 
         super().__init__(bbox_reg, 4)
 
birder/net/edgevit.py
CHANGED
@@ -332,11 +332,11 @@ class EdgeViT(DetectorBackbone):
             if isinstance(m, nn.Linear):
                 nn.init.trunc_normal_(m.weight, std=0.02)
                 if m.bias is not None:
-                    nn.init.
+                    nn.init.zeros_(m.bias)
 
             elif isinstance(m, nn.LayerNorm):
-                nn.init.
-                nn.init.
+                nn.init.zeros_(m.bias)
+                nn.init.ones_(m.weight)
 
     def detection_features(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
         out = {}
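Several backbones in this release settle on the same initialization pattern: zeros for biases, ones for normalization weights. A minimal sketch of such an init loop (generic module, not birder code):

import torch
from torch import nn

def init_weights(model: nn.Module) -> None:
    for m in model.modules():
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d)):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)

init_weights(nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8)))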
birder/net/efficientvit_msft.py
CHANGED
birder/net/flexivit.py
CHANGED
@@ -314,7 +314,7 @@ class FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Mask
         xs = self.encoder.forward_features(x, out_indices=self.out_indices)
 
         out: dict[str, torch.Tensor] = {}
-        for stage_name, stage_x in zip(self.return_stages, xs):
+        for stage_name, stage_x in zip(self.return_stages, xs, strict=True):
             stage_x = stage_x[:, self.num_special_tokens :]
             stage_x = stage_x.permute(0, 2, 1)
             B, C, _ = stage_x.size()
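The recurring zip(..., strict=True) change (here and in the ViT-family backbones below) makes a silent length mismatch between return_stages and the collected feature maps raise instead of truncating. A short illustration (Python 3.10+, not birder code):

stages = ["stage1", "stage2", "stage3"]
features = ["f1", "f2"]  # one feature map missing

list(zip(stages, features))               # silently drops "stage3"
list(zip(stages, features, strict=True))  # raises ValueError: argument 2 is shorter than argument 1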
birder/net/hieradet.py
CHANGED
@@ -613,11 +613,11 @@ registry.register_weights( # SAM v2: https://arxiv.org/abs/2408.00714
             "HieraDet small image encoder pre-trained by Meta AI using SAM v2. "
             "This model has not been fine-tuned for a specific classification task"
         ),
-        "resolution": (
+        "resolution": (1024, 1024),
         "formats": {
             "pt": {
                 "file_size": 129.6,
-                "sha256": "
+                "sha256": "2ede3a78389ca74ed37d82dbc1c3410549f1fdafb5a7a94ac02968aa6d3dec80",
             }
         },
         "net": {"network": "hieradet_small", "tag": "sam2_1"},
birder/net/mnasnet.py
CHANGED
@@ -230,8 +230,8 @@ class MNASNet(DetectorBackbone):
                 nn.init.zeros_(m.bias)
 
             elif isinstance(m, nn.BatchNorm2d):
-                nn.init.
-                nn.init.
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
 
             elif isinstance(m, nn.Linear):
                 nn.init.kaiming_uniform_(m.weight, mode="fan_out", nonlinearity="sigmoid")
birder/net/resnext.py
CHANGED
@@ -205,8 +205,8 @@ class ResNeXt(DetectorBackbone):
                 nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
 
             elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
-                nn.init.
-                nn.init.
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
 
     def detection_features(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
         x = self.stem(x)
birder/net/rope_deit3.py
CHANGED
@@ -249,7 +249,7 @@ class RoPE_DeiT3(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Ma
         xs = self.encoder.forward_features(x, rope, out_indices=self.out_indices)
 
         out: dict[str, torch.Tensor] = {}
-        for stage_name, stage_x in zip(self.return_stages, xs):
+        for stage_name, stage_x in zip(self.return_stages, xs, strict=True):
             stage_x = stage_x[:, self.num_special_tokens :]
             stage_x = stage_x.permute(0, 2, 1)
             B, C, _ = stage_x.size()
birder/net/rope_flexivit.py
CHANGED
@@ -342,7 +342,7 @@ class RoPE_FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin,
         xs = self.encoder.forward_features(x, rope, out_indices=self.out_indices)
 
         out: dict[str, torch.Tensor] = {}
-        for stage_name, stage_x in zip(self.return_stages, xs):
+        for stage_name, stage_x in zip(self.return_stages, xs, strict=True):
             stage_x = stage_x[:, self.num_special_tokens :]
             stage_x = stage_x.permute(0, 2, 1)
             B, C, _ = stage_x.size()
birder/net/rope_vit.py
CHANGED
@@ -698,7 +698,7 @@ class RoPE_ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Mask
         xs = self.encoder.forward_features(x, rope, out_indices=self.out_indices)
 
         out: dict[str, torch.Tensor] = {}
-        for stage_name, stage_x in zip(self.return_stages, xs):
+        for stage_name, stage_x in zip(self.return_stages, xs, strict=True):
             stage_x = stage_x[:, self.num_special_tokens :]
             stage_x = stage_x.permute(0, 2, 1)
             B, C, _ = stage_x.size()
birder/net/simple_vit.py
CHANGED
@@ -215,7 +215,7 @@ class Simple_ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin):
         xs = self.encoder.forward_features(x, out_indices=self.out_indices)
 
         out: dict[str, torch.Tensor] = {}
-        for stage_name, stage_x in zip(self.return_stages, xs):
+        for stage_name, stage_x in zip(self.return_stages, xs, strict=True):
             stage_x = stage_x[:, self.num_special_tokens :]
             stage_x = stage_x.permute(0, 2, 1)
             B, C, _ = stage_x.size()
birder/net/vit.py
CHANGED
@@ -572,7 +572,7 @@ class ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, MaskedTok
         xs = self.encoder.forward_features(x, out_indices=self.out_indices)
 
         out: dict[str, torch.Tensor] = {}
-        for stage_name, stage_x in zip(self.return_stages, xs):
+        for stage_name, stage_x in zip(self.return_stages, xs, strict=True):
             stage_x = stage_x[:, self.num_special_tokens :]
             stage_x = stage_x.permute(0, 2, 1)
             B, C, _ = stage_x.size()

@@ -802,6 +802,24 @@ class ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, MaskedTok
 # Register model configs (side effects)
 register_vit_configs(ViT)
 
+registry.register_weights( # BioCLIP v1: https://arxiv.org/abs/2311.18803
+    "vit_b16_pn_bioclip-v1",
+    {
+        "url": "https://huggingface.co/birder-project/vit_b16_pn_bioclip-v1/resolve/main",
+        "description": (
+            "ViT b16 image encoder pre-trained by Imageomics using CLIP on the TreeOfLife-10M dataset. "
+            "This model has not been fine-tuned for a specific classification task"
+        ),
+        "resolution": (224, 224),
+        "formats": {
+            "pt": {
+                "file_size": 328.9,
+                "sha256": "9b2e5598f233657932eeb77e027cd4c4d683bf75515768fe6971cab6ec10bf15",
+            },
+        },
+        "net": {"network": "vit_b16_pn", "tag": "bioclip-v1"},
+    },
+)
 registry.register_weights(
     "vit_l16_mim_200",
     {

@@ -849,8 +867,8 @@ registry.register_weights( # BioCLIP v2: https://arxiv.org/abs/2505.23883
         "resolution": (224, 224),
         "formats": {
             "pt": {
-                "file_size":
-                "sha256": "
+                "file_size": 1159.7,
+                "sha256": "301a325579dafdfa2ea13b0cbaf8129211ecd1429c29afa20d1c2eaaa91d8b0d",
             },
         },
         "net": {"network": "vit_l14_pn", "tag": "bioclip-v2"},
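Both new weight entries ship a file_size and sha256 field. A small standalone sketch for checking a downloaded checkpoint against the registered digest (hashlib only; the local path is hypothetical, not birder code):

import hashlib
from pathlib import Path

def sha256_of(path: Path) -> str:
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

# hypothetical local path for the newly registered BioCLIP v1 weights
weights = Path("models/vit_b16_pn_bioclip-v1.pt")
expected = "9b2e5598f233657932eeb77e027cd4c4d683bf75515768fe6971cab6ec10bf15"
if weights.exists():
    assert sha256_of(weights) == expected, "checksum mismatch"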
birder/net/vit_parallel.py
CHANGED
@@ -370,7 +370,7 @@ class ViT_Parallel(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin,
         xs = self.encoder.forward_features(x, out_indices=self.out_indices)
 
         out: dict[str, torch.Tensor] = {}
-        for stage_name, stage_x in zip(self.return_stages, xs):
+        for stage_name, stage_x in zip(self.return_stages, xs, strict=True):
             stage_x = stage_x[:, self.num_special_tokens :]
             stage_x = stage_x.permute(0, 2, 1)
             B, C, _ = stage_x.size()