birder 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/common/lib.py +2 -9
- birder/common/training_cli.py +18 -0
- birder/common/training_utils.py +123 -10
- birder/data/collators/detection.py +10 -3
- birder/data/datasets/coco.py +8 -10
- birder/data/transforms/detection.py +30 -13
- birder/inference/detection.py +108 -4
- birder/inference/wbf.py +226 -0
- birder/net/__init__.py +8 -0
- birder/net/detection/efficientdet.py +65 -86
- birder/net/detection/rt_detr_v1.py +1 -0
- birder/net/detection/yolo_anchors.py +205 -0
- birder/net/detection/yolo_v2.py +25 -24
- birder/net/detection/yolo_v3.py +39 -40
- birder/net/detection/yolo_v4.py +28 -26
- birder/net/detection/yolo_v4_tiny.py +24 -20
- birder/net/fasternet.py +1 -1
- birder/net/gc_vit.py +671 -0
- birder/net/lit_v1.py +472 -0
- birder/net/lit_v1_tiny.py +342 -0
- birder/net/lit_v2.py +436 -0
- birder/net/mobilenet_v4_hybrid.py +1 -1
- birder/net/resnet_v1.py +1 -1
- birder/net/resnext.py +67 -25
- birder/net/se_resnet_v1.py +46 -0
- birder/net/se_resnext.py +3 -0
- birder/net/simple_vit.py +2 -2
- birder/net/vit.py +0 -15
- birder/net/vovnet_v2.py +31 -1
- birder/scripts/benchmark.py +90 -21
- birder/scripts/predict.py +1 -0
- birder/scripts/predict_detection.py +18 -11
- birder/scripts/train.py +10 -34
- birder/scripts/train_barlow_twins.py +10 -34
- birder/scripts/train_byol.py +10 -34
- birder/scripts/train_capi.py +10 -35
- birder/scripts/train_data2vec.py +9 -34
- birder/scripts/train_data2vec2.py +9 -34
- birder/scripts/train_detection.py +48 -40
- birder/scripts/train_dino_v1.py +10 -34
- birder/scripts/train_dino_v2.py +9 -34
- birder/scripts/train_dino_v2_dist.py +9 -34
- birder/scripts/train_franca.py +9 -34
- birder/scripts/train_i_jepa.py +9 -34
- birder/scripts/train_ibot.py +9 -34
- birder/scripts/train_kd.py +156 -64
- birder/scripts/train_mim.py +10 -34
- birder/scripts/train_mmcr.py +10 -34
- birder/scripts/train_rotnet.py +10 -34
- birder/scripts/train_simclr.py +10 -34
- birder/scripts/train_vicreg.py +10 -34
- birder/tools/auto_anchors.py +20 -1
- birder/tools/pack.py +172 -103
- birder/tools/show_det_iterator.py +10 -1
- birder/version.py +1 -1
- {birder-0.2.2.dist-info → birder-0.2.3.dist-info}/METADATA +3 -3
- {birder-0.2.2.dist-info → birder-0.2.3.dist-info}/RECORD +61 -55
- {birder-0.2.2.dist-info → birder-0.2.3.dist-info}/WHEEL +0 -0
- {birder-0.2.2.dist-info → birder-0.2.3.dist-info}/entry_points.txt +0 -0
- {birder-0.2.2.dist-info → birder-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {birder-0.2.2.dist-info → birder-0.2.3.dist-info}/top_level.txt +0 -0
birder/net/lit_v2.py
ADDED
@@ -0,0 +1,436 @@
+"""
+LIT v2, adapted from
+https://github.com/ziplab/LITv2/blob/main/classification/models/litv2.py
+
+Paper "Fast Vision Transformers with HiLo Attention", https://arxiv.org/abs/2205.13213
+
+Generated by Claude Code Opus 4.5
+"""
+
+# Reference license: Apache-2.0
+
+import math
+from collections import OrderedDict
+from typing import Any
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torchvision.ops import Permute
+from torchvision.ops import StochasticDepth
+
+from birder.model_registry import registry
+from birder.net.base import DetectorBackbone
+from birder.net.lit_v1 import DeformablePatchMerging
+from birder.net.lit_v1 import IdentityDownsample
+
+
+class DepthwiseMLP(nn.Module):
+    def __init__(self, in_features: int, hidden_features: int) -> None:
+        super().__init__()
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.dwconv = nn.Conv2d(
+            hidden_features, hidden_features, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=hidden_features
+        )
+        self.act = nn.GELU()
+        self.fc2 = nn.Linear(hidden_features, in_features)
+
+    def forward(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
+        x = self.fc1(x)
+
+        (B, N, C) = x.size()
+        x = x.reshape(B, H, W, C).permute(0, 3, 1, 2).contiguous()
+        x = self.dwconv(x)
+        x = x.permute(0, 2, 3, 1).reshape(B, N, C)
+        x = self.act(x)
+        x = self.fc2(x)
+
+        return x
+
+
+class DepthwiseMLPBlock(nn.Module):
+    def __init__(self, dim: int, mlp_ratio: float, drop_path: float) -> None:
+        super().__init__()
+        self.norm = nn.LayerNorm(dim)
+        self.mlp = DepthwiseMLP(dim, int(dim * mlp_ratio))
+        self.drop_path = StochasticDepth(drop_path, mode="row")
+
+    def forward(self, x: torch.Tensor, resolution: tuple[int, int]) -> torch.Tensor:
+        (H, W) = resolution
+        return x + self.drop_path(self.mlp(self.norm(x), H, W))
+
+
+class HiLoAttention(nn.Module):
+    """
+    HiLo Attention: High-frequency local attention + Low-frequency global attention
+
+    Hi-Fi (High frequency): Local window attention
+    Lo-Fi (Low frequency): Global attention with average pooling
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        window_size: int,
+        alpha: float,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim must be divisible by num_heads"
+
+        self.window_size = window_size
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim**-0.5
+
+        # Split heads between Lo-Fi (global) and Hi-Fi (local)
+        self.l_heads = int(num_heads * alpha)  # Lo-Fi heads
+        self.h_heads = num_heads - self.l_heads  # Hi-Fi heads
+        self.l_dim = self.l_heads * self.head_dim
+        self.h_dim = self.h_heads * self.head_dim
+        self.head_dim = self.head_dim
+
+        # ws == 1 is equal to standard multi-head self-attention
+        if window_size == 1:
+            self.h_heads = 0
+            self.h_dim = 0
+            self.l_heads = num_heads
+            self.l_dim = dim
+
+        # Lo-Fi: Global attention with pooling
+        if self.l_heads > 0:
+            if window_size > 1:
+                self.sr = nn.AvgPool2d(kernel_size=(window_size, window_size), stride=(window_size, window_size))
+            else:
+                self.sr = nn.Identity()
+
+            self.l_q = nn.Linear(dim, self.l_dim)
+            self.l_kv = nn.Linear(dim, self.l_dim * 2)
+            self.l_proj = nn.Linear(self.l_dim, self.l_dim)
+        else:
+            self.l_q = nn.Identity()
+            self.l_kv = nn.Identity()
+            self.l_proj = nn.Identity()
+
+        # Hi-Fi: Local window attention
+        if self.h_heads > 0:
+            self.h_qkv = nn.Linear(dim, self.h_dim * 3)
+            self.h_proj = nn.Linear(self.h_dim, self.h_dim)
+        else:
+            self.h_qkv = nn.Identity()
+            self.h_proj = nn.Identity()
+
+    def _lofi(self, x: torch.Tensor) -> torch.Tensor:
+        (B, H, W, C) = x.size()
+
+        q = self.l_q(x).reshape(B, H * W, self.l_heads, self.head_dim).permute(0, 2, 1, 3)
+
+        # Spatial reduction for k, v
+        if self.window_size > 1:
+            x = x.permute(0, 3, 1, 2)
+            x = self.sr(x).reshape(B, C, -1).permute(0, 2, 1)
+            kv = self.l_kv(x).reshape(B, -1, 2, self.l_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        else:
+            kv = self.l_kv(x).reshape(B, -1, 2, self.l_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+
+        (k, v) = kv.unbind(0)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = F.softmax(attn, dim=-1)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, H, W, self.l_dim)
+        x = self.l_proj(x)
+
+        return x
+
+    def _hifi(self, x: torch.Tensor) -> torch.Tensor:
+        (B, H, W, _) = x.size()
+        ws = self.window_size
+
+        # Pad if needed
+        pad_h = (ws - H % ws) % ws
+        pad_w = (ws - W % ws) % ws
+        if pad_h > 0 or pad_w > 0:
+            x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+
+        (_, h_pad, w_pad, _) = x.size()
+        h_groups = h_pad // ws
+        w_groups = w_pad // ws
+        total_groups = h_groups * w_groups
+
+        x = x.reshape(B, h_groups, ws, w_groups, ws, -1).transpose(2, 3)
+
+        qkv = self.h_qkv(x).reshape(B, total_groups, -1, 3, self.h_heads, self.head_dim).permute(3, 0, 1, 4, 2, 5)
+        (q, k, v) = qkv.unbind(0)
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = F.softmax(attn, dim=-1)
+
+        x = (attn @ v).transpose(2, 3).reshape(B, h_groups, w_groups, ws, ws, self.h_dim)
+        x = x.transpose(2, 3).reshape(B, h_pad, w_pad, self.h_dim)
+        x = self.h_proj(x)
+
+        # Remove padding
+        if pad_h > 0 or pad_w > 0:
+            x = x[:, :H, :W, :].contiguous()
+
+        return x
+
+    def forward(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
+        (B, N, C) = x.size()
+        x = x.reshape(B, H, W, C)
+
+        if self.h_heads == 0:
+            x = self._lofi(x)
+            return x.reshape(B, N, C)
+
+        if self.l_heads == 0:
+            x = self._hifi(x)
+            return x.reshape(B, N, C)
+
+        # Process both branches and concatenate
+        hifi_out = self._hifi(x)
+        lofi_out = self._lofi(x)
+
+        x = torch.concat((hifi_out, lofi_out), dim=-1)
+        return x.reshape(B, N, C)
+
+
+class HiLoBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        window_size: int,
+        alpha: float,
+        mlp_ratio: float,
+        drop_path: float,
+    ) -> None:
+        super().__init__()
+        self.norm1 = nn.LayerNorm(dim)
+        self.attn = HiLoAttention(dim, num_heads, window_size, alpha)
+        self.drop_path1 = StochasticDepth(drop_path, mode="row")
+        self.norm2 = nn.LayerNorm(dim)
+        self.mlp = DepthwiseMLP(dim, int(dim * mlp_ratio))
+        self.drop_path2 = StochasticDepth(drop_path, mode="row")
+
+    def forward(self, x: torch.Tensor, resolution: tuple[int, int]) -> torch.Tensor:
+        (H, W) = resolution
+        x = x + self.drop_path1(self.attn(self.norm1(x), H, W))
+        x = x + self.drop_path2(self.mlp(self.norm2(x), H, W))
+        return x
+
+
+class LITStage(nn.Module):
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        resolution: tuple[int, int],
+        depth: int,
+        num_heads: int,
+        window_size: int,
+        alpha: float,
+        mlp_ratio: float,
+        downsample: bool,
+        drop_path: list[float],
+    ) -> None:
+        super().__init__()
+        if downsample is True:
+            self.downsample = DeformablePatchMerging(in_dim, out_dim)
+            resolution = (resolution[0] // 2, resolution[1] // 2)
+        else:
+            self.downsample = IdentityDownsample()
+
+        blocks: list[nn.Module] = []
+        for i in range(depth):
+            if window_size > 0:
+                blocks.append(HiLoBlock(out_dim, num_heads, window_size, alpha, mlp_ratio, drop_path[i]))
+            else:
+                blocks.append(DepthwiseMLPBlock(out_dim, mlp_ratio, drop_path[i]))
+
+        self.blocks = nn.ModuleList(blocks)
+
+    def forward(self, x: torch.Tensor, input_resolution: tuple[int, int]) -> tuple[torch.Tensor, int, int]:
+        (x, H, W) = self.downsample(x, input_resolution)
+        for block in self.blocks:
+            x = block(x, (H, W))
+
+        return (x, H, W)
+
+
+# pylint: disable=invalid-name
+class LIT_v2(DetectorBackbone):
+    block_group_regex = r"body\.stage(\d+)\.blocks\.(\d+)"
+
+    # pylint:disable=too-many-locals
+    def __init__(
+        self,
+        input_channels: int,
+        num_classes: int,
+        *,
+        config: Optional[dict[str, Any]] = None,
+        size: Optional[tuple[int, int]] = None,
+    ) -> None:
+        super().__init__(input_channels, num_classes, config=config, size=size)
+        assert self.config is not None, "must set config"
+
+        patch_size = 4
+        embed_dim: int = self.config["embed_dim"]
+        depths: list[int] = self.config["depths"]
+        num_heads: list[int] = self.config["num_heads"]
+        local_ws: list[int] = self.config["local_ws"]
+        alpha: float = self.config["alpha"]
+        drop_path_rate: float = self.config["drop_path_rate"]
+
+        num_stages = len(depths)
+
+        self.stem = nn.Sequential(
+            nn.Conv2d(
+                self.input_channels,
+                embed_dim,
+                kernel_size=(patch_size, patch_size),
+                stride=(patch_size, patch_size),
+                padding=(0, 0),
+                bias=True,
+            ),
+            Permute([0, 2, 3, 1]),
+            nn.LayerNorm(embed_dim),
+        )
+
+        # Stochastic depth
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+
+        stages: OrderedDict[str, nn.Module] = OrderedDict()
+        return_channels: list[int] = []
+        prev_dim = embed_dim
+        resolution = (self.size[0] // patch_size, self.size[1] // patch_size)
+        for i_stage in range(num_stages):
+            in_dim = prev_dim
+            out_dim = in_dim * 2 if i_stage > 0 else in_dim
+            stage = LITStage(
+                in_dim,
+                out_dim,
+                resolution,
+                depth=depths[i_stage],
+                num_heads=num_heads[i_stage],
+                window_size=local_ws[i_stage],
+                alpha=alpha,
+                mlp_ratio=4.0,
+                downsample=i_stage > 0,
+                drop_path=dpr[i_stage],
+            )
+            stages[f"stage{i_stage + 1}"] = stage
+
+            if i_stage > 0:
+                resolution = (resolution[0] // 2, resolution[1] // 2)
+
+            prev_dim = out_dim
+            return_channels.append(out_dim)
+
+        num_features = embed_dim * (2 ** (num_stages - 1))
+        self.body = nn.ModuleDict(stages)
+        self.features = nn.Sequential(
+            nn.LayerNorm(num_features),
+            Permute([0, 2, 1]),
+            nn.AdaptiveAvgPool1d(output_size=1),
+            nn.Flatten(1),
+        )
+        self.return_channels = return_channels
+        self.embedding_size = num_features
+        self.classifier = self.create_classifier()
+
+        # Weight initialization
+        for name, m in self.named_modules():
+            if isinstance(m, nn.Linear):
+                nn.init.trunc_normal_(m.weight, std=0.02)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Conv2d):
+                if name.endswith("offset_conv") is True:
+                    continue
+
+                fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                fan_out //= m.groups
+                nn.init.normal_(m.weight, mean=0.0, std=math.sqrt(2.0 / fan_out))
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+    def detection_features(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
+        x = self.stem(x)
+        (B, H, W, C) = x.size()
+        x = x.reshape(B, H * W, C)
+
+        out = {}
+        for name, stage in self.body.items():
+            (x, H, W) = stage(x, (H, W))
+            if name in self.return_stages:
+                features = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
+                out[name] = features
+
+        return out
+
+    def freeze_stages(self, up_to_stage: int) -> None:
+        for param in self.stem.parameters():
+            param.requires_grad = False
+
+        for idx, stage in enumerate(self.body.values()):
+            if idx >= up_to_stage:
+                break
+
+            for param in stage.parameters():
+                param.requires_grad = False
+
+    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.stem(x)
+        (B, H, W, C) = x.size()
+        x = x.reshape(B, H * W, C)
+        for stage in self.body.values():
+            (x, H, W) = stage(x, (H, W))
+
+        return x
+
+    def embedding(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.forward_features(x)
+        return self.features(x)
+
+
+registry.register_model_config(
+    "lit_v2_s",
+    LIT_v2,
+    config={
+        "embed_dim": 96,
+        "depths": [2, 2, 6, 2],
+        "num_heads": [3, 6, 12, 24],
+        "local_ws": [0, 0, 2, 1],
+        "alpha": 0.9,
+        "drop_path_rate": 0.2,
+    },
+)
+registry.register_model_config(
+    "lit_v2_m",
+    LIT_v2,
+    config={
+        "embed_dim": 96,
+        "depths": [2, 2, 18, 2],
+        "num_heads": [3, 6, 12, 24],
+        "local_ws": [0, 0, 2, 1],
+        "alpha": 0.9,
+        "drop_path_rate": 0.3,
+    },
+)
+registry.register_model_config(
+    "lit_v2_b",
+    LIT_v2,
+    config={
+        "embed_dim": 128,
+        "depths": [2, 2, 18, 2],
+        "num_heads": [4, 8, 16, 32],
+        "local_ws": [0, 0, 2, 1],
+        "alpha": 0.9,
+        "drop_path_rate": 0.5,
+    },
+)
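Usage note: the new backbone follows the same DetectorBackbone interface as the rest of birder.net. The sketch below is illustrative rather than part of the diff; the constructor signature, embedding(), and detection_features() are taken verbatim from the file above, the config dict is copied from the lit_v2_s registration, and the input size and class count are arbitrary examples.

import torch

from birder.net.lit_v2 import LIT_v2

# Config copied from the "lit_v2_s" registration; any size divisible by the
# patch size (4) and the three 2x stage downsamples works here
net = LIT_v2(
    input_channels=3,
    num_classes=10,
    config={
        "embed_dim": 96,
        "depths": [2, 2, 6, 2],
        "num_heads": [3, 6, 12, 24],
        "local_ws": [0, 0, 2, 1],
        "alpha": 0.9,
        "drop_path_rate": 0.2,
    },
    size=(256, 256),
)
embedding = net.embedding(torch.rand(1, 3, 256, 256))  # (1, 768), since num_features = embed_dim * 2**3
features = net.detection_features(torch.rand(1, 3, 256, 256))  # NCHW maps for the stages in return_stages

With local_ws = [0, 0, 2, 1], the first two stages run pure depthwise-conv MLP blocks, stage 3 splits its heads between windowed Hi-Fi and pooled Lo-Fi attention, and stage 4 (window size 1) collapses to standard global self-attention via the window_size == 1 branch in HiLoAttention.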
birder/net/mobilenet_v4_hybrid.py
CHANGED
@@ -491,7 +491,7 @@ registry.register_weights(
         "formats": {
             "pt": {
                 "file_size": 39.7,
-                "sha256": "
+                "sha256": "d7d76733e0116d351bf8aafc563659eab7bea02174a02c10fba8eb3a64ea87e1",
             }
         },
         "net": {"network": "mobilenet_v4_hybrid_m", "tag": "il-common"},
birder/net/resnet_v1.py
CHANGED
birder/net/resnext.py
CHANGED
@@ -30,6 +30,7 @@ class ResidualBlock(nn.Module):
         base_width: int,
         expansion: int,
         squeeze_excitation: bool,
+        avg_down: bool,
     ) -> None:
         super().__init__()
         width = int(out_channels * (base_width / 64.0)) * groups
@@ -62,20 +63,34 @@ class ResidualBlock(nn.Module):
             nn.BatchNorm2d(out_channels * expansion),
         )
 
-        if in_channels == out_channels * expansion:
+        if in_channels == out_channels * expansion and stride == (1, 1):
             self.block2 = nn.Identity()
         else:
-            self.block2 = nn.Sequential(
-                nn.Conv2d(
-                    in_channels,
-                    out_channels * expansion,
-                    kernel_size=(1, 1),
-                    stride=stride,
-                    padding=(0, 0),
-                    bias=False,
-                ),
-                nn.BatchNorm2d(out_channels * expansion),
-            )
+            if avg_down is True and stride != (1, 1):
+                self.block2 = nn.Sequential(
+                    nn.AvgPool2d(kernel_size=2, stride=stride, ceil_mode=True, count_include_pad=False),
+                    nn.Conv2d(
+                        in_channels,
+                        out_channels * expansion,
+                        kernel_size=(1, 1),
+                        stride=(1, 1),
+                        padding=(0, 0),
+                        bias=False,
+                    ),
+                    nn.BatchNorm2d(out_channels * expansion),
+                )
+            else:
+                self.block2 = nn.Sequential(
+                    nn.Conv2d(
+                        in_channels,
+                        out_channels * expansion,
+                        kernel_size=(1, 1),
+                        stride=stride,
+                        padding=(0, 0),
+                        bias=False,
+                    ),
+                    nn.BatchNorm2d(out_channels * expansion),
+                )
 
         self.relu = nn.ReLU(inplace=True)
         if squeeze_excitation is True:
@@ -107,23 +122,35 @@ class ResNeXt(DetectorBackbone):
         super().__init__(input_channels, num_classes, config=config, size=size)
         assert self.config is not None, "must set config"
 
-        groups = 32
-        base_width = 4
         expansion = 4
+        groups: int = self.config.get("groups", 32)
+        base_width: int = self.config.get("base_width", 4)
         filter_list = [64, 128, 256, 512]
         units: list[int] = self.config["units"]
+        deep_stem: bool = self.config.get("deep_stem", False)
+        avg_down: bool = self.config.get("avg_down", False)
 
-        self.stem = nn.Sequential(
-            Conv2dNormActivation(
-                self.input_channels,
-                filter_list[0],
-                kernel_size=(7, 7),
-                stride=(2, 2),
-                padding=(3, 3),
-                bias=False,
-            ),
-            nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
-        )
+        if deep_stem is True:
+            self.stem = nn.Sequential(
+                Conv2dNormActivation(
+                    self.input_channels, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False
+                ),
+                Conv2dNormActivation(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
+                Conv2dNormActivation(32, filter_list[0], kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
+                nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+            )
+        else:
+            self.stem = nn.Sequential(
+                Conv2dNormActivation(
+                    self.input_channels,
+                    filter_list[0],
+                    kernel_size=(7, 7),
+                    stride=(2, 2),
+                    padding=(3, 3),
+                    bias=False,
+                ),
+                nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+            )
 
         # Generate body layers
         in_channels = filter_list[0]
@@ -150,6 +177,7 @@ class ResNeXt(DetectorBackbone):
                        base_width=base_width,
                        expansion=expansion,
                        squeeze_excitation=squeeze_excitation,
+                       avg_down=avg_down,
                    )
                )
            in_channels = channels * expansion
@@ -209,3 +237,17 @@ class ResNeXt(DetectorBackbone):
 registry.register_model_config("resnext_50", ResNeXt, config={"units": [3, 4, 6, 3]})
 registry.register_model_config("resnext_101", ResNeXt, config={"units": [3, 4, 23, 3]})
 registry.register_model_config("resnext_152", ResNeXt, config={"units": [3, 8, 36, 3]})
+
+registry.register_model_config("resnext_101_32x8", ResNeXt, config={"units": [3, 4, 23, 3], "base_width": 8})
+registry.register_model_config("resnext_101_64x4", ResNeXt, config={"units": [3, 4, 23, 3], "groups": 64})
+
+# ResNeXt-D variants (From: Bag of Tricks for Image Classification with Convolutional Neural Networks)
+registry.register_model_config(
+    "resnext_d_50", ResNeXt, config={"units": [3, 4, 6, 3], "deep_stem": True, "avg_down": True}
+)
+registry.register_model_config(
+    "resnext_d_101", ResNeXt, config={"units": [3, 4, 23, 3], "deep_stem": True, "avg_down": True}
+)
+registry.register_model_config(
+    "resnext_d_152", ResNeXt, config={"units": [3, 8, 36, 3], "deep_stem": True, "avg_down": True}
+)
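In the residual block, avg_down swaps the strided 1x1 projection shortcut for a 2x2 average pool followed by a stride-1 1x1 convolution, so downsampling no longer discards three quarters of the shortcut activations; combined with deep_stem (three 3x3 convolutions in place of the single 7x7) this is the ResNet-D recipe from the Bag of Tricks paper cited in the new comment. A minimal sketch of building one of the new variants directly, not part of the diff; it assumes ResNeXt exposes the keyword constructor its own super().__init__ call above implies, and the size and class count are arbitrary examples:

from birder.net.resnext import ResNeXt

# Same config dict as the "resnext_d_50" registration above
resnext_d_50 = ResNeXt(
    input_channels=3,
    num_classes=1000,
    config={"units": [3, 4, 6, 3], "deep_stem": True, "avg_down": True},
    size=(224, 224),
)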
birder/net/se_resnet_v1.py
CHANGED
@@ -57,3 +57,49 @@ registry.register_model_config(
     SE_ResNet_v1,
     config={"bottle_neck": True, "filter_list": [64, 256, 512, 1024, 2048], "units": [3, 30, 48, 8]},
 )
+
+# SE-ResNet-D variants (From: Bag of Tricks for Image Classification with Convolutional Neural Networks)
+registry.register_model_config(
+    "se_resnet_d_50",
+    SE_ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 6, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_d_101",
+    SE_ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 4, 23, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_d_152",
+    SE_ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 8, 36, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
+registry.register_model_config(
+    "se_resnet_d_200",
+    SE_ResNet_v1,
+    config={
+        "bottle_neck": True,
+        "filter_list": [64, 256, 512, 1024, 2048],
+        "units": [3, 24, 36, 3],
+        "deep_stem": True,
+        "avg_down": True,
+    },
+)
birder/net/se_resnext.py
CHANGED
@@ -25,3 +25,6 @@ class SE_ResNeXt(ResNeXt):
 registry.register_model_config("se_resnext_50", SE_ResNeXt, config={"units": [3, 4, 6, 3]})
 registry.register_model_config("se_resnext_101", SE_ResNeXt, config={"units": [3, 4, 23, 3]})
 registry.register_model_config("se_resnext_152", SE_ResNeXt, config={"units": [3, 8, 36, 3]})
+
+registry.register_model_config("se_resnext_101_32x8", SE_ResNeXt, config={"units": [3, 4, 23, 3], "base_width": 8})
+registry.register_model_config("se_resnext_101_64x4", SE_ResNeXt, config={"units": [3, 4, 23, 3], "groups": 64})
birder/net/simple_vit.py
CHANGED
@@ -79,7 +79,7 @@ class Simple_ViT(PreTrainEncoder, MaskedTokenOmissionMixin):
             dim=hidden_dim,
             num_special_tokens=self.num_special_tokens,
         )
-        self.pos_embedding = nn.
+        self.pos_embedding = nn.Buffer(pos_embedding)
 
         self.encoder = Encoder(num_layers, num_heads, hidden_dim, mlp_dim, dropout=0.0, attention_dropout=0.0, dpr=dpr)
         self.norm = nn.LayerNorm(hidden_dim, eps=1e-6)
@@ -203,7 +203,7 @@ class Simple_ViT(PreTrainEncoder, MaskedTokenOmissionMixin):
             dim=self.hidden_dim,
             num_special_tokens=self.num_special_tokens,
         )
-        self.pos_embedding = nn.
+        self.pos_embedding = nn.Buffer(pos_embedding)
 
     def set_causal_attention(self, is_causal: bool = True) -> None:
         self.encoder.set_causal_attention(is_causal)
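Both hunks assign the positional embedding through nn.Buffer. For context: assigning an nn.Buffer to a module attribute registers the tensor as a buffer, the declarative equivalent of register_buffer(), so it is stored in the state_dict and moves with the module but receives no gradient. A minimal sketch, not from the diff:

import torch
from torch import nn


class Toy(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # Declarative equivalent of: self.register_buffer("pos", torch.zeros(1, 4, 8))
        self.pos = nn.Buffer(torch.zeros(1, 4, 8))


net = Toy()
print("pos" in net.state_dict())  # True: buffers are saved and loaded with the module
print(len(list(net.parameters())))  # 0: a buffer is not a parameter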
birder/net/vit.py
CHANGED
@@ -1588,21 +1588,6 @@ registry.register_weights(
         "net": {"network": "vit_l16", "tag": "mim"},
     },
 )
-registry.register_weights(
-    "vit_l16_mim-eu-common",
-    {
-        "url": "https://huggingface.co/birder-project/vit_l16_mim-eu-common/resolve/main",
-        "description": "ViT l16 model with MIM pretraining, then fine-tuned on the eu-common dataset",
-        "resolution": (256, 256),
-        "formats": {
-            "pt": {
-                "file_size": 1160.1,
-                "sha256": "3b7235b90f76fb1e0e36d4c4111777a4cc4e4500552fe840c51170b208310d16",
-            },
-        },
-        "net": {"network": "vit_l16", "tag": "mim-eu-common"},
-    },
-)
 registry.register_weights(  # BioCLIP v2: https://arxiv.org/abs/2505.23883
     "vit_l14_pn_bioclip-v2",
     {