birder 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
- birder/adversarial/deepfool.py +2 -0
- birder/adversarial/simba.py +2 -0
- birder/common/masking.py +13 -4
- birder/inference/classification.py +1 -1
- birder/introspection/__init__.py +2 -0
- birder/introspection/base.py +0 -7
- birder/introspection/feature_pca.py +101 -0
- birder/kernels/soft_nms/soft_nms.cpp +5 -2
- birder/model_registry/model_registry.py +3 -2
- birder/net/convnext_v1.py +20 -0
- birder/net/fastvit.py +0 -1
- birder/net/flexivit.py +5 -0
- birder/net/focalnet.py +0 -1
- birder/net/hiera.py +3 -3
- birder/net/hieradet.py +116 -28
- birder/net/rope_flexivit.py +7 -0
- birder/net/rope_vit.py +49 -4
- birder/net/smt.py +0 -1
- birder/net/ssl/ibot.py +0 -1
- birder/net/vit.py +166 -2
- birder/scripts/train.py +24 -21
- birder/scripts/train_barlow_twins.py +4 -3
- birder/scripts/train_byol.py +4 -3
- birder/scripts/train_capi.py +6 -5
- birder/scripts/train_data2vec.py +4 -3
- birder/scripts/train_data2vec2.py +4 -3
- birder/scripts/train_detection.py +7 -5
- birder/scripts/train_dino_v1.py +5 -4
- birder/scripts/train_dino_v2.py +69 -20
- birder/scripts/train_dino_v2_dist.py +70 -21
- birder/scripts/train_franca.py +8 -7
- birder/scripts/train_i_jepa.py +4 -3
- birder/scripts/train_ibot.py +5 -4
- birder/scripts/train_kd.py +25 -24
- birder/scripts/train_mim.py +4 -3
- birder/scripts/train_mmcr.py +4 -3
- birder/scripts/train_rotnet.py +5 -4
- birder/scripts/train_simclr.py +4 -3
- birder/scripts/train_vicreg.py +4 -3
- birder/tools/avg_model.py +24 -8
- birder/tools/introspection.py +35 -9
- birder/tools/show_iterator.py +17 -3
- birder/version.py +1 -1
- {birder-0.3.1.dist-info → birder-0.3.3.dist-info}/METADATA +1 -1
- {birder-0.3.1.dist-info → birder-0.3.3.dist-info}/RECORD +49 -48
- {birder-0.3.1.dist-info → birder-0.3.3.dist-info}/WHEEL +0 -0
- {birder-0.3.1.dist-info → birder-0.3.3.dist-info}/entry_points.txt +0 -0
- {birder-0.3.1.dist-info → birder-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {birder-0.3.1.dist-info → birder-0.3.3.dist-info}/top_level.txt +0 -0
birder/adversarial/deepfool.py
CHANGED
birder/adversarial/simba.py
CHANGED
birder/common/masking.py
CHANGED
@@ -84,8 +84,8 @@ def mask_tensor(
 
     (B, H, W, _) = x.size()
 
-    shaped_mask = mask.reshape(
-    shaped_mask = shaped_mask.repeat_interleave(patch_factor,
+    shaped_mask = mask.reshape(B, H // patch_factor, W // patch_factor)
+    shaped_mask = shaped_mask.repeat_interleave(patch_factor, dim=1).repeat_interleave(patch_factor, dim=2)
     shaped_mask = shaped_mask.unsqueeze(3).type_as(x)
 
     if mask_token is not None:
@@ -228,14 +228,23 @@ class Masking:
 
 
 class UniformMasking(Masking):
-    def __init__(
+    def __init__(
+        self,
+        input_size: tuple[int, int],
+        mask_ratio: float,
+        min_mask_size: int = 1,
+        device: Optional[torch.device] = None,
+    ) -> None:
         self.h = input_size[0]
         self.w = input_size[1]
         self.mask_ratio = mask_ratio
+        self.min_mask_size = min_mask_size
         self.device = device
 
     def __call__(self, batch_size: int) -> torch.Tensor:
-        return uniform_mask(
+        return uniform_mask(
+            batch_size, self.h, self.w, self.mask_ratio, min_mask_size=self.min_mask_size, device=self.device
+        )[0]
 
 
 class BlockMasking(Masking):
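The corrected `mask_tensor` lines upsample a patch-level mask to pixel resolution by repeating entries along both spatial axes. A minimal standalone sketch of the same idea in plain PyTorch (shapes are illustrative, not birder's defaults):

import torch

# One image, a 4x4 grid of patches, 1 = masked
mask = torch.randint(0, 2, (1, 16))
patch_factor = 4  # each patch covers a 4x4 pixel block

shaped = mask.reshape(1, 4, 4)
# dim=1 repeats rows, dim=2 repeats columns -> (1, 16, 16) pixel-level mask
shaped = shaped.repeat_interleave(patch_factor, dim=1).repeat_interleave(patch_factor, dim=2)
assert shaped.shape == (1, 16, 16)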
birder/inference/classification.py
CHANGED

@@ -85,7 +85,7 @@ def infer_batch(
             logits = net(t(tta_input), **kwargs)
             outs.append(logits if return_logits is True else F.softmax(logits, dim=1))
 
-        out = torch.stack(outs).mean(
+        out = torch.stack(outs).mean(dim=0)
 
     else:
         logits = net(inputs, **kwargs)
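`torch.stack(outs)` yields a tensor of shape (num_tta, batch, num_classes), so the average over test-time-augmentation passes has to be taken along dim=0 to keep per-sample outputs; a quick sketch:

import torch

outs = [torch.softmax(torch.randn(8, 10), dim=1) for _ in range(3)]  # 3 TTA passes
out = torch.stack(outs).mean(dim=0)  # average over the TTA axis
assert out.shape == (8, 10)  # (batch, num_classes) preserved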
birder/introspection/__init__.py
CHANGED
@@ -1,5 +1,6 @@
 from birder.introspection.attention_rollout import AttentionRollout
 from birder.introspection.base import InterpretabilityResult
+from birder.introspection.feature_pca import FeaturePCA
 from birder.introspection.gradcam import GradCAM
 from birder.introspection.guided_backprop import GuidedBackprop
 from birder.introspection.transformer_attribution import TransformerAttribution
@@ -7,6 +8,7 @@ from birder.introspection.transformer_attribution import TransformerAttribution
 __all__ = [
     "InterpretabilityResult",
     "AttentionRollout",
+    "FeaturePCA",
     "GradCAM",
     "GuidedBackprop",
     "TransformerAttribution",
birder/introspection/base.py
CHANGED
@@ -2,7 +2,6 @@ from collections.abc import Callable
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
-from typing import Protocol
 
 import matplotlib
 import matplotlib.pyplot as plt
@@ -27,12 +26,6 @@ class InterpretabilityResult:
         plt.show()
 
 
-class Interpreter(Protocol):
-    def __call__(
-        self, image: str | Path | Image.Image, target_class: Optional[int] = None
-    ) -> InterpretabilityResult: ...
-
-
 def load_image(image: str | Path | Image.Image) -> Image.Image:
     if isinstance(image, (str, Path)):
         return Image.open(image)
birder/introspection/feature_pca.py
ADDED

@@ -0,0 +1,101 @@
+from collections.abc import Callable
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+from PIL import Image
+from sklearn.decomposition import PCA
+
+from birder.introspection.base import InterpretabilityResult
+from birder.introspection.base import preprocess_image
+from birder.net.base import DetectorBackbone
+
+
+class FeaturePCA:
+    """
+    Visualizes feature maps using Principal Component Analysis
+
+    This method extracts feature maps from a specified stage of a DetectorBackbone model,
+    applies PCA to reduce the channel dimension to 3 components, and visualizes them as an RGB image where:
+    - R channel = 1st principal component (most important)
+    - G channel = 2nd principal component
+    - B channel = 3rd principal component
+    """
+
+    def __init__(
+        self,
+        net: DetectorBackbone,
+        device: torch.device,
+        transform: Callable[..., torch.Tensor],
+        normalize: bool = False,
+        channels_last: bool = False,
+        stage: Optional[str] = None,
+    ) -> None:
+        self.net = net.eval()
+        self.device = device
+        self.transform = transform
+        self.normalize = normalize
+        self.channels_last = channels_last
+        self.stage = stage
+
+    def __call__(self, image: str | Path | Image.Image) -> InterpretabilityResult:
+        (input_tensor, rgb_img) = preprocess_image(image, self.transform, self.device)
+
+        with torch.inference_mode():
+            features_dict = self.net.detection_features(input_tensor)
+
+        if self.stage is not None:
+            features = features_dict[self.stage]
+        else:
+            features = list(features_dict.values())[-1]  # Use the last stage by default
+
+        features_np = features.cpu().numpy()
+
+        # Handle channels_last format (B, H, W, C) vs channels_first (B, C, H, W)
+        if self.channels_last is True:
+            (B, H, W, C) = features_np.shape
+            # Already in (B, H, W, C), just reshape to (B*H*W, C)
+            features_reshaped = features_np.reshape(-1, C)
+        else:
+            (B, C, H, W) = features_np.shape
+            # Reshape to (spatial_points, channels) for PCA
+            features_reshaped = features_np.reshape(B, C, -1)
+            features_reshaped = features_reshaped.transpose(0, 2, 1)  # (B, H*W, C)
+            features_reshaped = features_reshaped.reshape(-1, C)  # (B*H*W, C)
+
+        x = features_reshaped
+        if self.normalize is True:
+            x = x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-6)
+
+        pca = PCA(n_components=3)
+        pca_features = pca.fit_transform(x)
+        pca_features = pca_features.reshape(B, H, W, 3)
+
+        # Extract all 3 components (B=1)
+        pca_rgb = pca_features[0]  # (H, W, 3)
+
+        # Normalize each channel independently to [0, 1]
+        for i in range(3):
+            channel = pca_rgb[:, :, i]
+            channel = channel - channel.min()
+            channel = channel / (channel.max() + 1e-7)
+            pca_rgb[:, :, i] = channel
+
+        target_size = (input_tensor.size(-1), input_tensor.size(-2))  # PIL expects (width, height)
+        pca_rgb_resized = (
+            np.array(
+                Image.fromarray((pca_rgb * 255).astype(np.uint8)).resize(target_size, Image.Resampling.BILINEAR)
+            ).astype(np.float32)
+            / 255.0
+        )
+
+        visualization = (pca_rgb_resized * 255).astype(np.uint8)
+
+        return InterpretabilityResult(
+            original_image=rgb_img,
+            visualization=visualization,
+            raw_output=pca_rgb.astype(np.float32),
+            logits=None,
+            predicted_class=None,
+        )
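Going by the constructor and `__call__` signatures above, usage should look roughly like the sketch below. The backbone and transform setup are placeholders, not birder's exact loading API, and the `show()` call assumes the matplotlib helper visible in `base.py`:

import torch

from birder.introspection import FeaturePCA

net = ...        # placeholder: any DetectorBackbone instance
transform = ...  # placeholder: the matching eval preprocessing
device = torch.device("cpu")

feature_pca = FeaturePCA(net, device, transform, normalize=True, stage=None)
result = feature_pca("example.jpeg")  # InterpretabilityResult
# result.visualization holds the PCA-as-RGB uint8 image; result.show() should display it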
birder/kernels/soft_nms/soft_nms.cpp
CHANGED

@@ -4,6 +4,9 @@
  * Taken from:
  * https://github.com/MrParosk/soft_nms
  * Licensed under the MIT License
+ *
+ * Modified by:
+ * Ofer Hasson — 2026-01-10
 **************************************************************************************************
 */
 
@@ -40,8 +43,8 @@ torch::Tensor calculate_iou(const torch::Tensor& boxes, const torch::Tensor& are
     auto xx2 = torch::minimum(boxes.index({idx, 2}), boxes.index({Slice(idx + 1, None), 2}));
     auto yy2 = torch::minimum(boxes.index({idx, 3}), boxes.index({Slice(idx + 1, None), 3}));
 
-    auto w =
-    auto h =
+    auto w = (xx2 - xx1).clamp_min(0);
+    auto h = (yy2 - yy1).clamp_min(0);
 
     auto intersection = w * h;
     auto union_ = areas.index({idx}) + areas.index({Slice(idx + 1, None)}) - intersection;
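The restored lines clamp negative widths and heights to zero, so disjoint boxes contribute zero intersection rather than a spurious positive product of two negatives. The same computation in PyTorch terms, as a sketch mirroring the C++ above (not the kernel itself):

import torch

def iou_against_rest(box: torch.Tensor, boxes: torch.Tensor) -> torch.Tensor:
    # box: (4,) in xyxy format; boxes: (N, 4) in xyxy format
    xx1 = torch.maximum(box[0], boxes[:, 0])
    yy1 = torch.maximum(box[1], boxes[:, 1])
    xx2 = torch.minimum(box[2], boxes[:, 2])
    yy2 = torch.minimum(box[3], boxes[:, 3])
    w = (xx2 - xx1).clamp_min(0)  # disjoint boxes -> 0, never negative
    h = (yy2 - yy1).clamp_min(0)
    intersection = w * h
    area = (box[2] - box[0]) * (box[3] - box[1])
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    return intersection / (area + areas - intersection)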
birder/model_registry/model_registry.py
CHANGED

@@ -87,14 +87,15 @@ class ModelRegistry:
         no further registration is needed.
         """
 
+        alias_key = alias.lower()
         if net_type.auto_register is False:
             # Register the model manually, as the base class doesn't take care of that for us
-
+            self.register_model(alias_key, type(alias, (net_type,), {"config": config}))
 
         if alias in self.aliases:
             warnings.warn(f"Alias {alias} is already registered", UserWarning)
 
-        self.aliases[
+        self.aliases[alias_key] = type(alias, (net_type,), {"config": config})
 
     def register_weights(self, name: str, weights_info: manifest.ModelMetadataType) -> None:
         if name in self._pretrained_nets:
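The fix stores every alias under a lowercased key before registration, so the stored mapping no longer depends on the caller's casing. The pattern reduced to its essence (illustrative, not the full ModelRegistry):

class AliasRegistry:
    def __init__(self) -> None:
        self.aliases: dict[str, type] = {}

    def register_alias(self, alias: str, cls: type) -> None:
        alias_key = alias.lower()  # canonical key, independent of caller casing
        self.aliases[alias_key] = cls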
birder/net/convnext_v1.py
CHANGED
@@ -195,6 +195,21 @@ class ConvNeXt_v1(DetectorBackbone, PreTrainEncoder, MaskedTokenRetentionMixin):
         return self.features(x)
 
 
+registry.register_model_config(
+    "convnext_v1_atto",  # Not in the original v1, taken from v2
+    ConvNeXt_v1,
+    config={"in_channels": [40, 80, 160, 320], "num_layers": [2, 2, 6, 2], "drop_path_rate": 0.0},
+)
+registry.register_model_config(
+    "convnext_v1_femto",  # Not in the original v1, taken from v2
+    ConvNeXt_v1,
+    config={"in_channels": [48, 96, 192, 384], "num_layers": [2, 2, 6, 2], "drop_path_rate": 0.0},
+)
+registry.register_model_config(
+    "convnext_v1_pico",  # Not in the original v1, taken from v2
+    ConvNeXt_v1,
+    config={"in_channels": [64, 128, 256, 512], "num_layers": [2, 2, 6, 2], "drop_path_rate": 0.0},
+)
 registry.register_model_config(
     "convnext_v1_nano",  # Not in the original v1, taken from v2
     ConvNeXt_v1,
@@ -220,6 +235,11 @@ registry.register_model_config(
     ConvNeXt_v1,
     config={"in_channels": [192, 384, 768, 1536], "num_layers": [3, 3, 27, 3], "drop_path_rate": 0.5},
 )
+registry.register_model_config(
+    "convnext_v1_huge",  # Not in the original v1, taken from v2
+    ConvNeXt_v1,
+    config={"in_channels": [352, 704, 1408, 2816], "num_layers": [3, 3, 27, 3], "drop_path_rate": 0.5},
+)
 
 registry.register_weights(
     "convnext_v1_tiny_eu-common256px",
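All of the added sizes keep the standard ConvNeXt ladder, with each stage doubling the previous stage's width; a quick check against the configs above:

for name, channels in {
    "atto": [40, 80, 160, 320],
    "femto": [48, 96, 192, 384],
    "pico": [64, 128, 256, 512],
    "huge": [352, 704, 1408, 2816],
}.items():
    assert all(b == 2 * a for a, b in zip(channels, channels[1:])), name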
birder/net/fastvit.py
CHANGED
birder/net/flexivit.py
CHANGED
@@ -98,6 +98,8 @@ class FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Mask
         layer_scale_init_value: Optional[float] = self.config.get("layer_scale_init_value", None)
         pre_norm: bool = self.config.get("pre_norm", False)
         post_norm: bool = self.config.get("post_norm", True)
+        qkv_bias: bool = self.config.get("qkv_bias", True)
+        qk_norm: bool = self.config.get("qk_norm", False)
         num_reg_tokens: int = self.config.get("num_reg_tokens", 0)
         class_token: bool = self.config.get("class_token", True)
         attn_pool_head: bool = self.config.get("attn_pool_head", False)
@@ -186,6 +188,8 @@ class FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Mask
             attention_dropout,
             dpr,
             pre_norm=pre_norm,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm,
             activation_layer=act_layer,
             layer_scale_init_value=layer_scale_init_value,
             norm_layer=norm_layer,
@@ -224,6 +228,7 @@ class FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, Mask
             drop_path=0,
             activation_layer=act_layer,
             norm_layer=norm_layer,
+            norm_layer_eps=norm_layer_eps,
             mlp_layer=mlp_layer,
         )
 
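The two new config keys surface standard ViT attention options: `qkv_bias` toggles the bias on the fused QKV projection, and `qk_norm` normalizes queries and keys per head before the dot product. A minimal sketch of what these switches typically control inside an attention block (illustrative, not birder's encoder code):

import torch
import torch.nn.functional as F
from torch import nn

class QKNormAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int, qkv_bias: bool = True, qk_norm: bool = False) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)  # qkv_bias toggles this bias
        self.q_norm = nn.LayerNorm(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = nn.LayerNorm(self.head_dim) if qk_norm else nn.Identity()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        (B, N, C) = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        (q, k, v) = qkv.unbind(0)
        (q, k) = (self.q_norm(q), self.k_norm(k))  # qk_norm: normalize before the dot product
        out = F.scaled_dot_product_attention(q, k, v)
        return self.proj(out.transpose(1, 2).reshape(B, N, C))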
birder/net/focalnet.py
CHANGED
birder/net/hiera.py
CHANGED
@@ -301,14 +301,14 @@ class HieraBlock(nn.Module):
         self.dim = dim
         self.dim_out = dim_out
 
-        self.norm1 = nn.LayerNorm(dim)
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
         if dim != dim_out:
             self.proj = nn.Linear(dim, dim_out)
         else:
             self.proj = None
 
         self.attn = MaskUnitAttention(dim, dim_out, heads, q_stride, window_size, use_mask_unit_attn)
-        self.norm2 = nn.LayerNorm(dim_out)
+        self.norm2 = nn.LayerNorm(dim_out, eps=1e-6)
         self.mlp = MLP(dim_out, [int(dim_out * mlp_ratio), dim_out], activation_layer=nn.GELU)
         self.drop_path = StochasticDepth(drop_path, mode="row")
 
@@ -450,7 +450,7 @@ class Hiera(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin):
         self.body = nn.Sequential(stages)
         self.features = nn.Sequential(
             attn_pool if attn_pool is not None else AvgTokens(),
-            nn.LayerNorm(embed_dim),
+            nn.LayerNorm(embed_dim, eps=1e-6),
             nn.Flatten(1),
         )
         self.return_channels = return_channels
birder/net/hieradet.py
CHANGED
@@ -125,7 +125,7 @@ class MultiScaleBlock(nn.Module):
         self.dim = dim
         self.dim_out = dim_out
 
-        self.norm1 = nn.LayerNorm(dim)
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
         if dim != dim_out:
             self.proj = nn.Linear(dim, dim_out)
         else:
@@ -144,7 +144,7 @@ class MultiScaleBlock(nn.Module):
             num_heads=num_heads,
             q_pool=copy.deepcopy(self.pool),
         )
-        self.norm2 = nn.LayerNorm(dim_out)
+        self.norm2 = nn.LayerNorm(dim_out, eps=1e-6)
         self.mlp = MLP(dim_out, [int(dim_out * mlp_ratio), dim_out], activation_layer=nn.GELU)
         self.drop_path = StochasticDepth(drop_path, mode="row")
 
@@ -173,11 +173,9 @@ class MultiScaleBlock(nn.Module):
         if self.q_stride is not None:
             # Shapes have changed due to Q pooling
             window_size = self.window_size // self.q_stride[0]
-
+            pad_hw = (pad_hw[0] // self.q_stride[0], pad_hw[1] // self.q_stride[1])
 
-
-            pad_w = (window_size - W % window_size) % window_size
-            pad_hw = (H + pad_h, W + pad_w)
+            (H, W) = (shortcut.size(1), shortcut.size(2))
 
         # Reverse window partition
         if self.window_size > 0:
@@ -271,7 +269,7 @@ class HieraDet(DetectorBackbone, PreTrainEncoder, MaskedTokenRetentionMixin):
 
         self.body = nn.Sequential(stages)
         self.features = nn.Sequential(
-            nn.LayerNorm(embed_dim),
+            nn.LayerNorm(embed_dim, eps=1e-6),
             Permute([0, 3, 1, 2]),  # B H W C -> B C H W
             nn.AdaptiveAvgPool2d(output_size=(1, 1)),
             nn.Flatten(1),
@@ -415,7 +413,7 @@ registry.register_model_config(
         "num_heads": 1,
         "global_pos_size": (7, 7),
         "global_att_blocks": [5, 7, 9],
-        "window_spec": [8, 4,
+        "window_spec": [8, 4, 14, 7],
         "drop_path_rate": 0.1,
     },
 )
@@ -428,7 +426,7 @@ registry.register_model_config(
         "num_heads": 1,
         "global_pos_size": (7, 7),
         "global_att_blocks": [7, 10, 13],
-        "window_spec": [8, 4,
+        "window_spec": [8, 4, 14, 7],
         "drop_path_rate": 0.1,
     },
 )
@@ -441,7 +439,7 @@ registry.register_model_config(
         "num_heads": 1,
         "global_pos_size": (14, 14),
         "global_att_blocks": [12, 16, 20],
-        "window_spec": [8, 4,
+        "window_spec": [8, 4, 14, 7],
         "drop_path_rate": 0.1,
     },
 )
@@ -454,7 +452,7 @@ registry.register_model_config(
         "num_heads": 2,
         "global_pos_size": (14, 14),
         "global_att_blocks": [12, 16, 20],
-        "window_spec": [8, 4,
+        "window_spec": [8, 4, 14, 7],
         "drop_path_rate": 0.1,
     },
 )
@@ -467,17 +465,84 @@ registry.register_model_config(
         "num_heads": 2,
         "global_pos_size": (7, 7),
         "global_att_blocks": [23, 33, 43],
-        "window_spec": [8, 4,
+        "window_spec": [8, 4, 14, 7],
+        "drop_path_rate": 0.2,
+    },
+)
+
+# Dynamic window size
+registry.register_model_config(
+    "hieradet_d_tiny",
+    HieraDet,
+    config={
+        "depths": [1, 2, 7, 2],
+        "embed_dim": 96,
+        "num_heads": 1,
+        "global_pos_size": (7, 7),
+        "global_att_blocks": [5, 7, 9],
+        "window_spec": [8, 4, 0, 0],
+        "drop_path_rate": 0.1,
+    },
+)
+registry.register_model_config(
+    "hieradet_d_small",
+    HieraDet,
+    config={
+        "depths": [1, 2, 11, 2],
+        "embed_dim": 96,
+        "num_heads": 1,
+        "global_pos_size": (7, 7),
+        "global_att_blocks": [7, 10, 13],
+        "window_spec": [8, 4, 0, 0],
+        "drop_path_rate": 0.1,
+    },
+)
+registry.register_model_config(
+    "hieradet_d_base",
+    HieraDet,
+    config={
+        "depths": [2, 3, 16, 3],
+        "embed_dim": 96,
+        "num_heads": 1,
+        "global_pos_size": (14, 14),
+        "global_att_blocks": [12, 16, 20],
+        "window_spec": [8, 4, 0, 0],
+        "drop_path_rate": 0.1,
+    },
+)
+registry.register_model_config(
+    "hieradet_d_base_plus",
+    HieraDet,
+    config={
+        "depths": [2, 3, 16, 3],
+        "embed_dim": 112,
+        "num_heads": 2,
+        "global_pos_size": (14, 14),
+        "global_att_blocks": [12, 16, 20],
+        "window_spec": [8, 4, 0, 0],
+        "drop_path_rate": 0.1,
+    },
+)
+registry.register_model_config(
+    "hieradet_d_large",
+    HieraDet,
+    config={
+        "depths": [2, 6, 36, 4],
+        "embed_dim": 144,
+        "num_heads": 2,
+        "global_pos_size": (7, 7),
+        "global_att_blocks": [23, 33, 43],
+        "window_spec": [8, 4, 0, 0],
         "drop_path_rate": 0.2,
     },
 )
 
 registry.register_weights(
-    "
+    "hieradet_d_small_dino-v2",
     {
-        "url": "https://huggingface.co/birder-project/
+        "url": "https://huggingface.co/birder-project/hieradet_d_small_dino-v2/resolve/main",
         "description": (
-            "HieraDet small image encoder pre-trained using DINOv2. "
+            "HieraDet (d) small image encoder pre-trained using DINOv2. "
             "This model has not been fine-tuned for a specific classification task"
         ),
         "resolution": (224, 224),
@@ -487,14 +552,16 @@ registry.register_weights(
                 "sha256": "eb41b8a35445e7f350797094d5e365306b29351e64edd4a316420c23d1e17073",
             }
         },
-        "net": {"network": "
+        "net": {"network": "hieradet_d_small", "tag": "dino-v2"},
     },
 )
 registry.register_weights(
-    "
+    "hieradet_d_small_dino-v2-inat21-256px",
     {
-        "url": "https://huggingface.co/birder-project/
-        "description":
+        "url": "https://huggingface.co/birder-project/hieradet_d_small_dino-v2-inat21/resolve/main",
+        "description": (
+            "HieraDet (d) small model pre-trained using DINOv2, then fine-tuned on the iNaturalist 2021 dataset"
+        ),
         "resolution": (256, 256),
         "formats": {
             "pt": {
@@ -502,14 +569,16 @@ registry.register_weights(
                 "sha256": "e1bdeba97eae816ec3ab9b3238d97decf2c34d29b70f9291116ce962b9a4f9df",
             }
         },
-        "net": {"network": "
+        "net": {"network": "hieradet_d_small", "tag": "dino-v2-inat21-256px"},
     },
 )
 registry.register_weights(
-    "
+    "hieradet_d_small_dino-v2-inat21",
     {
-        "url": "https://huggingface.co/birder-project/
-        "description":
+        "url": "https://huggingface.co/birder-project/hieradet_d_small_dino-v2-inat21/resolve/main",
+        "description": (
+            "HieraDet (d) small model pre-trained using DINOv2, then fine-tuned on the iNaturalist 2021 dataset"
+        ),
         "resolution": (384, 384),
         "formats": {
             "pt": {
@@ -517,14 +586,14 @@ registry.register_weights(
                 "sha256": "271fa9ed6a9aa1f4d1fc8bbb4c4cac9d15b264f2ac544efb5cd971412691880d",
             }
         },
-        "net": {"network": "
+        "net": {"network": "hieradet_d_small", "tag": "dino-v2-inat21"},
     },
 )
 registry.register_weights(
-    "
+    "hieradet_d_small_dino-v2-imagenet12k",
     {
-        "url": "https://huggingface.co/birder-project/
-        "description": "HieraDet small model pre-trained using DINOv2, then fine-tuned on the ImageNet-12K dataset",
+        "url": "https://huggingface.co/birder-project/hieradet_d_small_dino-v2-imagenet12k/resolve/main",
+        "description": "HieraDet (d) small model pre-trained using DINOv2, then fine-tuned on the ImageNet-12K dataset",
         "resolution": (256, 256),
         "formats": {
             "pt": {
@@ -532,6 +601,25 @@ registry.register_weights(
                 "sha256": "b89dd6c13d061fe8a09d051bb3d76e632e650067ca71578e37b02033107c9963",
             }
         },
-        "net": {"network": "
+        "net": {"network": "hieradet_d_small", "tag": "dino-v2-imagenet12k"},
+    },
+)
+
+registry.register_weights(  # SAM v2: https://arxiv.org/abs/2408.00714
+    "hieradet_small_sam2_1",
+    {
+        "url": "https://huggingface.co/birder-project/hieradet_small_sam2_1/resolve/main",
+        "description": (
+            "HieraDet small image encoder pre-trained by Meta AI using SAM v2. "
+            "This model has not been fine-tuned for a specific classification task"
+        ),
+        "resolution": (224, 224),
+        "formats": {
+            "pt": {
+                "file_size": 129.6,
+                "sha256": "79b6ffdfd4ea9f3b1489ce5a229fe9756b215fc3b52640d01d64136560c1d341",
+            }
+        },
+        "net": {"network": "hieradet_small", "tag": "sam2_1"},
     },
 )
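The rewritten `MultiScaleBlock` lines derive the post-pooling padded size by dividing `pad_hw` directly by the Q-pooling stride and re-read (H, W) from the pooled shortcut, instead of recomputing padding from stale values. With concrete numbers, the invariant being maintained (an illustrative check, not the module code):

# Before pooling: window_size divides the padded grid
q_stride = (2, 2)
window_size = 8
pad_hw = (64, 64)

# After Q pooling, both shrink by the same stride, so divisibility is preserved
window_size = window_size // q_stride[0]                       # 4
pad_hw = (pad_hw[0] // q_stride[0], pad_hw[1] // q_stride[1])  # (32, 32)
assert pad_hw[0] % window_size == 0 and pad_hw[1] % window_size == 0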
birder/net/rope_flexivit.py
CHANGED
@@ -69,6 +69,8 @@ class RoPE_FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin,
         layer_scale_init_value: Optional[float] = self.config.get("layer_scale_init_value", None)
         pre_norm: bool = self.config.get("pre_norm", False)
         post_norm: bool = self.config.get("post_norm", True)
+        qkv_bias: bool = self.config.get("qkv_bias", True)
+        qk_norm: bool = self.config.get("qk_norm", False)
         num_reg_tokens: int = self.config.get("num_reg_tokens", 0)
         class_token: bool = self.config.get("class_token", True)
         attn_pool_head: bool = self.config.get("attn_pool_head", False)
@@ -118,6 +120,7 @@ class RoPE_FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin,
         self.num_reg_tokens = num_reg_tokens
         self.attn_pool_special_tokens = attn_pool_special_tokens
         self.norm_layer = norm_layer
+        self.norm_layer_eps = norm_layer_eps
         self.mlp_layer = mlp_layer
         self.act_layer = act_layer
         self.rope_rot_type = rope_rot_type
@@ -190,6 +193,8 @@ class RoPE_FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin,
             attention_dropout,
             dpr,
             pre_norm=pre_norm,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm,
             activation_layer=act_layer,
             layer_scale_init_value=layer_scale_init_value,
             norm_layer=norm_layer,
@@ -231,6 +236,7 @@ class RoPE_FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin,
             rope_temperature=rope_temperature,
             layer_scale_init_value=layer_scale_init_value,
             norm_layer=norm_layer,
+            norm_layer_eps=norm_layer_eps,
             mlp_layer=mlp_layer,
             rope_rot_type=rope_rot_type,
         )
@@ -588,6 +594,7 @@ class RoPE_FlexiViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin,
             rope_temperature=self.rope_temperature,
             layer_scale_init_value=self.layer_scale_init_value,
             norm_layer=self.norm_layer,
+            norm_layer_eps=self.norm_layer_eps,
             mlp_layer=self.mlp_layer,
             rope_rot_type=self.rope_rot_type,
         )