birder 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/adversarial/__init__.py +13 -0
- birder/adversarial/base.py +101 -0
- birder/adversarial/deepfool.py +173 -0
- birder/adversarial/fgsm.py +51 -18
- birder/adversarial/pgd.py +79 -28
- birder/adversarial/simba.py +172 -0
- birder/common/training_cli.py +11 -3
- birder/common/training_utils.py +18 -1
- birder/inference/data_parallel.py +1 -2
- birder/introspection/__init__.py +10 -6
- birder/introspection/attention_rollout.py +122 -54
- birder/introspection/base.py +73 -29
- birder/introspection/gradcam.py +71 -100
- birder/introspection/guided_backprop.py +146 -72
- birder/introspection/transformer_attribution.py +182 -0
- birder/net/detection/deformable_detr.py +14 -12
- birder/net/detection/detr.py +7 -3
- birder/net/detection/rt_detr_v1.py +3 -3
- birder/net/detection/yolo_v3.py +6 -11
- birder/net/detection/yolo_v4.py +7 -18
- birder/net/detection/yolo_v4_tiny.py +3 -3
- birder/net/fastvit.py +1 -1
- birder/net/mim/mae_vit.py +7 -8
- birder/net/pit.py +1 -1
- birder/net/resnet_v1.py +94 -34
- birder/net/ssl/data2vec.py +1 -1
- birder/net/ssl/data2vec2.py +4 -2
- birder/results/gui.py +15 -2
- birder/scripts/predict_detection.py +33 -1
- birder/scripts/train.py +24 -17
- birder/scripts/train_barlow_twins.py +10 -7
- birder/scripts/train_byol.py +10 -7
- birder/scripts/train_capi.py +12 -9
- birder/scripts/train_data2vec.py +10 -7
- birder/scripts/train_data2vec2.py +10 -7
- birder/scripts/train_detection.py +42 -18
- birder/scripts/train_dino_v1.py +10 -7
- birder/scripts/train_dino_v2.py +10 -7
- birder/scripts/train_dino_v2_dist.py +17 -7
- birder/scripts/train_franca.py +10 -7
- birder/scripts/train_i_jepa.py +17 -13
- birder/scripts/train_ibot.py +10 -7
- birder/scripts/train_kd.py +24 -18
- birder/scripts/train_mim.py +11 -10
- birder/scripts/train_mmcr.py +10 -7
- birder/scripts/train_rotnet.py +10 -7
- birder/scripts/train_simclr.py +10 -7
- birder/scripts/train_vicreg.py +10 -7
- birder/tools/__main__.py +6 -2
- birder/tools/adversarial.py +147 -96
- birder/tools/auto_anchors.py +361 -0
- birder/tools/ensemble_model.py +1 -1
- birder/tools/introspection.py +58 -31
- birder/version.py +1 -1
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/METADATA +2 -1
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/RECORD +60 -55
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/WHEEL +0 -0
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/entry_points.txt +0 -0
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/top_level.txt +0 -0
birder/common/training_cli.py
CHANGED
@@ -110,10 +110,13 @@ def add_lr_scheduler_args(parser: argparse.ArgumentParser) -> None:
         type=int,
         default=40,
         metavar="N",
-        help="decrease lr every
+        help="decrease lr every N epochs/steps (relative to after warmup, step scheduler only)",
     )
     group.add_argument(
-        "--lr-steps",
+        "--lr-steps",
+        type=int,
+        nargs="+",
+        help="absolute epoch/step milestones when to decrease lr (multistep scheduler only)",
     )
     group.add_argument(
         "--lr-step-gamma",
@@ -391,7 +394,7 @@ def add_ema_args(
         "--model-ema-warmup",
         type=int,
         metavar="N",
-        help="number of epochs before EMA is applied (defaults to warmup epochs/
+        help="number of epochs/steps before EMA is applied (defaults to warmup epochs/steps, pass 0 to disable warmup)",
     )
 
 
@@ -656,6 +659,11 @@ def common_args_validation(args: argparse.Namespace) -> None:
             f"but it is set to '{args.lr_scheduler_update}'"
         )
 
+    # EMA
+    if hasattr(args, "model_ema_steps") is True:
+        if args.model_ema_steps < 1:
+            raise ValidationError("--model-ema-steps must be >= 1")
+
     # Compile args, argument dependant
     if hasattr(args, "compile_teacher") is True:
        if args.compile is True and args.compile_teacher is True:
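
For readers skimming the CLI change: a minimal, self-contained sketch (not code from the package; the parser and group names are illustrative, and the type of --lr-step-gamma is an assumption) showing how the new --lr-steps definition above parses:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group("lr scheduler")  # illustrative group name
group.add_argument(
    "--lr-steps",
    type=int,
    nargs="+",
    help="absolute epoch/step milestones when to decrease lr (multistep scheduler only)",
)
group.add_argument("--lr-step-gamma", type=float)  # float type assumed here

args = parser.parse_args(["--lr-steps", "30", "60", "90", "--lr-step-gamma", "0.1"])
assert args.lr_steps == [30, 60, 90]  # nargs="+" collects the milestones into a list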
birder/common/training_utils.py
CHANGED
@@ -491,12 +491,29 @@ def get_scheduler(
     if args.lr_scheduler == "constant":
         main_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1.0, total_iters=1)
     elif args.lr_scheduler == "step":
+        # Note: StepLR step_size is relative to when the main scheduler starts (after warmup)
+        # This means drops occur relative to the end of warmup, not at absolute epoch numbers
         main_scheduler = torch.optim.lr_scheduler.StepLR(
             optimizer, step_size=args.lr_step_size, gamma=args.lr_step_gamma
         )
     elif args.lr_scheduler == "multistep":
+        # For MultiStepLR, milestones should be absolute step numbers
+        # Adjust them to be relative to when the main scheduler starts (after warmup)
+        # This ensures drops occur at the specified absolute steps, not relative to after warmup
+        adjusted_milestones = [m - warmup_steps for m in args.lr_steps if m >= warmup_steps]
+        if len(adjusted_milestones) == 0:
+            logger.debug(
+                f"All MultiStepLR milestones {args.lr_steps} are before warmup "
+                f"(warmup ends at step {warmup_steps}). Using empty milestone list."
+            )
+            adjusted_milestones = []
+
+        logger.debug(
+            f"MultiStepLR milestones adjusted from {args.lr_steps} to {adjusted_milestones} "
+            f"(relative to main scheduler start after {warmup_steps} warmup steps)"
+        )
         main_scheduler = torch.optim.lr_scheduler.MultiStepLR(
-            optimizer, milestones=
+            optimizer, milestones=adjusted_milestones, gamma=args.lr_step_gamma
         )
     elif args.lr_scheduler == "cosine":
         main_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
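
The milestone adjustment above is easiest to see with concrete numbers; a small sketch with a toy optimizer and made-up values (warmup_steps plays the same role as in get_scheduler):

import torch

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.SGD(params, lr=1.0)

warmup_steps = 5
lr_steps = [3, 10, 20]  # absolute milestones as passed via --lr-steps

# Same adjustment as in get_scheduler: milestones that fall inside warmup are dropped,
# the rest are shifted so they are relative to where the main scheduler starts.
adjusted_milestones = [m - warmup_steps for m in lr_steps if m >= warmup_steps]
assert adjusted_milestones == [5, 15]

main_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=adjusted_milestones, gamma=0.1
)

When this scheduler is chained after the 5 warmup steps, the learning rate still drops at the absolute steps 10 and 20 that were requested on the command line.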
birder/inference/data_parallel.py
CHANGED
@@ -1,8 +1,7 @@
 """
 Inference-optimized multi-GPU parallelization
 
-This module provides InferenceDataParallel, an inference-specific alternative to
-torch.nn.DataParallel.
+This module provides InferenceDataParallel, an inference-specific alternative to torch.nn.DataParallel.
 """
 
 import copy
birder/introspection/__init__.py
CHANGED
@@ -1,9 +1,13 @@
-from birder.introspection.attention_rollout import
-from birder.introspection.
-from birder.introspection.
+from birder.introspection.attention_rollout import AttentionRollout
+from birder.introspection.base import InterpretabilityResult
+from birder.introspection.gradcam import GradCAM
+from birder.introspection.guided_backprop import GuidedBackprop
+from birder.introspection.transformer_attribution import TransformerAttribution
 
 __all__ = [
-    "
-    "
-    "
+    "InterpretabilityResult",
+    "AttentionRollout",
+    "GradCAM",
+    "GuidedBackprop",
+    "TransformerAttribution",
 ]
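
With the rewritten __init__.py the interpreters become importable directly from the subpackage; a short sketch of the resulting public surface (construction and use of the classes themselves is shown after the attention_rollout.py diff below):

from birder.introspection import AttentionRollout
from birder.introspection import GradCAM
from birder.introspection import GuidedBackprop
from birder.introspection import InterpretabilityResult
from birder.introspection import TransformerAttribution

# The Interpreter protocol added in birder/introspection/base.py describes the
# common call shape:
#     result: InterpretabilityResult = interpreter(image, target_class=None)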
birder/introspection/attention_rollout.py
CHANGED
@@ -1,5 +1,8 @@
 """
-
+Attention Rollout for Vision Transformers, adapted from
+https://github.com/jacobgil/vit-explain/blob/main/vit_rollout.py
+
+Paper "Quantifying Attention Flow in Transformers", https://arxiv.org/abs/2005.00928
 """
 
 # Reference license: MIT
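
For orientation, the recurrence that compute_rollout implements in the hunk below (notation ours, following the paper linked above): with A^(l) the head-fused attention of layer l and I the identity,

\[
\hat{A}^{(l)} = \operatorname{row\_normalize}\!\left(\tfrac{1}{2}\bigl(A^{(l)} + I\bigr)\right),
\qquad
R = \hat{A}^{(L)}\,\hat{A}^{(L-1)}\cdots\hat{A}^{(1)},
\]

and the spatial mask is read off the special-token rows of R, max-normalized, and reshaped to the patch grid.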
@@ -15,103 +18,168 @@ from PIL import Image
 from torch import nn
 
 from birder.introspection.base import InterpretabilityResult
-from birder.introspection.base import
+from birder.introspection.base import predict_class
+from birder.introspection.base import preprocess_image
 from birder.introspection.base import show_mask_on_image
 from birder.net.vit import Encoder
 
 
-
+# pylint: disable=too-many-locals
+def compute_rollout(
     attentions: list[torch.Tensor],
     discard_ratio: float,
     head_fusion: Literal["mean", "max", "min"],
     num_special_tokens: int,
+    patch_grid_shape: tuple[int, int],
 ) -> torch.Tensor:
-
+    # Assume batch size = 1
+    num_tokens = attentions[0].size(-1)
+    device = attentions[0].device
+
+    # Start with identity (residual)
+    result = torch.eye(num_tokens, device=device)
+
     with torch.no_grad():
         for attention in attentions:
+            # Fuse heads: [B, H, T, T] -> [B, T, T]
             if head_fusion == "mean":
-                attention_heads_fused = attention.mean(
+                attention_heads_fused = attention.mean(dim=1)
             elif head_fusion == "max":
-                attention_heads_fused = attention.max(
+                attention_heads_fused = attention.max(dim=1)[0]
             elif head_fusion == "min":
-                attention_heads_fused = attention.min(
+                attention_heads_fused = attention.min(dim=1)[0]
             else:
-                raise ValueError("
-
-                #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                raise ValueError(f"Unsupported head_fusion: {head_fusion}")
+
+            # attention_heads_fused: [1, T, T] (batch = 1)
+            if discard_ratio > 0:
+                # Work on the single batch element
+                attn = attention_heads_fused[0]  # [T, T]
+
+                # Define which positions are "non-special"
+                idx = torch.arange(num_tokens, device=attn.device)
+                is_special = idx < num_special_tokens
+                non_special = ~is_special
+
+                # We are only allowed to prune NON-special <-> NON-special entries
+                allow = non_special[:, None] & non_special[None, :]  # [T, T]
+
+                allowed_values = attn[allow]
+                num_allowed = allowed_values.numel()
+                if num_allowed > 0:
+                    num_to_discard = int(num_allowed * discard_ratio)
+                    if num_to_discard > 0:
+                        # Drop the smallest allowed values
+                        (_, low_idx) = torch.topk(allowed_values, num_to_discard, largest=False)
+                        allowed_values[low_idx] = 0
+                        attn[allow] = allowed_values
+                        attention_heads_fused[0] = attn
+
+            # Add residual connection and normalize
+            eye = torch.eye(num_tokens, device=attention_heads_fused.device)
+            a = (attention_heads_fused + eye) / 2.0  # [1, T, T]
+            a = a / a.sum(dim=-1, keepdim=True)
+
+            # Accumulate attention across layers
+            result = torch.matmul(a, result)  # [1, T, T]
+
+    rollout = result[0]  # [T, T]
+
+    # Build final token → patch map
+    if 0 < num_special_tokens < num_tokens:
+        # Sources: all special tokens (0 .. num_special_tokens-1)
+        # Targets: all non-special tokens (num_special_tokens .. end)
+        source_to_patches = rollout[:num_special_tokens, num_special_tokens:]
+        mask = source_to_patches.mean(dim=0)
+    else:
+        # No special tokens (or all are special): fall back to averaging over all sources
+        mask = rollout.mean(dim=0)  # [T]
+
+    # Normalize and reshape to 2D map using actual patch grid dimensions
+    mask = mask / (mask.max() + 1e-8)
+    (grid_h, grid_w) = patch_grid_shape
+    mask = mask.reshape(grid_h, grid_w)
 
     return mask
 
 
-class
-    def __init__(self, net:
+class AttentionGatherer:
+    def __init__(self, net: nn.Module, attention_layer_name: str) -> None:
         assert hasattr(net, "encoder") is True and isinstance(net.encoder, Encoder)
+
         net.encoder.set_need_attn()
         self.net = net
+        self.attentions: list[torch.Tensor] = []
+        self.handles: list[torch.utils.hooks.RemovableHandle] = []
+
+        # Register hooks on attention layers
         for name, module in self.net.named_modules():
             if name.endswith(attention_layer_name) is True:
-                module.register_forward_hook(self.
+                handle = module.register_forward_hook(self._capture_attention)
+                self.handles.append(handle)
 
-
-
-    def get_attention(
-        self, _module: torch.nn.Module, _inputs: tuple[torch.Tensor, ...], outputs: tuple[torch.Tensor, ...]
+    def _capture_attention(
+        self, _module: nn.Module, _inputs: tuple[torch.Tensor, ...], outputs: tuple[torch.Tensor, ...]
     ) -> None:
         self.attentions.append(outputs[1].cpu())
 
-    def __call__(
-        self, x: torch.Tensor, discard_ratio: float, head_fusion: Literal["mean", "max", "min"]
-    ) -> torch.Tensor:
+    def __call__(self, x: torch.Tensor) -> tuple[list[torch.Tensor], torch.Tensor]:
         self.attentions = []
         with torch.inference_mode():
-            self.net(x)
+            logits = self.net(x)
+
+        return (self.attentions, logits)
 
-
+    def release(self) -> None:
+        for handle in self.handles:
+            handle.remove()
 
 
-class
+class AttentionRollout:
     def __init__(
         self,
-
+        net: nn.Module,
         device: torch.device,
         transform: Callable[..., torch.Tensor],
-        attention_layer_name: str,
-        discard_ratio: float,
-        head_fusion: Literal["mean", "max", "min"],
+        attention_layer_name: str = "self_attention",
+        discard_ratio: float = 0.9,
+        head_fusion: Literal["mean", "max", "min"] = "max",
     ) -> None:
-
-
+        if not 0 <= discard_ratio <= 1:
+            raise ValueError(f"discard_ratio must be in [0, 1], got {discard_ratio}")
+
+        self.net = net.eval()
+        self.device = device
+        self.transform = transform
         self.discard_ratio = discard_ratio
         self.head_fusion = head_fusion
+        self.attention_gatherer = AttentionGatherer(net, attention_layer_name)
 
-    def
-        (input_tensor, rgb_img) = self.
+    def __call__(self, image: str | Path | Image.Image, target_class: Optional[int] = None) -> InterpretabilityResult:
+        (input_tensor, rgb_img) = preprocess_image(image, self.transform, self.device)
 
-
-            input_tensor, discard_ratio=self.discard_ratio, head_fusion=self.head_fusion
-        )
+        (attentions, logits) = self.attention_gatherer(input_tensor)
 
-
+        (_, _, H, W) = input_tensor.shape
+        patch_grid_shape = (H // self.net.stem_stride, W // self.net.stem_stride)
+
+        attention_map = compute_rollout(
+            attentions, self.discard_ratio, self.head_fusion, self.net.num_special_tokens, patch_grid_shape
+        )
         attention_img = Image.fromarray(attention_map.numpy())
-        attention_img = attention_img.resize(rgb_img.shape[
+        attention_img = attention_img.resize((rgb_img.shape[1], rgb_img.shape[0]))
         attention_arr = np.array(attention_img)
+
         visualization = show_mask_on_image(rgb_img, attention_arr, image_weight=0.4)
 
-        return InterpretabilityResult(
+        return InterpretabilityResult(
+            original_image=rgb_img,
+            visualization=visualization,
+            raw_output=attention_arr,
+            logits=logits.detach(),
+            predicted_class=predict_class(logits),
+        )
+
+    def __del__(self) -> None:
+        if hasattr(self, "attention_gatherer") is True:
+            self.attention_gatherer.release()
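
A hedged usage sketch of the reworked AttentionRollout; the helper function below is illustrative and not part of the package, and it assumes `net` is a Birder ViT-style classifier exposing encoder, stem_stride and num_special_tokens (the attributes the class uses above), with `transform` matching the model's inference preprocessing:

from collections.abc import Callable

import torch

from birder.introspection import AttentionRollout

def explain_with_rollout(net: torch.nn.Module, transform: Callable[..., torch.Tensor], image_path: str) -> None:
    # Illustrative helper, not from the package
    device = torch.device("cpu")
    rollout = AttentionRollout(
        net,
        device,
        transform,
        discard_ratio=0.9,  # prune the 90% weakest non-special attention entries per layer
        head_fusion="max",  # how attention heads are fused before rolling out
    )
    result = rollout(image_path)  # InterpretabilityResult
    print(result.predicted_class)
    result.show()  # overlay next to the original image

Since the constructor now carries defaults (attention_layer_name="self_attention", discard_ratio=0.9, head_fusion="max"), callers only need to pass the network, device and inference transform.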
birder/introspection/base.py
CHANGED
@@ -2,6 +2,7 @@ from collections.abc import Callable
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
+from typing import Protocol
 
 import matplotlib
 import matplotlib.pyplot as plt
@@ -9,13 +10,56 @@ import numpy as np
 import numpy.typing as npt
 import torch
 from PIL import Image
-
+
+
+@dataclass(frozen=True)
+class InterpretabilityResult:
+    original_image: npt.NDArray[np.float32]
+    visualization: npt.NDArray[np.float32] | npt.NDArray[np.uint8]
+    raw_output: npt.NDArray[np.float32]
+    logits: Optional[torch.Tensor] = None
+    predicted_class: Optional[int] = None
+
+    def show(self, figsize: tuple[int, int] = (12, 8)) -> None:
+        _, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
+        ax1.imshow(self.visualization)
+        ax2.imshow(self.original_image)
+        plt.show()
+
+
+class Interpreter(Protocol):
+    def __call__(
+        self, image: str | Path | Image.Image, target_class: Optional[int] = None
+    ) -> InterpretabilityResult: ...
+
+
+def load_image(image: str | Path | Image.Image) -> Image.Image:
+    if isinstance(image, (str, Path)):
+        return Image.open(image)
+
+    return image
+
+
+def preprocess_image(
+    image: str | Path | Image.Image, transform: Callable[..., torch.Tensor], device: torch.device
+) -> tuple[torch.Tensor, npt.NDArray[np.float32]]:
+    pil_image = load_image(image)
+    input_tensor = transform(pil_image).unsqueeze(dim=0).to(device)
+
+    # Resize and normalize for visualization
+    resized = pil_image.resize((input_tensor.shape[-1], input_tensor.shape[-2]))
+    rgb_img = np.array(resized).astype(np.float32) / 255.0
+
+    return (input_tensor, rgb_img)
 
 
 def show_mask_on_image(
-    img: npt.NDArray[np.float32],
-
-
+    img: npt.NDArray[np.float32],
+    mask: npt.NDArray[np.float32],
+    image_weight: float = 0.5,
+    colormap: str = "jet",
+) -> npt.NDArray[np.uint8]:
+    color_map = matplotlib.colormaps[colormap]
     heatmap = color_map(mask)[:, :, :3]
 
     cam: npt.NDArray[np.float32] = (1 - image_weight) * heatmap + image_weight * img
@@ -25,36 +69,36 @@ def show_mask_on_image(
     return cam.astype(np.uint8)
 
 
-
-
-
-
-
+def scale_cam_image(
+    cam: npt.NDArray[np.float32], target_size: Optional[tuple[int, int]] = None
+) -> npt.NDArray[np.float32]:
+    result = []
+    for img in cam:
+        img = img - np.min(img)
+        img = img / (1e-7 + np.max(img))
+        if target_size is not None:
+            img = np.array(Image.fromarray(img).resize(target_size))
 
-
-
-
-        ax2.imshow(self.original_image)
-        plt.show()
+        result.append(img)
+
+    return np.array(result, dtype=np.float32)
 
 
-
-
-
-
-
+def deprocess_image(img: npt.NDArray[np.float32]) -> npt.NDArray[np.uint8]:
+    img = img - np.mean(img)
+    img = img / (np.std(img) + 1e-5)
+    img = img * 0.1
+    img = img + 0.5
+    img = np.clip(img, 0, 1)
 
-
-        raise NotImplementedError
+    return np.array(img * 255).astype(np.uint8)
 
-    def _preprocess_image(self, image: str | Path | Image.Image) -> tuple[torch.Tensor, npt.NDArray[np.float32]]:
-        if isinstance(image, (str, Path)):
-            image = Image.open(image)
 
-
-
+def validate_target_class(target_class: Optional[int], num_classes: int) -> None:
+    if target_class is not None:
+        if target_class < 0 or target_class >= num_classes:
+            raise ValueError(f"target_class must be in range [0, {num_classes}), got {target_class}")
 
-        # Store original for visualization
-        rgb_img = np.array(image.resize(input_tensor.shape[-2:])).astype(np.float32) / 255.0
 
-
+def predict_class(logits: torch.Tensor) -> int:
+    return int(torch.argmax(logits, dim=-1).item())