birder-0.2.1-py3-none-any.whl → birder-0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. birder/adversarial/__init__.py +13 -0
  2. birder/adversarial/base.py +101 -0
  3. birder/adversarial/deepfool.py +173 -0
  4. birder/adversarial/fgsm.py +51 -18
  5. birder/adversarial/pgd.py +79 -28
  6. birder/adversarial/simba.py +172 -0
  7. birder/common/training_cli.py +11 -3
  8. birder/common/training_utils.py +18 -1
  9. birder/inference/data_parallel.py +1 -2
  10. birder/introspection/__init__.py +10 -6
  11. birder/introspection/attention_rollout.py +122 -54
  12. birder/introspection/base.py +73 -29
  13. birder/introspection/gradcam.py +71 -100
  14. birder/introspection/guided_backprop.py +146 -72
  15. birder/introspection/transformer_attribution.py +182 -0
  16. birder/net/detection/deformable_detr.py +14 -12
  17. birder/net/detection/detr.py +7 -3
  18. birder/net/detection/rt_detr_v1.py +3 -3
  19. birder/net/detection/yolo_v3.py +6 -11
  20. birder/net/detection/yolo_v4.py +7 -18
  21. birder/net/detection/yolo_v4_tiny.py +3 -3
  22. birder/net/fastvit.py +1 -1
  23. birder/net/mim/mae_vit.py +7 -8
  24. birder/net/pit.py +1 -1
  25. birder/net/resnet_v1.py +94 -34
  26. birder/net/ssl/data2vec.py +1 -1
  27. birder/net/ssl/data2vec2.py +4 -2
  28. birder/results/gui.py +15 -2
  29. birder/scripts/predict_detection.py +33 -1
  30. birder/scripts/train.py +24 -17
  31. birder/scripts/train_barlow_twins.py +10 -7
  32. birder/scripts/train_byol.py +10 -7
  33. birder/scripts/train_capi.py +12 -9
  34. birder/scripts/train_data2vec.py +10 -7
  35. birder/scripts/train_data2vec2.py +10 -7
  36. birder/scripts/train_detection.py +42 -18
  37. birder/scripts/train_dino_v1.py +10 -7
  38. birder/scripts/train_dino_v2.py +10 -7
  39. birder/scripts/train_dino_v2_dist.py +17 -7
  40. birder/scripts/train_franca.py +10 -7
  41. birder/scripts/train_i_jepa.py +17 -13
  42. birder/scripts/train_ibot.py +10 -7
  43. birder/scripts/train_kd.py +24 -18
  44. birder/scripts/train_mim.py +11 -10
  45. birder/scripts/train_mmcr.py +10 -7
  46. birder/scripts/train_rotnet.py +10 -7
  47. birder/scripts/train_simclr.py +10 -7
  48. birder/scripts/train_vicreg.py +10 -7
  49. birder/tools/__main__.py +6 -2
  50. birder/tools/adversarial.py +147 -96
  51. birder/tools/auto_anchors.py +361 -0
  52. birder/tools/ensemble_model.py +1 -1
  53. birder/tools/introspection.py +58 -31
  54. birder/version.py +1 -1
  55. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/METADATA +2 -1
  56. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/RECORD +60 -55
  57. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/WHEEL +0 -0
  58. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/entry_points.txt +0 -0
  59. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/licenses/LICENSE +0 -0
  60. {birder-0.2.1.dist-info → birder-0.2.2.dist-info}/top_level.txt +0 -0
birder/introspection/gradcam.py
@@ -1,5 +1,9 @@
 """
-Adapted from https://github.com/jacobgil/pytorch-grad-cam
+Gradient-weighted Class Activation Mapping (Grad-CAM), adapted from
+https://github.com/jacobgil/pytorch-grad-cam
+
+Paper "Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization",
+https://arxiv.org/abs/1610.02391
 """
 
 # Reference license: MIT
@@ -16,71 +20,51 @@ from torch import nn
 from torch.utils.hooks import RemovableHandle
 
 from birder.introspection.base import InterpretabilityResult
-from birder.introspection.base import Interpreter
+from birder.introspection.base import predict_class
+from birder.introspection.base import preprocess_image
+from birder.introspection.base import scale_cam_image
 from birder.introspection.base import show_mask_on_image
+from birder.introspection.base import validate_target_class
 
 
-def _scale_cam_image(
-    cam: npt.NDArray[np.float32], target_size: Optional[tuple[int, int]] = None
-) -> npt.NDArray[np.float32]:
-    result = []
-    for img in cam:
-        img = img - np.min(img)
-        img = img / (1e-7 + np.max(img))
-        if target_size is not None:
-            img = np.array(Image.fromarray(img).resize(target_size))
-
-        result.append(img)
-
-    return np.array(result, dtype=np.float32)
-
-
-class ClassifierOutputTarget:
-    def __init__(self, category: int) -> None:
-        self.category = category
+def compute_cam(activations: npt.NDArray[np.float32], gradients: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
+    weights: npt.NDArray[np.float32] = np.mean(gradients, axis=(2, 3))
+    weighted_activations = weights[:, :, None, None] * activations
+    cam: npt.NDArray[np.float32] = weighted_activations.sum(axis=1)
+    cam = np.maximum(cam, 0)
 
-    def __call__(self, model_output: torch.Tensor) -> torch.Tensor:
-        if len(model_output.shape) == 1:
-            return model_output[self.category]
+    return cam
 
-        return model_output[:, self.category]
-
-
-class ActivationsAndGradients:
-    """
-    Class for extracting activations and
-    registering gradients from targeted intermediate layers
-    """
 
+class ActivationCapture:
     def __init__(
         self,
         model: nn.Module,
         target_layer: nn.Module,
-        reshape_transform: Optional[Callable[[torch.Tensor], torch.Tensor]],
+        reshape_transform: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
     ) -> None:
         self.model = model
-        self.gradients: torch.Tensor
-        self.activations: torch.Tensor
+        self.target_layer = target_layer
         self.reshape_transform = reshape_transform
+
+        self.activations: Optional[torch.Tensor] = None
+        self.gradients: Optional[torch.Tensor] = None
         self.handles: list[RemovableHandle] = []
 
-        self.handles.append(target_layer.register_forward_hook(self.save_activation))
-        # Because of https://github.com/pytorch/pytorch/issues/61519,
-        # we don't use backward hook to record gradients.
-        self.handles.append(target_layer.register_forward_hook(self.save_gradient))
+        # Register hooks
+        self.handles.append(target_layer.register_forward_hook(self._save_activation))
+        self.handles.append(target_layer.register_forward_hook(self._save_gradient))
 
-    def save_activation(self, _module: nn.Module, _input: torch.Tensor, output: torch.Tensor) -> None:
+    def _save_activation(self, _module: nn.Module, _input: torch.Tensor, output: torch.Tensor) -> None:
         if self.reshape_transform is not None:
             output = self.reshape_transform(output)
 
         self.activations = output.cpu().detach()
 
-    def save_gradient(self, _module: nn.Module, _input: torch.Tensor, output: torch.Tensor) -> None:
+    def _save_gradient(self, _module: nn.Module, _input: torch.Tensor, output: torch.Tensor) -> None:
         if hasattr(output, "requires_grad") is False or output.requires_grad is False:
-            # You can only register hooks on tensor requires grad.
             return
 
-        # Gradients are computed in reverse order
         def _store_grad(grad: torch.Tensor) -> None:
             if self.reshape_transform is not None:
                 grad = self.reshape_transform(grad)
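
As an aside (annotation, not part of the diff): the new compute_cam helper is the textbook Grad-CAM computation. Per-channel weights are the spatially averaged gradients of the target score with respect to the captured activations, the activation maps are summed with those weights, and a ReLU keeps only the positive evidence:

\alpha_k^{c} = \frac{1}{Z} \sum_{i} \sum_{j} \frac{\partial y^{c}}{\partial A_{ij}^{k}},
\qquad
L^{c}_{\text{Grad-CAM}} = \mathrm{ReLU}\Big( \sum_{k} \alpha_k^{c} A^{k} \Big)
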
@@ -100,77 +84,64 @@ class ActivationsAndGradients:
 class GradCAM:
     def __init__(
         self,
-        model: nn.Module,
+        net: nn.Module,
+        device: torch.device,
+        transform: Callable[..., torch.Tensor],
         target_layer: nn.Module,
         reshape_transform: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
     ) -> None:
-        self.model = model.eval()
+        self.net = net.eval()
+        self.device = device
+        self.transform = transform
         self.target_layer = target_layer
-        self.activations_and_grads = ActivationsAndGradients(self.model, target_layer, reshape_transform)
-
-    def get_cam_image(
-        self, activations: npt.NDArray[np.float32], grads: npt.NDArray[np.float32]
-    ) -> npt.NDArray[np.float32]:
-        weights: npt.NDArray[np.float32] = np.mean(grads, axis=(2, 3))
-        weighted_activations = weights[:, :, None, None] * activations
-        cam: npt.NDArray[np.float32] = weighted_activations.sum(axis=1)
-
-        return cam
-
-    def compute_layer_cam(self, input_tensor: torch.Tensor) -> npt.NDArray[np.float32]:
-        target_size = (input_tensor.size(-1), input_tensor.size(-2))
 
-        layer_activations = self.activations_and_grads.activations.numpy()
-        layer_grads = self.activations_and_grads.gradients.numpy()
+        self.activation_capture = ActivationCapture(net, target_layer, reshape_transform)
 
-        cam = self.get_cam_image(layer_activations, layer_grads)
-        cam = np.maximum(cam, 0)
-        scaled = _scale_cam_image(cam, target_size)
-        return scaled[:, None, :]
+    def __call__(self, image: str | Path | Image.Image, target_class: Optional[int] = None) -> InterpretabilityResult:
+        (input_tensor, rgb_img) = preprocess_image(image, self.transform, self.device)
+        input_tensor.requires_grad_(True)
 
-    def __call__(
-        self, input_tensor: torch.Tensor, target: Optional[ClassifierOutputTarget] = None
-    ) -> npt.NDArray[np.float32]:
-        output = self.activations_and_grads(input_tensor)
-        if target is None:
-            category = np.argmax(output.cpu().data.numpy(), axis=-1)
-            target = ClassifierOutputTarget(category)
+        # Forward pass
+        logits = self.activation_capture(input_tensor)
 
-        self.model.zero_grad()
-        loss = target(output)
-        loss.backward(retain_graph=True)
-
-        cam_per_layer = self.compute_layer_cam(input_tensor)
-        cam_per_layer = np.mean(cam_per_layer, axis=1)
-        cam_per_layer = _scale_cam_image(cam_per_layer)
-
-        return cam_per_layer
+        # Determine target class
+        if target_class is None:
+            target_class = predict_class(logits)
+        else:
+            validate_target_class(target_class, logits.shape[-1])
 
-    def __del__(self) -> None:
-        self.activations_and_grads.release()
+        # Backward pass
+        self.net.zero_grad()
+        loss = logits[0, target_class]
+        loss.backward(retain_graph=False)
 
+        # Get captured activations and gradients
+        if self.activation_capture.activations is None:
+            raise RuntimeError("No activations captured")
 
-class GradCamInterpreter(Interpreter):
-    def __init__(
-        self,
-        model: nn.Module,
-        device: torch.device,
-        transform: Callable[..., torch.Tensor],
-        target_layer: nn.Module,
-        reshape_transform: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
-    ) -> None:
-        super().__init__(model, device, transform)
-        self.grad_cam = GradCAM(model, target_layer, reshape_transform=reshape_transform)
+        if self.activation_capture.gradients is None:
+            raise RuntimeError("No gradients captured")
 
-    def interpret(self, image: str | Path | Image.Image, target_class: Optional[int] = None) -> InterpretabilityResult:
-        (input_tensor, rgb_img) = self._preprocess_image(image)
+        activations = self.activation_capture.activations.numpy()
+        gradients = self.activation_capture.gradients.numpy()
 
-        if target_class is not None:
-            target = ClassifierOutputTarget(target_class)
-        else:
-            target = None
+        # Compute CAM
+        cam = compute_cam(activations, gradients)
+        target_size = (input_tensor.size(-1), input_tensor.size(-2))
+        cam_scaled = scale_cam_image(cam, target_size)
+        grayscale_cam = cam_scaled[0]
 
-        grayscale_cam = self.grad_cam(input_tensor, target=target)[0, :]
+        # Create visualization
         visualization = show_mask_on_image(rgb_img, grayscale_cam)
 
-        return InterpretabilityResult(rgb_img, visualization, raw_output=grayscale_cam)
+        return InterpretabilityResult(
+            original_image=rgb_img,
+            visualization=visualization,
+            raw_output=grayscale_cam,
+            logits=logits.detach(),
+            predicted_class=target_class,
+        )
+
+    def __del__(self) -> None:
+        if hasattr(self, "activation_capture") is True:
+            self.activation_capture.release()
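
A minimal usage sketch of the refactored GradCAM entry point (not taken from the package docs). The tiny nn.Sequential classifier, the PIL-to-tensor transform, and "bird.jpeg" are placeholders, and the assumption that the transform maps a PIL image to a CHW float tensor (with preprocess_image handling batching and device placement) is mine; only the constructor and __call__ signatures come from the diff above.

import numpy as np
import torch
from PIL import Image
from torch import nn

from birder.introspection.gradcam import GradCAM

# Placeholder classifier - substitute a real Birder network and its own preprocessing
net = nn.Sequential(
    nn.Conv2d(3, 8, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(8, 10),
)

def transform(img: Image.Image) -> torch.Tensor:
    # Assumed contract: PIL image in, CHW float tensor out
    arr = np.asarray(img.convert("RGB").resize((224, 224)), dtype=np.float32) / 255.0
    return torch.from_numpy(arr).permute(2, 0, 1)  # HWC -> CHW

grad_cam = GradCAM(net, torch.device("cpu"), transform, target_layer=net[0])
result = grad_cam("bird.jpeg")                # target_class=None -> explain the predicted class
heatmap_overlay = result.visualization        # blended mask from show_mask_on_image
print(result.predicted_class, result.raw_output.shape)
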
birder/introspection/guided_backprop.py
@@ -1,47 +1,29 @@
 """
-Adapted from https://github.com/jacobgil/pytorch-grad-cam
+Guided Backpropagation, adapted from
+https://github.com/jacobgil/pytorch-grad-cam
 
 Paper "Striving for Simplicity: The All Convolutional Net", https://arxiv.org/abs/1412.6806
 """
 
 # Reference license: MIT
 
+import math
+from collections.abc import Callable
 from pathlib import Path
 from typing import Any
 from typing import Optional
 
-import numpy as np
-import numpy.typing as npt
 import torch
+import torch.nn.functional as F
 from PIL import Image
 from torch import nn
 from torch.autograd import Function
 
 from birder.introspection.base import InterpretabilityResult
-from birder.introspection.base import Interpreter
-
-
-def _deprocess_image(img: npt.NDArray[np.float32]) -> npt.NDArray[np.uint8]:
-    """
-    See https://github.com/jacobgil/keras-grad-cam/blob/master/grad-cam.py#L65
-    """
-
-    img = img - np.mean(img)
-    img = img / (np.std(img) + 1e-5)
-    img = img * 0.1
-    img = img + 0.5
-    img = np.clip(img, 0, 1)
-
-    return np.array(img * 255).astype(np.uint8)
-
-
-# pylint: disable=protected-access
-def _replace_all_layer_type_recursive(model: nn.Module, old_layer_type: nn.Module, new_layer: nn.Module) -> None:
-    for name, layer in model._modules.items():
-        if isinstance(layer, old_layer_type):
-            model._modules[name] = new_layer
-
-        _replace_all_layer_type_recursive(layer, old_layer_type, new_layer)
+from birder.introspection.base import deprocess_image
+from birder.introspection.base import predict_class
+from birder.introspection.base import preprocess_image
+from birder.introspection.base import validate_target_class
 
 
 # pylint: disable=abstract-method,arguments-differ
@@ -57,7 +39,6 @@ class GuidedBackpropReLU(Function):
     @staticmethod
     def backward(ctx: Any, grad_output: torch.Tensor) -> torch.Tensor:
         (input_img, _output) = ctx.saved_tensors
-        grad_input = None
 
         positive_mask_1 = (input_img > 0).type_as(grad_output)
         positive_mask_2 = (grad_output > 0).type_as(grad_output)
@@ -71,7 +52,7 @@ class GuidedBackpropReLU(Function):
 
 
 # pylint: disable=abstract-method,arguments-differ
-class GuidedBackpropSwish(Function):
+class GuidedBackpropSiLU(Function):
     @staticmethod
     def forward(ctx: Any, input_img: torch.Tensor) -> torch.Tensor:
         result = input_img * torch.sigmoid(input_img)
@@ -90,66 +71,159 @@ class GuidedBackpropSwish(Function):
         return grad_input
 
 
-class GuidedBackpropReLUAsModule(nn.Module):
-    def forward(self, input_img: torch.Tensor) -> Any:
-        return GuidedBackpropReLU.apply(input_img)
+# pylint: disable=abstract-method,arguments-differ
+class GuidedBackpropGELU(Function):
+    @staticmethod
+    def forward(ctx: Any, input_img: torch.Tensor) -> torch.Tensor:
+        result = F.gelu(input_img, approximate="none")  # pylint:disable=not-callable
+        ctx.save_for_backward(input_img)
+        return result
 
+    @staticmethod
+    def backward(ctx: Any, grad_output: torch.Tensor) -> torch.Tensor:
+        x = ctx.saved_tensors[0]
 
-class GuidedBackpropSwishAsModule(nn.Module):
-    def forward(self, input_img: torch.Tensor) -> Any:
-        return GuidedBackpropSwish.apply(input_img)
+        sqrt_2 = math.sqrt(2.0)
+        sqrt_2pi = math.sqrt(2.0 * math.pi)
 
+        cdf = 0.5 * (1.0 + torch.erf(x / sqrt_2))
+        pdf = torch.exp(-0.5 * x * x) / sqrt_2pi
 
-class GuidedBackpropGeLUAsModule(nn.Module):
-    def forward(self, input_img: torch.Tensor) -> Any:
-        return GuidedBackpropSwish.apply(input_img)
+        d_gelu = cdf + x * pdf
 
+        positive_mask_1 = (x > 0).type_as(grad_output)
+        positive_mask_2 = (grad_output > 0).type_as(grad_output)
 
-class GuidedBackpropHardswishAsModule(nn.Module):
-    def forward(self, input_img: torch.Tensor) -> Any:
-        return GuidedBackpropSwish.apply(input_img)
+        grad_input = grad_output * d_gelu * positive_mask_1 * positive_mask_2
 
+        return grad_input
 
-class GuidedBackpropModel:
-    def __init__(self, model: nn.Module) -> None:
-        self.model = model
-        self.model.eval()
 
-    def forward(self, input_img: torch.Tensor) -> torch.Tensor:
-        return self.model(input_img)
+# pylint: disable=abstract-method,arguments-differ
+class GuidedBackpropHardswish(Function):
+    @staticmethod
+    def forward(ctx: Any, input_img: torch.Tensor) -> torch.Tensor:
+        result = F.hardswish(input_img)
+        ctx.save_for_backward(input_img)
+        return result
+
+    @staticmethod
+    def backward(ctx: Any, grad_output: torch.Tensor) -> torch.Tensor:
+        x = ctx.saved_tensors[0]
+
+        grad = torch.zeros_like(x)
 
-    def __call__(self, input_img: torch.Tensor, target_category: Optional[int] = None) -> npt.NDArray[np.float32]:
-        _replace_all_layer_type_recursive(self.model, nn.ReLU, GuidedBackpropReLUAsModule())
-        _replace_all_layer_type_recursive(self.model, nn.GELU, GuidedBackpropGeLUAsModule())
-        _replace_all_layer_type_recursive(self.model, nn.SiLU, GuidedBackpropSwishAsModule())
-        _replace_all_layer_type_recursive(self.model, nn.Hardswish, GuidedBackpropHardswishAsModule())
+        mask_mid = (x > -3) & (x < 3)
+        grad[mask_mid] = (2.0 * x[mask_mid] + 3.0) / 6.0
 
-        input_img = input_img.requires_grad_(True)
-        output = self.forward(input_img)
+        mask_high = x >= 3
+        grad[mask_high] = 1.0
 
-        if target_category is None:
-            target_category = np.argmax(output.cpu().data.numpy()).item()
+        positive_mask_1 = (x > 0).type_as(grad_output)
+        positive_mask_2 = (grad_output > 0).type_as(grad_output)
 
-        loss = output[0, target_category]
-        loss.backward(retain_graph=True)
+        grad_input = grad_output * grad * positive_mask_1 * positive_mask_2
+
+        return grad_input
 
-        output_grad = input_img.grad.cpu().data.numpy()
-        output_grad = output_grad[0, :, :, :]
-        output_grad = output_grad.transpose((1, 2, 0))
 
-        _replace_all_layer_type_recursive(self.model, GuidedBackpropHardswishAsModule, nn.Hardswish())
-        _replace_all_layer_type_recursive(self.model, GuidedBackpropSwishAsModule, nn.SiLU())
-        _replace_all_layer_type_recursive(self.model, GuidedBackpropGeLUAsModule, nn.GELU())
-        _replace_all_layer_type_recursive(self.model, GuidedBackpropReLUAsModule, nn.ReLU())
+class GuidedReLU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return GuidedBackpropReLU.apply(x)
 
-        return output_grad  # type: ignore[no-any-return]
 
+class GuidedSiLU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return GuidedBackpropSiLU.apply(x)
 
-class GuidedBackpropInterpreter(Interpreter):
-    def interpret(self, image: str | Path | Image.Image, target_class: Optional[int] = None) -> InterpretabilityResult:
-        (input_tensor, rgb_img) = self._preprocess_image(image)
 
-        guided_bp = GuidedBackpropModel(self.model)
-        bp_img = guided_bp(input_tensor, target_category=target_class)
+class GuidedGELU(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return GuidedBackpropGELU.apply(x)
 
-        return InterpretabilityResult(rgb_img, _deprocess_image(bp_img * rgb_img), raw_output=bp_img)
+
+class GuidedHardswish(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return GuidedBackpropHardswish.apply(x)
+
+
+# Activation replacement mapping
+ACTIVATION_REPLACEMENTS: dict[type, type] = {
+    nn.ReLU: GuidedReLU,
+    nn.SiLU: GuidedSiLU,
+    nn.GELU: GuidedGELU,
+    nn.Hardswish: GuidedHardswish,
+}
+
+
+def replace_activations_recursive(model: nn.Module, replacements: dict[type, type]) -> None:
+    """
+    NOTE: This ONLY works for activations defined as nn.Module objects (e.g., self.act = nn.ReLU()).
+    It will NOT affect functional calls inside forward methods, such as F.relu(x) or F.gelu(x).
+    """
+
+    for name, module in list(model._modules.items()):  # pylint: disable=protected-access
+        for old_type, new_type in replacements.items():
+            if isinstance(module, old_type):
+                model._modules[name] = new_type()  # pylint: disable=protected-access
+                break
+        else:
+            # Recurse into submodules
+            replace_activations_recursive(module, replacements)
+
+
+def restore_activations_recursive(model: nn.Module, guided_types: dict[type, type]) -> None:
+    reverse_mapping = {v: k for k, v in guided_types.items()}
+    for name, module in list(model._modules.items()):  # pylint: disable=protected-access
+        for guided_type, original_type in reverse_mapping.items():
+            if isinstance(module, guided_type):
+                model._modules[name] = original_type()  # pylint: disable=protected-access
+                break
+        else:
+            restore_activations_recursive(module, guided_types)
+
+
+class GuidedBackprop:
+    def __init__(self, net: nn.Module, device: torch.device, transform: Callable[..., torch.Tensor]) -> None:
+        self.net = net.eval()
+        self.device = device
+        self.transform = transform
+
+    def __call__(self, image: str | Path | Image.Image, target_class: Optional[int] = None) -> InterpretabilityResult:
+        (input_tensor, rgb_img) = preprocess_image(image, self.transform, self.device)
+
+        # Get prediction
+        with torch.inference_mode():
+            logits = self.net(input_tensor)
+
+        if target_class is None:
+            target_class = predict_class(logits)
+        else:
+            validate_target_class(target_class, logits.shape[-1])
+
+        # Replace activations with guided versions
+        replace_activations_recursive(self.net, ACTIVATION_REPLACEMENTS)
+
+        try:
+            input_tensor = input_tensor.detach().requires_grad_(True)
+            output = self.net(input_tensor)
+
+            loss = output[0, target_class]
+            loss.backward(retain_graph=False)
+
+            gradients = input_tensor.grad.cpu().numpy()
+            gradients = gradients[0, :, :, :]  # Remove batch dim
+            gradients = gradients.transpose((1, 2, 0))  # CHW -> HWC
+
+        finally:
+            restore_activations_recursive(self.net, ACTIVATION_REPLACEMENTS)
+
+        visualization = deprocess_image(gradients * rgb_img)
+
+        return InterpretabilityResult(
+            original_image=rgb_img,
+            visualization=visualization,
+            raw_output=gradients,
+            logits=logits.detach(),
+            predicted_class=target_class,
+        )
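
The new GuidedBackprop class follows the same calling convention as GradCAM. A minimal sketch (again with a placeholder network, transform, and image path of my own, and the same assumed PIL-to-CHW transform contract; only the constructor and __call__ signatures come from this diff):

import numpy as np
import torch
from PIL import Image
from torch import nn

from birder.introspection.guided_backprop import GuidedBackprop

# Placeholder classifier; its nn.ReLU module is swapped for GuidedReLU during the call and restored afterwards
net = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 10))

def transform(img: Image.Image) -> torch.Tensor:
    arr = np.asarray(img.convert("RGB").resize((224, 224)), dtype=np.float32) / 255.0
    return torch.from_numpy(arr).permute(2, 0, 1)  # HWC -> CHW

guided_bp = GuidedBackprop(net, torch.device("cpu"), transform)
result = guided_bp("bird.jpeg")
saliency = result.raw_output          # HWC input-gradient map
overlay = result.visualization        # deprocess_image(gradients * rgb_img), per the diff above
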
birder/introspection/transformer_attribution.py (new file)
@@ -0,0 +1,182 @@
+"""
+Transformer Attribution (Gradient-weighted Attention Rollout), adapted from
+https://github.com/hila-chefer/Transformer-Explainability
+
+Paper "Transformer Interpretability Beyond Attention Visualization", https://arxiv.org/abs/2012.09838
+"""
+
+# Reference license: MIT
+
+from collections.abc import Callable
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+from PIL import Image
+from torch import nn
+
+from birder.introspection.base import InterpretabilityResult
+from birder.introspection.base import predict_class
+from birder.introspection.base import preprocess_image
+from birder.introspection.base import show_mask_on_image
+from birder.introspection.base import validate_target_class
+from birder.net.vit import Encoder
+
+
+def compute_attribution_rollout(
+    attributions: list[tuple[torch.Tensor, torch.Tensor]], num_special_tokens: int, patch_grid_shape: tuple[int, int]
+) -> torch.Tensor:
+    """
+    NOTE: Uses gradient norm per token instead of element-wise grad * attention multiplication.
+    """
+
+    # Assume batch size = 1
+    num_tokens = attributions[0][0].size(-1)
+    device = attributions[0][0].device
+
+    result = torch.eye(num_tokens, device=device)
+    with torch.no_grad():
+        for attn_weights, output_grad in attributions:
+            # Compute token importance from output gradient norm across embedding dimension
+            token_importance = output_grad.norm(dim=-1, keepdim=True)
+            token_importance = token_importance.transpose(-1, -2)
+
+            # Weight attention patterns by token importance
+            weighted_attn = attn_weights * token_importance.unsqueeze(1)
+
+            # Fuse attention heads and apply non-negativity constraint
+            relevance = weighted_attn.mean(dim=1).clamp(min=0)
+
+            # Add residual connection and normalize
+            eye = torch.eye(num_tokens, device=device)
+            normalized = (relevance + eye) / 2.0
+            normalized = normalized / normalized.sum(dim=-1, keepdim=True)
+
+            # Accumulate attention across layers
+            result = torch.matmul(normalized, result)
+
+    rollout = result[0]
+
+    if 0 < num_special_tokens:
+        source_to_patches = rollout[:num_special_tokens, num_special_tokens:]
+        mask = source_to_patches.mean(dim=0)
+    else:
+        mask = rollout.mean(dim=0)
+
+    mask = mask / (mask.max() + 1e-8)
+
+    (grid_h, grid_w) = patch_grid_shape
+    mask = mask.reshape(grid_h, grid_w)
+
+    return mask
+
+
+class AttributionGatherer:
+    def __init__(self, net: nn.Module, attention_layer_name: str) -> None:
+        assert hasattr(net, "encoder") is True and isinstance(net.encoder, Encoder)
+
+        net.encoder.set_need_attn()
+
+        self.net = net
+        self.handles: list[torch.utils.hooks.RemovableHandle] = []
+        self._gradients: list[torch.Tensor] = []
+        self._attention_weights: list[torch.Tensor] = []
+
+        for name, module in self.net.named_modules():
+            if name.endswith(attention_layer_name) is True:
+                handle = module.register_forward_hook(self._capture_forward)
+                self.handles.append(handle)
+
+    def _capture_forward(
+        self, _module: nn.Module, _inputs: tuple[torch.Tensor, ...], output: tuple[torch.Tensor, ...] | torch.Tensor
+    ) -> None:
+        output_tensor = output[0]
+        attn_weights = output[1]
+
+        self._attention_weights.append(attn_weights.detach())
+        if output_tensor.requires_grad:
+
+            def _store_grad(grad: torch.Tensor) -> None:
+                self._gradients.append(grad.detach())
+
+            output_tensor.register_hook(_store_grad)
+
+    def get_captured_data(self) -> list[tuple[torch.Tensor, torch.Tensor]]:
+        if len(self._attention_weights) != len(self._gradients):
+            raise RuntimeError(
+                f"Mismatch between attention weights ({len(self._attention_weights)}) "
+                f"and gradients ({len(self._gradients)}). Ensure backward() was called."
+            )
+
+        if len(self._attention_weights) == 0:
+            raise RuntimeError("No attention data captured. Ensure the model has attention layers.")
+
+        # Pair attention weights with output gradients (gradients reversed to match forward order)
+        results = [(attn.cpu(), grad.cpu()) for attn, grad in zip(self._attention_weights, reversed(self._gradients))]
+
+        # Clear storage for next forward pass
+        self._gradients = []
+        self._attention_weights = []
+
+        return results
+
+    def release(self) -> None:
+        for handle in self.handles:
+            handle.remove()
+
+
+class TransformerAttribution:
+    def __init__(
+        self,
+        net: nn.Module,
+        device: torch.device,
+        transform: Callable[..., torch.Tensor],
+        attention_layer_name: str = "self_attention",
+    ) -> None:
+        self.net = net.eval()
+        self.device = device
+        self.transform = transform
+        self.gatherer = AttributionGatherer(net, attention_layer_name)
+
+    def __call__(self, image: str | Path | Image.Image, target_class: Optional[int] = None) -> InterpretabilityResult:
+        (input_tensor, rgb_img) = preprocess_image(image, self.transform, self.device)
+        input_tensor.requires_grad_(True)
+
+        self.net.zero_grad()
+        logits = self.net(input_tensor)
+
+        if target_class is None:
+            target_class = predict_class(logits)
+        else:
+            validate_target_class(target_class, logits.shape[-1])
+
+        score = logits[0, target_class]
+        score.backward()
+
+        attribution_data = self.gatherer.get_captured_data()
+
+        (_, _, H, W) = input_tensor.shape
+        patch_grid_shape = (H // self.net.stem_stride, W // self.net.stem_stride)
+
+        attribution_map = compute_attribution_rollout(
+            attribution_data, num_special_tokens=self.net.num_special_tokens, patch_grid_shape=patch_grid_shape
+        )
+
+        attribution_img = Image.fromarray(attribution_map.numpy())
+        attribution_img = attribution_img.resize((rgb_img.shape[1], rgb_img.shape[0]))
+        attribution_arr = np.array(attribution_img)
+
+        visualization = show_mask_on_image(rgb_img, attribution_arr, image_weight=0.4)
+
+        return InterpretabilityResult(
+            original_image=rgb_img,
+            visualization=visualization,
+            raw_output=attribution_arr,
+            logits=logits.detach(),
+            predicted_class=target_class,
+        )
+
+    def __del__(self) -> None:
+        if hasattr(self, "gatherer") is True:
+            self.gatherer.release()
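
Written out as math (an annotation, not part of the file): compute_attribution_rollout starts from R = I and, for each attention layer l in forward order with per-head attention weights A^(l) (h heads) and the gradient of the target logit y_c with respect to that block's token outputs z^(l), applies

g_j^{(l)} = \big\lVert \partial y_c / \partial z_j^{(l)} \big\rVert_2,
\qquad
\bar{A}^{(l)} = \Big[ \tfrac{1}{h} \sum_{\text{heads}} A^{(l)} \, \mathrm{diag}\big(g^{(l)}\big) \Big]_{+},
\qquad
R \leftarrow \mathrm{rownorm}\!\Big( \tfrac{\bar{A}^{(l)} + I}{2} \Big) \, R

The final mask averages the special-token rows of R over the patch columns, max-normalizes, and reshapes to the patch grid; as the docstring notes, per-token gradient norms stand in for the element-wise grad ⊙ attention product of the original method.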