nndbg 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nndbg-0.2.0/.github/workflows/ci.yml +30 -0
- nndbg-0.2.0/.gitignore +62 -0
- nndbg-0.2.0/CHANGELOG.md +78 -0
- nndbg-0.2.0/LICENSE.md +21 -0
- nndbg-0.2.0/PKG-INFO +212 -0
- nndbg-0.2.0/README.md +149 -0
- nndbg-0.2.0/docs/ROADMAP.md +36 -0
- nndbg-0.2.0/nndbg/__init__.py +13 -0
- nndbg-0.2.0/nndbg/_version.py +1 -0
- nndbg-0.2.0/nndbg/analysis/__init__.py +0 -0
- nndbg-0.2.0/nndbg/analysis/attention/__init__.py +4 -0
- nndbg-0.2.0/nndbg/analysis/attention/base.py +117 -0
- nndbg-0.2.0/nndbg/analysis/attention/results.py +88 -0
- nndbg-0.2.0/nndbg/analysis/attribution/__init__.py +4 -0
- nndbg-0.2.0/nndbg/analysis/attribution/base.py +212 -0
- nndbg-0.2.0/nndbg/analysis/attribution/results.py +42 -0
- nndbg-0.2.0/nndbg/analysis/erasure/__init__.py +0 -0
- nndbg-0.2.0/nndbg/analysis/erasure/base.py +124 -0
- nndbg-0.2.0/nndbg/analysis/erasure/results.py +66 -0
- nndbg-0.2.0/nndbg/analysis/geometry/__init__.py +0 -0
- nndbg-0.2.0/nndbg/analysis/geometry/base.py +211 -0
- nndbg-0.2.0/nndbg/analysis/geometry/results.py +98 -0
- nndbg-0.2.0/nndbg/analysis/latent/__init__.py +4 -0
- nndbg-0.2.0/nndbg/analysis/latent/base.py +161 -0
- nndbg-0.2.0/nndbg/analysis/latent/results.py +83 -0
- nndbg-0.2.0/nndbg/analysis/neurons/__init__.py +0 -0
- nndbg-0.2.0/nndbg/analysis/neurons/base.py +116 -0
- nndbg-0.2.0/nndbg/analysis/neurons/results.py +91 -0
- nndbg-0.2.0/nndbg/analysis/patching/__init__.py +4 -0
- nndbg-0.2.0/nndbg/analysis/patching/base.py +221 -0
- nndbg-0.2.0/nndbg/analysis/patching/results.py +80 -0
- nndbg-0.2.0/nndbg/analysis/probing/__init__.py +4 -0
- nndbg-0.2.0/nndbg/analysis/probing/base.py +124 -0
- nndbg-0.2.0/nndbg/analysis/probing/results.py +56 -0
- nndbg-0.2.0/nndbg/analysis/sae/__init__.py +4 -0
- nndbg-0.2.0/nndbg/analysis/sae/base.py +197 -0
- nndbg-0.2.0/nndbg/analysis/sae/results.py +70 -0
- nndbg-0.2.0/nndbg/core/__init__.py +6 -0
- nndbg-0.2.0/nndbg/core/cache.py +59 -0
- nndbg-0.2.0/nndbg/core/collect.py +62 -0
- nndbg-0.2.0/nndbg/core/hooks.py +119 -0
- nndbg-0.2.0/nndbg/core/registry.py +61 -0
- nndbg-0.2.0/nndbg/inspector.py +162 -0
- nndbg-0.2.0/nndbg/utils/__init__.py +5 -0
- nndbg-0.2.0/nndbg/utils/console.py +23 -0
- nndbg-0.2.0/nndbg/utils/device.py +9 -0
- nndbg-0.2.0/nndbg/utils/logging.py +63 -0
- nndbg-0.2.0/nndbg/viz/__init__.py +3 -0
- nndbg-0.2.0/nndbg/viz/plotly_backend.py +62 -0
- nndbg-0.2.0/nndbg/viz/plotting.py +168 -0
- nndbg-0.2.0/nndbg/viz/style.py +34 -0
- nndbg-0.2.0/nndbg/viz/summary.py +43 -0
- nndbg-0.2.0/pyproject.toml +89 -0
- nndbg-0.2.0/tests/conftest.py +69 -0
- nndbg-0.2.0/tests/test_attention.py +50 -0
- nndbg-0.2.0/tests/test_attribution.py +75 -0
- nndbg-0.2.0/tests/test_core_hooks.py +80 -0
- nndbg-0.2.0/tests/test_erasure.py +70 -0
- nndbg-0.2.0/tests/test_geometry.py +112 -0
- nndbg-0.2.0/tests/test_inspector.py +28 -0
- nndbg-0.2.0/tests/test_latent_vae.py +38 -0
- nndbg-0.2.0/tests/test_neurons.py +81 -0
- nndbg-0.2.0/tests/test_patching.py +71 -0
- nndbg-0.2.0/tests/test_probing.py +71 -0
- nndbg-0.2.0/tests/test_sae.py +53 -0
- nndbg-0.2.0/tests/test_viz.py +54 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
19
|
+
uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: ${{ matrix.python-version }}
|
|
22
|
+
|
|
23
|
+
- name: Install nndbg with dev dependencies
|
|
24
|
+
run: pip install -e ".[dev]"
|
|
25
|
+
|
|
26
|
+
- name: Lint
|
|
27
|
+
run: ruff check nndbg tests
|
|
28
|
+
|
|
29
|
+
- name: Test
|
|
30
|
+
run: pytest -q
|
nndbg-0.2.0/.gitignore
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
*.pyd
|
|
6
|
+
.Python
|
|
7
|
+
*.egg
|
|
8
|
+
*.egg-info/
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
eggs/
|
|
12
|
+
parts/
|
|
13
|
+
var/
|
|
14
|
+
sdist/
|
|
15
|
+
wheels/
|
|
16
|
+
.installed.cfg
|
|
17
|
+
*.egg-info
|
|
18
|
+
|
|
19
|
+
# Virtual environments
|
|
20
|
+
.venv/
|
|
21
|
+
venv/
|
|
22
|
+
env/
|
|
23
|
+
ENV/
|
|
24
|
+
|
|
25
|
+
# Poetry
|
|
26
|
+
.poetry/
|
|
27
|
+
|
|
28
|
+
# Jupyter
|
|
29
|
+
.ipynb_checkpoints/
|
|
30
|
+
*.ipynb_checkpoints
|
|
31
|
+
|
|
32
|
+
# Data & Storage
|
|
33
|
+
*.db
|
|
34
|
+
*.duckdb
|
|
35
|
+
*.parquet
|
|
36
|
+
*.arrow
|
|
37
|
+
data/
|
|
38
|
+
runs/
|
|
39
|
+
checkpoints/
|
|
40
|
+
|
|
41
|
+
# Logs
|
|
42
|
+
*.log
|
|
43
|
+
logs/
|
|
44
|
+
|
|
45
|
+
# IDE
|
|
46
|
+
.vscode/
|
|
47
|
+
.idea/
|
|
48
|
+
*.swp
|
|
49
|
+
*.swo
|
|
50
|
+
.DS_Store
|
|
51
|
+
|
|
52
|
+
# Testing
|
|
53
|
+
.pytest_cache/
|
|
54
|
+
.ruff_cache/
|
|
55
|
+
.coverage
|
|
56
|
+
htmlcov/
|
|
57
|
+
.tox/
|
|
58
|
+
|
|
59
|
+
# Outputs
|
|
60
|
+
outputs/
|
|
61
|
+
figures/
|
|
62
|
+
reports/
|
nndbg-0.2.0/CHANGELOG.md
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.2.0 — plane upgrades + three new planes
|
|
4
|
+
|
|
5
|
+
### Fixes / upgrades to existing planes
|
|
6
|
+
|
|
7
|
+
- **Probing** — `inspector.probing.fit(..., method=)` now accepts `"logistic"`
|
|
8
|
+
(default, unchanged), `"svm"` (LinearSVC), or `"mlp"` (1-hidden-layer
|
|
9
|
+
MLPClassifier). `ProbeResult` gains a `method` field shown in `repr()` and
|
|
10
|
+
plot titles.
|
|
11
|
+
- **Attribution** — two new methods: `gradient_x_input()` (sign-aware
|
|
12
|
+
saliency) and `smoothgrad()` (averaged saliency over Gaussian-perturbed
|
|
13
|
+
inputs, Smilkov et al. 2017).
|
|
14
|
+
- **SAE** — `inspector.sae.train(..., activation=)` now accepts `"topk:<k>"`
|
|
15
|
+
(e.g. `"topk:32"`) for exact top-k sparsity control (Gao et al. 2024),
|
|
16
|
+
in addition to the default `"relu"` L1-penalised mode.
|
|
17
|
+
- **Activation patching** — new `inspector.patching.mean_ablation(clean,
|
|
18
|
+
dataset)` method: replaces each (layer, position) with the dataset mean and
|
|
19
|
+
reports the logit drop. `PatchingResult` gains a `method` field; the plot
|
|
20
|
+
switches to a diverging colormap for mean-ablation results.
|
|
21
|
+
|
|
22
|
+
### New planes
|
|
23
|
+
|
|
24
|
+
- **`inspector.geometry`** — `layer_similarity(dataset)` computes pairwise
|
|
25
|
+
linear CKA (Kornblith et al. 2019) between all layers; `compare(other,
|
|
26
|
+
dataset)` cross-compares two model checkpoints on the same data; `pca(dataset,
|
|
27
|
+
layer)` projects a layer's activations to 2-D; `umap(...)` does the same via
|
|
28
|
+
UMAP (requires `pip install umap-learn`).
|
|
29
|
+
- **`inspector.neurons`** — `stats(dataset, layer)` reports per-neuron mean,
|
|
30
|
+
max, std, kurtosis (polysemanticity proxy), dead-neuron mask, and top-k
|
|
31
|
+
activating examples. `top_activating(...)` is a convenience alias.
|
|
32
|
+
- **`inspector.erasure`** — `inlp(dataset, concept=, layer=)` runs Iterative
|
|
33
|
+
Null-space Projection (Ravfogel et al. 2020): repeatedly trains a probe and
|
|
34
|
+
projects out its direction, returning an `ErasureResult` with the accumulated
|
|
35
|
+
projection matrix and accuracy decay curve. `result.apply(activations)`
|
|
36
|
+
projects new activations through the learned erasure.
|
|
37
|
+
|
|
38
|
+
### Testing
|
|
39
|
+
|
|
40
|
+
- 30+ new tests covering all new planes and fixed methods (total: ~90 tests).
|
|
41
|
+
|
|
42
|
+
## 0.1.0 — clean rewrite
|
|
43
|
+
|
|
44
|
+
The previous `nndbg` codebase had a broken core (`nndbg/core/{hooker,registry,store,tracer}.py`
|
|
45
|
+
were empty stub files that the rest of the package imported from) alongside
|
|
46
|
+
an orphaned earlier implementation. This release is a full rewrite from
|
|
47
|
+
scratch, scoped to six analysis planes that are each genuinely implemented,
|
|
48
|
+
tested, and documented, rather than many partial ones.
|
|
49
|
+
|
|
50
|
+
### Added
|
|
51
|
+
|
|
52
|
+
- `Inspector` — single entrypoint wrapping any `nn.Module` / HuggingFace
|
|
53
|
+
model, exposing lazily-initialized analysis planes.
|
|
54
|
+
- Core hooking infrastructure: `LayerRegistry`, `HookManager`
|
|
55
|
+
(forward-hook capture + activation patching), `ActivationCache`,
|
|
56
|
+
`collect_activations`.
|
|
57
|
+
- **Probing** — cross-validated linear probes per layer
|
|
58
|
+
(`inspector.probing.fit`).
|
|
59
|
+
- **Attribution** — native saliency, Integrated Gradients, and Grad-CAM
|
|
60
|
+
via autograd (`inspector.attribution.*`), no captum dependency required.
|
|
61
|
+
- **Attention analysis** — per-head heatmaps, attention rollout, per-head
|
|
62
|
+
entropy (`inspector.attention.*`).
|
|
63
|
+
- **Activation patching / causal tracing** — ROME-style logit-recovery
|
|
64
|
+
heatmaps (`inspector.patching.causal_trace`).
|
|
65
|
+
- **Sparse autoencoders** — train + decompose a layer's activations into
|
|
66
|
+
sparse features (`inspector.sae.*`).
|
|
67
|
+
- **VAE latent analysis** — compressed latent-space visualization and
|
|
68
|
+
reconstruction-error anomaly detection (`inspector.latent.*`).
|
|
69
|
+
- Matplotlib-based plotting (`Result.plot()`) for every plane, with an
|
|
70
|
+
optional interactive Plotly backend (`Result.plotly()`).
|
|
71
|
+
- 54 unit tests against a tiny synthetic MLP and a tiny random-init GPT2
|
|
72
|
+
(no network access required).
|
|
73
|
+
|
|
74
|
+
### Removed
|
|
75
|
+
|
|
76
|
+
- The DuckDB-backed activation store, the Typer CLI, and the `geometry`/
|
|
77
|
+
`neurons`/`erasure` planes from the previous design — see
|
|
78
|
+
[docs/ROADMAP.md](docs/ROADMAP.md) for the v2 plan.
|
nndbg-0.2.0/LICENSE.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Darsh Nandu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
nndbg-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nndbg
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A focused, practical toolkit for diagnosing and interpreting neural networks: probing, attribution, attention, activation patching, sparse autoencoders, VAE latent analysis, CKA geometry, neuron analysis, and concept erasure.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Darsh-Nandu/neural-network-debugger
|
|
6
|
+
Project-URL: Repository, https://github.com/Darsh-Nandu/neural-network-debugger
|
|
7
|
+
Project-URL: Issues, https://github.com/Darsh-Nandu/neural-network-debugger/issues
|
|
8
|
+
Author: Darsh Nandu
|
|
9
|
+
License: # MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2026 Darsh Nandu
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE.md
|
|
31
|
+
Keywords: activation-patching,explainability,interpretability,mechanistic-interpretability,neural-networks,probing,pytorch,sparse-autoencoder,transformers
|
|
32
|
+
Classifier: Development Status :: 3 - Alpha
|
|
33
|
+
Classifier: Intended Audience :: Science/Research
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
39
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
40
|
+
Requires-Python: >=3.10
|
|
41
|
+
Requires-Dist: matplotlib>=3.8
|
|
42
|
+
Requires-Dist: numpy>=1.26
|
|
43
|
+
Requires-Dist: rich>=13.7
|
|
44
|
+
Requires-Dist: scikit-learn>=1.4
|
|
45
|
+
Requires-Dist: torch>=2.1
|
|
46
|
+
Requires-Dist: transformers>=4.40
|
|
47
|
+
Provides-Extra: all
|
|
48
|
+
Requires-Dist: ipykernel; extra == 'all'
|
|
49
|
+
Requires-Dist: jupyter; extra == 'all'
|
|
50
|
+
Requires-Dist: plotly>=5.20; extra == 'all'
|
|
51
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'all'
|
|
52
|
+
Requires-Dist: pytest>=8.0; extra == 'all'
|
|
53
|
+
Requires-Dist: ruff>=0.4; extra == 'all'
|
|
54
|
+
Provides-Extra: dev
|
|
55
|
+
Requires-Dist: ipykernel; extra == 'dev'
|
|
56
|
+
Requires-Dist: jupyter; extra == 'dev'
|
|
57
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
58
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
59
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
60
|
+
Provides-Extra: plotly
|
|
61
|
+
Requires-Dist: plotly>=5.20; extra == 'plotly'
|
|
62
|
+
Description-Content-Type: text/markdown
|
|
63
|
+
|
|
64
|
+
# NNDbg — a diagnostic toolkit for neural networks
|
|
65
|
+
|
|
66
|
+
[](https://github.com/Darsh-Nandu/neural-network-debugger/actions)
|
|
67
|
+
[](https://www.python.org)
|
|
68
|
+
[](LICENSE.md)
|
|
69
|
+
|
|
70
|
+
NNDbg wraps a PyTorch or HuggingFace model in a single `Inspector` and
|
|
71
|
+
answers the questions people actually ask when interpreting a neural
|
|
72
|
+
network: where a concept is encoded, which inputs caused an output, what
|
|
73
|
+
each attention head is doing, which layer causally produces a behaviour,
|
|
74
|
+
what features a layer's activations decompose into, and how similar the
|
|
75
|
+
representations of different layers (or different models) are.
|
|
76
|
+
|
|
77
|
+
## Analysis planes
|
|
78
|
+
|
|
79
|
+
| `inspector.<plane>` | Answers | Method |
|
|
80
|
+
|---|---|---|
|
|
81
|
+
| `probing` | Where is concept X encoded? | cross-validated linear / SVM / MLP probes |
|
|
82
|
+
| `attribution` | Which inputs caused this output? | saliency, gradient×input, SmoothGrad, IG, Grad-CAM |
|
|
83
|
+
| `attention` | What does each head attend to? | per-head heatmaps, rollout, entropy |
|
|
84
|
+
| `patching` | Which layers causally produce a behaviour? | causal tracing, mean ablation |
|
|
85
|
+
| `sae` | What sparse features does a layer learn? | sparse autoencoder (ReLU or top-k) |
|
|
86
|
+
| `latent` | Where do activations sit in a compressed space? | VAE latent space + anomaly detection |
|
|
87
|
+
| `geometry` | How similar are layers to each other (or to another model)? | linear CKA, PCA, UMAP |
|
|
88
|
+
| `neurons` | What is each neuron doing? | dead neurons, top examples, kurtosis |
|
|
89
|
+
| `erasure` | How do I remove a concept from representations? | INLP null-space projection |
|
|
90
|
+
|
|
91
|
+
Every result is a plain dataclass with a `.plot()` method (matplotlib,
|
|
92
|
+
zero-config) and an optional `.plotly()` method if you have `plotly` installed.
|
|
93
|
+
|
|
94
|
+
## Installation
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pip install nndbg
|
|
98
|
+
|
|
99
|
+
# with interactive Plotly figures
|
|
100
|
+
pip install nndbg[plotly]
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Quick start
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
107
|
+
from nndbg import Inspector
|
|
108
|
+
|
|
109
|
+
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
|
110
|
+
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
|
111
|
+
inspector = Inspector(model, tokenizer)
|
|
112
|
+
|
|
113
|
+
inspector.summary() # model + all available planes
|
|
114
|
+
inspector.layers()[:5] # every layer name you can pass to any plane
|
|
115
|
+
|
|
116
|
+
# Attribution — which input tokens drove the prediction?
|
|
117
|
+
input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
|
|
118
|
+
inspector.attribution.saliency(input_ids).plot()
|
|
119
|
+
inspector.attribution.smoothgrad(input_ids).plot() # noise-averaged saliency
|
|
120
|
+
|
|
121
|
+
# Attention — what does head 0 of layer 0 attend to?
|
|
122
|
+
inspector.attention.heads(input_ids, layer=0).plot()
|
|
123
|
+
inspector.attention.rollout(input_ids).plot() # cumulative information flow
|
|
124
|
+
|
|
125
|
+
# Probing — is sentiment decodable, and from which layer?
|
|
126
|
+
dataset = [
|
|
127
|
+
(tokenizer(text, return_tensors="pt").input_ids, label)
|
|
128
|
+
for text, label in [
|
|
129
|
+
("I love this movie", 1), ("I hate this movie", 0),
|
|
130
|
+
("This is wonderful", 1), ("This is terrible", 0),
|
|
131
|
+
]
|
|
132
|
+
]
|
|
133
|
+
inspector.probing.fit(dataset, concept="sentiment").plot()
|
|
134
|
+
inspector.probing.fit(dataset, concept="sentiment", method="svm").plot()
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Any plain `nn.Module` works too — `tokenizer` is optional and only needed
|
|
138
|
+
for token-level labeling.
|
|
139
|
+
|
|
140
|
+
### Activation patching / causal tracing
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
clean = tokenizer("The Eiffel Tower is in the city of", return_tensors="pt").input_ids
|
|
144
|
+
corrupted = tokenizer("The Space Needle is in the city of", return_tensors="pt").input_ids
|
|
145
|
+
|
|
146
|
+
# Causal trace: which (layer, position) recovers the clean prediction?
|
|
147
|
+
result = inspector.patching.causal_trace(
|
|
148
|
+
clean, corrupted, layers=inspector.find_layers(r"h\.\d+$")
|
|
149
|
+
)
|
|
150
|
+
result.plot() # (layer × position) logit-recovery heatmap
|
|
151
|
+
|
|
152
|
+
# Mean ablation: which positions carry above-average information?
|
|
153
|
+
corpus = [tokenizer(t, return_tensors="pt").input_ids for t in texts]
|
|
154
|
+
inspector.patching.mean_ablation(clean, corpus).plot()
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Sparse autoencoders and VAE latent analysis
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
dataset = [tokenizer(t, return_tensors="pt").input_ids for t in texts]
|
|
161
|
+
|
|
162
|
+
# Sparse feature decomposition (ReLU or exact top-k)
|
|
163
|
+
inspector.sae.train(dataset, layer="transformer.h.6", n_features=512)
|
|
164
|
+
inspector.sae.train(dataset, layer="transformer.h.6", n_features=512, activation="topk:32")
|
|
165
|
+
inspector.sae.decompose(dataset, layer="transformer.h.6").plot()
|
|
166
|
+
|
|
167
|
+
# Compressed latent space + reconstruction-error anomaly detection
|
|
168
|
+
inspector.latent.train(dataset, layer="transformer.h.6", latent_dim=2)
|
|
169
|
+
result = inspector.latent.encode(dataset, layer="transformer.h.6")
|
|
170
|
+
result.plot()
|
|
171
|
+
result.anomalies() # indices of outlier examples
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Representational geometry
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
# How similar are the layers to each other?
|
|
178
|
+
layers = inspector.find_layers(r"h\.\d+$")
|
|
179
|
+
inspector.geometry.layer_similarity(dataset, layers=layers).plot()
|
|
180
|
+
|
|
181
|
+
# How much did fine-tuning change each layer?
|
|
182
|
+
base_inspector = Inspector(base_model, tokenizer)
|
|
183
|
+
inspector.geometry.compare(base_inspector, dataset, layers=layers).plot()
|
|
184
|
+
|
|
185
|
+
# 2-D PCA scatter of a layer's representations
|
|
186
|
+
inspector.geometry.pca(dataset, layer="transformer.h.6", labels=class_labels).plot()
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### Neuron analysis and concept erasure
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
# Dead neurons, top-activating examples, polysemanticity proxy
|
|
193
|
+
result = inspector.neurons.stats(dataset, layer="transformer.h.6.mlp.c_fc")
|
|
194
|
+
print(result) # NeuronResult(... dead=12/3072 ...)
|
|
195
|
+
result.plot() # bar chart, dead neurons in red
|
|
196
|
+
result.polysemantic_neurons() # low-kurtosis (broadly-activating) indices
|
|
197
|
+
|
|
198
|
+
# Remove a concept from a layer's representations (INLP)
|
|
199
|
+
result = inspector.erasure.inlp(dataset, concept="sentiment", layer="transformer.h.4")
|
|
200
|
+
result.plot() # probe accuracy decay over iterations
|
|
201
|
+
erased = result.apply(some_activations) # project new activations through erasure
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## Development
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
git clone https://github.com/Darsh-Nandu/neural-network-debugger
|
|
208
|
+
cd neural-network-debugger
|
|
209
|
+
pip install -e ".[dev]"
|
|
210
|
+
ruff check nndbg tests
|
|
211
|
+
pytest
|
|
212
|
+
```
|
nndbg-0.2.0/README.md
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# NNDbg — a diagnostic toolkit for neural networks
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/Darsh-Nandu/neural-network-debugger/actions/workflows/ci.yml/badge.svg)](https://github.com/Darsh-Nandu/neural-network-debugger/actions)
|
|
4
|
+
[![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org)
|
|
5
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE.md)
|
|
6
|
+
|
|
7
|
+
NNDbg wraps a PyTorch or HuggingFace model in a single `Inspector` and
|
|
8
|
+
answers the questions people actually ask when interpreting a neural
|
|
9
|
+
network: where a concept is encoded, which inputs caused an output, what
|
|
10
|
+
each attention head is doing, which layer causally produces a behaviour,
|
|
11
|
+
what features a layer's activations decompose into, and how similar the
|
|
12
|
+
representations of different layers (or different models) are.
|
|
13
|
+
|
|
14
|
+
## Analysis planes
|
|
15
|
+
|
|
16
|
+
| `inspector.<plane>` | Answers | Method |
|
|
17
|
+
|---|---|---|
|
|
18
|
+
| `probing` | Where is concept X encoded? | cross-validated linear / SVM / MLP probes |
|
|
19
|
+
| `attribution` | Which inputs caused this output? | saliency, gradient×input, SmoothGrad, IG, Grad-CAM |
|
|
20
|
+
| `attention` | What does each head attend to? | per-head heatmaps, rollout, entropy |
|
|
21
|
+
| `patching` | Which layers causally produce a behaviour? | causal tracing, mean ablation |
|
|
22
|
+
| `sae` | What sparse features does a layer learn? | sparse autoencoder (ReLU or top-k) |
|
|
23
|
+
| `latent` | Where do activations sit in a compressed space? | VAE latent space + anomaly detection |
|
|
24
|
+
| `geometry` | How similar are layers to each other (or to another model)? | linear CKA, PCA, UMAP |
|
|
25
|
+
| `neurons` | What is each neuron doing? | dead neurons, top examples, kurtosis |
|
|
26
|
+
| `erasure` | How do I remove a concept from representations? | INLP null-space projection |
|
|
27
|
+
|
|
28
|
+
Every result is a plain dataclass with a `.plot()` method (matplotlib,
|
|
29
|
+
zero-config) and an optional `.plotly()` method if you have `plotly` installed.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install nndbg
|
|
35
|
+
|
|
36
|
+
# with interactive Plotly figures
|
|
37
|
+
pip install nndbg[plotly]
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick start
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
44
|
+
from nndbg import Inspector
|
|
45
|
+
|
|
46
|
+
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
|
47
|
+
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
|
48
|
+
inspector = Inspector(model, tokenizer)
|
|
49
|
+
|
|
50
|
+
inspector.summary() # model + all available planes
|
|
51
|
+
inspector.layers()[:5] # every layer name you can pass to any plane
|
|
52
|
+
|
|
53
|
+
# Attribution — which input tokens drove the prediction?
|
|
54
|
+
input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
|
|
55
|
+
inspector.attribution.saliency(input_ids).plot()
|
|
56
|
+
inspector.attribution.smoothgrad(input_ids).plot() # noise-averaged saliency
|
|
57
|
+
|
|
58
|
+
# Attention — what does head 0 of layer 0 attend to?
|
|
59
|
+
inspector.attention.heads(input_ids, layer=0).plot()
|
|
60
|
+
inspector.attention.rollout(input_ids).plot() # cumulative information flow
|
|
61
|
+
|
|
62
|
+
# Probing — is sentiment decodable, and from which layer?
|
|
63
|
+
dataset = [
|
|
64
|
+
(tokenizer(text, return_tensors="pt").input_ids, label)
|
|
65
|
+
for text, label in [
|
|
66
|
+
("I love this movie", 1), ("I hate this movie", 0),
|
|
67
|
+
("This is wonderful", 1), ("This is terrible", 0),
|
|
68
|
+
]
|
|
69
|
+
]
|
|
70
|
+
inspector.probing.fit(dataset, concept="sentiment").plot()
|
|
71
|
+
inspector.probing.fit(dataset, concept="sentiment", method="svm").plot()
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Any plain `nn.Module` works too — `tokenizer` is optional and only needed
|
|
75
|
+
for token-level labeling.
|
|
76
|
+
|
|
77
|
+
### Activation patching / causal tracing
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
clean = tokenizer("The Eiffel Tower is in the city of", return_tensors="pt").input_ids
|
|
81
|
+
corrupted = tokenizer("The Space Needle is in the city of", return_tensors="pt").input_ids
|
|
82
|
+
|
|
83
|
+
# Causal trace: which (layer, position) recovers the clean prediction?
|
|
84
|
+
result = inspector.patching.causal_trace(
|
|
85
|
+
clean, corrupted, layers=inspector.find_layers(r"h\.\d+$")
|
|
86
|
+
)
|
|
87
|
+
result.plot() # (layer × position) logit-recovery heatmap
|
|
88
|
+
|
|
89
|
+
# Mean ablation: which positions carry above-average information?
|
|
90
|
+
corpus = [tokenizer(t, return_tensors="pt").input_ids for t in texts]  # texts: your own list of prompt strings
|
|
91
|
+
inspector.patching.mean_ablation(clean, corpus).plot()
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Sparse autoencoders and VAE latent analysis
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
dataset = [tokenizer(t, return_tensors="pt").input_ids for t in texts]  # texts: your own list of input strings
|
|
98
|
+
|
|
99
|
+
# Sparse feature decomposition (ReLU or exact top-k)
|
|
100
|
+
inspector.sae.train(dataset, layer="transformer.h.6", n_features=512)
|
|
101
|
+
inspector.sae.train(dataset, layer="transformer.h.6", n_features=512, activation="topk:32")
|
|
102
|
+
inspector.sae.decompose(dataset, layer="transformer.h.6").plot()
|
|
103
|
+
|
|
104
|
+
# Compressed latent space + reconstruction-error anomaly detection
|
|
105
|
+
inspector.latent.train(dataset, layer="transformer.h.6", latent_dim=2)
|
|
106
|
+
result = inspector.latent.encode(dataset, layer="transformer.h.6")
|
|
107
|
+
result.plot()
|
|
108
|
+
result.anomalies() # indices of outlier examples
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Representational geometry
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
# How similar are the layers to each other?
|
|
115
|
+
layers = inspector.find_layers(r"h\.\d+$")
|
|
116
|
+
inspector.geometry.layer_similarity(dataset, layers=layers).plot()
|
|
117
|
+
|
|
118
|
+
# How much did fine-tuning change each layer?
|
|
119
|
+
base_inspector = Inspector(base_model, tokenizer)
|
|
120
|
+
inspector.geometry.compare(base_inspector, dataset, layers=layers).plot()
|
|
121
|
+
|
|
122
|
+
# 2-D PCA scatter of a layer's representations
|
|
123
|
+
inspector.geometry.pca(dataset, layer="transformer.h.6", labels=class_labels).plot()
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Neuron analysis and concept erasure
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
# Dead neurons, top-activating examples, polysemanticity proxy
|
|
130
|
+
result = inspector.neurons.stats(dataset, layer="transformer.h.6.mlp.c_fc")
|
|
131
|
+
print(result) # NeuronResult(... dead=12/3072 ...)
|
|
132
|
+
result.plot() # bar chart, dead neurons in red
|
|
133
|
+
result.polysemantic_neurons() # low-kurtosis (broadly-activating) indices
|
|
134
|
+
|
|
135
|
+
# Remove a concept from a layer's representations (INLP)
|
|
136
|
+
result = inspector.erasure.inlp(dataset, concept="sentiment", layer="transformer.h.4")
|
|
137
|
+
result.plot() # probe accuracy decay over iterations
|
|
138
|
+
erased = result.apply(some_activations) # project new activations through erasure
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Development
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
git clone https://github.com/Darsh-Nandu/neural-network-debugger
|
|
145
|
+
cd neural-network-debugger
|
|
146
|
+
pip install -e ".[dev]"
|
|
147
|
+
ruff check nndbg tests
|
|
148
|
+
pytest
|
|
149
|
+
```
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Roadmap
|
|
2
|
+
|
|
3
|
+
NNDbg ships fully-implemented, tested analysis planes rather than broad
|
|
4
|
+
scaffolding. Every plane included in a release is complete in that release.
|
|
5
|
+
|
|
6
|
+
## Shipped in v0.2.0 (new planes)
|
|
7
|
+
|
|
8
|
+
- **Representation geometry** (`inspector.geometry`) — linear CKA between
|
|
9
|
+
layers, cross-model comparison, PCA and optional UMAP projections.
|
|
10
|
+
- **Neuron-level analysis** (`inspector.neurons`) — dead-neuron detection,
|
|
11
|
+
top-activating examples, per-neuron kurtosis as a polysemanticity proxy.
|
|
12
|
+
- **Concept erasure** (`inspector.erasure`) — INLP iterative null-space
|
|
13
|
+
projection; returns a projection matrix applicable to new activations.
|
|
14
|
+
|
|
15
|
+
Also in v0.2.0 (upgrades):
|
|
16
|
+
- Probing: `method="svm"` / `"mlp"` options.
|
|
17
|
+
- Attribution: `gradient_x_input()` and `smoothgrad()`.
|
|
18
|
+
- SAE: `activation="topk:<k>"` for exact sparsity.
|
|
19
|
+
- Patching: `mean_ablation(clean, dataset)`.
|
|
20
|
+
|
|
21
|
+
## Planned for v0.3.0
|
|
22
|
+
|
|
23
|
+
- **CLI** — `nndbg summary <model>` and friends, for inspecting a model
|
|
24
|
+
without writing a script.
|
|
25
|
+
- **Captum integration** — an opt-in `nndbg[captum]` extra wrapping
|
|
26
|
+
Captum's broader attribution method library (DeepLIFT, SHAP, ...) behind
|
|
27
|
+
the same `AttributionResult` interface as the native methods.
|
|
28
|
+
- **LEACE concept erasure** — closed-form concept removal with better
|
|
29
|
+
theoretical guarantees than INLP for multiclass concepts.
|
|
30
|
+
|
|
31
|
+
## Explicitly out of scope
|
|
32
|
+
|
|
33
|
+
- A persistent activation-storage backend (the original prototype used
|
|
34
|
+
DuckDB; activations now live in memory / plain `torch.save` files — add a
|
|
35
|
+
backend only if a real workflow needs querying activations across runs,
|
|
36
|
+
not preemptively).
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NNDbg — A diagnostic toolkit for neural networks.
|
|
3
|
+
|
|
4
|
+
Built around a single entrypoint, ``Inspector``, exposing focused analysis
|
|
5
|
+
planes (probing, attribution, attention, activation patching, sparse
|
|
6
|
+
autoencoders, VAE latent analysis, representation geometry, neuron
analysis, concept erasure) for PyTorch and HuggingFace models.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from nndbg._version import __version__
|
|
10
|
+
from nndbg.inspector import Inspector
|
|
11
|
+
from nndbg.utils.logging import is_verbose, set_verbose
|
|
12
|
+
|
|
13
|
+
__all__ = ["Inspector", "__version__", "set_verbose", "is_verbose"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0"
|
|
File without changes
|