nndbg 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. nndbg-0.2.0/.github/workflows/ci.yml +30 -0
  2. nndbg-0.2.0/.gitignore +62 -0
  3. nndbg-0.2.0/CHANGELOG.md +78 -0
  4. nndbg-0.2.0/LICENSE.md +21 -0
  5. nndbg-0.2.0/PKG-INFO +212 -0
  6. nndbg-0.2.0/README.md +149 -0
  7. nndbg-0.2.0/docs/ROADMAP.md +36 -0
  8. nndbg-0.2.0/nndbg/__init__.py +13 -0
  9. nndbg-0.2.0/nndbg/_version.py +1 -0
  10. nndbg-0.2.0/nndbg/analysis/__init__.py +0 -0
  11. nndbg-0.2.0/nndbg/analysis/attention/__init__.py +4 -0
  12. nndbg-0.2.0/nndbg/analysis/attention/base.py +117 -0
  13. nndbg-0.2.0/nndbg/analysis/attention/results.py +88 -0
  14. nndbg-0.2.0/nndbg/analysis/attribution/__init__.py +4 -0
  15. nndbg-0.2.0/nndbg/analysis/attribution/base.py +212 -0
  16. nndbg-0.2.0/nndbg/analysis/attribution/results.py +42 -0
  17. nndbg-0.2.0/nndbg/analysis/erasure/__init__.py +0 -0
  18. nndbg-0.2.0/nndbg/analysis/erasure/base.py +124 -0
  19. nndbg-0.2.0/nndbg/analysis/erasure/results.py +66 -0
  20. nndbg-0.2.0/nndbg/analysis/geometry/__init__.py +0 -0
  21. nndbg-0.2.0/nndbg/analysis/geometry/base.py +211 -0
  22. nndbg-0.2.0/nndbg/analysis/geometry/results.py +98 -0
  23. nndbg-0.2.0/nndbg/analysis/latent/__init__.py +4 -0
  24. nndbg-0.2.0/nndbg/analysis/latent/base.py +161 -0
  25. nndbg-0.2.0/nndbg/analysis/latent/results.py +83 -0
  26. nndbg-0.2.0/nndbg/analysis/neurons/__init__.py +0 -0
  27. nndbg-0.2.0/nndbg/analysis/neurons/base.py +116 -0
  28. nndbg-0.2.0/nndbg/analysis/neurons/results.py +91 -0
  29. nndbg-0.2.0/nndbg/analysis/patching/__init__.py +4 -0
  30. nndbg-0.2.0/nndbg/analysis/patching/base.py +221 -0
  31. nndbg-0.2.0/nndbg/analysis/patching/results.py +80 -0
  32. nndbg-0.2.0/nndbg/analysis/probing/__init__.py +4 -0
  33. nndbg-0.2.0/nndbg/analysis/probing/base.py +124 -0
  34. nndbg-0.2.0/nndbg/analysis/probing/results.py +56 -0
  35. nndbg-0.2.0/nndbg/analysis/sae/__init__.py +4 -0
  36. nndbg-0.2.0/nndbg/analysis/sae/base.py +197 -0
  37. nndbg-0.2.0/nndbg/analysis/sae/results.py +70 -0
  38. nndbg-0.2.0/nndbg/core/__init__.py +6 -0
  39. nndbg-0.2.0/nndbg/core/cache.py +59 -0
  40. nndbg-0.2.0/nndbg/core/collect.py +62 -0
  41. nndbg-0.2.0/nndbg/core/hooks.py +119 -0
  42. nndbg-0.2.0/nndbg/core/registry.py +61 -0
  43. nndbg-0.2.0/nndbg/inspector.py +162 -0
  44. nndbg-0.2.0/nndbg/utils/__init__.py +5 -0
  45. nndbg-0.2.0/nndbg/utils/console.py +23 -0
  46. nndbg-0.2.0/nndbg/utils/device.py +9 -0
  47. nndbg-0.2.0/nndbg/utils/logging.py +63 -0
  48. nndbg-0.2.0/nndbg/viz/__init__.py +3 -0
  49. nndbg-0.2.0/nndbg/viz/plotly_backend.py +62 -0
  50. nndbg-0.2.0/nndbg/viz/plotting.py +168 -0
  51. nndbg-0.2.0/nndbg/viz/style.py +34 -0
  52. nndbg-0.2.0/nndbg/viz/summary.py +43 -0
  53. nndbg-0.2.0/pyproject.toml +89 -0
  54. nndbg-0.2.0/tests/conftest.py +69 -0
  55. nndbg-0.2.0/tests/test_attention.py +50 -0
  56. nndbg-0.2.0/tests/test_attribution.py +75 -0
  57. nndbg-0.2.0/tests/test_core_hooks.py +80 -0
  58. nndbg-0.2.0/tests/test_erasure.py +70 -0
  59. nndbg-0.2.0/tests/test_geometry.py +112 -0
  60. nndbg-0.2.0/tests/test_inspector.py +28 -0
  61. nndbg-0.2.0/tests/test_latent_vae.py +38 -0
  62. nndbg-0.2.0/tests/test_neurons.py +81 -0
  63. nndbg-0.2.0/tests/test_patching.py +71 -0
  64. nndbg-0.2.0/tests/test_probing.py +71 -0
  65. nndbg-0.2.0/tests/test_sae.py +53 -0
  66. nndbg-0.2.0/tests/test_viz.py +54 -0
@@ -0,0 +1,30 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Set up Python ${{ matrix.python-version }}
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: ${{ matrix.python-version }}
22
+
23
+ - name: Install nndbg with dev dependencies
24
+ run: pip install -e ".[dev]"
25
+
26
+ - name: Lint
27
+ run: ruff check nndbg tests
28
+
29
+ - name: Test
30
+ run: pytest -q
nndbg-0.2.0/.gitignore ADDED
@@ -0,0 +1,62 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+ *.egg
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+ eggs/
12
+ parts/
13
+ var/
14
+ sdist/
15
+ wheels/
16
+ .installed.cfg
17
+ *.egg-info
18
+
19
+ # Virtual environments
20
+ .venv/
21
+ venv/
22
+ env/
23
+ ENV/
24
+
25
+ # Poetry
26
+ .poetry/
27
+
28
+ # Jupyter
29
+ .ipynb_checkpoints/
30
+ *.ipynb_checkpoints
31
+
32
+ # Data & Storage
33
+ *.db
34
+ *.duckdb
35
+ *.parquet
36
+ *.arrow
37
+ data/
38
+ runs/
39
+ checkpoints/
40
+
41
+ # Logs
42
+ *.log
43
+ logs/
44
+
45
+ # IDE
46
+ .vscode/
47
+ .idea/
48
+ *.swp
49
+ *.swo
50
+ .DS_Store
51
+
52
+ # Testing
53
+ .pytest_cache/
54
+ .ruff_cache/
55
+ .coverage
56
+ htmlcov/
57
+ .tox/
58
+
59
+ # Outputs
60
+ outputs/
61
+ figures/
62
+ reports/
@@ -0,0 +1,78 @@
1
+ # Changelog
2
+
3
+ ## 0.2.0 — plane upgrades + three new planes
4
+
5
+ ### Fixes / upgrades to existing planes
6
+
7
+ - **Probing** — `inspector.probing.fit(..., method=)` now accepts `"logistic"`
8
+ (default, unchanged), `"svm"` (LinearSVC), or `"mlp"` (1-hidden-layer
9
+ MLPClassifier). `ProbeResult` gains a `method` field shown in `repr()` and
10
+ plot titles.
11
+ - **Attribution** — two new methods: `gradient_x_input()` (sign-aware
12
+ saliency) and `smoothgrad()` (averaged saliency over Gaussian-perturbed
13
+ inputs, Smilkov et al. 2017).
14
+ - **SAE** — `inspector.sae.train(..., activation=)` now accepts `"topk:<k>"`
15
+ (e.g. `"topk:32"`) for exact top-k sparsity control (Gao et al. 2024),
16
+ in addition to the default `"relu"` L1-penalised mode.
17
+ - **Activation patching** — new `inspector.patching.mean_ablation(clean,
18
+ dataset)` method: replaces each (layer, position) with the dataset mean and
19
+ reports the logit drop. `PatchingResult` gains a `method` field; the plot
20
+ switches to a diverging colormap for mean-ablation results.
21
+
22
+ ### New planes
23
+
24
+ - **`inspector.geometry`** — `layer_similarity(dataset)` computes pairwise
25
+ linear CKA (Kornblith et al. 2019) between all layers; `compare(other,
26
+ dataset)` cross-compares two model checkpoints on the same data; `pca(dataset,
27
+ layer)` projects a layer's activations to 2-D; `umap(...)` does the same via
28
+ UMAP (requires `pip install umap-learn`).
29
+ - **`inspector.neurons`** — `stats(dataset, layer)` reports per-neuron mean,
30
+ max, std, kurtosis (polysemanticity proxy), dead-neuron mask, and top-k
31
+ activating examples. `top_activating(...)` is a convenience alias.
32
+ - **`inspector.erasure`** — `inlp(dataset, concept=, layer=)` runs Iterative
33
+ Null-space Projection (Ravfogel et al. 2020): repeatedly trains a probe and
34
+ projects out its direction, returning an `ErasureResult` with the accumulated
35
+ projection matrix and accuracy decay curve. `result.apply(activations)`
36
+ projects new activations through the learned erasure.
37
+
38
+ ### Testing
39
+
40
+ - 30+ new tests covering all new planes and fixed methods (total: ~90 tests).
41
+
42
+ ## 0.1.0 — clean rewrite
43
+
44
+ The previous `nndbg` codebase had a broken core (`nndbg/core/{hooker,registry,store,tracer}.py`
45
+ were empty stub files that the rest of the package imported from) alongside
46
+ an orphaned earlier implementation. This release is a full rewrite from
47
+ scratch, scoped to six analysis planes that are each genuinely implemented,
48
+ tested, and documented, rather than many partial ones.
49
+
50
+ ### Added
51
+
52
+ - `Inspector` — single entrypoint wrapping any `nn.Module` / HuggingFace
53
+ model, exposing lazily-initialized analysis planes.
54
+ - Core hooking infrastructure: `LayerRegistry`, `HookManager`
55
+ (forward-hook capture + activation patching), `ActivationCache`,
56
+ `collect_activations`.
57
+ - **Probing** — cross-validated linear probes per layer
58
+ (`inspector.probing.fit`).
59
+ - **Attribution** — native saliency, Integrated Gradients, and Grad-CAM
60
+ via autograd (`inspector.attribution.*`), no captum dependency required.
61
+ - **Attention analysis** — per-head heatmaps, attention rollout, per-head
62
+ entropy (`inspector.attention.*`).
63
+ - **Activation patching / causal tracing** — ROME-style logit-recovery
64
+ heatmaps (`inspector.patching.causal_trace`).
65
+ - **Sparse autoencoders** — train + decompose a layer's activations into
66
+ sparse features (`inspector.sae.*`).
67
+ - **VAE latent analysis** — compressed latent-space visualization and
68
+ reconstruction-error anomaly detection (`inspector.latent.*`).
69
+ - Matplotlib-based plotting (`Result.plot()`) for every plane, with an
70
+ optional interactive Plotly backend (`Result.plotly()`).
71
+ - 54 unit tests against a tiny synthetic MLP and a tiny random-init GPT2
72
+ (no network access required).
73
+
74
+ ### Removed
75
+
76
+ - The DuckDB-backed activation store, the Typer CLI, and the `geometry`/
77
+ `neurons`/`erasure` planes from the previous design — see
78
+ [docs/ROADMAP.md](docs/ROADMAP.md) for the v2 plan.
nndbg-0.2.0/LICENSE.md ADDED
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+
3
+ Copyright (c) 2026 Darsh Nandu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
nndbg-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,212 @@
1
+ Metadata-Version: 2.4
2
+ Name: nndbg
3
+ Version: 0.2.0
4
+ Summary: A focused, practical toolkit for diagnosing and interpreting neural networks: probing, attribution, attention, activation patching, sparse autoencoders, VAE latent analysis, CKA geometry, neuron analysis, and concept erasure.
5
+ Project-URL: Homepage, https://github.com/Darsh-Nandu/neural-network-debugger
6
+ Project-URL: Repository, https://github.com/Darsh-Nandu/neural-network-debugger
7
+ Project-URL: Issues, https://github.com/Darsh-Nandu/neural-network-debugger/issues
8
+ Author: Darsh Nandu
9
+ License: # MIT License
10
+
11
+ Copyright (c) 2026 Darsh Nandu
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE.md
31
+ Keywords: activation-patching,explainability,interpretability,mechanistic-interpretability,neural-networks,probing,pytorch,sparse-autoencoder,transformers
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python :: 3.10
36
+ Classifier: Programming Language :: Python :: 3.11
37
+ Classifier: Programming Language :: Python :: 3.12
38
+ Classifier: Programming Language :: Python :: 3.13
39
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
40
+ Requires-Python: >=3.10
41
+ Requires-Dist: matplotlib>=3.8
42
+ Requires-Dist: numpy>=1.26
43
+ Requires-Dist: rich>=13.7
44
+ Requires-Dist: scikit-learn>=1.4
45
+ Requires-Dist: torch>=2.1
46
+ Requires-Dist: transformers>=4.40
47
+ Provides-Extra: all
48
+ Requires-Dist: ipykernel; extra == 'all'
49
+ Requires-Dist: jupyter; extra == 'all'
50
+ Requires-Dist: plotly>=5.20; extra == 'all'
51
+ Requires-Dist: pytest-cov>=5.0; extra == 'all'
52
+ Requires-Dist: pytest>=8.0; extra == 'all'
53
+ Requires-Dist: ruff>=0.4; extra == 'all'
54
+ Provides-Extra: dev
55
+ Requires-Dist: ipykernel; extra == 'dev'
56
+ Requires-Dist: jupyter; extra == 'dev'
57
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
58
+ Requires-Dist: pytest>=8.0; extra == 'dev'
59
+ Requires-Dist: ruff>=0.4; extra == 'dev'
60
+ Provides-Extra: plotly
61
+ Requires-Dist: plotly>=5.20; extra == 'plotly'
62
+ Description-Content-Type: text/markdown
63
+
64
+ # NNDbg — a diagnostic toolkit for neural networks
65
+
66
+ [![CI](https://github.com/Darsh-Nandu/neural-network-debugger/actions/workflows/ci.yml/badge.svg)](https://github.com/Darsh-Nandu/neural-network-debugger/actions)
67
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org)
68
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE.md)
69
+
70
+ NNDbg wraps a PyTorch or HuggingFace model in a single `Inspector` and
71
+ answers the questions people actually ask when interpreting a neural
72
+ network: where a concept is encoded, which inputs caused an output, what
73
+ each attention head is doing, which layer causally produces a behaviour,
74
+ what features a layer's activations decompose into, and how similar the
75
+ representations of different layers (or different models) are.
76
+
77
+ ## Analysis planes
78
+
79
+ | `inspector.<plane>` | Answers | Method |
80
+ |---|---|---|
81
+ | `probing` | Where is concept X encoded? | cross-validated linear / SVM / MLP probes |
82
+ | `attribution` | Which inputs caused this output? | saliency, gradient×input, SmoothGrad, IG, Grad-CAM |
83
+ | `attention` | What does each head attend to? | per-head heatmaps, rollout, entropy |
84
+ | `patching` | Which layers causally produce a behaviour? | causal tracing, mean ablation |
85
+ | `sae` | What sparse features does a layer learn? | sparse autoencoder (ReLU or top-k) |
86
+ | `latent` | Where do activations sit in a compressed space? | VAE latent space + anomaly detection |
87
+ | `geometry` | How similar are layers to each other (or to another model)? | linear CKA, PCA, UMAP |
88
+ | `neurons` | What is each neuron doing? | dead neurons, top examples, kurtosis |
89
+ | `erasure` | How do I remove a concept from representations? | INLP null-space projection |
90
+
91
+ Every result is a plain dataclass with a `.plot()` method (matplotlib,
92
+ zero-config) and an optional `.plotly()` method if you have `plotly` installed.
93
+
94
+ ## Installation
95
+
96
+ ```bash
97
+ pip install nndbg
98
+
99
+ # with interactive Plotly figures
100
+ pip install "nndbg[plotly]"
101
+ ```
102
+
103
+ ## Quick start
104
+
105
+ ```python
106
+ from transformers import AutoModelForCausalLM, AutoTokenizer
107
+ from nndbg import Inspector
108
+
109
+ model = AutoModelForCausalLM.from_pretrained("gpt2")
110
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
111
+ inspector = Inspector(model, tokenizer)
112
+
113
+ inspector.summary() # model + all available planes
114
+ inspector.layers()[:5] # every layer name you can pass to any plane
115
+
116
+ # Attribution — which input tokens drove the prediction?
117
+ input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
118
+ inspector.attribution.saliency(input_ids).plot()
119
+ inspector.attribution.smoothgrad(input_ids).plot() # noise-averaged saliency
120
+
121
+ # Attention — what does head 0 of layer 0 attend to?
122
+ inspector.attention.heads(input_ids, layer=0).plot()
123
+ inspector.attention.rollout(input_ids).plot() # cumulative information flow
124
+
125
+ # Probing — is sentiment decodable, and from which layer?
126
+ dataset = [
127
+ (tokenizer(text, return_tensors="pt").input_ids, label)
128
+ for text, label in [
129
+ ("I love this movie", 1), ("I hate this movie", 0),
130
+ ("This is wonderful", 1), ("This is terrible", 0),
131
+ ]
132
+ ]
133
+ inspector.probing.fit(dataset, concept="sentiment").plot()
134
+ inspector.probing.fit(dataset, concept="sentiment", method="svm").plot()
135
+ ```
136
+
137
+ Any plain `nn.Module` works too — `tokenizer` is optional and only needed
138
+ for token-level labeling.
139
+
140
+ ### Activation patching / causal tracing
141
+
142
+ ```python
143
+ clean = tokenizer("The Eiffel Tower is in the city of", return_tensors="pt").input_ids
144
+ corrupted = tokenizer("The Space Needle is in the city of", return_tensors="pt").input_ids
145
+
146
+ # Causal trace: which (layer, position) recovers the clean prediction?
147
+ result = inspector.patching.causal_trace(
148
+ clean, corrupted, layers=inspector.find_layers(r"h\.\d+$")
149
+ )
150
+ result.plot() # (layer × position) logit-recovery heatmap
151
+
152
+ # Mean ablation: which positions carry above-average information?
153
+ corpus = [tokenizer(t, return_tensors="pt").input_ids for t in texts]  # texts: any list of strings
154
+ inspector.patching.mean_ablation(clean, corpus).plot()
155
+ ```
156
+
157
+ ### Sparse autoencoders and VAE latent analysis
158
+
159
+ ```python
160
+ dataset = [tokenizer(t, return_tensors="pt").input_ids for t in texts]
161
+
162
+ # Sparse feature decomposition (ReLU or exact top-k)
163
+ inspector.sae.train(dataset, layer="transformer.h.6", n_features=512)
164
+ inspector.sae.train(dataset, layer="transformer.h.6", n_features=512, activation="topk:32")
165
+ inspector.sae.decompose(dataset, layer="transformer.h.6").plot()
166
+
167
+ # Compressed latent space + reconstruction-error anomaly detection
168
+ inspector.latent.train(dataset, layer="transformer.h.6", latent_dim=2)
169
+ result = inspector.latent.encode(dataset, layer="transformer.h.6")
170
+ result.plot()
171
+ result.anomalies() # indices of outlier examples
172
+ ```
173
+
174
+ ### Representational geometry
175
+
176
+ ```python
177
+ # How similar are the layers to each other?
178
+ layers = inspector.find_layers(r"h\.\d+$")
179
+ inspector.geometry.layer_similarity(dataset, layers=layers).plot()
180
+
181
+ # How much did fine-tuning change each layer?
182
+ base_inspector = Inspector(base_model, tokenizer)
183
+ inspector.geometry.compare(base_inspector, dataset, layers=layers).plot()
184
+
185
+ # 2-D PCA scatter of a layer's representations
186
+ inspector.geometry.pca(dataset, layer="transformer.h.6", labels=class_labels).plot()
187
+ ```
188
+
189
+ ### Neuron analysis and concept erasure
190
+
191
+ ```python
192
+ # Dead neurons, top-activating examples, polysemanticity proxy
193
+ result = inspector.neurons.stats(dataset, layer="transformer.h.6.mlp.c_fc")
194
+ print(result) # NeuronResult(... dead=12/3072 ...)
195
+ result.plot() # bar chart, dead neurons in red
196
+ result.polysemantic_neurons() # low-kurtosis (broadly-activating) indices
197
+
198
+ # Remove a concept from a layer's representations (INLP)
199
+ result = inspector.erasure.inlp(dataset, concept="sentiment", layer="transformer.h.4")
200
+ result.plot() # probe accuracy decay over iterations
201
+ erased = result.apply(some_activations) # project new activations through erasure
202
+ ```
203
+
204
+ ## Development
205
+
206
+ ```bash
207
+ git clone https://github.com/Darsh-Nandu/neural-network-debugger
208
+ cd neural-network-debugger
209
+ pip install -e ".[dev]"
210
+ ruff check nndbg tests
211
+ pytest
212
+ ```
nndbg-0.2.0/README.md ADDED
@@ -0,0 +1,149 @@
1
+ # NNDbg — a diagnostic toolkit for neural networks
2
+
3
+ [![CI](https://github.com/Darsh-Nandu/neural-network-debugger/actions/workflows/ci.yml/badge.svg)](https://github.com/Darsh-Nandu/neural-network-debugger/actions)
4
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE.md)
6
+
7
+ NNDbg wraps a PyTorch or HuggingFace model in a single `Inspector` and
8
+ answers the questions people actually ask when interpreting a neural
9
+ network: where a concept is encoded, which inputs caused an output, what
10
+ each attention head is doing, which layer causally produces a behaviour,
11
+ what features a layer's activations decompose into, and how similar the
12
+ representations of different layers (or different models) are.
13
+
14
+ ## Analysis planes
15
+
16
+ | `inspector.<plane>` | Answers | Method |
17
+ |---|---|---|
18
+ | `probing` | Where is concept X encoded? | cross-validated linear / SVM / MLP probes |
19
+ | `attribution` | Which inputs caused this output? | saliency, gradient×input, SmoothGrad, IG, Grad-CAM |
20
+ | `attention` | What does each head attend to? | per-head heatmaps, rollout, entropy |
21
+ | `patching` | Which layers causally produce a behaviour? | causal tracing, mean ablation |
22
+ | `sae` | What sparse features does a layer learn? | sparse autoencoder (ReLU or top-k) |
23
+ | `latent` | Where do activations sit in a compressed space? | VAE latent space + anomaly detection |
24
+ | `geometry` | How similar are layers to each other (or to another model)? | linear CKA, PCA, UMAP |
25
+ | `neurons` | What is each neuron doing? | dead neurons, top examples, kurtosis |
26
+ | `erasure` | How do I remove a concept from representations? | INLP null-space projection |
27
+
28
+ Every result is a plain dataclass with a `.plot()` method (matplotlib,
29
+ zero-config) and an optional `.plotly()` method if you have `plotly` installed.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install nndbg
35
+
36
+ # with interactive Plotly figures
37
+ pip install "nndbg[plotly]"
38
+ ```
39
+
40
+ ## Quick start
41
+
42
+ ```python
43
+ from transformers import AutoModelForCausalLM, AutoTokenizer
44
+ from nndbg import Inspector
45
+
46
+ model = AutoModelForCausalLM.from_pretrained("gpt2")
47
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
48
+ inspector = Inspector(model, tokenizer)
49
+
50
+ inspector.summary() # model + all available planes
51
+ inspector.layers()[:5] # every layer name you can pass to any plane
52
+
53
+ # Attribution — which input tokens drove the prediction?
54
+ input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
55
+ inspector.attribution.saliency(input_ids).plot()
56
+ inspector.attribution.smoothgrad(input_ids).plot() # noise-averaged saliency
57
+
58
+ # Attention — what does head 0 of layer 0 attend to?
59
+ inspector.attention.heads(input_ids, layer=0).plot()
60
+ inspector.attention.rollout(input_ids).plot() # cumulative information flow
61
+
62
+ # Probing — is sentiment decodable, and from which layer?
63
+ dataset = [
64
+ (tokenizer(text, return_tensors="pt").input_ids, label)
65
+ for text, label in [
66
+ ("I love this movie", 1), ("I hate this movie", 0),
67
+ ("This is wonderful", 1), ("This is terrible", 0),
68
+ ]
69
+ ]
70
+ inspector.probing.fit(dataset, concept="sentiment").plot()
71
+ inspector.probing.fit(dataset, concept="sentiment", method="svm").plot()
72
+ ```
73
+
74
+ Any plain `nn.Module` works too — `tokenizer` is optional and only needed
75
+ for token-level labeling.
76
+
77
+ ### Activation patching / causal tracing
78
+
79
+ ```python
80
+ clean = tokenizer("The Eiffel Tower is in the city of", return_tensors="pt").input_ids
81
+ corrupted = tokenizer("The Space Needle is in the city of", return_tensors="pt").input_ids
82
+
83
+ # Causal trace: which (layer, position) recovers the clean prediction?
84
+ result = inspector.patching.causal_trace(
85
+ clean, corrupted, layers=inspector.find_layers(r"h\.\d+$")
86
+ )
87
+ result.plot() # (layer × position) logit-recovery heatmap
88
+
89
+ # Mean ablation: which positions carry above-average information?
90
+ corpus = [tokenizer(t, return_tensors="pt").input_ids for t in texts]  # texts: any list of strings
91
+ inspector.patching.mean_ablation(clean, corpus).plot()
92
+ ```
93
+
94
+ ### Sparse autoencoders and VAE latent analysis
95
+
96
+ ```python
97
+ dataset = [tokenizer(t, return_tensors="pt").input_ids for t in texts]
98
+
99
+ # Sparse feature decomposition (ReLU or exact top-k)
100
+ inspector.sae.train(dataset, layer="transformer.h.6", n_features=512)
101
+ inspector.sae.train(dataset, layer="transformer.h.6", n_features=512, activation="topk:32")
102
+ inspector.sae.decompose(dataset, layer="transformer.h.6").plot()
103
+
104
+ # Compressed latent space + reconstruction-error anomaly detection
105
+ inspector.latent.train(dataset, layer="transformer.h.6", latent_dim=2)
106
+ result = inspector.latent.encode(dataset, layer="transformer.h.6")
107
+ result.plot()
108
+ result.anomalies() # indices of outlier examples
109
+ ```
110
+
111
+ ### Representational geometry
112
+
113
+ ```python
114
+ # How similar are the layers to each other?
115
+ layers = inspector.find_layers(r"h\.\d+$")
116
+ inspector.geometry.layer_similarity(dataset, layers=layers).plot()
117
+
118
+ # How much did fine-tuning change each layer?
119
+ base_inspector = Inspector(base_model, tokenizer)
120
+ inspector.geometry.compare(base_inspector, dataset, layers=layers).plot()
121
+
122
+ # 2-D PCA scatter of a layer's representations
123
+ inspector.geometry.pca(dataset, layer="transformer.h.6", labels=class_labels).plot()
124
+ ```
125
+
126
+ ### Neuron analysis and concept erasure
127
+
128
+ ```python
129
+ # Dead neurons, top-activating examples, polysemanticity proxy
130
+ result = inspector.neurons.stats(dataset, layer="transformer.h.6.mlp.c_fc")
131
+ print(result) # NeuronResult(... dead=12/3072 ...)
132
+ result.plot() # bar chart, dead neurons in red
133
+ result.polysemantic_neurons() # low-kurtosis (broadly-activating) indices
134
+
135
+ # Remove a concept from a layer's representations (INLP)
136
+ result = inspector.erasure.inlp(dataset, concept="sentiment", layer="transformer.h.4")
137
+ result.plot() # probe accuracy decay over iterations
138
+ erased = result.apply(some_activations) # project new activations through erasure
139
+ ```
140
+
141
+ ## Development
142
+
143
+ ```bash
144
+ git clone https://github.com/Darsh-Nandu/neural-network-debugger
145
+ cd neural-network-debugger
146
+ pip install -e ".[dev]"
147
+ ruff check nndbg tests
148
+ pytest
149
+ ```
@@ -0,0 +1,36 @@
1
+ # Roadmap
2
+
3
+ NNDbg ships fully-implemented, tested analysis planes rather than broad
4
+ scaffolding. Each release includes only features that are complete.
5
+
6
+ ## Shipped in v0.2.0 (new planes)
7
+
8
+ - **Representation geometry** (`inspector.geometry`) — linear CKA between
9
+ layers, cross-model comparison, PCA and optional UMAP projections.
10
+ - **Neuron-level analysis** (`inspector.neurons`) — dead-neuron detection,
11
+ top-activating examples, per-neuron kurtosis as a polysemanticity proxy.
12
+ - **Concept erasure** (`inspector.erasure`) — INLP iterative null-space
13
+ projection; returns a projection matrix applicable to new activations.
14
+
15
+ Also in v0.2.0 (upgrades):
16
+ - Probing: `method="svm"` / `"mlp"` options.
17
+ - Attribution: `gradient_x_input()` and `smoothgrad()`.
18
+ - SAE: `activation="topk:<k>"` for exact sparsity.
19
+ - Patching: `mean_ablation(clean, dataset)`.
20
+
21
+ ## Planned for v0.3.0
22
+
23
+ - **CLI** — `nndbg summary <model>` and friends, for inspecting a model
24
+ without writing a script.
25
+ - **Captum integration** — an opt-in `nndbg[captum]` extra wrapping
26
+ Captum's broader attribution method library (DeepLIFT, SHAP, ...) behind
27
+ the same `AttributionResult` interface as the native methods.
28
+ - **LEACE concept erasure** — closed-form concept removal with better
29
+ theoretical guarantees than INLP for multiclass concepts.
30
+
31
+ ## Explicitly out of scope
32
+
33
+ - A persistent activation-storage backend (the original prototype used
34
+ DuckDB; activations now live in memory / plain `torch.save` files — add a
35
+ backend only if a real workflow needs querying activations across runs,
36
+ not preemptively).
@@ -0,0 +1,13 @@
1
+ """
2
+ NNDbg — A diagnostic toolkit for neural networks.
3
+
4
+ Built around a single entrypoint, ``Inspector``, exposing focused analysis
5
+ planes (probing, attribution, attention, activation patching, sparse
6
+ autoencoders, VAE latent analysis, CKA geometry, neuron analysis, concept erasure) for PyTorch and HuggingFace models.
7
+ """
8
+
9
+ from nndbg._version import __version__
10
+ from nndbg.inspector import Inspector
11
+ from nndbg.utils.logging import is_verbose, set_verbose
12
+
13
+ __all__ = ["Inspector", "__version__", "set_verbose", "is_verbose"]
@@ -0,0 +1 @@
1
+ __version__ = "0.2.0"
File without changes
@@ -0,0 +1,4 @@
1
+ from nndbg.analysis.attention.base import AttentionAnalyzer
2
+ from nndbg.analysis.attention.results import AttentionResult
3
+
4
+ __all__ = ["AttentionAnalyzer", "AttentionResult"]