interpkit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- interpkit-0.1.0/LICENSE +21 -0
- interpkit-0.1.0/PKG-INFO +295 -0
- interpkit-0.1.0/README.md +258 -0
- interpkit-0.1.0/interpkit/__init__.py +15 -0
- interpkit-0.1.0/interpkit/cli/__init__.py +0 -0
- interpkit-0.1.0/interpkit/cli/main.py +337 -0
- interpkit-0.1.0/interpkit/core/__init__.py +0 -0
- interpkit-0.1.0/interpkit/core/discovery.py +228 -0
- interpkit-0.1.0/interpkit/core/html.py +375 -0
- interpkit-0.1.0/interpkit/core/inputs.py +117 -0
- interpkit-0.1.0/interpkit/core/model.py +551 -0
- interpkit-0.1.0/interpkit/core/plot.py +352 -0
- interpkit-0.1.0/interpkit/core/registry.py +82 -0
- interpkit-0.1.0/interpkit/core/render.py +465 -0
- interpkit-0.1.0/interpkit/core/tl_compat.py +174 -0
- interpkit-0.1.0/interpkit/ops/__init__.py +0 -0
- interpkit-0.1.0/interpkit/ops/ablate.py +90 -0
- interpkit-0.1.0/interpkit/ops/activations.py +67 -0
- interpkit-0.1.0/interpkit/ops/attention.py +234 -0
- interpkit-0.1.0/interpkit/ops/attribute.py +206 -0
- interpkit-0.1.0/interpkit/ops/diff.py +79 -0
- interpkit-0.1.0/interpkit/ops/inspect.py +14 -0
- interpkit-0.1.0/interpkit/ops/lens.py +151 -0
- interpkit-0.1.0/interpkit/ops/patch.py +112 -0
- interpkit-0.1.0/interpkit/ops/probe.py +128 -0
- interpkit-0.1.0/interpkit/ops/sae.py +212 -0
- interpkit-0.1.0/interpkit/ops/steer.py +118 -0
- interpkit-0.1.0/interpkit/ops/trace.py +182 -0
- interpkit-0.1.0/interpkit.egg-info/PKG-INFO +295 -0
- interpkit-0.1.0/interpkit.egg-info/SOURCES.txt +52 -0
- interpkit-0.1.0/interpkit.egg-info/dependency_links.txt +1 -0
- interpkit-0.1.0/interpkit.egg-info/entry_points.txt +2 -0
- interpkit-0.1.0/interpkit.egg-info/requires.txt +16 -0
- interpkit-0.1.0/interpkit.egg-info/top_level.txt +1 -0
- interpkit-0.1.0/pyproject.toml +58 -0
- interpkit-0.1.0/setup.cfg +4 -0
- interpkit-0.1.0/tests/test_ablate.py +23 -0
- interpkit-0.1.0/tests/test_activations.py +34 -0
- interpkit-0.1.0/tests/test_attention.py +42 -0
- interpkit-0.1.0/tests/test_attribute.py +15 -0
- interpkit-0.1.0/tests/test_cache.py +66 -0
- interpkit-0.1.0/tests/test_diff.py +23 -0
- interpkit-0.1.0/tests/test_discovery.py +61 -0
- interpkit-0.1.0/tests/test_html.py +133 -0
- interpkit-0.1.0/tests/test_inspect.py +51 -0
- interpkit-0.1.0/tests/test_lens.py +25 -0
- interpkit-0.1.0/tests/test_patch.py +32 -0
- interpkit-0.1.0/tests/test_plots.py +58 -0
- interpkit-0.1.0/tests/test_probe.py +34 -0
- interpkit-0.1.0/tests/test_registry.py +30 -0
- interpkit-0.1.0/tests/test_sae.py +115 -0
- interpkit-0.1.0/tests/test_steer.py +30 -0
- interpkit-0.1.0/tests/test_tl_compat.py +171 -0
- interpkit-0.1.0/tests/test_trace.py +37 -0
interpkit-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Davide Zani
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
interpkit-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: interpkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Mech interp for any HuggingFace model.
|
|
5
|
+
Author: Davide Zani
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/davidezani/InterpKit
|
|
8
|
+
Project-URL: Repository, https://github.com/davidezani/InterpKit
|
|
9
|
+
Project-URL: Issues, https://github.com/davidezani/InterpKit/issues
|
|
10
|
+
Keywords: mechanistic-interpretability,pytorch,transformers,mech-interp,interpretability
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: torch>=2.1
|
|
23
|
+
Requires-Dist: transformers>=4.36
|
|
24
|
+
Requires-Dist: nnsight>=0.3
|
|
25
|
+
Requires-Dist: rich>=13.0
|
|
26
|
+
Requires-Dist: typer>=0.9
|
|
27
|
+
Requires-Dist: Pillow>=10.0
|
|
28
|
+
Requires-Dist: matplotlib>=3.8
|
|
29
|
+
Requires-Dist: huggingface-hub>=0.20
|
|
30
|
+
Provides-Extra: probe
|
|
31
|
+
Requires-Dist: scikit-learn>=1.3; extra == "probe"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-timeout>=2.2; extra == "dev"
|
|
35
|
+
Requires-Dist: scikit-learn>=1.3; extra == "dev"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
# InterpKit
|
|
39
|
+
|
|
40
|
+
> Mech interp for any HuggingFace model.
|
|
41
|
+
|
|
42
|
+
[](https://pypi.org/project/interpkit/)
|
|
43
|
+
[](https://opensource.org/licenses/MIT)
|
|
44
|
+
[](https://www.python.org/downloads/)
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## The Problem
|
|
49
|
+
|
|
50
|
+
TransformerLens is excellent — but only works on GPT-style decoder-only transformers. The moment you step outside that (Mamba, SSMs, ViT, CNNs, BERT, T5, MoE models), there is no equivalent tool. You write hook code from scratch every time.
|
|
51
|
+
|
|
52
|
+
InterpKit fills this gap: the same standard mech interp operations, on any HuggingFace model, with no annotation required.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Install
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install interpkit
|
|
60
|
+
|
|
61
|
+
# For linear probe support:
|
|
62
|
+
pip install "interpkit[probe]"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Or install from source for development:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
git clone https://github.com/davidezani/InterpKit.git
|
|
69
|
+
cd InterpKit
|
|
70
|
+
pip install -e ".[dev]"
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Quickstart
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
import interpkit
|
|
79
|
+
|
|
80
|
+
model = interpkit.load("gpt2")
|
|
81
|
+
|
|
82
|
+
model.inspect() # module tree with roles, params, shapes
|
|
83
|
+
model.trace("...Paris...", "...Rome...", top_k=20) # causal tracing
|
|
84
|
+
model.patch("...Paris...", "...Rome...", at="transformer.h.8.mlp")
|
|
85
|
+
model.lens("The capital of France is") # logit lens
|
|
86
|
+
model.attribute("The capital of France is") # gradient saliency
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Works the same on any HF architecture:
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
model = interpkit.load("state-spaces/mamba-370m")
|
|
93
|
+
model = interpkit.load("google/vit-base-patch16-224")
|
|
94
|
+
model = interpkit.load("bert-base-uncased")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Operations
|
|
100
|
+
|
|
101
|
+
| Operation | What it does | Works on |
|
|
102
|
+
|-----------|-------------|----------|
|
|
103
|
+
| `inspect` | Module tree with types, param counts, shapes | Any model |
|
|
104
|
+
| `patch` | Activation patching at a named module | Any model |
|
|
105
|
+
| `trace` | Causal tracing across modules, ranked by effect | Any model |
|
|
106
|
+
| `attribute` | Gradient saliency over inputs | Any model |
|
|
107
|
+
| `lens` | Logit lens — project activations to vocabulary | LMs (auto-detected) |
|
|
108
|
+
| `activations` | Extract raw activation tensors at any module | Any model |
|
|
109
|
+
| `ablate` | Zero/mean ablate a component and measure effect | Any model |
|
|
110
|
+
| `attention` | Visualize attention patterns per layer/head | Transformers |
|
|
111
|
+
| `steer` | Extract and apply steering vectors | Any model |
|
|
112
|
+
| `probe` | Linear probe on activations | Any model |
|
|
113
|
+
| `diff` | Compare activations between two models | Any model |
|
|
114
|
+
| `features` | SAE feature decomposition | Any model |
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Activations, Ablation, Attention
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
# Extract raw activations
|
|
122
|
+
act = model.activations("The capital of France is", at="transformer.h.8.mlp")
|
|
123
|
+
acts = model.activations("...", at=["transformer.h.0", "transformer.h.8.mlp"])
|
|
124
|
+
|
|
125
|
+
# Ablation — zero or mean
|
|
126
|
+
result = model.ablate("The capital of France is", at="transformer.h.8.mlp")
|
|
127
|
+
result = model.ablate("...", at="transformer.h.8.mlp", method="mean")
|
|
128
|
+
|
|
129
|
+
# Attention patterns
|
|
130
|
+
model.attention("The capital of France is") # all layers
|
|
131
|
+
model.attention("The capital of France is", layer=8, head=3) # single head
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Steering
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
# 1. Extract a steering vector
|
|
138
|
+
vector = model.steer_vector("Love", "Hate", at="transformer.h.8")
|
|
139
|
+
|
|
140
|
+
# 2. Apply during inference — side-by-side comparison
|
|
141
|
+
model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Linear Probe
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
result = model.probe(
|
|
148
|
+
texts=["The cat sat", "The dog ran", "A bird flew", "A fish swam"],
|
|
149
|
+
labels=[0, 0, 1, 1],
|
|
150
|
+
at="transformer.h.8",
|
|
151
|
+
)
|
|
152
|
+
print(result["accuracy"])
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Model Diff
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
base = interpkit.load("gpt2")
|
|
159
|
+
finetuned = interpkit.load("my-finetuned-gpt2")
|
|
160
|
+
interpkit.diff(base, finetuned, "The capital of France is")
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## SAE Features
|
|
164
|
+
|
|
165
|
+
Decompose activations into interpretable features using pre-trained Sparse Autoencoders from HuggingFace:
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
model.features(
|
|
169
|
+
"The capital of France is",
|
|
170
|
+
at="transformer.h.8",
|
|
171
|
+
sae="jbloom/GPT2-Small-SAEs-Reformatted",
|
|
172
|
+
)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
No SAELens dependency — weights are loaded directly via `safetensors`.
|
|
176
|
+
|
|
177
|
+
## Activation Cache
|
|
178
|
+
|
|
179
|
+
Avoid redundant forward passes when exploring the same input with multiple operations:
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
model.cache("The capital of France is") # one forward pass, cache all layers
|
|
183
|
+
model.activations("The capital of France is", at="transformer.h.8.mlp") # instant
|
|
184
|
+
model.activations("The capital of France is", at="transformer.h.0.mlp") # instant
|
|
185
|
+
|
|
186
|
+
model.clear_cache() # free memory
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Visualizations
|
|
192
|
+
|
|
193
|
+
Pass `save="path.png"` to export a static matplotlib figure, or `html="path.html"` for an interactive visualization:
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
model.attention("hello world", layer=0, head=0, save="attention.png")
|
|
197
|
+
model.trace("...Paris...", "...Rome...", save="trace.png")
|
|
198
|
+
model.lens("The capital of France is", save="lens.png")
|
|
199
|
+
model.steer("The weather is", vector=vector, at="transformer.h.8", save="steer.png")
|
|
200
|
+
model.attribute("The capital of France is", save="attribution.png")
|
|
201
|
+
interpkit.diff(base, finetuned, "...", save="diff.png")
|
|
202
|
+
|
|
203
|
+
# Interactive HTML — self-contained files with hover tooltips, filters, and sliders
|
|
204
|
+
model.attention("hello world", html="attention.html")
|
|
205
|
+
model.trace("...Paris...", "...Rome...", html="trace.html")
|
|
206
|
+
model.attribute("The capital of France is", html="attribution.html")
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## CLI
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
interpkit inspect gpt2
|
|
215
|
+
interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --top-k 20
|
|
216
|
+
interpkit lens gpt2 "The capital of France is"
|
|
217
|
+
interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
|
|
218
|
+
interpkit steer gpt2 "The weather is" --positive Love --negative Hate --at transformer.h.8
|
|
219
|
+
interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
|
|
220
|
+
interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
|
|
221
|
+
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jbloom/GPT2-Small-SAEs-Reformatted
|
|
222
|
+
|
|
223
|
+
# Interactive HTML output
|
|
224
|
+
interpkit attention gpt2 "hello world" --html attention.html
|
|
225
|
+
interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --html trace.html
|
|
226
|
+
interpkit attribute gpt2 "The capital of France is" --html attribution.html
|
|
227
|
+
|
|
228
|
+
# Vision models — auto-preprocessed
|
|
229
|
+
interpkit attribute microsoft/resnet-50 cat.jpg --target 281
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Run `interpkit` with no arguments for a full command reference.
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## TransformerLens interop
|
|
237
|
+
|
|
238
|
+
Already using TransformerLens? Pass your `HookedTransformer` directly into InterpKit — it auto-detects the model and extracts the tokenizer:
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
from transformer_lens import HookedTransformer
|
|
242
|
+
import interpkit
|
|
243
|
+
|
|
244
|
+
tl_model = HookedTransformer.from_pretrained("gpt2")
|
|
245
|
+
model = interpkit.load(tl_model)
|
|
246
|
+
|
|
247
|
+
# All InterpKit operations work on TL models
|
|
248
|
+
model.trace("The Eiffel Tower is in Paris", "The Eiffel Tower is in Rome", top_k=20)
|
|
249
|
+
model.attention("The capital of France is", save="attention.png")
|
|
250
|
+
model.steer("The weather is", vector=vector, at="blocks.8", scale=2.0)
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
Translate between native and TL hook point names:
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
interpkit.to_tl_name("transformer.h.8.mlp") # -> "blocks.8.mlp"
|
|
257
|
+
interpkit.to_native_name("blocks.8.attn", model.arch_info) # -> "transformer.h.8.attn"
|
|
258
|
+
interpkit.list_tl_hooks(tl_model) # -> ["blocks.0.hook_resid_pre", ...]
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
---
|
|
262
|
+
|
|
263
|
+
## Local models
|
|
264
|
+
|
|
265
|
+
```python
|
|
266
|
+
import torch.nn as nn
|
|
267
|
+
import interpkit
|
|
268
|
+
|
|
269
|
+
my_model = MyCustomModel()
|
|
270
|
+
interpkit.register(my_model, layers=["blocks.0", "blocks.1"], output_head="head")
|
|
271
|
+
model = interpkit.load(my_model, tokenizer=my_tokenizer)
|
|
272
|
+
model.trace(input_a, input_b, top_k=10)
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## Examples
|
|
278
|
+
|
|
279
|
+
See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
280
|
+
|
|
281
|
+
| Notebook | Topics |
|
|
282
|
+
|----------|--------|
|
|
283
|
+
| `01_quickstart` | Inspect, trace, lens, attribution, patching, ablation |
|
|
284
|
+
| `02_attention_patterns` | Per-head heatmaps, layer filtering, HTML export |
|
|
285
|
+
| `03_steering_vectors` | Extract and apply steering vectors at different layers/scales |
|
|
286
|
+
| `04_sae_features` | Sparse Autoencoder feature decomposition |
|
|
287
|
+
| `05_caching_and_probing` | Activation cache, linear probes across layers |
|
|
288
|
+
| `06_model_comparison` | Diff two models, side-by-side tracing and logit lens |
|
|
289
|
+
| `07_vision_models` | ResNet/ViT attribution, ablation, activations |
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
## License
|
|
294
|
+
|
|
295
|
+
MIT
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# InterpKit
|
|
2
|
+
|
|
3
|
+
> Mech interp for any HuggingFace model.
|
|
4
|
+
|
|
5
|
+
[](https://pypi.org/project/interpkit/)
|
|
6
|
+
[](https://opensource.org/licenses/MIT)
|
|
7
|
+
[](https://www.python.org/downloads/)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## The Problem
|
|
12
|
+
|
|
13
|
+
TransformerLens is excellent — but only works on GPT-style decoder-only transformers. The moment you step outside that (Mamba, SSMs, ViT, CNNs, BERT, T5, MoE models), there is no equivalent tool. You write hook code from scratch every time.
|
|
14
|
+
|
|
15
|
+
InterpKit fills this gap: the same standard mech interp operations, on any HuggingFace model, with no annotation required.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install interpkit
|
|
23
|
+
|
|
24
|
+
# For linear probe support:
|
|
25
|
+
pip install "interpkit[probe]"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Or install from source for development:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
git clone https://github.com/davidezani/InterpKit.git
|
|
32
|
+
cd InterpKit
|
|
33
|
+
pip install -e ".[dev]"
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Quickstart
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import interpkit
|
|
42
|
+
|
|
43
|
+
model = interpkit.load("gpt2")
|
|
44
|
+
|
|
45
|
+
model.inspect() # module tree with roles, params, shapes
|
|
46
|
+
model.trace("...Paris...", "...Rome...", top_k=20) # causal tracing
|
|
47
|
+
model.patch("...Paris...", "...Rome...", at="transformer.h.8.mlp")
|
|
48
|
+
model.lens("The capital of France is") # logit lens
|
|
49
|
+
model.attribute("The capital of France is") # gradient saliency
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Works the same on any HF architecture:
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
model = interpkit.load("state-spaces/mamba-370m")
|
|
56
|
+
model = interpkit.load("google/vit-base-patch16-224")
|
|
57
|
+
model = interpkit.load("bert-base-uncased")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Operations
|
|
63
|
+
|
|
64
|
+
| Operation | What it does | Works on |
|
|
65
|
+
|-----------|-------------|----------|
|
|
66
|
+
| `inspect` | Module tree with types, param counts, shapes | Any model |
|
|
67
|
+
| `patch` | Activation patching at a named module | Any model |
|
|
68
|
+
| `trace` | Causal tracing across modules, ranked by effect | Any model |
|
|
69
|
+
| `attribute` | Gradient saliency over inputs | Any model |
|
|
70
|
+
| `lens` | Logit lens — project activations to vocabulary | LMs (auto-detected) |
|
|
71
|
+
| `activations` | Extract raw activation tensors at any module | Any model |
|
|
72
|
+
| `ablate` | Zero/mean ablate a component and measure effect | Any model |
|
|
73
|
+
| `attention` | Visualize attention patterns per layer/head | Transformers |
|
|
74
|
+
| `steer` | Extract and apply steering vectors | Any model |
|
|
75
|
+
| `probe` | Linear probe on activations | Any model |
|
|
76
|
+
| `diff` | Compare activations between two models | Any model |
|
|
77
|
+
| `features` | SAE feature decomposition | Any model |
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Activations, Ablation, Attention
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
# Extract raw activations
|
|
85
|
+
act = model.activations("The capital of France is", at="transformer.h.8.mlp")
|
|
86
|
+
acts = model.activations("...", at=["transformer.h.0", "transformer.h.8.mlp"])
|
|
87
|
+
|
|
88
|
+
# Ablation — zero or mean
|
|
89
|
+
result = model.ablate("The capital of France is", at="transformer.h.8.mlp")
|
|
90
|
+
result = model.ablate("...", at="transformer.h.8.mlp", method="mean")
|
|
91
|
+
|
|
92
|
+
# Attention patterns
|
|
93
|
+
model.attention("The capital of France is") # all layers
|
|
94
|
+
model.attention("The capital of France is", layer=8, head=3) # single head
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Steering
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
# 1. Extract a steering vector
|
|
101
|
+
vector = model.steer_vector("Love", "Hate", at="transformer.h.8")
|
|
102
|
+
|
|
103
|
+
# 2. Apply during inference — side-by-side comparison
|
|
104
|
+
model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Linear Probe
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
result = model.probe(
|
|
111
|
+
texts=["The cat sat", "The dog ran", "A bird flew", "A fish swam"],
|
|
112
|
+
labels=[0, 0, 1, 1],
|
|
113
|
+
at="transformer.h.8",
|
|
114
|
+
)
|
|
115
|
+
print(result["accuracy"])
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Model Diff
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
base = interpkit.load("gpt2")
|
|
122
|
+
finetuned = interpkit.load("my-finetuned-gpt2")
|
|
123
|
+
interpkit.diff(base, finetuned, "The capital of France is")
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## SAE Features
|
|
127
|
+
|
|
128
|
+
Decompose activations into interpretable features using pre-trained Sparse Autoencoders from HuggingFace:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
model.features(
|
|
132
|
+
"The capital of France is",
|
|
133
|
+
at="transformer.h.8",
|
|
134
|
+
sae="jbloom/GPT2-Small-SAEs-Reformatted",
|
|
135
|
+
)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
No SAELens dependency — weights are loaded directly via `safetensors`.
|
|
139
|
+
|
|
140
|
+
## Activation Cache
|
|
141
|
+
|
|
142
|
+
Avoid redundant forward passes when exploring the same input with multiple operations:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
model.cache("The capital of France is") # one forward pass, cache all layers
|
|
146
|
+
model.activations("The capital of France is", at="transformer.h.8.mlp") # instant
|
|
147
|
+
model.activations("The capital of France is", at="transformer.h.0.mlp") # instant
|
|
148
|
+
|
|
149
|
+
model.clear_cache() # free memory
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Visualizations
|
|
155
|
+
|
|
156
|
+
Pass `save="path.png"` to export a static matplotlib figure, or `html="path.html"` for an interactive visualization:
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
model.attention("hello world", layer=0, head=0, save="attention.png")
|
|
160
|
+
model.trace("...Paris...", "...Rome...", save="trace.png")
|
|
161
|
+
model.lens("The capital of France is", save="lens.png")
|
|
162
|
+
model.steer("The weather is", vector=vector, at="transformer.h.8", save="steer.png")
|
|
163
|
+
model.attribute("The capital of France is", save="attribution.png")
|
|
164
|
+
interpkit.diff(base, finetuned, "...", save="diff.png")
|
|
165
|
+
|
|
166
|
+
# Interactive HTML — self-contained files with hover tooltips, filters, and sliders
|
|
167
|
+
model.attention("hello world", html="attention.html")
|
|
168
|
+
model.trace("...Paris...", "...Rome...", html="trace.html")
|
|
169
|
+
model.attribute("The capital of France is", html="attribution.html")
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## CLI
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
interpkit inspect gpt2
|
|
178
|
+
interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --top-k 20
|
|
179
|
+
interpkit lens gpt2 "The capital of France is"
|
|
180
|
+
interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
|
|
181
|
+
interpkit steer gpt2 "The weather is" --positive Love --negative Hate --at transformer.h.8
|
|
182
|
+
interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
|
|
183
|
+
interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
|
|
184
|
+
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jbloom/GPT2-Small-SAEs-Reformatted
|
|
185
|
+
|
|
186
|
+
# Interactive HTML output
|
|
187
|
+
interpkit attention gpt2 "hello world" --html attention.html
|
|
188
|
+
interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --html trace.html
|
|
189
|
+
interpkit attribute gpt2 "The capital of France is" --html attribution.html
|
|
190
|
+
|
|
191
|
+
# Vision models — auto-preprocessed
|
|
192
|
+
interpkit attribute microsoft/resnet-50 cat.jpg --target 281
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Run `interpkit` with no arguments for a full command reference.
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## TransformerLens interop
|
|
200
|
+
|
|
201
|
+
Already using TransformerLens? Pass your `HookedTransformer` directly into InterpKit — it auto-detects the model and extracts the tokenizer:
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from transformer_lens import HookedTransformer
|
|
205
|
+
import interpkit
|
|
206
|
+
|
|
207
|
+
tl_model = HookedTransformer.from_pretrained("gpt2")
|
|
208
|
+
model = interpkit.load(tl_model)
|
|
209
|
+
|
|
210
|
+
# All InterpKit operations work on TL models
|
|
211
|
+
model.trace("The Eiffel Tower is in Paris", "The Eiffel Tower is in Rome", top_k=20)
|
|
212
|
+
model.attention("The capital of France is", save="attention.png")
|
|
213
|
+
model.steer("The weather is", vector=vector, at="blocks.8", scale=2.0)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Translate between native and TL hook point names:
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
interpkit.to_tl_name("transformer.h.8.mlp") # -> "blocks.8.mlp"
|
|
220
|
+
interpkit.to_native_name("blocks.8.attn", model.arch_info) # -> "transformer.h.8.attn"
|
|
221
|
+
interpkit.list_tl_hooks(tl_model) # -> ["blocks.0.hook_resid_pre", ...]
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Local models
|
|
227
|
+
|
|
228
|
+
```python
|
|
229
|
+
import torch.nn as nn
|
|
230
|
+
import interpkit
|
|
231
|
+
|
|
232
|
+
my_model = MyCustomModel()
|
|
233
|
+
interpkit.register(my_model, layers=["blocks.0", "blocks.1"], output_head="head")
|
|
234
|
+
model = interpkit.load(my_model, tokenizer=my_tokenizer)
|
|
235
|
+
model.trace(input_a, input_b, top_k=10)
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## Examples
|
|
241
|
+
|
|
242
|
+
See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
243
|
+
|
|
244
|
+
| Notebook | Topics |
|
|
245
|
+
|----------|--------|
|
|
246
|
+
| `01_quickstart` | Inspect, trace, lens, attribution, patching, ablation |
|
|
247
|
+
| `02_attention_patterns` | Per-head heatmaps, layer filtering, HTML export |
|
|
248
|
+
| `03_steering_vectors` | Extract and apply steering vectors at different layers/scales |
|
|
249
|
+
| `04_sae_features` | Sparse Autoencoder feature decomposition |
|
|
250
|
+
| `05_caching_and_probing` | Activation cache, linear probes across layers |
|
|
251
|
+
| `06_model_comparison` | Diff two models, side-by-side tracing and logit lens |
|
|
252
|
+
| `07_vision_models` | ResNet/ViT attribution, ablation, activations |
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## License
|
|
257
|
+
|
|
258
|
+
MIT
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""interpkit — mech interp for any HuggingFace model."""
|
|
2
|
+
|
|
3
|
+
from interpkit.core.model import load
|
|
4
|
+
from interpkit.core.registry import register
|
|
5
|
+
from interpkit.core.tl_compat import list_tl_hooks, to_native_name, to_tl_name
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def diff(model_a, model_b, input_data, *, save=None):
|
|
9
|
+
"""Compare activations between two models on the same input."""
|
|
10
|
+
from interpkit.ops.diff import run_diff
|
|
11
|
+
|
|
12
|
+
return run_diff(model_a, model_b, input_data, save=save)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
__all__ = ["load", "register", "diff", "to_tl_name", "to_native_name", "list_tl_hooks"]
|
|
File without changes
|