interpkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. interpkit-0.1.0/LICENSE +21 -0
  2. interpkit-0.1.0/PKG-INFO +295 -0
  3. interpkit-0.1.0/README.md +258 -0
  4. interpkit-0.1.0/interpkit/__init__.py +15 -0
  5. interpkit-0.1.0/interpkit/cli/__init__.py +0 -0
  6. interpkit-0.1.0/interpkit/cli/main.py +337 -0
  7. interpkit-0.1.0/interpkit/core/__init__.py +0 -0
  8. interpkit-0.1.0/interpkit/core/discovery.py +228 -0
  9. interpkit-0.1.0/interpkit/core/html.py +375 -0
  10. interpkit-0.1.0/interpkit/core/inputs.py +117 -0
  11. interpkit-0.1.0/interpkit/core/model.py +551 -0
  12. interpkit-0.1.0/interpkit/core/plot.py +352 -0
  13. interpkit-0.1.0/interpkit/core/registry.py +82 -0
  14. interpkit-0.1.0/interpkit/core/render.py +465 -0
  15. interpkit-0.1.0/interpkit/core/tl_compat.py +174 -0
  16. interpkit-0.1.0/interpkit/ops/__init__.py +0 -0
  17. interpkit-0.1.0/interpkit/ops/ablate.py +90 -0
  18. interpkit-0.1.0/interpkit/ops/activations.py +67 -0
  19. interpkit-0.1.0/interpkit/ops/attention.py +234 -0
  20. interpkit-0.1.0/interpkit/ops/attribute.py +206 -0
  21. interpkit-0.1.0/interpkit/ops/diff.py +79 -0
  22. interpkit-0.1.0/interpkit/ops/inspect.py +14 -0
  23. interpkit-0.1.0/interpkit/ops/lens.py +151 -0
  24. interpkit-0.1.0/interpkit/ops/patch.py +112 -0
  25. interpkit-0.1.0/interpkit/ops/probe.py +128 -0
  26. interpkit-0.1.0/interpkit/ops/sae.py +212 -0
  27. interpkit-0.1.0/interpkit/ops/steer.py +118 -0
  28. interpkit-0.1.0/interpkit/ops/trace.py +182 -0
  29. interpkit-0.1.0/interpkit.egg-info/PKG-INFO +295 -0
  30. interpkit-0.1.0/interpkit.egg-info/SOURCES.txt +52 -0
  31. interpkit-0.1.0/interpkit.egg-info/dependency_links.txt +1 -0
  32. interpkit-0.1.0/interpkit.egg-info/entry_points.txt +2 -0
  33. interpkit-0.1.0/interpkit.egg-info/requires.txt +16 -0
  34. interpkit-0.1.0/interpkit.egg-info/top_level.txt +1 -0
  35. interpkit-0.1.0/pyproject.toml +58 -0
  36. interpkit-0.1.0/setup.cfg +4 -0
  37. interpkit-0.1.0/tests/test_ablate.py +23 -0
  38. interpkit-0.1.0/tests/test_activations.py +34 -0
  39. interpkit-0.1.0/tests/test_attention.py +42 -0
  40. interpkit-0.1.0/tests/test_attribute.py +15 -0
  41. interpkit-0.1.0/tests/test_cache.py +66 -0
  42. interpkit-0.1.0/tests/test_diff.py +23 -0
  43. interpkit-0.1.0/tests/test_discovery.py +61 -0
  44. interpkit-0.1.0/tests/test_html.py +133 -0
  45. interpkit-0.1.0/tests/test_inspect.py +51 -0
  46. interpkit-0.1.0/tests/test_lens.py +25 -0
  47. interpkit-0.1.0/tests/test_patch.py +32 -0
  48. interpkit-0.1.0/tests/test_plots.py +58 -0
  49. interpkit-0.1.0/tests/test_probe.py +34 -0
  50. interpkit-0.1.0/tests/test_registry.py +30 -0
  51. interpkit-0.1.0/tests/test_sae.py +115 -0
  52. interpkit-0.1.0/tests/test_steer.py +30 -0
  53. interpkit-0.1.0/tests/test_tl_compat.py +171 -0
  54. interpkit-0.1.0/tests/test_trace.py +37 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Davide Zani
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,295 @@
1
+ Metadata-Version: 2.4
2
+ Name: interpkit
3
+ Version: 0.1.0
4
+ Summary: Mech interp for any HuggingFace model.
5
+ Author: Davide Zani
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/davidezani/InterpKit
8
+ Project-URL: Repository, https://github.com/davidezani/InterpKit
9
+ Project-URL: Issues, https://github.com/davidezani/InterpKit/issues
10
+ Keywords: mechanistic-interpretability,pytorch,transformers,mech-interp,interpretability
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: torch>=2.1
23
+ Requires-Dist: transformers>=4.36
24
+ Requires-Dist: nnsight>=0.3
25
+ Requires-Dist: rich>=13.0
26
+ Requires-Dist: typer>=0.9
27
+ Requires-Dist: Pillow>=10.0
28
+ Requires-Dist: matplotlib>=3.8
29
+ Requires-Dist: huggingface-hub>=0.20
30
+ Provides-Extra: probe
31
+ Requires-Dist: scikit-learn>=1.3; extra == "probe"
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0; extra == "dev"
34
+ Requires-Dist: pytest-timeout>=2.2; extra == "dev"
35
+ Requires-Dist: scikit-learn>=1.3; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # InterpKit
39
+
40
+ > Mech interp for any HuggingFace model.
41
+
42
+ [![PyPI version](https://img.shields.io/pypi/v/interpkit.svg)](https://pypi.org/project/interpkit/)
43
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
44
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
45
+
46
+ ---
47
+
48
+ ## The Problem
49
+
50
+ TransformerLens is excellent — but only works on GPT-style decoder-only transformers. The moment you step outside that (Mamba, SSMs, ViT, CNNs, BERT, T5, MoE models), there is no equivalent tool. You write hook code from scratch every time.
51
+
52
+ InterpKit fills this gap: the same standard mech interp operations, on any HuggingFace model, with no annotation required.
53
+
54
+ ---
55
+
56
+ ## Install
57
+
58
+ ```bash
59
+ pip install interpkit
60
+
61
+ # For linear probe support:
62
+ pip install "interpkit[probe]"
63
+ ```
64
+
65
+ Or install from source for development:
66
+
67
+ ```bash
68
+ git clone https://github.com/davidezani/InterpKit.git
69
+ cd InterpKit
70
+ pip install -e ".[dev]"
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Quickstart
76
+
77
+ ```python
78
+ import interpkit
79
+
80
+ model = interpkit.load("gpt2")
81
+
82
+ model.inspect() # module tree with roles, params, shapes
83
+ model.trace("...Paris...", "...Rome...", top_k=20) # causal tracing
84
+ model.patch("...Paris...", "...Rome...", at="transformer.h.8.mlp")
85
+ model.lens("The capital of France is") # logit lens
86
+ model.attribute("The capital of France is") # gradient saliency
87
+ ```
88
+
89
+ Works the same on any HF architecture:
90
+
91
+ ```python
92
+ model = interpkit.load("state-spaces/mamba-370m")
93
+ model = interpkit.load("google/vit-base-patch16-224")
94
+ model = interpkit.load("bert-base-uncased")
95
+ ```
96
+
97
+ ---
98
+
99
+ ## Operations
100
+
101
+ | Operation | What it does | Works on |
102
+ |-----------|-------------|----------|
103
+ | `inspect` | Module tree with types, param counts, shapes | Any model |
104
+ | `patch` | Activation patching at a named module | Any model |
105
+ | `trace` | Causal tracing across modules, ranked by effect | Any model |
106
+ | `attribute` | Gradient saliency over inputs | Any model |
107
+ | `lens` | Logit lens — project activations to vocabulary | LMs (auto-detected) |
108
+ | `activations` | Extract raw activation tensors at any module | Any model |
109
+ | `ablate` | Zero/mean ablate a component and measure effect | Any model |
110
+ | `attention` | Visualize attention patterns per layer/head | Transformers |
111
+ | `steer` | Extract and apply steering vectors | Any model |
112
+ | `probe` | Linear probe on activations | Any model |
113
+ | `diff` | Compare activations between two models | Any model |
114
+ | `features` | SAE feature decomposition | Any model |
115
+
116
+ ---
117
+
118
+ ## Activations, Ablation, Attention
119
+
120
+ ```python
121
+ # Extract raw activations
122
+ act = model.activations("The capital of France is", at="transformer.h.8.mlp")
123
+ acts = model.activations("...", at=["transformer.h.0", "transformer.h.8.mlp"])
124
+
125
+ # Ablation — zero or mean
126
+ result = model.ablate("The capital of France is", at="transformer.h.8.mlp")
127
+ result = model.ablate("...", at="transformer.h.8.mlp", method="mean")
128
+
129
+ # Attention patterns
130
+ model.attention("The capital of France is") # all layers
131
+ model.attention("The capital of France is", layer=8, head=3) # single head
132
+ ```
133
+
134
+ ## Steering
135
+
136
+ ```python
137
+ # 1. Extract a steering vector
138
+ vector = model.steer_vector("Love", "Hate", at="transformer.h.8")
139
+
140
+ # 2. Apply during inference — side-by-side comparison
141
+ model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
142
+ ```
143
+
144
+ ## Linear Probe
145
+
146
+ ```python
147
+ result = model.probe(
148
+ texts=["The cat sat", "The dog ran", "A bird flew", "A fish swam"],
149
+ labels=[0, 0, 1, 1],
150
+ at="transformer.h.8",
151
+ )
152
+ print(result["accuracy"])
153
+ ```
154
+
155
+ ## Model Diff
156
+
157
+ ```python
158
+ base = interpkit.load("gpt2")
159
+ finetuned = interpkit.load("my-finetuned-gpt2")
160
+ interpkit.diff(base, finetuned, "The capital of France is")
161
+ ```
162
+
163
+ ## SAE Features
164
+
165
+ Decompose activations into interpretable features using pre-trained Sparse Autoencoders from HuggingFace:
166
+
167
+ ```python
168
+ model.features(
169
+ "The capital of France is",
170
+ at="transformer.h.8",
171
+ sae="jbloom/GPT2-Small-SAEs-Reformatted",
172
+ )
173
+ ```
174
+
175
+ No SAELens dependency — weights are loaded directly via `safetensors`.
176
+
177
+ ## Activation Cache
178
+
179
+ Avoid redundant forward passes when exploring the same input with multiple operations:
180
+
181
+ ```python
182
+ model.cache("The capital of France is") # one forward pass, cache all layers
183
+ model.activations("The capital of France is", at="transformer.h.8.mlp") # instant
184
+ model.activations("The capital of France is", at="transformer.h.0.mlp") # instant
185
+
186
+ model.clear_cache() # free memory
187
+ ```
188
+
189
+ ---
190
+
191
+ ## Visualizations
192
+
193
+ Pass `save="path.png"` to export a static matplotlib figure, or `html="path.html"` for an interactive visualization:
194
+
195
+ ```python
196
+ model.attention("hello world", layer=0, head=0, save="attention.png")
197
+ model.trace("...Paris...", "...Rome...", save="trace.png")
198
+ model.lens("The capital of France is", save="lens.png")
199
+ model.steer("The weather is", vector=vector, at="transformer.h.8", save="steer.png")
200
+ model.attribute("The capital of France is", save="attribution.png")
201
+ interpkit.diff(base, finetuned, "...", save="diff.png")
202
+
203
+ # Interactive HTML — self-contained files with hover tooltips, filters, and sliders
204
+ model.attention("hello world", html="attention.html")
205
+ model.trace("...Paris...", "...Rome...", html="trace.html")
206
+ model.attribute("The capital of France is", html="attribution.html")
207
+ ```
208
+
209
+ ---
210
+
211
+ ## CLI
212
+
213
+ ```bash
214
+ interpkit inspect gpt2
215
+ interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --top-k 20
216
+ interpkit lens gpt2 "The capital of France is"
217
+ interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
218
+ interpkit steer gpt2 "The weather is" --positive Love --negative Hate --at transformer.h.8
219
+ interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
220
+ interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
221
+ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jbloom/GPT2-Small-SAEs-Reformatted
222
+
223
+ # Interactive HTML output
224
+ interpkit attention gpt2 "hello world" --html attention.html
225
+ interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --html trace.html
226
+ interpkit attribute gpt2 "The capital of France is" --html attribution.html
227
+
228
+ # Vision models — auto-preprocessed
229
+ interpkit attribute microsoft/resnet-50 cat.jpg --target 281
230
+ ```
231
+
232
+ Run `interpkit` with no arguments for a full command reference.
233
+
234
+ ---
235
+
236
+ ## TransformerLens interop
237
+
238
+ Already using TransformerLens? Pass your `HookedTransformer` directly into InterpKit — it auto-detects the model and extracts the tokenizer:
239
+
240
+ ```python
241
+ from transformer_lens import HookedTransformer
242
+ import interpkit
243
+
244
+ tl_model = HookedTransformer.from_pretrained("gpt2")
245
+ model = interpkit.load(tl_model)
246
+
247
+ # All InterpKit operations work on TL models
248
+ model.trace("The Eiffel Tower is in Paris", "The Eiffel Tower is in Rome", top_k=20)
249
+ model.attention("The capital of France is", save="attention.png")
250
+ model.steer("The weather is", vector=vector, at="blocks.8", scale=2.0)
251
+ ```
252
+
253
+ Translate between native and TL hook point names:
254
+
255
+ ```python
256
+ interpkit.to_tl_name("transformer.h.8.mlp") # -> "blocks.8.mlp"
257
+ interpkit.to_native_name("blocks.8.attn", model.arch_info) # -> "transformer.h.8.attn"
258
+ interpkit.list_tl_hooks(tl_model) # -> ["blocks.0.hook_resid_pre", ...]
259
+ ```
260
+
261
+ ---
262
+
263
+ ## Local models
264
+
265
+ ```python
266
+ import torch.nn as nn
267
+ import interpkit
268
+
269
+ my_model = MyCustomModel()
270
+ interpkit.register(my_model, layers=["blocks.0", "blocks.1"], output_head="head")
271
+ model = interpkit.load(my_model, tokenizer=my_tokenizer)
272
+ model.trace(input_a, input_b, top_k=10)
273
+ ```
274
+
275
+ ---
276
+
277
+ ## Examples
278
+
279
+ See the [`examples/`](examples/) directory for Jupyter notebooks:
280
+
281
+ | Notebook | Topics |
282
+ |----------|--------|
283
+ | `01_quickstart` | Inspect, trace, lens, attribution, patching, ablation |
284
+ | `02_attention_patterns` | Per-head heatmaps, layer filtering, HTML export |
285
+ | `03_steering_vectors` | Extract and apply steering vectors at different layers/scales |
286
+ | `04_sae_features` | Sparse Autoencoder feature decomposition |
287
+ | `05_caching_and_probing` | Activation cache, linear probes across layers |
288
+ | `06_model_comparison` | Diff two models, side-by-side tracing and logit lens |
289
+ | `07_vision_models` | ResNet/ViT attribution, ablation, activations |
290
+
291
+ ---
292
+
293
+ ## License
294
+
295
+ MIT
@@ -0,0 +1,258 @@
1
+ # InterpKit
2
+
3
+ > Mech interp for any HuggingFace model.
4
+
5
+ [![PyPI version](https://img.shields.io/pypi/v/interpkit.svg)](https://pypi.org/project/interpkit/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
8
+
9
+ ---
10
+
11
+ ## The Problem
12
+
13
+ TransformerLens is excellent — but only works on GPT-style decoder-only transformers. The moment you step outside that (Mamba, SSMs, ViT, CNNs, BERT, T5, MoE models), there is no equivalent tool. You write hook code from scratch every time.
14
+
15
+ InterpKit fills this gap: the same standard mech interp operations, on any HuggingFace model, with no annotation required.
16
+
17
+ ---
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install interpkit
23
+
24
+ # For linear probe support:
25
+ pip install "interpkit[probe]"
26
+ ```
27
+
28
+ Or install from source for development:
29
+
30
+ ```bash
31
+ git clone https://github.com/davidezani/InterpKit.git
32
+ cd InterpKit
33
+ pip install -e ".[dev]"
34
+ ```
35
+
36
+ ---
37
+
38
+ ## Quickstart
39
+
40
+ ```python
41
+ import interpkit
42
+
43
+ model = interpkit.load("gpt2")
44
+
45
+ model.inspect() # module tree with roles, params, shapes
46
+ model.trace("...Paris...", "...Rome...", top_k=20) # causal tracing
47
+ model.patch("...Paris...", "...Rome...", at="transformer.h.8.mlp")
48
+ model.lens("The capital of France is") # logit lens
49
+ model.attribute("The capital of France is") # gradient saliency
50
+ ```
51
+
52
+ Works the same on any HF architecture:
53
+
54
+ ```python
55
+ model = interpkit.load("state-spaces/mamba-370m")
56
+ model = interpkit.load("google/vit-base-patch16-224")
57
+ model = interpkit.load("bert-base-uncased")
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Operations
63
+
64
+ | Operation | What it does | Works on |
65
+ |-----------|-------------|----------|
66
+ | `inspect` | Module tree with types, param counts, shapes | Any model |
67
+ | `patch` | Activation patching at a named module | Any model |
68
+ | `trace` | Causal tracing across modules, ranked by effect | Any model |
69
+ | `attribute` | Gradient saliency over inputs | Any model |
70
+ | `lens` | Logit lens — project activations to vocabulary | LMs (auto-detected) |
71
+ | `activations` | Extract raw activation tensors at any module | Any model |
72
+ | `ablate` | Zero/mean ablate a component and measure effect | Any model |
73
+ | `attention` | Visualize attention patterns per layer/head | Transformers |
74
+ | `steer` | Extract and apply steering vectors | Any model |
75
+ | `probe` | Linear probe on activations | Any model |
76
+ | `diff` | Compare activations between two models | Any model |
77
+ | `features` | SAE feature decomposition | Any model |
78
+
79
+ ---
80
+
81
+ ## Activations, Ablation, Attention
82
+
83
+ ```python
84
+ # Extract raw activations
85
+ act = model.activations("The capital of France is", at="transformer.h.8.mlp")
86
+ acts = model.activations("...", at=["transformer.h.0", "transformer.h.8.mlp"])
87
+
88
+ # Ablation — zero or mean
89
+ result = model.ablate("The capital of France is", at="transformer.h.8.mlp")
90
+ result = model.ablate("...", at="transformer.h.8.mlp", method="mean")
91
+
92
+ # Attention patterns
93
+ model.attention("The capital of France is") # all layers
94
+ model.attention("The capital of France is", layer=8, head=3) # single head
95
+ ```
96
+
97
+ ## Steering
98
+
99
+ ```python
100
+ # 1. Extract a steering vector
101
+ vector = model.steer_vector("Love", "Hate", at="transformer.h.8")
102
+
103
+ # 2. Apply during inference — side-by-side comparison
104
+ model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
105
+ ```
106
+
107
+ ## Linear Probe
108
+
109
+ ```python
110
+ result = model.probe(
111
+ texts=["The cat sat", "The dog ran", "A bird flew", "A fish swam"],
112
+ labels=[0, 0, 1, 1],
113
+ at="transformer.h.8",
114
+ )
115
+ print(result["accuracy"])
116
+ ```
117
+
118
+ ## Model Diff
119
+
120
+ ```python
121
+ base = interpkit.load("gpt2")
122
+ finetuned = interpkit.load("my-finetuned-gpt2")
123
+ interpkit.diff(base, finetuned, "The capital of France is")
124
+ ```
125
+
126
+ ## SAE Features
127
+
128
+ Decompose activations into interpretable features using pre-trained Sparse Autoencoders from HuggingFace:
129
+
130
+ ```python
131
+ model.features(
132
+ "The capital of France is",
133
+ at="transformer.h.8",
134
+ sae="jbloom/GPT2-Small-SAEs-Reformatted",
135
+ )
136
+ ```
137
+
138
+ No SAELens dependency — weights are loaded directly via `safetensors`.
139
+
140
+ ## Activation Cache
141
+
142
+ Avoid redundant forward passes when exploring the same input with multiple operations:
143
+
144
+ ```python
145
+ model.cache("The capital of France is") # one forward pass, cache all layers
146
+ model.activations("The capital of France is", at="transformer.h.8.mlp") # instant
147
+ model.activations("The capital of France is", at="transformer.h.0.mlp") # instant
148
+
149
+ model.clear_cache() # free memory
150
+ ```
151
+
152
+ ---
153
+
154
+ ## Visualizations
155
+
156
+ Pass `save="path.png"` to export a static matplotlib figure, or `html="path.html"` for an interactive visualization:
157
+
158
+ ```python
159
+ model.attention("hello world", layer=0, head=0, save="attention.png")
160
+ model.trace("...Paris...", "...Rome...", save="trace.png")
161
+ model.lens("The capital of France is", save="lens.png")
162
+ model.steer("The weather is", vector=vector, at="transformer.h.8", save="steer.png")
163
+ model.attribute("The capital of France is", save="attribution.png")
164
+ interpkit.diff(base, finetuned, "...", save="diff.png")
165
+
166
+ # Interactive HTML — self-contained files with hover tooltips, filters, and sliders
167
+ model.attention("hello world", html="attention.html")
168
+ model.trace("...Paris...", "...Rome...", html="trace.html")
169
+ model.attribute("The capital of France is", html="attribution.html")
170
+ ```
171
+
172
+ ---
173
+
174
+ ## CLI
175
+
176
+ ```bash
177
+ interpkit inspect gpt2
178
+ interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --top-k 20
179
+ interpkit lens gpt2 "The capital of France is"
180
+ interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
181
+ interpkit steer gpt2 "The weather is" --positive Love --negative Hate --at transformer.h.8
182
+ interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
183
+ interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
184
+ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jbloom/GPT2-Small-SAEs-Reformatted
185
+
186
+ # Interactive HTML output
187
+ interpkit attention gpt2 "hello world" --html attention.html
188
+ interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --html trace.html
189
+ interpkit attribute gpt2 "The capital of France is" --html attribution.html
190
+
191
+ # Vision models — auto-preprocessed
192
+ interpkit attribute microsoft/resnet-50 cat.jpg --target 281
193
+ ```
194
+
195
+ Run `interpkit` with no arguments for a full command reference.
196
+
197
+ ---
198
+
199
+ ## TransformerLens interop
200
+
201
+ Already using TransformerLens? Pass your `HookedTransformer` directly into InterpKit — it auto-detects the model and extracts the tokenizer:
202
+
203
+ ```python
204
+ from transformer_lens import HookedTransformer
205
+ import interpkit
206
+
207
+ tl_model = HookedTransformer.from_pretrained("gpt2")
208
+ model = interpkit.load(tl_model)
209
+
210
+ # All InterpKit operations work on TL models
211
+ model.trace("The Eiffel Tower is in Paris", "The Eiffel Tower is in Rome", top_k=20)
212
+ model.attention("The capital of France is", save="attention.png")
213
+ model.steer("The weather is", vector=vector, at="blocks.8", scale=2.0)
214
+ ```
215
+
216
+ Translate between native and TL hook point names:
217
+
218
+ ```python
219
+ interpkit.to_tl_name("transformer.h.8.mlp") # -> "blocks.8.mlp"
220
+ interpkit.to_native_name("blocks.8.attn", model.arch_info) # -> "transformer.h.8.attn"
221
+ interpkit.list_tl_hooks(tl_model) # -> ["blocks.0.hook_resid_pre", ...]
222
+ ```
223
+
224
+ ---
225
+
226
+ ## Local models
227
+
228
+ ```python
229
+ import torch.nn as nn
230
+ import interpkit
231
+
232
+ my_model = MyCustomModel()
233
+ interpkit.register(my_model, layers=["blocks.0", "blocks.1"], output_head="head")
234
+ model = interpkit.load(my_model, tokenizer=my_tokenizer)
235
+ model.trace(input_a, input_b, top_k=10)
236
+ ```
237
+
238
+ ---
239
+
240
+ ## Examples
241
+
242
+ See the [`examples/`](examples/) directory for Jupyter notebooks:
243
+
244
+ | Notebook | Topics |
245
+ |----------|--------|
246
+ | `01_quickstart` | Inspect, trace, lens, attribution, patching, ablation |
247
+ | `02_attention_patterns` | Per-head heatmaps, layer filtering, HTML export |
248
+ | `03_steering_vectors` | Extract and apply steering vectors at different layers/scales |
249
+ | `04_sae_features` | Sparse Autoencoder feature decomposition |
250
+ | `05_caching_and_probing` | Activation cache, linear probes across layers |
251
+ | `06_model_comparison` | Diff two models, side-by-side tracing and logit lens |
252
+ | `07_vision_models` | ResNet/ViT attribution, ablation, activations |
253
+
254
+ ---
255
+
256
+ ## License
257
+
258
+ MIT
@@ -0,0 +1,15 @@
1
+ """interpkit — mech interp for any HuggingFace model."""
2
+
3
+ from interpkit.core.model import load
4
+ from interpkit.core.registry import register
5
+ from interpkit.core.tl_compat import list_tl_hooks, to_native_name, to_tl_name
6
+
7
+
8
def diff(model_a, model_b, input_data, *, save=None):
    """Compare activations between two models on the same input.

    Parameters
    ----------
    model_a, model_b:
        The two loaded InterpKit models to compare.
    input_data:
        The input (e.g. a prompt string) fed to both models.
    save:
        Optional path for a static figure export; forwarded unchanged.

    Returns
    -------
    Whatever ``interpkit.ops.diff.run_diff`` returns for this pair.
    """
    # Deferred import keeps `import interpkit` lightweight: the ops
    # machinery is only pulled in when diff() is actually called.
    from interpkit.ops import diff as _diff_ops

    return _diff_ops.run_diff(model_a, model_b, input_data, save=save)
13
+
14
+
15
+ __all__ = ["load", "register", "diff", "to_tl_name", "to_native_name", "list_tl_hooks"]
File without changes