interpkit 0.3.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {interpkit-0.3.0 → interpkit-0.5.0}/PKG-INFO +100 -9
- {interpkit-0.3.0 → interpkit-0.5.0}/README.md +96 -7
- interpkit-0.5.0/interpkit/__init__.py +65 -0
- interpkit-0.5.0/interpkit/__main__.py +23 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/cli/main.py +274 -10
- interpkit-0.5.0/interpkit/core/arch/__init__.py +102 -0
- interpkit-0.5.0/interpkit/core/arch/blocks.py +257 -0
- interpkit-0.5.0/interpkit/core/arch/family.py +421 -0
- interpkit-0.5.0/interpkit/core/arch/heads.py +583 -0
- interpkit-0.5.0/interpkit/core/arch/layers.py +462 -0
- interpkit-0.5.0/interpkit/core/arch/names.py +60 -0
- interpkit-0.5.0/interpkit/core/arch/probe.py +241 -0
- interpkit-0.5.0/interpkit/core/arch/residual.py +653 -0
- interpkit-0.5.0/interpkit/core/arch/resolve.py +679 -0
- interpkit-0.5.0/interpkit/core/arch/tree.py +190 -0
- interpkit-0.5.0/interpkit/core/arch/types.py +486 -0
- interpkit-0.5.0/interpkit/core/enums.py +105 -0
- interpkit-0.5.0/interpkit/core/exceptions.py +83 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/html.py +5 -2
- interpkit-0.5.0/interpkit/core/inputs.py +447 -0
- interpkit-0.5.0/interpkit/core/loader.py +704 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/model.py +537 -38
- interpkit-0.5.0/interpkit/core/paths.py +71 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/render.py +74 -18
- interpkit-0.5.0/interpkit/core/support_matrix.py +690 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/theme.py +11 -8
- interpkit-0.5.0/interpkit/core/tl_compat.py +297 -0
- interpkit-0.5.0/interpkit/ops/_atp.py +182 -0
- interpkit-0.5.0/interpkit/ops/_hooks.py +233 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/ablate.py +14 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/activations.py +9 -1
- interpkit-0.5.0/interpkit/ops/attention.py +334 -0
- interpkit-0.5.0/interpkit/ops/attribute.py +844 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/batch.py +4 -4
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/circuits.py +221 -110
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/diff.py +22 -2
- interpkit-0.5.0/interpkit/ops/dla.py +628 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/find_circuit.py +15 -17
- interpkit-0.5.0/interpkit/ops/heads.py +282 -0
- interpkit-0.5.0/interpkit/ops/lens.py +397 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/patch.py +113 -22
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/probe.py +14 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/report.py +55 -10
- interpkit-0.5.0/interpkit/ops/sae.py +739 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/scan.py +28 -6
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/steer.py +59 -2
- interpkit-0.5.0/interpkit/ops/trace.py +502 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/PKG-INFO +100 -9
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/SOURCES.txt +31 -2
- interpkit-0.5.0/interpkit.egg-info/entry_points.txt +2 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/requires.txt +3 -1
- {interpkit-0.3.0 → interpkit-0.5.0}/pyproject.toml +27 -4
- interpkit-0.5.0/tests/test_archinfo_serialization.py +61 -0
- interpkit-0.5.0/tests/test_attention.py +112 -0
- interpkit-0.5.0/tests/test_audit_regressions.py +1891 -0
- interpkit-0.5.0/tests/test_cache_invalidation.py +66 -0
- interpkit-0.5.0/tests/test_capabilities.py +227 -0
- interpkit-0.5.0/tests/test_chat.py +217 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_cli.py +77 -1
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_discovery.py +1 -1
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_discovery_units.py +21 -21
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_error_handling.py +11 -0
- interpkit-0.5.0/tests/test_inputs.py +251 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_invariants.py +22 -8
- interpkit-0.5.0/tests/test_lens.py +53 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_load_params.py +12 -2
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_multi_arch.py +12 -5
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_ops.py +6 -1
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_param_variants.py +4 -2
- interpkit-0.5.0/tests/test_phase3_regressions.py +121 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_regressions.py +5 -2
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_render_internals.py +34 -6
- interpkit-0.5.0/tests/test_resolver.py +268 -0
- interpkit-0.5.0/tests/test_resolver_golden.py +131 -0
- interpkit-0.5.0/tests/test_robustness_audit.py +790 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_sae.py +161 -1
- interpkit-0.5.0/tests/test_seq2seq_contract.py +119 -0
- interpkit-0.5.0/tests/test_steer.py +91 -0
- interpkit-0.5.0/tests/test_trace.py +76 -0
- interpkit-0.5.0/tests/test_validation.py +130 -0
- interpkit-0.3.0/interpkit/__init__.py +0 -27
- interpkit-0.3.0/interpkit/core/discovery.py +0 -810
- interpkit-0.3.0/interpkit/core/inputs.py +0 -130
- interpkit-0.3.0/interpkit/core/loader.py +0 -292
- interpkit-0.3.0/interpkit/core/tl_compat.py +0 -174
- interpkit-0.3.0/interpkit/ops/attention.py +0 -365
- interpkit-0.3.0/interpkit/ops/attribute.py +0 -308
- interpkit-0.3.0/interpkit/ops/dla.py +0 -488
- interpkit-0.3.0/interpkit/ops/heads.py +0 -175
- interpkit-0.3.0/interpkit/ops/lens.py +0 -243
- interpkit-0.3.0/interpkit/ops/sae.py +0 -439
- interpkit-0.3.0/interpkit/ops/trace.py +0 -349
- interpkit-0.3.0/interpkit.egg-info/entry_points.txt +0 -2
- interpkit-0.3.0/tests/test_attention.py +0 -44
- interpkit-0.3.0/tests/test_lens.py +0 -25
- interpkit-0.3.0/tests/test_steer.py +0 -30
- interpkit-0.3.0/tests/test_trace.py +0 -35
- {interpkit-0.3.0 → interpkit-0.5.0}/LICENSE +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/cli/__init__.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/__init__.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/cache.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/plot.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/registry.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/__init__.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/inspect.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/dependency_links.txt +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/top_level.txt +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/setup.cfg +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_ablate.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_activations.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_architectures.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_attribute.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_cache.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_diff.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_html.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_inspect.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_patch.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_plot_internals.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_plots.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_probe.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_registry.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_tl_compat.py +0 -0
- {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_tl_ops.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: interpkit
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Mech interp for any HuggingFace model.
|
|
5
5
|
Author: Davide Zani
|
|
6
6
|
License-Expression: MIT
|
|
@@ -20,7 +20,8 @@ Requires-Python: >=3.10
|
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: torch>=2.1
|
|
23
|
-
Requires-Dist:
|
|
23
|
+
Requires-Dist: numpy>=1.24
|
|
24
|
+
Requires-Dist: transformers<6,>=4.36
|
|
24
25
|
Requires-Dist: safetensors>=0.4
|
|
25
26
|
Requires-Dist: rich>=13.0
|
|
26
27
|
Requires-Dist: rich-gradient>=0.3
|
|
@@ -28,6 +29,7 @@ Requires-Dist: typer>=0.9
|
|
|
28
29
|
Requires-Dist: Pillow>=10.0
|
|
29
30
|
Requires-Dist: matplotlib>=3.8
|
|
30
31
|
Requires-Dist: huggingface-hub>=0.20
|
|
32
|
+
Requires-Dist: sentencepiece>=0.1.99
|
|
31
33
|
Provides-Extra: vision
|
|
32
34
|
Requires-Dist: torchvision>=0.16; extra == "vision"
|
|
33
35
|
Provides-Extra: probe
|
|
@@ -60,27 +62,55 @@ Dynamic: license-file
|
|
|
60
62
|
|
|
61
63
|
Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
|
|
62
64
|
|
|
63
|
-
InterpKit provides a single, consistent interface for mech interp operations across
|
|
65
|
+
InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
|
|
64
66
|
|
|
65
67
|
---
|
|
66
68
|
|
|
67
69
|
## Install
|
|
68
70
|
|
|
71
|
+
We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
|
|
72
|
+
|
|
73
|
+
Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
uv venv --python 3.11
|
|
77
|
+
source .venv/bin/activate
|
|
78
|
+
uv pip install interpkit
|
|
79
|
+
|
|
80
|
+
# For linear probe support:
|
|
81
|
+
uv pip install "interpkit[probe]"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Or with plain `venv` + `pip`:
|
|
85
|
+
|
|
69
86
|
```bash
|
|
87
|
+
python3.11 -m venv .venv
|
|
88
|
+
source .venv/bin/activate
|
|
70
89
|
pip install interpkit
|
|
71
90
|
|
|
72
91
|
# For linear probe support:
|
|
73
|
-
pip install interpkit[probe]
|
|
92
|
+
pip install "interpkit[probe]"
|
|
74
93
|
```
|
|
75
94
|
|
|
76
|
-
Or
|
|
95
|
+
Or with `conda`:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
conda create -n interpkit python=3.11 -y
|
|
99
|
+
conda activate interpkit
|
|
100
|
+
pip install interpkit
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Installing from source for development:
|
|
77
104
|
|
|
78
105
|
```bash
|
|
79
106
|
git clone https://github.com/z4nix/interpkit.git
|
|
80
107
|
cd interpkit
|
|
81
|
-
|
|
108
|
+
uv venv --python 3.11 && source .venv/bin/activate
|
|
109
|
+
uv pip install -e ".[dev]"
|
|
82
110
|
```
|
|
83
111
|
|
|
112
|
+
> Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
|
|
113
|
+
|
|
84
114
|
---
|
|
85
115
|
|
|
86
116
|
## Quickstart
|
|
@@ -111,6 +141,25 @@ model = interpkit.load("google/vit-base-patch16-224")
|
|
|
111
141
|
model = interpkit.load("bert-base-uncased")
|
|
112
142
|
```
|
|
113
143
|
|
|
144
|
+
### Chat models
|
|
145
|
+
|
|
146
|
+
Instruction-tuned models work too — interpkit applies the tokenizer's chat template automatically.
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
chat = interpkit.load("HuggingFaceTB/SmolLM2-360M-Instruct")
|
|
150
|
+
|
|
151
|
+
result = chat.chat("Write a haiku about cats.", max_new_tokens=64)
|
|
152
|
+
print(result["response"])
|
|
153
|
+
|
|
154
|
+
# Run any other op on the templated prompt
|
|
155
|
+
chat.dla(result["prompt"])
|
|
156
|
+
|
|
157
|
+
# Or pass a message list directly to any op
|
|
158
|
+
chat.dla([{"role": "user", "content": "Capital of France?"}])
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full walkthrough including chat-style steering.
|
|
162
|
+
|
|
114
163
|
---
|
|
115
164
|
|
|
116
165
|
## Operations
|
|
@@ -118,6 +167,7 @@ model = interpkit.load("bert-base-uncased")
|
|
|
118
167
|
| Operation | What it does | Works on |
|
|
119
168
|
|-----------|-------------|----------|
|
|
120
169
|
| **`scan`** | One-command model overview: runs DLA, lens, attention, attribution and surfaces key findings | LMs |
|
|
170
|
+
| **`chat`** | Send a message through the tokenizer's chat template and generate a reply | Chat / instruct LMs |
|
|
121
171
|
| **`dla`** | Direct Logit Attribution — decompose output logits by head and MLP contribution; optionally decompose through an SAE into per-feature attributions | LMs |
|
|
122
172
|
| `inspect` | Module tree with types, param counts, shapes | Any model |
|
|
123
173
|
| `patch` | Activation patching at a module, head, or position | Any model |
|
|
@@ -328,10 +378,12 @@ results = model.dla_batch(["The capital of France is", "The CEO of Apple is"])
|
|
|
328
378
|
## Steering
|
|
329
379
|
|
|
330
380
|
```python
|
|
331
|
-
vector = model.steer_vector("
|
|
381
|
+
vector = model.steer_vector(" love", " hate", at="transformer.h.8")
|
|
332
382
|
model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
|
|
333
383
|
```
|
|
334
384
|
|
|
385
|
+
> Note the leading spaces. BPE tokenizers (GPT-2, Llama, ...) treat `" love"` and `"love"` as different tokens, and the leading-space variant is the one the model actually sees in normal text. interpkit prints a warning if you forget.
|
|
386
|
+
|
|
335
387
|
## Linear Probe
|
|
336
388
|
|
|
337
389
|
```python
|
|
@@ -422,7 +474,7 @@ interpkit lens gpt2 "The capital of France is"
|
|
|
422
474
|
interpkit lens gpt2 "The capital of France is" --position -1
|
|
423
475
|
interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
|
|
424
476
|
interpkit attribute gpt2 "The capital of France is"
|
|
425
|
-
interpkit steer gpt2 "The weather is" --positive
|
|
477
|
+
interpkit steer gpt2 "The weather is" --positive " love" --negative " hate" --at transformer.h.8
|
|
426
478
|
interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
|
|
427
479
|
interpkit decompose gpt2 "The capital of France is"
|
|
428
480
|
interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
|
|
@@ -430,6 +482,10 @@ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jb
|
|
|
430
482
|
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
|
|
431
483
|
interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
|
|
432
484
|
|
|
485
|
+
# Chat / instruct models — applies the tokenizer's chat template automatically
|
|
486
|
+
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
|
|
487
|
+
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
|
|
488
|
+
|
|
433
489
|
# Interactive HTML output
|
|
434
490
|
interpkit attention gpt2 "hello world" --html attention.html
|
|
435
491
|
interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --html trace.html
|
|
@@ -439,7 +495,17 @@ interpkit attribute gpt2 "The capital of France is" --html attribution.html
|
|
|
439
495
|
interpkit attribute microsoft/resnet-50 cat.jpg --target 281
|
|
440
496
|
```
|
|
441
497
|
|
|
442
|
-
Run `interpkit` with no arguments for a full command reference
|
|
498
|
+
Run `interpkit` with no arguments for a full command reference, or
|
|
499
|
+
`interpkit --extensive` for a beginner-friendly walkthrough of every command.
|
|
500
|
+
|
|
501
|
+
If the `interpkit` console script isn't on your `PATH` (e.g. fresh
|
|
502
|
+
environments, sandboxed installs, or running from a checkout without
|
|
503
|
+
re-installing), every command also works as `python -m interpkit ...`:
|
|
504
|
+
|
|
505
|
+
```bash
|
|
506
|
+
python -m interpkit scan gpt2 "The capital of France is"
|
|
507
|
+
python -m interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Hello!"
|
|
508
|
+
```
|
|
443
509
|
|
|
444
510
|
---
|
|
445
511
|
|
|
@@ -486,6 +552,30 @@ model.trace(input_a, input_b, top_k=10)
|
|
|
486
552
|
|
|
487
553
|
---
|
|
488
554
|
|
|
555
|
+
## Known limitations
|
|
556
|
+
|
|
557
|
+
- **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
|
|
558
|
+
HuggingFace transformers' relative-position-bias path triggers on
|
|
559
|
+
forward hooks for any DeBERTa-v3 model (e.g.
|
|
560
|
+
`microsoft/deberta-v3-small`). interpkit detects this at load time
|
|
561
|
+
and gates `trace`, `decompose`, `attribute`, `head_activations`,
|
|
562
|
+
`steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
|
|
563
|
+
`OperationNotSupportedForArchitecture` rather than the cryptic
|
|
564
|
+
upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
|
|
565
|
+
`attention` still work. Use `bert`, `roberta`, `electra`, or
|
|
566
|
+
`albert` for the gated ops on encoder-only inputs.
|
|
567
|
+
|
|
568
|
+
- **Integrated-gradients completeness on some modern decoders.** On
|
|
569
|
+
Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
|
|
570
|
+
sum does not converge to model-output completeness even at large
|
|
571
|
+
`n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
|
|
572
|
+
reliable as a token-importance **ranking** but cannot be interpreted as
|
|
573
|
+
additive contribution **magnitudes** on these models. `attribute()`
|
|
574
|
+
reports this programmatically: `result["interpretation"]` is
|
|
575
|
+
`"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
|
|
576
|
+
which are saliency methods), versus `"quantitative"` when IG completeness
|
|
577
|
+
holds. Branch on that field rather than parsing the warning text.
|
|
578
|
+
|
|
489
579
|
## Examples
|
|
490
580
|
|
|
491
581
|
See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
@@ -501,6 +591,7 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
|
501
591
|
| `07_vision_models` | ResNet/ViT attribution, ablation, activations |
|
|
502
592
|
| `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
|
|
503
593
|
| `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
|
|
594
|
+
| `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
|
|
504
595
|
|
|
505
596
|
---
|
|
506
597
|
|
|
@@ -12,27 +12,55 @@
|
|
|
12
12
|
|
|
13
13
|
Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
|
|
14
14
|
|
|
15
|
-
InterpKit provides a single, consistent interface for mech interp operations across
|
|
15
|
+
InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
|
|
16
16
|
|
|
17
17
|
---
|
|
18
18
|
|
|
19
19
|
## Install
|
|
20
20
|
|
|
21
|
+
We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
|
|
22
|
+
|
|
23
|
+
Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
uv venv --python 3.11
|
|
27
|
+
source .venv/bin/activate
|
|
28
|
+
uv pip install interpkit
|
|
29
|
+
|
|
30
|
+
# For linear probe support:
|
|
31
|
+
uv pip install "interpkit[probe]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Or with plain `venv` + `pip`:
|
|
35
|
+
|
|
21
36
|
```bash
|
|
37
|
+
python3.11 -m venv .venv
|
|
38
|
+
source .venv/bin/activate
|
|
22
39
|
pip install interpkit
|
|
23
40
|
|
|
24
41
|
# For linear probe support:
|
|
25
|
-
pip install interpkit[probe]
|
|
42
|
+
pip install "interpkit[probe]"
|
|
26
43
|
```
|
|
27
44
|
|
|
28
|
-
Or
|
|
45
|
+
Or with `conda`:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
conda create -n interpkit python=3.11 -y
|
|
49
|
+
conda activate interpkit
|
|
50
|
+
pip install interpkit
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Installing from source for development:
|
|
29
54
|
|
|
30
55
|
```bash
|
|
31
56
|
git clone https://github.com/z4nix/interpkit.git
|
|
32
57
|
cd interpkit
|
|
33
|
-
|
|
58
|
+
uv venv --python 3.11 && source .venv/bin/activate
|
|
59
|
+
uv pip install -e ".[dev]"
|
|
34
60
|
```
|
|
35
61
|
|
|
62
|
+
> Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
|
|
63
|
+
|
|
36
64
|
---
|
|
37
65
|
|
|
38
66
|
## Quickstart
|
|
@@ -63,6 +91,25 @@ model = interpkit.load("google/vit-base-patch16-224")
|
|
|
63
91
|
model = interpkit.load("bert-base-uncased")
|
|
64
92
|
```
|
|
65
93
|
|
|
94
|
+
### Chat models
|
|
95
|
+
|
|
96
|
+
Instruction-tuned models work too — interpkit applies the tokenizer's chat template automatically.
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
chat = interpkit.load("HuggingFaceTB/SmolLM2-360M-Instruct")
|
|
100
|
+
|
|
101
|
+
result = chat.chat("Write a haiku about cats.", max_new_tokens=64)
|
|
102
|
+
print(result["response"])
|
|
103
|
+
|
|
104
|
+
# Run any other op on the templated prompt
|
|
105
|
+
chat.dla(result["prompt"])
|
|
106
|
+
|
|
107
|
+
# Or pass a message list directly to any op
|
|
108
|
+
chat.dla([{"role": "user", "content": "Capital of France?"}])
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full walkthrough including chat-style steering.
|
|
112
|
+
|
|
66
113
|
---
|
|
67
114
|
|
|
68
115
|
## Operations
|
|
@@ -70,6 +117,7 @@ model = interpkit.load("bert-base-uncased")
|
|
|
70
117
|
| Operation | What it does | Works on |
|
|
71
118
|
|-----------|-------------|----------|
|
|
72
119
|
| **`scan`** | One-command model overview: runs DLA, lens, attention, attribution and surfaces key findings | LMs |
|
|
120
|
+
| **`chat`** | Send a message through the tokenizer's chat template and generate a reply | Chat / instruct LMs |
|
|
73
121
|
| **`dla`** | Direct Logit Attribution — decompose output logits by head and MLP contribution; optionally decompose through an SAE into per-feature attributions | LMs |
|
|
74
122
|
| `inspect` | Module tree with types, param counts, shapes | Any model |
|
|
75
123
|
| `patch` | Activation patching at a module, head, or position | Any model |
|
|
@@ -280,10 +328,12 @@ results = model.dla_batch(["The capital of France is", "The CEO of Apple is"])
|
|
|
280
328
|
## Steering
|
|
281
329
|
|
|
282
330
|
```python
|
|
283
|
-
vector = model.steer_vector("
|
|
331
|
+
vector = model.steer_vector(" love", " hate", at="transformer.h.8")
|
|
284
332
|
model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
|
|
285
333
|
```
|
|
286
334
|
|
|
335
|
+
> Note the leading spaces. BPE tokenizers (GPT-2, Llama, ...) treat `" love"` and `"love"` as different tokens, and the leading-space variant is the one the model actually sees in normal text. interpkit prints a warning if you forget.
|
|
336
|
+
|
|
287
337
|
## Linear Probe
|
|
288
338
|
|
|
289
339
|
```python
|
|
@@ -374,7 +424,7 @@ interpkit lens gpt2 "The capital of France is"
|
|
|
374
424
|
interpkit lens gpt2 "The capital of France is" --position -1
|
|
375
425
|
interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
|
|
376
426
|
interpkit attribute gpt2 "The capital of France is"
|
|
377
|
-
interpkit steer gpt2 "The weather is" --positive
|
|
427
|
+
interpkit steer gpt2 "The weather is" --positive " love" --negative " hate" --at transformer.h.8
|
|
378
428
|
interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
|
|
379
429
|
interpkit decompose gpt2 "The capital of France is"
|
|
380
430
|
interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
|
|
@@ -382,6 +432,10 @@ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jb
|
|
|
382
432
|
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
|
|
383
433
|
interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
|
|
384
434
|
|
|
435
|
+
# Chat / instruct models — applies the tokenizer's chat template automatically
|
|
436
|
+
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
|
|
437
|
+
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
|
|
438
|
+
|
|
385
439
|
# Interactive HTML output
|
|
386
440
|
interpkit attention gpt2 "hello world" --html attention.html
|
|
387
441
|
interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --html trace.html
|
|
@@ -391,7 +445,17 @@ interpkit attribute gpt2 "The capital of France is" --html attribution.html
|
|
|
391
445
|
interpkit attribute microsoft/resnet-50 cat.jpg --target 281
|
|
392
446
|
```
|
|
393
447
|
|
|
394
|
-
Run `interpkit` with no arguments for a full command reference
|
|
448
|
+
Run `interpkit` with no arguments for a full command reference, or
|
|
449
|
+
`interpkit --extensive` for a beginner-friendly walkthrough of every command.
|
|
450
|
+
|
|
451
|
+
If the `interpkit` console script isn't on your `PATH` (e.g. fresh
|
|
452
|
+
environments, sandboxed installs, or running from a checkout without
|
|
453
|
+
re-installing), every command also works as `python -m interpkit ...`:
|
|
454
|
+
|
|
455
|
+
```bash
|
|
456
|
+
python -m interpkit scan gpt2 "The capital of France is"
|
|
457
|
+
python -m interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Hello!"
|
|
458
|
+
```
|
|
395
459
|
|
|
396
460
|
---
|
|
397
461
|
|
|
@@ -438,6 +502,30 @@ model.trace(input_a, input_b, top_k=10)
|
|
|
438
502
|
|
|
439
503
|
---
|
|
440
504
|
|
|
505
|
+
## Known limitations
|
|
506
|
+
|
|
507
|
+
- **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
|
|
508
|
+
HuggingFace transformers' relative-position-bias path triggers on
|
|
509
|
+
forward hooks for any DeBERTa-v3 model (e.g.
|
|
510
|
+
`microsoft/deberta-v3-small`). interpkit detects this at load time
|
|
511
|
+
and gates `trace`, `decompose`, `attribute`, `head_activations`,
|
|
512
|
+
`steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
|
|
513
|
+
`OperationNotSupportedForArchitecture` rather than the cryptic
|
|
514
|
+
upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
|
|
515
|
+
`attention` still work. Use `bert`, `roberta`, `electra`, or
|
|
516
|
+
`albert` for the gated ops on encoder-only inputs.
|
|
517
|
+
|
|
518
|
+
- **Integrated-gradients completeness on some modern decoders.** On
|
|
519
|
+
Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
|
|
520
|
+
sum does not converge to model-output completeness even at large
|
|
521
|
+
`n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
|
|
522
|
+
reliable as a token-importance **ranking** but cannot be interpreted as
|
|
523
|
+
additive contribution **magnitudes** on these models. `attribute()`
|
|
524
|
+
reports this programmatically: `result["interpretation"]` is
|
|
525
|
+
`"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
|
|
526
|
+
which are saliency methods), versus `"quantitative"` when IG completeness
|
|
527
|
+
holds. Branch on that field rather than parsing the warning text.
|
|
528
|
+
|
|
441
529
|
## Examples
|
|
442
530
|
|
|
443
531
|
See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
@@ -453,6 +541,7 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
|
453
541
|
| `07_vision_models` | ResNet/ViT attribution, ablation, activations |
|
|
454
542
|
| `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
|
|
455
543
|
| `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
|
|
544
|
+
| `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
|
|
456
545
|
|
|
457
546
|
---
|
|
458
547
|
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""interpkit — mech interp for any HuggingFace model."""
|
|
2
|
+
|
|
3
|
+
from interpkit.core.arch import (
|
|
4
|
+
ArchFamily,
|
|
5
|
+
ArchInfo,
|
|
6
|
+
BlockSpec,
|
|
7
|
+
LayerInfo,
|
|
8
|
+
ModuleInfo,
|
|
9
|
+
resolve_arch,
|
|
10
|
+
)
|
|
11
|
+
from interpkit.core.exceptions import (
|
|
12
|
+
ArchitectureNotSupported,
|
|
13
|
+
AttentionBackendUnavailable,
|
|
14
|
+
InterpkitError,
|
|
15
|
+
LensPipelineMismatch,
|
|
16
|
+
OperationNotSupportedForArchitecture,
|
|
17
|
+
WrongInputType,
|
|
18
|
+
)
|
|
19
|
+
from interpkit.core.loader import load, load_module
|
|
20
|
+
from interpkit.core.model import Model
|
|
21
|
+
from interpkit.core.registry import register
|
|
22
|
+
from interpkit.core.tl_compat import (
|
|
23
|
+
list_roundtrippable_hooks,
|
|
24
|
+
list_tl_hooks,
|
|
25
|
+
to_native_name,
|
|
26
|
+
to_tl_name,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def diff(model_a, model_b, input_data, *, save=None):
|
|
31
|
+
"""Compare activations between two models on the same input."""
|
|
32
|
+
from interpkit.ops.diff import run_diff
|
|
33
|
+
|
|
34
|
+
return run_diff(model_a, model_b, input_data, save=save)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
# Loaders
|
|
39
|
+
"load",
|
|
40
|
+
"load_module",
|
|
41
|
+
"Model",
|
|
42
|
+
# Architecture types
|
|
43
|
+
"ArchInfo",
|
|
44
|
+
"ArchFamily",
|
|
45
|
+
"BlockSpec",
|
|
46
|
+
"resolve_arch",
|
|
47
|
+
# Per-layer structural types
|
|
48
|
+
"LayerInfo",
|
|
49
|
+
"ModuleInfo",
|
|
50
|
+
# Exception types
|
|
51
|
+
"InterpkitError",
|
|
52
|
+
"ArchitectureNotSupported",
|
|
53
|
+
"AttentionBackendUnavailable",
|
|
54
|
+
"LensPipelineMismatch",
|
|
55
|
+
"OperationNotSupportedForArchitecture",
|
|
56
|
+
"WrongInputType",
|
|
57
|
+
# Operations
|
|
58
|
+
"register",
|
|
59
|
+
"diff",
|
|
60
|
+
# TL compat
|
|
61
|
+
"to_tl_name",
|
|
62
|
+
"to_native_name",
|
|
63
|
+
"list_tl_hooks",
|
|
64
|
+
"list_roundtrippable_hooks",
|
|
65
|
+
]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Entry point so ``python -m interpkit`` invokes the Typer CLI.
|
|
2
|
+
|
|
3
|
+
Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:run"``
|
|
4
|
+
console script declared in :file:`pyproject.toml`, so users without the
|
|
5
|
+
console script on their ``$PATH`` (e.g. just-installed in a fresh
|
|
6
|
+
environment, vendored copies, ad-hoc subprocess invocations) can still
|
|
7
|
+
reach every CLI command via ``python -m interpkit ...``.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from interpkit.cli.main import run
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def main() -> None:
|
|
14
|
+
"""Invoke the CLI — separate function makes patching easier in tests.
|
|
15
|
+
|
|
16
|
+
Uses ``run`` (not ``app`` directly) so interpkit's fail-loud errors are
|
|
17
|
+
rendered as clean one-line messages instead of tracebacks.
|
|
18
|
+
"""
|
|
19
|
+
run()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
if __name__ == "__main__":
|
|
23
|
+
main()
|