interpkit 0.4.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {interpkit-0.4.0 → interpkit-0.6.0}/PKG-INFO +85 -7
- {interpkit-0.4.0 → interpkit-0.6.0}/README.md +79 -5
- interpkit-0.6.0/interpkit/__init__.py +84 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/__main__.py +8 -4
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/cli/main.py +506 -11
- interpkit-0.6.0/interpkit/core/arch/__init__.py +102 -0
- interpkit-0.6.0/interpkit/core/arch/blocks.py +257 -0
- interpkit-0.6.0/interpkit/core/arch/family.py +421 -0
- interpkit-0.6.0/interpkit/core/arch/heads.py +583 -0
- interpkit-0.6.0/interpkit/core/arch/layers.py +462 -0
- interpkit-0.6.0/interpkit/core/arch/names.py +60 -0
- interpkit-0.6.0/interpkit/core/arch/probe.py +241 -0
- interpkit-0.6.0/interpkit/core/arch/residual.py +653 -0
- interpkit-0.6.0/interpkit/core/arch/resolve.py +679 -0
- interpkit-0.6.0/interpkit/core/arch/tree.py +190 -0
- interpkit-0.6.0/interpkit/core/arch/types.py +486 -0
- interpkit-0.6.0/interpkit/core/enums.py +121 -0
- interpkit-0.6.0/interpkit/core/exceptions.py +83 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/html.py +5 -2
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/inputs.py +70 -8
- interpkit-0.6.0/interpkit/core/interventions.py +492 -0
- interpkit-0.6.0/interpkit/core/loader.py +704 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/model.py +610 -36
- interpkit-0.6.0/interpkit/core/paths.py +88 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/render.py +239 -7
- interpkit-0.6.0/interpkit/core/support_matrix.py +698 -0
- interpkit-0.6.0/interpkit/core/tl_compat.py +297 -0
- interpkit-0.6.0/interpkit/core/topk.py +63 -0
- interpkit-0.6.0/interpkit/ops/_atp.py +13 -0
- interpkit-0.6.0/interpkit/ops/_hooks.py +272 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/ablate.py +23 -39
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/activations.py +9 -1
- interpkit-0.6.0/interpkit/ops/atp.py +230 -0
- interpkit-0.6.0/interpkit/ops/attention.py +334 -0
- interpkit-0.6.0/interpkit/ops/attribute.py +844 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/circuits.py +219 -108
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/diff.py +22 -2
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/dla.py +309 -190
- interpkit-0.6.0/interpkit/ops/eap.py +355 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/find_circuit.py +135 -76
- interpkit-0.6.0/interpkit/ops/generate.py +292 -0
- interpkit-0.6.0/interpkit/ops/heads.py +282 -0
- interpkit-0.6.0/interpkit/ops/lens.py +442 -0
- interpkit-0.6.0/interpkit/ops/maxact.py +347 -0
- interpkit-0.6.0/interpkit/ops/patch.py +328 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/probe.py +14 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/sae.py +142 -22
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/steer.py +16 -24
- interpkit-0.6.0/interpkit/ops/trace.py +456 -0
- interpkit-0.6.0/interpkit/ops/tuned_lens.py +437 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/PKG-INFO +85 -7
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/SOURCES.txt +41 -2
- interpkit-0.6.0/interpkit.egg-info/entry_points.txt +2 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/requires.txt +6 -1
- {interpkit-0.4.0 → interpkit-0.6.0}/pyproject.toml +30 -4
- interpkit-0.6.0/tests/test_archinfo_serialization.py +61 -0
- interpkit-0.6.0/tests/test_atp.py +68 -0
- interpkit-0.6.0/tests/test_attention.py +112 -0
- interpkit-0.6.0/tests/test_audit_regressions.py +1891 -0
- interpkit-0.6.0/tests/test_cache_invalidation.py +66 -0
- interpkit-0.6.0/tests/test_capabilities.py +227 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_cli.py +210 -1
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_discovery.py +1 -1
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_discovery_units.py +21 -21
- interpkit-0.6.0/tests/test_eap.py +138 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_error_handling.py +11 -0
- interpkit-0.6.0/tests/test_generate.py +186 -0
- interpkit-0.6.0/tests/test_interventions.py +241 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_invariants.py +22 -8
- interpkit-0.6.0/tests/test_lens.py +53 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_load_params.py +12 -2
- interpkit-0.6.0/tests/test_maxact.py +149 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_multi_arch.py +12 -5
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_ops.py +6 -1
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_param_variants.py +4 -2
- interpkit-0.6.0/tests/test_phase3_regressions.py +121 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_regressions.py +5 -2
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_render_internals.py +34 -6
- interpkit-0.6.0/tests/test_resolver.py +268 -0
- interpkit-0.6.0/tests/test_resolver_golden.py +131 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_robustness_audit.py +56 -29
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_sae.py +6 -1
- interpkit-0.6.0/tests/test_seq2seq_contract.py +119 -0
- interpkit-0.6.0/tests/test_topk.py +58 -0
- interpkit-0.6.0/tests/test_trace.py +76 -0
- interpkit-0.6.0/tests/test_tuned_lens.py +140 -0
- interpkit-0.6.0/tests/test_validation.py +130 -0
- interpkit-0.4.0/interpkit/__init__.py +0 -27
- interpkit-0.4.0/interpkit/core/discovery.py +0 -810
- interpkit-0.4.0/interpkit/core/loader.py +0 -322
- interpkit-0.4.0/interpkit/core/tl_compat.py +0 -174
- interpkit-0.4.0/interpkit/ops/attention.py +0 -365
- interpkit-0.4.0/interpkit/ops/attribute.py +0 -377
- interpkit-0.4.0/interpkit/ops/heads.py +0 -175
- interpkit-0.4.0/interpkit/ops/lens.py +0 -243
- interpkit-0.4.0/interpkit/ops/patch.py +0 -261
- interpkit-0.4.0/interpkit/ops/trace.py +0 -349
- interpkit-0.4.0/interpkit.egg-info/entry_points.txt +0 -2
- interpkit-0.4.0/tests/test_attention.py +0 -44
- interpkit-0.4.0/tests/test_lens.py +0 -25
- interpkit-0.4.0/tests/test_trace.py +0 -35
- {interpkit-0.4.0 → interpkit-0.6.0}/LICENSE +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/cli/__init__.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/__init__.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/cache.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/plot.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/registry.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/theme.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/__init__.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/batch.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/inspect.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/report.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/scan.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/dependency_links.txt +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/top_level.txt +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/setup.cfg +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_ablate.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_activations.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_architectures.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_attribute.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_cache.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_chat.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_diff.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_html.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_inputs.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_inspect.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_patch.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_plot_internals.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_plots.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_probe.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_registry.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_steer.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_tl_compat.py +0 -0
- {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_tl_ops.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: interpkit
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Mech interp for any HuggingFace model.
|
|
5
5
|
Author: Davide Zani
|
|
6
6
|
License-Expression: MIT
|
|
@@ -20,7 +20,8 @@ Requires-Python: >=3.10
|
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: torch>=2.1
|
|
23
|
-
Requires-Dist:
|
|
23
|
+
Requires-Dist: numpy>=1.24
|
|
24
|
+
Requires-Dist: transformers<6,>=4.36
|
|
24
25
|
Requires-Dist: safetensors>=0.4
|
|
25
26
|
Requires-Dist: rich>=13.0
|
|
26
27
|
Requires-Dist: rich-gradient>=0.3
|
|
@@ -28,10 +29,13 @@ Requires-Dist: typer>=0.9
|
|
|
28
29
|
Requires-Dist: Pillow>=10.0
|
|
29
30
|
Requires-Dist: matplotlib>=3.8
|
|
30
31
|
Requires-Dist: huggingface-hub>=0.20
|
|
32
|
+
Requires-Dist: sentencepiece>=0.1.99
|
|
31
33
|
Provides-Extra: vision
|
|
32
34
|
Requires-Dist: torchvision>=0.16; extra == "vision"
|
|
33
35
|
Provides-Extra: probe
|
|
34
36
|
Requires-Dist: scikit-learn>=1.3; extra == "probe"
|
|
37
|
+
Provides-Extra: data
|
|
38
|
+
Requires-Dist: datasets>=2.14; extra == "data"
|
|
35
39
|
Provides-Extra: dev
|
|
36
40
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
37
41
|
Requires-Dist: pytest-timeout>=2.2; extra == "dev"
|
|
@@ -60,27 +64,55 @@ Dynamic: license-file
|
|
|
60
64
|
|
|
61
65
|
Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
|
|
62
66
|
|
|
63
|
-
InterpKit provides a single, consistent interface for mech interp operations across
|
|
67
|
+
InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
|
|
64
68
|
|
|
65
69
|
---
|
|
66
70
|
|
|
67
71
|
## Install
|
|
68
72
|
|
|
73
|
+
We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
|
|
74
|
+
|
|
75
|
+
Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
|
|
76
|
+
|
|
69
77
|
```bash
|
|
78
|
+
uv venv --python 3.11
|
|
79
|
+
source .venv/bin/activate
|
|
80
|
+
uv pip install interpkit
|
|
81
|
+
|
|
82
|
+
# For linear probe support:
|
|
83
|
+
uv pip install "interpkit[probe]"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Or with plain `venv` + `pip`:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
python3.11 -m venv .venv
|
|
90
|
+
source .venv/bin/activate
|
|
70
91
|
pip install interpkit
|
|
71
92
|
|
|
72
93
|
# For linear probe support:
|
|
73
|
-
pip install interpkit[probe]
|
|
94
|
+
pip install "interpkit[probe]"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Or with `conda`:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
conda create -n interpkit python=3.11 -y
|
|
101
|
+
conda activate interpkit
|
|
102
|
+
pip install interpkit
|
|
74
103
|
```
|
|
75
104
|
|
|
76
|
-
|
|
105
|
+
Installing from source for development:
|
|
77
106
|
|
|
78
107
|
```bash
|
|
79
108
|
git clone https://github.com/z4nix/interpkit.git
|
|
80
109
|
cd interpkit
|
|
81
|
-
|
|
110
|
+
uv venv --python 3.11 && source .venv/bin/activate
|
|
111
|
+
uv pip install -e ".[dev]"
|
|
82
112
|
```
|
|
83
113
|
|
|
114
|
+
> Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
|
|
115
|
+
|
|
84
116
|
---
|
|
85
117
|
|
|
86
118
|
## Quickstart
|
|
@@ -156,7 +188,13 @@ See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full wa
|
|
|
156
188
|
| **`ov_scores`** | OV circuit analysis — W_OV matrix per head | Transformers |
|
|
157
189
|
| **`qk_scores`** | QK circuit analysis — W_QK matrix per head | Transformers |
|
|
158
190
|
| **`composition`** | Q/K/V composition scores between heads in two layers | Transformers |
|
|
159
|
-
| **`find_circuit`** | Automated circuit discovery
|
|
191
|
+
| **`find_circuit`** | Automated circuit discovery — iterative ablation or EAP-based selection with causal verification | Transformers |
|
|
192
|
+
| **`generate`** | Generation with interventions active across every decode step + per-token lens capture | Generative LMs |
|
|
193
|
+
| **`intervene`** | Context manager applying steer/ablate/patch interventions to any op | Any model |
|
|
194
|
+
| **`atp`** | Attribution Patching — first-order patch-effect scores for all modules in 3 passes | Any model |
|
|
195
|
+
| **`eap`** | Edge Attribution Patching — gradient-based component → residual-stream edge scores (EAP-IG via `ig_steps`) | Causal LMs |
|
|
196
|
+
| **`train_tuned_lens`** | Train per-layer tuned-lens translators (Belrose et al. 2023); use via `lens(kind="tuned")` | LMs |
|
|
197
|
+
| **`max_activating`** | Scan a corpus for the examples that most activate a neuron / SAE feature / head | Any model |
|
|
160
198
|
| **`batch`** | Run any operation over a dataset with result aggregation | Any model |
|
|
161
199
|
|
|
162
200
|
---
|
|
@@ -452,6 +490,20 @@ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jb
|
|
|
452
490
|
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
|
|
453
491
|
interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
|
|
454
492
|
|
|
493
|
+
# Generation-time interventions + per-token lens trajectories
|
|
494
|
+
interpkit generate gpt2 "I feel" --positive " joy" --negative " fear" --at transformer.h.6 --scale 8
|
|
495
|
+
interpkit generate gpt2 "The capital of France is" --capture lens
|
|
496
|
+
|
|
497
|
+
# Gradient-based circuit discovery
|
|
498
|
+
interpkit atp gpt2 --clean "The capital of France is" --corrupted "The capital of Germany is"
|
|
499
|
+
interpkit eap gpt2 --clean "..." --corrupted "..." --ig-steps 5
|
|
500
|
+
interpkit find-circuit gpt2 --clean "..." --corrupted "..." --method eap --threshold 0.3
|
|
501
|
+
|
|
502
|
+
# Tuned lens + max-activating examples
|
|
503
|
+
interpkit train-tuned-lens gpt2 --corpus-file texts.txt --save lens_dir/
|
|
504
|
+
interpkit lens gpt2 "The capital of France is" --tuned-lens lens_dir/
|
|
505
|
+
interpkit maxact gpt2 --at transformer.h.6.mlp --neuron 42 --texts-file corpus.txt
|
|
506
|
+
|
|
455
507
|
# Chat / instruct models — applies the tokenizer's chat template automatically
|
|
456
508
|
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
|
|
457
509
|
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
|
|
@@ -522,6 +574,30 @@ model.trace(input_a, input_b, top_k=10)
|
|
|
522
574
|
|
|
523
575
|
---
|
|
524
576
|
|
|
577
|
+
## Known limitations
|
|
578
|
+
|
|
579
|
+
- **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
|
|
580
|
+
HuggingFace transformers' relative-position-bias path triggers on
|
|
581
|
+
forward hooks for any DeBERTa-v3 model (e.g.
|
|
582
|
+
`microsoft/deberta-v3-small`). interpkit detects this at load time
|
|
583
|
+
and gates `trace`, `decompose`, `attribute`, `head_activations`,
|
|
584
|
+
`steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
|
|
585
|
+
`OperationNotSupportedForArchitecture` rather than the cryptic
|
|
586
|
+
upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
|
|
587
|
+
`attention` still work. Use `bert`, `roberta`, `electra`, or
|
|
588
|
+
`albert` for the gated ops on encoder-only inputs.
|
|
589
|
+
|
|
590
|
+
- **Integrated-gradients completeness on some modern decoders.** On
|
|
591
|
+
Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
|
|
592
|
+
sum does not converge to model-output completeness even at large
|
|
593
|
+
`n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
|
|
594
|
+
reliable as a token-importance **ranking** but cannot be interpreted as
|
|
595
|
+
additive contribution **magnitudes** on these models. `attribute()`
|
|
596
|
+
reports this programmatically: `result["interpretation"]` is
|
|
597
|
+
`"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
|
|
598
|
+
which are saliency methods), versus `"quantitative"` when IG completeness
|
|
599
|
+
holds. Branch on that field rather than parsing the warning text.
|
|
600
|
+
|
|
525
601
|
## Examples
|
|
526
602
|
|
|
527
603
|
See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
@@ -538,6 +614,8 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
|
538
614
|
| `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
|
|
539
615
|
| `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
|
|
540
616
|
| `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
|
|
617
|
+
| `11_generation_interventions` | Steering/ablation active across every decode step, per-token lens trajectories, positional interventions, `model.intervene()` |
|
|
618
|
+
| `12_circuit_discovery_and_lenses` | Attribution Patching, Edge Attribution Patching, EAP-driven `find_circuit`, tuned lens, max-activating examples |
|
|
541
619
|
|
|
542
620
|
---
|
|
543
621
|
|
|
@@ -12,27 +12,55 @@
|
|
|
12
12
|
|
|
13
13
|
Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
|
|
14
14
|
|
|
15
|
-
InterpKit provides a single, consistent interface for mech interp operations across
|
|
15
|
+
InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
|
|
16
16
|
|
|
17
17
|
---
|
|
18
18
|
|
|
19
19
|
## Install
|
|
20
20
|
|
|
21
|
+
We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
|
|
22
|
+
|
|
23
|
+
Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
|
|
24
|
+
|
|
21
25
|
```bash
|
|
26
|
+
uv venv --python 3.11
|
|
27
|
+
source .venv/bin/activate
|
|
28
|
+
uv pip install interpkit
|
|
29
|
+
|
|
30
|
+
# For linear probe support:
|
|
31
|
+
uv pip install "interpkit[probe]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Or with plain `venv` + `pip`:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
python3.11 -m venv .venv
|
|
38
|
+
source .venv/bin/activate
|
|
22
39
|
pip install interpkit
|
|
23
40
|
|
|
24
41
|
# For linear probe support:
|
|
25
|
-
pip install interpkit[probe]
|
|
42
|
+
pip install "interpkit[probe]"
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Or with `conda`:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
conda create -n interpkit python=3.11 -y
|
|
49
|
+
conda activate interpkit
|
|
50
|
+
pip install interpkit
|
|
26
51
|
```
|
|
27
52
|
|
|
28
|
-
|
|
53
|
+
Installing from source for development:
|
|
29
54
|
|
|
30
55
|
```bash
|
|
31
56
|
git clone https://github.com/z4nix/interpkit.git
|
|
32
57
|
cd interpkit
|
|
33
|
-
|
|
58
|
+
uv venv --python 3.11 && source .venv/bin/activate
|
|
59
|
+
uv pip install -e ".[dev]"
|
|
34
60
|
```
|
|
35
61
|
|
|
62
|
+
> Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
|
|
63
|
+
|
|
36
64
|
---
|
|
37
65
|
|
|
38
66
|
## Quickstart
|
|
@@ -108,7 +136,13 @@ See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full wa
|
|
|
108
136
|
| **`ov_scores`** | OV circuit analysis — W_OV matrix per head | Transformers |
|
|
109
137
|
| **`qk_scores`** | QK circuit analysis — W_QK matrix per head | Transformers |
|
|
110
138
|
| **`composition`** | Q/K/V composition scores between heads in two layers | Transformers |
|
|
111
|
-
| **`find_circuit`** | Automated circuit discovery
|
|
139
|
+
| **`find_circuit`** | Automated circuit discovery — iterative ablation or EAP-based selection with causal verification | Transformers |
|
|
140
|
+
| **`generate`** | Generation with interventions active across every decode step + per-token lens capture | Generative LMs |
|
|
141
|
+
| **`intervene`** | Context manager applying steer/ablate/patch interventions to any op | Any model |
|
|
142
|
+
| **`atp`** | Attribution Patching — first-order patch-effect scores for all modules in 3 passes | Any model |
|
|
143
|
+
| **`eap`** | Edge Attribution Patching — gradient-based component → residual-stream edge scores (EAP-IG via `ig_steps`) | Causal LMs |
|
|
144
|
+
| **`train_tuned_lens`** | Train per-layer tuned-lens translators (Belrose et al. 2023); use via `lens(kind="tuned")` | LMs |
|
|
145
|
+
| **`max_activating`** | Scan a corpus for the examples that most activate a neuron / SAE feature / head | Any model |
|
|
112
146
|
| **`batch`** | Run any operation over a dataset with result aggregation | Any model |
|
|
113
147
|
|
|
114
148
|
---
|
|
@@ -404,6 +438,20 @@ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jb
|
|
|
404
438
|
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
|
|
405
439
|
interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
|
|
406
440
|
|
|
441
|
+
# Generation-time interventions + per-token lens trajectories
|
|
442
|
+
interpkit generate gpt2 "I feel" --positive " joy" --negative " fear" --at transformer.h.6 --scale 8
|
|
443
|
+
interpkit generate gpt2 "The capital of France is" --capture lens
|
|
444
|
+
|
|
445
|
+
# Gradient-based circuit discovery
|
|
446
|
+
interpkit atp gpt2 --clean "The capital of France is" --corrupted "The capital of Germany is"
|
|
447
|
+
interpkit eap gpt2 --clean "..." --corrupted "..." --ig-steps 5
|
|
448
|
+
interpkit find-circuit gpt2 --clean "..." --corrupted "..." --method eap --threshold 0.3
|
|
449
|
+
|
|
450
|
+
# Tuned lens + max-activating examples
|
|
451
|
+
interpkit train-tuned-lens gpt2 --corpus-file texts.txt --save lens_dir/
|
|
452
|
+
interpkit lens gpt2 "The capital of France is" --tuned-lens lens_dir/
|
|
453
|
+
interpkit maxact gpt2 --at transformer.h.6.mlp --neuron 42 --texts-file corpus.txt
|
|
454
|
+
|
|
407
455
|
# Chat / instruct models — applies the tokenizer's chat template automatically
|
|
408
456
|
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
|
|
409
457
|
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
|
|
@@ -474,6 +522,30 @@ model.trace(input_a, input_b, top_k=10)
|
|
|
474
522
|
|
|
475
523
|
---
|
|
476
524
|
|
|
525
|
+
## Known limitations
|
|
526
|
+
|
|
527
|
+
- **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
|
|
528
|
+
HuggingFace transformers' relative-position-bias path triggers on
|
|
529
|
+
forward hooks for any DeBERTa-v3 model (e.g.
|
|
530
|
+
`microsoft/deberta-v3-small`). interpkit detects this at load time
|
|
531
|
+
and gates `trace`, `decompose`, `attribute`, `head_activations`,
|
|
532
|
+
`steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
|
|
533
|
+
`OperationNotSupportedForArchitecture` rather than the cryptic
|
|
534
|
+
upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
|
|
535
|
+
`attention` still work. Use `bert`, `roberta`, `electra`, or
|
|
536
|
+
`albert` for the gated ops on encoder-only inputs.
|
|
537
|
+
|
|
538
|
+
- **Integrated-gradients completeness on some modern decoders.** On
|
|
539
|
+
Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
|
|
540
|
+
sum does not converge to model-output completeness even at large
|
|
541
|
+
`n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
|
|
542
|
+
reliable as a token-importance **ranking** but cannot be interpreted as
|
|
543
|
+
additive contribution **magnitudes** on these models. `attribute()`
|
|
544
|
+
reports this programmatically: `result["interpretation"]` is
|
|
545
|
+
`"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
|
|
546
|
+
which are saliency methods), versus `"quantitative"` when IG completeness
|
|
547
|
+
holds. Branch on that field rather than parsing the warning text.
|
|
548
|
+
|
|
477
549
|
## Examples
|
|
478
550
|
|
|
479
551
|
See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
@@ -490,6 +562,8 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
|
490
562
|
| `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
|
|
491
563
|
| `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
|
|
492
564
|
| `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
|
|
565
|
+
| `11_generation_interventions` | Steering/ablation active across every decode step, per-token lens trajectories, positional interventions, `model.intervene()` |
|
|
566
|
+
| `12_circuit_discovery_and_lenses` | Attribution Patching, Edge Attribution Patching, EAP-driven `find_circuit`, tuned lens, max-activating examples |
|
|
493
567
|
|
|
494
568
|
---
|
|
495
569
|
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""interpkit — mech interp for any HuggingFace model."""
|
|
2
|
+
|
|
3
|
+
from interpkit.core.arch import (
|
|
4
|
+
ArchFamily,
|
|
5
|
+
ArchInfo,
|
|
6
|
+
BlockSpec,
|
|
7
|
+
LayerInfo,
|
|
8
|
+
ModuleInfo,
|
|
9
|
+
resolve_arch,
|
|
10
|
+
)
|
|
11
|
+
from interpkit.core.exceptions import (
|
|
12
|
+
ArchitectureNotSupported,
|
|
13
|
+
AttentionBackendUnavailable,
|
|
14
|
+
InterpkitError,
|
|
15
|
+
LensPipelineMismatch,
|
|
16
|
+
OperationNotSupportedForArchitecture,
|
|
17
|
+
WrongInputType,
|
|
18
|
+
)
|
|
19
|
+
from interpkit.core.interventions import (
|
|
20
|
+
AblateIntervention,
|
|
21
|
+
CaptureProbe,
|
|
22
|
+
FnIntervention,
|
|
23
|
+
GenerationContext,
|
|
24
|
+
Intervention,
|
|
25
|
+
PatchIntervention,
|
|
26
|
+
SteerIntervention,
|
|
27
|
+
apply_interventions,
|
|
28
|
+
)
|
|
29
|
+
from interpkit.core.loader import load, load_module
|
|
30
|
+
from interpkit.core.model import Model
|
|
31
|
+
from interpkit.core.registry import register
|
|
32
|
+
from interpkit.core.tl_compat import (
|
|
33
|
+
list_roundtrippable_hooks,
|
|
34
|
+
list_tl_hooks,
|
|
35
|
+
to_native_name,
|
|
36
|
+
to_tl_name,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def diff(model_a, model_b, input_data, *, save=None):
|
|
41
|
+
"""Compare activations between two models on the same input."""
|
|
42
|
+
from interpkit.ops.diff import run_diff
|
|
43
|
+
|
|
44
|
+
return run_diff(model_a, model_b, input_data, save=save)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
__all__ = [
|
|
48
|
+
# Loaders
|
|
49
|
+
"load",
|
|
50
|
+
"load_module",
|
|
51
|
+
"Model",
|
|
52
|
+
# Architecture types
|
|
53
|
+
"ArchInfo",
|
|
54
|
+
"ArchFamily",
|
|
55
|
+
"BlockSpec",
|
|
56
|
+
"resolve_arch",
|
|
57
|
+
# Per-layer structural types
|
|
58
|
+
"LayerInfo",
|
|
59
|
+
"ModuleInfo",
|
|
60
|
+
# Exception types
|
|
61
|
+
"InterpkitError",
|
|
62
|
+
"ArchitectureNotSupported",
|
|
63
|
+
"AttentionBackendUnavailable",
|
|
64
|
+
"LensPipelineMismatch",
|
|
65
|
+
"OperationNotSupportedForArchitecture",
|
|
66
|
+
"WrongInputType",
|
|
67
|
+
# Interventions
|
|
68
|
+
"Intervention",
|
|
69
|
+
"SteerIntervention",
|
|
70
|
+
"AblateIntervention",
|
|
71
|
+
"PatchIntervention",
|
|
72
|
+
"FnIntervention",
|
|
73
|
+
"CaptureProbe",
|
|
74
|
+
"GenerationContext",
|
|
75
|
+
"apply_interventions",
|
|
76
|
+
# Operations
|
|
77
|
+
"register",
|
|
78
|
+
"diff",
|
|
79
|
+
# TL compat
|
|
80
|
+
"to_tl_name",
|
|
81
|
+
"to_native_name",
|
|
82
|
+
"list_tl_hooks",
|
|
83
|
+
"list_roundtrippable_hooks",
|
|
84
|
+
]
|
|
@@ -1,18 +1,22 @@
|
|
|
1
1
|
"""Entry point so ``python -m interpkit`` invokes the Typer CLI.
|
|
2
2
|
|
|
3
|
-
Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:app"``
|
|
3
|
+
Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:run"``
|
|
4
4
|
console script declared in :file:`pyproject.toml`, so users without the
|
|
5
5
|
console script on their ``$PATH`` (e.g. just-installed in a fresh
|
|
6
6
|
environment, vendored copies, ad-hoc subprocess invocations) can still
|
|
7
7
|
reach every CLI command via ``python -m interpkit ...``.
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
|
-
from interpkit.cli.main import
|
|
10
|
+
from interpkit.cli.main import run
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def main() -> None:
|
|
14
|
-
"""Invoke the
|
|
15
|
-
|
|
14
|
+
"""Invoke the CLI — separate function makes patching easier in tests.
|
|
15
|
+
|
|
16
|
+
Uses ``run`` (not ``app`` directly) so interpkit's fail-loud errors are
|
|
17
|
+
rendered as clean one-line messages instead of tracebacks.
|
|
18
|
+
"""
|
|
19
|
+
run()
|
|
16
20
|
|
|
17
21
|
|
|
18
22
|
if __name__ == "__main__":
|