interpkit 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {interpkit-0.2.0/interpkit.egg-info → interpkit-0.4.0}/PKG-INFO +79 -21
- interpkit-0.2.0/PKG-INFO → interpkit-0.4.0/README.md +67 -57
- interpkit-0.4.0/interpkit/__main__.py +19 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/cli/main.py +321 -195
- interpkit-0.4.0/interpkit/core/cache.py +36 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/discovery.py +116 -10
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/html.py +1 -2
- interpkit-0.4.0/interpkit/core/inputs.py +403 -0
- interpkit-0.4.0/interpkit/core/loader.py +322 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/model.py +207 -327
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/plot.py +5 -6
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/registry.py +18 -4
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/render.py +299 -183
- interpkit-0.4.0/interpkit/core/theme.py +36 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/tl_compat.py +3 -3
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/ablate.py +1 -1
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/activations.py +5 -2
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/attention.py +15 -15
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/attribute.py +88 -19
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/batch.py +17 -16
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/circuits.py +15 -16
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/diff.py +8 -4
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/dla.py +180 -12
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/find_circuit.py +14 -10
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/heads.py +4 -3
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/inspect.py +1 -1
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/lens.py +7 -3
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/patch.py +11 -11
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/probe.py +4 -3
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/report.py +56 -11
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/sae.py +283 -40
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/scan.py +78 -40
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/steer.py +59 -9
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/trace.py +5 -4
- interpkit-0.2.0/README.md → interpkit-0.4.0/interpkit.egg-info/PKG-INFO +115 -17
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/SOURCES.txt +12 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/requires.txt +9 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/pyproject.toml +40 -4
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_activations.py +3 -1
- interpkit-0.4.0/tests/test_architectures.py +286 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_attention.py +3 -1
- interpkit-0.4.0/tests/test_chat.py +217 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_cli.py +2 -2
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_discovery_units.py +4 -6
- interpkit-0.4.0/tests/test_inputs.py +251 -0
- interpkit-0.4.0/tests/test_invariants.py +108 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_load_params.py +13 -15
- interpkit-0.4.0/tests/test_multi_arch.py +140 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_ops.py +61 -1
- interpkit-0.4.0/tests/test_param_variants.py +140 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_plot_internals.py +22 -25
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_registry.py +0 -1
- interpkit-0.4.0/tests/test_regressions.py +90 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_render_internals.py +0 -3
- interpkit-0.4.0/tests/test_robustness_audit.py +763 -0
- interpkit-0.4.0/tests/test_sae.py +374 -0
- interpkit-0.4.0/tests/test_steer.py +91 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_tl_compat.py +0 -2
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_tl_ops.py +0 -1
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_trace.py +0 -2
- interpkit-0.2.0/interpkit/core/inputs.py +0 -123
- interpkit-0.2.0/tests/test_sae.py +0 -115
- interpkit-0.2.0/tests/test_steer.py +0 -30
- {interpkit-0.2.0 → interpkit-0.4.0}/LICENSE +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/__init__.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/cli/__init__.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/__init__.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/__init__.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/dependency_links.txt +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/entry_points.txt +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/top_level.txt +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/setup.cfg +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_ablate.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_attribute.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_cache.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_diff.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_discovery.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_error_handling.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_html.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_inspect.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_lens.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_patch.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_plots.py +0 -0
- {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_probe.py +0 -0
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: interpkit
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Mech interp for any HuggingFace model.
|
|
5
5
|
Author: Davide Zani
|
|
6
6
|
License-Expression: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/z4nix/MechKit
|
|
8
|
-
Project-URL: Repository, https://github.com/z4nix/MechKit
|
|
9
|
-
Project-URL: Issues, https://github.com/z4nix/MechKit/issues
|
|
7
|
+
Project-URL: Homepage, https://github.com/z4nix/interpkit
|
|
8
|
+
Project-URL: Repository, https://github.com/z4nix/interpkit
|
|
9
|
+
Project-URL: Issues, https://github.com/z4nix/interpkit/issues
|
|
10
10
|
Keywords: mechanistic-interpretability,pytorch,transformers,mech-interp,interpretability
|
|
11
11
|
Classifier: Development Status :: 3 - Alpha
|
|
12
12
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -23,6 +23,7 @@ Requires-Dist: torch>=2.1
|
|
|
23
23
|
Requires-Dist: transformers>=4.36
|
|
24
24
|
Requires-Dist: safetensors>=0.4
|
|
25
25
|
Requires-Dist: rich>=13.0
|
|
26
|
+
Requires-Dist: rich-gradient>=0.3
|
|
26
27
|
Requires-Dist: typer>=0.9
|
|
27
28
|
Requires-Dist: Pillow>=10.0
|
|
28
29
|
Requires-Dist: matplotlib>=3.8
|
|
@@ -34,20 +35,20 @@ Requires-Dist: scikit-learn>=1.3; extra == "probe"
|
|
|
34
35
|
Provides-Extra: dev
|
|
35
36
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
36
37
|
Requires-Dist: pytest-timeout>=2.2; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
37
39
|
Requires-Dist: scikit-learn>=1.3; extra == "dev"
|
|
38
40
|
Requires-Dist: torchvision>=0.16; extra == "dev"
|
|
41
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
42
|
+
Requires-Dist: mypy>=1.8; extra == "dev"
|
|
43
|
+
Provides-Extra: docs
|
|
44
|
+
Requires-Dist: mkdocs>=1.5; extra == "docs"
|
|
45
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
46
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
|
|
39
47
|
Dynamic: license-file
|
|
40
48
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
III nnn nn tttt ee e rrr r ppp pp KKKK iii tttt
|
|
45
|
-
III nn nn tt eeeee rr pppppp KK KK iii tt
|
|
46
|
-
IIIII nn nn tttt eeeee rr pp KK KK iii tttt
|
|
47
|
-
pp
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
> Mech interp for any HuggingFace model.
|
|
49
|
+
<p align="center">
|
|
50
|
+
<img src="assets/logo.svg" alt="InterpKit" width="680">
|
|
51
|
+
</p>
|
|
51
52
|
|
|
52
53
|
[PyPI](https://pypi.org/project/interpkit/)
|
|
53
54
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
@@ -75,8 +76,8 @@ pip install interpkit[probe]
|
|
|
75
76
|
Or install from source for development:
|
|
76
77
|
|
|
77
78
|
```bash
|
|
78
|
-
git clone https://github.com/
|
|
79
|
-
cd
|
|
79
|
+
git clone https://github.com/z4nix/interpkit.git
|
|
80
|
+
cd interpkit
|
|
80
81
|
pip install -e ".[dev]"
|
|
81
82
|
```
|
|
82
83
|
|
|
@@ -110,6 +111,25 @@ model = interpkit.load("google/vit-base-patch16-224")
|
|
|
110
111
|
model = interpkit.load("bert-base-uncased")
|
|
111
112
|
```
|
|
112
113
|
|
|
114
|
+
### Chat models
|
|
115
|
+
|
|
116
|
+
Instruction-tuned models work too — interpkit applies the tokenizer's chat template automatically.
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
chat = interpkit.load("HuggingFaceTB/SmolLM2-360M-Instruct")
|
|
120
|
+
|
|
121
|
+
result = chat.chat("Write a haiku about cats.", max_new_tokens=64)
|
|
122
|
+
print(result["response"])
|
|
123
|
+
|
|
124
|
+
# Run any other op on the templated prompt
|
|
125
|
+
chat.dla(result["prompt"])
|
|
126
|
+
|
|
127
|
+
# Or pass a message list directly to any op
|
|
128
|
+
chat.dla([{"role": "user", "content": "Capital of France?"}])
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full walkthrough including chat-style steering.
|
|
132
|
+
|
|
113
133
|
---
|
|
114
134
|
|
|
115
135
|
## Operations
|
|
@@ -117,7 +137,8 @@ model = interpkit.load("bert-base-uncased")
|
|
|
117
137
|
| Operation | What it does | Works on |
|
|
118
138
|
|-----------|-------------|----------|
|
|
119
139
|
| **`scan`** | One-command model overview: runs DLA, lens, attention, attribution and surfaces key findings | LMs |
|
|
120
|
-
| **`
|
|
140
|
+
| **`chat`** | Send a message through the tokenizer's chat template and generate a reply | Chat / instruct LMs |
|
|
141
|
+
| **`dla`** | Direct Logit Attribution — decompose output logits by head and MLP contribution; optionally decompose through an SAE into per-feature attributions | LMs |
|
|
121
142
|
| `inspect` | Module tree with types, param counts, shapes | Any model |
|
|
122
143
|
| `patch` | Activation patching at a module, head, or position | Any model |
|
|
123
144
|
| `trace` | Causal tracing — module-level or position-aware (Meng et al.) heatmap | Any model |
|
|
@@ -172,6 +193,16 @@ model.dla("The capital of France is", token="Paris")
|
|
|
172
193
|
|
|
173
194
|
# Save a bar chart
|
|
174
195
|
model.dla("The capital of France is", save="dla.png")
|
|
196
|
+
|
|
197
|
+
# Feature-level DLA — decompose a component through an SAE
|
|
198
|
+
# to see which individual features drive the prediction
|
|
199
|
+
model.dla(
|
|
200
|
+
"The capital of France is",
|
|
201
|
+
sae="jbloom/GPT2-Small-SAEs-Reformatted",
|
|
202
|
+
sae_at="transformer.h.11.attn",
|
|
203
|
+
)
|
|
204
|
+
# result["feature_contributions"]["features"]
|
|
205
|
+
# — per-feature logit attributions at the specified component
|
|
175
206
|
```
|
|
176
207
|
|
|
177
208
|
## Causal Tracing
|
|
@@ -317,10 +348,12 @@ results = model.dla_batch(["The capital of France is", "The CEO of Apple is"])
|
|
|
317
348
|
## Steering
|
|
318
349
|
|
|
319
350
|
```python
|
|
320
|
-
vector = model.steer_vector("love", "hate", at="transformer.h.8")
|
|
351
|
+
vector = model.steer_vector(" love", " hate", at="transformer.h.8")
|
|
321
352
|
model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
|
|
322
353
|
```
|
|
323
354
|
|
|
355
|
+
> Note the leading spaces. BPE tokenizers (GPT-2, Llama, ...) treat `" love"` and `"love"` as different tokens, and the leading-space variant is the one the model actually sees in normal text. interpkit prints a warning if you forget.
|
|
356
|
+
|
|
324
357
|
## Linear Probe
|
|
325
358
|
|
|
326
359
|
```python
|
|
@@ -342,14 +375,22 @@ interpkit.diff(base, finetuned, "The capital of France is")
|
|
|
342
375
|
|
|
343
376
|
## SAE Features
|
|
344
377
|
|
|
345
|
-
Decompose activations into interpretable features using pre-trained Sparse Autoencoders
|
|
378
|
+
Decompose activations into interpretable features using pre-trained Sparse Autoencoders:
|
|
346
379
|
|
|
347
380
|
```python
|
|
381
|
+
# From HuggingFace
|
|
348
382
|
model.features(
|
|
349
383
|
"The capital of France is",
|
|
350
384
|
at="transformer.h.8",
|
|
351
385
|
sae="jbloom/GPT2-Small-SAEs-Reformatted",
|
|
352
386
|
)
|
|
387
|
+
|
|
388
|
+
# From a local file (.safetensors or .pt)
|
|
389
|
+
model.features(
|
|
390
|
+
"The capital of France is",
|
|
391
|
+
at="transformer.h.8",
|
|
392
|
+
sae="/path/to/sae_weights.safetensors",
|
|
393
|
+
)
|
|
353
394
|
```
|
|
354
395
|
|
|
355
396
|
No SAELens dependency — weights are loaded directly via `safetensors`.
|
|
@@ -403,11 +444,17 @@ interpkit lens gpt2 "The capital of France is"
|
|
|
403
444
|
interpkit lens gpt2 "The capital of France is" --position -1
|
|
404
445
|
interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
|
|
405
446
|
interpkit attribute gpt2 "The capital of France is"
|
|
406
|
-
interpkit steer gpt2 "The weather is" --positive
|
|
447
|
+
interpkit steer gpt2 "The weather is" --positive " love" --negative " hate" --at transformer.h.8
|
|
407
448
|
interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
|
|
408
449
|
interpkit decompose gpt2 "The capital of France is"
|
|
409
450
|
interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
|
|
410
451
|
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jbloom/GPT2-Small-SAEs-Reformatted
|
|
452
|
+
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
|
|
453
|
+
interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
|
|
454
|
+
|
|
455
|
+
# Chat / instruct models — applies the tokenizer's chat template automatically
|
|
456
|
+
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
|
|
457
|
+
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
|
|
411
458
|
|
|
412
459
|
# Interactive HTML output
|
|
413
460
|
interpkit attention gpt2 "hello world" --html attention.html
|
|
@@ -418,7 +465,17 @@ interpkit attribute gpt2 "The capital of France is" --html attribution.html
|
|
|
418
465
|
interpkit attribute microsoft/resnet-50 cat.jpg --target 281
|
|
419
466
|
```
|
|
420
467
|
|
|
421
|
-
Run `interpkit` with no arguments for a full command reference
|
|
468
|
+
Run `interpkit` with no arguments for a full command reference, or
|
|
469
|
+
`interpkit --extensive` for a beginner-friendly walkthrough of every command.
|
|
470
|
+
|
|
471
|
+
If the `interpkit` console script isn't on your `PATH` (e.g. fresh
|
|
472
|
+
environments, sandboxed installs, or running from a checkout without
|
|
473
|
+
re-installing), every command also works as `python -m interpkit ...`:
|
|
474
|
+
|
|
475
|
+
```bash
|
|
476
|
+
python -m interpkit scan gpt2 "The capital of France is"
|
|
477
|
+
python -m interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Hello!"
|
|
478
|
+
```
|
|
422
479
|
|
|
423
480
|
---
|
|
424
481
|
|
|
@@ -480,6 +537,7 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
|
480
537
|
| `07_vision_models` | ResNet/ViT attribution, ablation, activations |
|
|
481
538
|
| `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
|
|
482
539
|
| `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
|
|
540
|
+
| `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
|
|
483
541
|
|
|
484
542
|
---
|
|
485
543
|
|
|
@@ -1,53 +1,6 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
Summary: Mech interp for any HuggingFace model.
|
|
5
|
-
Author: Davide Zani
|
|
6
|
-
License-Expression: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/z4nix/MechKit
|
|
8
|
-
Project-URL: Repository, https://github.com/z4nix/MechKit
|
|
9
|
-
Project-URL: Issues, https://github.com/z4nix/MechKit/issues
|
|
10
|
-
Keywords: mechanistic-interpretability,pytorch,transformers,mech-interp,interpretability
|
|
11
|
-
Classifier: Development Status :: 3 - Alpha
|
|
12
|
-
Classifier: Intended Audience :: Science/Research
|
|
13
|
-
Classifier: Programming Language :: Python :: 3
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
-
Requires-Python: >=3.10
|
|
20
|
-
Description-Content-Type: text/markdown
|
|
21
|
-
License-File: LICENSE
|
|
22
|
-
Requires-Dist: torch>=2.1
|
|
23
|
-
Requires-Dist: transformers>=4.36
|
|
24
|
-
Requires-Dist: safetensors>=0.4
|
|
25
|
-
Requires-Dist: rich>=13.0
|
|
26
|
-
Requires-Dist: typer>=0.9
|
|
27
|
-
Requires-Dist: Pillow>=10.0
|
|
28
|
-
Requires-Dist: matplotlib>=3.8
|
|
29
|
-
Requires-Dist: huggingface-hub>=0.20
|
|
30
|
-
Provides-Extra: vision
|
|
31
|
-
Requires-Dist: torchvision>=0.16; extra == "vision"
|
|
32
|
-
Provides-Extra: probe
|
|
33
|
-
Requires-Dist: scikit-learn>=1.3; extra == "probe"
|
|
34
|
-
Provides-Extra: dev
|
|
35
|
-
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
36
|
-
Requires-Dist: pytest-timeout>=2.2; extra == "dev"
|
|
37
|
-
Requires-Dist: scikit-learn>=1.3; extra == "dev"
|
|
38
|
-
Requires-Dist: torchvision>=0.16; extra == "dev"
|
|
39
|
-
Dynamic: license-file
|
|
40
|
-
|
|
41
|
-
```
|
|
42
|
-
IIIII tt KK KK iii tt
|
|
43
|
-
III nn nnn tt eee rr rr pp pp KK KK tt
|
|
44
|
-
III nnn nn tttt ee e rrr r ppp pp KKKK iii tttt
|
|
45
|
-
III nn nn tt eeeee rr pppppp KK KK iii tt
|
|
46
|
-
IIIII nn nn tttt eeeee rr pp KK KK iii tttt
|
|
47
|
-
pp
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
> Mech interp for any HuggingFace model.
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="assets/logo.svg" alt="InterpKit" width="680">
|
|
3
|
+
</p>
|
|
51
4
|
|
|
52
5
|
[PyPI](https://pypi.org/project/interpkit/)
|
|
53
6
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
@@ -75,8 +28,8 @@ pip install interpkit[probe]
|
|
|
75
28
|
Or install from source for development:
|
|
76
29
|
|
|
77
30
|
```bash
|
|
78
|
-
git clone https://github.com/
|
|
79
|
-
cd
|
|
31
|
+
git clone https://github.com/z4nix/interpkit.git
|
|
32
|
+
cd interpkit
|
|
80
33
|
pip install -e ".[dev]"
|
|
81
34
|
```
|
|
82
35
|
|
|
@@ -110,6 +63,25 @@ model = interpkit.load("google/vit-base-patch16-224")
|
|
|
110
63
|
model = interpkit.load("bert-base-uncased")
|
|
111
64
|
```
|
|
112
65
|
|
|
66
|
+
### Chat models
|
|
67
|
+
|
|
68
|
+
Instruction-tuned models work too — interpkit applies the tokenizer's chat template automatically.
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
chat = interpkit.load("HuggingFaceTB/SmolLM2-360M-Instruct")
|
|
72
|
+
|
|
73
|
+
result = chat.chat("Write a haiku about cats.", max_new_tokens=64)
|
|
74
|
+
print(result["response"])
|
|
75
|
+
|
|
76
|
+
# Run any other op on the templated prompt
|
|
77
|
+
chat.dla(result["prompt"])
|
|
78
|
+
|
|
79
|
+
# Or pass a message list directly to any op
|
|
80
|
+
chat.dla([{"role": "user", "content": "Capital of France?"}])
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full walkthrough including chat-style steering.
|
|
84
|
+
|
|
113
85
|
---
|
|
114
86
|
|
|
115
87
|
## Operations
|
|
@@ -117,7 +89,8 @@ model = interpkit.load("bert-base-uncased")
|
|
|
117
89
|
| Operation | What it does | Works on |
|
|
118
90
|
|-----------|-------------|----------|
|
|
119
91
|
| **`scan`** | One-command model overview: runs DLA, lens, attention, attribution and surfaces key findings | LMs |
|
|
120
|
-
| **`
|
|
92
|
+
| **`chat`** | Send a message through the tokenizer's chat template and generate a reply | Chat / instruct LMs |
|
|
93
|
+
| **`dla`** | Direct Logit Attribution — decompose output logits by head and MLP contribution; optionally decompose through an SAE into per-feature attributions | LMs |
|
|
121
94
|
| `inspect` | Module tree with types, param counts, shapes | Any model |
|
|
122
95
|
| `patch` | Activation patching at a module, head, or position | Any model |
|
|
123
96
|
| `trace` | Causal tracing — module-level or position-aware (Meng et al.) heatmap | Any model |
|
|
@@ -172,6 +145,16 @@ model.dla("The capital of France is", token="Paris")
|
|
|
172
145
|
|
|
173
146
|
# Save a bar chart
|
|
174
147
|
model.dla("The capital of France is", save="dla.png")
|
|
148
|
+
|
|
149
|
+
# Feature-level DLA — decompose a component through an SAE
|
|
150
|
+
# to see which individual features drive the prediction
|
|
151
|
+
model.dla(
|
|
152
|
+
"The capital of France is",
|
|
153
|
+
sae="jbloom/GPT2-Small-SAEs-Reformatted",
|
|
154
|
+
sae_at="transformer.h.11.attn",
|
|
155
|
+
)
|
|
156
|
+
# result["feature_contributions"]["features"]
|
|
157
|
+
# — per-feature logit attributions at the specified component
|
|
175
158
|
```
|
|
176
159
|
|
|
177
160
|
## Causal Tracing
|
|
@@ -317,10 +300,12 @@ results = model.dla_batch(["The capital of France is", "The CEO of Apple is"])
|
|
|
317
300
|
## Steering
|
|
318
301
|
|
|
319
302
|
```python
|
|
320
|
-
vector = model.steer_vector("love", "hate", at="transformer.h.8")
|
|
303
|
+
vector = model.steer_vector(" love", " hate", at="transformer.h.8")
|
|
321
304
|
model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
|
|
322
305
|
```
|
|
323
306
|
|
|
307
|
+
> Note the leading spaces. BPE tokenizers (GPT-2, Llama, ...) treat `" love"` and `"love"` as different tokens, and the leading-space variant is the one the model actually sees in normal text. interpkit prints a warning if you forget.
|
|
308
|
+
|
|
324
309
|
## Linear Probe
|
|
325
310
|
|
|
326
311
|
```python
|
|
@@ -342,14 +327,22 @@ interpkit.diff(base, finetuned, "The capital of France is")
|
|
|
342
327
|
|
|
343
328
|
## SAE Features
|
|
344
329
|
|
|
345
|
-
Decompose activations into interpretable features using pre-trained Sparse Autoencoders
|
|
330
|
+
Decompose activations into interpretable features using pre-trained Sparse Autoencoders:
|
|
346
331
|
|
|
347
332
|
```python
|
|
333
|
+
# From HuggingFace
|
|
348
334
|
model.features(
|
|
349
335
|
"The capital of France is",
|
|
350
336
|
at="transformer.h.8",
|
|
351
337
|
sae="jbloom/GPT2-Small-SAEs-Reformatted",
|
|
352
338
|
)
|
|
339
|
+
|
|
340
|
+
# From a local file (.safetensors or .pt)
|
|
341
|
+
model.features(
|
|
342
|
+
"The capital of France is",
|
|
343
|
+
at="transformer.h.8",
|
|
344
|
+
sae="/path/to/sae_weights.safetensors",
|
|
345
|
+
)
|
|
353
346
|
```
|
|
354
347
|
|
|
355
348
|
No SAELens dependency — weights are loaded directly via `safetensors`.
|
|
@@ -403,11 +396,17 @@ interpkit lens gpt2 "The capital of France is"
|
|
|
403
396
|
interpkit lens gpt2 "The capital of France is" --position -1
|
|
404
397
|
interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
|
|
405
398
|
interpkit attribute gpt2 "The capital of France is"
|
|
406
|
-
interpkit steer gpt2 "The weather is" --positive
|
|
399
|
+
interpkit steer gpt2 "The weather is" --positive " love" --negative " hate" --at transformer.h.8
|
|
407
400
|
interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
|
|
408
401
|
interpkit decompose gpt2 "The capital of France is"
|
|
409
402
|
interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
|
|
410
403
|
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jbloom/GPT2-Small-SAEs-Reformatted
|
|
404
|
+
interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
|
|
405
|
+
interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
|
|
406
|
+
|
|
407
|
+
# Chat / instruct models — applies the tokenizer's chat template automatically
|
|
408
|
+
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
|
|
409
|
+
interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
|
|
411
410
|
|
|
412
411
|
# Interactive HTML output
|
|
413
412
|
interpkit attention gpt2 "hello world" --html attention.html
|
|
@@ -418,7 +417,17 @@ interpkit attribute gpt2 "The capital of France is" --html attribution.html
|
|
|
418
417
|
interpkit attribute microsoft/resnet-50 cat.jpg --target 281
|
|
419
418
|
```
|
|
420
419
|
|
|
421
|
-
Run `interpkit` with no arguments for a full command reference
|
|
420
|
+
Run `interpkit` with no arguments for a full command reference, or
|
|
421
|
+
`interpkit --extensive` for a beginner-friendly walkthrough of every command.
|
|
422
|
+
|
|
423
|
+
If the `interpkit` console script isn't on your `PATH` (e.g. fresh
|
|
424
|
+
environments, sandboxed installs, or running from a checkout without
|
|
425
|
+
re-installing), every command also works as `python -m interpkit ...`:
|
|
426
|
+
|
|
427
|
+
```bash
|
|
428
|
+
python -m interpkit scan gpt2 "The capital of France is"
|
|
429
|
+
python -m interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Hello!"
|
|
430
|
+
```
|
|
422
431
|
|
|
423
432
|
---
|
|
424
433
|
|
|
@@ -480,6 +489,7 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
|
480
489
|
| `07_vision_models` | ResNet/ViT attribution, ablation, activations |
|
|
481
490
|
| `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
|
|
482
491
|
| `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
|
|
492
|
+
| `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
|
|
483
493
|
|
|
484
494
|
---
|
|
485
495
|
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Entry point so ``python -m interpkit`` invokes the Typer CLI.
|
|
2
|
+
|
|
3
|
+
Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:app"``
|
|
4
|
+
console script declared in :file:`pyproject.toml`, so users without the
|
|
5
|
+
console script on their ``$PATH`` (e.g. just-installed in a fresh
|
|
6
|
+
environment, vendored copies, ad-hoc subprocess invocations) can still
|
|
7
|
+
reach every CLI command via ``python -m interpkit ...``.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from interpkit.cli.main import app
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def main() -> None:
|
|
14
|
+
"""Invoke the Typer app — separate function makes patching easier in tests."""
|
|
15
|
+
app()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
if __name__ == "__main__":
|
|
19
|
+
main()
|