interpkit 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {interpkit-0.2.0/interpkit.egg-info → interpkit-0.4.0}/PKG-INFO +79 -21
  2. interpkit-0.2.0/PKG-INFO → interpkit-0.4.0/README.md +67 -57
  3. interpkit-0.4.0/interpkit/__main__.py +19 -0
  4. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/cli/main.py +321 -195
  5. interpkit-0.4.0/interpkit/core/cache.py +36 -0
  6. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/discovery.py +116 -10
  7. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/html.py +1 -2
  8. interpkit-0.4.0/interpkit/core/inputs.py +403 -0
  9. interpkit-0.4.0/interpkit/core/loader.py +322 -0
  10. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/model.py +207 -327
  11. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/plot.py +5 -6
  12. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/registry.py +18 -4
  13. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/render.py +299 -183
  14. interpkit-0.4.0/interpkit/core/theme.py +36 -0
  15. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/tl_compat.py +3 -3
  16. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/ablate.py +1 -1
  17. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/activations.py +5 -2
  18. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/attention.py +15 -15
  19. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/attribute.py +88 -19
  20. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/batch.py +17 -16
  21. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/circuits.py +15 -16
  22. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/diff.py +8 -4
  23. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/dla.py +180 -12
  24. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/find_circuit.py +14 -10
  25. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/heads.py +4 -3
  26. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/inspect.py +1 -1
  27. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/lens.py +7 -3
  28. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/patch.py +11 -11
  29. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/probe.py +4 -3
  30. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/report.py +56 -11
  31. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/sae.py +283 -40
  32. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/scan.py +78 -40
  33. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/steer.py +59 -9
  34. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/trace.py +5 -4
  35. interpkit-0.2.0/README.md → interpkit-0.4.0/interpkit.egg-info/PKG-INFO +115 -17
  36. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/SOURCES.txt +12 -0
  37. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/requires.txt +9 -0
  38. {interpkit-0.2.0 → interpkit-0.4.0}/pyproject.toml +40 -4
  39. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_activations.py +3 -1
  40. interpkit-0.4.0/tests/test_architectures.py +286 -0
  41. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_attention.py +3 -1
  42. interpkit-0.4.0/tests/test_chat.py +217 -0
  43. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_cli.py +2 -2
  44. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_discovery_units.py +4 -6
  45. interpkit-0.4.0/tests/test_inputs.py +251 -0
  46. interpkit-0.4.0/tests/test_invariants.py +108 -0
  47. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_load_params.py +13 -15
  48. interpkit-0.4.0/tests/test_multi_arch.py +140 -0
  49. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_ops.py +61 -1
  50. interpkit-0.4.0/tests/test_param_variants.py +140 -0
  51. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_plot_internals.py +22 -25
  52. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_registry.py +0 -1
  53. interpkit-0.4.0/tests/test_regressions.py +90 -0
  54. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_render_internals.py +0 -3
  55. interpkit-0.4.0/tests/test_robustness_audit.py +763 -0
  56. interpkit-0.4.0/tests/test_sae.py +374 -0
  57. interpkit-0.4.0/tests/test_steer.py +91 -0
  58. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_tl_compat.py +0 -2
  59. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_tl_ops.py +0 -1
  60. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_trace.py +0 -2
  61. interpkit-0.2.0/interpkit/core/inputs.py +0 -123
  62. interpkit-0.2.0/tests/test_sae.py +0 -115
  63. interpkit-0.2.0/tests/test_steer.py +0 -30
  64. {interpkit-0.2.0 → interpkit-0.4.0}/LICENSE +0 -0
  65. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/__init__.py +0 -0
  66. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/cli/__init__.py +0 -0
  67. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/core/__init__.py +0 -0
  68. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit/ops/__init__.py +0 -0
  69. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/dependency_links.txt +0 -0
  70. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/entry_points.txt +0 -0
  71. {interpkit-0.2.0 → interpkit-0.4.0}/interpkit.egg-info/top_level.txt +0 -0
  72. {interpkit-0.2.0 → interpkit-0.4.0}/setup.cfg +0 -0
  73. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_ablate.py +0 -0
  74. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_attribute.py +0 -0
  75. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_cache.py +0 -0
  76. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_diff.py +0 -0
  77. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_discovery.py +0 -0
  78. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_error_handling.py +0 -0
  79. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_html.py +0 -0
  80. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_inspect.py +0 -0
  81. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_lens.py +0 -0
  82. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_patch.py +0 -0
  83. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_plots.py +0 -0
  84. {interpkit-0.2.0 → interpkit-0.4.0}/tests/test_probe.py +0 -0
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: interpkit
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: Mech interp for any HuggingFace model.
5
5
  Author: Davide Zani
6
6
  License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/z4nix/MechKit
8
- Project-URL: Repository, https://github.com/z4nix/MechKit
9
- Project-URL: Issues, https://github.com/z4nix/MechKit/issues
7
+ Project-URL: Homepage, https://github.com/z4nix/interpkit
8
+ Project-URL: Repository, https://github.com/z4nix/interpkit
9
+ Project-URL: Issues, https://github.com/z4nix/interpkit/issues
10
10
  Keywords: mechanistic-interpretability,pytorch,transformers,mech-interp,interpretability
11
11
  Classifier: Development Status :: 3 - Alpha
12
12
  Classifier: Intended Audience :: Science/Research
@@ -23,6 +23,7 @@ Requires-Dist: torch>=2.1
23
23
  Requires-Dist: transformers>=4.36
24
24
  Requires-Dist: safetensors>=0.4
25
25
  Requires-Dist: rich>=13.0
26
+ Requires-Dist: rich-gradient>=0.3
26
27
  Requires-Dist: typer>=0.9
27
28
  Requires-Dist: Pillow>=10.0
28
29
  Requires-Dist: matplotlib>=3.8
@@ -34,20 +35,20 @@ Requires-Dist: scikit-learn>=1.3; extra == "probe"
34
35
  Provides-Extra: dev
35
36
  Requires-Dist: pytest>=7.0; extra == "dev"
36
37
  Requires-Dist: pytest-timeout>=2.2; extra == "dev"
38
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
37
39
  Requires-Dist: scikit-learn>=1.3; extra == "dev"
38
40
  Requires-Dist: torchvision>=0.16; extra == "dev"
41
+ Requires-Dist: ruff>=0.4; extra == "dev"
42
+ Requires-Dist: mypy>=1.8; extra == "dev"
43
+ Provides-Extra: docs
44
+ Requires-Dist: mkdocs>=1.5; extra == "docs"
45
+ Requires-Dist: mkdocs-material>=9.5; extra == "docs"
46
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
39
47
  Dynamic: license-file
40
48
 
41
- ```
42
- IIIII tt KK KK iii tt
43
- III nn nnn tt eee rr rr pp pp KK KK tt
44
- III nnn nn tttt ee e rrr r ppp pp KKKK iii tttt
45
- III nn nn tt eeeee rr pppppp KK KK iii tt
46
- IIIII nn nn tttt eeeee rr pp KK KK iii tttt
47
- pp
48
- ```
49
-
50
- > Mech interp for any HuggingFace model.
49
+ <p align="center">
50
+ <img src="assets/logo.svg" alt="InterpKit" width="680">
51
+ </p>
51
52
 
52
53
  [![PyPI version](https://img.shields.io/pypi/v/interpkit.svg)](https://pypi.org/project/interpkit/)
53
54
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -75,8 +76,8 @@ pip install interpkit[probe]
75
76
  Or install from source for development:
76
77
 
77
78
  ```bash
78
- git clone https://github.com/davidezani/InterpKit.git
79
- cd InterpKit
79
+ git clone https://github.com/z4nix/interpkit.git
80
+ cd interpkit
80
81
  pip install -e ".[dev]"
81
82
  ```
82
83
 
@@ -110,6 +111,25 @@ model = interpkit.load("google/vit-base-patch16-224")
110
111
  model = interpkit.load("bert-base-uncased")
111
112
  ```
112
113
 
114
+ ### Chat models
115
+
116
+ Instruction-tuned models work too — interpkit applies the tokenizer's chat template automatically.
117
+
118
+ ```python
119
+ chat = interpkit.load("HuggingFaceTB/SmolLM2-360M-Instruct")
120
+
121
+ result = chat.chat("Write a haiku about cats.", max_new_tokens=64)
122
+ print(result["response"])
123
+
124
+ # Run any other op on the templated prompt
125
+ chat.dla(result["prompt"])
126
+
127
+ # Or pass a message list directly to any op
128
+ chat.dla([{"role": "user", "content": "Capital of France?"}])
129
+ ```
130
+
131
+ See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full walkthrough including chat-style steering.
132
+
113
133
  ---
114
134
 
115
135
  ## Operations
@@ -117,7 +137,8 @@ model = interpkit.load("bert-base-uncased")
117
137
  | Operation | What it does | Works on |
118
138
  |-----------|-------------|----------|
119
139
  | **`scan`** | One-command model overview: runs DLA, lens, attention, attribution and surfaces key findings | LMs |
120
- | **`dla`** | Direct Logit Attribution decompose output logits by head and MLP contribution | LMs |
140
+ | **`chat`** | Send a message through the tokenizer's chat template and generate a reply | Chat / instruct LMs |
141
+ | **`dla`** | Direct Logit Attribution — decompose output logits by head and MLP contribution; optionally decompose through an SAE into per-feature attributions | LMs |
121
142
  | `inspect` | Module tree with types, param counts, shapes | Any model |
122
143
  | `patch` | Activation patching at a module, head, or position | Any model |
123
144
  | `trace` | Causal tracing — module-level or position-aware (Meng et al.) heatmap | Any model |
@@ -172,6 +193,16 @@ model.dla("The capital of France is", token="Paris")
172
193
 
173
194
  # Save a bar chart
174
195
  model.dla("The capital of France is", save="dla.png")
196
+
197
+ # Feature-level DLA — decompose a component through an SAE
198
+ # to see which individual features drive the prediction
199
+ model.dla(
200
+ "The capital of France is",
201
+ sae="jbloom/GPT2-Small-SAEs-Reformatted",
202
+ sae_at="transformer.h.11.attn",
203
+ )
204
+ # result["feature_contributions"]["features"]
205
+ # — per-feature logit attributions at the specified component
175
206
  ```
176
207
 
177
208
  ## Causal Tracing
@@ -317,10 +348,12 @@ results = model.dla_batch(["The capital of France is", "The CEO of Apple is"])
317
348
  ## Steering
318
349
 
319
350
  ```python
320
- vector = model.steer_vector("Love", "Hate", at="transformer.h.8")
351
+ vector = model.steer_vector(" love", " hate", at="transformer.h.8")
321
352
  model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
322
353
  ```
323
354
 
355
+ > Note the leading spaces. BPE tokenizers (GPT-2, Llama, ...) treat `" love"` and `"love"` as different tokens, and the leading-space variant is the one the model actually sees in normal text. interpkit prints a warning if you forget.
356
+
324
357
  ## Linear Probe
325
358
 
326
359
  ```python
@@ -342,14 +375,22 @@ interpkit.diff(base, finetuned, "The capital of France is")
342
375
 
343
376
  ## SAE Features
344
377
 
345
- Decompose activations into interpretable features using pre-trained Sparse Autoencoders from HuggingFace:
378
+ Decompose activations into interpretable features using pre-trained Sparse Autoencoders:
346
379
 
347
380
  ```python
381
+ # From HuggingFace
348
382
  model.features(
349
383
  "The capital of France is",
350
384
  at="transformer.h.8",
351
385
  sae="jbloom/GPT2-Small-SAEs-Reformatted",
352
386
  )
387
+
388
+ # From a local file (.safetensors or .pt)
389
+ model.features(
390
+ "The capital of France is",
391
+ at="transformer.h.8",
392
+ sae="/path/to/sae_weights.safetensors",
393
+ )
353
394
  ```
354
395
 
355
396
  No SAELens dependency — weights are loaded directly via `safetensors`.
@@ -403,11 +444,17 @@ interpkit lens gpt2 "The capital of France is"
403
444
  interpkit lens gpt2 "The capital of France is" --position -1
404
445
  interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
405
446
  interpkit attribute gpt2 "The capital of France is"
406
- interpkit steer gpt2 "The weather is" --positive Love --negative Hate --at transformer.h.8
447
+ interpkit steer gpt2 "The weather is" --positive " love" --negative " hate" --at transformer.h.8
407
448
  interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
408
449
  interpkit decompose gpt2 "The capital of France is"
409
450
  interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
410
451
  interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jbloom/GPT2-Small-SAEs-Reformatted
452
+ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
453
+ interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
454
+
455
+ # Chat / instruct models — applies the tokenizer's chat template automatically
456
+ interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
457
+ interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
411
458
 
412
459
  # Interactive HTML output
413
460
  interpkit attention gpt2 "hello world" --html attention.html
@@ -418,7 +465,17 @@ interpkit attribute gpt2 "The capital of France is" --html attribution.html
418
465
  interpkit attribute microsoft/resnet-50 cat.jpg --target 281
419
466
  ```
420
467
 
421
- Run `interpkit` with no arguments for a full command reference.
468
+ Run `interpkit` with no arguments for a full command reference, or
469
+ `interpkit --extensive` for a beginner-friendly walkthrough of every command.
470
+
471
+ If the `interpkit` console script isn't on your `PATH` (e.g. fresh
472
+ environments, sandboxed installs, or running from a checkout without
473
+ re-installing), every command also works as `python -m interpkit ...`:
474
+
475
+ ```bash
476
+ python -m interpkit scan gpt2 "The capital of France is"
477
+ python -m interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Hello!"
478
+ ```
422
479
 
423
480
  ---
424
481
 
@@ -480,6 +537,7 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
480
537
  | `07_vision_models` | ResNet/ViT attribution, ablation, activations |
481
538
  | `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
482
539
  | `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
540
+ | `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
483
541
 
484
542
  ---
485
543
 
@@ -1,53 +1,6 @@
1
- Metadata-Version: 2.4
2
- Name: interpkit
3
- Version: 0.2.0
4
- Summary: Mech interp for any HuggingFace model.
5
- Author: Davide Zani
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/z4nix/MechKit
8
- Project-URL: Repository, https://github.com/z4nix/MechKit
9
- Project-URL: Issues, https://github.com/z4nix/MechKit/issues
10
- Keywords: mechanistic-interpretability,pytorch,transformers,mech-interp,interpretability
11
- Classifier: Development Status :: 3 - Alpha
12
- Classifier: Intended Audience :: Science/Research
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.10
15
- Classifier: Programming Language :: Python :: 3.11
16
- Classifier: Programming Language :: Python :: 3.12
17
- Classifier: Programming Language :: Python :: 3.13
18
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
- Requires-Python: >=3.10
20
- Description-Content-Type: text/markdown
21
- License-File: LICENSE
22
- Requires-Dist: torch>=2.1
23
- Requires-Dist: transformers>=4.36
24
- Requires-Dist: safetensors>=0.4
25
- Requires-Dist: rich>=13.0
26
- Requires-Dist: typer>=0.9
27
- Requires-Dist: Pillow>=10.0
28
- Requires-Dist: matplotlib>=3.8
29
- Requires-Dist: huggingface-hub>=0.20
30
- Provides-Extra: vision
31
- Requires-Dist: torchvision>=0.16; extra == "vision"
32
- Provides-Extra: probe
33
- Requires-Dist: scikit-learn>=1.3; extra == "probe"
34
- Provides-Extra: dev
35
- Requires-Dist: pytest>=7.0; extra == "dev"
36
- Requires-Dist: pytest-timeout>=2.2; extra == "dev"
37
- Requires-Dist: scikit-learn>=1.3; extra == "dev"
38
- Requires-Dist: torchvision>=0.16; extra == "dev"
39
- Dynamic: license-file
40
-
41
- ```
42
- IIIII tt KK KK iii tt
43
- III nn nnn tt eee rr rr pp pp KK KK tt
44
- III nnn nn tttt ee e rrr r ppp pp KKKK iii tttt
45
- III nn nn tt eeeee rr pppppp KK KK iii tt
46
- IIIII nn nn tttt eeeee rr pp KK KK iii tttt
47
- pp
48
- ```
49
-
50
- > Mech interp for any HuggingFace model.
1
+ <p align="center">
2
+ <img src="assets/logo.svg" alt="InterpKit" width="680">
3
+ </p>
51
4
 
52
5
  [![PyPI version](https://img.shields.io/pypi/v/interpkit.svg)](https://pypi.org/project/interpkit/)
53
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -75,8 +28,8 @@ pip install interpkit[probe]
75
28
  Or install from source for development:
76
29
 
77
30
  ```bash
78
- git clone https://github.com/davidezani/InterpKit.git
79
- cd InterpKit
31
+ git clone https://github.com/z4nix/interpkit.git
32
+ cd interpkit
80
33
  pip install -e ".[dev]"
81
34
  ```
82
35
 
@@ -110,6 +63,25 @@ model = interpkit.load("google/vit-base-patch16-224")
110
63
  model = interpkit.load("bert-base-uncased")
111
64
  ```
112
65
 
66
+ ### Chat models
67
+
68
+ Instruction-tuned models work too — interpkit applies the tokenizer's chat template automatically.
69
+
70
+ ```python
71
+ chat = interpkit.load("HuggingFaceTB/SmolLM2-360M-Instruct")
72
+
73
+ result = chat.chat("Write a haiku about cats.", max_new_tokens=64)
74
+ print(result["response"])
75
+
76
+ # Run any other op on the templated prompt
77
+ chat.dla(result["prompt"])
78
+
79
+ # Or pass a message list directly to any op
80
+ chat.dla([{"role": "user", "content": "Capital of France?"}])
81
+ ```
82
+
83
+ See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full walkthrough including chat-style steering.
84
+
113
85
  ---
114
86
 
115
87
  ## Operations
@@ -117,7 +89,8 @@ model = interpkit.load("bert-base-uncased")
117
89
  | Operation | What it does | Works on |
118
90
  |-----------|-------------|----------|
119
91
  | **`scan`** | One-command model overview: runs DLA, lens, attention, attribution and surfaces key findings | LMs |
120
- | **`dla`** | Direct Logit Attribution decompose output logits by head and MLP contribution | LMs |
92
+ | **`chat`** | Send a message through the tokenizer's chat template and generate a reply | Chat / instruct LMs |
93
+ | **`dla`** | Direct Logit Attribution — decompose output logits by head and MLP contribution; optionally decompose through an SAE into per-feature attributions | LMs |
121
94
  | `inspect` | Module tree with types, param counts, shapes | Any model |
122
95
  | `patch` | Activation patching at a module, head, or position | Any model |
123
96
  | `trace` | Causal tracing — module-level or position-aware (Meng et al.) heatmap | Any model |
@@ -172,6 +145,16 @@ model.dla("The capital of France is", token="Paris")
172
145
 
173
146
  # Save a bar chart
174
147
  model.dla("The capital of France is", save="dla.png")
148
+
149
+ # Feature-level DLA — decompose a component through an SAE
150
+ # to see which individual features drive the prediction
151
+ model.dla(
152
+ "The capital of France is",
153
+ sae="jbloom/GPT2-Small-SAEs-Reformatted",
154
+ sae_at="transformer.h.11.attn",
155
+ )
156
+ # result["feature_contributions"]["features"]
157
+ # — per-feature logit attributions at the specified component
175
158
  ```
176
159
 
177
160
  ## Causal Tracing
@@ -317,10 +300,12 @@ results = model.dla_batch(["The capital of France is", "The CEO of Apple is"])
317
300
  ## Steering
318
301
 
319
302
  ```python
320
- vector = model.steer_vector("Love", "Hate", at="transformer.h.8")
303
+ vector = model.steer_vector(" love", " hate", at="transformer.h.8")
321
304
  model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
322
305
  ```
323
306
 
307
+ > Note the leading spaces. BPE tokenizers (GPT-2, Llama, ...) treat `" love"` and `"love"` as different tokens, and the leading-space variant is the one the model actually sees in normal text. interpkit prints a warning if you forget.
308
+
324
309
  ## Linear Probe
325
310
 
326
311
  ```python
@@ -342,14 +327,22 @@ interpkit.diff(base, finetuned, "The capital of France is")
342
327
 
343
328
  ## SAE Features
344
329
 
345
- Decompose activations into interpretable features using pre-trained Sparse Autoencoders from HuggingFace:
330
+ Decompose activations into interpretable features using pre-trained Sparse Autoencoders:
346
331
 
347
332
  ```python
333
+ # From HuggingFace
348
334
  model.features(
349
335
  "The capital of France is",
350
336
  at="transformer.h.8",
351
337
  sae="jbloom/GPT2-Small-SAEs-Reformatted",
352
338
  )
339
+
340
+ # From a local file (.safetensors or .pt)
341
+ model.features(
342
+ "The capital of France is",
343
+ at="transformer.h.8",
344
+ sae="/path/to/sae_weights.safetensors",
345
+ )
353
346
  ```
354
347
 
355
348
  No SAELens dependency — weights are loaded directly via `safetensors`.
@@ -403,11 +396,17 @@ interpkit lens gpt2 "The capital of France is"
403
396
  interpkit lens gpt2 "The capital of France is" --position -1
404
397
  interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
405
398
  interpkit attribute gpt2 "The capital of France is"
406
- interpkit steer gpt2 "The weather is" --positive Love --negative Hate --at transformer.h.8
399
+ interpkit steer gpt2 "The weather is" --positive " love" --negative " hate" --at transformer.h.8
407
400
  interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
408
401
  interpkit decompose gpt2 "The capital of France is"
409
402
  interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
410
403
  interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jbloom/GPT2-Small-SAEs-Reformatted
404
+ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
405
+ interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
406
+
407
+ # Chat / instruct models — applies the tokenizer's chat template automatically
408
+ interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
409
+ interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
411
410
 
412
411
  # Interactive HTML output
413
412
  interpkit attention gpt2 "hello world" --html attention.html
@@ -418,7 +417,17 @@ interpkit attribute gpt2 "The capital of France is" --html attribution.html
418
417
  interpkit attribute microsoft/resnet-50 cat.jpg --target 281
419
418
  ```
420
419
 
421
- Run `interpkit` with no arguments for a full command reference.
420
+ Run `interpkit` with no arguments for a full command reference, or
421
+ `interpkit --extensive` for a beginner-friendly walkthrough of every command.
422
+
423
+ If the `interpkit` console script isn't on your `PATH` (e.g. fresh
424
+ environments, sandboxed installs, or running from a checkout without
425
+ re-installing), every command also works as `python -m interpkit ...`:
426
+
427
+ ```bash
428
+ python -m interpkit scan gpt2 "The capital of France is"
429
+ python -m interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Hello!"
430
+ ```
422
431
 
423
432
  ---
424
433
 
@@ -480,6 +489,7 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
480
489
  | `07_vision_models` | ResNet/ViT attribution, ablation, activations |
481
490
  | `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
482
491
  | `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
492
+ | `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
483
493
 
484
494
  ---
485
495
 
@@ -0,0 +1,19 @@
1
+ """Entry point so ``python -m interpkit`` invokes the Typer CLI.
2
+
3
+ Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:app"``
4
+ console script declared in :file:`pyproject.toml`, so users without the
5
+ console script on their ``$PATH`` (e.g. just-installed in a fresh
6
+ environment, vendored copies, ad-hoc subprocess invocations) can still
7
+ reach every CLI command via ``python -m interpkit ...``.
8
+ """
9
+
10
+ from interpkit.cli.main import app
11
+
12
+
13
+ def main() -> None:
14
+ """Invoke the Typer app — separate function makes patching easier in tests."""
15
+ app()
16
+
17
+
18
+ if __name__ == "__main__":
19
+ main()