interpkit 0.3.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. {interpkit-0.3.0 → interpkit-0.5.0}/PKG-INFO +100 -9
  2. {interpkit-0.3.0 → interpkit-0.5.0}/README.md +96 -7
  3. interpkit-0.5.0/interpkit/__init__.py +65 -0
  4. interpkit-0.5.0/interpkit/__main__.py +23 -0
  5. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/cli/main.py +274 -10
  6. interpkit-0.5.0/interpkit/core/arch/__init__.py +102 -0
  7. interpkit-0.5.0/interpkit/core/arch/blocks.py +257 -0
  8. interpkit-0.5.0/interpkit/core/arch/family.py +421 -0
  9. interpkit-0.5.0/interpkit/core/arch/heads.py +583 -0
  10. interpkit-0.5.0/interpkit/core/arch/layers.py +462 -0
  11. interpkit-0.5.0/interpkit/core/arch/names.py +60 -0
  12. interpkit-0.5.0/interpkit/core/arch/probe.py +241 -0
  13. interpkit-0.5.0/interpkit/core/arch/residual.py +653 -0
  14. interpkit-0.5.0/interpkit/core/arch/resolve.py +679 -0
  15. interpkit-0.5.0/interpkit/core/arch/tree.py +190 -0
  16. interpkit-0.5.0/interpkit/core/arch/types.py +486 -0
  17. interpkit-0.5.0/interpkit/core/enums.py +105 -0
  18. interpkit-0.5.0/interpkit/core/exceptions.py +83 -0
  19. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/html.py +5 -2
  20. interpkit-0.5.0/interpkit/core/inputs.py +447 -0
  21. interpkit-0.5.0/interpkit/core/loader.py +704 -0
  22. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/model.py +537 -38
  23. interpkit-0.5.0/interpkit/core/paths.py +71 -0
  24. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/render.py +74 -18
  25. interpkit-0.5.0/interpkit/core/support_matrix.py +690 -0
  26. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/theme.py +11 -8
  27. interpkit-0.5.0/interpkit/core/tl_compat.py +297 -0
  28. interpkit-0.5.0/interpkit/ops/_atp.py +182 -0
  29. interpkit-0.5.0/interpkit/ops/_hooks.py +233 -0
  30. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/ablate.py +14 -0
  31. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/activations.py +9 -1
  32. interpkit-0.5.0/interpkit/ops/attention.py +334 -0
  33. interpkit-0.5.0/interpkit/ops/attribute.py +844 -0
  34. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/batch.py +4 -4
  35. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/circuits.py +221 -110
  36. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/diff.py +22 -2
  37. interpkit-0.5.0/interpkit/ops/dla.py +628 -0
  38. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/find_circuit.py +15 -17
  39. interpkit-0.5.0/interpkit/ops/heads.py +282 -0
  40. interpkit-0.5.0/interpkit/ops/lens.py +397 -0
  41. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/patch.py +113 -22
  42. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/probe.py +14 -0
  43. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/report.py +55 -10
  44. interpkit-0.5.0/interpkit/ops/sae.py +739 -0
  45. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/scan.py +28 -6
  46. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/steer.py +59 -2
  47. interpkit-0.5.0/interpkit/ops/trace.py +502 -0
  48. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/PKG-INFO +100 -9
  49. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/SOURCES.txt +31 -2
  50. interpkit-0.5.0/interpkit.egg-info/entry_points.txt +2 -0
  51. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/requires.txt +3 -1
  52. {interpkit-0.3.0 → interpkit-0.5.0}/pyproject.toml +27 -4
  53. interpkit-0.5.0/tests/test_archinfo_serialization.py +61 -0
  54. interpkit-0.5.0/tests/test_attention.py +112 -0
  55. interpkit-0.5.0/tests/test_audit_regressions.py +1891 -0
  56. interpkit-0.5.0/tests/test_cache_invalidation.py +66 -0
  57. interpkit-0.5.0/tests/test_capabilities.py +227 -0
  58. interpkit-0.5.0/tests/test_chat.py +217 -0
  59. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_cli.py +77 -1
  60. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_discovery.py +1 -1
  61. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_discovery_units.py +21 -21
  62. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_error_handling.py +11 -0
  63. interpkit-0.5.0/tests/test_inputs.py +251 -0
  64. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_invariants.py +22 -8
  65. interpkit-0.5.0/tests/test_lens.py +53 -0
  66. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_load_params.py +12 -2
  67. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_multi_arch.py +12 -5
  68. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_ops.py +6 -1
  69. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_param_variants.py +4 -2
  70. interpkit-0.5.0/tests/test_phase3_regressions.py +121 -0
  71. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_regressions.py +5 -2
  72. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_render_internals.py +34 -6
  73. interpkit-0.5.0/tests/test_resolver.py +268 -0
  74. interpkit-0.5.0/tests/test_resolver_golden.py +131 -0
  75. interpkit-0.5.0/tests/test_robustness_audit.py +790 -0
  76. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_sae.py +161 -1
  77. interpkit-0.5.0/tests/test_seq2seq_contract.py +119 -0
  78. interpkit-0.5.0/tests/test_steer.py +91 -0
  79. interpkit-0.5.0/tests/test_trace.py +76 -0
  80. interpkit-0.5.0/tests/test_validation.py +130 -0
  81. interpkit-0.3.0/interpkit/__init__.py +0 -27
  82. interpkit-0.3.0/interpkit/core/discovery.py +0 -810
  83. interpkit-0.3.0/interpkit/core/inputs.py +0 -130
  84. interpkit-0.3.0/interpkit/core/loader.py +0 -292
  85. interpkit-0.3.0/interpkit/core/tl_compat.py +0 -174
  86. interpkit-0.3.0/interpkit/ops/attention.py +0 -365
  87. interpkit-0.3.0/interpkit/ops/attribute.py +0 -308
  88. interpkit-0.3.0/interpkit/ops/dla.py +0 -488
  89. interpkit-0.3.0/interpkit/ops/heads.py +0 -175
  90. interpkit-0.3.0/interpkit/ops/lens.py +0 -243
  91. interpkit-0.3.0/interpkit/ops/sae.py +0 -439
  92. interpkit-0.3.0/interpkit/ops/trace.py +0 -349
  93. interpkit-0.3.0/interpkit.egg-info/entry_points.txt +0 -2
  94. interpkit-0.3.0/tests/test_attention.py +0 -44
  95. interpkit-0.3.0/tests/test_lens.py +0 -25
  96. interpkit-0.3.0/tests/test_steer.py +0 -30
  97. interpkit-0.3.0/tests/test_trace.py +0 -35
  98. {interpkit-0.3.0 → interpkit-0.5.0}/LICENSE +0 -0
  99. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/cli/__init__.py +0 -0
  100. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/__init__.py +0 -0
  101. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/cache.py +0 -0
  102. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/plot.py +0 -0
  103. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/core/registry.py +0 -0
  104. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/__init__.py +0 -0
  105. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit/ops/inspect.py +0 -0
  106. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/dependency_links.txt +0 -0
  107. {interpkit-0.3.0 → interpkit-0.5.0}/interpkit.egg-info/top_level.txt +0 -0
  108. {interpkit-0.3.0 → interpkit-0.5.0}/setup.cfg +0 -0
  109. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_ablate.py +0 -0
  110. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_activations.py +0 -0
  111. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_architectures.py +0 -0
  112. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_attribute.py +0 -0
  113. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_cache.py +0 -0
  114. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_diff.py +0 -0
  115. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_html.py +0 -0
  116. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_inspect.py +0 -0
  117. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_patch.py +0 -0
  118. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_plot_internals.py +0 -0
  119. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_plots.py +0 -0
  120. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_probe.py +0 -0
  121. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_registry.py +0 -0
  122. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_tl_compat.py +0 -0
  123. {interpkit-0.3.0 → interpkit-0.5.0}/tests/test_tl_ops.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: interpkit
3
- Version: 0.3.0
3
+ Version: 0.5.0
4
4
  Summary: Mech interp for any HuggingFace model.
5
5
  Author: Davide Zani
6
6
  License-Expression: MIT
@@ -20,7 +20,8 @@ Requires-Python: >=3.10
20
20
  Description-Content-Type: text/markdown
21
21
  License-File: LICENSE
22
22
  Requires-Dist: torch>=2.1
23
- Requires-Dist: transformers>=4.36
23
+ Requires-Dist: numpy>=1.24
24
+ Requires-Dist: transformers<6,>=4.36
24
25
  Requires-Dist: safetensors>=0.4
25
26
  Requires-Dist: rich>=13.0
26
27
  Requires-Dist: rich-gradient>=0.3
@@ -28,6 +29,7 @@ Requires-Dist: typer>=0.9
28
29
  Requires-Dist: Pillow>=10.0
29
30
  Requires-Dist: matplotlib>=3.8
30
31
  Requires-Dist: huggingface-hub>=0.20
32
+ Requires-Dist: sentencepiece>=0.1.99
31
33
  Provides-Extra: vision
32
34
  Requires-Dist: torchvision>=0.16; extra == "vision"
33
35
  Provides-Extra: probe
@@ -60,27 +62,55 @@ Dynamic: license-file
60
62
 
61
63
  Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
62
64
 
63
- InterpKit provides a single, consistent interface for mech interp operations across any HuggingFace model — transformers, SSMs, vision models, and more — with zero annotation required.
65
+ InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
64
66
 
65
67
  ---
66
68
 
67
69
  ## Install
68
70
 
71
+ We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
72
+
73
+ Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
74
+
75
+ ```bash
76
+ uv venv --python 3.11
77
+ source .venv/bin/activate
78
+ uv pip install interpkit
79
+
80
+ # For linear probe support:
81
+ uv pip install "interpkit[probe]"
82
+ ```
83
+
84
+ Or with plain `venv` + `pip`:
85
+
69
86
  ```bash
87
+ python3.11 -m venv .venv
88
+ source .venv/bin/activate
70
89
  pip install interpkit
71
90
 
72
91
  # For linear probe support:
73
- pip install interpkit[probe]
92
+ pip install "interpkit[probe]"
74
93
  ```
75
94
 
76
- Or install from source for development:
95
+ Or with `conda`:
96
+
97
+ ```bash
98
+ conda create -n interpkit python=3.11 -y
99
+ conda activate interpkit
100
+ pip install interpkit
101
+ ```
102
+
103
+ Installing from source for development:
77
104
 
78
105
  ```bash
79
106
  git clone https://github.com/z4nix/interpkit.git
80
107
  cd interpkit
81
- pip install -e ".[dev]"
108
+ uv venv --python 3.11 && source .venv/bin/activate
109
+ uv pip install -e ".[dev]"
82
110
  ```
83
111
 
112
+ > Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
113
+
84
114
  ---
85
115
 
86
116
  ## Quickstart
@@ -111,6 +141,25 @@ model = interpkit.load("google/vit-base-patch16-224")
111
141
  model = interpkit.load("bert-base-uncased")
112
142
  ```
113
143
 
144
+ ### Chat models
145
+
146
+ Instruction-tuned models work too — interpkit applies the tokenizer's chat template automatically.
147
+
148
+ ```python
149
+ chat = interpkit.load("HuggingFaceTB/SmolLM2-360M-Instruct")
150
+
151
+ result = chat.chat("Write a haiku about cats.", max_new_tokens=64)
152
+ print(result["response"])
153
+
154
+ # Run any other op on the templated prompt
155
+ chat.dla(result["prompt"])
156
+
157
+ # Or pass a message list directly to any op
158
+ chat.dla([{"role": "user", "content": "Capital of France?"}])
159
+ ```
160
+
161
+ See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full walkthrough including chat-style steering.
162
+
114
163
  ---
115
164
 
116
165
  ## Operations
@@ -118,6 +167,7 @@ model = interpkit.load("bert-base-uncased")
118
167
  | Operation | What it does | Works on |
119
168
  |-----------|-------------|----------|
120
169
  | **`scan`** | One-command model overview: runs DLA, lens, attention, attribution and surfaces key findings | LMs |
170
+ | **`chat`** | Send a message through the tokenizer's chat template and generate a reply | Chat / instruct LMs |
121
171
  | **`dla`** | Direct Logit Attribution — decompose output logits by head and MLP contribution; optionally decompose through an SAE into per-feature attributions | LMs |
122
172
  | `inspect` | Module tree with types, param counts, shapes | Any model |
123
173
  | `patch` | Activation patching at a module, head, or position | Any model |
@@ -328,10 +378,12 @@ results = model.dla_batch(["The capital of France is", "The CEO of Apple is"])
328
378
  ## Steering
329
379
 
330
380
  ```python
331
- vector = model.steer_vector("Love", "Hate", at="transformer.h.8")
381
+ vector = model.steer_vector(" love", " hate", at="transformer.h.8")
332
382
  model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
333
383
  ```
334
384
 
385
+ > Note the leading spaces. BPE tokenizers (GPT-2, Llama, ...) treat `" love"` and `"love"` as different tokens, and the leading-space variant is the one the model actually sees in normal text. interpkit prints a warning if you forget.
386
+
335
387
  ## Linear Probe
336
388
 
337
389
  ```python
@@ -422,7 +474,7 @@ interpkit lens gpt2 "The capital of France is"
422
474
  interpkit lens gpt2 "The capital of France is" --position -1
423
475
  interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
424
476
  interpkit attribute gpt2 "The capital of France is"
425
- interpkit steer gpt2 "The weather is" --positive Love --negative Hate --at transformer.h.8
477
+ interpkit steer gpt2 "The weather is" --positive " love" --negative " hate" --at transformer.h.8
426
478
  interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
427
479
  interpkit decompose gpt2 "The capital of France is"
428
480
  interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
@@ -430,6 +482,10 @@ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jb
430
482
  interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
431
483
  interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
432
484
 
485
+ # Chat / instruct models — applies the tokenizer's chat template automatically
486
+ interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
487
+ interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
488
+
433
489
  # Interactive HTML output
434
490
  interpkit attention gpt2 "hello world" --html attention.html
435
491
  interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --html trace.html
@@ -439,7 +495,17 @@ interpkit attribute gpt2 "The capital of France is" --html attribution.html
439
495
  interpkit attribute microsoft/resnet-50 cat.jpg --target 281
440
496
  ```
441
497
 
442
- Run `interpkit` with no arguments for a full command reference.
498
+ Run `interpkit` with no arguments for a full command reference, or
499
+ `interpkit --extensive` for a beginner-friendly walkthrough of every command.
500
+
501
+ If the `interpkit` console script isn't on your `PATH` (e.g. fresh
502
+ environments, sandboxed installs, or running from a checkout without
503
+ re-installing), every command also works as `python -m interpkit ...`:
504
+
505
+ ```bash
506
+ python -m interpkit scan gpt2 "The capital of France is"
507
+ python -m interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Hello!"
508
+ ```
443
509
 
444
510
  ---
445
511
 
@@ -486,6 +552,30 @@ model.trace(input_a, input_b, top_k=10)
486
552
 
487
553
  ---
488
554
 
555
+ ## Known limitations
556
+
557
+ - **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
558
+ HuggingFace transformers' relative-position-bias path triggers on
559
+ forward hooks for any DeBERTa-v3 model (e.g.
560
+ `microsoft/deberta-v3-small`). interpkit detects this at load time
561
+ and gates `trace`, `decompose`, `attribute`, `head_activations`,
562
+ `steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
563
+ `OperationNotSupportedForArchitecture` rather than the cryptic
564
+ upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
565
+ `attention` still work. Use `bert`, `roberta`, `electra`, or
566
+ `albert` for the gated ops on encoder-only inputs.
567
+
568
+ - **Integrated-gradients completeness on some modern decoders.** On
569
+ Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
570
+ sum does not converge to model-output completeness even at large
571
+ `n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
572
+ reliable as a token-importance **ranking** but cannot be interpreted as
573
+ additive contribution **magnitudes** on these models. `attribute()`
574
+ reports this programmatically: `result["interpretation"]` is
575
+ `"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
576
+ which are saliency methods), versus `"quantitative"` when IG completeness
577
+ holds. Branch on that field rather than parsing the warning text.
578
+
489
579
  ## Examples
490
580
 
491
581
  See the [`examples/`](examples/) directory for Jupyter notebooks:
@@ -501,6 +591,7 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
501
591
  | `07_vision_models` | ResNet/ViT attribution, ablation, activations |
502
592
  | `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
503
593
  | `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
594
+ | `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
504
595
 
505
596
  ---
506
597
 
@@ -12,27 +12,55 @@
12
12
 
13
13
  Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
14
14
 
15
- InterpKit provides a single, consistent interface for mech interp operations across any HuggingFace model — transformers, SSMs, vision models, and more — with zero annotation required.
15
+ InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
16
16
 
17
17
  ---
18
18
 
19
19
  ## Install
20
20
 
21
+ We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
22
+
23
+ Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
24
+
25
+ ```bash
26
+ uv venv --python 3.11
27
+ source .venv/bin/activate
28
+ uv pip install interpkit
29
+
30
+ # For linear probe support:
31
+ uv pip install "interpkit[probe]"
32
+ ```
33
+
34
+ Or with plain `venv` + `pip`:
35
+
21
36
  ```bash
37
+ python3.11 -m venv .venv
38
+ source .venv/bin/activate
22
39
  pip install interpkit
23
40
 
24
41
  # For linear probe support:
25
- pip install interpkit[probe]
42
+ pip install "interpkit[probe]"
26
43
  ```
27
44
 
28
- Or install from source for development:
45
+ Or with `conda`:
46
+
47
+ ```bash
48
+ conda create -n interpkit python=3.11 -y
49
+ conda activate interpkit
50
+ pip install interpkit
51
+ ```
52
+
53
+ Installing from source for development:
29
54
 
30
55
  ```bash
31
56
  git clone https://github.com/z4nix/interpkit.git
32
57
  cd interpkit
33
- pip install -e ".[dev]"
58
+ uv venv --python 3.11 && source .venv/bin/activate
59
+ uv pip install -e ".[dev]"
34
60
  ```
35
61
 
62
+ > Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
63
+
36
64
  ---
37
65
 
38
66
  ## Quickstart
@@ -63,6 +91,25 @@ model = interpkit.load("google/vit-base-patch16-224")
63
91
  model = interpkit.load("bert-base-uncased")
64
92
  ```
65
93
 
94
+ ### Chat models
95
+
96
+ Instruction-tuned models work too — interpkit applies the tokenizer's chat template automatically.
97
+
98
+ ```python
99
+ chat = interpkit.load("HuggingFaceTB/SmolLM2-360M-Instruct")
100
+
101
+ result = chat.chat("Write a haiku about cats.", max_new_tokens=64)
102
+ print(result["response"])
103
+
104
+ # Run any other op on the templated prompt
105
+ chat.dla(result["prompt"])
106
+
107
+ # Or pass a message list directly to any op
108
+ chat.dla([{"role": "user", "content": "Capital of France?"}])
109
+ ```
110
+
111
+ See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full walkthrough including chat-style steering.
112
+
66
113
  ---
67
114
 
68
115
  ## Operations
@@ -70,6 +117,7 @@ model = interpkit.load("bert-base-uncased")
70
117
  | Operation | What it does | Works on |
71
118
  |-----------|-------------|----------|
72
119
  | **`scan`** | One-command model overview: runs DLA, lens, attention, attribution and surfaces key findings | LMs |
120
+ | **`chat`** | Send a message through the tokenizer's chat template and generate a reply | Chat / instruct LMs |
73
121
  | **`dla`** | Direct Logit Attribution — decompose output logits by head and MLP contribution; optionally decompose through an SAE into per-feature attributions | LMs |
74
122
  | `inspect` | Module tree with types, param counts, shapes | Any model |
75
123
  | `patch` | Activation patching at a module, head, or position | Any model |
@@ -280,10 +328,12 @@ results = model.dla_batch(["The capital of France is", "The CEO of Apple is"])
280
328
  ## Steering
281
329
 
282
330
  ```python
283
- vector = model.steer_vector("Love", "Hate", at="transformer.h.8")
331
+ vector = model.steer_vector(" love", " hate", at="transformer.h.8")
284
332
  model.steer("The weather today is", vector=vector, at="transformer.h.8", scale=2.0)
285
333
  ```
286
334
 
335
+ > Note the leading spaces. BPE tokenizers (GPT-2, Llama, ...) treat `" love"` and `"love"` as different tokens, and the leading-space variant is the one the model actually sees in normal text. interpkit prints a warning if you forget.
336
+
287
337
  ## Linear Probe
288
338
 
289
339
  ```python
@@ -374,7 +424,7 @@ interpkit lens gpt2 "The capital of France is"
374
424
  interpkit lens gpt2 "The capital of France is" --position -1
375
425
  interpkit attention gpt2 "The capital of France is" --layer 8 --save attention.png
376
426
  interpkit attribute gpt2 "The capital of France is"
377
- interpkit steer gpt2 "The weather is" --positive Love --negative Hate --at transformer.h.8
427
+ interpkit steer gpt2 "The weather is" --positive " love" --negative " hate" --at transformer.h.8
378
428
  interpkit ablate gpt2 "The capital of France is" --at transformer.h.8.mlp
379
429
  interpkit decompose gpt2 "The capital of France is"
380
430
  interpkit diff gpt2 my-finetuned-gpt2 "The capital of France is" --save diff.png
@@ -382,6 +432,10 @@ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jb
382
432
  interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
383
433
  interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
384
434
 
435
+ # Chat / instruct models — applies the tokenizer's chat template automatically
436
+ interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
437
+ interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
438
+
385
439
  # Interactive HTML output
386
440
  interpkit attention gpt2 "hello world" --html attention.html
387
441
  interpkit trace gpt2 --clean "...Paris..." --corrupted "...Rome..." --html trace.html
@@ -391,7 +445,17 @@ interpkit attribute gpt2 "The capital of France is" --html attribution.html
391
445
  interpkit attribute microsoft/resnet-50 cat.jpg --target 281
392
446
  ```
393
447
 
394
- Run `interpkit` with no arguments for a full command reference.
448
+ Run `interpkit` with no arguments for a full command reference, or
449
+ `interpkit --extensive` for a beginner-friendly walkthrough of every command.
450
+
451
+ If the `interpkit` console script isn't on your `PATH` (e.g. fresh
452
+ environments, sandboxed installs, or running from a checkout without
453
+ re-installing), every command also works as `python -m interpkit ...`:
454
+
455
+ ```bash
456
+ python -m interpkit scan gpt2 "The capital of France is"
457
+ python -m interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Hello!"
458
+ ```
395
459
 
396
460
  ---
397
461
 
@@ -438,6 +502,30 @@ model.trace(input_a, input_b, top_k=10)
438
502
 
439
503
  ---
440
504
 
505
+ ## Known limitations
506
+
507
+ - **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
508
+ HuggingFace transformers' relative-position-bias path triggers on
509
+ forward hooks for any DeBERTa-v3 model (e.g.
510
+ `microsoft/deberta-v3-small`). interpkit detects this at load time
511
+ and gates `trace`, `decompose`, `attribute`, `head_activations`,
512
+ `steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
513
+ `OperationNotSupportedForArchitecture` rather than the cryptic
514
+ upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
515
+ `attention` still work. Use `bert`, `roberta`, `electra`, or
516
+ `albert` for the gated ops on encoder-only inputs.
517
+
518
+ - **Integrated-gradients completeness on some modern decoders.** On
519
+ Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
520
+ sum does not converge to model-output completeness even at large
521
+ `n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
522
+ reliable as a token-importance **ranking** but cannot be interpreted as
523
+ additive contribution **magnitudes** on these models. `attribute()`
524
+ reports this programmatically: `result["interpretation"]` is
525
+ `"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
526
+ which are saliency methods), versus `"quantitative"` when IG completeness
527
+ holds. Branch on that field rather than parsing the warning text.
528
+
441
529
  ## Examples
442
530
 
443
531
  See the [`examples/`](examples/) directory for Jupyter notebooks:
@@ -453,6 +541,7 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
453
541
  | `07_vision_models` | ResNet/ViT attribution, ablation, activations |
454
542
  | `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
455
543
  | `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
544
+ | `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
456
545
 
457
546
  ---
458
547
 
@@ -0,0 +1,65 @@
1
+ """interpkit — mech interp for any HuggingFace model."""
2
+
3
+ from interpkit.core.arch import (
4
+ ArchFamily,
5
+ ArchInfo,
6
+ BlockSpec,
7
+ LayerInfo,
8
+ ModuleInfo,
9
+ resolve_arch,
10
+ )
11
+ from interpkit.core.exceptions import (
12
+ ArchitectureNotSupported,
13
+ AttentionBackendUnavailable,
14
+ InterpkitError,
15
+ LensPipelineMismatch,
16
+ OperationNotSupportedForArchitecture,
17
+ WrongInputType,
18
+ )
19
+ from interpkit.core.loader import load, load_module
20
+ from interpkit.core.model import Model
21
+ from interpkit.core.registry import register
22
+ from interpkit.core.tl_compat import (
23
+ list_roundtrippable_hooks,
24
+ list_tl_hooks,
25
+ to_native_name,
26
+ to_tl_name,
27
+ )
28
+
29
+
30
+ def diff(model_a, model_b, input_data, *, save=None):
31
+ """Compare activations between two models on the same input."""
32
+ from interpkit.ops.diff import run_diff
33
+
34
+ return run_diff(model_a, model_b, input_data, save=save)
35
+
36
+
37
+ __all__ = [
38
+ # Loaders
39
+ "load",
40
+ "load_module",
41
+ "Model",
42
+ # Architecture types
43
+ "ArchInfo",
44
+ "ArchFamily",
45
+ "BlockSpec",
46
+ "resolve_arch",
47
+ # Per-layer structural types
48
+ "LayerInfo",
49
+ "ModuleInfo",
50
+ # Exception types
51
+ "InterpkitError",
52
+ "ArchitectureNotSupported",
53
+ "AttentionBackendUnavailable",
54
+ "LensPipelineMismatch",
55
+ "OperationNotSupportedForArchitecture",
56
+ "WrongInputType",
57
+ # Operations
58
+ "register",
59
+ "diff",
60
+ # TL compat
61
+ "to_tl_name",
62
+ "to_native_name",
63
+ "list_tl_hooks",
64
+ "list_roundtrippable_hooks",
65
+ ]
@@ -0,0 +1,23 @@
1
+ """Entry point so ``python -m interpkit`` invokes the Typer CLI.
2
+
3
+ Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:run"``
4
+ console script declared in :file:`pyproject.toml`, so users without the
5
+ console script on their ``$PATH`` (e.g. just-installed in a fresh
6
+ environment, vendored copies, ad-hoc subprocess invocations) can still
7
+ reach every CLI command via ``python -m interpkit ...``.
8
+ """
9
+
10
+ from interpkit.cli.main import run
11
+
12
+
13
+ def main() -> None:
14
+ """Invoke the CLI — separate function makes patching easier in tests.
15
+
16
+ Uses ``run`` (not ``app`` directly) so interpkit's fail-loud errors are
17
+ rendered as clean one-line messages instead of tracebacks.
18
+ """
19
+ run()
20
+
21
+
22
+ if __name__ == "__main__":
23
+ main()