interpkit 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. {interpkit-0.4.0 → interpkit-0.6.0}/PKG-INFO +85 -7
  2. {interpkit-0.4.0 → interpkit-0.6.0}/README.md +79 -5
  3. interpkit-0.6.0/interpkit/__init__.py +84 -0
  4. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/__main__.py +8 -4
  5. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/cli/main.py +506 -11
  6. interpkit-0.6.0/interpkit/core/arch/__init__.py +102 -0
  7. interpkit-0.6.0/interpkit/core/arch/blocks.py +257 -0
  8. interpkit-0.6.0/interpkit/core/arch/family.py +421 -0
  9. interpkit-0.6.0/interpkit/core/arch/heads.py +583 -0
  10. interpkit-0.6.0/interpkit/core/arch/layers.py +462 -0
  11. interpkit-0.6.0/interpkit/core/arch/names.py +60 -0
  12. interpkit-0.6.0/interpkit/core/arch/probe.py +241 -0
  13. interpkit-0.6.0/interpkit/core/arch/residual.py +653 -0
  14. interpkit-0.6.0/interpkit/core/arch/resolve.py +679 -0
  15. interpkit-0.6.0/interpkit/core/arch/tree.py +190 -0
  16. interpkit-0.6.0/interpkit/core/arch/types.py +486 -0
  17. interpkit-0.6.0/interpkit/core/enums.py +121 -0
  18. interpkit-0.6.0/interpkit/core/exceptions.py +83 -0
  19. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/html.py +5 -2
  20. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/inputs.py +70 -8
  21. interpkit-0.6.0/interpkit/core/interventions.py +492 -0
  22. interpkit-0.6.0/interpkit/core/loader.py +704 -0
  23. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/model.py +610 -36
  24. interpkit-0.6.0/interpkit/core/paths.py +88 -0
  25. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/render.py +239 -7
  26. interpkit-0.6.0/interpkit/core/support_matrix.py +698 -0
  27. interpkit-0.6.0/interpkit/core/tl_compat.py +297 -0
  28. interpkit-0.6.0/interpkit/core/topk.py +63 -0
  29. interpkit-0.6.0/interpkit/ops/_atp.py +13 -0
  30. interpkit-0.6.0/interpkit/ops/_hooks.py +272 -0
  31. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/ablate.py +23 -39
  32. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/activations.py +9 -1
  33. interpkit-0.6.0/interpkit/ops/atp.py +230 -0
  34. interpkit-0.6.0/interpkit/ops/attention.py +334 -0
  35. interpkit-0.6.0/interpkit/ops/attribute.py +844 -0
  36. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/circuits.py +219 -108
  37. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/diff.py +22 -2
  38. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/dla.py +309 -190
  39. interpkit-0.6.0/interpkit/ops/eap.py +355 -0
  40. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/find_circuit.py +135 -76
  41. interpkit-0.6.0/interpkit/ops/generate.py +292 -0
  42. interpkit-0.6.0/interpkit/ops/heads.py +282 -0
  43. interpkit-0.6.0/interpkit/ops/lens.py +442 -0
  44. interpkit-0.6.0/interpkit/ops/maxact.py +347 -0
  45. interpkit-0.6.0/interpkit/ops/patch.py +328 -0
  46. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/probe.py +14 -0
  47. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/sae.py +142 -22
  48. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/steer.py +16 -24
  49. interpkit-0.6.0/interpkit/ops/trace.py +456 -0
  50. interpkit-0.6.0/interpkit/ops/tuned_lens.py +437 -0
  51. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/PKG-INFO +85 -7
  52. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/SOURCES.txt +41 -2
  53. interpkit-0.6.0/interpkit.egg-info/entry_points.txt +2 -0
  54. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/requires.txt +6 -1
  55. {interpkit-0.4.0 → interpkit-0.6.0}/pyproject.toml +30 -4
  56. interpkit-0.6.0/tests/test_archinfo_serialization.py +61 -0
  57. interpkit-0.6.0/tests/test_atp.py +68 -0
  58. interpkit-0.6.0/tests/test_attention.py +112 -0
  59. interpkit-0.6.0/tests/test_audit_regressions.py +1891 -0
  60. interpkit-0.6.0/tests/test_cache_invalidation.py +66 -0
  61. interpkit-0.6.0/tests/test_capabilities.py +227 -0
  62. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_cli.py +210 -1
  63. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_discovery.py +1 -1
  64. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_discovery_units.py +21 -21
  65. interpkit-0.6.0/tests/test_eap.py +138 -0
  66. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_error_handling.py +11 -0
  67. interpkit-0.6.0/tests/test_generate.py +186 -0
  68. interpkit-0.6.0/tests/test_interventions.py +241 -0
  69. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_invariants.py +22 -8
  70. interpkit-0.6.0/tests/test_lens.py +53 -0
  71. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_load_params.py +12 -2
  72. interpkit-0.6.0/tests/test_maxact.py +149 -0
  73. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_multi_arch.py +12 -5
  74. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_ops.py +6 -1
  75. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_param_variants.py +4 -2
  76. interpkit-0.6.0/tests/test_phase3_regressions.py +121 -0
  77. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_regressions.py +5 -2
  78. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_render_internals.py +34 -6
  79. interpkit-0.6.0/tests/test_resolver.py +268 -0
  80. interpkit-0.6.0/tests/test_resolver_golden.py +131 -0
  81. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_robustness_audit.py +56 -29
  82. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_sae.py +6 -1
  83. interpkit-0.6.0/tests/test_seq2seq_contract.py +119 -0
  84. interpkit-0.6.0/tests/test_topk.py +58 -0
  85. interpkit-0.6.0/tests/test_trace.py +76 -0
  86. interpkit-0.6.0/tests/test_tuned_lens.py +140 -0
  87. interpkit-0.6.0/tests/test_validation.py +130 -0
  88. interpkit-0.4.0/interpkit/__init__.py +0 -27
  89. interpkit-0.4.0/interpkit/core/discovery.py +0 -810
  90. interpkit-0.4.0/interpkit/core/loader.py +0 -322
  91. interpkit-0.4.0/interpkit/core/tl_compat.py +0 -174
  92. interpkit-0.4.0/interpkit/ops/attention.py +0 -365
  93. interpkit-0.4.0/interpkit/ops/attribute.py +0 -377
  94. interpkit-0.4.0/interpkit/ops/heads.py +0 -175
  95. interpkit-0.4.0/interpkit/ops/lens.py +0 -243
  96. interpkit-0.4.0/interpkit/ops/patch.py +0 -261
  97. interpkit-0.4.0/interpkit/ops/trace.py +0 -349
  98. interpkit-0.4.0/interpkit.egg-info/entry_points.txt +0 -2
  99. interpkit-0.4.0/tests/test_attention.py +0 -44
  100. interpkit-0.4.0/tests/test_lens.py +0 -25
  101. interpkit-0.4.0/tests/test_trace.py +0 -35
  102. {interpkit-0.4.0 → interpkit-0.6.0}/LICENSE +0 -0
  103. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/cli/__init__.py +0 -0
  104. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/__init__.py +0 -0
  105. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/cache.py +0 -0
  106. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/plot.py +0 -0
  107. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/registry.py +0 -0
  108. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/core/theme.py +0 -0
  109. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/__init__.py +0 -0
  110. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/batch.py +0 -0
  111. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/inspect.py +0 -0
  112. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/report.py +0 -0
  113. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit/ops/scan.py +0 -0
  114. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/dependency_links.txt +0 -0
  115. {interpkit-0.4.0 → interpkit-0.6.0}/interpkit.egg-info/top_level.txt +0 -0
  116. {interpkit-0.4.0 → interpkit-0.6.0}/setup.cfg +0 -0
  117. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_ablate.py +0 -0
  118. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_activations.py +0 -0
  119. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_architectures.py +0 -0
  120. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_attribute.py +0 -0
  121. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_cache.py +0 -0
  122. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_chat.py +0 -0
  123. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_diff.py +0 -0
  124. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_html.py +0 -0
  125. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_inputs.py +0 -0
  126. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_inspect.py +0 -0
  127. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_patch.py +0 -0
  128. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_plot_internals.py +0 -0
  129. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_plots.py +0 -0
  130. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_probe.py +0 -0
  131. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_registry.py +0 -0
  132. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_steer.py +0 -0
  133. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_tl_compat.py +0 -0
  134. {interpkit-0.4.0 → interpkit-0.6.0}/tests/test_tl_ops.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: interpkit
3
- Version: 0.4.0
3
+ Version: 0.6.0
4
4
  Summary: Mech interp for any HuggingFace model.
5
5
  Author: Davide Zani
6
6
  License-Expression: MIT
@@ -20,7 +20,8 @@ Requires-Python: >=3.10
20
20
  Description-Content-Type: text/markdown
21
21
  License-File: LICENSE
22
22
  Requires-Dist: torch>=2.1
23
- Requires-Dist: transformers>=4.36
23
+ Requires-Dist: numpy>=1.24
24
+ Requires-Dist: transformers<6,>=4.36
24
25
  Requires-Dist: safetensors>=0.4
25
26
  Requires-Dist: rich>=13.0
26
27
  Requires-Dist: rich-gradient>=0.3
@@ -28,10 +29,13 @@ Requires-Dist: typer>=0.9
28
29
  Requires-Dist: Pillow>=10.0
29
30
  Requires-Dist: matplotlib>=3.8
30
31
  Requires-Dist: huggingface-hub>=0.20
32
+ Requires-Dist: sentencepiece>=0.1.99
31
33
  Provides-Extra: vision
32
34
  Requires-Dist: torchvision>=0.16; extra == "vision"
33
35
  Provides-Extra: probe
34
36
  Requires-Dist: scikit-learn>=1.3; extra == "probe"
37
+ Provides-Extra: data
38
+ Requires-Dist: datasets>=2.14; extra == "data"
35
39
  Provides-Extra: dev
36
40
  Requires-Dist: pytest>=7.0; extra == "dev"
37
41
  Requires-Dist: pytest-timeout>=2.2; extra == "dev"
@@ -60,27 +64,55 @@ Dynamic: license-file
60
64
 
61
65
  Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
62
66
 
63
- InterpKit provides a single, consistent interface for mech interp operations across any HuggingFace model — transformers, SSMs, vision models, and more — with zero annotation required.
67
+ InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
64
68
 
65
69
  ---
66
70
 
67
71
  ## Install
68
72
 
73
+ We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
74
+
75
+ Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
76
+
69
77
  ```bash
78
+ uv venv --python 3.11
79
+ source .venv/bin/activate
80
+ uv pip install interpkit
81
+
82
+ # For linear probe support:
83
+ uv pip install "interpkit[probe]"
84
+ ```
85
+
86
+ Or with plain `venv` + `pip`:
87
+
88
+ ```bash
89
+ python3.11 -m venv .venv
90
+ source .venv/bin/activate
70
91
  pip install interpkit
71
92
 
72
93
  # For linear probe support:
73
- pip install interpkit[probe]
94
+ pip install "interpkit[probe]"
95
+ ```
96
+
97
+ Or with `conda`:
98
+
99
+ ```bash
100
+ conda create -n interpkit python=3.11 -y
101
+ conda activate interpkit
102
+ pip install interpkit
74
103
  ```
75
104
 
76
- Or install from source for development:
105
+ Installing from source for development:
77
106
 
78
107
  ```bash
79
108
  git clone https://github.com/z4nix/interpkit.git
80
109
  cd interpkit
81
- pip install -e ".[dev]"
110
+ uv venv --python 3.11 && source .venv/bin/activate
111
+ uv pip install -e ".[dev]"
82
112
  ```
83
113
 
114
+ > Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
115
+
84
116
  ---
85
117
 
86
118
  ## Quickstart
@@ -156,7 +188,13 @@ See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full wa
156
188
  | **`ov_scores`** | OV circuit analysis — W_OV matrix per head | Transformers |
157
189
  | **`qk_scores`** | QK circuit analysis — W_QK matrix per head | Transformers |
158
190
  | **`composition`** | Q/K/V composition scores between heads in two layers | Transformers |
159
- | **`find_circuit`** | Automated circuit discovery via iterative ablation | Transformers |
191
+ | **`find_circuit`** | Automated circuit discovery via iterative ablation or EAP-based selection with causal verification | Transformers |
192
+ | **`generate`** | Generation with interventions active across every decode step + per-token lens capture | Generative LMs |
193
+ | **`intervene`** | Context manager applying steer/ablate/patch interventions to any op | Any model |
194
+ | **`atp`** | Attribution Patching — first-order patch-effect scores for all modules in 3 passes | Any model |
195
+ | **`eap`** | Edge Attribution Patching — gradient-based component → residual-stream edge scores (EAP-IG via `ig_steps`) | Causal LMs |
196
+ | **`train_tuned_lens`** | Train per-layer tuned-lens translators (Belrose et al. 2023); use via `lens(kind="tuned")` | LMs |
197
+ | **`max_activating`** | Scan a corpus for the examples that most activate a neuron / SAE feature / head | Any model |
160
198
  | **`batch`** | Run any operation over a dataset with result aggregation | Any model |
161
199
 
162
200
  ---
@@ -452,6 +490,20 @@ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jb
452
490
  interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
453
491
  interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
454
492
 
493
+ # Generation-time interventions + per-token lens trajectories
494
+ interpkit generate gpt2 "I feel" --positive " joy" --negative " fear" --at transformer.h.6 --scale 8
495
+ interpkit generate gpt2 "The capital of France is" --capture lens
496
+
497
+ # Gradient-based circuit discovery
498
+ interpkit atp gpt2 --clean "The capital of France is" --corrupted "The capital of Germany is"
499
+ interpkit eap gpt2 --clean "..." --corrupted "..." --ig-steps 5
500
+ interpkit find-circuit gpt2 --clean "..." --corrupted "..." --method eap --threshold 0.3
501
+
502
+ # Tuned lens + max-activating examples
503
+ interpkit train-tuned-lens gpt2 --corpus-file texts.txt --save lens_dir/
504
+ interpkit lens gpt2 "The capital of France is" --tuned-lens lens_dir/
505
+ interpkit maxact gpt2 --at transformer.h.6.mlp --neuron 42 --texts-file corpus.txt
506
+
455
507
  # Chat / instruct models — applies the tokenizer's chat template automatically
456
508
  interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
457
509
  interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
@@ -522,6 +574,30 @@ model.trace(input_a, input_b, top_k=10)
522
574
 
523
575
  ---
524
576
 
577
+ ## Known limitations
578
+
579
+ - **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
580
+ HuggingFace transformers' relative-position-bias path triggers on
581
+ forward hooks for any DeBERTa-v3 model (e.g.
582
+ `microsoft/deberta-v3-small`). interpkit detects this at load time
583
+ and gates `trace`, `decompose`, `attribute`, `head_activations`,
584
+ `steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
585
+ `OperationNotSupportedForArchitecture` rather than the cryptic
586
+ upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
587
+ `attention` still work. Use `bert`, `roberta`, `electra`, or
588
+ `albert` for the gated ops on encoder-only inputs.
589
+
590
+ - **Integrated-gradients completeness on some modern decoders.** On
591
+ Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
592
+ sum does not converge to model-output completeness even at large
593
+ `n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
594
+ reliable as a token-importance **ranking** but cannot be interpreted as
595
+ additive contribution **magnitudes** on these models. `attribute()`
596
+ reports this programmatically: `result["interpretation"]` is
597
+ `"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
598
+ which are saliency methods), versus `"quantitative"` when IG completeness
599
+ holds. Branch on that field rather than parsing the warning text.
600
+
525
601
  ## Examples
526
602
 
527
603
  See the [`examples/`](examples/) directory for Jupyter notebooks:
@@ -538,6 +614,8 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
538
614
  | `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
539
615
  | `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
540
616
  | `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
617
+ | `11_generation_interventions` | Steering/ablation active across every decode step, per-token lens trajectories, positional interventions, `model.intervene()` |
618
+ | `12_circuit_discovery_and_lenses` | Attribution Patching, Edge Attribution Patching, EAP-driven `find_circuit`, tuned lens, max-activating examples |
541
619
 
542
620
  ---
543
621
 
@@ -12,27 +12,55 @@
12
12
 
13
13
  Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
14
14
 
15
- InterpKit provides a single, consistent interface for mech interp operations across any HuggingFace model — transformers, SSMs, vision models, and more — with zero annotation required.
15
+ InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
16
16
 
17
17
  ---
18
18
 
19
19
  ## Install
20
20
 
21
+ We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
22
+
23
+ Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
24
+
21
25
  ```bash
26
+ uv venv --python 3.11
27
+ source .venv/bin/activate
28
+ uv pip install interpkit
29
+
30
+ # For linear probe support:
31
+ uv pip install "interpkit[probe]"
32
+ ```
33
+
34
+ Or with plain `venv` + `pip`:
35
+
36
+ ```bash
37
+ python3.11 -m venv .venv
38
+ source .venv/bin/activate
22
39
  pip install interpkit
23
40
 
24
41
  # For linear probe support:
25
- pip install interpkit[probe]
42
+ pip install "interpkit[probe]"
43
+ ```
44
+
45
+ Or with `conda`:
46
+
47
+ ```bash
48
+ conda create -n interpkit python=3.11 -y
49
+ conda activate interpkit
50
+ pip install interpkit
26
51
  ```
27
52
 
28
- Or install from source for development:
53
+ Installing from source for development:
29
54
 
30
55
  ```bash
31
56
  git clone https://github.com/z4nix/interpkit.git
32
57
  cd interpkit
33
- pip install -e ".[dev]"
58
+ uv venv --python 3.11 && source .venv/bin/activate
59
+ uv pip install -e ".[dev]"
34
60
  ```
35
61
 
62
+ > Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
63
+
36
64
  ---
37
65
 
38
66
  ## Quickstart
@@ -108,7 +136,13 @@ See [examples/10_chat_models.ipynb](examples/10_chat_models.ipynb) for a full wa
108
136
  | **`ov_scores`** | OV circuit analysis — W_OV matrix per head | Transformers |
109
137
  | **`qk_scores`** | QK circuit analysis — W_QK matrix per head | Transformers |
110
138
  | **`composition`** | Q/K/V composition scores between heads in two layers | Transformers |
111
- | **`find_circuit`** | Automated circuit discovery via iterative ablation | Transformers |
139
+ | **`find_circuit`** | Automated circuit discovery via iterative ablation or EAP-based selection with causal verification | Transformers |
140
+ | **`generate`** | Generation with interventions active across every decode step + per-token lens capture | Generative LMs |
141
+ | **`intervene`** | Context manager applying steer/ablate/patch interventions to any op | Any model |
142
+ | **`atp`** | Attribution Patching — first-order patch-effect scores for all modules in 3 passes | Any model |
143
+ | **`eap`** | Edge Attribution Patching — gradient-based component → residual-stream edge scores (EAP-IG via `ig_steps`) | Causal LMs |
144
+ | **`train_tuned_lens`** | Train per-layer tuned-lens translators (Belrose et al. 2023); use via `lens(kind="tuned")` | LMs |
145
+ | **`max_activating`** | Scan a corpus for the examples that most activate a neuron / SAE feature / head | Any model |
112
146
  | **`batch`** | Run any operation over a dataset with result aggregation | Any model |
113
147
 
114
148
  ---
@@ -404,6 +438,20 @@ interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae jb
404
438
  interpkit features gpt2 "The capital of France is" --at transformer.h.8 --sae ./my_sae.safetensors
405
439
  interpkit dla gpt2 "The capital of France is" --sae jbloom/GPT2-Small-SAEs-Reformatted --sae-at transformer.h.11.attn
406
440
 
441
+ # Generation-time interventions + per-token lens trajectories
442
+ interpkit generate gpt2 "I feel" --positive " joy" --negative " fear" --at transformer.h.6 --scale 8
443
+ interpkit generate gpt2 "The capital of France is" --capture lens
444
+
445
+ # Gradient-based circuit discovery
446
+ interpkit atp gpt2 --clean "The capital of France is" --corrupted "The capital of Germany is"
447
+ interpkit eap gpt2 --clean "..." --corrupted "..." --ig-steps 5
448
+ interpkit find-circuit gpt2 --clean "..." --corrupted "..." --method eap --threshold 0.3
449
+
450
+ # Tuned lens + max-activating examples
451
+ interpkit train-tuned-lens gpt2 --corpus-file texts.txt --save lens_dir/
452
+ interpkit lens gpt2 "The capital of France is" --tuned-lens lens_dir/
453
+ interpkit maxact gpt2 --at transformer.h.6.mlp --neuron 42 --texts-file corpus.txt
454
+
407
455
  # Chat / instruct models — applies the tokenizer's chat template automatically
408
456
  interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "Write a haiku about cats." --max-new-tokens 64
409
457
  interpkit chat HuggingFaceTB/SmolLM2-360M-Instruct "What is 2+2?" --system "You are terse." --show-prompt
@@ -474,6 +522,30 @@ model.trace(input_a, input_b, top_k=10)
474
522
 
475
523
  ---
476
524
 
525
+ ## Known limitations
526
+
527
+ - **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
528
+ HuggingFace transformers' relative-position-bias path triggers on
529
+ forward hooks for any DeBERTa-v3 model (e.g.
530
+ `microsoft/deberta-v3-small`). interpkit detects this at load time
531
+ and gates `trace`, `decompose`, `attribute`, `head_activations`,
532
+ `steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
533
+ `OperationNotSupportedForArchitecture` rather than the cryptic
534
+ upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
535
+ `attention` still work. Use `bert`, `roberta`, `electra`, or
536
+ `albert` for the gated ops on encoder-only inputs.
537
+
538
+ - **Integrated-gradients completeness on some modern decoders.** On
539
+ Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
540
+ sum does not converge to model-output completeness even at large
541
+ `n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
542
+ reliable as a token-importance **ranking** but cannot be interpreted as
543
+ additive contribution **magnitudes** on these models. `attribute()`
544
+ reports this programmatically: `result["interpretation"]` is
545
+ `"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
546
+ which are saliency methods), versus `"quantitative"` when IG completeness
547
+ holds. Branch on that field rather than parsing the warning text.
548
+
477
549
  ## Examples
478
550
 
479
551
  See the [`examples/`](examples/) directory for Jupyter notebooks:
@@ -490,6 +562,8 @@ See the [`examples/`](examples/) directory for Jupyter notebooks:
490
562
  | `08_dla_and_circuits` | DLA, head activations, residual decomposition, OV/QK analysis, composition, circuit discovery |
491
563
  | `09_scan_and_batch` | Auto-scan, batch operations, dataset workflows |
492
564
  | `10_chat_models` | Chat-template handling, `model.chat()`, message-list inputs, chat-style steering |
565
+ | `11_generation_interventions` | Steering/ablation active across every decode step, per-token lens trajectories, positional interventions, `model.intervene()` |
566
+ | `12_circuit_discovery_and_lenses` | Attribution Patching, Edge Attribution Patching, EAP-driven `find_circuit`, tuned lens, max-activating examples |
493
567
 
494
568
  ---
495
569
 
@@ -0,0 +1,84 @@
1
+ """interpkit — mech interp for any HuggingFace model."""
2
+
3
+ from interpkit.core.arch import (
4
+ ArchFamily,
5
+ ArchInfo,
6
+ BlockSpec,
7
+ LayerInfo,
8
+ ModuleInfo,
9
+ resolve_arch,
10
+ )
11
+ from interpkit.core.exceptions import (
12
+ ArchitectureNotSupported,
13
+ AttentionBackendUnavailable,
14
+ InterpkitError,
15
+ LensPipelineMismatch,
16
+ OperationNotSupportedForArchitecture,
17
+ WrongInputType,
18
+ )
19
+ from interpkit.core.interventions import (
20
+ AblateIntervention,
21
+ CaptureProbe,
22
+ FnIntervention,
23
+ GenerationContext,
24
+ Intervention,
25
+ PatchIntervention,
26
+ SteerIntervention,
27
+ apply_interventions,
28
+ )
29
+ from interpkit.core.loader import load, load_module
30
+ from interpkit.core.model import Model
31
+ from interpkit.core.registry import register
32
+ from interpkit.core.tl_compat import (
33
+ list_roundtrippable_hooks,
34
+ list_tl_hooks,
35
+ to_native_name,
36
+ to_tl_name,
37
+ )
38
+
39
+
40
+ def diff(model_a, model_b, input_data, *, save=None):
41
+ """Compare activations between two models on the same input."""
42
+ from interpkit.ops.diff import run_diff
43
+
44
+ return run_diff(model_a, model_b, input_data, save=save)
45
+
46
+
47
+ __all__ = [
48
+ # Loaders
49
+ "load",
50
+ "load_module",
51
+ "Model",
52
+ # Architecture types
53
+ "ArchInfo",
54
+ "ArchFamily",
55
+ "BlockSpec",
56
+ "resolve_arch",
57
+ # Per-layer structural types
58
+ "LayerInfo",
59
+ "ModuleInfo",
60
+ # Exception types
61
+ "InterpkitError",
62
+ "ArchitectureNotSupported",
63
+ "AttentionBackendUnavailable",
64
+ "LensPipelineMismatch",
65
+ "OperationNotSupportedForArchitecture",
66
+ "WrongInputType",
67
+ # Interventions
68
+ "Intervention",
69
+ "SteerIntervention",
70
+ "AblateIntervention",
71
+ "PatchIntervention",
72
+ "FnIntervention",
73
+ "CaptureProbe",
74
+ "GenerationContext",
75
+ "apply_interventions",
76
+ # Operations
77
+ "register",
78
+ "diff",
79
+ # TL compat
80
+ "to_tl_name",
81
+ "to_native_name",
82
+ "list_tl_hooks",
83
+ "list_roundtrippable_hooks",
84
+ ]
@@ -1,18 +1,22 @@
1
1
  """Entry point so ``python -m interpkit`` invokes the Typer CLI.
2
2
 
3
- Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:app"``
3
+ Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:run"``
4
4
  console script declared in :file:`pyproject.toml`, so users without the
5
5
  console script on their ``$PATH`` (e.g. just-installed in a fresh
6
6
  environment, vendored copies, ad-hoc subprocess invocations) can still
7
7
  reach every CLI command via ``python -m interpkit ...``.
8
8
  """
9
9
 
10
- from interpkit.cli.main import app
10
+ from interpkit.cli.main import run
11
11
 
12
12
 
13
13
  def main() -> None:
14
- """Invoke the Typer app — separate function makes patching easier in tests."""
15
- app()
14
+ """Invoke the CLI — separate function makes patching easier in tests.
15
+
16
+ Uses ``run`` (not ``app`` directly) so interpkit's fail-loud errors are
17
+ rendered as clean one-line messages instead of tracebacks.
18
+ """
19
+ run()
16
20
 
17
21
 
18
22
  if __name__ == "__main__":