interpkit 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {interpkit-0.4.0 → interpkit-0.5.0}/PKG-INFO +60 -6
- {interpkit-0.4.0 → interpkit-0.5.0}/README.md +56 -4
- interpkit-0.5.0/interpkit/__init__.py +65 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/__main__.py +8 -4
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/cli/main.py +164 -4
- interpkit-0.5.0/interpkit/core/arch/__init__.py +102 -0
- interpkit-0.5.0/interpkit/core/arch/blocks.py +257 -0
- interpkit-0.5.0/interpkit/core/arch/family.py +421 -0
- interpkit-0.5.0/interpkit/core/arch/heads.py +583 -0
- interpkit-0.5.0/interpkit/core/arch/layers.py +462 -0
- interpkit-0.5.0/interpkit/core/arch/names.py +60 -0
- interpkit-0.5.0/interpkit/core/arch/probe.py +241 -0
- interpkit-0.5.0/interpkit/core/arch/residual.py +653 -0
- interpkit-0.5.0/interpkit/core/arch/resolve.py +679 -0
- interpkit-0.5.0/interpkit/core/arch/tree.py +190 -0
- interpkit-0.5.0/interpkit/core/arch/types.py +486 -0
- interpkit-0.5.0/interpkit/core/enums.py +105 -0
- interpkit-0.5.0/interpkit/core/exceptions.py +83 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/core/html.py +5 -2
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/core/inputs.py +44 -0
- interpkit-0.5.0/interpkit/core/loader.py +704 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/core/model.py +388 -34
- interpkit-0.5.0/interpkit/core/paths.py +71 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/core/render.py +63 -7
- interpkit-0.5.0/interpkit/core/support_matrix.py +690 -0
- interpkit-0.5.0/interpkit/core/tl_compat.py +297 -0
- interpkit-0.5.0/interpkit/ops/_atp.py +182 -0
- interpkit-0.5.0/interpkit/ops/_hooks.py +233 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/ablate.py +14 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/activations.py +9 -1
- interpkit-0.5.0/interpkit/ops/attention.py +334 -0
- interpkit-0.5.0/interpkit/ops/attribute.py +844 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/circuits.py +219 -108
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/diff.py +22 -2
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/dla.py +309 -190
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/find_circuit.py +6 -12
- interpkit-0.5.0/interpkit/ops/heads.py +282 -0
- interpkit-0.5.0/interpkit/ops/lens.py +397 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/patch.py +113 -22
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/probe.py +14 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/sae.py +142 -22
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/steer.py +11 -0
- interpkit-0.5.0/interpkit/ops/trace.py +502 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit.egg-info/PKG-INFO +60 -6
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit.egg-info/SOURCES.txt +27 -2
- interpkit-0.5.0/interpkit.egg-info/entry_points.txt +2 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit.egg-info/requires.txt +3 -1
- {interpkit-0.4.0 → interpkit-0.5.0}/pyproject.toml +27 -4
- interpkit-0.5.0/tests/test_archinfo_serialization.py +61 -0
- interpkit-0.5.0/tests/test_attention.py +112 -0
- interpkit-0.5.0/tests/test_audit_regressions.py +1891 -0
- interpkit-0.5.0/tests/test_cache_invalidation.py +66 -0
- interpkit-0.5.0/tests/test_capabilities.py +227 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_cli.py +77 -1
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_discovery.py +1 -1
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_discovery_units.py +21 -21
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_error_handling.py +11 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_invariants.py +22 -8
- interpkit-0.5.0/tests/test_lens.py +53 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_load_params.py +12 -2
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_multi_arch.py +12 -5
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_ops.py +6 -1
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_param_variants.py +4 -2
- interpkit-0.5.0/tests/test_phase3_regressions.py +121 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_regressions.py +5 -2
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_render_internals.py +34 -6
- interpkit-0.5.0/tests/test_resolver.py +268 -0
- interpkit-0.5.0/tests/test_resolver_golden.py +131 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_robustness_audit.py +56 -29
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_sae.py +6 -1
- interpkit-0.5.0/tests/test_seq2seq_contract.py +119 -0
- interpkit-0.5.0/tests/test_trace.py +76 -0
- interpkit-0.5.0/tests/test_validation.py +130 -0
- interpkit-0.4.0/interpkit/__init__.py +0 -27
- interpkit-0.4.0/interpkit/core/discovery.py +0 -810
- interpkit-0.4.0/interpkit/core/loader.py +0 -322
- interpkit-0.4.0/interpkit/core/tl_compat.py +0 -174
- interpkit-0.4.0/interpkit/ops/attention.py +0 -365
- interpkit-0.4.0/interpkit/ops/attribute.py +0 -377
- interpkit-0.4.0/interpkit/ops/heads.py +0 -175
- interpkit-0.4.0/interpkit/ops/lens.py +0 -243
- interpkit-0.4.0/interpkit/ops/trace.py +0 -349
- interpkit-0.4.0/interpkit.egg-info/entry_points.txt +0 -2
- interpkit-0.4.0/tests/test_attention.py +0 -44
- interpkit-0.4.0/tests/test_lens.py +0 -25
- interpkit-0.4.0/tests/test_trace.py +0 -35
- {interpkit-0.4.0 → interpkit-0.5.0}/LICENSE +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/cli/__init__.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/core/__init__.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/core/cache.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/core/plot.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/core/registry.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/core/theme.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/__init__.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/batch.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/inspect.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/report.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit/ops/scan.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit.egg-info/dependency_links.txt +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/interpkit.egg-info/top_level.txt +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/setup.cfg +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_ablate.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_activations.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_architectures.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_attribute.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_cache.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_chat.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_diff.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_html.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_inputs.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_inspect.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_patch.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_plot_internals.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_plots.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_probe.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_registry.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_steer.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_tl_compat.py +0 -0
- {interpkit-0.4.0 → interpkit-0.5.0}/tests/test_tl_ops.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: interpkit
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Mech interp for any HuggingFace model.
|
|
5
5
|
Author: Davide Zani
|
|
6
6
|
License-Expression: MIT
|
|
@@ -20,7 +20,8 @@ Requires-Python: >=3.10
|
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: torch>=2.1
|
|
23
|
-
Requires-Dist:
|
|
23
|
+
Requires-Dist: numpy>=1.24
|
|
24
|
+
Requires-Dist: transformers<6,>=4.36
|
|
24
25
|
Requires-Dist: safetensors>=0.4
|
|
25
26
|
Requires-Dist: rich>=13.0
|
|
26
27
|
Requires-Dist: rich-gradient>=0.3
|
|
@@ -28,6 +29,7 @@ Requires-Dist: typer>=0.9
|
|
|
28
29
|
Requires-Dist: Pillow>=10.0
|
|
29
30
|
Requires-Dist: matplotlib>=3.8
|
|
30
31
|
Requires-Dist: huggingface-hub>=0.20
|
|
32
|
+
Requires-Dist: sentencepiece>=0.1.99
|
|
31
33
|
Provides-Extra: vision
|
|
32
34
|
Requires-Dist: torchvision>=0.16; extra == "vision"
|
|
33
35
|
Provides-Extra: probe
|
|
@@ -60,27 +62,55 @@ Dynamic: license-file
|
|
|
60
62
|
|
|
61
63
|
Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
|
|
62
64
|
|
|
63
|
-
InterpKit provides a single, consistent interface for mech interp operations across
|
|
65
|
+
InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
|
|
64
66
|
|
|
65
67
|
---
|
|
66
68
|
|
|
67
69
|
## Install
|
|
68
70
|
|
|
71
|
+
We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
|
|
72
|
+
|
|
73
|
+
Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
uv venv --python 3.11
|
|
77
|
+
source .venv/bin/activate
|
|
78
|
+
uv pip install interpkit
|
|
79
|
+
|
|
80
|
+
# For linear probe support:
|
|
81
|
+
uv pip install "interpkit[probe]"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Or with plain `venv` + `pip`:
|
|
85
|
+
|
|
69
86
|
```bash
|
|
87
|
+
python3.11 -m venv .venv
|
|
88
|
+
source .venv/bin/activate
|
|
70
89
|
pip install interpkit
|
|
71
90
|
|
|
72
91
|
# For linear probe support:
|
|
73
|
-
pip install interpkit[probe]
|
|
92
|
+
pip install "interpkit[probe]"
|
|
74
93
|
```
|
|
75
94
|
|
|
76
|
-
Or
|
|
95
|
+
Or with `conda`:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
conda create -n interpkit python=3.11 -y
|
|
99
|
+
conda activate interpkit
|
|
100
|
+
pip install interpkit
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Installing from source for development:
|
|
77
104
|
|
|
78
105
|
```bash
|
|
79
106
|
git clone https://github.com/z4nix/interpkit.git
|
|
80
107
|
cd interpkit
|
|
81
|
-
|
|
108
|
+
uv venv --python 3.11 && source .venv/bin/activate
|
|
109
|
+
uv pip install -e ".[dev]"
|
|
82
110
|
```
|
|
83
111
|
|
|
112
|
+
> Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
|
|
113
|
+
|
|
84
114
|
---
|
|
85
115
|
|
|
86
116
|
## Quickstart
|
|
@@ -522,6 +552,30 @@ model.trace(input_a, input_b, top_k=10)
|
|
|
522
552
|
|
|
523
553
|
---
|
|
524
554
|
|
|
555
|
+
## Known limitations
|
|
556
|
+
|
|
557
|
+
- **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
|
|
558
|
+
HuggingFace transformers' relative-position-bias path triggers on
|
|
559
|
+
forward hooks for any DeBERTa-v3 model (e.g.
|
|
560
|
+
`microsoft/deberta-v3-small`). interpkit detects this at load time
|
|
561
|
+
and gates `trace`, `decompose`, `attribute`, `head_activations`,
|
|
562
|
+
`steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
|
|
563
|
+
`OperationNotSupportedForArchitecture` rather than the cryptic
|
|
564
|
+
upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
|
|
565
|
+
`attention` still work. Use `bert`, `roberta`, `electra`, or
|
|
566
|
+
`albert` for the gated ops on encoder-only inputs.
|
|
567
|
+
|
|
568
|
+
- **Integrated-gradients completeness on some modern decoders.** On
|
|
569
|
+
Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
|
|
570
|
+
sum does not converge to model-output completeness even at large
|
|
571
|
+
`n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
|
|
572
|
+
reliable as a token-importance **ranking** but cannot be interpreted as
|
|
573
|
+
additive contribution **magnitudes** on these models. `attribute()`
|
|
574
|
+
reports this programmatically: `result["interpretation"]` is
|
|
575
|
+
`"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
|
|
576
|
+
which are saliency methods), versus `"quantitative"` when IG completeness
|
|
577
|
+
holds. Branch on that field rather than parsing the warning text.
|
|
578
|
+
|
|
525
579
|
## Examples
|
|
526
580
|
|
|
527
581
|
See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
@@ -12,27 +12,55 @@
|
|
|
12
12
|
|
|
13
13
|
Mechanistic interpretability tooling today is fragmented. Each library supports a narrow set of architectures, and moving to a different model family usually means rewriting hook code from scratch.
|
|
14
14
|
|
|
15
|
-
InterpKit provides a single, consistent interface for mech interp operations across
|
|
15
|
+
InterpKit provides a single, consistent interface for mech interp operations across a wide range of HuggingFace models — transformers, SSMs, vision models, and more — with automatic architecture discovery and little to no manual setup.
|
|
16
16
|
|
|
17
17
|
---
|
|
18
18
|
|
|
19
19
|
## Install
|
|
20
20
|
|
|
21
|
+
We strongly recommend installing into an isolated environment so InterpKit's pinned dependencies (e.g. `typer`, `rich`, `transformers`) don't clash with whatever you already have installed globally.
|
|
22
|
+
|
|
23
|
+
Using [uv](https://docs.astral.sh/uv/) (recommended — fast, handles Python versions for you):
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
uv venv --python 3.11
|
|
27
|
+
source .venv/bin/activate
|
|
28
|
+
uv pip install interpkit
|
|
29
|
+
|
|
30
|
+
# For linear probe support:
|
|
31
|
+
uv pip install "interpkit[probe]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Or with plain `venv` + `pip`:
|
|
35
|
+
|
|
21
36
|
```bash
|
|
37
|
+
python3.11 -m venv .venv
|
|
38
|
+
source .venv/bin/activate
|
|
22
39
|
pip install interpkit
|
|
23
40
|
|
|
24
41
|
# For linear probe support:
|
|
25
|
-
pip install interpkit[probe]
|
|
42
|
+
pip install "interpkit[probe]"
|
|
26
43
|
```
|
|
27
44
|
|
|
28
|
-
Or
|
|
45
|
+
Or with `conda`:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
conda create -n interpkit python=3.11 -y
|
|
49
|
+
conda activate interpkit
|
|
50
|
+
pip install interpkit
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Installing from source for development:
|
|
29
54
|
|
|
30
55
|
```bash
|
|
31
56
|
git clone https://github.com/z4nix/interpkit.git
|
|
32
57
|
cd interpkit
|
|
33
|
-
|
|
58
|
+
uv venv --python 3.11 && source .venv/bin/activate
|
|
59
|
+
uv pip install -e ".[dev]"
|
|
34
60
|
```
|
|
35
61
|
|
|
62
|
+
> Python 3.10+ is required. If you must install into your system Python, use `pip install --user interpkit` and be aware that conflicting versions of `typer`, `rich`, or `transformers` already on your machine can break the CLI.
|
|
63
|
+
|
|
36
64
|
---
|
|
37
65
|
|
|
38
66
|
## Quickstart
|
|
@@ -474,6 +502,30 @@ model.trace(input_a, input_b, top_k=10)
|
|
|
474
502
|
|
|
475
503
|
---
|
|
476
504
|
|
|
505
|
+
## Known limitations
|
|
506
|
+
|
|
507
|
+
- **DeBERTa-v3 (DisentangledSelfAttention).** A known broadcast bug in
|
|
508
|
+
HuggingFace transformers' relative-position-bias path triggers on
|
|
509
|
+
forward hooks for any DeBERTa-v3 model (e.g.
|
|
510
|
+
`microsoft/deberta-v3-small`). interpkit detects this at load time
|
|
511
|
+
and gates `trace`, `decompose`, `attribute`, `head_activations`,
|
|
512
|
+
`steer`, `probe`, `diff`, `ov_scores`, `qk_scores` with a clean
|
|
513
|
+
`OperationNotSupportedForArchitecture` rather than the cryptic
|
|
514
|
+
upstream `RuntimeError: tensor (512) must match (7)`. `lens` and
|
|
515
|
+
`attention` still work. Use `bert`, `roberta`, `electra`, or
|
|
516
|
+
`albert` for the gated ops on encoder-only inputs.
|
|
517
|
+
|
|
518
|
+
- **Integrated-gradients completeness on some modern decoders.** On
|
|
519
|
+
Qwen2/Qwen2.5/Qwen3 and SmolLM-family models, the trapezoidal Riemann
|
|
520
|
+
sum does not converge to model-output completeness even at large
|
|
521
|
+
`n_steps` (the P0b/N-008 empirical finding). Per-token IG scores remain
|
|
522
|
+
reliable as a token-importance **ranking** but cannot be interpreted as
|
|
523
|
+
additive contribution **magnitudes** on these models. `attribute()`
|
|
524
|
+
reports this programmatically: `result["interpretation"]` is
|
|
525
|
+
`"ranking_only"` in that case (and for `gradient` / `gradient_x_input`,
|
|
526
|
+
which are saliency methods), versus `"quantitative"` when IG completeness
|
|
527
|
+
holds. Branch on that field rather than parsing the warning text.
|
|
528
|
+
|
|
477
529
|
## Examples
|
|
478
530
|
|
|
479
531
|
See the [`examples/`](examples/) directory for Jupyter notebooks:
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""interpkit — mech interp for any HuggingFace model."""
|
|
2
|
+
|
|
3
|
+
from interpkit.core.arch import (
|
|
4
|
+
ArchFamily,
|
|
5
|
+
ArchInfo,
|
|
6
|
+
BlockSpec,
|
|
7
|
+
LayerInfo,
|
|
8
|
+
ModuleInfo,
|
|
9
|
+
resolve_arch,
|
|
10
|
+
)
|
|
11
|
+
from interpkit.core.exceptions import (
|
|
12
|
+
ArchitectureNotSupported,
|
|
13
|
+
AttentionBackendUnavailable,
|
|
14
|
+
InterpkitError,
|
|
15
|
+
LensPipelineMismatch,
|
|
16
|
+
OperationNotSupportedForArchitecture,
|
|
17
|
+
WrongInputType,
|
|
18
|
+
)
|
|
19
|
+
from interpkit.core.loader import load, load_module
|
|
20
|
+
from interpkit.core.model import Model
|
|
21
|
+
from interpkit.core.registry import register
|
|
22
|
+
from interpkit.core.tl_compat import (
|
|
23
|
+
list_roundtrippable_hooks,
|
|
24
|
+
list_tl_hooks,
|
|
25
|
+
to_native_name,
|
|
26
|
+
to_tl_name,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def diff(model_a, model_b, input_data, *, save=None):
|
|
31
|
+
"""Compare activations between two models on the same input."""
|
|
32
|
+
from interpkit.ops.diff import run_diff
|
|
33
|
+
|
|
34
|
+
return run_diff(model_a, model_b, input_data, save=save)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
# Loaders
|
|
39
|
+
"load",
|
|
40
|
+
"load_module",
|
|
41
|
+
"Model",
|
|
42
|
+
# Architecture types
|
|
43
|
+
"ArchInfo",
|
|
44
|
+
"ArchFamily",
|
|
45
|
+
"BlockSpec",
|
|
46
|
+
"resolve_arch",
|
|
47
|
+
# Per-layer structural types
|
|
48
|
+
"LayerInfo",
|
|
49
|
+
"ModuleInfo",
|
|
50
|
+
# Exception types
|
|
51
|
+
"InterpkitError",
|
|
52
|
+
"ArchitectureNotSupported",
|
|
53
|
+
"AttentionBackendUnavailable",
|
|
54
|
+
"LensPipelineMismatch",
|
|
55
|
+
"OperationNotSupportedForArchitecture",
|
|
56
|
+
"WrongInputType",
|
|
57
|
+
# Operations
|
|
58
|
+
"register",
|
|
59
|
+
"diff",
|
|
60
|
+
# TL compat
|
|
61
|
+
"to_tl_name",
|
|
62
|
+
"to_native_name",
|
|
63
|
+
"list_tl_hooks",
|
|
64
|
+
"list_roundtrippable_hooks",
|
|
65
|
+
]
|
|
@@ -1,18 +1,22 @@
|
|
|
1
1
|
"""Entry point so ``python -m interpkit`` invokes the Typer CLI.
|
|
2
2
|
|
|
3
|
-
Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:
|
|
3
|
+
Mirrors the ``[project.scripts] interpkit = "interpkit.cli.main:run"``
|
|
4
4
|
console script declared in :file:`pyproject.toml`, so users without the
|
|
5
5
|
console script on their ``$PATH`` (e.g. just-installed in a fresh
|
|
6
6
|
environment, vendored copies, ad-hoc subprocess invocations) can still
|
|
7
7
|
reach every CLI command via ``python -m interpkit ...``.
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
|
-
from interpkit.cli.main import
|
|
10
|
+
from interpkit.cli.main import run
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def main() -> None:
|
|
14
|
-
"""Invoke the
|
|
15
|
-
|
|
14
|
+
"""Invoke the CLI — separate function makes patching easier in tests.
|
|
15
|
+
|
|
16
|
+
Uses ``run`` (not ``app`` directly) so interpkit's fail-loud errors are
|
|
17
|
+
rendered as clean one-line messages instead of tracebacks.
|
|
18
|
+
"""
|
|
19
|
+
run()
|
|
16
20
|
|
|
17
21
|
|
|
18
22
|
if __name__ == "__main__":
|
|
@@ -1,8 +1,17 @@
|
|
|
1
|
-
"""CLI entry point — Typer app with all interpkit commands.
|
|
1
|
+
"""CLI entry point — Typer app with all interpkit commands.
|
|
2
|
+
|
|
3
|
+
When ``--format json`` is set, all status / progress output (rich panels,
|
|
4
|
+
load progress bars, tqdm) is silenced or routed to stderr (F-023). The
|
|
5
|
+
stdout stream stays clean JSON for programmatic consumers — pre-1.0
|
|
6
|
+
``--format json`` interleaved rich panels and tqdm bars with the JSON
|
|
7
|
+
block, breaking ``json.loads(p.stdout)`` for every CLI invocation.
|
|
8
|
+
"""
|
|
2
9
|
|
|
3
10
|
from __future__ import annotations
|
|
4
11
|
|
|
5
12
|
import json as _json
|
|
13
|
+
import os as _os
|
|
14
|
+
import sys as _sys
|
|
6
15
|
from importlib.metadata import version as _pkg_version
|
|
7
16
|
|
|
8
17
|
import typer
|
|
@@ -33,11 +42,85 @@ app = typer.Typer(
|
|
|
33
42
|
no_args_is_help=False,
|
|
34
43
|
add_completion=False,
|
|
35
44
|
rich_markup_mode="rich",
|
|
45
|
+
# interpkit's own errors (OperationNotSupportedForArchitecture,
|
|
46
|
+
# WrongInputType, LensPipelineMismatch, …) are deliberate, well-messaged,
|
|
47
|
+
# user-facing failures — not bugs. Disable Typer's rich-traceback so they
|
|
48
|
+
# don't reach the user as a scary stack trace; ``run()`` renders them as a
|
|
49
|
+
# clean one-line error instead.
|
|
50
|
+
pretty_exceptions_enable=False,
|
|
36
51
|
)
|
|
52
|
+
# F-023: console object — production code should call _make_console() so
|
|
53
|
+
# JSON-mode stderr routing happens uniformly. The module-level singleton
|
|
54
|
+
# is reassigned by main() once --format is parsed.
|
|
37
55
|
console = Console()
|
|
38
56
|
|
|
39
57
|
_output_format: str = "rich"
|
|
40
58
|
|
|
59
|
+
|
|
60
|
+
def _make_console() -> Console:
|
|
61
|
+
"""Construct a Console that respects the active output format.
|
|
62
|
+
|
|
63
|
+
In ``json`` mode, status / progress output goes to stderr so stdout
|
|
64
|
+
remains clean JSON. In ``rich`` mode, behaves identically to the
|
|
65
|
+
pre-1.0 module-level singleton.
|
|
66
|
+
"""
|
|
67
|
+
if _output_format == "json":
|
|
68
|
+
return Console(file=_sys.stderr)
|
|
69
|
+
return Console()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _silence_third_party_loaders() -> None:
|
|
73
|
+
"""Mute transformers / tqdm / huggingface chatter in JSON mode.
|
|
74
|
+
|
|
75
|
+
Pre-1.0 ``--format json`` had model-loading tqdm bars and the
|
|
76
|
+
"Loaded ... on cpu" rich line interleaved with the actual JSON
|
|
77
|
+
payload (F-023). Programmatic consumers couldn't json.loads(stdout).
|
|
78
|
+
|
|
79
|
+
Also re-binds every op-module console to write to stderr so rich
|
|
80
|
+
op-level rendering doesn't pollute the JSON stream.
|
|
81
|
+
"""
|
|
82
|
+
if _output_format != "json":
|
|
83
|
+
return
|
|
84
|
+
# Silence HF transformers progress / warnings to stderr-only.
|
|
85
|
+
try:
|
|
86
|
+
from transformers import logging as _hf_logging
|
|
87
|
+
_hf_logging.set_verbosity_error()
|
|
88
|
+
_hf_logging.disable_progress_bar()
|
|
89
|
+
except (ImportError, AttributeError):
|
|
90
|
+
pass
|
|
91
|
+
# Silence raw tqdm.
|
|
92
|
+
_os.environ["TRANSFORMERS_VERBOSITY"] = "error"
|
|
93
|
+
_os.environ["TQDM_DISABLE"] = "1"
|
|
94
|
+
_os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
|
|
95
|
+
|
|
96
|
+
# Re-bind op-module consoles to stderr so renders don't pollute stdout.
|
|
97
|
+
import importlib
|
|
98
|
+
|
|
99
|
+
_stderr_console = Console(file=_sys.stderr)
|
|
100
|
+
for mod_name in (
|
|
101
|
+
"interpkit.core.render",
|
|
102
|
+
"interpkit.core.plot",
|
|
103
|
+
"interpkit.ops.attention",
|
|
104
|
+
"interpkit.ops.attribute",
|
|
105
|
+
"interpkit.ops.batch",
|
|
106
|
+
"interpkit.ops.circuits",
|
|
107
|
+
"interpkit.ops.diff",
|
|
108
|
+
"interpkit.ops.find_circuit",
|
|
109
|
+
"interpkit.ops.lens",
|
|
110
|
+
"interpkit.ops.probe",
|
|
111
|
+
"interpkit.ops.report",
|
|
112
|
+
"interpkit.ops.sae",
|
|
113
|
+
"interpkit.ops.scan",
|
|
114
|
+
"interpkit.ops.steer",
|
|
115
|
+
"interpkit.ops.trace",
|
|
116
|
+
):
|
|
117
|
+
try:
|
|
118
|
+
mod = importlib.import_module(mod_name)
|
|
119
|
+
if hasattr(mod, "console"):
|
|
120
|
+
mod.console = _stderr_console # type: ignore[attr-defined]
|
|
121
|
+
except ImportError:
|
|
122
|
+
continue
|
|
123
|
+
|
|
41
124
|
_VERSION = _pkg_version("interpkit")
|
|
42
125
|
|
|
43
126
|
|
|
@@ -63,8 +146,17 @@ def _load_model(
|
|
|
63
146
|
):
|
|
64
147
|
from interpkit.core.model import load
|
|
65
148
|
|
|
149
|
+
# F-007 fix: don't forward dtype=None — load() now requires explicit
|
|
150
|
+
# dtype. Defer to its built-in default (fp32) when the CLI user didn't
|
|
151
|
+
# specify --dtype.
|
|
152
|
+
kwargs: dict = {"device": device}
|
|
153
|
+
if dtype is not None:
|
|
154
|
+
kwargs["dtype"] = dtype
|
|
155
|
+
if device_map is not None:
|
|
156
|
+
kwargs["device_map"] = device_map
|
|
157
|
+
|
|
66
158
|
with console.status(f" Loading [bold]{model_name}[/bold]..."):
|
|
67
|
-
m = load(model_name,
|
|
159
|
+
m = load(model_name, **kwargs)
|
|
68
160
|
console.print(f" [bold green]Loaded[/bold green] [{ACCENT}]{model_name}[/{ACCENT}] on [bold]{m._device}[/bold]")
|
|
69
161
|
return m
|
|
70
162
|
|
|
@@ -417,8 +509,11 @@ def main(
|
|
|
417
509
|
),
|
|
418
510
|
) -> None:
|
|
419
511
|
"""Mech interp for any HuggingFace model."""
|
|
420
|
-
global _output_format
|
|
512
|
+
global _output_format, console
|
|
421
513
|
_output_format = fmt
|
|
514
|
+
# F-023: re-bind module-level console so it routes to stderr in JSON mode.
|
|
515
|
+
console = _make_console()
|
|
516
|
+
_silence_third_party_loaders()
|
|
422
517
|
if ctx.invoked_subcommand is not None:
|
|
423
518
|
return
|
|
424
519
|
if extensive:
|
|
@@ -526,6 +621,38 @@ def inspect(
|
|
|
526
621
|
) -> None:
|
|
527
622
|
"""Print the model's module tree with types, param counts, and detected roles."""
|
|
528
623
|
m = _load_model(model_name, device=device, dtype=dtype, device_map=device_map)
|
|
624
|
+
if _output_format == "json":
|
|
625
|
+
# F-023: inspect previously ignored --format json. Now emits a
|
|
626
|
+
# structured JSON description of the architecture.
|
|
627
|
+
arch = m.arch_info
|
|
628
|
+
result = {
|
|
629
|
+
"model": model_name,
|
|
630
|
+
"family": arch.family.value if hasattr(arch.family, "value") else str(arch.family),
|
|
631
|
+
"arch_family": arch.arch_family,
|
|
632
|
+
"device": m.device,
|
|
633
|
+
"dtype": str(m.dtype),
|
|
634
|
+
"num_layers": arch.num_layers,
|
|
635
|
+
"hidden_size": arch.hidden_size,
|
|
636
|
+
"num_attention_heads": arch.num_attention_heads,
|
|
637
|
+
"vocab_size": arch.vocab_size,
|
|
638
|
+
"is_encoder_decoder": arch.is_encoder_decoder,
|
|
639
|
+
"spatial": arch.spatial,
|
|
640
|
+
"head_path": arch.head_path,
|
|
641
|
+
"embed_path": arch.embed_path,
|
|
642
|
+
"pre_head_path": arch.pre_head_path,
|
|
643
|
+
"project_out_path": arch.project_out_path,
|
|
644
|
+
"blocks": [
|
|
645
|
+
{"path": b.path, "stage": b.stage,
|
|
646
|
+
"has_attention": b.has_attention, "has_residual": b.has_residual}
|
|
647
|
+
for b in arch.blocks
|
|
648
|
+
],
|
|
649
|
+
"modules": [
|
|
650
|
+
{"name": m.name, "type": m.type_name, "param_count": m.param_count, "role": m.role}
|
|
651
|
+
for m in arch.modules
|
|
652
|
+
],
|
|
653
|
+
}
|
|
654
|
+
_json_dump(result)
|
|
655
|
+
return
|
|
529
656
|
with console.status(" Inspecting model..."):
|
|
530
657
|
m.inspect()
|
|
531
658
|
|
|
@@ -1063,5 +1190,38 @@ def chat(
|
|
|
1063
1190
|
_json_dump({k: v for k, v in result.items() if k not in {"input_ids", "output_ids"}})
|
|
1064
1191
|
|
|
1065
1192
|
|
|
1193
|
+
def run() -> None:
|
|
1194
|
+
"""CLI entry point that renders interpkit's intentional errors cleanly.
|
|
1195
|
+
|
|
1196
|
+
The ``InterpkitError`` family (e.g. ``OperationNotSupportedForArchitecture``,
|
|
1197
|
+
``WrongInputType``, ``LensPipelineMismatch``) is the project's fail-loud
|
|
1198
|
+
contract — these are clear, actionable, user-facing messages, not crashes.
|
|
1199
|
+
Presenting them as a Python traceback undermines that, so we catch them at
|
|
1200
|
+
the boundary and print a single clean line (JSON object in ``--format json``)
|
|
1201
|
+
+ exit non-zero. Unexpected exceptions still propagate as a normal traceback.
|
|
1202
|
+
"""
|
|
1203
|
+
from interpkit.core.exceptions import InterpkitError
|
|
1204
|
+
|
|
1205
|
+
try:
|
|
1206
|
+
app()
|
|
1207
|
+
except (InterpkitError, ValueError, KeyError, IndexError) as exc:
|
|
1208
|
+
# interpkit's user-facing validation failures: unsupported op / wrong
|
|
1209
|
+
# input type (InterpkitError family), empty input (ValueError), unknown
|
|
1210
|
+
# module path (KeyError with a "did you mean" hint), out-of-range
|
|
1211
|
+
# position (ValueError / IndexError). These are clear, actionable
|
|
1212
|
+
# messages — render one line, not a traceback. Genuine internal bugs
|
|
1213
|
+
# raise other types (RuntimeError, TypeError, …) and still surface a
|
|
1214
|
+
# full traceback. ``KeyError.__str__`` wraps the message in quotes, so
|
|
1215
|
+
# pull ``args[0]`` for it.
|
|
1216
|
+
msg = exc.args[0] if (isinstance(exc, KeyError) and exc.args) else str(exc)
|
|
1217
|
+
if _output_format == "json":
|
|
1218
|
+
import json as _json
|
|
1219
|
+
|
|
1220
|
+
print(_json.dumps({"error": type(exc).__name__, "message": str(msg)}))
|
|
1221
|
+
else:
|
|
1222
|
+
Console(file=_sys.stderr).print(f"[bold red]Error:[/bold red] {msg}")
|
|
1223
|
+
raise SystemExit(1) from None
|
|
1224
|
+
|
|
1225
|
+
|
|
1066
1226
|
if __name__ == "__main__":
|
|
1067
|
-
|
|
1227
|
+
run()
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Architecture resolution: one cohesive package.
|
|
2
|
+
|
|
3
|
+
Consolidates what used to be three entangled modules — ``discovery``,
|
|
4
|
+
the ``resolve`` package, and ``residual`` — into a single
|
|
5
|
+
``interpkit.core.arch`` package with one :class:`ArchInfo` contract.
|
|
6
|
+
|
|
7
|
+
Submodule layout:
|
|
8
|
+
|
|
9
|
+
- ``names`` — module-name vocabulary + regexes.
|
|
10
|
+
- ``types`` — ``ArchInfo``, ``ArchFamily``, ``BlockSpec``, ``LayerInfo``, ``ModuleInfo``.
|
|
11
|
+
- ``tree`` — static module-tree primitives + weight extraction.
|
|
12
|
+
- ``probe`` — runtime forward-hook probes.
|
|
13
|
+
- ``family`` — family classification, topology, config parsing.
|
|
14
|
+
- ``blocks`` — block / decoder-block discovery.
|
|
15
|
+
- ``layers`` — per-layer attn/mlp/qkv resolution + role assignment.
|
|
16
|
+
- ``heads`` — head / unembedding / project-out / MLM / pre-head discovery.
|
|
17
|
+
- ``resolve`` — ``resolve_arch`` orchestrator + ``discover`` + overrides.
|
|
18
|
+
- ``residual`` — residual-stream decomposition schemas.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from interpkit.core.arch.names import (
|
|
24
|
+
ALL_QKV_NAMES,
|
|
25
|
+
ATTN_NAMES,
|
|
26
|
+
ATTN_RE,
|
|
27
|
+
FUSED_QKV_NAMES,
|
|
28
|
+
K_PROJ_NAMES,
|
|
29
|
+
MLP_NAMES,
|
|
30
|
+
MLP_RE,
|
|
31
|
+
O_PROJ_NAMES,
|
|
32
|
+
Q_PROJ_NAMES,
|
|
33
|
+
V_PROJ_NAMES,
|
|
34
|
+
names_to_regex,
|
|
35
|
+
)
|
|
36
|
+
from interpkit.core.arch.residual import (
|
|
37
|
+
Component,
|
|
38
|
+
PostLNResidual,
|
|
39
|
+
PreLNResidual,
|
|
40
|
+
ResidualSchema,
|
|
41
|
+
Seq2seqResidual,
|
|
42
|
+
SharedLayerResidual,
|
|
43
|
+
residual_schema_for,
|
|
44
|
+
)
|
|
45
|
+
from interpkit.core.arch.resolve import (
|
|
46
|
+
ARCH_OVERRIDES,
|
|
47
|
+
apply_overrides,
|
|
48
|
+
discover,
|
|
49
|
+
resolve_arch,
|
|
50
|
+
)
|
|
51
|
+
from interpkit.core.arch.tree import (
|
|
52
|
+
canonical_linear_weight,
|
|
53
|
+
extract_proj_weight,
|
|
54
|
+
get_weight,
|
|
55
|
+
module_at_path,
|
|
56
|
+
)
|
|
57
|
+
from interpkit.core.arch.types import (
|
|
58
|
+
ArchFamily,
|
|
59
|
+
ArchInfo,
|
|
60
|
+
BlockSpec,
|
|
61
|
+
LayerInfo,
|
|
62
|
+
ModuleInfo,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
__all__ = [
|
|
66
|
+
# Types
|
|
67
|
+
"ArchInfo",
|
|
68
|
+
"ArchFamily",
|
|
69
|
+
"BlockSpec",
|
|
70
|
+
"LayerInfo",
|
|
71
|
+
"ModuleInfo",
|
|
72
|
+
# Resolution
|
|
73
|
+
"resolve_arch",
|
|
74
|
+
"discover",
|
|
75
|
+
"apply_overrides",
|
|
76
|
+
"ARCH_OVERRIDES",
|
|
77
|
+
# Tree / weight helpers
|
|
78
|
+
"module_at_path",
|
|
79
|
+
"get_weight",
|
|
80
|
+
"extract_proj_weight",
|
|
81
|
+
"canonical_linear_weight",
|
|
82
|
+
# Module-name vocabulary
|
|
83
|
+
"ATTN_NAMES",
|
|
84
|
+
"MLP_NAMES",
|
|
85
|
+
"FUSED_QKV_NAMES",
|
|
86
|
+
"Q_PROJ_NAMES",
|
|
87
|
+
"K_PROJ_NAMES",
|
|
88
|
+
"V_PROJ_NAMES",
|
|
89
|
+
"ALL_QKV_NAMES",
|
|
90
|
+
"O_PROJ_NAMES",
|
|
91
|
+
"ATTN_RE",
|
|
92
|
+
"MLP_RE",
|
|
93
|
+
"names_to_regex",
|
|
94
|
+
# Residual schemas
|
|
95
|
+
"Component",
|
|
96
|
+
"ResidualSchema",
|
|
97
|
+
"PreLNResidual",
|
|
98
|
+
"PostLNResidual",
|
|
99
|
+
"SharedLayerResidual",
|
|
100
|
+
"Seq2seqResidual",
|
|
101
|
+
"residual_schema_for",
|
|
102
|
+
]
|