archscope-0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- archscope/__init__.py +30 -0
- archscope/_utils.py +113 -0
- archscope/attribute.py +201 -0
- archscope/backends.py +236 -0
- archscope/bench.py +262 -0
- archscope/circuits.py +255 -0
- archscope/cli.py +120 -0
- archscope/diff.py +212 -0
- archscope/kazdov_backend.py +141 -0
- archscope/lens.py +304 -0
- archscope/neurons.py +118 -0
- archscope/probes.py +160 -0
- archscope/sae.py +127 -0
- archscope/transfer.py +188 -0
- archscope-0.2.2.dist-info/METADATA +324 -0
- archscope-0.2.2.dist-info/RECORD +20 -0
- archscope-0.2.2.dist-info/WHEEL +5 -0
- archscope-0.2.2.dist-info/entry_points.txt +2 -0
- archscope-0.2.2.dist-info/licenses/LICENSE +17 -0
- archscope-0.2.2.dist-info/top_level.txt +1 -0
archscope/bench.py
ADDED
@@ -0,0 +1,262 @@
"""InterpBench — standardized mechanistic interpretability benchmark.

Run a fixed test suite on any model to get a comparable "interp profile".
Designed to support cross-architecture comparisons (transformer, hybrid, SSM).

Tests:
1. probe_sentiment_auroc — Can we linearly probe pos/neg sentiment?
2. probe_math_auroc — Can we linearly probe math vs non-math?
3. induction_head — Does the model copy A->B in-context?
4. copy_circuit — Can it copy a word list verbatim?
5. concentration — How peaked are next-token predictions?
6. sae_dictionary_quality — Dense SAE reconstruction at mid-layer
7. ssm_state_info — (Mamba only) variance ratio of SSM state across inputs

Output: dataclass `InterpProfile` with all scores, JSON-serializable; it serves
as the model's interp signature.
"""
from __future__ import annotations

import time
from dataclasses import dataclass, field

import torch

from . import circuits, probes, sae
from .backends import Backend


@dataclass
class InterpProfile:
    """Standardized interp test results for one model."""

    model_name: str
    arch_family: str  # "transformer" | "hybrid" | "ssm" | "custom"
    n_params: int
    n_layers: int
    hidden_dim: int

    # Probe tasks (in-arch AUROC at a representative mid-layer)
    probe_sentiment_auroc: float = float("nan")
    probe_math_auroc: float = float("nan")

    # Circuit detection (relative to baseline)
    induction_head_relative: float = float("nan")
    copy_accuracy: float = float("nan")
    concentration_relative: float = float("nan")

    # SAE quality (dense SAE reconstruction at mid-layer)
    sae_dense_recon: float = float("nan")
    sae_rank1_recon: float = float("nan")
    sae_better: str = "—"  # "dense" or "rank1"

    # SSM-specific (NaN for non-SSM)
    ssm_state_variance_ratio: float = float("nan")

    runtime_seconds: float = 0.0
    notes: list = field(default_factory=list)


# ---------- Probe tasks ----------

SENTIMENT_POS = [
    "I love this movie, it's amazing!", "What a wonderful day today.",
    "Best book I have ever read.", "She is so kind and thoughtful.",
    "The food was absolutely delicious.", "I'm thrilled about the great news.",
    "Such a beautiful sunset tonight.", "He always makes me laugh.",
    "Amazing performance overall.", "Truly a delightful experience.",
    "Incredible work, congratulations!", "I admire her dedication.",
    "The concert was breathtaking.", "He is the kindest person I know.",
    "This was pure joy from start.", "I had a wonderful evening.",
]
SENTIMENT_NEG = [
    "I hate this place absolutely.", "What a terrible movie that was.",
    "This is the worst day ever.", "She is mean and selfish.",
    "The food was awful and cold.", "I'm so disappointed with everything.",
    "Such a horrible meeting today.", "He always annoys me deeply.",
    "What an awful experience overall.", "I despise everything about this.",
    "This restaurant is dreadful.", "She is the rudest person here.",
    "The concert was a disaster.", "He never listens to anyone properly.",
    "Truly a miserable evening.", "I regret coming here entirely.",
]
MATH = [
    "Solve for x: 2x + 3 = 11.", "Compute the derivative of x cubed.",
    "The integral from 0 to 1 of x dx.", "Triangle with angles 30 60 90.",
    "The eigenvalue of matrix M is.", "By chain rule d/dx of sin x squared.",
    "Find roots of x squared minus 5x plus 6.", "The Taylor series of e to x.",
    "Prove sum of two evens is even.", "The Cauchy-Schwarz inequality.",
    "Let f be continuous on closed interval.", "Define the limit as n grows large.",
    "By induction on the natural numbers.", "The dot product of u and v.",
    "A group is abelian if commutative.", "Fundamental theorem of calculus.",
]
NONMATH = [
    "The cat sat on the mat softly.", "Music has the power to move us.",
    "Mountains stretch to the horizon.", "She whispered her secret carefully.",
    "Birds sing at dawn every morning.", "The chef prepared dinner slowly.",
    "Children laughed in the city park.", "Rain pattered against the window.",
    "Coffee aroma filled the kitchen.", "Time passes faster when busy.",
    "Stars appeared in the dark sky.", "The river flowed gently downstream.",
    "Snow blanketed the entire valley.", "Dance is a universal language.",
    "Books contain endless worlds.", "Travel broadens one's perspective.",
]


def _run_probe(model, backend_hint, tokenize_fn, pos_texts, neg_texts, layer_name):
    inputs_pos = tokenize_fn(pos_texts)
    inputs_neg = tokenize_fn(neg_texts)
    pf = probes.fit_probe(
        model, inputs_pos=inputs_pos, inputs_neg=inputs_neg,
        layer_name=layer_name, backend_hint=backend_hint,
    )
    # Use train AUROC (val AUROC with such a tiny n is too noisy).
    return float(pf.metrics["train_auroc"])


def _run_sae(model, backend, layer_name, hidden_dim, tokenize_fn):
    """Train both a dense and a rank-1 SAE; return both recon errors."""
    corpus = SENTIMENT_POS + SENTIMENT_NEG + MATH + NONMATH
    inputs = tokenize_fn(corpus)
    with torch.no_grad():
        rec = backend.extract(inputs, layers=[layer_name])[0]
    acts = rec.activations.reshape(-1, hidden_dim).detach()
    cfg_d = sae.SAEConfig(input_dim=hidden_dim, n_features=512, sae_type="dense",
                          sparsity=1e-4, learning_rate=3e-3)
    cfg_r = sae.SAEConfig(input_dim=hidden_dim, n_features=512, sae_type="rank1",
                          sparsity=1e-4, learning_rate=3e-3)
    sae_d = sae.fit_sae(acts, cfg_d, epochs=60)
    sae_r = sae.fit_sae(acts, cfg_r, epochs=60)
    return sae_d.last_metrics["recon"], sae_r.last_metrics["recon"]


def _ssm_state_variance_ratio(model, backend, layer_name, tokenize_fn) -> float:
    """For Mamba: how much variance does the SSM state show across diverse inputs?

    Ratio = variance across inputs / total variance. High = state encodes input strongly.
    """
    corpus = SENTIMENT_POS[:8] + MATH[:8]  # diverse inputs
    inputs = tokenize_fn(corpus)
    rec = backend.extract(inputs, layers=[layer_name])[0]
    # rec.activations shape: (B, intermediate, ssm_state)
    flat = rec.activations.reshape(rec.activations.shape[0], -1)
    var_across = flat.var(dim=0).mean().item()
    var_total = flat.var().item()
    return float(var_across / (var_total + 1e-9))


# ---------- Main runner ----------

def benchmark(
    model_name: str,
    model,
    tokenizer,
    backend_hint: str,
    arch_family: str = "transformer",
    tokenize_fn=None,
    sentiment_layer: int | None = None,
    math_layer: int | None = None,
    sae_layer: int | None = None,
    ssm_layer: int | None = None,
) -> InterpProfile:
    """Run InterpBench on a model. Returns an InterpProfile.

    `tokenize_fn(texts: list[str]) -> dict` is the model-specific tokenizer wrapper.
    """
    backend = Backend.for_model(model, hint=backend_hint)
    t_start = time.time()

    # Filter to residual layers only if mixed (e.g., Mamba exposes both .residual and .ssm_state).
    residual_layers = [n for n in backend.layer_names() if ".residual" in n]
    n_blocks = len(residual_layers)
    hidden_dim = backend.hidden_dim(residual_layers[0])
    n_params = sum(p.numel() for p in model.parameters())

    profile = InterpProfile(
        model_name=model_name,
        arch_family=arch_family,
        n_params=n_params,
        n_layers=n_blocks,
        hidden_dim=hidden_dim,
    )

    # Choose representative layers: sentiment probes at shallow depth, math
    # and SAE at mid-depth. Each can be overridden by the caller.
    if sentiment_layer is None:
        sentiment_layer = n_blocks // 4
    if math_layer is None:
        math_layer = n_blocks // 2
    if sae_layer is None:
        sae_layer = n_blocks // 2

    try:
        profile.probe_sentiment_auroc = _run_probe(
            model, backend_hint, tokenize_fn,
            SENTIMENT_POS, SENTIMENT_NEG,
            f"layer_{sentiment_layer}.residual",
        )
    except Exception as e:
        profile.notes.append(f"probe_sentiment error: {str(e)[:60]}")

    try:
        profile.probe_math_auroc = _run_probe(
            model, backend_hint, tokenize_fn,
            MATH, NONMATH,
            f"layer_{math_layer}.residual",
        )
    except Exception as e:
        profile.notes.append(f"probe_math error: {str(e)[:60]}")

    # Circuit detection
    try:
        circs = circuits.run_all_circuits(model, tokenizer=tokenizer, device="cpu")
        profile.induction_head_relative = circs["induction_head"].relative
        if "copy_circuit" in circs:
            profile.copy_accuracy = circs["copy_circuit"].score
        if "early_token_concentration" in circs:
            profile.concentration_relative = circs["early_token_concentration"].relative
    except Exception as e:
        profile.notes.append(f"circuits error: {str(e)[:60]}")

    # SAE quality
    try:
        d_recon, r_recon = _run_sae(
            model, backend, f"layer_{sae_layer}.residual", hidden_dim, tokenize_fn,
        )
        profile.sae_dense_recon = d_recon
        profile.sae_rank1_recon = r_recon
        profile.sae_better = "rank1" if r_recon < d_recon else "dense"
    except Exception as e:
        profile.notes.append(f"sae error: {str(e)[:60]}")

    # SSM state (Mamba/SSM only)
    if arch_family == "ssm" and ssm_layer is not None:
        try:
            profile.ssm_state_variance_ratio = _ssm_state_variance_ratio(
                model, backend, f"layer_{ssm_layer}.ssm_state", tokenize_fn,
            )
        except Exception as e:
            profile.notes.append(f"ssm_state error: {str(e)[:60]}")

    profile.runtime_seconds = time.time() - t_start
    return profile


def profile_to_markdown(profile: InterpProfile) -> str:
    """Format a profile as a markdown block for the leaderboard."""
    lines = [
        f"### {profile.model_name}",
        f" Arch: {profile.arch_family} | Params: {profile.n_params/1e6:.1f}M | "
        f"Layers: {profile.n_layers} | Hidden: {profile.hidden_dim}",
        " | Test | Score |",
        " |------|-------|",
        f" | Sentiment probe AUROC | {profile.probe_sentiment_auroc:.3f} |",
        f" | Math probe AUROC | {profile.probe_math_auroc:.3f} |",
        f" | Induction head (×chance) | {profile.induction_head_relative:.1f} |",
        f" | Copy accuracy | {profile.copy_accuracy:.2%} |",
        f" | Concentration (rel) | {profile.concentration_relative:.3f} |",
        f" | SAE Dense recon | {profile.sae_dense_recon:.4f} |",
        f" | SAE Rank-1 recon | {profile.sae_rank1_recon:.4f} |",
        f" | SAE better | {profile.sae_better} |",
        f" | SSM state var ratio | {profile.ssm_state_variance_ratio:.3f} |",
        f" | Runtime | {profile.runtime_seconds:.1f}s |",
    ]
    if profile.notes:
        lines.append(f" Notes: {'; '.join(profile.notes)}")
    return "\n".join(lines)
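For context, here is a minimal sketch of calling `bench.benchmark` programmatically, the same flow the `bench` CLI command below wraps. The checkpoint name and the `max_length=32` padding wrapper mirror `cli.py`; they are illustrative choices, not requirements of the API.

from transformers import AutoModelForCausalLM, AutoTokenizer

from archscope import bench

name = "EleutherAI/pythia-160m"  # example checkpoint, taken from the CLI docstring
tok = AutoTokenizer.from_pretrained(name)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token  # padding is needed for batched tokenize_fn
model = AutoModelForCausalLM.from_pretrained(name).eval()

def tokenize_fn(texts):
    # benchmark() expects tokenize_fn(list[str]) -> dict of batched tensors
    return tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=32)

profile = bench.benchmark(
    model_name=name, model=model, tokenizer=tok,
    backend_hint="transformer", arch_family="transformer",
    tokenize_fn=tokenize_fn,
)
print(bench.profile_to_markdown(profile))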
archscope/circuits.py
ADDED
@@ -0,0 +1,255 @@
"""Cross-architecture circuit detection.

Classical mech-interp circuits were discovered in transformers (induction heads,
attention sinks, copy circuits). It's an OPEN QUESTION whether they exist in
SSMs (Mamba) or hybrid attention (Kazdov-α).

Each detector is BEHAVIORAL — it measures the model's outputs given crafted
inputs — so it works on ANY architecture without per-arch internals.

Detectors implemented:
- induction_head_score: does the model copy A->B associations? (Olsson et al. 2022)
- copy_score: can the model copy a sequence verbatim?
- early_token_attention: proxy for attention-sink behavior; measures probability
  mass concentrated on early tokens (BOS-like).
"""
from __future__ import annotations

import random
from dataclasses import dataclass

import torch
import torch.nn.functional as F


@dataclass
class CircuitScore:
    """Score from one circuit test."""

    name: str
    score: float     # primary metric, higher = circuit more present
    baseline: float  # random-model baseline
    relative: float  # score / baseline (> 1 means the circuit is present)
    raw: dict        # detailed numbers


# ---------- Helper: model-agnostic forward ----------

def _model_logits(model, input_ids: torch.Tensor) -> torch.Tensor:
    """Call the model and extract its logits. Works for HF + kazdov outputs."""
    with torch.no_grad():
        out = model(input_ids)
        if isinstance(out, dict):
            logits = out.get("logits", out.get("last_hidden_state"))
        elif hasattr(out, "logits"):
            logits = out.logits
        else:
            logits = out
    return logits


# ---------- Induction head detection ----------

def induction_head_score(
    model,
    n_pairs: int = 20,
    n_trials: int = 50,
    vocab_size: int | None = None,
    device: str = "cpu",
    seed: int = 0,
) -> CircuitScore:
    """Olsson-style induction head test.

    Construct a sequence: [A1 B1] [A2 B2] ... [A_k B_k] [A1].
    Score = probability that the next token is B1 (the original pair completion).

    If induction heads exist, the model should copy A1 → B1 in-context.
    Compared against the random-token baseline (1 / vocab_size).
    """
    rng = random.Random(seed)
    if vocab_size is None:
        # Try to infer (HF transformers + custom models like kazdov).
        if hasattr(model, "config") and hasattr(model.config, "vocab_size"):
            vocab_size = model.config.vocab_size
        elif hasattr(model, "vocab_size"):
            vocab_size = model.vocab_size
        else:
            vocab_size = 50257  # GPT-2 default

    successes = 0
    rank_sum = 0.0
    prob_target_sum = 0.0
    for _ in range(n_trials):
        # Pick n_pairs random token pairs.
        tokens = rng.sample(range(100, min(vocab_size, 40000)), 2 * n_pairs)
        seq = []
        pairs = []
        for i in range(n_pairs):
            a, b = tokens[2 * i], tokens[2 * i + 1]
            seq.extend([a, b])
            pairs.append((a, b))
        # Append the cue A1 — the model should predict B1.
        a1, b1 = pairs[0]
        seq.append(a1)

        ids = torch.tensor([seq], dtype=torch.long, device=device)
        logits = _model_logits(model, ids)
        # Last-position prediction.
        last_logits = logits[0, -1, :]
        probs = F.softmax(last_logits, dim=-1)

        # Did the model predict b1 as top-1?
        pred = int(torch.argmax(last_logits).item())
        if pred == b1:
            successes += 1
        # Rank of b1 in the logit distribution (0 = predicted top-1).
        rank = int((last_logits > last_logits[b1]).sum().item())
        rank_sum += rank
        prob_target_sum += float(probs[b1].item())

    accuracy = successes / n_trials
    avg_rank = rank_sum / n_trials
    avg_prob = prob_target_sum / n_trials
    chance = 1.0 / vocab_size
    return CircuitScore(
        name="induction_head",
        score=avg_prob,
        baseline=chance,
        relative=avg_prob / chance,
        raw={
            "accuracy_top1": accuracy,
            "avg_rank_target": avg_rank,
            "avg_prob_target": avg_prob,
            "n_trials": n_trials,
            "seq_len_pairs": n_pairs,
        },
    )


# ---------- Copy circuit detection ----------

def copy_score(
    model,
    tokenizer,
    n_trials: int = 30,
    n_words: int = 5,
    device: str = "cpu",
    seed: int = 0,
) -> CircuitScore:
    """Test whether the model can copy a short word list verbatim after a separator.

    Input pattern: "list: A B C D E. list: " → predict "A",
    then after predicting A, predict B, etc.

    Score = fraction of correctly copied tokens.
    """
    rng = random.Random(seed)
    word_pool = [
        "cat", "dog", "tree", "moon", "star", "fish", "bird", "rock",
        "leaf", "wave", "sky", "rain", "snow", "wind", "fire", "ice",
        "book", "lamp", "key", "door", "wall", "road", "hill", "sand",
    ]

    correct = 0
    total = 0
    for _ in range(n_trials):
        words = rng.sample(word_pool, n_words)
        prompt = f"list: {' '.join(words)}. list: "
        ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

        # Token IDs for the target words (first token of each).
        target_tokens = []
        for w in words:
            target_tokens.append(tokenizer(" " + w, add_special_tokens=False).input_ids[0])

        # Autoregressively predict n_words tokens, chaining the model's own
        # predictions (not teacher-forcing) — measures cumulative copy ability.
        cur = ids.clone()
        for tgt in target_tokens:
            logits = _model_logits(model, cur)
            next_tok = int(torch.argmax(logits[0, -1, :]).item())
            if next_tok == tgt:
                correct += 1
            total += 1
            cur = torch.cat([cur, torch.tensor([[next_tok]], device=device)], dim=1)

    acc = correct / total if total > 0 else 0.0
    # The random baseline for copying is ~1/vocab_size, i.e. effectively 0.
    vocab_size = (getattr(model, "vocab_size", None)
                  or getattr(getattr(model, "config", None), "vocab_size", 50257))
    chance = 1.0 / vocab_size
    return CircuitScore(
        name="copy_circuit",
        score=acc,
        baseline=chance,
        relative=acc / chance if chance > 0 else float("inf"),
        raw={"n_trials": n_trials, "n_words": n_words, "correct": correct, "total": total},
    )


# ---------- Early-token attention proxy (attention sink) ----------

def early_token_attention(
    model,
    tokenizer,
    texts: list[str] | None = None,
    device: str = "cpu",
) -> CircuitScore:
    """Behavioral proxy for "attention sink" behavior: measures how sharply the
    next-token distribution concentrates when the model is given a continuation cue.

    Construct: "[BOS] X1 X2 ... X_n [predict ?]"
    Measure: entropy of the next-token distribution.

    Low entropy → the model is highly concentrated (possible sink-like behavior).
    High entropy → no concentration.

    NOTE: this is a coarse behavioral proxy. True attention-sink analysis
    requires architecture-specific attention weights, but the proxy works
    on ANY architecture (including SSMs, which have no attention).
    """
    if texts is None:
        texts = [
            "The cat sat on the", "Music has the power to",
            "Mountains stretch to the", "She wrote a letter to",
            "The sun set behind the", "Children laughed in the",
            "Solve for x in the equation", "Compute the derivative of the",
        ]
    entropies = []
    for txt in texts:
        ids = tokenizer(txt, return_tensors="pt").input_ids.to(device)
        logits = _model_logits(model, ids)
        last_logits = logits[0, -1, :]
        probs = F.softmax(last_logits, dim=-1)
        # Shannon entropy (in nats).
        ent = -(probs * (probs.clamp(min=1e-12).log())).sum().item()
        entropies.append(ent)

    avg_ent = sum(entropies) / len(entropies)
    # Reference: log(vocab_size) is the maximum entropy.
    vocab_size = (getattr(model, "vocab_size", None)
                  or getattr(getattr(model, "config", None), "vocab_size", 50257))
    max_ent = torch.log(torch.tensor(float(vocab_size))).item()
    return CircuitScore(
        name="early_token_concentration",
        score=avg_ent,  # in nats
        baseline=max_ent,
        relative=avg_ent / max_ent,  # 0 = fully concentrated, 1 = uniform
        raw={"per_text_entropy": entropies, "max_entropy": max_ent},
    )


# ---------- Runner ----------

def run_all_circuits(model, tokenizer=None, device: str = "cpu") -> dict[str, CircuitScore]:
    """Run all available circuit tests on a model.

    Returns a dict keyed by circuit name. Some tests require a tokenizer; if none
    is provided, those are skipped.
    """
    results = {}
    # Induction head needs only the model + vocab_size.
    results["induction_head"] = induction_head_score(model, device=device)
    if tokenizer is not None:
        results["copy_circuit"] = copy_score(model, tokenizer, device=device)
        results["early_token_concentration"] = early_token_attention(model, tokenizer, device=device)
    return results
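The detectors can also be run standalone, without the bench harness. A small sketch follows (the checkpoint name is an assumption). Note that `relative` is `score / baseline`; for `induction_head` the baseline is `1 / vocab_size`, so an average target probability of 0.2 at a 50257-token vocab already yields `relative` of roughly 1e4.

from transformers import AutoModelForCausalLM, AutoTokenizer

from archscope import circuits

name = "EleutherAI/pythia-160m"  # assumed checkpoint
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name).eval()

scores = circuits.run_all_circuits(model, tokenizer=tok, device="cpu")
for key, cs in scores.items():
    # CircuitScore fields: score (primary metric), baseline, relative, raw (details)
    print(f"{key}: score={cs.score:.4g} baseline={cs.baseline:.4g} relative={cs.relative:.3g}")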
archscope/cli.py
ADDED
@@ -0,0 +1,120 @@
"""Command-line interface for archscope."""
from __future__ import annotations

import json
import os

import click
from rich.console import Console
from rich.table import Table

from . import __version__

console = Console()


@click.group()
def cli() -> None:
    """archscope — cross-architecture mechanistic interpretability toolkit."""


@cli.command()
def info() -> None:
    """Show available methods + supported backends."""
    methods = Table(title=f"archscope v{__version__}")
    methods.add_column("Method")
    methods.add_column("Module")
    methods.add_column("Source paper")
    for row in (
        ("Probes", "probes.fit_probe", "Drop the Act (2605.11467)"),
        ("SAE", "sae.fit_sae", "WriteSAE (2605.12770)"),
        ("Neuron mod", "neurons.find_neurons", "Targeted Neuron Mod (2605.12290)"),
        ("Activation patch", "attribute.activation_patch", "Multi-Agent Sycophancy (2605.12991)"),
        ("Cross-arch transfer", "transfer.evaluate_transfer", "this library"),
        ("Circuit detection", "circuits.run_all_circuits", "this library"),
        ("Logit/tuned lens", "lens.logit_lens", "Belrose et al. 2023"),
        ("Model diff", "diff.compare", "this library"),
        ("InterpBench", "bench.benchmark", "this library"),
    ):
        methods.add_row(*row)
    console.print(methods)

    backends = Table(title="Backends")
    backends.add_column("Name")
    backends.add_column("Architecture family")
    for row in (
        ("transformer", "HuggingFace decoder LMs (Llama, GPT, Qwen, Pythia, ...)"),
        ("mamba", "Mamba / Mamba-2 SSM — exposes .ssm_state (recurrent h_t)"),
        ("kazdov", "Kazdov-α hybrid MoBE-BCN+MHA"),
        ("recurrent", "Generic RNN (subclass per model)"),
    ):
        backends.add_row(*row)
    console.print(backends)


@cli.command()
@click.argument("model_name")
@click.option("--arch", default="transformer",
              type=click.Choice(["transformer", "mamba", "kazdov"]),
              help="Architecture family.")
@click.option("--out", default=None,
              help="Output file. Format inferred from extension: .json or .md. "
                   "Without --out, prints markdown to stdout.")
def bench(model_name: str, arch: str, out: str | None) -> None:
    """Run archscope InterpBench on a HuggingFace model.

    Examples:
        archscope bench EleutherAI/pythia-160m --arch transformer
        archscope bench EleutherAI/pythia-160m --arch transformer --out pythia.md
        archscope bench state-spaces/mamba-130m-hf --arch mamba --out mamba.json
    """
    # Lazy imports keep `archscope info` fast (no torch/transformers).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    from . import bench as bench_mod

    console.print(f"[cyan]→ Loading {model_name}…[/cyan]")
    tok = AutoTokenizer.from_pretrained(model_name)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32)
    model.eval()

    def tokenize_fn(texts):
        return tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=32)

    arch_family = {"transformer": "transformer", "mamba": "ssm", "kazdov": "hybrid"}[arch]
    profile = bench_mod.benchmark(
        model_name=model_name,
        model=model,
        tokenizer=tok,
        backend_hint=arch,
        arch_family=arch_family,
        tokenize_fn=tokenize_fn,
    )

    markdown = bench_mod.profile_to_markdown(profile)
    console.print()
    console.print(markdown)

    if out:
        from dataclasses import asdict
        os.makedirs(os.path.dirname(out) or ".", exist_ok=True)
        ext = os.path.splitext(out)[1].lower()
        if ext == ".md":
            with open(out, "w") as f:
                f.write(markdown + "\n")
            console.print(f"\n[green]Saved markdown report to {out}[/green]")
        elif ext in (".json", ""):
            with open(out, "w") as f:
                json.dump(asdict(profile), f, indent=2, default=str)
            console.print(f"\n[green]Saved JSON profile to {out}[/green]")
        else:
            raise click.UsageError(
                f"Unsupported --out extension '{ext}'. Use .md or .json."
            )


if __name__ == "__main__":
    cli()
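Since `--out foo.json` serializes the profile with `dataclasses.asdict`, the saved report round-trips as plain JSON. A sketch of reading it back (the path is whatever was passed to `--out`):

import json

with open("mamba.json") as f:  # path from `archscope bench ... --out mamba.json`
    profile = json.load(f)
print(profile["model_name"], profile["probe_sentiment_auroc"], profile["notes"])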