archscope 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
archscope/bench.py ADDED
@@ -0,0 +1,262 @@
1
+ """InterpBench — standardized mechanistic interpretability benchmark.
2
+
3
+ Run a fixed test suite on any model to get a comparable "interp profile".
4
+ Designed to support cross-architecture comparisons (transformer, hybrid, SSM).
5
+
6
+ Tests:
7
+ 1. probe_sentiment_auroc — Can we linearly probe pos/neg sentiment?
8
+ 2. probe_math_auroc — Can we linearly probe math vs non-math?
9
+ 3. induction_head — Does the model copy A->B in-context?
10
+ 4. copy_circuit — Can it copy a word list verbatim?
11
+ 5. concentration — How peaked are next-token predictions?
12
+ 6. sae_dictionary_quality — Dense SAE reconstruction at mid-layer
13
+ 7. ssm_state_info — (Mamba only) variance ratio of SSM state across inputs
14
+
15
+ Output: dataclass `InterpProfile` with all scores, JSON-serializable.
16
+ A model's "InterpProfile" is its interp signature.
17
+ """
18
+ from __future__ import annotations
19
+ from dataclasses import dataclass, field
20
+ import time
21
+ import torch
22
+
23
+ from . import probes, sae, circuits
24
+ from .backends import Backend
25
+
26
+
27
@dataclass
class InterpProfile:
    """Standardized interp test results for one model.

    Every metric defaults to NaN so a skipped or failed test is
    distinguishable from a genuine score of 0; failures are summarized
    in `notes` by `benchmark()`.
    """
    model_name: str
    arch_family: str  # "transformer" | "hybrid" | "ssm" | "custom"
    n_params: int
    n_layers: int
    hidden_dim: int

    # Probe tasks (in-arch AUROC at representative mid-layer)
    probe_sentiment_auroc: float = float("nan")
    probe_math_auroc: float = float("nan")

    # Circuit detection (relative to baseline)
    induction_head_relative: float = float("nan")
    copy_accuracy: float = float("nan")
    concentration_relative: float = float("nan")

    # SAE quality (dense SAE recon at mid-layer)
    sae_dense_recon: float = float("nan")
    sae_rank1_recon: float = float("nan")
    sae_better: str = "—"  # "dense" or "rank1" — whichever reconstructed better

    # SSM-specific (NaN for non-SSM)
    ssm_state_variance_ratio: float = float("nan")

    runtime_seconds: float = 0.0
    # Error summaries appended by benchmark() when individual tests fail.
    notes: list[str] = field(default_factory=list)
55
+
56
+
57
# ---------- Probe tasks ----------

# Fixed text corpora for the linear-probe tasks. Each list holds 16 short
# sentences so the classes are balanced and tokenization stays cheap.
# The SAE test pools all four lists into one corpus (see _run_sae).
SENTIMENT_POS = [
    "I love this movie, it's amazing!", "What a wonderful day today.",
    "Best book I have ever read.", "She is so kind and thoughtful.",
    "The food was absolutely delicious.", "I'm thrilled about the great news.",
    "Such a beautiful sunset tonight.", "He always makes me laugh.",
    "Amazing performance overall.", "Truly a delightful experience.",
    "Incredible work, congratulations!", "I admire her dedication.",
    "The concert was breathtaking.", "He is the kindest person I know.",
    "This was pure joy from start.", "I had a wonderful evening.",
]
SENTIMENT_NEG = [
    "I hate this place absolutely.", "What a terrible movie that was.",
    "This is the worst day ever.", "She is mean and selfish.",
    "The food was awful and cold.", "I'm so disappointed with everything.",
    "Such a horrible meeting today.", "He always annoys me deeply.",
    "What an awful experience overall.", "I despise everything about this.",
    "This restaurant is dreadful.", "She is the rudest person here.",
    "The concert was a disaster.", "He never listens to anyone properly.",
    "Truly a miserable evening.", "I regret coming here entirely.",
]
# Math vs non-math probe corpora (probe_math_auroc).
MATH = [
    "Solve for x: 2x + 3 = 11.", "Compute the derivative of x cubed.",
    "The integral from 0 to 1 of x dx.", "Triangle with angles 30 60 90.",
    "The eigenvalue of matrix M is.", "By chain rule d/dx of sin x squared.",
    "Find roots of x squared minus 5x plus 6.", "The Taylor series of e to x.",
    "Prove sum of two evens is even.", "The Cauchy-Schwarz inequality.",
    "Let f be continuous on closed interval.", "Define the limit as n grows large.",
    "By induction on the natural numbers.", "The dot product of u and v.",
    "A group is abelian if commutative.", "Fundamental theorem of calculus.",
]
NONMATH = [
    "The cat sat on the mat softly.", "Music has the power to move us.",
    "Mountains stretch to the horizon.", "She whispered her secret carefully.",
    "Birds sing at dawn every morning.", "The chef prepared dinner slowly.",
    "Children laughed in the city park.", "Rain pattered against the window.",
    "Coffee aroma filled the kitchen.", "Time passes faster when busy.",
    "Stars appeared in the dark sky.", "The river flowed gently downstream.",
    "Snow blanketed the entire valley.", "Dance is a universal language.",
    "Books contain endless worlds.", "Travel broadens one's perspective.",
]
99
+
100
+
101
def _run_probe(model, backend_hint, tokenize_fn, pos_texts, neg_texts, layer_name):
    """Fit a linear probe separating `pos_texts` from `neg_texts`.

    Tokenizes both corpora with `tokenize_fn`, fits the probe on the named
    layer's activations, and returns the training AUROC as a float.
    """
    fitted = probes.fit_probe(
        model,
        inputs_pos=tokenize_fn(pos_texts),
        inputs_neg=tokenize_fn(neg_texts),
        layer_name=layer_name,
        backend_hint=backend_hint,
    )
    # Train AUROC is reported deliberately: with so few examples per class
    # a held-out split would be too noisy to compare across models.
    return float(fitted.metrics["train_auroc"])
110
+
111
+
112
def _run_sae(model, backend, layer_name, hidden_dim, tokenize_fn):
    """Fit a dense and a rank-1 SAE on one layer's activations.

    Returns a (dense_recon, rank1_recon) pair taken from each SAE's final
    training metrics. Hyperparameters are identical for both variants so
    the comparison isolates the SAE type.
    """
    # Pool all four benchmark corpora so the SAE sees varied activations.
    inputs = tokenize_fn(SENTIMENT_POS + SENTIMENT_NEG + MATH + NONMATH)
    with torch.no_grad():
        record = backend.extract(inputs, layers=[layer_name])[0]
    flat_acts = record.activations.reshape(-1, hidden_dim).detach()

    def _fit(kind):
        # Only sae_type differs between the two runs.
        cfg = sae.SAEConfig(input_dim=hidden_dim, n_features=512, sae_type=kind,
                            sparsity=1e-4, learning_rate=3e-3)
        return sae.fit_sae(flat_acts, cfg, epochs=60).last_metrics["recon"]

    # Dense is fitted first, then rank-1 — same order as before.
    return _fit("dense"), _fit("rank1")
126
+
127
+
128
def _ssm_state_variance_ratio(model, backend, layer_name, tokenize_fn) -> float:
    """For Mamba: how much the SSM state varies across diverse inputs.

    Ratio = mean per-dimension variance across inputs / total variance.
    High values mean the recurrent state encodes the input strongly.
    """
    # Mix sentiment and math prompts so the batch is genuinely diverse.
    batch = tokenize_fn(SENTIMENT_POS[:8] + MATH[:8])
    record = backend.extract(batch, layers=[layer_name])[0]
    acts = record.activations
    # Collapse all non-batch axes; acts is presumed (B, intermediate,
    # ssm_state) — TODO confirm against the mamba backend.
    per_input = acts.reshape(acts.shape[0], -1)
    across = per_input.var(dim=0).mean().item()
    total = per_input.var().item()
    # Epsilon guards against a degenerate zero-variance state.
    return float(across / (total + 1e-9))
141
+
142
+
143
+ # ---------- Main runner ----------
144
+
145
def benchmark(
    model_name: str,
    model,
    tokenizer,
    backend_hint: str,
    arch_family: str = "transformer",
    tokenize_fn=None,
    sentiment_layer: int | None = None,
    math_layer: int | None = None,
    sae_layer: int | None = None,
    ssm_layer: int | None = None,
) -> InterpProfile:
    """Run InterpBench on a model. Returns InterpProfile.

    `tokenize_fn(texts: list[str]) -> dict` is the model-specific tokenizer
    wrapper; it is required by the probe and SAE tests (those tests fail
    and are recorded in `notes` if it is None).

    Args:
        model_name: label stored in the resulting profile.
        model: the model under test; any architecture Backend supports.
        tokenizer: HF-style tokenizer, consumed by the circuit tests.
        backend_hint: backend selector forwarded to Backend.for_model.
        arch_family: "transformer" | "hybrid" | "ssm" | "custom".
        sentiment_layer / math_layer / sae_layer / ssm_layer: optional layer
            index overrides; depth-based defaults are chosen when None.

    Every test is wrapped in try/except so a single failure never aborts
    the whole benchmark — the error summary lands in `profile.notes`.
    """
    backend = Backend.for_model(model, hint=backend_hint)
    t_start = time.time()

    # (fix) removed a leftover no-op `len(backend.layer_names())` whose
    # result was discarded.
    # Filter to only residual layers if mixed (e.g., Mamba has both
    # .residual and .ssm_state streams).
    residual_layers = [n for n in backend.layer_names() if ".residual" in n]
    n_blocks = len(residual_layers)
    hidden_dim = backend.hidden_dim(residual_layers[0])
    n_params = sum(p.numel() for p in model.parameters())

    profile = InterpProfile(
        model_name=model_name,
        arch_family=arch_family,
        n_params=n_params,
        n_layers=n_blocks,
        hidden_dim=hidden_dim,
    )

    # Choose representative layers: sentiment probes at shallow depth, math
    # and SAE at mid-depth. Each can be overridden by the caller.
    if sentiment_layer is None:
        sentiment_layer = n_blocks // 4
    if math_layer is None:
        math_layer = n_blocks // 2
    if sae_layer is None:
        sae_layer = n_blocks // 2

    try:
        profile.probe_sentiment_auroc = _run_probe(
            model, backend_hint, tokenize_fn,
            SENTIMENT_POS, SENTIMENT_NEG,
            f"layer_{sentiment_layer}.residual",
        )
    except Exception as e:
        profile.notes.append(f"probe_sentiment error: {str(e)[:60]}")

    try:
        profile.probe_math_auroc = _run_probe(
            model, backend_hint, tokenize_fn,
            MATH, NONMATH,
            f"layer_{math_layer}.residual",
        )
    except Exception as e:
        profile.notes.append(f"probe_math error: {str(e)[:60]}")

    # Circuit detection (behavioral; tokenizer-dependent tests may be absent).
    try:
        circs = circuits.run_all_circuits(model, tokenizer=tokenizer, device="cpu")
        profile.induction_head_relative = circs["induction_head"].relative
        if "copy_circuit" in circs:
            profile.copy_accuracy = circs["copy_circuit"].score
        if "early_token_concentration" in circs:
            profile.concentration_relative = circs["early_token_concentration"].relative
    except Exception as e:
        profile.notes.append(f"circuits error: {str(e)[:60]}")

    # SAE quality: dense vs rank-1 reconstruction at the chosen mid-layer.
    try:
        d_recon, r_recon = _run_sae(
            model, backend, f"layer_{sae_layer}.residual", hidden_dim, tokenize_fn,
        )
        profile.sae_dense_recon = d_recon
        profile.sae_rank1_recon = r_recon
        profile.sae_better = "rank1" if r_recon < d_recon else "dense"
    except Exception as e:
        profile.notes.append(f"sae error: {str(e)[:60]}")

    # SSM state (mamba/ssm only) — requires the caller to name an ssm_layer.
    if arch_family == "ssm" and ssm_layer is not None:
        try:
            profile.ssm_state_variance_ratio = _ssm_state_variance_ratio(
                model, backend, f"layer_{ssm_layer}.ssm_state", tokenize_fn,
            )
        except Exception as e:
            profile.notes.append(f"ssm_state error: {str(e)[:60]}")

    profile.runtime_seconds = time.time() - t_start
    return profile
239
+
240
+
241
def profile_to_markdown(profile: InterpProfile) -> str:
    """Render one profile as a markdown section for the leaderboard."""
    header = [
        f"### {profile.model_name}",
        f" Arch: {profile.arch_family} | Params: {profile.n_params/1e6:.1f}M | "
        f"Layers: {profile.n_layers} | Hidden: {profile.hidden_dim}",
    ]
    table = [
        " | Test | Score |",
        " |------|-------|",
        f" | Sentiment probe AUROC | {profile.probe_sentiment_auroc:.3f} |",
        f" | Math probe AUROC | {profile.probe_math_auroc:.3f} |",
        f" | Induction head (×chance) | {profile.induction_head_relative:>.1f} |",
        f" | Copy accuracy | {profile.copy_accuracy:.2%} |",
        f" | Concentration (rel) | {profile.concentration_relative:.3f} |",
        f" | SAE Dense recon | {profile.sae_dense_recon:.4f} |",
        f" | SAE Rank-1 recon | {profile.sae_rank1_recon:.4f} |",
        f" | SAE better | {profile.sae_better} |",
        f" | SSM state var ratio | {profile.ssm_state_variance_ratio:.3f} |",
        f" | Runtime | {profile.runtime_seconds:.1f}s |",
    ]
    parts = header + table
    # Notes line only appears when at least one test recorded an error.
    if profile.notes:
        parts.append(f" Notes: {'; '.join(profile.notes)}")
    return "\n".join(parts)
archscope/circuits.py ADDED
@@ -0,0 +1,255 @@
1
+ """Cross-architecture circuit detection.
2
+
3
+ Classical mech-interp circuits were discovered in transformers (induction heads,
4
+ attention sinks, copy circuits). It's an OPEN QUESTION whether they exist in
5
+ SSMs (Mamba) or hybrid attention (Kazdov-α).
6
+
7
+ Each detector is BEHAVIORAL — measures the model's outputs given crafted
8
+ inputs — so it works on ANY architecture without per-arch internals.
9
+
10
+ Detectors implemented:
11
+ - induction_head_score: does the model copy A->B associations? (Olsson et al 2022)
12
+ - copy_score: can the model copy a sequence verbatim?
13
+ - early_token_attention: proxy for attention-sink behavior, measures prob mass
14
+ concentrated on early tokens (BOS-like).
15
+ """
16
+ from __future__ import annotations
17
+ import random
18
+ import torch
19
+ import torch.nn.functional as F
20
+ from dataclasses import dataclass
21
+
22
+
23
@dataclass
class CircuitScore:
    """Score from one behavioral circuit test.

    `relative` is the headline number: `score` divided by a random-model
    baseline, so values above 1 suggest the circuit is present.
    """
    name: str  # circuit identifier, e.g. "induction_head"
    score: float  # primary metric, higher = circuit more present
    baseline: float  # random-model baseline
    relative: float  # score / baseline (>1 means circuit present)
    raw: dict  # detailed numbers (per-test diagnostics)
32
+
33
+ # ---------- Helper: model-agnostic forward ----------
34
+
35
+ def _model_logits(model, input_ids: torch.Tensor) -> torch.Tensor:
36
+ """Call model and extract final-step logits. Works for HF + kazdov."""
37
+ with torch.no_grad():
38
+ out = model(input_ids)
39
+ if isinstance(out, dict):
40
+ logits = out.get("logits", out.get("last_hidden_state"))
41
+ elif hasattr(out, "logits"):
42
+ logits = out.logits
43
+ else:
44
+ logits = out
45
+ return logits
46
+
47
+
48
+ # ---------- Induction head detection ----------
49
+
50
def induction_head_score(
    model,
    n_pairs: int = 20,
    seq_len: int = 6,
    n_trials: int = 50,
    vocab_size: int | None = None,
    device: str = "cpu",
    seed: int = 0,
) -> CircuitScore:
    """Olsson-style induction head test.

    Construct a sequence: [A1 B1] [A2 B2] ... [A_k B_k] [A_1]
    Score = probability that next token is B_1 (the original pair completion).

    If induction heads exist, the model should learn to copy A_1 → B_1 in-context.
    Compared to random-token baseline (1/vocab_size).

    NOTE(review): `seq_len` is accepted but never read in the body — the
    effective sequence length is 2*n_pairs + 1. Confirm intent before removing.
    """
    rng = random.Random(seed)
    if vocab_size is None:
        # Try to infer (HF transformers + custom models like kazdov)
        if hasattr(model, "config") and hasattr(model.config, "vocab_size"):
            vocab_size = model.config.vocab_size
        elif hasattr(model, "vocab_size"):
            vocab_size = model.vocab_size
        else:
            vocab_size = 50257  # GPT-2 default

    successes = 0
    rank_sum = 0.0
    prob_target_sum = 0.0
    for trial in range(n_trials):
        # Pick n_pairs random token pairs.
        # ids below 100 are excluded (presumably to avoid special tokens —
        # TODO confirm) and the pool is capped at 40000.
        tokens = rng.sample(range(100, min(vocab_size, 40000)), 2 * n_pairs)
        seq = []
        pairs = []
        for i in range(n_pairs):
            a, b = tokens[2*i], tokens[2*i+1]
            seq.extend([a, b])
            pairs.append((a, b))
        # Append cue: A1 — model should predict B1
        a1, b1 = pairs[0]
        seq.append(a1)

        ids = torch.tensor([seq], dtype=torch.long, device=device)
        logits = _model_logits(model, ids)
        # last position prediction
        last_logits = logits[0, -1, :]
        probs = F.softmax(last_logits, dim=-1)

        # Did the model predict b1 as top-1?
        pred = int(torch.argmax(last_logits).item())
        if pred == b1:
            successes += 1
        # Rank of b1 in the logit distribution (0 = predicted top-1).
        rank = int((last_logits > last_logits[b1]).sum().item())
        rank_sum += rank
        prob_target_sum += float(probs[b1].item())

    # Aggregate across trials; avg_prob is the headline score.
    accuracy = successes / n_trials
    avg_rank = rank_sum / n_trials
    avg_prob = prob_target_sum / n_trials
    chance = 1.0 / vocab_size
    return CircuitScore(
        name="induction_head",
        score=avg_prob,
        baseline=chance,
        relative=avg_prob / chance,
        raw={
            "accuracy_top1": accuracy,
            "avg_rank_target": avg_rank,
            "avg_prob_target": avg_prob,
            "n_trials": n_trials,
            "seq_len_pairs": n_pairs,
        },
    )
125
+
126
+
127
+ # ---------- Copy circuit detection ----------
128
+
129
def copy_score(
    model,
    tokenizer,
    n_trials: int = 30,
    n_words: int = 5,
    device: str = "cpu",
    seed: int = 0,
) -> CircuitScore:
    """Test whether model can copy a short word-list verbatim after a separator.

    Input pattern: "list: A B C D E. list: " → predict "A"
    then after predicting A, predict B, etc.

    Score = fraction of correctly-copied tokens.

    Args:
        model: anything `_model_logits` can call.
        tokenizer: HF-style tokenizer (callable with `return_tensors="pt"`).
        n_trials: number of random word lists to test.
        n_words: words sampled per list.
        device: torch device string for the input ids.
        seed: RNG seed so trials are reproducible.
    """
    rng = random.Random(seed)
    word_pool = [
        "cat", "dog", "tree", "moon", "star", "fish", "bird", "rock",
        "leaf", "wave", "sky", "rain", "snow", "wind", "fire", "ice",
        "book", "lamp", "key", "door", "wall", "road", "hill", "sand",
    ]

    correct = 0
    total = 0
    for trial in range(n_trials):
        words = rng.sample(word_pool, n_words)
        prompt = f"list: {' '.join(words)}. list: "
        ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        # (fix) removed a leftover no-op `ids.shape[1]` expression here.

        # Token IDs for target words (first token of each; a word that
        # tokenizes into multiple pieces is scored on its first piece only).
        target_tokens = [
            tokenizer(" " + w, add_special_tokens=False).input_ids[0]
            for w in words
        ]

        # Autoregressively predict n_words tokens, chaining the model's own
        # predictions (not teacher-forcing) — measures cumulative copy ability.
        cur = ids.clone()
        for tgt in target_tokens:
            logits = _model_logits(model, cur)
            next_tok = int(torch.argmax(logits[0, -1, :]).item())
            if next_tok == tgt:
                correct += 1
            total += 1
            cur = torch.cat([cur, torch.tensor([[next_tok]], device=device)], dim=1)

    acc = correct / total if total > 0 else 0.0
    # Random baseline for copying is ~1/vocab_size; effectively ~0
    vocab_size = (getattr(model, "vocab_size", None)
                  or getattr(getattr(model, "config", None), "vocab_size", 50257))
    chance = 1.0 / vocab_size
    return CircuitScore(
        name="copy_circuit",
        score=acc,
        baseline=chance,
        relative=acc / chance if chance > 0 else float("inf"),
        raw={"n_trials": n_trials, "n_words": n_words, "correct": correct, "total": total},
    )
187
+
188
+
189
+ # ---------- Early-token attention proxy (attention sink) ----------
190
+
191
def early_token_attention(
    model,
    tokenizer,
    texts: list[str] | None = None,
    device: str = "cpu",
) -> CircuitScore:
    """Behavioral proxy for "attention sink" behavior.

    Feeds short continuation cues and measures the Shannon entropy of the
    next-token distribution. Low entropy → the model's prediction is highly
    concentrated (possible sink-like behavior); high entropy → no
    concentration.

    NOTE: this is a coarse behavioral proxy. True attention-sink analysis
    requires architecture-specific attention weights, but this proxy runs
    on ANY architecture (including SSMs that don't have attention).
    """
    if texts is None:
        texts = [
            "The cat sat on the", "Music has the power to",
            "Mountains stretch to the", "She wrote a letter to",
            "The sun set behind the", "Children laughed in the",
            "Solve for x in the equation", "Compute the derivative of the",
        ]

    def _next_token_entropy(prompt):
        token_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        final_logits = _model_logits(model, token_ids)[0, -1, :]
        dist = F.softmax(final_logits, dim=-1)
        # Shannon entropy in nats; the clamp avoids log(0).
        return -(dist * (dist.clamp(min=1e-12).log())).sum().item()

    entropies = [_next_token_entropy(txt) for txt in texts]
    avg_ent = sum(entropies) / len(entropies)

    # log(vocab_size) is the maximum-entropy (uniform) reference point.
    vocab_size = (getattr(model, "vocab_size", None)
                  or getattr(getattr(model, "config", None), "vocab_size", 50257))
    max_ent = torch.log(torch.tensor(float(vocab_size))).item()
    return CircuitScore(
        name="early_token_concentration",
        score=avg_ent,  # in nats
        baseline=max_ent,
        relative=avg_ent / max_ent,  # 0 = full concentration, 1 = uniform
        raw={"per_text_entropy": entropies, "max_entropy": max_ent},
    )
239
+
240
+
241
+ # ---------- Runner ----------
242
+
243
def run_all_circuits(model, tokenizer=None, device: str = "cpu") -> dict[str, CircuitScore]:
    """Run every available circuit test on a model.

    Returns a dict keyed by circuit name. Tests that need a tokenizer are
    skipped when none is provided.
    """
    # The induction test is tokenizer-free — it only needs the vocab size.
    results = {"induction_head": induction_head_score(model, device=device)}
    if tokenizer is None:
        return results
    results["copy_circuit"] = copy_score(model, tokenizer, device=device)
    results["early_token_concentration"] = early_token_attention(model, tokenizer, device=device)
    return results
archscope/cli.py ADDED
@@ -0,0 +1,120 @@
1
+ """Command-line interface for archscope."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import os
6
+
7
+ import click
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
+ from . import __version__
12
+
13
+ console = Console()
14
+
15
+
16
@click.group()
def cli() -> None:
    """archscope — cross-architecture mechanistic interpretability toolkit."""
    # Root command group only; subcommands attach via @cli.command() below.
19
+
20
+
21
@cli.command()
def info() -> None:
    """Show available methods + supported backends."""
    # Static registry of analysis entry points and their source papers.
    method_rows = (
        ("Probes", "probes.fit_probe", "Drop the Act (2605.11467)"),
        ("SAE", "sae.fit_sae", "WriteSAE (2605.12770)"),
        ("Neuron mod", "neurons.find_neurons", "Targeted Neuron Mod (2605.12290)"),
        ("Activation patch", "attribute.activation_patch", "Multi-Agent Sycophancy (2605.12991)"),
        ("Cross-arch transfer", "transfer.evaluate_transfer", "this library"),
        ("Circuit detection", "circuits.run_all_circuits", "this library"),
        ("Logit/tuned lens", "lens.logit_lens", "Belrose et al 2023"),
        ("Model diff", "diff.compare", "this library"),
        ("InterpBench", "bench.benchmark", "this library"),
    )
    methods = Table(title=f"archscope v{__version__}")
    for heading in ("Method", "Module", "Source paper"):
        methods.add_column(heading)
    for entry in method_rows:
        methods.add_row(*entry)
    console.print(methods)

    # Supported model backends, keyed by the --arch / backend_hint name.
    backend_rows = (
        ("transformer", "HuggingFace decoder LMs (Llama, GPT, Qwen, Pythia, ...)"),
        ("mamba", "Mamba / Mamba-2 SSM — exposes .ssm_state (recurrent h_t)"),
        ("kazdov", "Kazdov-α hybrid MoBE-BCN+MHA"),
        ("recurrent", "Generic RNN (subclass per model)"),
    )
    backends = Table(title="Backends")
    backends.add_column("Name")
    backends.add_column("Architecture family")
    for entry in backend_rows:
        backends.add_row(*entry)
    console.print(backends)
53
+
54
+
55
@cli.command()
@click.argument("model_name")
@click.option("--arch", default="transformer",
              type=click.Choice(["transformer", "mamba", "kazdov"]),
              help="Architecture family.")
@click.option("--out", default=None,
              help="Output file. Format inferred from extension: .json or .md. "
                   "Without --out, prints markdown to stdout.")
def bench(model_name: str, arch: str, out: str | None) -> None:
    """Run archscope InterpBench on a HuggingFace model.

    Examples:
        archscope bench EleutherAI/pythia-160m --arch transformer
        archscope bench EleutherAI/pythia-160m --arch transformer --out pythia.md
        archscope bench state-spaces/mamba-130m-hf --arch mamba --out mamba.json
    """
    # Lazy imports keep `archscope info` fast (no torch/transformers).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    from . import bench as bench_mod

    console.print(f"[cyan]→ Loading {model_name}…[/cyan]")
    tok = AutoTokenizer.from_pretrained(model_name)
    if tok.pad_token is None:
        # tokenize_fn below pads batches; reuse EOS when no pad token exists.
        tok.pad_token = tok.eos_token
    # NOTE(review): `dtype=` is the newer transformers kwarg; older releases
    # spell it `torch_dtype=` — confirm against the pinned version.
    model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32)
    model.eval()

    def tokenize_fn(texts):
        # Model-specific wrapper handed to bench.benchmark (probe/SAE tests).
        return tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=32)

    # Map the CLI arch choice onto InterpProfile's arch_family vocabulary.
    arch_family = {"transformer": "transformer", "mamba": "ssm", "kazdov": "hybrid"}[arch]
    profile = bench_mod.benchmark(
        model_name=model_name,
        model=model,
        tokenizer=tok,
        backend_hint=arch,
        arch_family=arch_family,
        tokenize_fn=tokenize_fn,
    )

    # Always echo the markdown report to stdout, even when writing a file.
    markdown = bench_mod.profile_to_markdown(profile)
    console.print()
    console.print(markdown)

    if out:
        from dataclasses import asdict
        os.makedirs(os.path.dirname(out) or ".", exist_ok=True)
        ext = os.path.splitext(out)[1].lower()
        if ext == ".md":
            with open(out, "w") as f:
                f.write(markdown + "\n")
            console.print(f"\n[green]Saved markdown report to {out}[/green]")
        elif ext == ".json" or ext == "":
            # Extensionless paths default to JSON. default=str stringifies
            # anything json can't serialize natively.
            with open(out, "w") as f:
                json.dump(asdict(profile), f, indent=2, default=str)
            console.print(f"\n[green]Saved JSON profile to {out}[/green]")
        else:
            raise click.UsageError(
                f"Unsupported --out extension '{ext}'. Use .md or .json."
            )
117
+
118
+
119
+ if __name__ == "__main__":
120
+ cli()