archscope 0.2.5__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {archscope-0.2.5/src/archscope.egg-info → archscope-0.2.7}/PKG-INFO +25 -6
- {archscope-0.2.5 → archscope-0.2.7}/README.md +24 -5
- {archscope-0.2.5 → archscope-0.2.7}/pyproject.toml +1 -1
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/__init__.py +1 -1
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/attribute.py +20 -1
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/backends.py +4 -1
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/circuits.py +12 -2
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/lens.py +26 -4
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/probes.py +34 -0
- {archscope-0.2.5 → archscope-0.2.7/src/archscope.egg-info}/PKG-INFO +25 -6
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_unit.py +66 -1
- {archscope-0.2.5 → archscope-0.2.7}/LICENSE +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/setup.cfg +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/_utils.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/bench.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/cli.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/diff.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/kazdov_backend.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/loader.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/neurons.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/py.typed +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/sae.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/transfer.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/SOURCES.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/dependency_links.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/entry_points.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/requires.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/top_level.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_circuits_3arch.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_diff.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_kazdov_integration.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_lens.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_mamba_integration.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_mamba_ssm_state.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_probe_transfer.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_pythia_end_to_end.py +0 -0
{archscope-0.2.5/src/archscope.egg-info → archscope-0.2.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: archscope
-Version: 0.2.5
+Version: 0.2.7
 Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
 Author: Juan Cruz Dovzak
 License: Apache-2.0
@@ -58,18 +58,17 @@ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader
 
 ```python
 import archscope as mi
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
-model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
 
-
+# One call → HuggingFace model + tokenizer + the right backend
+model, tok, backend = mi.load_model("state-spaces/mamba-130m-hf", arch="mamba")
 
 # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
 ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
 # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
 ```
 
+`load_model` handles `pad_token` setup, `model.eval()`, and backend auto-detection. If you'd rather drive `transformers` yourself, every method also accepts `backend_hint=...`.
+
 ---
 
 ## What's inside
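The new quick-start folds the old manual setup into `load_model`. For readers who still want the manual route, here is a rough sketch, in plain `transformers` calls rather than archscope code, of the housekeeping the added note says `load_model` performs; the exact internals may differ:

```python
# Illustrative only: roughly the steps the README says load_model absorbs.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")

if tok.pad_token is None and tok.eos_token is not None:
    tok.pad_token = tok.eos_token   # pad_token setup
model.eval()                        # inference mode
```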
@@ -105,6 +104,26 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
 
 If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
 
+### Method × backend support
+
+Not every method works on every architecture. The cross-product:
+
+| Method | transformer | mamba | kazdov | recurrent |
+|---|:---:|:---:|:---:|:---:|
+| `probes.fit_probe` | ✅ | ✅ | ✅ | ✅ |
+| `sae.fit_sae` (Dense / Rank-1) | ✅ | ✅ | ✅ | ✅ |
+| `neurons.find_neurons` | ✅ | ✅ | ✅ | ✅ |
+| `attribute.activation_patch` | ✅ | ✅ residual only | ✅ | ⚠️ subclass needed |
+| `attribute.dim_decompose` | ✅ | ❌ no attention/MLP submods | ✅ | ❌ |
+| `circuits.*` (behavioural) | ✅ | ✅ | ✅ | ✅ |
+| `lens.logit_lens` | ✅ | ⚠️ degrades with depth — use `TunedLens` | ✅ | ⚠️ |
+| `lens.TunedLens.fit` | ✅ | ✅ | ✅ | ⚠️ |
+| `diff.compare` | ✅ | ✅ | ✅ | ✅ |
+| `transfer.evaluate_transfer` | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any |
+| `bench.benchmark` | ✅ | ✅ | ✅ | partial |
+
+❌ entries raise a clear `ValueError` rather than silently degrading.
+
 ---
 
 ## Install
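Because the ❌ cells raise `ValueError` rather than degrading silently, a cross-architecture sweep can simply guard the transformer-only calls. A minimal sketch; the keyword arguments mirror the new `test_dim_decompose_rejects_mamba_style_model` unit test added further down in this diff, `model` is whatever `mi.load_model` returned above, and the fallback is only indicated, not implemented:

```python
import torch
from archscope.attribute import dim_decompose

try:
    result = dim_decompose(model,
                           prompt_a={"input_ids": torch.tensor([[1, 2, 3]])},
                           prompt_b={"input_ids": torch.tensor([[4, 5, 6]])},
                           layer_indices=[0, 1],
                           metric_fn=lambda o: 0.0)
except ValueError:
    # Expected for Mamba/SSM-style backends: fall back to
    # attribute.activation_patch on the residual stream instead.
    result = None
```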
{archscope-0.2.5 → archscope-0.2.7}/README.md

@@ -21,18 +21,17 @@ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader
 
 ```python
 import archscope as mi
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
-model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
 
-
+# One call → HuggingFace model + tokenizer + the right backend
+model, tok, backend = mi.load_model("state-spaces/mamba-130m-hf", arch="mamba")
 
 # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
 ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
 # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
 ```
 
+`load_model` handles `pad_token` setup, `model.eval()`, and backend auto-detection. If you'd rather drive `transformers` yourself, every method also accepts `backend_hint=...`.
+
 ---
 
 ## What's inside
@@ -68,6 +67,26 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
 
 If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
 
+### Method × backend support
+
+Not every method works on every architecture. The cross-product:
+
+| Method | transformer | mamba | kazdov | recurrent |
+|---|:---:|:---:|:---:|:---:|
+| `probes.fit_probe` | ✅ | ✅ | ✅ | ✅ |
+| `sae.fit_sae` (Dense / Rank-1) | ✅ | ✅ | ✅ | ✅ |
+| `neurons.find_neurons` | ✅ | ✅ | ✅ | ✅ |
+| `attribute.activation_patch` | ✅ | ✅ residual only | ✅ | ⚠️ subclass needed |
+| `attribute.dim_decompose` | ✅ | ❌ no attention/MLP submods | ✅ | ❌ |
+| `circuits.*` (behavioural) | ✅ | ✅ | ✅ | ✅ |
+| `lens.logit_lens` | ✅ | ⚠️ degrades with depth — use `TunedLens` | ✅ | ⚠️ |
+| `lens.TunedLens.fit` | ✅ | ✅ | ✅ | ⚠️ |
+| `diff.compare` | ✅ | ✅ | ✅ | ✅ |
+| `transfer.evaluate_transfer` | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any |
+| `bench.benchmark` | ✅ | ✅ | ✅ | partial |
+
+❌ entries raise a clear `ValueError` rather than silently degrading.
+
 ---
 
 ## Install
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/attribute.py

@@ -95,7 +95,9 @@ def activation_patch(
         module = resolve_layer_module(model, f"layer_{idx}.residual")
         if module is None:
             continue
-
+        # detach+clone for the same reason dim_decompose does: avoid aliasing
+        # a tensor that could be overwritten when the patched forward runs.
+        src_h = src_rec.activations.detach().clone()
 
         def hook(mod, inp, out, replacement=src_h):
             if isinstance(out, tuple):
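The aliasing comment refers to a general PyTorch hook pitfall rather than anything archscope-specific. A standalone sketch (generic `torch`, hypothetical module and variable names) of why a captured activation is frozen with `.detach().clone()` before being swapped back in:

```python
import torch
import torch.nn as nn

layer = nn.Linear(4, 4)
captured = {}

# Without .detach().clone(), captured["h"] can alias storage the model reuses
# on the next forward pass (and keeps the autograd graph alive); cloning
# freezes the values at capture time.
def capture_hook(mod, inp, out):
    captured["h"] = out.detach().clone()

handle = layer.register_forward_hook(capture_hook)
layer(torch.randn(2, 4))
handle.remove()

# Later, a patching hook can safely return the frozen copy as the new output.
def patch_hook(mod, inp, out, replacement=captured["h"]):
    return replacement

handle = layer.register_forward_hook(patch_hook)
patched = layer(torch.randn(2, 4))   # output replaced by the captured values
handle.remove()
```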
@@ -155,6 +157,23 @@ def dim_decompose(
     metric_b = metric_fn(out_b)
     total_gap = metric_a - metric_b
 
+    # Sanity check: at least one component must be resolvable for at least one
+    # requested layer. Architectures without attention/MLP submodules (Mamba,
+    # pure SSMs, custom recurrent blocks) would otherwise silently return an
+    # empty DIMResult.
+    resolvable = any(
+        resolve_subcomponent_module(model, idx, comp) is not None
+        for idx in layer_indices for comp in components
+    )
+    if not resolvable:
+        raise ValueError(
+            f"dim_decompose: none of components={components} were found on this "
+            f"model (type {type(model).__name__}). This method expects "
+            "attention/MLP submodules — it's transformer-style only. For "
+            "SSM/recurrent architectures, use activation_patch on the residual "
+            "stream instead."
+        )
+
     contributions: dict[str, float] = {}
     for comp in components:
         # 1) Capture component outputs during prompt_a.
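Outside archscope, the same pre-flight idea can be approximated by scanning module names before assuming a transformer layout. This is only a heuristic sketch; archscope's own `resolve_subcomponent_module` is the authoritative lookup:

```python
import torch.nn as nn

def looks_transformer_like(model: nn.Module) -> bool:
    # Heuristic: HF decoder blocks usually expose attention/MLP submodules with
    # names containing "attn", "attention", or "mlp"; pure SSM blocks do not.
    wanted = ("attn", "attention", "mlp")
    return any(any(w in name.lower() for w in wanted)
               for name, _ in model.named_modules())
```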
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/backends.py

@@ -135,7 +135,10 @@ class TransformerBackend(Backend):
     """HuggingFace transformers backend — extracts residual stream per layer."""
 
     def layer_names(self) -> list[str]:
-        #
+        # Layer names are virtual handles consumed by .extract(), which uses
+        # HF's `output_hidden_states=True` to retrieve the residual stream
+        # (no direct attribute walk into model.model.layers[i] needed —
+        # so this works across HF decoder LM families).
         n_layers = getattr(self.model.config, "num_hidden_layers", 0)
         return [f"layer_{i}.residual" for i in range(n_layers)]
 
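The `output_hidden_states=True` route mentioned in the new comment is stock HuggingFace behaviour. A small sketch of pulling the per-layer residual stream that way; the model id and the mapping onto archscope's `layer_i.residual` names are assumptions:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m").eval()

enc = tok("interpretability", return_tensors="pt")
with torch.no_grad():
    out = model(**enc, output_hidden_states=True)

# hidden_states is a tuple of (num_hidden_layers + 1) tensors, each (B, T, H).
# Entry i + 1 is the residual stream after block i, which is presumably what
# the backend exposes as "layer_i.residual".
resid_after_block_3 = out.hidden_states[4]
```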
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/circuits.py

@@ -74,12 +74,22 @@ def induction_head_score(
     else:
         vocab_size = 50257  # GPT-2 default
 
+    # Adaptive vocab window — defaults to [100, 40000) for full-size LMs but
+    # tightens for small-vocab toy models so we don't sample outside the range.
+    lo = min(100, max(1, vocab_size // 4))
+    hi = min(vocab_size, 40000)
+    if hi - lo < 2 * n_pairs:
+        raise ValueError(
+            f"induction_head_score: vocab window [{lo}, {hi}) has only "
+            f"{hi - lo} tokens but n_pairs={n_pairs} requires {2 * n_pairs} distinct ids. "
+            f"Lower n_pairs or pass a model with vocab_size >= {2 * n_pairs + 100}."
+        )
+
     successes = 0
     rank_sum = 0.0
     prob_target_sum = 0.0
     for trial in range(n_trials):
-
-        tokens = rng.sample(range(100, min(vocab_size, 40000)), 2 * n_pairs)
+        tokens = rng.sample(range(lo, hi), 2 * n_pairs)
         seq = []
         pairs = []
         for i in range(n_pairs):
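Worked through for the toy model used in the new unit test (vocab_size=40, n_pairs=20), the window logic gives a 30-token window against a 40-token requirement, hence the error:

```python
vocab_size, n_pairs = 40, 20
lo = min(100, max(1, vocab_size // 4))   # -> 10
hi = min(vocab_size, 40000)              # -> 40
assert hi - lo < 2 * n_pairs             # 30 < 40, so ValueError is raised
```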
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/lens.py

@@ -218,14 +218,36 @@ class TunedLens(nn.Module):
 
         opt = torch.optim.AdamW(tl.translators.parameters(), lr=lr)
 
-        # Pre-extract all activations + target logits once
+        # Pre-extract all activations + target logits once.
+        # Ensure tokenizer has a pad token (GPT-2 family ships without one).
+        if getattr(tokenizer, "pad_token", None) is None and getattr(tokenizer, "eos_token", None) is not None:
+            tokenizer.pad_token = tokenizer.eos_token
+
         enc = tokenizer(calibration_texts, return_tensors="pt", padding=True,
                         truncation=True, max_length=max_len)
         inputs = {"input_ids": enc["input_ids"].to(device)}
+        if "attention_mask" in enc:
+            inputs["attention_mask"] = enc["attention_mask"].to(device)
+
+        # Per-row index of the last REAL (non-pad) token. If no attention_mask
+        # (single, unpadded sequence), the conventional last-position is fine.
+        if "attention_mask" in enc:
+            real_lengths = enc["attention_mask"].sum(dim=1).to(device)  # (B,)
+            last_idx = (real_lengths - 1).clamp(min=0)
+        else:
+            B = inputs["input_ids"].shape[0]
+            last_idx = torch.full((B,), inputs["input_ids"].shape[1] - 1,
+                                  dtype=torch.long, device=device)
+
+        def gather_last(acts: torch.Tensor) -> torch.Tensor:
+            # acts: (B, T, H) → (B, H) at each row's real last position.
+            B = acts.shape[0]
+            return acts[torch.arange(B, device=acts.device), last_idx]
+
         with torch.no_grad():
             records = backend.extract(inputs, layers=layer_names)
-            # Target: model's actual final logits at last position
-            final_residual = records[-1].activations
+            # Target: model's actual final logits at last REAL position per row.
+            final_residual = gather_last(records[-1].activations)
             if norm is not None:
                 final_residual = norm(final_residual)
             target_logits = unembed(final_residual).detach()  # (B, vocab)
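`gather_last` is the usual last-real-token gather. A self-contained sketch of the same indexing on toy tensors, using generic `torch` only and independent of archscope:

```python
import torch

acts = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)  # (B=2, T=3, H=4)
attention_mask = torch.tensor([[1, 1, 1],
                               [1, 1, 0]])                      # row 1 ends in a pad

last_idx = (attention_mask.sum(dim=1) - 1).clamp(min=0)         # tensor([2, 1])
last = acts[torch.arange(acts.shape[0]), last_idx]              # shape (2, 4)
# Row 0 takes position 2; row 1 takes position 1, its last non-pad token.
```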
@@ -235,7 +257,7 @@
             opt.zero_grad()
             total_loss = 0.0
             for i, rec in enumerate(records):
-                last = rec.activations
+                last = gather_last(rec.activations).detach()
                 translated = tl.translators[i](last)
                 if norm is not None:
                     translated = norm(translated)
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/probes.py

@@ -108,6 +108,40 @@ class ProbeFit:
         with torch.no_grad():
             return torch.sigmoid(self.probe(activations.to(self.device)))
 
+    @property
+    def direction(self) -> torch.Tensor:
+        """1D direction vector in activation space (linear probes only).
+
+        Shape: ``(hidden_dim,)``. This is the projection axis the probe found —
+        useful for: applying a probe to externally-transformed activations
+        (e.g., after ``archscope.transfer.learn_alignment``), inspecting feature
+        geometry, or projecting interventions along the learned direction.
+
+        Raises ``ValueError`` for MLP probes (no single linear direction).
+        """
+        if self.config.probe_type != "linear":
+            raise ValueError(
+                f".direction is only defined for linear probes (got "
+                f"probe_type={self.config.probe_type!r}). MLP probes don't have a "
+                "single direction in activation space."
+            )
+        return self.probe.net.weight.detach().squeeze(0).clone()
+
+    @property
+    def bias(self) -> torch.Tensor:
+        """Scalar bias term (linear probes only). Shape: ``()``.
+
+        Together with ``.direction``, lets you score arbitrary activations as
+        ``logits = acts @ direction + bias`` without going through the probe
+        module — handy for cross-arch transfer experiments.
+        """
+        if self.config.probe_type != "linear":
+            raise ValueError(
+                f".bias is only defined for linear probes (got "
+                f"probe_type={self.config.probe_type!r})."
+            )
+        return self.probe.net.bias.detach().squeeze().clone()
+
 
 def _auroc(logits: torch.Tensor, labels: torch.Tensor) -> float:
     """AUROC from logits + binary labels.
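The `acts @ direction + bias` identity from the docstring is the intended use of the new accessors. A short sketch; the `ProbeConfig`/`ProbeFit.train` call pattern follows the new accessor test, while the layer name and the random data are illustrative:

```python
import torch
from archscope.probes import ProbeFit, ProbeConfig

cfg = ProbeConfig(layer_name="layer_6.residual", probe_type="linear")
pf = ProbeFit(cfg, input_dim=8)
pf.train(torch.randn(80, 8), (torch.rand(80) > 0.5).float(),
         epochs=30, batch_size=16)

# Score activations without going through the probe module.
acts = torch.randn(5, 8)
logits = acts @ pf.direction + pf.bias     # matches pf.probe(acts)
probs = torch.sigmoid(logits)
```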
{archscope-0.2.5 → archscope-0.2.7/src/archscope.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: archscope
-Version: 0.2.5
+Version: 0.2.7
 Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
 Author: Juan Cruz Dovzak
 License: Apache-2.0
@@ -58,18 +58,17 @@ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader
 
 ```python
 import archscope as mi
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
-model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
 
-
+# One call → HuggingFace model + tokenizer + the right backend
+model, tok, backend = mi.load_model("state-spaces/mamba-130m-hf", arch="mamba")
 
 # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
 ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
 # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
 ```
 
+`load_model` handles `pad_token` setup, `model.eval()`, and backend auto-detection. If you'd rather drive `transformers` yourself, every method also accepts `backend_hint=...`.
+
 ---
 
 ## What's inside
@@ -105,6 +104,26 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
 
 If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
 
+### Method × backend support
+
+Not every method works on every architecture. The cross-product:
+
+| Method | transformer | mamba | kazdov | recurrent |
+|---|:---:|:---:|:---:|:---:|
+| `probes.fit_probe` | ✅ | ✅ | ✅ | ✅ |
+| `sae.fit_sae` (Dense / Rank-1) | ✅ | ✅ | ✅ | ✅ |
+| `neurons.find_neurons` | ✅ | ✅ | ✅ | ✅ |
+| `attribute.activation_patch` | ✅ | ✅ residual only | ✅ | ⚠️ subclass needed |
+| `attribute.dim_decompose` | ✅ | ❌ no attention/MLP submods | ✅ | ❌ |
+| `circuits.*` (behavioural) | ✅ | ✅ | ✅ | ✅ |
+| `lens.logit_lens` | ✅ | ⚠️ degrades with depth — use `TunedLens` | ✅ | ⚠️ |
+| `lens.TunedLens.fit` | ✅ | ✅ | ✅ | ⚠️ |
+| `diff.compare` | ✅ | ✅ | ✅ | ✅ |
+| `transfer.evaluate_transfer` | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any |
+| `bench.benchmark` | ✅ | ✅ | ✅ | partial |
+
+❌ entries raise a clear `ValueError` rather than silently degrading.
+
 ---
 
 ## Install
{archscope-0.2.5 → archscope-0.2.7}/tests/test_unit.py

@@ -22,7 +22,7 @@ def test_imports():
     import archscope
     from archscope import (probes, sae, neurons, attribute, backends,  # noqa: F401
                            circuits, transfer, bench, lens, diff)  # noqa: F401
-    assert archscope.__version__ == "0.2.5"
+    assert archscope.__version__ == "0.2.7"
 
 
 def test_loader_exports():
@@ -244,6 +244,71 @@ def test_neurons_layer_filter_rejects_nonmatching():
     assert cfg.layer_filter == "not_a_substring"
 
 
+def test_induction_head_score_small_vocab_clear_error():
+    """induction_head_score raises a clear error when vocab is too small."""
+    from archscope.circuits import induction_head_score
+
+    class _TinyModel:
+        class config:
+            vocab_size = 40  # << 2*n_pairs + 100
+        def __call__(self, ids):
+            return torch.zeros(1, ids.shape[1], 40)
+
+    with pytest.raises(ValueError) as ei:
+        induction_head_score(_TinyModel(), n_pairs=20, n_trials=1)
+    assert "vocab window" in str(ei.value).lower() or "n_pairs" in str(ei.value)
+
+
+def test_probefit_direction_and_bias_accessors():
+    """ProbeFit exposes .direction and .bias for linear probes."""
+    from archscope.probes import ProbeFit, ProbeConfig
+    torch.manual_seed(0)
+    pos = torch.randn(40, 8) + 1.5
+    neg = torch.randn(40, 8) - 1.5
+    cfg = ProbeConfig(layer_name="x", probe_type="linear")
+    pf = ProbeFit(cfg, input_dim=8)
+    pf.train(torch.cat([pos, neg]), torch.cat([torch.ones(40), torch.zeros(40)]),
+             epochs=30, batch_size=16)
+    d, b = pf.direction, pf.bias
+    assert d.shape == (8,), f"direction shape: {d.shape}"
+    assert b.dim() == 0, f"bias should be scalar: {b.shape}"
+    # Manual application matches what probe.score does (up to sigmoid).
+    test_act = torch.randn(3, 8)
+    manual_logits = test_act @ d + b
+    via_probe = pf.probe(test_act)
+    assert torch.allclose(manual_logits, via_probe, atol=1e-5), \
+        "direction @ acts + bias should equal probe(acts)"
+
+
+def test_probefit_direction_rejects_mlp():
+    """.direction raises on MLP probes."""
+    from archscope.probes import ProbeFit, ProbeConfig
+    cfg = ProbeConfig(layer_name="x", probe_type="mlp")
+    pf = ProbeFit(cfg, input_dim=8)
+    with pytest.raises(ValueError) as ei:
+        _ = pf.direction
+    assert "linear" in str(ei.value).lower()
+
+
+def test_dim_decompose_rejects_mamba_style_model():
+    """dim_decompose raises on models with no attention/MLP submodules."""
+    from archscope.attribute import dim_decompose
+
+    class _NoSubmods(torch.nn.Module):
+        def forward(self, **kwargs):
+            class Out:
+                logits = torch.zeros(1, 3, 8)
+            return Out()
+
+    with pytest.raises(ValueError) as ei:
+        dim_decompose(_NoSubmods(),
+                      prompt_a={"input_ids": torch.tensor([[1, 2, 3]])},
+                      prompt_b={"input_ids": torch.tensor([[4, 5, 6]])},
+                      layer_indices=[0, 1],
+                      metric_fn=lambda o: 0.0)
+    assert "attention" in str(ei.value).lower() or "submod" in str(ei.value).lower()
+
+
 if __name__ == "__main__":
     # Allow `python tests/test_unit.py` for quick local check
     pytest.main([__file__, "-v"])
The remaining 25 files listed above are unchanged between 0.2.5 and 0.2.7 (0 additions, 0 deletions).