archscope 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. archscope-0.2.2/LICENSE +17 -0
  2. archscope-0.2.2/PKG-INFO +324 -0
  3. archscope-0.2.2/README.md +287 -0
  4. archscope-0.2.2/pyproject.toml +45 -0
  5. archscope-0.2.2/setup.cfg +4 -0
  6. archscope-0.2.2/src/archscope/__init__.py +30 -0
  7. archscope-0.2.2/src/archscope/_utils.py +113 -0
  8. archscope-0.2.2/src/archscope/attribute.py +201 -0
  9. archscope-0.2.2/src/archscope/backends.py +236 -0
  10. archscope-0.2.2/src/archscope/bench.py +262 -0
  11. archscope-0.2.2/src/archscope/circuits.py +255 -0
  12. archscope-0.2.2/src/archscope/cli.py +120 -0
  13. archscope-0.2.2/src/archscope/diff.py +212 -0
  14. archscope-0.2.2/src/archscope/kazdov_backend.py +141 -0
  15. archscope-0.2.2/src/archscope/lens.py +304 -0
  16. archscope-0.2.2/src/archscope/neurons.py +118 -0
  17. archscope-0.2.2/src/archscope/probes.py +160 -0
  18. archscope-0.2.2/src/archscope/sae.py +127 -0
  19. archscope-0.2.2/src/archscope/transfer.py +188 -0
  20. archscope-0.2.2/src/archscope.egg-info/PKG-INFO +324 -0
  21. archscope-0.2.2/src/archscope.egg-info/SOURCES.txt +32 -0
  22. archscope-0.2.2/src/archscope.egg-info/dependency_links.txt +1 -0
  23. archscope-0.2.2/src/archscope.egg-info/entry_points.txt +2 -0
  24. archscope-0.2.2/src/archscope.egg-info/requires.txt +19 -0
  25. archscope-0.2.2/src/archscope.egg-info/top_level.txt +1 -0
  26. archscope-0.2.2/tests/test_circuits_3arch.py +109 -0
  27. archscope-0.2.2/tests/test_diff.py +87 -0
  28. archscope-0.2.2/tests/test_kazdov_integration.py +159 -0
  29. archscope-0.2.2/tests/test_lens.py +118 -0
  30. archscope-0.2.2/tests/test_mamba_integration.py +136 -0
  31. archscope-0.2.2/tests/test_mamba_ssm_state.py +104 -0
  32. archscope-0.2.2/tests/test_probe_transfer.py +200 -0
  33. archscope-0.2.2/tests/test_pythia_end_to_end.py +227 -0
  34. archscope-0.2.2/tests/test_unit.py +150 -0
@@ -0,0 +1,17 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Copyright 2026 Juan Cruz Dovzak
6
+
7
+ Licensed under the Apache License, Version 2.0 (the "License");
8
+ you may not use this file except in compliance with the License.
9
+ You may obtain a copy of the License at
10
+
11
+ http://www.apache.org/licenses/LICENSE-2.0
12
+
13
+ Unless required by applicable law or agreed to in writing, software
14
+ distributed under the License is distributed on an "AS IS" BASIS,
15
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ See the License for the specific language governing permissions and
17
+ limitations under the License.
@@ -0,0 +1,324 @@
1
+ Metadata-Version: 2.4
2
+ Name: archscope
3
+ Version: 0.2.2
4
+ Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
5
+ Author: Juan Cruz Dovzak
6
+ License: Apache-2.0
7
+ Keywords: mechanistic-interpretability,sparse-autoencoders,probes,RNN,Mamba,transformer
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: torch>=2.1.0
21
+ Requires-Dist: numpy>=1.26.0
22
+ Requires-Dist: einops>=0.7.0
23
+ Requires-Dist: click>=8.1.0
24
+ Requires-Dist: rich>=13.0.0
25
+ Requires-Dist: transformers>=4.40.0
26
+ Requires-Dist: datasets>=2.19.0
27
+ Requires-Dist: scikit-learn>=1.4.0
28
+ Provides-Extra: jax
29
+ Requires-Dist: jax>=0.4.30; extra == "jax"
30
+ Requires-Dist: flax>=0.8.4; extra == "jax"
31
+ Provides-Extra: mamba
32
+ Requires-Dist: mamba-ssm>=1.2; extra == "mamba"
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=8.0; extra == "dev"
35
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # archscope
39
+
40
+ **Mechanistic interpretability experiments across architectures — Transformers, SSMs/Mamba, recurrent models, and hybrids.**
41
+
42
+ [![CI](https://github.com/OriginalKazdov/archscope/actions/workflows/ci.yml/badge.svg)](https://github.com/OriginalKazdov/archscope/actions/workflows/ci.yml)
43
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org)
44
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE)
45
+
46
+ ## What archscope is
47
+
48
+ `archscope` is a **small-model interpretability workbench**. It's designed for quick, reproducible experiments across model families — not for large-scale SAE training, production model auditing, or replacing mature Transformer-specific tools.
49
+
50
+ Use it when you want to ask:
51
+ - *Can I extract comparable activations from different architectures?*
52
+ - *Do linear probes transfer across model families?*
53
+ - *Do induction-like behaviors appear outside attention?*
54
+ - *Did a fine-tuned model drift in specific layers?*
55
+ - *Do dense or rank-1 SAEs reconstruct this model family better at this layer?*
56
+
57
+ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader and more mature), a production audit tool, or a SaaS. It's a small, hackable workbench.
58
+
59
+ ```python
60
+ import archscope as mi
61
+ from transformers import AutoModelForCausalLM, AutoTokenizer
62
+
63
+ tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
64
+ model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
65
+
66
+ backend = mi.backends.Backend.for_model(model, hint="mamba")
67
+
68
+ # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
69
+ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
70
+ # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
71
+ ```
72
+
73
+ ---
74
+
75
+ ## What's inside
76
+
77
+ ### Core mech-interp methods
78
+
79
+ | Module | What it does | Source |
80
+ |---|---|---|
81
+ | `probes` | Linear/MLP probes on hidden states | Drop the Act (arXiv:2605.11467) |
82
+ | `sae` | Dense + Rank-1 factored sparse autoencoders | WriteSAE (arXiv:2605.12770) |
83
+ | `neurons` | Top-K contrastive neuron modulation | Targeted Neuron Mod (arXiv:2605.12290) |
84
+ | `attribute` | Activation patching + DIM decomposition | Multi-Agent Sycophancy (arXiv:2605.12991) |
85
+ | `circuits` | Induction, copy, attention-concentration detectors | Olsson et al 2022 |
86
+ | `lens` | Logit lens + Tuned lens | Belrose et al 2023 |
87
+ | `diff` | Model-diff: base vs fine-tuned, find what changed | this library |
88
+
89
+ ### Experiment infrastructure
90
+
91
+ | Module | What it does |
92
+ |---|---|
93
+ | `backends` | Unified extraction API across architectures |
94
+ | `transfer` | Cross-arch probe transfer via paired-activation linear alignment |
95
+ | `bench` | InterpProfile — standardized comparable profile (`mi.bench.benchmark()`) |
96
+
97
+ ### Backends
98
+
99
+ | Backend | Models | Specifics |
100
+ |---|---|---|
101
+ | `transformer` | Pythia, GPT-2, Llama, Mistral, Qwen, MPT, Falcon, GPT-Neo | residual stream |
102
+ | `mamba` | Mamba, Mamba-2 | residual + explicit `.ssm_state` (recurrent h_t) |
103
+ | `kazdov` | Kazdov-α hybrid MoBE-BCN+MHA | residual per custom block |
104
+ | `recurrent` | Generic RNN (user subclass) | hidden state per layer |
105
+
106
+ ---
107
+
108
+ ## Install
109
+
110
+ ```bash
111
+ pip install archscope # once on PyPI
112
+ # or:
113
+ git clone https://github.com/OriginalKazdov/archscope.git
114
+ cd archscope && pip install -e .
115
+ ```
116
+
117
+ For Mamba on CPU you don't need `mamba-ssm` — HF's slow path works. On CUDA install `mamba-ssm` for the fast path.
118
+
119
+ ---
120
+
121
+ ## Quick examples
122
+
123
+ ### Train a probe on any architecture
124
+
125
+ ```python
126
+ import archscope as mi
127
+ from transformers import AutoModelForCausalLM, AutoTokenizer
128
+
129
+ model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-160m")
130
+ tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")
131
+ tk = lambda txts: tok(txts, return_tensors="pt", padding=True, truncation=True)
132
+
133
+ probe = mi.probes.fit_probe(
134
+ model,
135
+ inputs_pos=tk(["I love this", "Wonderful!", "Amazing"]),
136
+ inputs_neg=tk(["I hate this", "Awful", "Terrible"]),
137
+ layer_name="layer_5.residual",
138
+ backend_hint="transformer",
139
+ )
140
+ print(probe.metrics) # {'train_auroc': 1.0, ...}
141
+ ```
142
+
143
+ ### Extract Mamba's SSM recurrent state
144
+
145
+ ```python
146
+ backend = mi.backends.Backend.for_model(mamba_model, hint="mamba")
147
+ rec = backend.extract(tk("Hello world"), layers=["layer_12.ssm_state"])[0]
148
+ # rec.activations.shape == (B, intermediate_size, ssm_state_size)
149
+ # This is the actual recurrent memory h_t of Mamba — exposed via the same
150
+ # extraction API used for Transformer residual streams.
151
+ ```
152
+
153
+ ### Logit lens / tuned lens — see what each layer "thinks"
154
+
155
+ ```python
156
+ result = mi.lens.logit_lens(
157
+ model, tok,
158
+ prompt="The capital of France is",
159
+ target_token=" Paris",
160
+ backend_hint="transformer",
161
+ )
162
+ print(result.to_markdown())
163
+
164
+ # Tuned lens — learned per-layer projections (Belrose et al 2023):
165
+ tl = mi.lens.TunedLens.fit(model, tok, calibration_texts, backend_hint="transformer")
166
+ tl.predict(model, tok, "...", backend_hint="transformer")
167
+ ```
168
+
169
+ ### Model Diff — what did fine-tuning change?
170
+
171
+ ```python
172
+ from archscope.diff import compare
173
+
174
+ result = compare(
175
+ base_model, fine_tuned_model, tokenizer,
176
+ calibration_texts=texts,
177
+ backend_hint="transformer",
178
+ )
179
+ print(result.to_markdown())
180
+ # Per-layer residual drift, top shifted neurons, circuit deltas.
181
+ ```
182
+
183
+ ### Detect circuits cross-arch
184
+
185
+ ```python
186
+ scores = mi.circuits.run_all_circuits(model, tokenizer=tok)
187
+ print(scores["induction_head"].relative) # × chance
188
+ print(scores["copy_circuit"].score) # accuracy
189
+ ```
190
+
191
+ ### InterpBench — standardized model profile
192
+
193
+ ```python
194
+ profile = mi.bench.benchmark(
195
+ "EleutherAI/pythia-160m", model, tok,
196
+ backend_hint="transformer", arch_family="transformer",
197
+ tokenize_fn=tk,
198
+ )
199
+ print(mi.bench.profile_to_markdown(profile))
200
+ ```
201
+
202
+ CLI:
203
+ ```bash
204
+ archscope info
205
+ archscope bench EleutherAI/pythia-160m --arch transformer --out pythia.json
206
+ archscope bench state-spaces/mamba-130m-hf --arch mamba
207
+ ```
208
+
209
+ ---
210
+
211
+ ## Findings — running archscope on a mini-zoo of 7 small models
212
+
213
+ Each model profiled with `bench.benchmark()` (probes + circuits + dense vs rank-1 SAE). ~10 min total compute on CPU.
214
+
215
+ ### Reproduce
216
+
217
+ ```bash
218
+ python scripts/reproduce_mini_zoo.py
219
+ # → _research/mini_zoo_leaderboard.json
220
+ # → _research/mini_zoo_leaderboard.md
221
+ ```
222
+
223
+ Skip specific models with `--skip Mamba-370m` if memory-tight. Kazdov-α is included only if the local checkpoint is available.
224
+
225
+ | Model | Arch | Params | Induction (× chance) | SAE-dense | SAE-rank1 | SSM var |
226
+ |---|---|---|---|---|---|---|
227
+ | Pythia-160m | transformer | 162M | 490× | 0.019 | 0.025 | — |
228
+ | Pythia-410m | transformer | 405M | 3,261× | 0.075 | 0.135 | — |
229
+ | GPT-2 | transformer | 124M | 6,393× | 5.731 | **0.608** | — |
230
+ | Mamba-130m | SSM | 129M | 6,378× | 0.048 | **0.032** | 0.54 |
231
+ | Mamba-370m | SSM | 372M | **7,730×** | 0.022 | 0.027 | 0.73 |
232
+ | Qwen2.5-0.5B | transformer | 494M | **17,637×** | 0.092 | 0.068 | — |
233
+ | kazdov-α | hybrid | 98M | 2,700× | 0.043 | **0.004** | — |
234
+
235
+ **Open questions raised by this run** (single-seed observations, not formal claims):
236
+
237
+ - **Does induction-like behavior require attention heads?** Mamba — which has no attention mechanism — scores 6,378–7,730× chance on our behavioral induction test, comparable to or above similarly-sized Transformers. The test is behavioral (output-based), so it doesn't presume any specific mechanism. What in SSMs implements this behavior?
238
+ - **Why does naive logit lens degrade with depth on Mamba?** Applying each model's own `lm_head` to its intermediate residuals surfaces the target with depth on Pythia (target rank 5117 → 77 across 12 layers on "capital of France is _Paris_"). The same procedure on Mamba moves the target *away* from top-1 (rank 197 → 1049 across 24 layers). Does this hold across more SSM checkpoints? Is tuned-lens enough to fix it?
239
+ - **Is rank-1 SAE preference architecture-driven or layer-driven?** In this run, GPT-2, both Mambas, and kazdov-α reconstructed better with rank-1 factored SAEs at the tested mid-layer; both Pythias preferred dense; Qwen was marginal. Suggestive but needs layer sweeps + multiple seeds before claiming a pattern.
240
+ - **How much do training recipe, tokenizer, and data affect induction-like behavior?** Qwen2.5-0.5B shows 17,637× induction — 5.4× higher than Pythia-410m at similar size. Plausibly attributable to data curation + training stability since 2023, but we haven't isolated the cause.
241
+ - **Does Mamba's SSM-state utilization scale with model size?** In this run, the input-dependent variance ratio rose 0.54 (Mamba-130m) → 0.73 (Mamba-370m). Does this trend hold across more checkpoints?
242
+
243
+ These aren't published findings — they're observations from a single mini-zoo run. Methodological corrections welcome.
244
+
245
+ ### Metrics caveats
246
+
247
+ - **Induction score** is behavioral (output-based), not proof of a specific circuit. It tells you the model copies `A→B` associations in-context; it doesn't tell you *how*.
248
+ - **SAE reconstruction error** is measured on a small sample of mid-layer activations. Lower is better. Numbers are not comparable across layers with different residual magnitudes (e.g., Pythia L11 has very large residuals which dominate dense SAE recon).
249
+ - **SSM-state variance ratio** is descriptive — it tells you whether the state changes meaningfully across inputs, not whether the state is *causally used* downstream.
250
+ - **Logit lens** results are diagnostic, not a guarantee of representational alignment. Naive logit lens applies the *final* `lm_head` to intermediate residuals — when that fails, it just means the residuals aren't in the final-layer vocab space (e.g., Mamba). `TunedLens` is the fix.
251
+ - All probes/SAEs/circuit tests in InterpBench are **single-seed**. Treat differences <2× as noise.
252
+
253
+ ---
254
+
255
+ ## Honest limits
256
+
257
+ `archscope` is a v0.2 release. What it does well: cross-architecture mech-interp primitives, unified API, real observable findings, validated on multiple architectures. What it doesn't do yet:
258
+
259
+ - No causal scrubbing (gold-standard circuit verification)
260
+ - No interactive notebook viz (matplotlib helpers are TBD)
261
+ - Circuit detection is limited to induction / copy / attention-concentration — no IOI, name-mover, or successor heads yet
262
+ - Mamba-2 backend support is partial (Mamba-1 fully supported)
263
+ - No pretrained SAE collection (you train your own per layer)
264
+ - Probe transfer assumes same-tokenizer paired data
265
+
266
+ See [`CONTRIBUTING.md`](CONTRIBUTING.md) for what we welcome (new backends, new circuit detectors, viz helpers).
267
+
268
+ For mature Transformer-centric workflows, prefer [`transformer_lens`](https://github.com/TransformerLensOrg/TransformerLens) or [`nnsight`](https://nnsight.net/). They are broader and more mature; `archscope` focuses on lightweight cross-architecture experiments and small / non-standard model workflows.
269
+
270
+ ---
271
+
272
+ ## Citation
273
+
274
+ ```bibtex
275
+ @misc{dovzak2026archscope,
276
+ title = {archscope: Cross-architecture mechanistic interpretability experiments},
277
+ author = {Juan Cruz Dovzak},
278
+ year = {2026},
279
+ url = {https://github.com/OriginalKazdov/archscope}
280
+ }
281
+ ```
282
+
283
+ Source papers reimplemented or wrapped:
284
+ - WriteSAE — arXiv:2605.12770
285
+ - Drop the Act / ProFIL — arXiv:2605.11467
286
+ - Targeted Neuron Modulation — arXiv:2605.12290
287
+ - Multi-Agent Sycophancy — arXiv:2605.12991
288
+ - Tuned Lens (Belrose et al, 2023)
289
+ - Induction heads (Olsson et al, 2022)
290
+
291
+ ---
292
+
293
+ ## Troubleshooting
294
+
295
+ ### "The fast path is not available because ..." (Mamba on CPU)
296
+
297
+ Normal. Mamba falls back to a slow pure-PyTorch path that works correctly (~30s per benchmark vs ~1s on CUDA). Install `pip install mamba-ssm causal-conv1d` only on CUDA machines.
298
+
299
+ ### Custom backend not auto-detected
300
+
301
+ Pass `Backend.for_model(model, hint="my_backend")` explicitly. Auto-detection uses `config.model_type`.
302
+
303
+ ### `RuntimeError: Trying to backward through the graph a second time`
304
+
305
+ Activations from `Backend.extract()` carry the autograd graph by default. Call `.detach()` before reusing, or extract inside `torch.no_grad()`. The high-level `probes.fit_probe()` does this for you.
306
+
307
+ ---
308
+
309
+ ## Roadmap (post-0.2.0)
310
+
311
+ - Multi-token circuit detection: IOI, name-mover, successor heads
312
+ - Mamba-2 backend with same `.ssm_state` API
313
+ - Cross-arch SAE feature alignment (extend `transfer.py` from probes to features)
314
+ - Pretrained SAE collection for common small models
315
+ - Plotly/matplotlib viz helpers
316
+ - HuggingFace Space demo
317
+
318
+ PRs welcome — see [`CONTRIBUTING.md`](CONTRIBUTING.md).
319
+
320
+ ---
321
+
322
+ ## License
323
+
324
+ Apache-2.0
@@ -0,0 +1,287 @@
1
+ # archscope
2
+
3
+ **Mechanistic interpretability experiments across architectures — Transformers, SSMs/Mamba, recurrent models, and hybrids.**
4
+
5
+ [![CI](https://github.com/OriginalKazdov/archscope/actions/workflows/ci.yml/badge.svg)](https://github.com/OriginalKazdov/archscope/actions/workflows/ci.yml)
6
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org)
7
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE)
8
+
9
+ ## What archscope is
10
+
11
+ `archscope` is a **small-model interpretability workbench**. It's designed for quick, reproducible experiments across model families — not for large-scale SAE training, production model auditing, or replacing mature Transformer-specific tools.
12
+
13
+ Use it when you want to ask:
14
+ - *Can I extract comparable activations from different architectures?*
15
+ - *Do linear probes transfer across model families?*
16
+ - *Do induction-like behaviors appear outside attention?*
17
+ - *Did a fine-tuned model drift in specific layers?*
18
+ - *Do dense or rank-1 SAEs reconstruct this model family better at this layer?*
19
+
20
+ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader and more mature), a production audit tool, or a SaaS. It's a small, hackable workbench.
21
+
22
+ ```python
23
+ import archscope as mi
24
+ from transformers import AutoModelForCausalLM, AutoTokenizer
25
+
26
+ tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
27
+ model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
28
+
29
+ backend = mi.backends.Backend.for_model(model, hint="mamba")
30
+
31
+ # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
32
+ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
33
+ # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
34
+ ```
35
+
36
+ ---
37
+
38
+ ## What's inside
39
+
40
+ ### Core mech-interp methods
41
+
42
+ | Module | What it does | Source |
43
+ |---|---|---|
44
+ | `probes` | Linear/MLP probes on hidden states | Drop the Act (arXiv:2605.11467) |
45
+ | `sae` | Dense + Rank-1 factored sparse autoencoders | WriteSAE (arXiv:2605.12770) |
46
+ | `neurons` | Top-K contrastive neuron modulation | Targeted Neuron Mod (arXiv:2605.12290) |
47
+ | `attribute` | Activation patching + DIM decomposition | Multi-Agent Sycophancy (arXiv:2605.12991) |
48
+ | `circuits` | Induction, copy, attention-concentration detectors | Olsson et al 2022 |
49
+ | `lens` | Logit lens + Tuned lens | Belrose et al 2023 |
50
+ | `diff` | Model-diff: base vs fine-tuned, find what changed | this library |
51
+
52
+ ### Experiment infrastructure
53
+
54
+ | Module | What it does |
55
+ |---|---|
56
+ | `backends` | Unified extraction API across architectures |
57
+ | `transfer` | Cross-arch probe transfer via paired-activation linear alignment |
58
+ | `bench` | InterpProfile — standardized comparable profile (`mi.bench.benchmark()`) |
59
+
60
+ ### Backends
61
+
62
+ | Backend | Models | Specific |
63
+ |---|---|---|
64
+ | `transformer` | Pythia, GPT-2, Llama, Mistral, Qwen, MPT, Falcon, GPT-Neo | residual stream |
65
+ | `mamba` | Mamba, Mamba-2 | residual + explicit `.ssm_state` (recurrent h_t) |
66
+ | `kazdov` | Kazdov-α hybrid MoBE-BCN+MHA | residual per custom block |
67
+ | `recurrent` | Generic RNN (user subclass) | hidden state per layer |
68
+
69
+ ---
70
+
71
+ ## Install
72
+
73
+ ```bash
74
+ pip install archscope # once on PyPI
75
+ # or:
76
+ git clone https://github.com/OriginalKazdov/archscope.git
77
+ cd archscope && pip install -e .
78
+ ```
79
+
80
+ For Mamba on CPU you don't need `mamba-ssm` — HF's slow path works. On CUDA install `mamba-ssm` for the fast path.
81
+
82
+ ---
83
+
84
+ ## Quick examples
85
+
86
+ ### Train a probe on any architecture
87
+
88
+ ```python
89
+ import archscope as mi
90
+ from transformers import AutoModelForCausalLM, AutoTokenizer
91
+
92
+ model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-160m")
93
+ tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")
94
+ tk = lambda txts: tok(txts, return_tensors="pt", padding=True, truncation=True)
95
+
96
+ probe = mi.probes.fit_probe(
97
+ model,
98
+ inputs_pos=tk(["I love this", "Wonderful!", "Amazing"]),
99
+ inputs_neg=tk(["I hate this", "Awful", "Terrible"]),
100
+ layer_name="layer_5.residual",
101
+ backend_hint="transformer",
102
+ )
103
+ print(probe.metrics) # {'train_auroc': 1.0, ...}
104
+ ```
105
+
106
+ ### Extract Mamba's SSM recurrent state
107
+
108
+ ```python
109
+ backend = mi.backends.Backend.for_model(mamba_model, hint="mamba")
110
+ rec = backend.extract(tk("Hello world"), layers=["layer_12.ssm_state"])[0]
111
+ # rec.activations.shape == (B, intermediate_size, ssm_state_size)
112
+ # This is the actual recurrent memory h_t of Mamba — exposed via the same
113
+ # extraction API used for Transformer residual streams.
114
+ ```
115
+
116
+ ### Logit lens / tuned lens — see what each layer "thinks"
117
+
118
+ ```python
119
+ result = mi.lens.logit_lens(
120
+ model, tok,
121
+ prompt="The capital of France is",
122
+ target_token=" Paris",
123
+ backend_hint="transformer",
124
+ )
125
+ print(result.to_markdown())
126
+
127
+ # Tuned lens — learned per-layer projections (Belrose et al 2023):
128
+ tl = mi.lens.TunedLens.fit(model, tok, calibration_texts, backend_hint="transformer")
129
+ tl.predict(model, tok, "...", backend_hint="transformer")
130
+ ```
131
+
132
+ ### Model Diff — what did fine-tuning change?
133
+
134
+ ```python
135
+ from archscope.diff import compare
136
+
137
+ result = compare(
138
+ base_model, fine_tuned_model, tokenizer,
139
+ calibration_texts=texts,
140
+ backend_hint="transformer",
141
+ )
142
+ print(result.to_markdown())
143
+ # Per-layer residual drift, top shifted neurons, circuit deltas.
144
+ ```
145
+
146
+ ### Detect circuits cross-arch
147
+
148
+ ```python
149
+ scores = mi.circuits.run_all_circuits(model, tokenizer=tok)
150
+ print(scores["induction_head"].relative) # × chance
151
+ print(scores["copy_circuit"].score) # accuracy
152
+ ```
153
+
154
+ ### InterpBench — standardized model profile
155
+
156
+ ```python
157
+ profile = mi.bench.benchmark(
158
+ "EleutherAI/pythia-160m", model, tok,
159
+ backend_hint="transformer", arch_family="transformer",
160
+ tokenize_fn=tk,
161
+ )
162
+ print(mi.bench.profile_to_markdown(profile))
163
+ ```
164
+
165
+ CLI:
166
+ ```bash
167
+ archscope info
168
+ archscope bench EleutherAI/pythia-160m --arch transformer --out pythia.json
169
+ archscope bench state-spaces/mamba-130m-hf --arch mamba
170
+ ```
171
+
172
+ ---
173
+
174
+ ## Findings — running archscope on a mini-zoo of 7 small models
175
+
176
+ Each model profiled with `bench.benchmark()` (probes + circuits + dense vs rank-1 SAE). ~10 min total compute on CPU.
177
+
178
+ ### Reproduce
179
+
180
+ ```bash
181
+ python scripts/reproduce_mini_zoo.py
182
+ # → _research/mini_zoo_leaderboard.json
183
+ # → _research/mini_zoo_leaderboard.md
184
+ ```
185
+
186
+ Skip specific models with `--skip Mamba-370m` if memory-tight. Kazdov-α is included only if the local checkpoint is available.
187
+
188
+ | Model | Arch | Params | Induction (× chance) | SAE-dense | SAE-rank1 | SSM var |
189
+ |---|---|---|---|---|---|---|
190
+ | Pythia-160m | transformer | 162M | 490× | 0.019 | 0.025 | — |
191
+ | Pythia-410m | transformer | 405M | 3,261× | 0.075 | 0.135 | — |
192
+ | GPT-2 | transformer | 124M | 6,393× | 5.731 | **0.608** | — |
193
+ | Mamba-130m | SSM | 129M | 6,378× | 0.048 | **0.032** | 0.54 |
194
+ | Mamba-370m | SSM | 372M | **7,730×** | 0.022 | 0.027 | 0.73 |
195
+ | Qwen2.5-0.5B | transformer | 494M | **17,637×** | 0.092 | 0.068 | — |
196
+ | kazdov-α | hybrid | 98M | 2,700× | 0.043 | **0.004** | — |
197
+
198
+ **Open questions raised by this run** (single-seed observations, not formal claims):
199
+
200
+ - **Does induction-like behavior require attention heads?** Mamba — which has no attention mechanism — scores 6378-7730× chance on our behavioral induction test, comparable to or above similarly-sized Transformers. The test is behavioral (output-based), so it doesn't presume any specific mechanism. What in SSMs implements this behavior?
201
+ - **Why does naive logit lens degrade with depth on Mamba?** Applying each model's own `lm_head` to its intermediate residuals surfaces the target with depth on Pythia (target rank 5117 → 77 across 12 layers on "capital of France is _Paris_"). The same procedure on Mamba moves the target *away* from top-1 (rank 197 → 1049 across 24 layers). Does this hold across more SSM checkpoints? Is tuned-lens enough to fix it?
202
+ - **Is rank-1 SAE preference architecture-driven or layer-driven?** In this run, GPT-2, both Mambas, and kazdov-α reconstructed better with rank-1 factored SAEs at the tested mid-layer; both Pythias preferred dense; Qwen was marginal. Suggestive but needs layer sweeps + multiple seeds before claiming a pattern.
203
+ - **How much do training recipe, tokenizer, and data affect induction-like behavior?** Qwen2.5-0.5B shows 17,637× induction — 5.4× higher than Pythia-410m at similar size. Plausibly attributable to data curation + training stability since 2023, but we haven't isolated the cause.
204
+ - **Does Mamba's SSM-state utilization scale with model size?** In this run, the input-dependent variance ratio rose 0.54 (Mamba-130m) → 0.73 (Mamba-370m). Does this trend hold across more checkpoints?
205
+
206
+ These aren't published findings — they're observations from a single mini-zoo run. Methodological corrections welcome.
207
+
208
+ ### Metrics caveats
209
+
210
+ - **Induction score** is behavioral (output-based), not proof of a specific circuit. It tells you the model copies `A→B` associations in-context; it doesn't tell you *how*.
211
+ - **SAE reconstruction error** is measured on a small sample of mid-layer activations. Lower is better. Numbers are not comparable across layers with different residual magnitudes (e.g., Pythia L11 has very large residuals which dominate dense SAE recon).
212
+ - **SSM-state variance ratio** is descriptive — it tells you whether the state changes meaningfully across inputs, not whether the state is *causally used* downstream.
213
+ - **Logit lens** results are diagnostic, not a guarantee of representational alignment. Naive logit lens applies the *final* `lm_head` to intermediate residuals — when that fails, it just means the residuals aren't in the final-layer vocab space (e.g., Mamba). `TunedLens` is the fix.
214
+ - All probes/SAEs/circuit tests in InterpBench are **single-seed**. Treat differences <2× as noise.
215
+
216
+ ---
217
+
218
+ ## Honest limits
219
+
220
+ `archscope` is a v0.2 release. What it does well: cross-architecture mech-interp primitives, unified API, real observable findings, validated on multiple architectures. What it doesn't do yet:
221
+
222
+ - No causal scrubbing (gold-standard circuit verification)
223
+ - No interactive notebook viz (matplotlib helpers are TBD)
224
+ - Circuit detection is limited to induction / copy / attention-concentration — no IOI, name-mover, or successor heads yet
225
+ - Mamba-2 backend support is partial (Mamba-1 fully supported)
226
+ - No pretrained SAE collection (you train your own per layer)
227
+ - Probe transfer assumes same-tokenizer paired data
228
+
229
+ See [`CONTRIBUTING.md`](CONTRIBUTING.md) for what we welcome (new backends, new circuit detectors, viz helpers).
230
+
231
+ For mature Transformer-centric workflows, prefer [`transformer_lens`](https://github.com/TransformerLensOrg/TransformerLens) or [`nnsight`](https://nnsight.net/). They are broader and more mature; `archscope` focuses on lightweight cross-architecture experiments and small / non-standard model workflows.
232
+
233
+ ---
234
+
235
+ ## Citation
236
+
237
+ ```bibtex
238
+ @misc{dovzak2026archscope,
239
+ title = {archscope: Cross-architecture mechanistic interpretability experiments},
240
+ author = {Juan Cruz Dovzak},
241
+ year = {2026},
242
+ url = {https://github.com/OriginalKazdov/archscope}
243
+ }
244
+ ```
245
+
246
+ Source papers reimplemented or wrapped:
247
+ - WriteSAE — arXiv:2605.12770
248
+ - Drop the Act / ProFIL — arXiv:2605.11467
249
+ - Targeted Neuron Modulation — arXiv:2605.12290
250
+ - Multi-Agent Sycophancy — arXiv:2605.12991
251
+ - Tuned Lens (Belrose et al, 2023)
252
+ - Induction heads (Olsson et al, 2022)
253
+
254
+ ---
255
+
256
+ ## Troubleshooting
257
+
258
+ ### "The fast path is not available because ..." (Mamba on CPU)
259
+
260
+ Normal. Mamba falls back to a slow pure-PyTorch path that works correctly (~30s per benchmark vs ~1s on CUDA). Install `pip install mamba-ssm causal-conv1d` only on CUDA machines.
261
+
262
+ ### Custom backend not auto-detected
263
+
264
+ Pass `Backend.for_model(model, hint="my_backend")` explicitly. Auto-detection uses `config.model_type`.
265
+
266
+ ### `RuntimeError: Trying to backward through the graph a second time`
267
+
268
+ Activations from `Backend.extract()` carry the autograd graph by default. Call `.detach()` before reusing, or extract inside `torch.no_grad()`. The high-level `probes.fit_probe()` does this for you.
269
+
270
+ ---
271
+
272
+ ## Roadmap (post-0.2.0)
273
+
274
+ - Multi-token circuit detection: IOI, name-mover, successor heads
275
+ - Mamba-2 backend with same `.ssm_state` API
276
+ - Cross-arch SAE feature alignment (extend `transfer.py` from probes to features)
277
+ - Pretrained SAE collection for common small models
278
+ - Plotly/matplotlib viz helpers
279
+ - HuggingFace Space demo
280
+
281
+ PRs welcome — see [`CONTRIBUTING.md`](CONTRIBUTING.md).
282
+
283
+ ---
284
+
285
+ ## License
286
+
287
+ Apache-2.0
@@ -0,0 +1,45 @@
1
+ [project]
2
+ name = "archscope"
3
+ version = "0.2.2"
4
+ description = "Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models"
5
+ readme = "README.md"
6
+ authors = [{name = "Juan Cruz Dovzak"}]
7
+ requires-python = ">=3.10"
8
+ license = {text = "Apache-2.0"}
9
+ keywords = ["mechanistic-interpretability", "sparse-autoencoders", "probes", "RNN", "Mamba", "transformer"]
10
+ classifiers = [
11
+ "Development Status :: 4 - Beta",
12
+ "Intended Audience :: Science/Research",
13
+ "License :: OSI Approved :: Apache Software License",
14
+ "Operating System :: OS Independent",
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.10",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
20
+ ]
21
+ dependencies = [
22
+ "torch>=2.1.0",
23
+ "numpy>=1.26.0",
24
+ "einops>=0.7.0",
25
+ "click>=8.1.0",
26
+ "rich>=13.0.0",
27
+ "transformers>=4.40.0",
28
+ "datasets>=2.19.0",
29
+ "scikit-learn>=1.4.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ jax = ["jax>=0.4.30", "flax>=0.8.4"]
34
+ mamba = ["mamba-ssm>=1.2"]
35
+ dev = ["pytest>=8.0", "ruff>=0.4.0"]
36
+
37
+ [project.scripts]
38
+ archscope = "archscope.cli:cli"
39
+
40
+ [build-system]
41
+ requires = ["setuptools>=68.0"]
42
+ build-backend = "setuptools.build_meta"
43
+
44
+ [tool.setuptools.packages.find]
45
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+