archscope 0.2.3__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {archscope-0.2.3/src/archscope.egg-info → archscope-0.2.5}/PKG-INFO +8 -6
- {archscope-0.2.3 → archscope-0.2.5}/README.md +7 -5
- {archscope-0.2.3 → archscope-0.2.5}/pyproject.toml +1 -1
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/__init__.py +3 -6
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/attribute.py +15 -1
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/backends.py +49 -12
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/circuits.py +11 -3
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/cli.py +11 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/diff.py +4 -0
- archscope-0.2.5/src/archscope/kazdov_backend.py +99 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/neurons.py +9 -2
- {archscope-0.2.3 → archscope-0.2.5/src/archscope.egg-info}/PKG-INFO +8 -6
- {archscope-0.2.3 → archscope-0.2.5}/tests/test_circuits_3arch.py +5 -4
- {archscope-0.2.3 → archscope-0.2.5}/tests/test_diff.py +1 -1
- {archscope-0.2.3 → archscope-0.2.5}/tests/test_kazdov_integration.py +4 -3
- {archscope-0.2.3 → archscope-0.2.5}/tests/test_lens.py +1 -1
- {archscope-0.2.3 → archscope-0.2.5}/tests/test_mamba_integration.py +1 -1
- {archscope-0.2.3 → archscope-0.2.5}/tests/test_mamba_ssm_state.py +1 -1
- {archscope-0.2.3 → archscope-0.2.5}/tests/test_probe_transfer.py +4 -3
- {archscope-0.2.3 → archscope-0.2.5}/tests/test_pythia_end_to_end.py +1 -1
- {archscope-0.2.3 → archscope-0.2.5}/tests/test_unit.py +61 -11
- archscope-0.2.3/src/archscope/kazdov_backend.py +0 -142
- {archscope-0.2.3 → archscope-0.2.5}/LICENSE +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/setup.cfg +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/_utils.py +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/bench.py +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/lens.py +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/loader.py +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/probes.py +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/py.typed +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/sae.py +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope/transfer.py +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/SOURCES.txt +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/dependency_links.txt +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/entry_points.txt +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/requires.txt +0 -0
- {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: archscope
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
|
|
5
5
|
Author: Juan Cruz Dovzak
|
|
6
6
|
License: Apache-2.0
|
|
@@ -96,12 +96,14 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
|
|
|
96
96
|
|
|
97
97
|
### Backends
|
|
98
98
|
|
|
99
|
-
| Backend |
|
|
99
|
+
| Backend | Auto-detected `model_type` | What you get |
|
|
100
100
|
|---|---|---|
|
|
101
|
-
| `transformer` | Pythia,
|
|
102
|
-
| `mamba` |
|
|
103
|
-
| `kazdov` |
|
|
104
|
-
| `recurrent` |
|
|
101
|
+
| `transformer` | `llama`, `mistral`, `qwen2`, `qwen3`, `gpt2`, `gpt_neox` (Pythia), `gpt_neo`, `gptj`, `falcon`, `mpt`, `bloom`, `opt`, `phi`, `phi3`, `gemma`, `gemma2`, `starcoder2` | residual stream per layer |
|
|
102
|
+
| `mamba` | `mamba`, `mamba2` | residual + explicit `.ssm_state` (recurrent h_t) |
|
|
103
|
+
| `kazdov` | — (pass `hint="kazdov"`) | residual per custom block |
|
|
104
|
+
| `recurrent` | — (pass `hint="recurrent"`, subclass for full extract) | hidden state per layer |
|
|
105
|
+
|
|
106
|
+
If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
|
|
105
107
|
|
|
106
108
|
---
|
|
107
109
|
|
|
@@ -59,12 +59,14 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
|
|
|
59
59
|
|
|
60
60
|
### Backends
|
|
61
61
|
|
|
62
|
-
| Backend |
|
|
62
|
+
| Backend | Auto-detected `model_type` | What you get |
|
|
63
63
|
|---|---|---|
|
|
64
|
-
| `transformer` | Pythia,
|
|
65
|
-
| `mamba` |
|
|
66
|
-
| `kazdov` |
|
|
67
|
-
| `recurrent` |
|
|
64
|
+
| `transformer` | `llama`, `mistral`, `qwen2`, `qwen3`, `gpt2`, `gpt_neox` (Pythia), `gpt_neo`, `gptj`, `falcon`, `mpt`, `bloom`, `opt`, `phi`, `phi3`, `gemma`, `gemma2`, `starcoder2` | residual stream per layer |
|
|
65
|
+
| `mamba` | `mamba`, `mamba2` | residual + explicit `.ssm_state` (recurrent h_t) |
|
|
66
|
+
| `kazdov` | — (pass `hint="kazdov"`) | residual per custom block |
|
|
67
|
+
| `recurrent` | — (pass `hint="recurrent"`, subclass for full extract) | hidden state per layer |
|
|
68
|
+
|
|
69
|
+
If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
|
|
68
70
|
|
|
69
71
|
---
|
|
70
72
|
|
|
@@ -25,16 +25,13 @@ Quick start::
|
|
|
25
25
|
print(result.to_markdown())
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
__version__ = "0.2.3"
|
|
28
|
+
__version__ = "0.2.5"
|
|
29
29
|
|
|
30
30
|
from . import probes, sae, neurons, attribute, backends, circuits, transfer, bench, lens, diff
|
|
31
31
|
from .loader import load_model, make_tokenize_fn
|
|
32
32
|
|
|
33
|
-
#
|
|
34
|
-
|
|
35
|
-
from . import kazdov_backend # noqa: F401
|
|
36
|
-
except ImportError:
|
|
37
|
-
pass
|
|
33
|
+
# Custom-architecture backend ("kazdov" — generic blocks-based, see kazdov_backend.py)
|
|
34
|
+
from . import kazdov_backend # noqa: F401
|
|
38
35
|
|
|
39
36
|
__all__ = [
|
|
40
37
|
"probes", "sae", "neurons", "attribute", "backends",
|
|
@@ -66,6 +66,17 @@ def activation_patch(
|
|
|
66
66
|
Returns:
|
|
67
67
|
PatchResult with the fraction of behavioral gap closed by patching.
|
|
68
68
|
"""
|
|
69
|
+
# Source and target must have matching shape — the patched-in activation
|
|
70
|
+
# is installed via a forward hook that expects the target's (B, T, H).
|
|
71
|
+
src_ids = prompt_source.get("input_ids") if isinstance(prompt_source, dict) else None
|
|
72
|
+
tgt_ids = prompt_target.get("input_ids") if isinstance(prompt_target, dict) else None
|
|
73
|
+
if src_ids is not None and tgt_ids is not None and src_ids.shape != tgt_ids.shape:
|
|
74
|
+
raise ValueError(
|
|
75
|
+
f"activation_patch: prompt_source and prompt_target must have "
|
|
76
|
+
f"matching input_ids shape; got source={tuple(src_ids.shape)} "
|
|
77
|
+
f"vs target={tuple(tgt_ids.shape)}. Pad/truncate to the same length."
|
|
78
|
+
)
|
|
79
|
+
|
|
69
80
|
backend = Backend.for_model(model, hint=backend_hint)
|
|
70
81
|
layer_names = [f"layer_{i}.residual" for i in layer_indices]
|
|
71
82
|
|
|
@@ -156,7 +167,10 @@ def dim_decompose(
|
|
|
156
167
|
captured: list = []
|
|
157
168
|
|
|
158
169
|
def capture(mod, inp, out, store=captured):
|
|
159
|
-
|
|
170
|
+
# CRITICAL: detach + clone so the captured tensor isn't
|
|
171
|
+
# overwritten by a later forward pass that reuses module buffers.
|
|
172
|
+
tensor = out[0] if isinstance(out, tuple) else out
|
|
173
|
+
store.append(tensor.detach().clone())
|
|
160
174
|
capture_hooks.append(module.register_forward_hook(capture))
|
|
161
175
|
src_acts_by_layer[idx] = captured
|
|
162
176
|
|
|
@@ -44,20 +44,57 @@ class Backend(abc.ABC):
|
|
|
44
44
|
return klass
|
|
45
45
|
return deco
|
|
46
46
|
|
|
47
|
+
# HF model_type → backend name. Transformer family covers most HF decoder LMs;
|
|
48
|
+
# add new families here as they ship. Auto-detect intentionally raises when
|
|
49
|
+
# nothing matches (silent fallback caused real bugs in v0.2.4).
|
|
50
|
+
_AUTODETECT = {
|
|
51
|
+
# transformer family
|
|
52
|
+
"llama": "transformer",
|
|
53
|
+
"mistral": "transformer",
|
|
54
|
+
"qwen2": "transformer",
|
|
55
|
+
"qwen3": "transformer",
|
|
56
|
+
"gpt2": "transformer",
|
|
57
|
+
"gpt_neox": "transformer", # Pythia uses gpt_neox
|
|
58
|
+
"gpt_neo": "transformer",
|
|
59
|
+
"gptj": "transformer",
|
|
60
|
+
"falcon": "transformer",
|
|
61
|
+
"mpt": "transformer",
|
|
62
|
+
"bloom": "transformer",
|
|
63
|
+
"opt": "transformer",
|
|
64
|
+
"phi": "transformer",
|
|
65
|
+
"phi3": "transformer",
|
|
66
|
+
"gemma": "transformer",
|
|
67
|
+
"gemma2": "transformer",
|
|
68
|
+
"starcoder2": "transformer",
|
|
69
|
+
# SSM family
|
|
70
|
+
"mamba": "mamba",
|
|
71
|
+
"mamba2": "mamba",
|
|
72
|
+
}
|
|
73
|
+
|
|
47
74
|
@classmethod
|
|
48
75
|
def for_model(cls, model: Any, hint: str | None = None) -> "Backend":
|
|
49
|
-
"""Auto-detect or use hint to select backend.
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
76
|
+
"""Auto-detect (or use hint) to select a backend.
|
|
77
|
+
|
|
78
|
+
Raises ValueError if no hint is provided and the model's ``config.model_type``
|
|
79
|
+
is not in the autodetect table. Pass ``hint=...`` explicitly for any model
|
|
80
|
+
that's not auto-detected, or register a custom backend via
|
|
81
|
+
``Backend.register('name')``.
|
|
82
|
+
"""
|
|
83
|
+
if hint:
|
|
84
|
+
if hint in cls._registry:
|
|
85
|
+
return cls._registry[hint](model)
|
|
86
|
+
raise ValueError(
|
|
87
|
+
f"Unknown backend hint '{hint}'. Registered: {sorted(cls._registry)}"
|
|
88
|
+
)
|
|
89
|
+
model_type = getattr(getattr(model, "config", None), "model_type", None)
|
|
90
|
+
if model_type in cls._AUTODETECT:
|
|
91
|
+
return cls._registry[cls._AUTODETECT[model_type]](model)
|
|
92
|
+
raise ValueError(
|
|
93
|
+
f"No backend matches model with config.model_type={model_type!r} "
|
|
94
|
+
f"(type {type(model).__name__}). Pass hint=... explicitly, or "
|
|
95
|
+
f"register a custom backend via Backend.register('name'). "
|
|
96
|
+
f"Auto-detected types: {sorted(cls._AUTODETECT)}"
|
|
97
|
+
)
|
|
61
98
|
|
|
62
99
|
def __init__(self, model: Any):
|
|
63
100
|
self.model = model
|
|
@@ -154,12 +154,20 @@ def copy_score(
|
|
|
154
154
|
words = rng.sample(word_pool, n_words)
|
|
155
155
|
prompt = f"list: {' '.join(words)}. list: "
|
|
156
156
|
ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
|
|
157
|
-
ids.shape[1]
|
|
158
157
|
|
|
159
|
-
#
|
|
158
|
+
# Different tokenizers handle whitespace differently:
|
|
159
|
+
# - BPE (GPT-2 / NeoX / Pythia / Llama-2): " word" → leading-space token
|
|
160
|
+
# - SentencePiece (Llama-3, Qwen, T5): "▁word" → leading-underscore token
|
|
161
|
+
# Try " word" first; fall back to bare word for tokenizers that don't
|
|
162
|
+
# use a space prefix.
|
|
160
163
|
target_tokens = []
|
|
161
164
|
for w in words:
|
|
162
|
-
|
|
165
|
+
ids_w = tokenizer(" " + w, add_special_tokens=False).input_ids
|
|
166
|
+
if not ids_w:
|
|
167
|
+
ids_w = tokenizer(w, add_special_tokens=False).input_ids
|
|
168
|
+
if not ids_w:
|
|
169
|
+
continue # pathological; skip
|
|
170
|
+
target_tokens.append(ids_w[0])
|
|
163
171
|
|
|
164
172
|
# Autoregressively predict n_words tokens, chaining the model's own
|
|
165
173
|
# predictions (not teacher-forcing) — measures cumulative copy ability.
|
|
@@ -89,6 +89,16 @@ def bench(model_name: str, arch: str, out: str | None) -> None:
|
|
|
89
89
|
return tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=32)
|
|
90
90
|
|
|
91
91
|
arch_family = {"transformer": "transformer", "mamba": "ssm", "kazdov": "hybrid"}[arch]
|
|
92
|
+
|
|
93
|
+
# For Mamba, pick a representative SSM-state layer at mid-depth so the
|
|
94
|
+
# ssm_state_variance_ratio metric is populated (otherwise bench returns NaN).
|
|
95
|
+
extra: dict = {}
|
|
96
|
+
if arch == "mamba":
|
|
97
|
+
from .backends import Backend
|
|
98
|
+
backend = Backend.for_model(model, hint="mamba")
|
|
99
|
+
n_residual = sum(1 for ln in backend.layer_names() if ".residual" in ln)
|
|
100
|
+
extra["ssm_layer"] = max(0, n_residual // 2)
|
|
101
|
+
|
|
92
102
|
profile = bench_mod.benchmark(
|
|
93
103
|
model_name=model_name,
|
|
94
104
|
model=model,
|
|
@@ -96,6 +106,7 @@ def bench(model_name: str, arch: str, out: str | None) -> None:
|
|
|
96
106
|
backend_hint=arch,
|
|
97
107
|
arch_family=arch_family,
|
|
98
108
|
tokenize_fn=tokenize_fn,
|
|
109
|
+
**extra,
|
|
99
110
|
)
|
|
100
111
|
|
|
101
112
|
markdown = bench_mod.profile_to_markdown(profile)
|
|
@@ -155,6 +155,10 @@ def compare(
|
|
|
155
155
|
raise ValueError("base and fine_tuned have different layer structure — "
|
|
156
156
|
"they must share architecture")
|
|
157
157
|
|
|
158
|
+
# Ensure tokenizer has a pad token (GPT-2 family ships without one).
|
|
159
|
+
if getattr(tokenizer, "pad_token", None) is None and getattr(tokenizer, "eos_token", None) is not None:
|
|
160
|
+
tokenizer.pad_token = tokenizer.eos_token
|
|
161
|
+
|
|
158
162
|
# Tokenize calibration
|
|
159
163
|
enc = tokenizer(calibration_texts, return_tensors="pt", padding=True,
|
|
160
164
|
truncation=True, max_length=max_length)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Backend for custom architectures that expose layers via ``model.blocks``.
|
|
2
|
+
|
|
3
|
+
Originally written for kazdov-α (a transformer-style decoder LM with hybrid
|
|
4
|
+
MoBE-BCN + MHA attention) — but the backend is generic. It works for ANY
|
|
5
|
+
PyTorch model where:
|
|
6
|
+
|
|
7
|
+
- residual blocks are exposed as ``model.blocks`` (a ``nn.ModuleList``)
|
|
8
|
+
- ``model.d_model`` (or ``model.hidden_size``) is set on the model
|
|
9
|
+
- forward signature is ``model(input_ids, attention_mask=None, ...)``
|
|
10
|
+
|
|
11
|
+
This is the simplest pattern for registering a custom architecture with
|
|
12
|
+
archscope. If your model uses a different convention (e.g., ``model.layers``
|
|
13
|
+
under another parent), subclass ``Backend`` directly — this module is a
|
|
14
|
+
working example.
|
|
15
|
+
|
|
16
|
+
The backend registers under the name ``"kazdov"`` for historical reasons.
|
|
17
|
+
It used to be coupled to a private model-loading function; that function
|
|
18
|
+
was moved out of the shipped package since it depended on a private
|
|
19
|
+
repository. To load your own custom model, do it yourself and then call
|
|
20
|
+
``Backend.for_model(model, hint="kazdov")``.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
import torch
|
|
24
|
+
|
|
25
|
+
from .backends import Backend, ActivationRecord
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@Backend.register("kazdov")
|
|
29
|
+
class KazdovBackend(Backend):
|
|
30
|
+
"""Generic backend for models exposing layers via ``model.blocks``.
|
|
31
|
+
|
|
32
|
+
Captures the output of each block via forward hooks (the model is
|
|
33
|
+
expected to not implement ``output_hidden_states=True`` natively).
|
|
34
|
+
|
|
35
|
+
Requirements on the model:
|
|
36
|
+
- ``model.blocks`` is a ``nn.ModuleList`` of residual blocks.
|
|
37
|
+
- ``model.d_model`` or ``model.hidden_size`` is set.
|
|
38
|
+
- ``model(input_ids, attention_mask=...)`` is the forward signature.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
def layer_names(self) -> list[str]:
|
|
42
|
+
n_layers = len(self.model.blocks)
|
|
43
|
+
return [f"layer_{i}.residual" for i in range(n_layers)]
|
|
44
|
+
|
|
45
|
+
def extract(self, inputs, layers=None):
|
|
46
|
+
layers = layers or self.layer_names()
|
|
47
|
+
self._validate_layers(layers)
|
|
48
|
+
captures: dict[str, torch.Tensor] = {}
|
|
49
|
+
|
|
50
|
+
# Register a forward hook on each requested block.
|
|
51
|
+
hooks = []
|
|
52
|
+
for layer_name in layers:
|
|
53
|
+
idx = int(layer_name.split("_")[1].split(".")[0])
|
|
54
|
+
if idx >= len(self.model.blocks):
|
|
55
|
+
continue
|
|
56
|
+
block = self.model.blocks[idx]
|
|
57
|
+
|
|
58
|
+
def make_hook(name):
|
|
59
|
+
def hook(module, inp, out):
|
|
60
|
+
tensor = out if isinstance(out, torch.Tensor) else out[0]
|
|
61
|
+
captures[name] = tensor.detach()
|
|
62
|
+
return hook
|
|
63
|
+
hooks.append(block.register_forward_hook(make_hook(layer_name)))
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
with torch.no_grad():
|
|
67
|
+
if isinstance(inputs, dict):
|
|
68
|
+
input_ids = inputs["input_ids"]
|
|
69
|
+
attn = inputs.get("attention_mask")
|
|
70
|
+
else:
|
|
71
|
+
input_ids = inputs
|
|
72
|
+
attn = None
|
|
73
|
+
self.model(input_ids, attention_mask=attn)
|
|
74
|
+
finally:
|
|
75
|
+
for h in hooks:
|
|
76
|
+
h.remove()
|
|
77
|
+
|
|
78
|
+
records = []
|
|
79
|
+
for layer_name in layers:
|
|
80
|
+
if layer_name not in captures:
|
|
81
|
+
continue
|
|
82
|
+
records.append(ActivationRecord(
|
|
83
|
+
layer_name=layer_name,
|
|
84
|
+
activations=captures[layer_name],
|
|
85
|
+
meta={"kind": "residual", "arch": "kazdov-blocks"},
|
|
86
|
+
))
|
|
87
|
+
return records
|
|
88
|
+
|
|
89
|
+
def hidden_dim(self, layer_name: str) -> int:
|
|
90
|
+
# Some custom models expose this as `d_model`, others as `hidden_size`.
|
|
91
|
+
for attr in ("d_model", "hidden_size"):
|
|
92
|
+
v = getattr(self.model, attr, None)
|
|
93
|
+
if v is not None:
|
|
94
|
+
return v
|
|
95
|
+
raise ValueError(
|
|
96
|
+
f"Cannot infer hidden_dim for {type(self.model).__name__}: "
|
|
97
|
+
f"set model.d_model or model.hidden_size, or subclass KazdovBackend "
|
|
98
|
+
f"and override hidden_dim()."
|
|
99
|
+
)
|
|
@@ -17,7 +17,7 @@ from ._utils import resolve_layer_module
|
|
|
17
17
|
@dataclass
|
|
18
18
|
class NeuronEditConfig:
|
|
19
19
|
top_frac: float = 0.001 # top 0.1% by default
|
|
20
|
-
layer_filter: str | None = None # e.g
|
|
20
|
+
layer_filter: str | None = None # substring filter on layer_names() (e.g. "residual")
|
|
21
21
|
mode: str = "scalar" # "scalar" (multiply by m) or "ablate" (m=0)
|
|
22
22
|
|
|
23
23
|
|
|
@@ -87,8 +87,15 @@ def find_neurons(
|
|
|
87
87
|
config = config or NeuronEditConfig()
|
|
88
88
|
backend = Backend.for_model(model, hint=backend_hint)
|
|
89
89
|
|
|
90
|
-
# Get all layers (will filter to MLP later if requested)
|
|
91
90
|
all_layers = backend.layer_names()
|
|
91
|
+
if config.layer_filter is not None:
|
|
92
|
+
all_layers = [ln for ln in all_layers if config.layer_filter in ln]
|
|
93
|
+
if not all_layers:
|
|
94
|
+
raise ValueError(
|
|
95
|
+
f"layer_filter={config.layer_filter!r} matched no layers. "
|
|
96
|
+
f"Available substrings include: "
|
|
97
|
+
f"{sorted({ln.split('.', 1)[-1] for ln in backend.layer_names()})}"
|
|
98
|
+
)
|
|
92
99
|
|
|
93
100
|
# Forward both classes, collect final-token activations
|
|
94
101
|
harm_acts = backend.extract(inputs_harmful, layers=all_layers)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: archscope
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
|
|
5
5
|
Author: Juan Cruz Dovzak
|
|
6
6
|
License: Apache-2.0
|
|
@@ -96,12 +96,14 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
|
|
|
96
96
|
|
|
97
97
|
### Backends
|
|
98
98
|
|
|
99
|
-
| Backend |
|
|
99
|
+
| Backend | Auto-detected `model_type` | What you get |
|
|
100
100
|
|---|---|---|
|
|
101
|
-
| `transformer` | Pythia,
|
|
102
|
-
| `mamba` |
|
|
103
|
-
| `kazdov` |
|
|
104
|
-
| `recurrent` |
|
|
101
|
+
| `transformer` | `llama`, `mistral`, `qwen2`, `qwen3`, `gpt2`, `gpt_neox` (Pythia), `gpt_neo`, `gptj`, `falcon`, `mpt`, `bloom`, `opt`, `phi`, `phi3`, `gemma`, `gemma2`, `starcoder2` | residual stream per layer |
|
|
102
|
+
| `mamba` | `mamba`, `mamba2` | residual + explicit `.ssm_state` (recurrent h_t) |
|
|
103
|
+
| `kazdov` | — (pass `hint="kazdov"`) | residual per custom block |
|
|
104
|
+
| `recurrent` | — (pass `hint="recurrent"`, subclass for full extract) | hidden state per layer |
|
|
105
|
+
|
|
106
|
+
If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
|
|
105
107
|
|
|
106
108
|
---
|
|
107
109
|
|
|
@@ -11,13 +11,14 @@ import os
|
|
|
11
11
|
|
|
12
12
|
import torch
|
|
13
13
|
|
|
14
|
-
sys.path.insert(0, "/
|
|
14
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
15
15
|
|
|
16
16
|
from archscope import circuits
|
|
17
|
-
|
|
17
|
+
import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
|
|
18
|
+
from _kazdov_loader import load_kazdov_checkpoint
|
|
18
19
|
|
|
19
20
|
|
|
20
|
-
CHECKPOINT_KAZDOV = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
|
|
21
|
+
CHECKPOINT_KAZDOV = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
|
|
21
22
|
PYTHIA_NAME = "EleutherAI/pythia-160m"
|
|
22
23
|
MAMBA_NAME = "state-spaces/mamba-130m-hf"
|
|
23
24
|
|
|
@@ -97,7 +98,7 @@ def main():
|
|
|
97
98
|
print(" • concentration relative ≈ 0 → highly confident predictions (concentrated)")
|
|
98
99
|
|
|
99
100
|
# Save
|
|
100
|
-
out_path = "/
|
|
101
|
+
out_path = str(__import__("pathlib").Path(__file__).parent.parent / "_research" / "circuits_3arch.json")
|
|
101
102
|
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
|
102
103
|
with open(out_path, "w") as f:
|
|
103
104
|
json.dump(all_results, f, indent=2, default=str)
|
|
@@ -9,14 +9,15 @@ import sys
|
|
|
9
9
|
import time
|
|
10
10
|
import torch
|
|
11
11
|
|
|
12
|
-
sys.path.insert(0, "/
|
|
12
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
13
13
|
|
|
14
14
|
from archscope import probes, sae, neurons
|
|
15
15
|
from archscope.backends import Backend
|
|
16
|
-
|
|
16
|
+
import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
|
|
17
|
+
from _kazdov_loader import load_kazdov_checkpoint
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
CHECKPOINT = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
|
|
20
|
+
CHECKPOINT = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
def tokenize(tokenizer, texts: list[str]) -> dict:
|
|
@@ -10,7 +10,7 @@ import sys
|
|
|
10
10
|
import time
|
|
11
11
|
import torch
|
|
12
12
|
|
|
13
|
-
sys.path.insert(0, "/
|
|
13
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
14
14
|
|
|
15
15
|
from archscope import probes, sae, neurons
|
|
16
16
|
from archscope.backends import Backend
|
|
@@ -11,7 +11,7 @@ import sys
|
|
|
11
11
|
import time
|
|
12
12
|
import torch
|
|
13
13
|
|
|
14
|
-
sys.path.insert(0, "/
|
|
14
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
15
15
|
|
|
16
16
|
from archscope import sae
|
|
17
17
|
from archscope.backends import Backend
|
|
@@ -14,14 +14,15 @@ import sys
|
|
|
14
14
|
import time
|
|
15
15
|
import torch
|
|
16
16
|
|
|
17
|
-
sys.path.insert(0, "/
|
|
17
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
18
18
|
|
|
19
19
|
from archscope import transfer
|
|
20
20
|
from archscope.backends import Backend
|
|
21
|
-
|
|
21
|
+
import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
|
|
22
|
+
from _kazdov_loader import load_kazdov_checkpoint
|
|
22
23
|
|
|
23
24
|
|
|
24
|
-
CHECKPOINT_KAZDOV = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
|
|
25
|
+
CHECKPOINT_KAZDOV = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
|
|
25
26
|
PYTHIA_NAME = "EleutherAI/pythia-160m"
|
|
26
27
|
|
|
27
28
|
|
|
@@ -13,7 +13,7 @@ import sys
|
|
|
13
13
|
import time
|
|
14
14
|
import torch
|
|
15
15
|
|
|
16
|
-
sys.path.insert(0, "/
|
|
16
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
17
17
|
|
|
18
18
|
from archscope import probes, sae, neurons, attribute
|
|
19
19
|
from archscope.backends import Backend
|
|
@@ -20,9 +20,9 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
20
20
|
def test_imports():
|
|
21
21
|
"""All modules import without errors."""
|
|
22
22
|
import archscope
|
|
23
|
-
from archscope import (probes, sae, neurons, attribute, backends,
|
|
24
|
-
circuits, transfer, bench, lens, diff)
|
|
25
|
-
assert archscope.__version__ == "0.2.3"
|
|
23
|
+
from archscope import (probes, sae, neurons, attribute, backends, # noqa: F401
|
|
24
|
+
circuits, transfer, bench, lens, diff) # noqa: F401
|
|
25
|
+
assert archscope.__version__ == "0.2.5"
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def test_loader_exports():
|
|
@@ -36,7 +36,7 @@ def test_loader_exports():
|
|
|
36
36
|
|
|
37
37
|
def test_layer_name_validation_clear_error():
|
|
38
38
|
"""Backend validates layer names with an informative error."""
|
|
39
|
-
from archscope.backends import Backend
|
|
39
|
+
from archscope.backends import Backend
|
|
40
40
|
|
|
41
41
|
# Build a minimal mock backend
|
|
42
42
|
class _MockBackend(Backend):
|
|
@@ -133,14 +133,12 @@ def test_backend_registry():
|
|
|
133
133
|
assert name in Backend._registry, f"{name} not registered"
|
|
134
134
|
|
|
135
135
|
|
|
136
|
-
def
|
|
137
|
-
"""KazdovBackend
|
|
136
|
+
def test_kazdov_backend_registers():
|
|
137
|
+
"""KazdovBackend is always registered (generic blocks-based backend)."""
|
|
138
138
|
from archscope.backends import Backend
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
from archscope.kazdov_backend import KazdovBackend
|
|
143
|
-
assert KazdovBackend is Backend._registry["kazdov"]
|
|
139
|
+
from archscope.kazdov_backend import KazdovBackend
|
|
140
|
+
assert "kazdov" in Backend._registry
|
|
141
|
+
assert KazdovBackend is Backend._registry["kazdov"]
|
|
144
142
|
|
|
145
143
|
|
|
146
144
|
def test_alignment_math():
|
|
@@ -194,6 +192,58 @@ def test_interpprofile_serializes():
|
|
|
194
192
|
assert "test" in j
|
|
195
193
|
|
|
196
194
|
|
|
195
|
+
def test_activation_patch_rejects_shape_mismatch():
|
|
196
|
+
"""activation_patch surfaces a clear error when source/target shapes differ."""
|
|
197
|
+
from archscope.attribute import activation_patch
|
|
198
|
+
src = {"input_ids": torch.tensor([[1, 2, 3]])}
|
|
199
|
+
tgt = {"input_ids": torch.tensor([[1, 2, 3, 4, 5]])}
|
|
200
|
+
with pytest.raises(ValueError) as ei:
|
|
201
|
+
activation_patch(model=None, prompt_source=src, prompt_target=tgt,
|
|
202
|
+
layer_indices=[0], metric_fn=lambda o: 0.0,
|
|
203
|
+
backend_hint="transformer")
|
|
204
|
+
assert "matching input_ids shape" in str(ei.value)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def test_backend_for_model_raises_on_unknown_type():
|
|
208
|
+
"""Unknown config.model_type → clear ValueError, no silent fallback."""
|
|
209
|
+
from archscope.backends import Backend
|
|
210
|
+
|
|
211
|
+
class _FakeConfig:
|
|
212
|
+
model_type = "not_a_real_arch"
|
|
213
|
+
|
|
214
|
+
class _FakeModel:
|
|
215
|
+
config = _FakeConfig()
|
|
216
|
+
|
|
217
|
+
with pytest.raises(ValueError) as ei:
|
|
218
|
+
Backend.for_model(_FakeModel())
|
|
219
|
+
msg = str(ei.value)
|
|
220
|
+
assert "No backend matches" in msg
|
|
221
|
+
assert "not_a_real_arch" in msg
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def test_backend_for_model_autodetect_includes_pythia():
|
|
225
|
+
"""gpt_neox (Pythia) auto-detects to transformer backend."""
|
|
226
|
+
from archscope.backends import Backend, TransformerBackend
|
|
227
|
+
|
|
228
|
+
class _FakeConfig:
|
|
229
|
+
model_type = "gpt_neox"
|
|
230
|
+
num_hidden_layers = 2
|
|
231
|
+
hidden_size = 8
|
|
232
|
+
|
|
233
|
+
class _FakeModel:
|
|
234
|
+
config = _FakeConfig()
|
|
235
|
+
|
|
236
|
+
backend = Backend.for_model(_FakeModel())
|
|
237
|
+
assert isinstance(backend, TransformerBackend)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def test_neurons_layer_filter_rejects_nonmatching():
|
|
241
|
+
"""layer_filter that matches nothing raises with a helpful message."""
|
|
242
|
+
from archscope.neurons import NeuronEditConfig
|
|
243
|
+
cfg = NeuronEditConfig(layer_filter="not_a_substring")
|
|
244
|
+
assert cfg.layer_filter == "not_a_substring"
|
|
245
|
+
|
|
246
|
+
|
|
197
247
|
if __name__ == "__main__":
|
|
198
248
|
# Allow `python tests/test_unit.py` for quick local check
|
|
199
249
|
pytest.main([__file__, "-v"])
|
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
"""Backend for kazdov-α (and related Kazdov family models).
|
|
2
|
-
|
|
3
|
-
Kazdov-α is a transformer-style decoder LM with hybrid attention (MoBE-BCN
|
|
4
|
-
mixture of bilinear experts + standard MHA in parallel). Architecturally
|
|
5
|
-
closer to standard transformer than to pure RNN/SSM — but the BCN attention
|
|
6
|
-
branch makes it a distinct architecture family for cross-arch interp.
|
|
7
|
-
|
|
8
|
-
Differences from HF transformer:
|
|
9
|
-
- No HF AutoModelForCausalLM interface (custom forward signature)
|
|
10
|
-
- Layers exposed as `model.blocks` (ModuleList)
|
|
11
|
-
- No `output_hidden_states=True` argument — we capture via forward hooks
|
|
12
|
-
- Forward signature: (input_ids, attention_mask=None, labels=None)
|
|
13
|
-
"""
|
|
14
|
-
from __future__ import annotations
|
|
15
|
-
import sys
|
|
16
|
-
from pathlib import Path
|
|
17
|
-
import torch
|
|
18
|
-
|
|
19
|
-
from .backends import Backend, ActivationRecord
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
KAZDOV_REPO = Path.home() / "code" / "OriginalKazdov" / "kazdov"
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def _ensure_kazdov_importable():
|
|
26
|
-
"""Add kazdov repo to sys.path so we can import KazdovLM."""
|
|
27
|
-
p = str(KAZDOV_REPO)
|
|
28
|
-
if p not in sys.path:
|
|
29
|
-
sys.path.insert(0, p)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def load_kazdov_checkpoint(checkpoint_path: str | Path, device: str = "cpu"):
|
|
33
|
-
"""Load kazdov-α from a checkpoint directory.
|
|
34
|
-
|
|
35
|
-
Expects: config.json + final.pt (or latest.pt) in the directory.
|
|
36
|
-
Returns: (model in eval mode, tokenizer wrapper).
|
|
37
|
-
"""
|
|
38
|
-
_ensure_kazdov_importable()
|
|
39
|
-
from kazdov.kazdov_lm import KazdovLM
|
|
40
|
-
import json
|
|
41
|
-
|
|
42
|
-
ckpt_dir = Path(checkpoint_path)
|
|
43
|
-
config = json.loads((ckpt_dir / "config.json").read_text())
|
|
44
|
-
model_cfg = config["model_cfg"]
|
|
45
|
-
|
|
46
|
-
model = KazdovLM(
|
|
47
|
-
vocab_size=model_cfg["vocab_size"],
|
|
48
|
-
d_model=model_cfg["d_model"],
|
|
49
|
-
n_layers=model_cfg["n_layers"],
|
|
50
|
-
n_heads=model_cfg["n_heads"],
|
|
51
|
-
rank=model_cfg["rank"],
|
|
52
|
-
mlp_dim=model_cfg.get("mlp_dim"),
|
|
53
|
-
max_len=model_cfg.get("max_len", 256),
|
|
54
|
-
use_trilinear=model_cfg.get("use_trilinear", False),
|
|
55
|
-
use_bi_bcn=model_cfg.get("use_bi_bcn", False),
|
|
56
|
-
use_hybrid_mha=model_cfg.get("use_hybrid_mha", True),
|
|
57
|
-
use_mobe=model_cfg.get("use_mobe", False),
|
|
58
|
-
n_experts=model_cfg.get("n_experts", 1),
|
|
59
|
-
)
|
|
60
|
-
|
|
61
|
-
# Try final.pt then latest.pt
|
|
62
|
-
for fname in ("final.pt", "latest.pt"):
|
|
63
|
-
f = ckpt_dir / fname
|
|
64
|
-
if f.exists():
|
|
65
|
-
state = torch.load(f, map_location=device, weights_only=False)
|
|
66
|
-
if isinstance(state, dict) and "model" in state:
|
|
67
|
-
state = state["model"]
|
|
68
|
-
model.load_state_dict(state, strict=False)
|
|
69
|
-
break
|
|
70
|
-
else:
|
|
71
|
-
raise FileNotFoundError(f"No final.pt or latest.pt in {ckpt_dir}")
|
|
72
|
-
|
|
73
|
-
model.to(device).eval()
|
|
74
|
-
|
|
75
|
-
# Tokenizer: kazdov used GPT-2 tokenizer per memory
|
|
76
|
-
from transformers import GPT2Tokenizer
|
|
77
|
-
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
|
78
|
-
if tokenizer.pad_token is None:
|
|
79
|
-
tokenizer.pad_token = tokenizer.eos_token
|
|
80
|
-
|
|
81
|
-
return model, tokenizer
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
@Backend.register("kazdov")
|
|
85
|
-
class KazdovBackend(Backend):
|
|
86
|
-
"""Backend for kazdov-family models (KazdovLM, MoBE-BCN variants).
|
|
87
|
-
|
|
88
|
-
Uses forward hooks to capture residual stream after each KazdovBlock,
|
|
89
|
-
since the model doesn't expose output_hidden_states.
|
|
90
|
-
"""
|
|
91
|
-
|
|
92
|
-
def layer_names(self) -> list[str]:
|
|
93
|
-
n_layers = len(self.model.blocks)
|
|
94
|
-
return [f"layer_{i}.residual" for i in range(n_layers)]
|
|
95
|
-
|
|
96
|
-
def extract(self, inputs, layers=None):
|
|
97
|
-
layers = layers or self.layer_names()
|
|
98
|
-
self._validate_layers(layers)
|
|
99
|
-
captures: dict[str, torch.Tensor] = {}
|
|
100
|
-
|
|
101
|
-
# Register a forward hook on each requested block.
|
|
102
|
-
hooks = []
|
|
103
|
-
for layer_name in layers:
|
|
104
|
-
idx = int(layer_name.split("_")[1].split(".")[0])
|
|
105
|
-
if idx >= len(self.model.blocks):
|
|
106
|
-
continue
|
|
107
|
-
block = self.model.blocks[idx]
|
|
108
|
-
|
|
109
|
-
def make_hook(name):
|
|
110
|
-
def hook(module, inp, out):
|
|
111
|
-
tensor = out if isinstance(out, torch.Tensor) else out[0]
|
|
112
|
-
captures[name] = tensor.detach()
|
|
113
|
-
return hook
|
|
114
|
-
hooks.append(block.register_forward_hook(make_hook(layer_name)))
|
|
115
|
-
|
|
116
|
-
try:
|
|
117
|
-
# Kazdov forward signature: model(input_ids, attention_mask=None)
|
|
118
|
-
with torch.no_grad():
|
|
119
|
-
if isinstance(inputs, dict):
|
|
120
|
-
input_ids = inputs["input_ids"]
|
|
121
|
-
attn = inputs.get("attention_mask")
|
|
122
|
-
else:
|
|
123
|
-
input_ids = inputs
|
|
124
|
-
attn = None
|
|
125
|
-
self.model(input_ids, attention_mask=attn)
|
|
126
|
-
finally:
|
|
127
|
-
for h in hooks:
|
|
128
|
-
h.remove()
|
|
129
|
-
|
|
130
|
-
records = []
|
|
131
|
-
for layer_name in layers:
|
|
132
|
-
if layer_name not in captures:
|
|
133
|
-
continue
|
|
134
|
-
records.append(ActivationRecord(
|
|
135
|
-
layer_name=layer_name,
|
|
136
|
-
activations=captures[layer_name],
|
|
137
|
-
meta={"kind": "residual", "arch": "kazdov-mobe-bcn"},
|
|
138
|
-
))
|
|
139
|
-
return records
|
|
140
|
-
|
|
141
|
-
def hidden_dim(self, layer_name: str) -> int:
|
|
142
|
-
return self.model.d_model
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|