archscope 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {archscope-0.2.2/src/archscope.egg-info → archscope-0.2.4}/PKG-INFO +1 -1
- {archscope-0.2.2 → archscope-0.2.4}/pyproject.toml +4 -1
- archscope-0.2.4/src/archscope/__init__.py +44 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/backends.py +17 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/cli.py +7 -3
- archscope-0.2.4/src/archscope/kazdov_backend.py +99 -0
- archscope-0.2.4/src/archscope/loader.py +76 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/probes.py +53 -9
- archscope-0.2.4/src/archscope/py.typed +0 -0
- {archscope-0.2.2 → archscope-0.2.4/src/archscope.egg-info}/PKG-INFO +1 -1
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/SOURCES.txt +2 -0
- {archscope-0.2.2 → archscope-0.2.4}/tests/test_circuits_3arch.py +5 -4
- {archscope-0.2.2 → archscope-0.2.4}/tests/test_diff.py +1 -1
- {archscope-0.2.2 → archscope-0.2.4}/tests/test_kazdov_integration.py +4 -3
- {archscope-0.2.2 → archscope-0.2.4}/tests/test_lens.py +1 -1
- {archscope-0.2.2 → archscope-0.2.4}/tests/test_mamba_integration.py +1 -1
- {archscope-0.2.2 → archscope-0.2.4}/tests/test_mamba_ssm_state.py +1 -1
- {archscope-0.2.2 → archscope-0.2.4}/tests/test_probe_transfer.py +4 -3
- {archscope-0.2.2 → archscope-0.2.4}/tests/test_pythia_end_to_end.py +1 -1
- {archscope-0.2.2 → archscope-0.2.4}/tests/test_unit.py +50 -1
- archscope-0.2.2/src/archscope/__init__.py +0 -30
- archscope-0.2.2/src/archscope/kazdov_backend.py +0 -141
- {archscope-0.2.2 → archscope-0.2.4}/LICENSE +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/README.md +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/setup.cfg +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/_utils.py +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/attribute.py +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/bench.py +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/circuits.py +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/diff.py +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/lens.py +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/neurons.py +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/sae.py +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope/transfer.py +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/dependency_links.txt +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/entry_points.txt +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/requires.txt +0 -0
- {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "archscope"
|
|
3
|
-
version = "0.2.2"
|
|
3
|
+
version = "0.2.4"
|
|
4
4
|
description = "Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [{name = "Juan Cruz Dovzak"}]
|
|
@@ -43,3 +43,6 @@ build-backend = "setuptools.build_meta"
|
|
|
43
43
|
|
|
44
44
|
[tool.setuptools.packages.find]
|
|
45
45
|
where = ["src"]
|
|
46
|
+
|
|
47
|
+
[tool.setuptools.package-data]
|
|
48
|
+
"archscope" = ["py.typed"]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""archscope — cross-architecture mechanistic interpretability workbench.
|
|
2
|
+
|
|
3
|
+
Core methods (architecture-agnostic):
|
|
4
|
+
- probes: linear/MLP probes on hidden states (Drop the Act-style)
|
|
5
|
+
- sae: Dense + Rank-1 sparse autoencoders (WriteSAE-style)
|
|
6
|
+
- neurons: contrastive neuron modulation
|
|
7
|
+
- attribute: activation patching + DIM decomposition
|
|
8
|
+
- circuits: induction head, copy, attention-concentration detectors
|
|
9
|
+
- lens: logit lens + tuned lens (Belrose et al 2023)
|
|
10
|
+
- diff: base vs fine-tuned model comparison
|
|
11
|
+
|
|
12
|
+
Experiment infrastructure:
|
|
13
|
+
- backends: unified extraction API across architectures
|
|
14
|
+
- transfer: cross-arch probe transfer via paired-activation alignment
|
|
15
|
+
- bench: InterpProfile standardized benchmark
|
|
16
|
+
- loader: one-call HuggingFace model + tokenizer + backend loader
|
|
17
|
+
|
|
18
|
+
Backends: ``transformer``, ``mamba`` (incl. ssm_state), ``kazdov``, ``recurrent``.
|
|
19
|
+
|
|
20
|
+
Quick start::
|
|
21
|
+
|
|
22
|
+
import archscope as ai
|
|
23
|
+
model, tok, backend = ai.load_model("EleutherAI/pythia-160m", arch="transformer")
|
|
24
|
+
result = ai.lens.logit_lens(model, tok, "The capital of France is", target_token=" Paris")
|
|
25
|
+
print(result.to_markdown())
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
__version__ = "0.2.4"
|
|
29
|
+
|
|
30
|
+
from . import probes, sae, neurons, attribute, backends, circuits, transfer, bench, lens, diff
|
|
31
|
+
from .loader import load_model, make_tokenize_fn
|
|
32
|
+
|
|
33
|
+
# Kazdov backend registers itself on import — guarded so that an ImportError there never breaks `import archscope`
|
|
34
|
+
try:
|
|
35
|
+
from . import kazdov_backend # noqa: F401
|
|
36
|
+
except ImportError:
|
|
37
|
+
pass
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
"probes", "sae", "neurons", "attribute", "backends",
|
|
41
|
+
"circuits", "transfer", "bench", "lens", "diff",
|
|
42
|
+
"load_model", "make_tokenize_fn",
|
|
43
|
+
"__version__",
|
|
44
|
+
]
|
|
@@ -77,6 +77,21 @@ class Backend(abc.ABC):
|
|
|
77
77
|
"""Dimensionality of activations at a given layer."""
|
|
78
78
|
...
|
|
79
79
|
|
|
80
|
+
def _validate_layers(self, layers: list[str]) -> None:
|
|
81
|
+
"""Raise a clear error if any requested layer name isn't valid."""
|
|
82
|
+
valid = set(self.layer_names())
|
|
83
|
+
bad = [ln for ln in layers if ln not in valid]
|
|
84
|
+
if bad:
|
|
85
|
+
# Show first few valid examples so users see the format.
|
|
86
|
+
sample = ", ".join(self.layer_names()[:3])
|
|
87
|
+
n_total = len(valid)
|
|
88
|
+
raise ValueError(
|
|
89
|
+
f"Unknown layer name(s) for {type(self).__name__}: {bad}. "
|
|
90
|
+
f"Valid layer names look like: {sample}{', ...' if n_total > 3 else ''} "
|
|
91
|
+
f"(total: {n_total} layer names). Call `backend.layer_names()` "
|
|
92
|
+
f"to see all valid options."
|
|
93
|
+
)
|
|
94
|
+
|
|
80
95
|
|
|
81
96
|
@Backend.register("transformer")
|
|
82
97
|
class TransformerBackend(Backend):
|
|
@@ -89,6 +104,7 @@ class TransformerBackend(Backend):
|
|
|
89
104
|
|
|
90
105
|
def extract(self, inputs, layers=None):
|
|
91
106
|
layers = layers or self.layer_names()
|
|
107
|
+
self._validate_layers(layers)
|
|
92
108
|
# Use HF's output_hidden_states=True for clean extraction.
|
|
93
109
|
# Wrap in no_grad: extraction shouldn't build a backward graph.
|
|
94
110
|
with torch.no_grad():
|
|
@@ -134,6 +150,7 @@ class MambaBackend(Backend):
|
|
|
134
150
|
|
|
135
151
|
def extract(self, inputs, layers=None):
|
|
136
152
|
layers = layers or self.layer_names()
|
|
153
|
+
self._validate_layers(layers)
|
|
137
154
|
|
|
138
155
|
need_residual = any(".residual" in ln for ln in layers)
|
|
139
156
|
need_ssm = any(".ssm_state" in ln for ln in layers)
|
|
@@ -13,9 +13,13 @@ from . import __version__
|
|
|
13
13
|
console = Console()
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
@click.group()
|
|
17
|
-
|
|
18
|
-
|
|
16
|
+
@click.group(invoke_without_command=True)
|
|
17
|
+
@click.version_option(__version__, "-V", "--version", prog_name="archscope")
|
|
18
|
+
@click.pass_context
|
|
19
|
+
def cli(ctx: click.Context) -> None:
|
|
20
|
+
"""archscope — cross-architecture mechanistic interpretability workbench."""
|
|
21
|
+
if ctx.invoked_subcommand is None:
|
|
22
|
+
click.echo(ctx.get_help())
|
|
19
23
|
|
|
20
24
|
|
|
21
25
|
@cli.command()
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Backend for custom architectures that expose layers via ``model.blocks``.
|
|
2
|
+
|
|
3
|
+
Originally written for kazdov-α (a transformer-style decoder LM with hybrid
|
|
4
|
+
MoBE-BCN + MHA attention) — but the backend is generic. It works for ANY
|
|
5
|
+
PyTorch model where:
|
|
6
|
+
|
|
7
|
+
- residual blocks are exposed as ``model.blocks`` (a ``nn.ModuleList``)
|
|
8
|
+
- ``model.d_model`` (or ``model.hidden_size``) is set on the model
|
|
9
|
+
- forward signature is ``model(input_ids, attention_mask=None, ...)``
|
|
10
|
+
|
|
11
|
+
This is the simplest pattern for registering a custom architecture with
|
|
12
|
+
archscope. If your model uses a different convention (e.g., ``model.layers``
|
|
13
|
+
under another parent), subclass ``Backend`` directly — this module is a
|
|
14
|
+
working example.
|
|
15
|
+
|
|
16
|
+
The backend registers under the name ``"kazdov"`` for historical reasons.
|
|
17
|
+
It used to be coupled to a private model-loading function; that function
|
|
18
|
+
was moved out of the shipped package since it depended on a private
|
|
19
|
+
repository. To load your own custom model, do it yourself and then call
|
|
20
|
+
``Backend.for_model(model, hint="kazdov")``.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
import torch
|
|
24
|
+
|
|
25
|
+
from .backends import Backend, ActivationRecord
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@Backend.register("kazdov")
|
|
29
|
+
class KazdovBackend(Backend):
|
|
30
|
+
"""Generic backend for models exposing layers via ``model.blocks``.
|
|
31
|
+
|
|
32
|
+
Captures the output of each block via forward hooks (the model is
|
|
33
|
+
expected to not implement ``output_hidden_states=True`` natively).
|
|
34
|
+
|
|
35
|
+
Requirements on the model:
|
|
36
|
+
- ``model.blocks`` is a ``nn.ModuleList`` of residual blocks.
|
|
37
|
+
- ``model.d_model`` or ``model.hidden_size`` is set.
|
|
38
|
+
- ``model(input_ids, attention_mask=...)`` is the forward signature.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
def layer_names(self) -> list[str]:
|
|
42
|
+
n_layers = len(self.model.blocks)
|
|
43
|
+
return [f"layer_{i}.residual" for i in range(n_layers)]
|
|
44
|
+
|
|
45
|
+
def extract(self, inputs, layers=None):
|
|
46
|
+
layers = layers or self.layer_names()
|
|
47
|
+
self._validate_layers(layers)
|
|
48
|
+
captures: dict[str, torch.Tensor] = {}
|
|
49
|
+
|
|
50
|
+
# Register a forward hook on each requested block.
|
|
51
|
+
hooks = []
|
|
52
|
+
for layer_name in layers:
|
|
53
|
+
idx = int(layer_name.split("_")[1].split(".")[0])
|
|
54
|
+
if idx >= len(self.model.blocks):
|
|
55
|
+
continue
|
|
56
|
+
block = self.model.blocks[idx]
|
|
57
|
+
|
|
58
|
+
def make_hook(name):
|
|
59
|
+
def hook(module, inp, out):
|
|
60
|
+
tensor = out if isinstance(out, torch.Tensor) else out[0]
|
|
61
|
+
captures[name] = tensor.detach()
|
|
62
|
+
return hook
|
|
63
|
+
hooks.append(block.register_forward_hook(make_hook(layer_name)))
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
with torch.no_grad():
|
|
67
|
+
if isinstance(inputs, dict):
|
|
68
|
+
input_ids = inputs["input_ids"]
|
|
69
|
+
attn = inputs.get("attention_mask")
|
|
70
|
+
else:
|
|
71
|
+
input_ids = inputs
|
|
72
|
+
attn = None
|
|
73
|
+
self.model(input_ids, attention_mask=attn)
|
|
74
|
+
finally:
|
|
75
|
+
for h in hooks:
|
|
76
|
+
h.remove()
|
|
77
|
+
|
|
78
|
+
records = []
|
|
79
|
+
for layer_name in layers:
|
|
80
|
+
if layer_name not in captures:
|
|
81
|
+
continue
|
|
82
|
+
records.append(ActivationRecord(
|
|
83
|
+
layer_name=layer_name,
|
|
84
|
+
activations=captures[layer_name],
|
|
85
|
+
meta={"kind": "residual", "arch": "kazdov-blocks"},
|
|
86
|
+
))
|
|
87
|
+
return records
|
|
88
|
+
|
|
89
|
+
def hidden_dim(self, layer_name: str) -> int:
|
|
90
|
+
# Some custom models expose this as `d_model`, others as `hidden_size`.
|
|
91
|
+
for attr in ("d_model", "hidden_size"):
|
|
92
|
+
v = getattr(self.model, attr, None)
|
|
93
|
+
if v is not None:
|
|
94
|
+
return v
|
|
95
|
+
raise ValueError(
|
|
96
|
+
f"Cannot infer hidden_dim for {type(self.model).__name__}: "
|
|
97
|
+
f"set model.d_model or model.hidden_size, or subclass KazdovBackend "
|
|
98
|
+
f"and override hidden_dim()."
|
|
99
|
+
)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""High-level model loading helper.
|
|
2
|
+
|
|
3
|
+
Eliminates ~5 lines of HuggingFace boilerplate per example:
|
|
4
|
+
|
|
5
|
+
# Before
|
|
6
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
7
|
+
tok = AutoTokenizer.from_pretrained(name)
|
|
8
|
+
if tok.pad_token is None: tok.pad_token = tok.eos_token
|
|
9
|
+
model = AutoModelForCausalLM.from_pretrained(name, dtype=torch.float32)
|
|
10
|
+
model.eval()
|
|
11
|
+
backend = Backend.for_model(model, hint=arch)
|
|
12
|
+
|
|
13
|
+
# After
|
|
14
|
+
model, tok, backend = archscope.load_model(name, arch="transformer")
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from .backends import Backend
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def load_model(
|
|
23
|
+
name: str,
|
|
24
|
+
arch: str | None = None,
|
|
25
|
+
dtype: Any = None,
|
|
26
|
+
device: str = "cpu",
|
|
27
|
+
) -> tuple[Any, Any, Backend]:
|
|
28
|
+
"""Load a HuggingFace model + tokenizer + matching backend in one call.
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
name: HuggingFace model id (e.g., "EleutherAI/pythia-160m") OR a path
|
|
32
|
+
to a local archscope-registered model (e.g., kazdov checkpoint).
|
|
33
|
+
arch: Backend hint — one of "transformer", "mamba", "kazdov",
|
|
34
|
+
"recurrent". If None, auto-detect from HF config.model_type.
|
|
35
|
+
dtype: torch dtype. Defaults to torch.float32.
|
|
36
|
+
device: device string. Default "cpu".
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
(model, tokenizer, backend) ready for use in probes/sae/lens/etc.
|
|
40
|
+
"""
|
|
41
|
+
import torch
|
|
42
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
43
|
+
|
|
44
|
+
if dtype is None:
|
|
45
|
+
dtype = torch.float32
|
|
46
|
+
|
|
47
|
+
tokenizer = AutoTokenizer.from_pretrained(name)
|
|
48
|
+
if tokenizer.pad_token is None:
|
|
49
|
+
tokenizer.pad_token = tokenizer.eos_token
|
|
50
|
+
model = AutoModelForCausalLM.from_pretrained(name, dtype=dtype)
|
|
51
|
+
model = model.to(device).eval()
|
|
52
|
+
|
|
53
|
+
backend = Backend.for_model(model, hint=arch)
|
|
54
|
+
return model, tokenizer, backend
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def make_tokenize_fn(tokenizer, max_length: int = 32, attention_mask_bool: bool = False):
|
|
58
|
+
"""Return a tokenize function suitable for ``Backend.extract`` and ``probes.fit_probe``.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
tokenizer: a HuggingFace tokenizer.
|
|
62
|
+
max_length: max sequence length to truncate to.
|
|
63
|
+
attention_mask_bool: if True, returns attention_mask as a bool tensor
|
|
64
|
+
(required for ``kazdov`` backend); HF default is int64.
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
A callable ``texts -> dict`` that matches the ``inputs`` format used
|
|
68
|
+
across archscope.
|
|
69
|
+
"""
|
|
70
|
+
def fn(texts):
|
|
71
|
+
out = tokenizer(texts, return_tensors="pt", padding=True,
|
|
72
|
+
truncation=True, max_length=max_length)
|
|
73
|
+
if attention_mask_bool and "attention_mask" in out:
|
|
74
|
+
out["attention_mask"] = out["attention_mask"].bool()
|
|
75
|
+
return out
|
|
76
|
+
return fn
|
|
@@ -110,28 +110,72 @@ class ProbeFit:
|
|
|
110
110
|
|
|
111
111
|
|
|
112
112
|
def _auroc(logits: torch.Tensor, labels: torch.Tensor) -> float:
|
|
113
|
-
"""
|
|
113
|
+
"""AUROC from logits + binary labels.
|
|
114
|
+
|
|
115
|
+
Returns 0.5 (chance) when only one class is present in `labels`
|
|
116
|
+
(the typical small-split case) — this is more informative than NaN
|
|
117
|
+
and avoids sklearn's UndefinedMetricWarning leaking to user code.
|
|
118
|
+
"""
|
|
119
|
+
import warnings
|
|
114
120
|
from sklearn.metrics import roc_auc_score
|
|
115
121
|
scores = torch.sigmoid(logits).cpu().numpy()
|
|
116
122
|
y = labels.cpu().numpy()
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
123
|
+
if len(set(y.tolist())) < 2:
|
|
124
|
+
warnings.warn(
|
|
125
|
+
"Only one class present in this split — AUROC undefined; "
|
|
126
|
+
"returning 0.5 (chance). Increase val_split or dataset size.",
|
|
127
|
+
stacklevel=2,
|
|
128
|
+
)
|
|
129
|
+
return 0.5
|
|
130
|
+
return float(roc_auc_score(y, scores))
|
|
121
131
|
|
|
122
132
|
|
|
123
133
|
# High-level API matching paper
|
|
124
134
|
|
|
125
135
|
def fit_probe(
|
|
126
136
|
model,
|
|
127
|
-
inputs_pos
|
|
128
|
-
inputs_neg
|
|
129
|
-
layer_name: str,
|
|
137
|
+
inputs_pos=None,
|
|
138
|
+
inputs_neg=None,
|
|
139
|
+
layer_name: str = "",
|
|
130
140
|
backend_hint: str | None = None,
|
|
131
141
|
config: ProbeConfig | None = None,
|
|
132
142
|
device: str = "cpu",
|
|
143
|
+
*,
|
|
144
|
+
tokenizer=None,
|
|
145
|
+
pos_texts: list[str] | None = None,
|
|
146
|
+
neg_texts: list[str] | None = None,
|
|
147
|
+
max_length: int = 32,
|
|
133
148
|
) -> ProbeFit:
|
|
134
|
-
"""End-to-end: extract activations from model
|
|
149
|
+
"""End-to-end: extract activations from a model and fit a probe.
|
|
150
|
+
|
|
151
|
+
Two calling conventions:
|
|
152
|
+
|
|
153
|
+
1. **Pre-tokenized**: pass ``inputs_pos`` and ``inputs_neg`` as already-tokenized
|
|
154
|
+
dicts (with ``input_ids``, optional ``attention_mask``).
|
|
155
|
+
|
|
156
|
+
2. **Texts + tokenizer**: pass ``tokenizer=…``, ``pos_texts=[…]``, ``neg_texts=[…]``
|
|
157
|
+
and archscope tokenizes for you. The kazdov backend requires a bool
|
|
158
|
+
attention_mask; we auto-handle that.
|
|
159
|
+
|
|
160
|
+
Returns:
|
|
161
|
+
A ``ProbeFit`` with trained probe and ``.metrics`` (train/val AUROC, loss).
|
|
162
|
+
"""
|
|
163
|
+
if pos_texts is not None or neg_texts is not None:
|
|
164
|
+
if tokenizer is None:
|
|
165
|
+
raise ValueError("pos_texts/neg_texts require a tokenizer= argument")
|
|
166
|
+
if pos_texts is None or neg_texts is None:
|
|
167
|
+
raise ValueError("provide both pos_texts and neg_texts")
|
|
168
|
+
from .loader import make_tokenize_fn
|
|
169
|
+
tk = make_tokenize_fn(
|
|
170
|
+
tokenizer, max_length=max_length,
|
|
171
|
+
attention_mask_bool=(backend_hint == "kazdov"),
|
|
172
|
+
)
|
|
173
|
+
inputs_pos = tk(pos_texts)
|
|
174
|
+
inputs_neg = tk(neg_texts)
|
|
175
|
+
elif inputs_pos is None or inputs_neg is None:
|
|
176
|
+
raise ValueError("provide either (inputs_pos, inputs_neg) or "
|
|
177
|
+
"(tokenizer, pos_texts, neg_texts)")
|
|
178
|
+
|
|
135
179
|
backend = Backend.for_model(model, hint=backend_hint)
|
|
136
180
|
config = config or ProbeConfig(layer_name=layer_name)
|
|
137
181
|
|
|
File without changes
|
|
@@ -11,8 +11,10 @@ src/archscope/cli.py
|
|
|
11
11
|
src/archscope/diff.py
|
|
12
12
|
src/archscope/kazdov_backend.py
|
|
13
13
|
src/archscope/lens.py
|
|
14
|
+
src/archscope/loader.py
|
|
14
15
|
src/archscope/neurons.py
|
|
15
16
|
src/archscope/probes.py
|
|
17
|
+
src/archscope/py.typed
|
|
16
18
|
src/archscope/sae.py
|
|
17
19
|
src/archscope/transfer.py
|
|
18
20
|
src/archscope.egg-info/PKG-INFO
|
|
@@ -11,13 +11,14 @@ import os
|
|
|
11
11
|
|
|
12
12
|
import torch
|
|
13
13
|
|
|
14
|
-
sys.path.insert(0, "/
|
|
14
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
15
15
|
|
|
16
16
|
from archscope import circuits
|
|
17
|
-
|
|
17
|
+
import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
|
|
18
|
+
from _kazdov_loader import load_kazdov_checkpoint
|
|
18
19
|
|
|
19
20
|
|
|
20
|
-
CHECKPOINT_KAZDOV = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
|
|
21
|
+
CHECKPOINT_KAZDOV = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
|
|
21
22
|
PYTHIA_NAME = "EleutherAI/pythia-160m"
|
|
22
23
|
MAMBA_NAME = "state-spaces/mamba-130m-hf"
|
|
23
24
|
|
|
@@ -97,7 +98,7 @@ def main():
|
|
|
97
98
|
print(" • concentration relative ≈ 0 → highly confident predictions (concentrated)")
|
|
98
99
|
|
|
99
100
|
# Save
|
|
100
|
-
out_path = "/
|
|
101
|
+
# Results go to <two directories above this file>/_research/circuits_3arch.json.
# Build the path with pathlib so the expression is evaluated; previously the
# expression text was pasted inside a string literal, which left the quotes
# unbalanced and produced no usable path.
out_path = str(
    __import__("pathlib").Path(__file__).parent.parent / "_research" / "circuits_3arch.json"
)
|
|
101
102
|
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
|
102
103
|
with open(out_path, "w") as f:
|
|
103
104
|
json.dump(all_results, f, indent=2, default=str)
|
|
@@ -9,14 +9,15 @@ import sys
|
|
|
9
9
|
import time
|
|
10
10
|
import torch
|
|
11
11
|
|
|
12
|
-
sys.path.insert(0, "/
|
|
12
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
13
13
|
|
|
14
14
|
from archscope import probes, sae, neurons
|
|
15
15
|
from archscope.backends import Backend
|
|
16
|
-
|
|
16
|
+
import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
|
|
17
|
+
from _kazdov_loader import load_kazdov_checkpoint
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
CHECKPOINT = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
|
|
20
|
+
CHECKPOINT = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
def tokenize(tokenizer, texts: list[str]) -> dict:
|
|
@@ -10,7 +10,7 @@ import sys
|
|
|
10
10
|
import time
|
|
11
11
|
import torch
|
|
12
12
|
|
|
13
|
-
sys.path.insert(0, "/
|
|
13
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
14
14
|
|
|
15
15
|
from archscope import probes, sae, neurons
|
|
16
16
|
from archscope.backends import Backend
|
|
@@ -11,7 +11,7 @@ import sys
|
|
|
11
11
|
import time
|
|
12
12
|
import torch
|
|
13
13
|
|
|
14
|
-
sys.path.insert(0, "/
|
|
14
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
15
15
|
|
|
16
16
|
from archscope import sae
|
|
17
17
|
from archscope.backends import Backend
|
|
@@ -14,14 +14,15 @@ import sys
|
|
|
14
14
|
import time
|
|
15
15
|
import torch
|
|
16
16
|
|
|
17
|
-
sys.path.insert(0, "/
|
|
17
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
18
18
|
|
|
19
19
|
from archscope import transfer
|
|
20
20
|
from archscope.backends import Backend
|
|
21
|
-
|
|
21
|
+
import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
|
|
22
|
+
from _kazdov_loader import load_kazdov_checkpoint
|
|
22
23
|
|
|
23
24
|
|
|
24
|
-
CHECKPOINT_KAZDOV = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
|
|
25
|
+
CHECKPOINT_KAZDOV = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
|
|
25
26
|
PYTHIA_NAME = "EleutherAI/pythia-160m"
|
|
26
27
|
|
|
27
28
|
|
|
@@ -13,7 +13,7 @@ import sys
|
|
|
13
13
|
import time
|
|
14
14
|
import torch
|
|
15
15
|
|
|
16
|
-
sys.path.insert(0, "/
|
|
16
|
+
sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
|
|
17
17
|
|
|
18
18
|
from archscope import probes, sae, neurons, attribute
|
|
19
19
|
from archscope.backends import Backend
|
|
@@ -22,7 +22,56 @@ def test_imports():
|
|
|
22
22
|
import archscope
|
|
23
23
|
from archscope import (probes, sae, neurons, attribute, backends,
|
|
24
24
|
circuits, transfer, bench, lens, diff)
|
|
25
|
-
assert archscope.__version__ == "0.2.2"
|
|
25
|
+
assert archscope.__version__ == "0.2.4"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_loader_exports():
|
|
29
|
+
"""load_model and make_tokenize_fn are exported at top level."""
|
|
30
|
+
import archscope
|
|
31
|
+
assert hasattr(archscope, "load_model")
|
|
32
|
+
assert hasattr(archscope, "make_tokenize_fn")
|
|
33
|
+
assert callable(archscope.load_model)
|
|
34
|
+
assert callable(archscope.make_tokenize_fn)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_layer_name_validation_clear_error():
|
|
38
|
+
"""Backend validates layer names with an informative error."""
|
|
39
|
+
from archscope.backends import Backend, ActivationRecord
|
|
40
|
+
|
|
41
|
+
# Build a minimal mock backend
|
|
42
|
+
class _MockBackend(Backend):
|
|
43
|
+
def layer_names(self): return ["layer_0.residual", "layer_1.residual"]
|
|
44
|
+
def extract(self, inputs, layers=None):
|
|
45
|
+
layers = layers or self.layer_names()
|
|
46
|
+
self._validate_layers(layers)
|
|
47
|
+
return []
|
|
48
|
+
def hidden_dim(self, layer_name): return 8
|
|
49
|
+
|
|
50
|
+
b = _MockBackend(model=None)
|
|
51
|
+
# Valid layer → no error
|
|
52
|
+
b.extract({}, layers=["layer_0.residual"])
|
|
53
|
+
# Invalid layer → clear error message
|
|
54
|
+
try:
|
|
55
|
+
b.extract({}, layers=["layer_99.residual"])
|
|
56
|
+
except ValueError as e:
|
|
57
|
+
assert "Unknown layer name" in str(e)
|
|
58
|
+
assert "layer_0.residual" in str(e) # shows valid example
|
|
59
|
+
return
|
|
60
|
+
raise AssertionError("Expected ValueError for invalid layer name")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_auroc_returns_chance_on_single_class():
|
|
64
|
+
"""_auroc returns 0.5 instead of NaN when only one class is present."""
|
|
65
|
+
import warnings
|
|
66
|
+
import torch
|
|
67
|
+
from archscope.probes import _auroc
|
|
68
|
+
|
|
69
|
+
logits = torch.tensor([0.5, 0.2, -0.1])
|
|
70
|
+
labels = torch.tensor([1.0, 1.0, 1.0]) # only one class
|
|
71
|
+
with warnings.catch_warnings():
|
|
72
|
+
warnings.simplefilter("ignore")
|
|
73
|
+
result = _auroc(logits, labels)
|
|
74
|
+
assert result == 0.5
|
|
26
75
|
|
|
27
76
|
|
|
28
77
|
def test_diff_dataclasses():
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
"""archscope: unified mech interp toolkit across small + RNN + transformer.
|
|
2
|
-
|
|
3
|
-
Four core methods unified under a single API:
|
|
4
|
-
- probes: linear/MLP probes over hidden states (Drop the Act inspired)
|
|
5
|
-
- sae: sparse autoencoders for residual + recurrent state (WriteSAE)
|
|
6
|
-
- neurons: targeted neuron modulation via contrastive search (Nous Research)
|
|
7
|
-
- attribute: activation patching + DIM decomposition (Multi-Agent Sycophancy)
|
|
8
|
-
|
|
9
|
-
Each method exposes the same architecture-agnostic API:
|
|
10
|
-
- .extract(model, inputs) -> hidden states / activations
|
|
11
|
-
- .fit(activations, labels) -> learned tool
|
|
12
|
-
- .apply(model, inputs) -> modified outputs / scores / explanations
|
|
13
|
-
|
|
14
|
-
Designed for cross-architecture comparison: transformer, Mamba/SSM, custom RNN.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
__version__ = "0.2.2"
|
|
18
|
-
|
|
19
|
-
from . import probes, sae, neurons, attribute, backends, circuits, transfer, bench, lens, diff
|
|
20
|
-
|
|
21
|
-
# Kazdov backend registers itself on import — optional, only if kazdov repo present
|
|
22
|
-
try:
|
|
23
|
-
from . import kazdov_backend # noqa: F401
|
|
24
|
-
except ImportError:
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
__all__ = [
|
|
28
|
-
"probes", "sae", "neurons", "attribute", "backends",
|
|
29
|
-
"circuits", "transfer", "bench", "lens", "diff", "__version__",
|
|
30
|
-
]
|
|
@@ -1,141 +0,0 @@
|
|
|
1
|
-
"""Backend for kazdov-α (and related Kazdov family models).
|
|
2
|
-
|
|
3
|
-
Kazdov-α is a transformer-style decoder LM with hybrid attention (MoBE-BCN
|
|
4
|
-
mixture of bilinear experts + standard MHA in parallel). Architecturally
|
|
5
|
-
closer to standard transformer than to pure RNN/SSM — but the BCN attention
|
|
6
|
-
branch makes it a distinct architecture family for cross-arch interp.
|
|
7
|
-
|
|
8
|
-
Differences from HF transformer:
|
|
9
|
-
- No HF AutoModelForCausalLM interface (custom forward signature)
|
|
10
|
-
- Layers exposed as `model.blocks` (ModuleList)
|
|
11
|
-
- No `output_hidden_states=True` argument — we capture via forward hooks
|
|
12
|
-
- Forward signature: (input_ids, attention_mask=None, labels=None)
|
|
13
|
-
"""
|
|
14
|
-
from __future__ import annotations
|
|
15
|
-
import sys
|
|
16
|
-
from pathlib import Path
|
|
17
|
-
import torch
|
|
18
|
-
|
|
19
|
-
from .backends import Backend, ActivationRecord
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
KAZDOV_REPO = Path.home() / "code" / "OriginalKazdov" / "kazdov"
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def _ensure_kazdov_importable():
|
|
26
|
-
"""Add kazdov repo to sys.path so we can import KazdovLM."""
|
|
27
|
-
p = str(KAZDOV_REPO)
|
|
28
|
-
if p not in sys.path:
|
|
29
|
-
sys.path.insert(0, p)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def load_kazdov_checkpoint(checkpoint_path: str | Path, device: str = "cpu"):
|
|
33
|
-
"""Load kazdov-α from a checkpoint directory.
|
|
34
|
-
|
|
35
|
-
Expects: config.json + final.pt (or latest.pt) in the directory.
|
|
36
|
-
Returns: (model in eval mode, tokenizer wrapper).
|
|
37
|
-
"""
|
|
38
|
-
_ensure_kazdov_importable()
|
|
39
|
-
from kazdov.kazdov_lm import KazdovLM
|
|
40
|
-
import json
|
|
41
|
-
|
|
42
|
-
ckpt_dir = Path(checkpoint_path)
|
|
43
|
-
config = json.loads((ckpt_dir / "config.json").read_text())
|
|
44
|
-
model_cfg = config["model_cfg"]
|
|
45
|
-
|
|
46
|
-
model = KazdovLM(
|
|
47
|
-
vocab_size=model_cfg["vocab_size"],
|
|
48
|
-
d_model=model_cfg["d_model"],
|
|
49
|
-
n_layers=model_cfg["n_layers"],
|
|
50
|
-
n_heads=model_cfg["n_heads"],
|
|
51
|
-
rank=model_cfg["rank"],
|
|
52
|
-
mlp_dim=model_cfg.get("mlp_dim"),
|
|
53
|
-
max_len=model_cfg.get("max_len", 256),
|
|
54
|
-
use_trilinear=model_cfg.get("use_trilinear", False),
|
|
55
|
-
use_bi_bcn=model_cfg.get("use_bi_bcn", False),
|
|
56
|
-
use_hybrid_mha=model_cfg.get("use_hybrid_mha", True),
|
|
57
|
-
use_mobe=model_cfg.get("use_mobe", False),
|
|
58
|
-
n_experts=model_cfg.get("n_experts", 1),
|
|
59
|
-
)
|
|
60
|
-
|
|
61
|
-
# Try final.pt then latest.pt
|
|
62
|
-
for fname in ("final.pt", "latest.pt"):
|
|
63
|
-
f = ckpt_dir / fname
|
|
64
|
-
if f.exists():
|
|
65
|
-
state = torch.load(f, map_location=device, weights_only=False)
|
|
66
|
-
if isinstance(state, dict) and "model" in state:
|
|
67
|
-
state = state["model"]
|
|
68
|
-
model.load_state_dict(state, strict=False)
|
|
69
|
-
break
|
|
70
|
-
else:
|
|
71
|
-
raise FileNotFoundError(f"No final.pt or latest.pt in {ckpt_dir}")
|
|
72
|
-
|
|
73
|
-
model.to(device).eval()
|
|
74
|
-
|
|
75
|
-
# Tokenizer: kazdov used GPT-2 tokenizer per memory
|
|
76
|
-
from transformers import GPT2Tokenizer
|
|
77
|
-
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
|
78
|
-
if tokenizer.pad_token is None:
|
|
79
|
-
tokenizer.pad_token = tokenizer.eos_token
|
|
80
|
-
|
|
81
|
-
return model, tokenizer
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
@Backend.register("kazdov")
|
|
85
|
-
class KazdovBackend(Backend):
|
|
86
|
-
"""Backend for kazdov-family models (KazdovLM, MoBE-BCN variants).
|
|
87
|
-
|
|
88
|
-
Uses forward hooks to capture residual stream after each KazdovBlock,
|
|
89
|
-
since the model doesn't expose output_hidden_states.
|
|
90
|
-
"""
|
|
91
|
-
|
|
92
|
-
def layer_names(self) -> list[str]:
|
|
93
|
-
n_layers = len(self.model.blocks)
|
|
94
|
-
return [f"layer_{i}.residual" for i in range(n_layers)]
|
|
95
|
-
|
|
96
|
-
def extract(self, inputs, layers=None):
|
|
97
|
-
layers = layers or self.layer_names()
|
|
98
|
-
captures: dict[str, torch.Tensor] = {}
|
|
99
|
-
|
|
100
|
-
# Register a forward hook on each requested block.
|
|
101
|
-
hooks = []
|
|
102
|
-
for layer_name in layers:
|
|
103
|
-
idx = int(layer_name.split("_")[1].split(".")[0])
|
|
104
|
-
if idx >= len(self.model.blocks):
|
|
105
|
-
continue
|
|
106
|
-
block = self.model.blocks[idx]
|
|
107
|
-
|
|
108
|
-
def make_hook(name):
|
|
109
|
-
def hook(module, inp, out):
|
|
110
|
-
tensor = out if isinstance(out, torch.Tensor) else out[0]
|
|
111
|
-
captures[name] = tensor.detach()
|
|
112
|
-
return hook
|
|
113
|
-
hooks.append(block.register_forward_hook(make_hook(layer_name)))
|
|
114
|
-
|
|
115
|
-
try:
|
|
116
|
-
# Kazdov forward signature: model(input_ids, attention_mask=None)
|
|
117
|
-
with torch.no_grad():
|
|
118
|
-
if isinstance(inputs, dict):
|
|
119
|
-
input_ids = inputs["input_ids"]
|
|
120
|
-
attn = inputs.get("attention_mask")
|
|
121
|
-
else:
|
|
122
|
-
input_ids = inputs
|
|
123
|
-
attn = None
|
|
124
|
-
self.model(input_ids, attention_mask=attn)
|
|
125
|
-
finally:
|
|
126
|
-
for h in hooks:
|
|
127
|
-
h.remove()
|
|
128
|
-
|
|
129
|
-
records = []
|
|
130
|
-
for layer_name in layers:
|
|
131
|
-
if layer_name not in captures:
|
|
132
|
-
continue
|
|
133
|
-
records.append(ActivationRecord(
|
|
134
|
-
layer_name=layer_name,
|
|
135
|
-
activations=captures[layer_name],
|
|
136
|
-
meta={"kind": "residual", "arch": "kazdov-mobe-bcn"},
|
|
137
|
-
))
|
|
138
|
-
return records
|
|
139
|
-
|
|
140
|
-
def hidden_dim(self, layer_name: str) -> int:
|
|
141
|
-
return self.model.d_model
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|