archscope 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {archscope-0.2.2/src/archscope.egg-info → archscope-0.2.4}/PKG-INFO +1 -1
  2. {archscope-0.2.2 → archscope-0.2.4}/pyproject.toml +4 -1
  3. archscope-0.2.4/src/archscope/__init__.py +44 -0
  4. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/backends.py +17 -0
  5. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/cli.py +7 -3
  6. archscope-0.2.4/src/archscope/kazdov_backend.py +99 -0
  7. archscope-0.2.4/src/archscope/loader.py +76 -0
  8. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/probes.py +53 -9
  9. archscope-0.2.4/src/archscope/py.typed +0 -0
  10. {archscope-0.2.2 → archscope-0.2.4/src/archscope.egg-info}/PKG-INFO +1 -1
  11. {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/SOURCES.txt +2 -0
  12. {archscope-0.2.2 → archscope-0.2.4}/tests/test_circuits_3arch.py +5 -4
  13. {archscope-0.2.2 → archscope-0.2.4}/tests/test_diff.py +1 -1
  14. {archscope-0.2.2 → archscope-0.2.4}/tests/test_kazdov_integration.py +4 -3
  15. {archscope-0.2.2 → archscope-0.2.4}/tests/test_lens.py +1 -1
  16. {archscope-0.2.2 → archscope-0.2.4}/tests/test_mamba_integration.py +1 -1
  17. {archscope-0.2.2 → archscope-0.2.4}/tests/test_mamba_ssm_state.py +1 -1
  18. {archscope-0.2.2 → archscope-0.2.4}/tests/test_probe_transfer.py +4 -3
  19. {archscope-0.2.2 → archscope-0.2.4}/tests/test_pythia_end_to_end.py +1 -1
  20. {archscope-0.2.2 → archscope-0.2.4}/tests/test_unit.py +50 -1
  21. archscope-0.2.2/src/archscope/__init__.py +0 -30
  22. archscope-0.2.2/src/archscope/kazdov_backend.py +0 -141
  23. {archscope-0.2.2 → archscope-0.2.4}/LICENSE +0 -0
  24. {archscope-0.2.2 → archscope-0.2.4}/README.md +0 -0
  25. {archscope-0.2.2 → archscope-0.2.4}/setup.cfg +0 -0
  26. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/_utils.py +0 -0
  27. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/attribute.py +0 -0
  28. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/bench.py +0 -0
  29. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/circuits.py +0 -0
  30. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/diff.py +0 -0
  31. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/lens.py +0 -0
  32. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/neurons.py +0 -0
  33. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/sae.py +0 -0
  34. {archscope-0.2.2 → archscope-0.2.4}/src/archscope/transfer.py +0 -0
  35. {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/dependency_links.txt +0 -0
  36. {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/entry_points.txt +0 -0
  37. {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/requires.txt +0 -0
  38. {archscope-0.2.2 → archscope-0.2.4}/src/archscope.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: archscope
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
5
5
  Author: Juan Cruz Dovzak
6
6
  License: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "archscope"
3
- version = "0.2.2"
3
+ version = "0.2.4"
4
4
  description = "Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models"
5
5
  readme = "README.md"
6
6
  authors = [{name = "Juan Cruz Dovzak"}]
@@ -43,3 +43,6 @@ build-backend = "setuptools.build_meta"
43
43
 
44
44
  [tool.setuptools.packages.find]
45
45
  where = ["src"]
46
+
47
+ [tool.setuptools.package-data]
48
+ "archscope" = ["py.typed"]
@@ -0,0 +1,44 @@
1
+ """archscope — cross-architecture mechanistic interpretability workbench.
2
+
3
+ Core methods (architecture-agnostic):
4
+ - probes: linear/MLP probes on hidden states (Drop the Act-style)
5
+ - sae: Dense + Rank-1 sparse autoencoders (WriteSAE-style)
6
+ - neurons: contrastive neuron modulation
7
+ - attribute: activation patching + DIM decomposition
8
+ - circuits: induction head, copy, attention-concentration detectors
9
+ - lens: logit lens + tuned lens (Belrose et al 2023)
10
+ - diff: base vs fine-tuned model comparison
11
+
12
+ Experiment infrastructure:
13
+ - backends: unified extraction API across architectures
14
+ - transfer: cross-arch probe transfer via paired-activation alignment
15
+ - bench: InterpProfile standardized benchmark
16
+ - loader: one-call HuggingFace model + tokenizer + backend loader
17
+
18
+ Backends: ``transformer``, ``mamba`` (incl. ssm_state), ``kazdov``, ``recurrent``.
19
+
20
+ Quick start::
21
+
22
+ import archscope as ai
23
+ model, tok, backend = ai.load_model("EleutherAI/pythia-160m", arch="transformer")
24
+ result = ai.lens.logit_lens(model, tok, "The capital of France is", target_token=" Paris")
25
+ print(result.to_markdown())
26
+ """
27
+
28
+ __version__ = "0.2.4"
29
+
30
+ from . import probes, sae, neurons, attribute, backends, circuits, transfer, bench, lens, diff
31
+ from .loader import load_model, make_tokenize_fn
32
+
33
+ # Kazdov backend registers itself on import; the guard keeps the package importable if that module cannot be imported
34
+ try:
35
+ from . import kazdov_backend # noqa: F401
36
+ except ImportError:
37
+ pass
38
+
39
+ __all__ = [
40
+ "probes", "sae", "neurons", "attribute", "backends",
41
+ "circuits", "transfer", "bench", "lens", "diff",
42
+ "load_model", "make_tokenize_fn",
43
+ "__version__",
44
+ ]
@@ -77,6 +77,21 @@ class Backend(abc.ABC):
77
77
  """Dimensionality of activations at a given layer."""
78
78
  ...
79
79
 
80
+ def _validate_layers(self, layers: list[str]) -> None:
81
+ """Raise a clear error if any requested layer name isn't valid."""
82
+ valid = set(self.layer_names())
83
+ bad = [ln for ln in layers if ln not in valid]
84
+ if bad:
85
+ # Show first few valid examples so users see the format.
86
+ sample = ", ".join(self.layer_names()[:3])
87
+ n_total = len(valid)
88
+ raise ValueError(
89
+ f"Unknown layer name(s) for {type(self).__name__}: {bad}. "
90
+ f"Valid layer names look like: {sample}{', ...' if n_total > 3 else ''} "
91
+ f"(total: {n_total} layer names). Call `backend.layer_names()` "
92
+ f"to see all valid options."
93
+ )
94
+
80
95
 
81
96
  @Backend.register("transformer")
82
97
  class TransformerBackend(Backend):
@@ -89,6 +104,7 @@ class TransformerBackend(Backend):
89
104
 
90
105
  def extract(self, inputs, layers=None):
91
106
  layers = layers or self.layer_names()
107
+ self._validate_layers(layers)
92
108
  # Use HF's output_hidden_states=True for clean extraction.
93
109
  # Wrap in no_grad: extraction shouldn't build a backward graph.
94
110
  with torch.no_grad():
@@ -134,6 +150,7 @@ class MambaBackend(Backend):
134
150
 
135
151
  def extract(self, inputs, layers=None):
136
152
  layers = layers or self.layer_names()
153
+ self._validate_layers(layers)
137
154
 
138
155
  need_residual = any(".residual" in ln for ln in layers)
139
156
  need_ssm = any(".ssm_state" in ln for ln in layers)
@@ -13,9 +13,13 @@ from . import __version__
13
13
  console = Console()
14
14
 
15
15
 
16
- @click.group()
17
- def cli() -> None:
18
- """archscope — cross-architecture mechanistic interpretability toolkit."""
16
+ @click.group(invoke_without_command=True)
17
+ @click.version_option(__version__, "-V", "--version", prog_name="archscope")
18
+ @click.pass_context
19
+ def cli(ctx: click.Context) -> None:
20
+ """archscope — cross-architecture mechanistic interpretability workbench."""
21
+ if ctx.invoked_subcommand is None:
22
+ click.echo(ctx.get_help())
19
23
 
20
24
 
21
25
  @cli.command()
@@ -0,0 +1,99 @@
1
+ """Backend for custom architectures that expose layers via ``model.blocks``.
2
+
3
+ Originally written for kazdov-α (a transformer-style decoder LM with hybrid
4
+ MoBE-BCN + MHA attention) — but the backend is generic. It works for ANY
5
+ PyTorch model where:
6
+
7
+ - residual blocks are exposed as ``model.blocks`` (a ``nn.ModuleList``)
8
+ - ``model.d_model`` (or ``model.hidden_size``) is set on the model
9
+ - forward signature is ``model(input_ids, attention_mask=None, ...)``
10
+
11
+ This is the simplest pattern for registering a custom architecture with
12
+ archscope. If your model uses a different convention (e.g., ``model.layers``
13
+ under another parent), subclass ``Backend`` directly — this module is a
14
+ working example.
15
+
16
+ The backend registers under the name ``"kazdov"`` for historical reasons.
17
+ It used to be coupled to a private model-loading function; that function
18
+ was moved out of the shipped package since it depended on a private
19
+ repository. To load your own custom model, do it yourself and then call
20
+ ``Backend.for_model(model, hint="kazdov")``.
21
+ """
22
+ from __future__ import annotations
23
+ import torch
24
+
25
+ from .backends import Backend, ActivationRecord
26
+
27
+
28
+ @Backend.register("kazdov")
29
+ class KazdovBackend(Backend):
30
+ """Generic backend for models exposing layers via ``model.blocks``.
31
+
32
+ Captures the output of each block via forward hooks (the model is
33
+ expected to not implement ``output_hidden_states=True`` natively).
34
+
35
+ Requirements on the model:
36
+ - ``model.blocks`` is a ``nn.ModuleList`` of residual blocks.
37
+ - ``model.d_model`` or ``model.hidden_size`` is set.
38
+ - ``model(input_ids, attention_mask=...)`` is the forward signature.
39
+ """
40
+
41
+ def layer_names(self) -> list[str]:
42
+ n_layers = len(self.model.blocks)
43
+ return [f"layer_{i}.residual" for i in range(n_layers)]
44
+
45
+ def extract(self, inputs, layers=None):
46
+ layers = layers or self.layer_names()
47
+ self._validate_layers(layers)
48
+ captures: dict[str, torch.Tensor] = {}
49
+
50
+ # Register a forward hook on each requested block.
51
+ hooks = []
52
+ for layer_name in layers:
53
+ idx = int(layer_name.split("_")[1].split(".")[0])
54
+ if idx >= len(self.model.blocks):
55
+ continue
56
+ block = self.model.blocks[idx]
57
+
58
+ def make_hook(name):
59
+ def hook(module, inp, out):
60
+ tensor = out if isinstance(out, torch.Tensor) else out[0]
61
+ captures[name] = tensor.detach()
62
+ return hook
63
+ hooks.append(block.register_forward_hook(make_hook(layer_name)))
64
+
65
+ try:
66
+ with torch.no_grad():
67
+ if isinstance(inputs, dict):
68
+ input_ids = inputs["input_ids"]
69
+ attn = inputs.get("attention_mask")
70
+ else:
71
+ input_ids = inputs
72
+ attn = None
73
+ self.model(input_ids, attention_mask=attn)
74
+ finally:
75
+ for h in hooks:
76
+ h.remove()
77
+
78
+ records = []
79
+ for layer_name in layers:
80
+ if layer_name not in captures:
81
+ continue
82
+ records.append(ActivationRecord(
83
+ layer_name=layer_name,
84
+ activations=captures[layer_name],
85
+ meta={"kind": "residual", "arch": "kazdov-blocks"},
86
+ ))
87
+ return records
88
+
89
+ def hidden_dim(self, layer_name: str) -> int:
90
+ # Some custom models expose this as `d_model`, others as `hidden_size`.
91
+ for attr in ("d_model", "hidden_size"):
92
+ v = getattr(self.model, attr, None)
93
+ if v is not None:
94
+ return v
95
+ raise ValueError(
96
+ f"Cannot infer hidden_dim for {type(self.model).__name__}: "
97
+ f"set model.d_model or model.hidden_size, or subclass KazdovBackend "
98
+ f"and override hidden_dim()."
99
+ )
@@ -0,0 +1,76 @@
1
+ """High-level model loading helper.
2
+
3
+ Eliminates ~5 lines of HuggingFace boilerplate per example:
4
+
5
+ # Before
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer
7
+ tok = AutoTokenizer.from_pretrained(name)
8
+ if tok.pad_token is None: tok.pad_token = tok.eos_token
9
+ model = AutoModelForCausalLM.from_pretrained(name, dtype=torch.float32)
10
+ model.eval()
11
+ backend = Backend.for_model(model, hint=arch)
12
+
13
+ # After
14
+ model, tok, backend = archscope.load_model(name, arch="transformer")
15
+ """
16
+ from __future__ import annotations
17
+ from typing import Any
18
+
19
+ from .backends import Backend
20
+
21
+
22
+ def load_model(
23
+ name: str,
24
+ arch: str | None = None,
25
+ dtype: Any = None,
26
+ device: str = "cpu",
27
+ ) -> tuple[Any, Any, Backend]:
28
+ """Load a HuggingFace model + tokenizer + matching backend in one call.
29
+
30
+ Args:
31
+ name: HuggingFace model id (e.g., "EleutherAI/pythia-160m") OR a path
32
+ to a local checkpoint directory that ``AutoModelForCausalLM.from_pretrained`` can load.
33
+ arch: Backend hint — one of "transformer", "mamba", "kazdov",
34
+ "recurrent". If None, auto-detect from HF config.model_type.
35
+ dtype: torch dtype. Defaults to torch.float32.
36
+ device: device string. Default "cpu".
37
+
38
+ Returns:
39
+ (model, tokenizer, backend) ready for use in probes/sae/lens/etc.
40
+ """
41
+ import torch
42
+ from transformers import AutoModelForCausalLM, AutoTokenizer
43
+
44
+ if dtype is None:
45
+ dtype = torch.float32
46
+
47
+ tokenizer = AutoTokenizer.from_pretrained(name)
48
+ if tokenizer.pad_token is None:
49
+ tokenizer.pad_token = tokenizer.eos_token
50
+ model = AutoModelForCausalLM.from_pretrained(name, dtype=dtype)
51
+ model = model.to(device).eval()
52
+
53
+ backend = Backend.for_model(model, hint=arch)
54
+ return model, tokenizer, backend
55
+
56
+
57
+ def make_tokenize_fn(tokenizer, max_length: int = 32, attention_mask_bool: bool = False):
58
+ """Return a tokenize function suitable for ``Backend.extract`` and ``probes.fit_probe``.
59
+
60
+ Args:
61
+ tokenizer: a HuggingFace tokenizer.
62
+ max_length: max sequence length to truncate to.
63
+ attention_mask_bool: if True, returns attention_mask as a bool tensor
64
+ (required for ``kazdov`` backend); HF default is int64.
65
+
66
+ Returns:
67
+ A callable ``texts -> dict`` that matches the ``inputs`` format used
68
+ across archscope.
69
+ """
70
+ def fn(texts):
71
+ out = tokenizer(texts, return_tensors="pt", padding=True,
72
+ truncation=True, max_length=max_length)
73
+ if attention_mask_bool and "attention_mask" in out:
74
+ out["attention_mask"] = out["attention_mask"].bool()
75
+ return out
76
+ return fn
@@ -110,28 +110,72 @@ class ProbeFit:
110
110
 
111
111
 
112
112
  def _auroc(logits: torch.Tensor, labels: torch.Tensor) -> float:
113
- """Simple AUROC from logits + binary labels."""
113
+ """AUROC from logits + binary labels.
114
+
115
+ Returns 0.5 (chance) when only one class is present in `labels`
116
+ (the typical small-split case) — this is more informative than NaN
117
+ and avoids sklearn's UndefinedMetricWarning leaking to user code.
118
+ """
119
+ import warnings
114
120
  from sklearn.metrics import roc_auc_score
115
121
  scores = torch.sigmoid(logits).cpu().numpy()
116
122
  y = labels.cpu().numpy()
117
- try:
118
- return float(roc_auc_score(y, scores))
119
- except ValueError:
120
- return float("nan") # happens when only one class present
123
+ if len(set(y.tolist())) < 2:
124
+ warnings.warn(
125
+ "Only one class present in this split — AUROC undefined; "
126
+ "returning 0.5 (chance). Increase val_split or dataset size.",
127
+ stacklevel=2,
128
+ )
129
+ return 0.5
130
+ return float(roc_auc_score(y, scores))
121
131
 
122
132
 
123
133
  # High-level API matching paper
124
134
 
125
135
  def fit_probe(
126
136
  model,
127
- inputs_pos: list, # examples where target=1 (e.g., faithful)
128
- inputs_neg: list, # examples where target=0 (e.g., reasoning theater)
129
- layer_name: str,
137
+ inputs_pos=None,
138
+ inputs_neg=None,
139
+ layer_name: str = "",
130
140
  backend_hint: str | None = None,
131
141
  config: ProbeConfig | None = None,
132
142
  device: str = "cpu",
143
+ *,
144
+ tokenizer=None,
145
+ pos_texts: list[str] | None = None,
146
+ neg_texts: list[str] | None = None,
147
+ max_length: int = 32,
133
148
  ) -> ProbeFit:
134
- """End-to-end: extract activations from model, fit probe."""
149
+ """End-to-end: extract activations from a model and fit a probe.
150
+
151
+ Two calling conventions:
152
+
153
+ 1. **Pre-tokenized**: pass ``inputs_pos`` and ``inputs_neg`` as already-tokenized
154
+ dicts (with ``input_ids``, optional ``attention_mask``).
155
+
156
+ 2. **Texts + tokenizer**: pass ``tokenizer=…``, ``pos_texts=[…]``, ``neg_texts=[…]``
157
+ and archscope tokenizes for you. The kazdov backend requires a bool
158
+ attention_mask; this conversion is applied automatically when ``backend_hint="kazdov"``.
159
+
160
+ Returns:
161
+ A ``ProbeFit`` with trained probe and ``.metrics`` (train/val AUROC, loss).
162
+ """
163
+ if pos_texts is not None or neg_texts is not None:
164
+ if tokenizer is None:
165
+ raise ValueError("pos_texts/neg_texts require a tokenizer= argument")
166
+ if pos_texts is None or neg_texts is None:
167
+ raise ValueError("provide both pos_texts and neg_texts")
168
+ from .loader import make_tokenize_fn
169
+ tk = make_tokenize_fn(
170
+ tokenizer, max_length=max_length,
171
+ attention_mask_bool=(backend_hint == "kazdov"),
172
+ )
173
+ inputs_pos = tk(pos_texts)
174
+ inputs_neg = tk(neg_texts)
175
+ elif inputs_pos is None or inputs_neg is None:
176
+ raise ValueError("provide either (inputs_pos, inputs_neg) or "
177
+ "(tokenizer, pos_texts, neg_texts)")
178
+
135
179
  backend = Backend.for_model(model, hint=backend_hint)
136
180
  config = config or ProbeConfig(layer_name=layer_name)
137
181
 
File without changes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: archscope
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
5
5
  Author: Juan Cruz Dovzak
6
6
  License: Apache-2.0
@@ -11,8 +11,10 @@ src/archscope/cli.py
11
11
  src/archscope/diff.py
12
12
  src/archscope/kazdov_backend.py
13
13
  src/archscope/lens.py
14
+ src/archscope/loader.py
14
15
  src/archscope/neurons.py
15
16
  src/archscope/probes.py
17
+ src/archscope/py.typed
16
18
  src/archscope/sae.py
17
19
  src/archscope/transfer.py
18
20
  src/archscope.egg-info/PKG-INFO
@@ -11,13 +11,14 @@ import os
11
11
 
12
12
  import torch
13
13
 
14
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
14
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
15
15
 
16
16
  from archscope import circuits
17
- from archscope.kazdov_backend import load_kazdov_checkpoint
17
+ import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
18
+ from _kazdov_loader import load_kazdov_checkpoint
18
19
 
19
20
 
20
- CHECKPOINT_KAZDOV = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
21
+ CHECKPOINT_KAZDOV = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
21
22
  PYTHIA_NAME = "EleutherAI/pythia-160m"
22
23
  MAMBA_NAME = "state-spaces/mamba-130m-hf"
23
24
 
@@ -97,7 +98,7 @@ def main():
97
98
  print(" • concentration relative ≈ 0 → highly confident predictions (concentrated)")
98
99
 
99
100
  # Save
100
- out_path = "/Users/kazdov/code/OriginalKazdov/archscope/_research/circuits_3arch.json"
101
+ out_path = str(__import__("pathlib").Path(__file__).parent.parent / "_research" / "circuits_3arch.json")
101
102
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
102
103
  with open(out_path, "w") as f:
103
104
  json.dump(all_results, f, indent=2, default=str)
@@ -11,7 +11,7 @@ import time
11
11
  import copy
12
12
  import torch
13
13
 
14
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
14
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
15
15
 
16
16
  from archscope import diff
17
17
 
@@ -9,14 +9,15 @@ import sys
9
9
  import time
10
10
  import torch
11
11
 
12
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
12
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
13
13
 
14
14
  from archscope import probes, sae, neurons
15
15
  from archscope.backends import Backend
16
- from archscope.kazdov_backend import load_kazdov_checkpoint
16
+ import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
17
+ from _kazdov_loader import load_kazdov_checkpoint
17
18
 
18
19
 
19
- CHECKPOINT = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
20
+ CHECKPOINT = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
20
21
 
21
22
 
22
23
  def tokenize(tokenizer, texts: list[str]) -> dict:
@@ -9,7 +9,7 @@ import sys
9
9
  import time
10
10
  import torch
11
11
 
12
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
12
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
13
13
 
14
14
  from archscope import lens
15
15
 
@@ -10,7 +10,7 @@ import sys
10
10
  import time
11
11
  import torch
12
12
 
13
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
13
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
14
14
 
15
15
  from archscope import probes, sae, neurons
16
16
  from archscope.backends import Backend
@@ -11,7 +11,7 @@ import sys
11
11
  import time
12
12
  import torch
13
13
 
14
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
14
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
15
15
 
16
16
  from archscope import sae
17
17
  from archscope.backends import Backend
@@ -14,14 +14,15 @@ import sys
14
14
  import time
15
15
  import torch
16
16
 
17
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
17
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
18
18
 
19
19
  from archscope import transfer
20
20
  from archscope.backends import Backend
21
- from archscope.kazdov_backend import load_kazdov_checkpoint
21
+ import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
22
+ from _kazdov_loader import load_kazdov_checkpoint
22
23
 
23
24
 
24
- CHECKPOINT_KAZDOV = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
25
+ CHECKPOINT_KAZDOV = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
25
26
  PYTHIA_NAME = "EleutherAI/pythia-160m"
26
27
 
27
28
 
@@ -13,7 +13,7 @@ import sys
13
13
  import time
14
14
  import torch
15
15
 
16
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
16
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
17
17
 
18
18
  from archscope import probes, sae, neurons, attribute
19
19
  from archscope.backends import Backend
@@ -22,7 +22,56 @@ def test_imports():
22
22
  import archscope
23
23
  from archscope import (probes, sae, neurons, attribute, backends,
24
24
  circuits, transfer, bench, lens, diff)
25
- assert archscope.__version__ == "0.2.2"
25
+ assert archscope.__version__ == "0.2.4"
26
+
27
+
28
+ def test_loader_exports():
29
+ """load_model and make_tokenize_fn are exported at top level."""
30
+ import archscope
31
+ assert hasattr(archscope, "load_model")
32
+ assert hasattr(archscope, "make_tokenize_fn")
33
+ assert callable(archscope.load_model)
34
+ assert callable(archscope.make_tokenize_fn)
35
+
36
+
37
+ def test_layer_name_validation_clear_error():
38
+ """Backend validates layer names with an informative error."""
39
+ from archscope.backends import Backend, ActivationRecord
40
+
41
+ # Build a minimal mock backend
42
+ class _MockBackend(Backend):
43
+ def layer_names(self): return ["layer_0.residual", "layer_1.residual"]
44
+ def extract(self, inputs, layers=None):
45
+ layers = layers or self.layer_names()
46
+ self._validate_layers(layers)
47
+ return []
48
+ def hidden_dim(self, layer_name): return 8
49
+
50
+ b = _MockBackend(model=None)
51
+ # Valid layer → no error
52
+ b.extract({}, layers=["layer_0.residual"])
53
+ # Invalid layer → clear error message
54
+ try:
55
+ b.extract({}, layers=["layer_99.residual"])
56
+ except ValueError as e:
57
+ assert "Unknown layer name" in str(e)
58
+ assert "layer_0.residual" in str(e) # shows valid example
59
+ return
60
+ raise AssertionError("Expected ValueError for invalid layer name")
61
+
62
+
63
+ def test_auroc_returns_chance_on_single_class():
64
+ """_auroc returns 0.5 instead of NaN when only one class is present."""
65
+ import warnings
66
+ import torch
67
+ from archscope.probes import _auroc
68
+
69
+ logits = torch.tensor([0.5, 0.2, -0.1])
70
+ labels = torch.tensor([1.0, 1.0, 1.0]) # only one class
71
+ with warnings.catch_warnings():
72
+ warnings.simplefilter("ignore")
73
+ result = _auroc(logits, labels)
74
+ assert result == 0.5
26
75
 
27
76
 
28
77
  def test_diff_dataclasses():
@@ -1,30 +0,0 @@
1
- """archscope: unified mech interp toolkit across small + RNN + transformer.
2
-
3
- Four core methods unified under a single API:
4
- - probes: linear/MLP probes over hidden states (Drop the Act inspired)
5
- - sae: sparse autoencoders for residual + recurrent state (WriteSAE)
6
- - neurons: targeted neuron modulation via contrastive search (Nous Research)
7
- - attribute: activation patching + DIM decomposition (Multi-Agent Sycophancy)
8
-
9
- Each method exposes the same architecture-agnostic API:
10
- - .extract(model, inputs) -> hidden states / activations
11
- - .fit(activations, labels) -> learned tool
12
- - .apply(model, inputs) -> modified outputs / scores / explanations
13
-
14
- Designed for cross-architecture comparison: transformer, Mamba/SSM, custom RNN.
15
- """
16
-
17
- __version__ = "0.2.2"
18
-
19
- from . import probes, sae, neurons, attribute, backends, circuits, transfer, bench, lens, diff
20
-
21
- # Kazdov backend registers itself on import — optional, only if kazdov repo present
22
- try:
23
- from . import kazdov_backend # noqa: F401
24
- except ImportError:
25
- pass
26
-
27
- __all__ = [
28
- "probes", "sae", "neurons", "attribute", "backends",
29
- "circuits", "transfer", "bench", "lens", "diff", "__version__",
30
- ]
@@ -1,141 +0,0 @@
1
- """Backend for kazdov-α (and related Kazdov family models).
2
-
3
- Kazdov-α is a transformer-style decoder LM with hybrid attention (MoBE-BCN
4
- mixture of bilinear experts + standard MHA in parallel). Architecturally
5
- closer to standard transformer than to pure RNN/SSM — but the BCN attention
6
- branch makes it a distinct architecture family for cross-arch interp.
7
-
8
- Differences from HF transformer:
9
- - No HF AutoModelForCausalLM interface (custom forward signature)
10
- - Layers exposed as `model.blocks` (ModuleList)
11
- - No `output_hidden_states=True` argument — we capture via forward hooks
12
- - Forward signature: (input_ids, attention_mask=None, labels=None)
13
- """
14
- from __future__ import annotations
15
- import sys
16
- from pathlib import Path
17
- import torch
18
-
19
- from .backends import Backend, ActivationRecord
20
-
21
-
22
- KAZDOV_REPO = Path.home() / "code" / "OriginalKazdov" / "kazdov"
23
-
24
-
25
- def _ensure_kazdov_importable():
26
- """Add kazdov repo to sys.path so we can import KazdovLM."""
27
- p = str(KAZDOV_REPO)
28
- if p not in sys.path:
29
- sys.path.insert(0, p)
30
-
31
-
32
- def load_kazdov_checkpoint(checkpoint_path: str | Path, device: str = "cpu"):
33
- """Load kazdov-α from a checkpoint directory.
34
-
35
- Expects: config.json + final.pt (or latest.pt) in the directory.
36
- Returns: (model in eval mode, tokenizer wrapper).
37
- """
38
- _ensure_kazdov_importable()
39
- from kazdov.kazdov_lm import KazdovLM
40
- import json
41
-
42
- ckpt_dir = Path(checkpoint_path)
43
- config = json.loads((ckpt_dir / "config.json").read_text())
44
- model_cfg = config["model_cfg"]
45
-
46
- model = KazdovLM(
47
- vocab_size=model_cfg["vocab_size"],
48
- d_model=model_cfg["d_model"],
49
- n_layers=model_cfg["n_layers"],
50
- n_heads=model_cfg["n_heads"],
51
- rank=model_cfg["rank"],
52
- mlp_dim=model_cfg.get("mlp_dim"),
53
- max_len=model_cfg.get("max_len", 256),
54
- use_trilinear=model_cfg.get("use_trilinear", False),
55
- use_bi_bcn=model_cfg.get("use_bi_bcn", False),
56
- use_hybrid_mha=model_cfg.get("use_hybrid_mha", True),
57
- use_mobe=model_cfg.get("use_mobe", False),
58
- n_experts=model_cfg.get("n_experts", 1),
59
- )
60
-
61
- # Try final.pt then latest.pt
62
- for fname in ("final.pt", "latest.pt"):
63
- f = ckpt_dir / fname
64
- if f.exists():
65
- state = torch.load(f, map_location=device, weights_only=False)
66
- if isinstance(state, dict) and "model" in state:
67
- state = state["model"]
68
- model.load_state_dict(state, strict=False)
69
- break
70
- else:
71
- raise FileNotFoundError(f"No final.pt or latest.pt in {ckpt_dir}")
72
-
73
- model.to(device).eval()
74
-
75
- # Tokenizer: kazdov used GPT-2 tokenizer per memory
76
- from transformers import GPT2Tokenizer
77
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
78
- if tokenizer.pad_token is None:
79
- tokenizer.pad_token = tokenizer.eos_token
80
-
81
- return model, tokenizer
82
-
83
-
84
- @Backend.register("kazdov")
85
- class KazdovBackend(Backend):
86
- """Backend for kazdov-family models (KazdovLM, MoBE-BCN variants).
87
-
88
- Uses forward hooks to capture residual stream after each KazdovBlock,
89
- since the model doesn't expose output_hidden_states.
90
- """
91
-
92
- def layer_names(self) -> list[str]:
93
- n_layers = len(self.model.blocks)
94
- return [f"layer_{i}.residual" for i in range(n_layers)]
95
-
96
- def extract(self, inputs, layers=None):
97
- layers = layers or self.layer_names()
98
- captures: dict[str, torch.Tensor] = {}
99
-
100
- # Register a forward hook on each requested block.
101
- hooks = []
102
- for layer_name in layers:
103
- idx = int(layer_name.split("_")[1].split(".")[0])
104
- if idx >= len(self.model.blocks):
105
- continue
106
- block = self.model.blocks[idx]
107
-
108
- def make_hook(name):
109
- def hook(module, inp, out):
110
- tensor = out if isinstance(out, torch.Tensor) else out[0]
111
- captures[name] = tensor.detach()
112
- return hook
113
- hooks.append(block.register_forward_hook(make_hook(layer_name)))
114
-
115
- try:
116
- # Kazdov forward signature: model(input_ids, attention_mask=None)
117
- with torch.no_grad():
118
- if isinstance(inputs, dict):
119
- input_ids = inputs["input_ids"]
120
- attn = inputs.get("attention_mask")
121
- else:
122
- input_ids = inputs
123
- attn = None
124
- self.model(input_ids, attention_mask=attn)
125
- finally:
126
- for h in hooks:
127
- h.remove()
128
-
129
- records = []
130
- for layer_name in layers:
131
- if layer_name not in captures:
132
- continue
133
- records.append(ActivationRecord(
134
- layer_name=layer_name,
135
- activations=captures[layer_name],
136
- meta={"kind": "residual", "arch": "kazdov-mobe-bcn"},
137
- ))
138
- return records
139
-
140
- def hidden_dim(self, layer_name: str) -> int:
141
- return self.model.d_model
File without changes
File without changes
File without changes