archscope 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {archscope-0.2.3/src/archscope.egg-info → archscope-0.2.5}/PKG-INFO +8 -6
  2. {archscope-0.2.3 → archscope-0.2.5}/README.md +7 -5
  3. {archscope-0.2.3 → archscope-0.2.5}/pyproject.toml +1 -1
  4. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/__init__.py +3 -6
  5. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/attribute.py +15 -1
  6. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/backends.py +49 -12
  7. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/circuits.py +11 -3
  8. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/cli.py +11 -0
  9. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/diff.py +4 -0
  10. archscope-0.2.5/src/archscope/kazdov_backend.py +99 -0
  11. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/neurons.py +9 -2
  12. {archscope-0.2.3 → archscope-0.2.5/src/archscope.egg-info}/PKG-INFO +8 -6
  13. {archscope-0.2.3 → archscope-0.2.5}/tests/test_circuits_3arch.py +5 -4
  14. {archscope-0.2.3 → archscope-0.2.5}/tests/test_diff.py +1 -1
  15. {archscope-0.2.3 → archscope-0.2.5}/tests/test_kazdov_integration.py +4 -3
  16. {archscope-0.2.3 → archscope-0.2.5}/tests/test_lens.py +1 -1
  17. {archscope-0.2.3 → archscope-0.2.5}/tests/test_mamba_integration.py +1 -1
  18. {archscope-0.2.3 → archscope-0.2.5}/tests/test_mamba_ssm_state.py +1 -1
  19. {archscope-0.2.3 → archscope-0.2.5}/tests/test_probe_transfer.py +4 -3
  20. {archscope-0.2.3 → archscope-0.2.5}/tests/test_pythia_end_to_end.py +1 -1
  21. {archscope-0.2.3 → archscope-0.2.5}/tests/test_unit.py +61 -11
  22. archscope-0.2.3/src/archscope/kazdov_backend.py +0 -142
  23. {archscope-0.2.3 → archscope-0.2.5}/LICENSE +0 -0
  24. {archscope-0.2.3 → archscope-0.2.5}/setup.cfg +0 -0
  25. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/_utils.py +0 -0
  26. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/bench.py +0 -0
  27. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/lens.py +0 -0
  28. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/loader.py +0 -0
  29. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/probes.py +0 -0
  30. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/py.typed +0 -0
  31. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/sae.py +0 -0
  32. {archscope-0.2.3 → archscope-0.2.5}/src/archscope/transfer.py +0 -0
  33. {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/SOURCES.txt +0 -0
  34. {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/dependency_links.txt +0 -0
  35. {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/entry_points.txt +0 -0
  36. {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/requires.txt +0 -0
  37. {archscope-0.2.3 → archscope-0.2.5}/src/archscope.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: archscope
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
5
5
  Author: Juan Cruz Dovzak
6
6
  License: Apache-2.0
@@ -96,12 +96,14 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
96
96
 
97
97
  ### Backends
98
98
 
99
- | Backend | Models | Specific |
99
+ | Backend | Auto-detected `model_type` | What you get |
100
100
  |---|---|---|
101
- | `transformer` | Pythia, GPT-2, Llama, Mistral, Qwen, MPT, Falcon, GPT-Neo | residual stream |
102
- | `mamba` | Mamba, Mamba-2 | residual + explicit `.ssm_state` (recurrent h_t) |
103
- | `kazdov` | Kazdov-α hybrid MoBE-BCN+MHA | residual per custom block |
104
- | `recurrent` | Generic RNN (user subclass) | hidden state per layer |
101
+ | `transformer` | `llama`, `mistral`, `qwen2`, `qwen3`, `gpt2`, `gpt_neox` (Pythia), `gpt_neo`, `gptj`, `falcon`, `mpt`, `bloom`, `opt`, `phi`, `phi3`, `gemma`, `gemma2`, `starcoder2` | residual stream per layer |
102
+ | `mamba` | `mamba`, `mamba2` | residual + explicit `.ssm_state` (recurrent h_t) |
103
+ | `kazdov` | (pass `hint="kazdov"`) | residual per custom block |
104
+ | `recurrent` | (pass `hint="recurrent"`, subclass for full extract) | hidden state per layer |
105
+
106
+ If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
105
107
 
106
108
  ---
107
109
 
@@ -59,12 +59,14 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
59
59
 
60
60
  ### Backends
61
61
 
62
- | Backend | Models | Specific |
62
+ | Backend | Auto-detected `model_type` | What you get |
63
63
  |---|---|---|
64
- | `transformer` | Pythia, GPT-2, Llama, Mistral, Qwen, MPT, Falcon, GPT-Neo | residual stream |
65
- | `mamba` | Mamba, Mamba-2 | residual + explicit `.ssm_state` (recurrent h_t) |
66
- | `kazdov` | Kazdov-α hybrid MoBE-BCN+MHA | residual per custom block |
67
- | `recurrent` | Generic RNN (user subclass) | hidden state per layer |
64
+ | `transformer` | `llama`, `mistral`, `qwen2`, `qwen3`, `gpt2`, `gpt_neox` (Pythia), `gpt_neo`, `gptj`, `falcon`, `mpt`, `bloom`, `opt`, `phi`, `phi3`, `gemma`, `gemma2`, `starcoder2` | residual stream per layer |
65
+ | `mamba` | `mamba`, `mamba2` | residual + explicit `.ssm_state` (recurrent h_t) |
66
+ | `kazdov` | (pass `hint="kazdov"`) | residual per custom block |
67
+ | `recurrent` | (pass `hint="recurrent"`, subclass for full extract) | hidden state per layer |
68
+
69
+ If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
68
70
 
69
71
  ---
70
72
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "archscope"
3
- version = "0.2.3"
3
+ version = "0.2.5"
4
4
  description = "Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models"
5
5
  readme = "README.md"
6
6
  authors = [{name = "Juan Cruz Dovzak"}]
@@ -25,16 +25,13 @@ Quick start::
25
25
  print(result.to_markdown())
26
26
  """
27
27
 
28
- __version__ = "0.2.3"
28
+ __version__ = "0.2.5"
29
29
 
30
30
  from . import probes, sae, neurons, attribute, backends, circuits, transfer, bench, lens, diff
31
31
  from .loader import load_model, make_tokenize_fn
32
32
 
33
- # Kazdov backend registers itself on import — optional, only if kazdov repo present
34
- try:
35
- from . import kazdov_backend # noqa: F401
36
- except ImportError:
37
- pass
33
+ # Custom-architecture backend ("kazdov" generic blocks-based, see kazdov_backend.py)
34
+ from . import kazdov_backend # noqa: F401
38
35
 
39
36
  __all__ = [
40
37
  "probes", "sae", "neurons", "attribute", "backends",
@@ -66,6 +66,17 @@ def activation_patch(
66
66
  Returns:
67
67
  PatchResult with the fraction of behavioral gap closed by patching.
68
68
  """
69
+ # Source and target must have matching shape — the patched-in activation
70
+ # is installed via a forward hook that expects the target's (B, T, H).
71
+ src_ids = prompt_source.get("input_ids") if isinstance(prompt_source, dict) else None
72
+ tgt_ids = prompt_target.get("input_ids") if isinstance(prompt_target, dict) else None
73
+ if src_ids is not None and tgt_ids is not None and src_ids.shape != tgt_ids.shape:
74
+ raise ValueError(
75
+ f"activation_patch: prompt_source and prompt_target must have "
76
+ f"matching input_ids shape; got source={tuple(src_ids.shape)} "
77
+ f"vs target={tuple(tgt_ids.shape)}. Pad/truncate to the same length."
78
+ )
79
+
69
80
  backend = Backend.for_model(model, hint=backend_hint)
70
81
  layer_names = [f"layer_{i}.residual" for i in layer_indices]
71
82
 
@@ -156,7 +167,10 @@ def dim_decompose(
156
167
  captured: list = []
157
168
 
158
169
  def capture(mod, inp, out, store=captured):
159
- store.append(out[0] if isinstance(out, tuple) else out)
170
+ # CRITICAL: detach + clone so the captured tensor isn't
171
+ # overwritten by a later forward pass that reuses module buffers.
172
+ tensor = out[0] if isinstance(out, tuple) else out
173
+ store.append(tensor.detach().clone())
160
174
  capture_hooks.append(module.register_forward_hook(capture))
161
175
  src_acts_by_layer[idx] = captured
162
176
 
@@ -44,20 +44,57 @@ class Backend(abc.ABC):
44
44
  return klass
45
45
  return deco
46
46
 
47
+ # HF model_type → backend name. Transformer family covers most HF decoder LMs;
48
+ # add new families here as they ship. Auto-detect intentionally raises when
49
+ # nothing matches (silent fallback caused real bugs in v0.2.4).
50
+ _AUTODETECT = {
51
+ # transformer family
52
+ "llama": "transformer",
53
+ "mistral": "transformer",
54
+ "qwen2": "transformer",
55
+ "qwen3": "transformer",
56
+ "gpt2": "transformer",
57
+ "gpt_neox": "transformer", # Pythia uses gpt_neox
58
+ "gpt_neo": "transformer",
59
+ "gptj": "transformer",
60
+ "falcon": "transformer",
61
+ "mpt": "transformer",
62
+ "bloom": "transformer",
63
+ "opt": "transformer",
64
+ "phi": "transformer",
65
+ "phi3": "transformer",
66
+ "gemma": "transformer",
67
+ "gemma2": "transformer",
68
+ "starcoder2": "transformer",
69
+ # SSM family
70
+ "mamba": "mamba",
71
+ "mamba2": "mamba",
72
+ }
73
+
47
74
  @classmethod
48
75
  def for_model(cls, model: Any, hint: str | None = None) -> "Backend":
49
- """Auto-detect or use hint to select backend."""
50
- if hint and hint in cls._registry:
51
- return cls._registry[hint](model)
52
- # Auto-detect via attribute introspection
53
- if hasattr(model, "config") and getattr(model.config, "model_type", None) in ("llama", "gpt2", "qwen2", "qwen3"):
54
- return cls._registry["transformer"](model)
55
- if hasattr(model, "config") and getattr(model.config, "model_type", "") in ("mamba", "mamba2"):
56
- return cls._registry["mamba"](model)
57
- # Default fallback
58
- if "recurrent" in cls._registry:
59
- return cls._registry["recurrent"](model)
60
- raise ValueError(f"No backend matches model {type(model).__name__}. Register via Backend.register('name').")
76
+ """Auto-detect (or use hint) to select a backend.
77
+
78
+ Raises ValueError if no hint is provided and the model's ``config.model_type``
79
+ is not in the autodetect table. Pass ``hint=...`` explicitly for any model
80
+ that's not auto-detected, or register a custom backend via
81
+ ``Backend.register('name')``.
82
+ """
83
+ if hint:
84
+ if hint in cls._registry:
85
+ return cls._registry[hint](model)
86
+ raise ValueError(
87
+ f"Unknown backend hint '{hint}'. Registered: {sorted(cls._registry)}"
88
+ )
89
+ model_type = getattr(getattr(model, "config", None), "model_type", None)
90
+ if model_type in cls._AUTODETECT:
91
+ return cls._registry[cls._AUTODETECT[model_type]](model)
92
+ raise ValueError(
93
+ f"No backend matches model with config.model_type={model_type!r} "
94
+ f"(type {type(model).__name__}). Pass hint=... explicitly, or "
95
+ f"register a custom backend via Backend.register('name'). "
96
+ f"Auto-detected types: {sorted(cls._AUTODETECT)}"
97
+ )
61
98
 
62
99
  def __init__(self, model: Any):
63
100
  self.model = model
@@ -154,12 +154,20 @@ def copy_score(
154
154
  words = rng.sample(word_pool, n_words)
155
155
  prompt = f"list: {' '.join(words)}. list: "
156
156
  ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
157
- ids.shape[1]
158
157
 
159
- # Token IDs for target words (first token of each)
158
+ # Different tokenizers handle whitespace differently:
159
+ # - BPE (GPT-2 / NeoX / Pythia / Llama-2): " word" → leading-space token
160
+ # - SentencePiece (Llama-3, Qwen, T5): "▁word" → leading-underscore token
161
+ # Try " word" first; fall back to bare word for tokenizers that don't
162
+ # use a space prefix.
160
163
  target_tokens = []
161
164
  for w in words:
162
- target_tokens.append(tokenizer(" " + w, add_special_tokens=False).input_ids[0])
165
+ ids_w = tokenizer(" " + w, add_special_tokens=False).input_ids
166
+ if not ids_w:
167
+ ids_w = tokenizer(w, add_special_tokens=False).input_ids
168
+ if not ids_w:
169
+ continue # pathological; skip
170
+ target_tokens.append(ids_w[0])
163
171
 
164
172
  # Autoregressively predict n_words tokens, chaining the model's own
165
173
  # predictions (not teacher-forcing) — measures cumulative copy ability.
@@ -89,6 +89,16 @@ def bench(model_name: str, arch: str, out: str | None) -> None:
89
89
  return tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=32)
90
90
 
91
91
  arch_family = {"transformer": "transformer", "mamba": "ssm", "kazdov": "hybrid"}[arch]
92
+
93
+ # For Mamba, pick a representative SSM-state layer at mid-depth so the
94
+ # ssm_state_variance_ratio metric is populated (otherwise bench returns NaN).
95
+ extra: dict = {}
96
+ if arch == "mamba":
97
+ from .backends import Backend
98
+ backend = Backend.for_model(model, hint="mamba")
99
+ n_residual = sum(1 for ln in backend.layer_names() if ".residual" in ln)
100
+ extra["ssm_layer"] = max(0, n_residual // 2)
101
+
92
102
  profile = bench_mod.benchmark(
93
103
  model_name=model_name,
94
104
  model=model,
@@ -96,6 +106,7 @@ def bench(model_name: str, arch: str, out: str | None) -> None:
96
106
  backend_hint=arch,
97
107
  arch_family=arch_family,
98
108
  tokenize_fn=tokenize_fn,
109
+ **extra,
99
110
  )
100
111
 
101
112
  markdown = bench_mod.profile_to_markdown(profile)
@@ -155,6 +155,10 @@ def compare(
155
155
  raise ValueError("base and fine_tuned have different layer structure — "
156
156
  "they must share architecture")
157
157
 
158
+ # Ensure tokenizer has a pad token (GPT-2 family ships without one).
159
+ if getattr(tokenizer, "pad_token", None) is None and getattr(tokenizer, "eos_token", None) is not None:
160
+ tokenizer.pad_token = tokenizer.eos_token
161
+
158
162
  # Tokenize calibration
159
163
  enc = tokenizer(calibration_texts, return_tensors="pt", padding=True,
160
164
  truncation=True, max_length=max_length)
@@ -0,0 +1,99 @@
1
+ """Backend for custom architectures that expose layers via ``model.blocks``.
2
+
3
+ Originally written for kazdov-α (a transformer-style decoder LM with hybrid
4
+ MoBE-BCN + MHA attention) — but the backend is generic. It works for ANY
5
+ PyTorch model where:
6
+
7
+ - residual blocks are exposed as ``model.blocks`` (a ``nn.ModuleList``)
8
+ - ``model.d_model`` (or ``model.hidden_size``) is set on the model
9
+ - forward signature is ``model(input_ids, attention_mask=None, ...)``
10
+
11
+ This is the simplest pattern for registering a custom architecture with
12
+ archscope. If your model uses a different convention (e.g., ``model.layers``
13
+ under another parent), subclass ``Backend`` directly — this module is a
14
+ working example.
15
+
16
+ The backend registers under the name ``"kazdov"`` for historical reasons.
17
+ It used to be coupled to a private model-loading function; that function
18
+ was moved out of the shipped package since it depended on a private
19
+ repository. To load your own custom model, do it yourself and then call
20
+ ``Backend.for_model(model, hint="kazdov")``.
21
+ """
22
+ from __future__ import annotations
23
+ import torch
24
+
25
+ from .backends import Backend, ActivationRecord
26
+
27
+
28
+ @Backend.register("kazdov")
29
+ class KazdovBackend(Backend):
30
+ """Generic backend for models exposing layers via ``model.blocks``.
31
+
32
+ Captures the output of each block via forward hooks (the model is
33
+ expected to not implement ``output_hidden_states=True`` natively).
34
+
35
+ Requirements on the model:
36
+ - ``model.blocks`` is a ``nn.ModuleList`` of residual blocks.
37
+ - ``model.d_model`` or ``model.hidden_size`` is set.
38
+ - ``model(input_ids, attention_mask=...)`` is the forward signature.
39
+ """
40
+
41
+ def layer_names(self) -> list[str]:
42
+ n_layers = len(self.model.blocks)
43
+ return [f"layer_{i}.residual" for i in range(n_layers)]
44
+
45
+ def extract(self, inputs, layers=None):
46
+ layers = layers or self.layer_names()
47
+ self._validate_layers(layers)
48
+ captures: dict[str, torch.Tensor] = {}
49
+
50
+ # Register a forward hook on each requested block.
51
+ hooks = []
52
+ for layer_name in layers:
53
+ idx = int(layer_name.split("_")[1].split(".")[0])
54
+ if idx >= len(self.model.blocks):
55
+ continue
56
+ block = self.model.blocks[idx]
57
+
58
+ def make_hook(name):
59
+ def hook(module, inp, out):
60
+ tensor = out if isinstance(out, torch.Tensor) else out[0]
61
+ captures[name] = tensor.detach()
62
+ return hook
63
+ hooks.append(block.register_forward_hook(make_hook(layer_name)))
64
+
65
+ try:
66
+ with torch.no_grad():
67
+ if isinstance(inputs, dict):
68
+ input_ids = inputs["input_ids"]
69
+ attn = inputs.get("attention_mask")
70
+ else:
71
+ input_ids = inputs
72
+ attn = None
73
+ self.model(input_ids, attention_mask=attn)
74
+ finally:
75
+ for h in hooks:
76
+ h.remove()
77
+
78
+ records = []
79
+ for layer_name in layers:
80
+ if layer_name not in captures:
81
+ continue
82
+ records.append(ActivationRecord(
83
+ layer_name=layer_name,
84
+ activations=captures[layer_name],
85
+ meta={"kind": "residual", "arch": "kazdov-blocks"},
86
+ ))
87
+ return records
88
+
89
+ def hidden_dim(self, layer_name: str) -> int:
90
+ # Some custom models expose this as `d_model`, others as `hidden_size`.
91
+ for attr in ("d_model", "hidden_size"):
92
+ v = getattr(self.model, attr, None)
93
+ if v is not None:
94
+ return v
95
+ raise ValueError(
96
+ f"Cannot infer hidden_dim for {type(self.model).__name__}: "
97
+ f"set model.d_model or model.hidden_size, or subclass KazdovBackend "
98
+ f"and override hidden_dim()."
99
+ )
@@ -17,7 +17,7 @@ from ._utils import resolve_layer_module
17
17
  @dataclass
18
18
  class NeuronEditConfig:
19
19
  top_frac: float = 0.001 # top 0.1% by default
20
- layer_filter: str | None = None # e.g., "mlp" to restrict to MLP neurons
20
+ layer_filter: str | None = None # substring filter on layer_names() (e.g. "residual")
21
21
  mode: str = "scalar" # "scalar" (multiply by m) or "ablate" (m=0)
22
22
 
23
23
 
@@ -87,8 +87,15 @@ def find_neurons(
87
87
  config = config or NeuronEditConfig()
88
88
  backend = Backend.for_model(model, hint=backend_hint)
89
89
 
90
- # Get all layers (will filter to MLP later if requested)
91
90
  all_layers = backend.layer_names()
91
+ if config.layer_filter is not None:
92
+ all_layers = [ln for ln in all_layers if config.layer_filter in ln]
93
+ if not all_layers:
94
+ raise ValueError(
95
+ f"layer_filter={config.layer_filter!r} matched no layers. "
96
+ f"Available substrings include: "
97
+ f"{sorted({ln.split('.', 1)[-1] for ln in backend.layer_names()})}"
98
+ )
92
99
 
93
100
  # Forward both classes, collect final-token activations
94
101
  harm_acts = backend.extract(inputs_harmful, layers=all_layers)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: archscope
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
5
5
  Author: Juan Cruz Dovzak
6
6
  License: Apache-2.0
@@ -96,12 +96,14 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
96
96
 
97
97
  ### Backends
98
98
 
99
- | Backend | Models | Specific |
99
+ | Backend | Auto-detected `model_type` | What you get |
100
100
  |---|---|---|
101
- | `transformer` | Pythia, GPT-2, Llama, Mistral, Qwen, MPT, Falcon, GPT-Neo | residual stream |
102
- | `mamba` | Mamba, Mamba-2 | residual + explicit `.ssm_state` (recurrent h_t) |
103
- | `kazdov` | Kazdov-α hybrid MoBE-BCN+MHA | residual per custom block |
104
- | `recurrent` | Generic RNN (user subclass) | hidden state per layer |
101
+ | `transformer` | `llama`, `mistral`, `qwen2`, `qwen3`, `gpt2`, `gpt_neox` (Pythia), `gpt_neo`, `gptj`, `falcon`, `mpt`, `bloom`, `opt`, `phi`, `phi3`, `gemma`, `gemma2`, `starcoder2` | residual stream per layer |
102
+ | `mamba` | `mamba`, `mamba2` | residual + explicit `.ssm_state` (recurrent h_t) |
103
+ | `kazdov` | (pass `hint="kazdov"`) | residual per custom block |
104
+ | `recurrent` | (pass `hint="recurrent"`, subclass for full extract) | hidden state per layer |
105
+
106
+ If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
105
107
 
106
108
  ---
107
109
 
@@ -11,13 +11,14 @@ import os
11
11
 
12
12
  import torch
13
13
 
14
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
14
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
15
15
 
16
16
  from archscope import circuits
17
- from archscope.kazdov_backend import load_kazdov_checkpoint
17
+ import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
18
+ from _kazdov_loader import load_kazdov_checkpoint
18
19
 
19
20
 
20
- CHECKPOINT_KAZDOV = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
21
+ CHECKPOINT_KAZDOV = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
21
22
  PYTHIA_NAME = "EleutherAI/pythia-160m"
22
23
  MAMBA_NAME = "state-spaces/mamba-130m-hf"
23
24
 
@@ -97,7 +98,7 @@ def main():
97
98
  print(" • concentration relative ≈ 0 → highly confident predictions (concentrated)")
98
99
 
99
100
  # Save
100
- out_path = "/Users/kazdov/code/OriginalKazdov/archscope/_research/circuits_3arch.json"
101
+ out_path = str(__import__("pathlib").Path(__file__).parent.parent / "_research" / "circuits_3arch.json")
101
102
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
102
103
  with open(out_path, "w") as f:
103
104
  json.dump(all_results, f, indent=2, default=str)
@@ -11,7 +11,7 @@ import time
11
11
  import copy
12
12
  import torch
13
13
 
14
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
14
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
15
15
 
16
16
  from archscope import diff
17
17
 
@@ -9,14 +9,15 @@ import sys
9
9
  import time
10
10
  import torch
11
11
 
12
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
12
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
13
13
 
14
14
  from archscope import probes, sae, neurons
15
15
  from archscope.backends import Backend
16
- from archscope.kazdov_backend import load_kazdov_checkpoint
16
+ import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
17
+ from _kazdov_loader import load_kazdov_checkpoint
17
18
 
18
19
 
19
- CHECKPOINT = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
20
+ CHECKPOINT = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
20
21
 
21
22
 
22
23
  def tokenize(tokenizer, texts: list[str]) -> dict:
@@ -9,7 +9,7 @@ import sys
9
9
  import time
10
10
  import torch
11
11
 
12
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
12
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
13
13
 
14
14
  from archscope import lens
15
15
 
@@ -10,7 +10,7 @@ import sys
10
10
  import time
11
11
  import torch
12
12
 
13
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
13
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
14
14
 
15
15
  from archscope import probes, sae, neurons
16
16
  from archscope.backends import Backend
@@ -11,7 +11,7 @@ import sys
11
11
  import time
12
12
  import torch
13
13
 
14
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
14
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
15
15
 
16
16
  from archscope import sae
17
17
  from archscope.backends import Backend
@@ -14,14 +14,15 @@ import sys
14
14
  import time
15
15
  import torch
16
16
 
17
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
17
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
18
18
 
19
19
  from archscope import transfer
20
20
  from archscope.backends import Backend
21
- from archscope.kazdov_backend import load_kazdov_checkpoint
21
+ import sys as _sys; _sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "scripts"))
22
+ from _kazdov_loader import load_kazdov_checkpoint
22
23
 
23
24
 
24
- CHECKPOINT_KAZDOV = "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha"
25
+ CHECKPOINT_KAZDOV = __import__("os").environ.get("KAZDOV_CHECKPOINT", "/Users/kazdov/code/OriginalKazdov/_models/kazdov-98m-alpha")
25
26
  PYTHIA_NAME = "EleutherAI/pythia-160m"
26
27
 
27
28
 
@@ -13,7 +13,7 @@ import sys
13
13
  import time
14
14
  import torch
15
15
 
16
- sys.path.insert(0, "/Users/kazdov/code/OriginalKazdov/archscope/src")
16
+ sys.path.insert(0, str(__import__("pathlib").Path(__file__).parent.parent / "src"))
17
17
 
18
18
  from archscope import probes, sae, neurons, attribute
19
19
  from archscope.backends import Backend
@@ -20,9 +20,9 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
20
20
  def test_imports():
21
21
  """All modules import without errors."""
22
22
  import archscope
23
- from archscope import (probes, sae, neurons, attribute, backends,
24
- circuits, transfer, bench, lens, diff)
25
- assert archscope.__version__ == "0.2.3"
23
+ from archscope import (probes, sae, neurons, attribute, backends, # noqa: F401
24
+ circuits, transfer, bench, lens, diff) # noqa: F401
25
+ assert archscope.__version__ == "0.2.5"
26
26
 
27
27
 
28
28
  def test_loader_exports():
@@ -36,7 +36,7 @@ def test_loader_exports():
36
36
 
37
37
  def test_layer_name_validation_clear_error():
38
38
  """Backend validates layer names with an informative error."""
39
- from archscope.backends import Backend, ActivationRecord
39
+ from archscope.backends import Backend
40
40
 
41
41
  # Build a minimal mock backend
42
42
  class _MockBackend(Backend):
@@ -133,14 +133,12 @@ def test_backend_registry():
133
133
  assert name in Backend._registry, f"{name} not registered"
134
134
 
135
135
 
136
- def test_kazdov_backend_registers_when_available():
137
- """KazdovBackend optional import succeeds."""
136
+ def test_kazdov_backend_registers():
137
+ """KazdovBackend is always registered (generic blocks-based backend)."""
138
138
  from archscope.backends import Backend
139
- # kazdov_backend imports at __init__ and registers — it's optional
140
- if "kazdov" in Backend._registry:
141
- # If kazdov repo is importable, backend should be there
142
- from archscope.kazdov_backend import KazdovBackend
143
- assert KazdovBackend is Backend._registry["kazdov"]
139
+ from archscope.kazdov_backend import KazdovBackend
140
+ assert "kazdov" in Backend._registry
141
+ assert KazdovBackend is Backend._registry["kazdov"]
144
142
 
145
143
 
146
144
  def test_alignment_math():
@@ -194,6 +192,58 @@ def test_interpprofile_serializes():
194
192
  assert "test" in j
195
193
 
196
194
 
195
+ def test_activation_patch_rejects_shape_mismatch():
196
+ """activation_patch surfaces a clear error when source/target shapes differ."""
197
+ from archscope.attribute import activation_patch
198
+ src = {"input_ids": torch.tensor([[1, 2, 3]])}
199
+ tgt = {"input_ids": torch.tensor([[1, 2, 3, 4, 5]])}
200
+ with pytest.raises(ValueError) as ei:
201
+ activation_patch(model=None, prompt_source=src, prompt_target=tgt,
202
+ layer_indices=[0], metric_fn=lambda o: 0.0,
203
+ backend_hint="transformer")
204
+ assert "matching input_ids shape" in str(ei.value)
205
+
206
+
207
+ def test_backend_for_model_raises_on_unknown_type():
208
+ """Unknown config.model_type → clear ValueError, no silent fallback."""
209
+ from archscope.backends import Backend
210
+
211
+ class _FakeConfig:
212
+ model_type = "not_a_real_arch"
213
+
214
+ class _FakeModel:
215
+ config = _FakeConfig()
216
+
217
+ with pytest.raises(ValueError) as ei:
218
+ Backend.for_model(_FakeModel())
219
+ msg = str(ei.value)
220
+ assert "No backend matches" in msg
221
+ assert "not_a_real_arch" in msg
222
+
223
+
224
+ def test_backend_for_model_autodetect_includes_pythia():
225
+ """gpt_neox (Pythia) auto-detects to transformer backend."""
226
+ from archscope.backends import Backend, TransformerBackend
227
+
228
+ class _FakeConfig:
229
+ model_type = "gpt_neox"
230
+ num_hidden_layers = 2
231
+ hidden_size = 8
232
+
233
+ class _FakeModel:
234
+ config = _FakeConfig()
235
+
236
+ backend = Backend.for_model(_FakeModel())
237
+ assert isinstance(backend, TransformerBackend)
238
+
239
+
240
+ def test_neurons_layer_filter_rejects_nonmatching():
241
+ """layer_filter that matches nothing raises with a helpful message."""
242
+ from archscope.neurons import NeuronEditConfig
243
+ cfg = NeuronEditConfig(layer_filter="not_a_substring")
244
+ assert cfg.layer_filter == "not_a_substring"
245
+
246
+
197
247
  if __name__ == "__main__":
198
248
  # Allow `python tests/test_unit.py` for quick local check
199
249
  pytest.main([__file__, "-v"])
@@ -1,142 +0,0 @@
1
- """Backend for kazdov-α (and related Kazdov family models).
2
-
3
- Kazdov-α is a transformer-style decoder LM with hybrid attention (MoBE-BCN
4
- mixture of bilinear experts + standard MHA in parallel). Architecturally
5
- closer to standard transformer than to pure RNN/SSM — but the BCN attention
6
- branch makes it a distinct architecture family for cross-arch interp.
7
-
8
- Differences from HF transformer:
9
- - No HF AutoModelForCausalLM interface (custom forward signature)
10
- - Layers exposed as `model.blocks` (ModuleList)
11
- - No `output_hidden_states=True` argument — we capture via forward hooks
12
- - Forward signature: (input_ids, attention_mask=None, labels=None)
13
- """
14
- from __future__ import annotations
15
- import sys
16
- from pathlib import Path
17
- import torch
18
-
19
- from .backends import Backend, ActivationRecord
20
-
21
-
22
- KAZDOV_REPO = Path.home() / "code" / "OriginalKazdov" / "kazdov"
23
-
24
-
25
- def _ensure_kazdov_importable():
26
- """Add kazdov repo to sys.path so we can import KazdovLM."""
27
- p = str(KAZDOV_REPO)
28
- if p not in sys.path:
29
- sys.path.insert(0, p)
30
-
31
-
32
- def load_kazdov_checkpoint(checkpoint_path: str | Path, device: str = "cpu"):
33
- """Load kazdov-α from a checkpoint directory.
34
-
35
- Expects: config.json + final.pt (or latest.pt) in the directory.
36
- Returns: (model in eval mode, tokenizer wrapper).
37
- """
38
- _ensure_kazdov_importable()
39
- from kazdov.kazdov_lm import KazdovLM
40
- import json
41
-
42
- ckpt_dir = Path(checkpoint_path)
43
- config = json.loads((ckpt_dir / "config.json").read_text())
44
- model_cfg = config["model_cfg"]
45
-
46
- model = KazdovLM(
47
- vocab_size=model_cfg["vocab_size"],
48
- d_model=model_cfg["d_model"],
49
- n_layers=model_cfg["n_layers"],
50
- n_heads=model_cfg["n_heads"],
51
- rank=model_cfg["rank"],
52
- mlp_dim=model_cfg.get("mlp_dim"),
53
- max_len=model_cfg.get("max_len", 256),
54
- use_trilinear=model_cfg.get("use_trilinear", False),
55
- use_bi_bcn=model_cfg.get("use_bi_bcn", False),
56
- use_hybrid_mha=model_cfg.get("use_hybrid_mha", True),
57
- use_mobe=model_cfg.get("use_mobe", False),
58
- n_experts=model_cfg.get("n_experts", 1),
59
- )
60
-
61
- # Try final.pt then latest.pt
62
- for fname in ("final.pt", "latest.pt"):
63
- f = ckpt_dir / fname
64
- if f.exists():
65
- state = torch.load(f, map_location=device, weights_only=False)
66
- if isinstance(state, dict) and "model" in state:
67
- state = state["model"]
68
- model.load_state_dict(state, strict=False)
69
- break
70
- else:
71
- raise FileNotFoundError(f"No final.pt or latest.pt in {ckpt_dir}")
72
-
73
- model.to(device).eval()
74
-
75
- # Tokenizer: kazdov used GPT-2 tokenizer per memory
76
- from transformers import GPT2Tokenizer
77
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
78
- if tokenizer.pad_token is None:
79
- tokenizer.pad_token = tokenizer.eos_token
80
-
81
- return model, tokenizer
82
-
83
-
84
- @Backend.register("kazdov")
85
- class KazdovBackend(Backend):
86
- """Backend for kazdov-family models (KazdovLM, MoBE-BCN variants).
87
-
88
- Uses forward hooks to capture residual stream after each KazdovBlock,
89
- since the model doesn't expose output_hidden_states.
90
- """
91
-
92
- def layer_names(self) -> list[str]:
93
- n_layers = len(self.model.blocks)
94
- return [f"layer_{i}.residual" for i in range(n_layers)]
95
-
96
- def extract(self, inputs, layers=None):
97
- layers = layers or self.layer_names()
98
- self._validate_layers(layers)
99
- captures: dict[str, torch.Tensor] = {}
100
-
101
- # Register a forward hook on each requested block.
102
- hooks = []
103
- for layer_name in layers:
104
- idx = int(layer_name.split("_")[1].split(".")[0])
105
- if idx >= len(self.model.blocks):
106
- continue
107
- block = self.model.blocks[idx]
108
-
109
- def make_hook(name):
110
- def hook(module, inp, out):
111
- tensor = out if isinstance(out, torch.Tensor) else out[0]
112
- captures[name] = tensor.detach()
113
- return hook
114
- hooks.append(block.register_forward_hook(make_hook(layer_name)))
115
-
116
- try:
117
- # Kazdov forward signature: model(input_ids, attention_mask=None)
118
- with torch.no_grad():
119
- if isinstance(inputs, dict):
120
- input_ids = inputs["input_ids"]
121
- attn = inputs.get("attention_mask")
122
- else:
123
- input_ids = inputs
124
- attn = None
125
- self.model(input_ids, attention_mask=attn)
126
- finally:
127
- for h in hooks:
128
- h.remove()
129
-
130
- records = []
131
- for layer_name in layers:
132
- if layer_name not in captures:
133
- continue
134
- records.append(ActivationRecord(
135
- layer_name=layer_name,
136
- activations=captures[layer_name],
137
- meta={"kind": "residual", "arch": "kazdov-mobe-bcn"},
138
- ))
139
- return records
140
-
141
- def hidden_dim(self, layer_name: str) -> int:
142
- return self.model.d_model
File without changes
File without changes