archscope 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {archscope-0.2.5/src/archscope.egg-info → archscope-0.2.6}/PKG-INFO +25 -6
  2. {archscope-0.2.5 → archscope-0.2.6}/README.md +24 -5
  3. {archscope-0.2.5 → archscope-0.2.6}/pyproject.toml +1 -1
  4. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/__init__.py +1 -1
  5. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/attribute.py +20 -1
  6. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/backends.py +4 -1
  7. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/circuits.py +12 -2
  8. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/lens.py +26 -4
  9. {archscope-0.2.5 → archscope-0.2.6/src/archscope.egg-info}/PKG-INFO +25 -6
  10. {archscope-0.2.5 → archscope-0.2.6}/tests/test_unit.py +35 -1
  11. {archscope-0.2.5 → archscope-0.2.6}/LICENSE +0 -0
  12. {archscope-0.2.5 → archscope-0.2.6}/setup.cfg +0 -0
  13. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/_utils.py +0 -0
  14. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/bench.py +0 -0
  15. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/cli.py +0 -0
  16. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/diff.py +0 -0
  17. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/kazdov_backend.py +0 -0
  18. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/loader.py +0 -0
  19. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/neurons.py +0 -0
  20. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/probes.py +0 -0
  21. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/py.typed +0 -0
  22. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/sae.py +0 -0
  23. {archscope-0.2.5 → archscope-0.2.6}/src/archscope/transfer.py +0 -0
  24. {archscope-0.2.5 → archscope-0.2.6}/src/archscope.egg-info/SOURCES.txt +0 -0
  25. {archscope-0.2.5 → archscope-0.2.6}/src/archscope.egg-info/dependency_links.txt +0 -0
  26. {archscope-0.2.5 → archscope-0.2.6}/src/archscope.egg-info/entry_points.txt +0 -0
  27. {archscope-0.2.5 → archscope-0.2.6}/src/archscope.egg-info/requires.txt +0 -0
  28. {archscope-0.2.5 → archscope-0.2.6}/src/archscope.egg-info/top_level.txt +0 -0
  29. {archscope-0.2.5 → archscope-0.2.6}/tests/test_circuits_3arch.py +0 -0
  30. {archscope-0.2.5 → archscope-0.2.6}/tests/test_diff.py +0 -0
  31. {archscope-0.2.5 → archscope-0.2.6}/tests/test_kazdov_integration.py +0 -0
  32. {archscope-0.2.5 → archscope-0.2.6}/tests/test_lens.py +0 -0
  33. {archscope-0.2.5 → archscope-0.2.6}/tests/test_mamba_integration.py +0 -0
  34. {archscope-0.2.5 → archscope-0.2.6}/tests/test_mamba_ssm_state.py +0 -0
  35. {archscope-0.2.5 → archscope-0.2.6}/tests/test_probe_transfer.py +0 -0
  36. {archscope-0.2.5 → archscope-0.2.6}/tests/test_pythia_end_to_end.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: archscope
- Version: 0.2.5
+ Version: 0.2.6
  Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
  Author: Juan Cruz Dovzak
  License: Apache-2.0
@@ -58,18 +58,17 @@ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader
 
  ```python
  import archscope as mi
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
- model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
 
- backend = mi.backends.Backend.for_model(model, hint="mamba")
+ # One call → HuggingFace model + tokenizer + the right backend
+ model, tok, backend = mi.load_model("state-spaces/mamba-130m-hf", arch="mamba")
 
  # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
  ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
  # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
  ```
 
+ `load_model` handles `pad_token` setup, `model.eval()`, and backend auto-detection. If you'd rather drive `transformers` yourself, every method also accepts `backend_hint=...`.
+
  ---
 
  ## What's inside
@@ -105,6 +104,26 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
 
  If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
 
+ ### Method × backend support
+
+ Not every method works on every architecture. The cross-product:
+
+ | Method | transformer | mamba | kazdov | recurrent |
+ |---|:---:|:---:|:---:|:---:|
+ | `probes.fit_probe` | ✅ | ✅ | ✅ | ✅ |
+ | `sae.fit_sae` (Dense / Rank-1) | ✅ | ✅ | ✅ | ✅ |
+ | `neurons.find_neurons` | ✅ | ✅ | ✅ | ✅ |
+ | `attribute.activation_patch` | ✅ | ✅ residual only | ✅ | ⚠️ subclass needed |
+ | `attribute.dim_decompose` | ✅ | ❌ no attention/MLP submods | ✅ | ❌ |
+ | `circuits.*` (behavioural) | ✅ | ✅ | ✅ | ✅ |
+ | `lens.logit_lens` | ✅ | ⚠️ degrades with depth — use `TunedLens` | ✅ | ⚠️ |
+ | `lens.TunedLens.fit` | ✅ | ✅ | ✅ | ⚠️ |
+ | `diff.compare` | ✅ | ✅ | ✅ | ✅ |
+ | `transfer.evaluate_transfer` | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any |
+ | `bench.benchmark` | ✅ | ✅ | ✅ | partial |
+
+ ❌ entries raise a clear `ValueError` rather than silently degrading.
+
  ---
 
  ## Install
@@ -21,18 +21,17 @@ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader
 
  ```python
  import archscope as mi
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
- model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
 
- backend = mi.backends.Backend.for_model(model, hint="mamba")
+ # One call → HuggingFace model + tokenizer + the right backend
+ model, tok, backend = mi.load_model("state-spaces/mamba-130m-hf", arch="mamba")
 
  # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
  ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
  # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
  ```
 
+ `load_model` handles `pad_token` setup, `model.eval()`, and backend auto-detection. If you'd rather drive `transformers` yourself, every method also accepts `backend_hint=...`.
+
  ---
 
  ## What's inside
@@ -68,6 +67,26 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
 
  If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
 
+ ### Method × backend support
+
+ Not every method works on every architecture. The cross-product:
+
+ | Method | transformer | mamba | kazdov | recurrent |
+ |---|:---:|:---:|:---:|:---:|
+ | `probes.fit_probe` | ✅ | ✅ | ✅ | ✅ |
+ | `sae.fit_sae` (Dense / Rank-1) | ✅ | ✅ | ✅ | ✅ |
+ | `neurons.find_neurons` | ✅ | ✅ | ✅ | ✅ |
+ | `attribute.activation_patch` | ✅ | ✅ residual only | ✅ | ⚠️ subclass needed |
+ | `attribute.dim_decompose` | ✅ | ❌ no attention/MLP submods | ✅ | ❌ |
+ | `circuits.*` (behavioural) | ✅ | ✅ | ✅ | ✅ |
+ | `lens.logit_lens` | ✅ | ⚠️ degrades with depth — use `TunedLens` | ✅ | ⚠️ |
+ | `lens.TunedLens.fit` | ✅ | ✅ | ✅ | ⚠️ |
+ | `diff.compare` | ✅ | ✅ | ✅ | ✅ |
+ | `transfer.evaluate_transfer` | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any |
+ | `bench.benchmark` | ✅ | ✅ | ✅ | partial |
+
+ ❌ entries raise a clear `ValueError` rather than silently degrading.
+
  ---
 
  ## Install
@@ -1,6 +1,6 @@
  [project]
  name = "archscope"
- version = "0.2.5"
+ version = "0.2.6"
  description = "Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models"
  readme = "README.md"
  authors = [{name = "Juan Cruz Dovzak"}]
@@ -25,7 +25,7 @@ Quick start::
      print(result.to_markdown())
  """
 
- __version__ = "0.2.5"
+ __version__ = "0.2.6"
 
  from . import probes, sae, neurons, attribute, backends, circuits, transfer, bench, lens, diff
  from .loader import load_model, make_tokenize_fn
@@ -95,7 +95,9 @@ def activation_patch(
  module = resolve_layer_module(model, f"layer_{idx}.residual")
  if module is None:
      continue
- src_h = src_rec.activations
+ # detach+clone for the same reason dim_decompose does: avoid aliasing
+ # a tensor that could be overwritten when the patched forward runs.
+ src_h = src_rec.activations.detach().clone()
 
  def hook(mod, inp, out, replacement=src_h):
      if isinstance(out, tuple):
@@ -155,6 +157,23 @@ def dim_decompose(
  metric_b = metric_fn(out_b)
  total_gap = metric_a - metric_b
 
+ # Sanity check: at least one component must be resolvable for at least one
+ # requested layer. Architectures without attention/MLP submodules (Mamba,
+ # pure SSMs, custom recurrent blocks) would otherwise silently return an
+ # empty DIMResult.
+ resolvable = any(
+     resolve_subcomponent_module(model, idx, comp) is not None
+     for idx in layer_indices for comp in components
+ )
+ if not resolvable:
+     raise ValueError(
+         f"dim_decompose: none of components={components} were found on this "
+         f"model (type {type(model).__name__}). This method expects "
+         "attention/MLP submodules — it's transformer-style only. For "
+         "SSM/recurrent architectures, use activation_patch on the residual "
+         "stream instead."
+     )
+
  contributions: dict[str, float] = {}
  for comp in components:
      # 1) Capture component outputs during prompt_a.
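Editor's note: as context for the `detach().clone()` change in the `activation_patch` hunk above, here is a minimal standalone sketch of the aliasing hazard the new comment describes. It is plain PyTorch, not archscope code; the tensors stand in for recorded activations.

```python
import torch

# Pretend these are activations recorded from the source run.
buffer = torch.ones(2, 4)

aliased = buffer                   # no copy — shares storage with `buffer`
cloned = buffer.detach().clone()   # independent copy, no autograd history

# Pretend the patched forward pass later overwrites the buffer in place.
buffer.mul_(0.0)

print(aliased)  # all zeros — the aliased "source" was silently clobbered
print(cloned)   # still ones — the clone is unaffected
```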
@@ -135,7 +135,10 @@ class TransformerBackend(Backend):
  """HuggingFace transformers backend — extracts residual stream per layer."""
 
  def layer_names(self) -> list[str]:
-     # Standard HF: model.model.layers[i] for decoder transformers
+     # Layer names are virtual handles consumed by .extract(), which uses
+     # HF's `output_hidden_states=True` to retrieve the residual stream
+     # (no direct attribute walk into model.model.layers[i] needed —
+     # so this works across HF decoder LM families).
      n_layers = getattr(self.model.config, "num_hidden_layers", 0)
      return [f"layer_{i}.residual" for i in range(n_layers)]
 
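Editor's note: the rewritten comment says layer names are virtual handles served via HF's `output_hidden_states=True`. A rough sketch of that mechanism in plain `transformers` (not archscope's own code; the model name is only an example, and the indexing convention is the usual HF one: `hidden_states[0]` is the embedding output, `hidden_states[i+1]` the residual stream after block `i`):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Tiny example model, purely illustrative.
tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

with torch.no_grad():
    out = model(**tok("hello world", return_tensors="pt"), output_hidden_states=True)

# Tuple of (num_hidden_layers + 1) tensors: embeddings plus one per block,
# each of shape (B, T, hidden_size) — enough to serve "layer_{i}.residual".
print(len(out.hidden_states), out.hidden_states[-1].shape)
```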
@@ -74,12 +74,22 @@ def induction_head_score(
  else:
      vocab_size = 50257 # GPT-2 default
 
+ # Adaptive vocab window — defaults to [100, 40000) for full-size LMs but
+ # tightens for small-vocab toy models so we don't sample outside the range.
+ lo = min(100, max(1, vocab_size // 4))
+ hi = min(vocab_size, 40000)
+ if hi - lo < 2 * n_pairs:
+     raise ValueError(
+         f"induction_head_score: vocab window [{lo}, {hi}) has only "
+         f"{hi - lo} tokens but n_pairs={n_pairs} requires {2 * n_pairs} distinct ids. "
+         f"Lower n_pairs or pass a model with vocab_size >= {2 * n_pairs + 100}."
+     )
+
  successes = 0
  rank_sum = 0.0
  prob_target_sum = 0.0
  for trial in range(n_trials):
-     # Pick n_pairs random token pairs
-     tokens = rng.sample(range(100, min(vocab_size, 40000)), 2 * n_pairs)
+     tokens = rng.sample(range(lo, hi), 2 * n_pairs)
      seq = []
      pairs = []
      for i in range(n_pairs):
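Editor's note: a quick worked check of the new window arithmetic, in plain Python mirroring the formulas in the hunk above (the `n_pairs` value is only illustrative):

```python
def window(vocab_size: int) -> tuple[int, int]:
    # Same formulas as the diff: clamp the sampling range to the model's vocab.
    lo = min(100, max(1, vocab_size // 4))
    hi = min(vocab_size, 40000)
    return lo, hi

n_pairs = 20
print(window(50257))  # (100, 40000): full-size LMs keep the old [100, 40000) range
print(window(40))     # (10, 40): only 30 candidate ids < 2 * n_pairs = 40 -> ValueError path
```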
@@ -218,14 +218,36 @@ class TunedLens(nn.Module):
 
  opt = torch.optim.AdamW(tl.translators.parameters(), lr=lr)
 
- # Pre-extract all activations + target logits once
+ # Pre-extract all activations + target logits once.
+ # Ensure tokenizer has a pad token (GPT-2 family ships without one).
+ if getattr(tokenizer, "pad_token", None) is None and getattr(tokenizer, "eos_token", None) is not None:
+     tokenizer.pad_token = tokenizer.eos_token
+
  enc = tokenizer(calibration_texts, return_tensors="pt", padding=True,
                  truncation=True, max_length=max_len)
  inputs = {"input_ids": enc["input_ids"].to(device)}
+ if "attention_mask" in enc:
+     inputs["attention_mask"] = enc["attention_mask"].to(device)
+
+ # Per-row index of the last REAL (non-pad) token. If no attention_mask
+ # (single, unpadded sequence), the conventional last-position is fine.
+ if "attention_mask" in enc:
+     real_lengths = enc["attention_mask"].sum(dim=1).to(device) # (B,)
+     last_idx = (real_lengths - 1).clamp(min=0)
+ else:
+     B = inputs["input_ids"].shape[0]
+     last_idx = torch.full((B,), inputs["input_ids"].shape[1] - 1,
+                           dtype=torch.long, device=device)
+
+ def gather_last(acts: torch.Tensor) -> torch.Tensor:
+     # acts: (B, T, H) → (B, H) at each row's real last position.
+     B = acts.shape[0]
+     return acts[torch.arange(B, device=acts.device), last_idx]
+
  with torch.no_grad():
      records = backend.extract(inputs, layers=layer_names)
-     # Target: model's actual final logits at last position
-     final_residual = records[-1].activations[:, -1, :]
+     # Target: model's actual final logits at last REAL position per row.
+     final_residual = gather_last(records[-1].activations)
      if norm is not None:
          final_residual = norm(final_residual)
      target_logits = unembed(final_residual).detach() # (B, vocab)
@@ -235,7 +257,7 @@ class TunedLens(nn.Module):
  opt.zero_grad()
  total_loss = 0.0
  for i, rec in enumerate(records):
-     last = rec.activations[:, -1, :].detach()
+     last = gather_last(rec.activations).detach()
      translated = tl.translators[i](last)
      if norm is not None:
          translated = norm(translated)
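Editor's note: a minimal sketch of the `gather_last` indexing added in the lens.py hunks above, on fake activations rather than real model outputs, showing how the attention mask picks each row's last non-pad position:

```python
import torch

# Fake activations: (B=2, T=4, H=3).
acts = torch.arange(2 * 4 * 3, dtype=torch.float32).reshape(2, 4, 3)
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 0, 0]])  # row 1 has 2 real tokens, 2 pads

# Same recipe as the diff: index of the last real token per row.
last_idx = (attention_mask.sum(dim=1) - 1).clamp(min=0)        # tensor([3, 1])
gathered = acts[torch.arange(acts.shape[0]), last_idx]         # (B, H)

print(gathered.shape)                         # torch.Size([2, 3])
print(torch.equal(gathered[1], acts[1, 1]))   # True: row 1 uses position 1, not the padded -1
```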
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: archscope
- Version: 0.2.5
+ Version: 0.2.6
  Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
  Author: Juan Cruz Dovzak
  License: Apache-2.0
@@ -58,18 +58,17 @@ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader
 
  ```python
  import archscope as mi
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
- model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
 
- backend = mi.backends.Backend.for_model(model, hint="mamba")
+ # One call → HuggingFace model + tokenizer + the right backend
+ model, tok, backend = mi.load_model("state-spaces/mamba-130m-hf", arch="mamba")
 
  # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
  ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
  # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
  ```
 
+ `load_model` handles `pad_token` setup, `model.eval()`, and backend auto-detection. If you'd rather drive `transformers` yourself, every method also accepts `backend_hint=...`.
+
  ---
 
  ## What's inside
@@ -105,6 +104,26 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
 
  If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
 
+ ### Method × backend support
+
+ Not every method works on every architecture. The cross-product:
+
+ | Method | transformer | mamba | kazdov | recurrent |
+ |---|:---:|:---:|:---:|:---:|
+ | `probes.fit_probe` | ✅ | ✅ | ✅ | ✅ |
+ | `sae.fit_sae` (Dense / Rank-1) | ✅ | ✅ | ✅ | ✅ |
+ | `neurons.find_neurons` | ✅ | ✅ | ✅ | ✅ |
+ | `attribute.activation_patch` | ✅ | ✅ residual only | ✅ | ⚠️ subclass needed |
+ | `attribute.dim_decompose` | ✅ | ❌ no attention/MLP submods | ✅ | ❌ |
+ | `circuits.*` (behavioural) | ✅ | ✅ | ✅ | ✅ |
+ | `lens.logit_lens` | ✅ | ⚠️ degrades with depth — use `TunedLens` | ✅ | ⚠️ |
+ | `lens.TunedLens.fit` | ✅ | ✅ | ✅ | ⚠️ |
+ | `diff.compare` | ✅ | ✅ | ✅ | ✅ |
+ | `transfer.evaluate_transfer` | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any |
+ | `bench.benchmark` | ✅ | ✅ | ✅ | partial |
+
+ ❌ entries raise a clear `ValueError` rather than silently degrading.
+
  ---
 
  ## Install
@@ -22,7 +22,7 @@ def test_imports():
  import archscope
  from archscope import (probes, sae, neurons, attribute, backends, # noqa: F401
                         circuits, transfer, bench, lens, diff) # noqa: F401
- assert archscope.__version__ == "0.2.5"
+ assert archscope.__version__ == "0.2.6"
 
 
  def test_loader_exports():
@@ -244,6 +244,40 @@ def test_neurons_layer_filter_rejects_nonmatching():
  assert cfg.layer_filter == "not_a_substring"
 
 
+ def test_induction_head_score_small_vocab_clear_error():
+     """induction_head_score raises a clear error when vocab is too small."""
+     from archscope.circuits import induction_head_score
+
+     class _TinyModel:
+         class config:
+             vocab_size = 40 # << 2*n_pairs + 100
+         def __call__(self, ids):
+             return torch.zeros(1, ids.shape[1], 40)
+
+     with pytest.raises(ValueError) as ei:
+         induction_head_score(_TinyModel(), n_pairs=20, n_trials=1)
+     assert "vocab window" in str(ei.value).lower() or "n_pairs" in str(ei.value)
+
+
+ def test_dim_decompose_rejects_mamba_style_model():
+     """dim_decompose raises on models with no attention/MLP submodules."""
+     from archscope.attribute import dim_decompose
+
+     class _NoSubmods(torch.nn.Module):
+         def forward(self, **kwargs):
+             class Out:
+                 logits = torch.zeros(1, 3, 8)
+             return Out()
+
+     with pytest.raises(ValueError) as ei:
+         dim_decompose(_NoSubmods(),
+                       prompt_a={"input_ids": torch.tensor([[1, 2, 3]])},
+                       prompt_b={"input_ids": torch.tensor([[4, 5, 6]])},
+                       layer_indices=[0, 1],
+                       metric_fn=lambda o: 0.0)
+     assert "attention" in str(ei.value).lower() or "submod" in str(ei.value).lower()
+
+
  if __name__ == "__main__":
      # Allow `python tests/test_unit.py` for quick local check
      pytest.main([__file__, "-v"])