archscope 0.2.5__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {archscope-0.2.5/src/archscope.egg-info → archscope-0.2.7}/PKG-INFO +25 -6
- {archscope-0.2.5 → archscope-0.2.7}/README.md +24 -5
- {archscope-0.2.5 → archscope-0.2.7}/pyproject.toml +1 -1
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/__init__.py +1 -1
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/attribute.py +20 -1
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/backends.py +4 -1
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/circuits.py +12 -2
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/lens.py +26 -4
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/probes.py +34 -0
- {archscope-0.2.5 → archscope-0.2.7/src/archscope.egg-info}/PKG-INFO +25 -6
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_unit.py +66 -1
- {archscope-0.2.5 → archscope-0.2.7}/LICENSE +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/setup.cfg +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/_utils.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/bench.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/cli.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/diff.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/kazdov_backend.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/loader.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/neurons.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/py.typed +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/sae.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope/transfer.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/SOURCES.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/dependency_links.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/entry_points.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/requires.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/src/archscope.egg-info/top_level.txt +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_circuits_3arch.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_diff.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_kazdov_integration.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_lens.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_mamba_integration.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_mamba_ssm_state.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_probe_transfer.py +0 -0
- {archscope-0.2.5 → archscope-0.2.7}/tests/test_pythia_end_to_end.py +0 -0
{archscope-0.2.5/src/archscope.egg-info → archscope-0.2.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: archscope
-Version: 0.2.5
+Version: 0.2.7
 Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
 Author: Juan Cruz Dovzak
 License: Apache-2.0
@@ -58,18 +58,17 @@ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader
 
 ```python
 import archscope as mi
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
-model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
 
-
+# One call → HuggingFace model + tokenizer + the right backend
+model, tok, backend = mi.load_model("state-spaces/mamba-130m-hf", arch="mamba")
 
 # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
 ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
 # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
 ```
 
+`load_model` handles `pad_token` setup, `model.eval()`, and backend auto-detection. If you'd rather drive `transformers` yourself, every method also accepts `backend_hint=...`.
+
 ---
 
 ## What's inside
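The new quick-start folds the old manual setup into `load_model`. For readers who still want the manual route, here is a rough sketch, in plain `transformers` calls rather than archscope code, of the housekeeping the added note says `load_model` performs; the exact internals may differ:

```python
# Illustrative only: roughly the steps the README says load_model absorbs.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")

if tok.pad_token is None and tok.eos_token is not None:
    tok.pad_token = tok.eos_token   # pad_token setup
model.eval()                        # inference mode
```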
@@ -105,6 +104,26 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
 
 If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
 
+### Method × backend support
+
+Not every method works on every architecture. The cross-product:
+
+| Method | transformer | mamba | kazdov | recurrent |
+|---|:---:|:---:|:---:|:---:|
+| `probes.fit_probe` | ✅ | ✅ | ✅ | ✅ |
+| `sae.fit_sae` (Dense / Rank-1) | ✅ | ✅ | ✅ | ✅ |
+| `neurons.find_neurons` | ✅ | ✅ | ✅ | ✅ |
+| `attribute.activation_patch` | ✅ | ✅ residual only | ✅ | ⚠️ subclass needed |
+| `attribute.dim_decompose` | ✅ | ❌ no attention/MLP submods | ✅ | ❌ |
+| `circuits.*` (behavioural) | ✅ | ✅ | ✅ | ✅ |
+| `lens.logit_lens` | ✅ | ⚠️ degrades with depth — use `TunedLens` | ✅ | ⚠️ |
+| `lens.TunedLens.fit` | ✅ | ✅ | ✅ | ⚠️ |
+| `diff.compare` | ✅ | ✅ | ✅ | ✅ |
+| `transfer.evaluate_transfer` | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any |
+| `bench.benchmark` | ✅ | ✅ | ✅ | partial |
+
+❌ entries raise a clear `ValueError` rather than silently degrading.
+
 ---
 
 ## Install
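Because the ❌ cells raise `ValueError` rather than degrading silently, a cross-architecture sweep can simply guard the transformer-only calls. A minimal sketch; the keyword arguments mirror the new `test_dim_decompose_rejects_mamba_style_model` unit test added further down in this diff, `model` is whatever `mi.load_model` returned above, and the fallback is only indicated, not implemented:

```python
import torch
from archscope.attribute import dim_decompose

try:
    result = dim_decompose(model,
                           prompt_a={"input_ids": torch.tensor([[1, 2, 3]])},
                           prompt_b={"input_ids": torch.tensor([[4, 5, 6]])},
                           layer_indices=[0, 1],
                           metric_fn=lambda o: 0.0)
except ValueError:
    # Expected for Mamba/SSM-style backends: fall back to
    # attribute.activation_patch on the residual stream instead.
    result = None
```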
{archscope-0.2.5 → archscope-0.2.7}/README.md

@@ -21,18 +21,17 @@ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader
 
 ```python
 import archscope as mi
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
-model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
 
-
+# One call → HuggingFace model + tokenizer + the right backend
+model, tok, backend = mi.load_model("state-spaces/mamba-130m-hf", arch="mamba")
 
 # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
 ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
 # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
 ```
 
+`load_model` handles `pad_token` setup, `model.eval()`, and backend auto-detection. If you'd rather drive `transformers` yourself, every method also accepts `backend_hint=...`.
+
 ---
 
 ## What's inside
@@ -68,6 +67,26 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
 
 If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
 
+### Method × backend support
+
+Not every method works on every architecture. The cross-product:
+
+| Method | transformer | mamba | kazdov | recurrent |
+|---|:---:|:---:|:---:|:---:|
+| `probes.fit_probe` | ✅ | ✅ | ✅ | ✅ |
+| `sae.fit_sae` (Dense / Rank-1) | ✅ | ✅ | ✅ | ✅ |
+| `neurons.find_neurons` | ✅ | ✅ | ✅ | ✅ |
+| `attribute.activation_patch` | ✅ | ✅ residual only | ✅ | ⚠️ subclass needed |
+| `attribute.dim_decompose` | ✅ | ❌ no attention/MLP submods | ✅ | ❌ |
+| `circuits.*` (behavioural) | ✅ | ✅ | ✅ | ✅ |
+| `lens.logit_lens` | ✅ | ⚠️ degrades with depth — use `TunedLens` | ✅ | ⚠️ |
+| `lens.TunedLens.fit` | ✅ | ✅ | ✅ | ⚠️ |
+| `diff.compare` | ✅ | ✅ | ✅ | ✅ |
+| `transfer.evaluate_transfer` | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any |
+| `bench.benchmark` | ✅ | ✅ | ✅ | partial |
+
+❌ entries raise a clear `ValueError` rather than silently degrading.
+
 ---
 
 ## Install
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/attribute.py

@@ -95,7 +95,9 @@ def activation_patch(
         module = resolve_layer_module(model, f"layer_{idx}.residual")
         if module is None:
             continue
-
+        # detach+clone for the same reason dim_decompose does: avoid aliasing
+        # a tensor that could be overwritten when the patched forward runs.
+        src_h = src_rec.activations.detach().clone()
 
         def hook(mod, inp, out, replacement=src_h):
             if isinstance(out, tuple):
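The aliasing comment refers to a general PyTorch hook pitfall rather than anything archscope-specific. A standalone sketch (generic `torch`, hypothetical module and variable names) of why a captured activation is frozen with `.detach().clone()` before being swapped back in:

```python
import torch
import torch.nn as nn

layer = nn.Linear(4, 4)
captured = {}

# Without .detach().clone(), captured["h"] can alias storage the model reuses
# on the next forward pass (and keeps the autograd graph alive); cloning
# freezes the values at capture time.
def capture_hook(mod, inp, out):
    captured["h"] = out.detach().clone()

handle = layer.register_forward_hook(capture_hook)
layer(torch.randn(2, 4))
handle.remove()

# Later, a patching hook can safely return the frozen copy as the new output.
def patch_hook(mod, inp, out, replacement=captured["h"]):
    return replacement

handle = layer.register_forward_hook(patch_hook)
patched = layer(torch.randn(2, 4))   # output replaced by the captured values
handle.remove()
```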
@@ -155,6 +157,23 @@ def dim_decompose(
     metric_b = metric_fn(out_b)
     total_gap = metric_a - metric_b
 
+    # Sanity check: at least one component must be resolvable for at least one
+    # requested layer. Architectures without attention/MLP submodules (Mamba,
+    # pure SSMs, custom recurrent blocks) would otherwise silently return an
+    # empty DIMResult.
+    resolvable = any(
+        resolve_subcomponent_module(model, idx, comp) is not None
+        for idx in layer_indices for comp in components
+    )
+    if not resolvable:
+        raise ValueError(
+            f"dim_decompose: none of components={components} were found on this "
+            f"model (type {type(model).__name__}). This method expects "
+            "attention/MLP submodules — it's transformer-style only. For "
+            "SSM/recurrent architectures, use activation_patch on the residual "
+            "stream instead."
+        )
+
     contributions: dict[str, float] = {}
     for comp in components:
         # 1) Capture component outputs during prompt_a.
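Outside archscope, the same pre-flight idea can be approximated by scanning module names before assuming a transformer layout. This is only a heuristic sketch; archscope's own `resolve_subcomponent_module` is the authoritative lookup:

```python
import torch.nn as nn

def looks_transformer_like(model: nn.Module) -> bool:
    # Heuristic: HF decoder blocks usually expose attention/MLP submodules with
    # names containing "attn", "attention", or "mlp"; pure SSM blocks do not.
    wanted = ("attn", "attention", "mlp")
    return any(any(w in name.lower() for w in wanted)
               for name, _ in model.named_modules())
```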
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/backends.py

@@ -135,7 +135,10 @@ class TransformerBackend(Backend):
     """HuggingFace transformers backend — extracts residual stream per layer."""
 
     def layer_names(self) -> list[str]:
-        #
+        # Layer names are virtual handles consumed by .extract(), which uses
+        # HF's `output_hidden_states=True` to retrieve the residual stream
+        # (no direct attribute walk into model.model.layers[i] needed —
+        # so this works across HF decoder LM families).
         n_layers = getattr(self.model.config, "num_hidden_layers", 0)
         return [f"layer_{i}.residual" for i in range(n_layers)]
 
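The `output_hidden_states=True` route mentioned in the new comment is stock HuggingFace behaviour. A small sketch of pulling the per-layer residual stream that way; the model id and the mapping onto archscope's `layer_i.residual` names are assumptions:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m").eval()

enc = tok("interpretability", return_tensors="pt")
with torch.no_grad():
    out = model(**enc, output_hidden_states=True)

# hidden_states is a tuple of (num_hidden_layers + 1) tensors, each (B, T, H).
# Entry i + 1 is the residual stream after block i, which is presumably what
# the backend exposes as "layer_i.residual".
resid_after_block_3 = out.hidden_states[4]
```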
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/circuits.py

@@ -74,12 +74,22 @@ def induction_head_score(
     else:
         vocab_size = 50257  # GPT-2 default
 
+    # Adaptive vocab window — defaults to [100, 40000) for full-size LMs but
+    # tightens for small-vocab toy models so we don't sample outside the range.
+    lo = min(100, max(1, vocab_size // 4))
+    hi = min(vocab_size, 40000)
+    if hi - lo < 2 * n_pairs:
+        raise ValueError(
+            f"induction_head_score: vocab window [{lo}, {hi}) has only "
+            f"{hi - lo} tokens but n_pairs={n_pairs} requires {2 * n_pairs} distinct ids. "
+            f"Lower n_pairs or pass a model with vocab_size >= {2 * n_pairs + 100}."
+        )
+
     successes = 0
     rank_sum = 0.0
     prob_target_sum = 0.0
     for trial in range(n_trials):
-
-        tokens = rng.sample(range(100, min(vocab_size, 40000)), 2 * n_pairs)
+        tokens = rng.sample(range(lo, hi), 2 * n_pairs)
         seq = []
         pairs = []
         for i in range(n_pairs):
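Worked through for the toy model used in the new unit test (vocab_size=40, n_pairs=20), the window logic gives a 30-token window against a 40-token requirement, hence the error:

```python
vocab_size, n_pairs = 40, 20
lo = min(100, max(1, vocab_size // 4))   # -> 10
hi = min(vocab_size, 40000)              # -> 40
assert hi - lo < 2 * n_pairs             # 30 < 40, so ValueError is raised
```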
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/lens.py

@@ -218,14 +218,36 @@ class TunedLens(nn.Module):
 
         opt = torch.optim.AdamW(tl.translators.parameters(), lr=lr)
 
-        # Pre-extract all activations + target logits once
+        # Pre-extract all activations + target logits once.
+        # Ensure tokenizer has a pad token (GPT-2 family ships without one).
+        if getattr(tokenizer, "pad_token", None) is None and getattr(tokenizer, "eos_token", None) is not None:
+            tokenizer.pad_token = tokenizer.eos_token
+
         enc = tokenizer(calibration_texts, return_tensors="pt", padding=True,
                         truncation=True, max_length=max_len)
         inputs = {"input_ids": enc["input_ids"].to(device)}
+        if "attention_mask" in enc:
+            inputs["attention_mask"] = enc["attention_mask"].to(device)
+
+        # Per-row index of the last REAL (non-pad) token. If no attention_mask
+        # (single, unpadded sequence), the conventional last-position is fine.
+        if "attention_mask" in enc:
+            real_lengths = enc["attention_mask"].sum(dim=1).to(device)  # (B,)
+            last_idx = (real_lengths - 1).clamp(min=0)
+        else:
+            B = inputs["input_ids"].shape[0]
+            last_idx = torch.full((B,), inputs["input_ids"].shape[1] - 1,
+                                  dtype=torch.long, device=device)
+
+        def gather_last(acts: torch.Tensor) -> torch.Tensor:
+            # acts: (B, T, H) → (B, H) at each row's real last position.
+            B = acts.shape[0]
+            return acts[torch.arange(B, device=acts.device), last_idx]
+
         with torch.no_grad():
             records = backend.extract(inputs, layers=layer_names)
-            # Target: model's actual final logits at last position
-            final_residual = records[-1].activations
+            # Target: model's actual final logits at last REAL position per row.
+            final_residual = gather_last(records[-1].activations)
             if norm is not None:
                 final_residual = norm(final_residual)
             target_logits = unembed(final_residual).detach()  # (B, vocab)
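`gather_last` is the usual last-real-token gather. A self-contained sketch of the same indexing on toy tensors, using generic `torch` only and independent of archscope:

```python
import torch

acts = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)  # (B=2, T=3, H=4)
attention_mask = torch.tensor([[1, 1, 1],
                               [1, 1, 0]])                      # row 1 ends in a pad

last_idx = (attention_mask.sum(dim=1) - 1).clamp(min=0)         # tensor([2, 1])
last = acts[torch.arange(acts.shape[0]), last_idx]              # shape (2, 4)
# Row 0 takes position 2; row 1 takes position 1, its last non-pad token.
```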
@@ -235,7 +257,7 @@
             opt.zero_grad()
             total_loss = 0.0
             for i, rec in enumerate(records):
-                last = rec.activations
+                last = gather_last(rec.activations).detach()
                 translated = tl.translators[i](last)
                 if norm is not None:
                     translated = norm(translated)
{archscope-0.2.5 → archscope-0.2.7}/src/archscope/probes.py

@@ -108,6 +108,40 @@ class ProbeFit:
         with torch.no_grad():
             return torch.sigmoid(self.probe(activations.to(self.device)))
 
+    @property
+    def direction(self) -> torch.Tensor:
+        """1D direction vector in activation space (linear probes only).
+
+        Shape: ``(hidden_dim,)``. This is the projection axis the probe found —
+        useful for: applying a probe to externally-transformed activations
+        (e.g., after ``archscope.transfer.learn_alignment``), inspecting feature
+        geometry, or projecting interventions along the learned direction.
+
+        Raises ``ValueError`` for MLP probes (no single linear direction).
+        """
+        if self.config.probe_type != "linear":
+            raise ValueError(
+                f".direction is only defined for linear probes (got "
+                f"probe_type={self.config.probe_type!r}). MLP probes don't have a "
+                "single direction in activation space."
+            )
+        return self.probe.net.weight.detach().squeeze(0).clone()
+
+    @property
+    def bias(self) -> torch.Tensor:
+        """Scalar bias term (linear probes only). Shape: ``()``.
+
+        Together with ``.direction``, lets you score arbitrary activations as
+        ``logits = acts @ direction + bias`` without going through the probe
+        module — handy for cross-arch transfer experiments.
+        """
+        if self.config.probe_type != "linear":
+            raise ValueError(
+                f".bias is only defined for linear probes (got "
+                f"probe_type={self.config.probe_type!r})."
+            )
+        return self.probe.net.bias.detach().squeeze().clone()
+
 
 def _auroc(logits: torch.Tensor, labels: torch.Tensor) -> float:
     """AUROC from logits + binary labels.
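The `acts @ direction + bias` identity from the docstring is the intended use of the new accessors. A short sketch; the `ProbeConfig`/`ProbeFit.train` call pattern follows the new accessor test, while the layer name and the random data are illustrative:

```python
import torch
from archscope.probes import ProbeFit, ProbeConfig

cfg = ProbeConfig(layer_name="layer_6.residual", probe_type="linear")
pf = ProbeFit(cfg, input_dim=8)
pf.train(torch.randn(80, 8), (torch.rand(80) > 0.5).float(),
         epochs=30, batch_size=16)

# Score activations without going through the probe module.
acts = torch.randn(5, 8)
logits = acts @ pf.direction + pf.bias     # matches pf.probe(acts)
probs = torch.sigmoid(logits)
```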
{archscope-0.2.5 → archscope-0.2.7/src/archscope.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: archscope
-Version: 0.2.5
+Version: 0.2.7
 Summary: Lightweight workbench for cross-architecture mechanistic interpretability experiments on small models
 Author: Juan Cruz Dovzak
 License: Apache-2.0
@@ -58,18 +58,17 @@ It is **not**: a competitor to `transformer_lens` or `nnsight` (both are broader
 
 ```python
 import archscope as mi
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tok = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
-model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
 
-
+# One call → HuggingFace model + tokenizer + the right backend
+model, tok, backend = mi.load_model("state-spaces/mamba-130m-hf", arch="mamba")
 
 # Extract Mamba's recurrent SSM state h_t (in addition to residual stream)
 ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_state"])[0]
 # Shape: (B, intermediate_size, ssm_state_size) = (B, 1536, 16) for mamba-130m
 ```
 
+`load_model` handles `pad_token` setup, `model.eval()`, and backend auto-detection. If you'd rather drive `transformers` yourself, every method also accepts `backend_hint=...`.
+
 ---
 
 ## What's inside
@@ -105,6 +104,26 @@ ssm = backend.extract(tok("text", return_tensors="pt"), layers=["layer_12.ssm_st
 
 If `Backend.for_model(model)` is called on a model whose `config.model_type` isn't in the autodetect list, it raises a clear `ValueError` rather than silently picking a backend. Pass `hint="..."` explicitly for anything outside the list, or register a new backend via `Backend.register("name")`.
 
+### Method × backend support
+
+Not every method works on every architecture. The cross-product:
+
+| Method | transformer | mamba | kazdov | recurrent |
+|---|:---:|:---:|:---:|:---:|
+| `probes.fit_probe` | ✅ | ✅ | ✅ | ✅ |
+| `sae.fit_sae` (Dense / Rank-1) | ✅ | ✅ | ✅ | ✅ |
+| `neurons.find_neurons` | ✅ | ✅ | ✅ | ✅ |
+| `attribute.activation_patch` | ✅ | ✅ residual only | ✅ | ⚠️ subclass needed |
+| `attribute.dim_decompose` | ✅ | ❌ no attention/MLP submods | ✅ | ❌ |
+| `circuits.*` (behavioural) | ✅ | ✅ | ✅ | ✅ |
+| `lens.logit_lens` | ✅ | ⚠️ degrades with depth — use `TunedLens` | ✅ | ⚠️ |
+| `lens.TunedLens.fit` | ✅ | ✅ | ✅ | ⚠️ |
+| `diff.compare` | ✅ | ✅ | ✅ | ✅ |
+| `transfer.evaluate_transfer` | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any | ✅ ↔ any |
+| `bench.benchmark` | ✅ | ✅ | ✅ | partial |
+
+❌ entries raise a clear `ValueError` rather than silently degrading.
+
 ---
 
 ## Install
{archscope-0.2.5 → archscope-0.2.7}/tests/test_unit.py

@@ -22,7 +22,7 @@ def test_imports():
     import archscope
     from archscope import (probes, sae, neurons, attribute, backends,  # noqa: F401
                            circuits, transfer, bench, lens, diff)  # noqa: F401
-    assert archscope.__version__ == "0.2.5"
+    assert archscope.__version__ == "0.2.7"
 
 
 def test_loader_exports():
@@ -244,6 +244,71 @@ def test_neurons_layer_filter_rejects_nonmatching():
     assert cfg.layer_filter == "not_a_substring"
 
 
+def test_induction_head_score_small_vocab_clear_error():
+    """induction_head_score raises a clear error when vocab is too small."""
+    from archscope.circuits import induction_head_score
+
+    class _TinyModel:
+        class config:
+            vocab_size = 40  # << 2*n_pairs + 100
+        def __call__(self, ids):
+            return torch.zeros(1, ids.shape[1], 40)
+
+    with pytest.raises(ValueError) as ei:
+        induction_head_score(_TinyModel(), n_pairs=20, n_trials=1)
+    assert "vocab window" in str(ei.value).lower() or "n_pairs" in str(ei.value)
+
+
+def test_probefit_direction_and_bias_accessors():
+    """ProbeFit exposes .direction and .bias for linear probes."""
+    from archscope.probes import ProbeFit, ProbeConfig
+    torch.manual_seed(0)
+    pos = torch.randn(40, 8) + 1.5
+    neg = torch.randn(40, 8) - 1.5
+    cfg = ProbeConfig(layer_name="x", probe_type="linear")
+    pf = ProbeFit(cfg, input_dim=8)
+    pf.train(torch.cat([pos, neg]), torch.cat([torch.ones(40), torch.zeros(40)]),
+             epochs=30, batch_size=16)
+    d, b = pf.direction, pf.bias
+    assert d.shape == (8,), f"direction shape: {d.shape}"
+    assert b.dim() == 0, f"bias should be scalar: {b.shape}"
+    # Manual application matches what probe.score does (up to sigmoid).
+    test_act = torch.randn(3, 8)
+    manual_logits = test_act @ d + b
+    via_probe = pf.probe(test_act)
+    assert torch.allclose(manual_logits, via_probe, atol=1e-5), \
+        "direction @ acts + bias should equal probe(acts)"
+
+
+def test_probefit_direction_rejects_mlp():
+    """.direction raises on MLP probes."""
+    from archscope.probes import ProbeFit, ProbeConfig
+    cfg = ProbeConfig(layer_name="x", probe_type="mlp")
+    pf = ProbeFit(cfg, input_dim=8)
+    with pytest.raises(ValueError) as ei:
+        _ = pf.direction
+    assert "linear" in str(ei.value).lower()
+
+
+def test_dim_decompose_rejects_mamba_style_model():
+    """dim_decompose raises on models with no attention/MLP submodules."""
+    from archscope.attribute import dim_decompose
+
+    class _NoSubmods(torch.nn.Module):
+        def forward(self, **kwargs):
+            class Out:
+                logits = torch.zeros(1, 3, 8)
+            return Out()
+
+    with pytest.raises(ValueError) as ei:
+        dim_decompose(_NoSubmods(),
+                      prompt_a={"input_ids": torch.tensor([[1, 2, 3]])},
+                      prompt_b={"input_ids": torch.tensor([[4, 5, 6]])},
+                      layer_indices=[0, 1],
+                      metric_fn=lambda o: 0.0)
+    assert "attention" in str(ei.value).lower() or "submod" in str(ei.value).lower()
+
+
 if __name__ == "__main__":
     # Allow `python tests/test_unit.py` for quick local check
     pytest.main([__file__, "-v"])
The remaining 25 files listed above are unchanged between 0.2.5 and 0.2.7 (0 additions, 0 deletions).