archscope-0.2.2-py3-none-any.whl

archscope/__init__.py ADDED
@@ -0,0 +1,30 @@
+ """archscope: unified mech interp toolkit across small + RNN + transformer.
+
+ Four core methods unified under a single API:
+ - probes: linear/MLP probes over hidden states (Drop the Act inspired)
+ - sae: sparse autoencoders for residual + recurrent state (WriteSAE)
+ - neurons: targeted neuron modulation via contrastive search (Nous Research)
+ - attribute: activation patching + DIM decomposition (Multi-Agent Sycophancy)
+
+ Each method exposes the same architecture-agnostic API:
+ - .extract(model, inputs) -> hidden states / activations
+ - .fit(activations, labels) -> learned tool
+ - .apply(model, inputs) -> modified outputs / scores / explanations
+
+ Designed for cross-architecture comparison: transformer, Mamba/SSM, custom RNN.
+ """
+
+ __version__ = "0.2.2"
+
+ from . import probes, sae, neurons, attribute, backends, circuits, transfer, bench, lens, diff
+
+ # Kazdov backend registers itself on import — optional, only if kazdov repo present
+ try:
+     from . import kazdov_backend  # noqa: F401
+ except ImportError:
+     pass
+
+ __all__ = [
+     "probes", "sae", "neurons", "attribute", "backends",
+     "circuits", "transfer", "bench", "lens", "diff", "__version__",
+ ]
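
For orientation, here is a minimal sketch of the workflow the docstring above describes, using the Backend API defined in backends.py below. The gpt2 checkpoint is just an example; the probe/sae calls are left as comments because probes.py and sae.py are not part of this diff.

from transformers import AutoModelForCausalLM, AutoTokenizer
from archscope.backends import Backend

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("hello world", return_tensors="pt")

backend = Backend.for_model(model)  # auto-detected from config.model_type
acts = backend.extract(inputs, layers=["layer_5.residual"])
print(acts[0].activations.shape)  # torch.Size([1, 2, 768]) for gpt2

# probes/sae/neurons/attribute follow the same .extract/.fit/.apply contract;
# their concrete class names are not shown in this diff, so none are called here.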
archscope/_utils.py ADDED
@@ -0,0 +1,113 @@
+ """Internal utilities — not part of the public API.
+
+ Shared layer-resolution logic used by neurons.py and attribute.py to find
+ the actual nn.Module corresponding to a layer_name string like
+ "layer_5.residual" across different HuggingFace model architectures.
+ """
+ from __future__ import annotations
+ from typing import Any
+
+
+ # Common HF architecture paths to the layer ModuleList.
+ # Order matters: first match wins. Add new architectures here.
+ _LAYER_PATHS: list[tuple[str, str | None]] = [
+     ("model", "layers"),        # Llama / Mistral / Qwen
+     ("transformer", "h"),       # GPT-2 / Falcon
+     ("transformer", "blocks"),  # MPT
+     ("gpt_neox", "layers"),     # GPT-NeoX / Pythia
+     ("backbone", "layers"),     # Mamba / Mamba-2
+     ("layers", None),           # Direct .layers (some custom models, e.g. kazdov)
+     ("h", None),                # Direct .h
+     ("blocks", None),           # Direct .blocks (kazdov)
+ ]
+
+
+ def _parse_layer_index(layer_name: str) -> int | None:
+     """Extract the integer index from a name like 'layer_5.residual'."""
+     try:
+         idx_part = layer_name.split("_")[1].split(".")[0]
+         return int(idx_part)
+     except (IndexError, ValueError):
+         return None
+
+
+ def resolve_layer_module(model: Any, layer_name: str):
+     """Return the nn.Module corresponding to a layer_name across HF naming conventions.
+
+     Handles: Llama, Mistral, Qwen, GPT-2, Falcon, MPT, Pythia, Mamba, custom .blocks.
+
+     Returns None if the layer name cannot be parsed or no path matches.
+     """
+     idx = _parse_layer_index(layer_name)
+     if idx is None:
+         return None
+     for parent_attr, child_attr in _LAYER_PATHS:
+         parent_obj = getattr(model, parent_attr, None)
+         if parent_obj is None:
+             continue
+         layers = parent_obj if child_attr is None else getattr(parent_obj, child_attr, None)
+         if layers is None:
+             continue
+         try:
+             return layers[idx]
+         except (IndexError, TypeError):
+             continue
+     return None
+
+
+ _UNEMBED_PATHS = [
+     "lm_head",       # Llama, Pythia, Mistral, Mamba, kazdov, most HF CausalLMs
+     "embed_out",     # some HF models
+     "output_layer",  # some custom models
+ ]
+
+ _FINAL_NORM_PATHS: list[tuple[str, str | None]] = [
+     ("model", "norm"),                 # Llama / Mistral
+     ("gpt_neox", "final_layer_norm"),  # Pythia
+     ("transformer", "ln_f"),           # GPT-2 / Falcon
+     ("backbone", "norm_f"),            # Mamba
+     ("ln_f", None),                    # kazdov (top-level)
+ ]
+
+
+ def resolve_unembedding(model: Any):
+     """Find the model's unembedding / lm_head module. Returns nn.Module or None."""
+     for path in _UNEMBED_PATHS:
+         m = getattr(model, path, None)
+         if m is not None:
+             return m
+     return None
+
+
+ def resolve_final_norm(model: Any):
+     """Find the model's final pre-unembedding layer norm. Returns module or None."""
+     for parent_attr, child_attr in _FINAL_NORM_PATHS:
+         parent_obj = getattr(model, parent_attr, None)
+         if parent_obj is None:
+             continue
+         norm = parent_obj if child_attr is None else getattr(parent_obj, child_attr, None)
+         if norm is not None:
+             return norm
+     return None
+
+
+ def resolve_subcomponent_module(model: Any, idx: int, component: str):
+     """Find an attention or MLP submodule inside a layer at index `idx`.
+
+     component: "attention" or "mlp".
+     Returns None if the component isn't found.
+     """
+     layer = resolve_layer_module(model, f"layer_{idx}.residual")
+     if layer is None:
+         return None
+     if component == "attention":
+         for attr in ("self_attn", "attn", "attention"):
+             sub = getattr(layer, attr, None)
+             if sub is not None:
+                 return sub
+     elif component == "mlp":
+         for attr in ("mlp", "feed_forward", "ffn"):
+             sub = getattr(layer, attr, None)
+             if sub is not None:
+                 return sub
+     return None
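
A quick sanity check of the resolution tables against a stock HF model: gpt2 resolves through the ("transformer", "h") and ("transformer", "ln_f") paths. These functions are documented as internal, so this is illustration rather than public API.

from transformers import GPT2LMHeadModel

from archscope._utils import (
    resolve_final_norm,
    resolve_layer_module,
    resolve_unembedding,
)

model = GPT2LMHeadModel.from_pretrained("gpt2")

assert resolve_layer_module(model, "layer_5.residual") is model.transformer.h[5]
assert resolve_final_norm(model) is model.transformer.ln_f
assert resolve_unembedding(model) is model.lm_head
assert resolve_layer_module(model, "not_a_layer_name") is None  # unparseable -> None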
archscope/attribute.py ADDED
@@ -0,0 +1,201 @@
+ """Activation patching + DIM decomposition (Multi-Agent Sycophancy, 2605.12991).
+
+ Two methods:
+ - `activation_patch`: run one prompt while splicing in another prompt's
+   activations over a specified layer range. Measures how much of the
+   behavioral gap is "restored" by the patch.
+ - `dim_decompose`: difference-in-means decomposition of attribution per
+   component (MLP vs attention).
+
+ Use cases:
+ - Localize behavior to specific layers (e.g., "L14-L18 restores 96.8% of gap")
+ - Separate attention vs MLP contribution
+ """
+ from __future__ import annotations
+ from dataclasses import dataclass
+
+ import torch
+
+ from .backends import Backend
+ from ._utils import resolve_layer_module, resolve_subcomponent_module
+
+
+ @dataclass
+ class PatchResult:
+     """Outcome of a single activation_patch experiment."""
+     layer_range: tuple[int, int]
+     gap_restored: float     # fraction of behavioral gap closed by patching
+     target_metric: str      # what we measured (e.g., "logit_diff")
+     baseline_metric: float  # metric on the clean source prompt
+     patched_metric: float   # metric on the target prompt with patched activations
+     clean_metric: float     # metric on the unpatched target prompt
+
+
+ @dataclass
+ class DIMResult:
+     """Difference-in-means attribution per component (attention vs MLP)."""
+     components: dict[str, float]  # e.g., {"attention": 0.45, "mlp": 0.02}
+     total: float
+     layer_range: tuple[int, int]
+
+
+ def _remove_hooks(hooks):
+     """Cleanly detach a list of forward-hook handles."""
+     for h in hooks:
+         h.remove()
+
+
+ def activation_patch(
+     model,
+     prompt_source,
+     prompt_target,
+     layer_indices: list[int],
+     metric_fn,
+     backend_hint: str | None = None,
+ ) -> PatchResult:
+     """Replace activations at chosen layers with those from a source prompt.
+
+     Args:
+         model: any model exposing `model(**inputs, return_dict=True).logits`
+         prompt_source: tokenized inputs to extract clean activations from
+         prompt_target: tokenized inputs where activations will be replaced
+         layer_indices: which layer indices to patch
+         metric_fn: function mapping `model_outputs -> scalar` (e.g., logit diff)
+         backend_hint: backend name for extraction
+
+     Returns:
+         PatchResult with the fraction of behavioral gap closed by patching.
+     """
+     backend = Backend.for_model(model, hint=backend_hint)
+     layer_names = [f"layer_{i}.residual" for i in layer_indices]
+
+     # 1. Clean source: extract activations to patch in.
+     src_acts = backend.extract(prompt_source, layers=layer_names)
+
+     # 2. Clean target: baseline metric.
+     with torch.no_grad():
+         target_clean_out = model(**prompt_target, output_hidden_states=False, return_dict=True)
+     clean_metric = metric_fn(target_clean_out)
+
+     # 3. Patched target: hook in source activations.
+     hooks = []
+     for layer_name, src_rec in zip(layer_names, src_acts):
+         module = resolve_layer_module(model, layer_name)
+         if module is None:
+             continue
+         src_h = src_rec.activations
+
+         # Bind src_h via a default argument so each hook keeps its own tensor.
+         def hook(mod, inp, out, replacement=src_h):
+             if isinstance(out, tuple):
+                 return (replacement,) + out[1:]
+             return replacement
+         hooks.append(module.register_forward_hook(hook))
+
+     try:
+         with torch.no_grad():
+             patched_out = model(**prompt_target, output_hidden_states=False, return_dict=True)
+         patched_metric = metric_fn(patched_out)
+     finally:
+         _remove_hooks(hooks)
+
+     # Source baseline (no hooks).
+     with torch.no_grad():
+         src_out = model(**prompt_source, output_hidden_states=False, return_dict=True)
+     source_metric = metric_fn(src_out)
+
+     gap = source_metric - clean_metric
+     gap_restored = 0.0 if abs(gap) < 1e-9 else (patched_metric - clean_metric) / gap
+
+     return PatchResult(
+         layer_range=(min(layer_indices), max(layer_indices)),
+         gap_restored=float(gap_restored),
+         target_metric="custom",
+         baseline_metric=float(source_metric),
+         patched_metric=float(patched_metric),
+         clean_metric=float(clean_metric),
+     )
+
+
+ def dim_decompose(
+     model,
+     prompt_a,
+     prompt_b,
+     layer_indices: list[int],
+     metric_fn,
+     components: tuple[str, ...] = ("attention", "mlp"),
+     backend_hint: str | None = None,  # kept for symmetry with activation_patch
+ ) -> DIMResult:
+     """Decompose a behavioral difference into per-component contributions.
+
+     For each component (default: attention, mlp), captures its output during
+     `prompt_a`, then patches that output into the forward pass on `prompt_b`,
+     and measures the fraction of the metric gap that the patch closes.
+
+     `backend_hint` is accepted but unused (this function uses module hooks
+     directly via `resolve_subcomponent_module`).
+     """
+     del backend_hint  # unused; kept for API symmetry
+
+     with torch.no_grad():
+         out_a = model(**prompt_a, return_dict=True)
+         out_b = model(**prompt_b, return_dict=True)
+     metric_a = metric_fn(out_a)
+     metric_b = metric_fn(out_b)
+     total_gap = metric_a - metric_b
+
+     contributions: dict[str, float] = {}
+     for comp in components:
+         # 1) Capture component outputs during prompt_a.
+         capture_hooks = []
+         src_acts_by_layer: dict[int, list] = {}
+         for idx in layer_indices:
+             module = resolve_subcomponent_module(model, idx, comp)
+             if module is None:
+                 continue
+             captured: list = []
+
+             # Bind the per-layer store via a default argument (same trick as above).
+             def capture(mod, inp, out, store=captured):
+                 store.append(out[0] if isinstance(out, tuple) else out)
+             capture_hooks.append(module.register_forward_hook(capture))
+             src_acts_by_layer[idx] = captured
+
+         try:
+             with torch.no_grad():
+                 model(**prompt_a, return_dict=True)
+         finally:
+             _remove_hooks(capture_hooks)
+
+         # 2) Patch captured outputs into prompt_b's forward pass.
+         patch_hooks = []
+         for idx in layer_indices:
+             if idx not in src_acts_by_layer:
+                 continue
+             module = resolve_subcomponent_module(model, idx, comp)
+             if module is None:
+                 continue
+             stored = src_acts_by_layer[idx]
+             if not stored:
+                 continue
+             captured_out = stored[0]
+
+             def patch(mod, inp, out, repl=captured_out):
+                 if isinstance(out, tuple):
+                     return (repl,) + out[1:]
+                 return repl
+             patch_hooks.append(module.register_forward_hook(patch))
+
+         try:
+             with torch.no_grad():
+                 patched_out = model(**prompt_b, return_dict=True)
+             patched_metric = metric_fn(patched_out)
+         finally:
+             _remove_hooks(patch_hooks)
+
+         # Guard a vanishing gap the same way activation_patch does.
+         if abs(total_gap) < 1e-9:
+             contributions[comp] = 0.0
+         else:
+             contributions[comp] = float((patched_metric - metric_b) / total_gap)
+
+     return DIMResult(
+         components=contributions,
+         total=float(total_gap),
+         layer_range=(min(layer_indices), max(layer_indices)),
+     )
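
A sketch of driving activation_patch end to end, assuming a logit-difference metric over two answer tokens. The prompts and tokens are illustrative: both prompts tokenize to the same length under GPT-2's tokenizer, so the patched activations match shapes, and taking index [0] keeps the sketch safe even if an answer word spans multiple tokens.

from transformers import AutoModelForCausalLM, AutoTokenizer

from archscope.attribute import activation_patch

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

src = tok("The capital of France is", return_tensors="pt")
tgt = tok("The capital of Italy is", return_tensors="pt")
paris = tok(" Paris")["input_ids"][0]
rome = tok(" Rome")["input_ids"][0]

def logit_diff(out):
    # Scalar preference for " Paris" over " Rome" at the last position.
    last = out.logits[0, -1]
    return (last[paris] - last[rome]).item()

result = activation_patch(model, src, tgt, layer_indices=[4, 5, 6], metric_fn=logit_diff)
print(f"Patching L4-L6 restores {result.gap_restored:.1%} of the behavioral gap")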
archscope/backends.py ADDED
@@ -0,0 +1,236 @@
+ """Architecture-agnostic activation extraction.
+
+ The core abstraction: a `Backend` knows how to hook into a model and pull out
+ hidden states at named layers, regardless of underlying framework
+ (PyTorch/JAX/custom).
+
+ Three backends implemented:
+ - TransformerBackend: HuggingFace transformers (residual stream per layer)
+ - MambaBackend: state-space models (hidden state + ssm state per layer)
+ - RecurrentBackend: generic RNN-like (extracts hidden state per timestep)
+
+ Custom architectures (e.g., kazdov MoBE-BCN) register via Backend.register().
+ """
+ from __future__ import annotations
+ import abc
+ from dataclasses import dataclass
+ from typing import Any
+
+ import torch
+
+
+ @dataclass
+ class ActivationRecord:
+     """Captured activations from a single forward pass.
+
+     Attributes:
+         layer_name: identifier of the layer (e.g., "layer_5.residual")
+         activations: tensor, typically of shape (batch, seq_len, hidden_dim)
+         meta: arch-specific metadata (e.g., {'kind': 'residual'} or {'kind': 'ssm_state'})
+     """
+     layer_name: str
+     activations: Any  # torch.Tensor or jax.Array
+     meta: dict
+
+
+ class Backend(abc.ABC):
+     """Abstract interface — extract activations from any model architecture."""
+
+     _registry: dict[str, type["Backend"]] = {}
+
+     @classmethod
+     def register(cls, name: str):
+         def deco(klass):
+             cls._registry[name] = klass
+             return klass
+         return deco
+
+     @classmethod
+     def for_model(cls, model: Any, hint: str | None = None) -> "Backend":
+         """Auto-detect or use hint to select backend."""
+         if hint and hint in cls._registry:
+             return cls._registry[hint](model)
+         # Auto-detect via config.model_type introspection.
+         model_type = getattr(getattr(model, "config", None), "model_type", None)
+         if model_type in ("mamba", "mamba2"):
+             return cls._registry["mamba"](model)
+         if model_type in ("llama", "mistral", "qwen2", "qwen3", "gpt2", "gpt_neox", "falcon", "mpt"):
+             return cls._registry["transformer"](model)
+         # Default fallback
+         if "recurrent" in cls._registry:
+             return cls._registry["recurrent"](model)
+         raise ValueError(f"No backend matches model {type(model).__name__}. Register via Backend.register('name').")
+
+     def __init__(self, model: Any):
+         self.model = model
+
+     @abc.abstractmethod
+     def layer_names(self) -> list[str]:
+         """Return list of layer identifiers we can hook."""
+         ...
+
+     @abc.abstractmethod
+     def extract(self, inputs: Any, layers: list[str] | None = None) -> list[ActivationRecord]:
+         """Run forward pass, return activations at requested layers (all if None)."""
+         ...
+
+     @abc.abstractmethod
+     def hidden_dim(self, layer_name: str) -> int:
+         """Dimensionality of activations at a given layer."""
+         ...
+
+
+ @Backend.register("transformer")
+ class TransformerBackend(Backend):
+     """HuggingFace transformers backend — extracts residual stream per layer."""
+
+     def layer_names(self) -> list[str]:
+         # HF configs expose num_hidden_layers for decoder stacks.
+         n_layers = getattr(self.model.config, "num_hidden_layers", 0)
+         return [f"layer_{i}.residual" for i in range(n_layers)]
+
+     def extract(self, inputs, layers=None):
+         layers = layers or self.layer_names()
+         # Use HF's output_hidden_states=True for clean extraction.
+         # Wrap in no_grad: extraction shouldn't build a backward graph.
+         with torch.no_grad():
+             outputs = self.model(**inputs, output_hidden_states=True, return_dict=True)
+         records = []
+         hidden_states = outputs.hidden_states  # tuple of (n_layers + 1) tensors
+         for layer_name in layers:
+             # Parse "layer_N.residual" -> N + 1 (since hidden_states[0] is the embedding output)
+             idx = int(layer_name.split("_")[1].split(".")[0]) + 1
+             records.append(ActivationRecord(
+                 layer_name=layer_name,
+                 activations=hidden_states[idx],
+                 meta={"kind": "residual", "arch": "transformer"},
+             ))
+         return records
+
+     def hidden_dim(self, layer_name: str) -> int:
+         return self.model.config.hidden_size
+
+
+ @Backend.register("mamba")
+ class MambaBackend(Backend):
+     """Mamba/Mamba-2 backend — extracts residual stream AND SSM recurrent state.
+
+     Works with HuggingFace MambaForCausalLM (and Mamba2ForCausalLM).
+
+     Two flavors of activations exposed:
+     - `layer_N.residual` -> residual stream after block N (B, T, hidden_size)
+     - `layer_N.ssm_state` -> final SSM recurrent state after processing the
+       sequence at block N: shape (B, intermediate_size, ssm_state_size).
+       This exposes the recurrent state used by Mamba-style models —
+       useful when experiments need access to memory-like state rather
+       than residual activations alone.
+     """
+
+     def layer_names(self) -> list[str]:
+         n_layers = getattr(self.model.config, "n_layer", 0) or getattr(self.model.config, "num_hidden_layers", 0)
+         out = []
+         for i in range(n_layers):
+             out.append(f"layer_{i}.residual")
+             out.append(f"layer_{i}.ssm_state")
+         return out
+
+     def extract(self, inputs, layers=None):
+         layers = layers or self.layer_names()
+
+         need_residual = any(".residual" in ln for ln in layers)
+         need_ssm = any(".ssm_state" in ln for ln in layers)
+
+         with torch.no_grad():
+             if need_ssm:
+                 # With use_cache=True, MambaForCausalLM allocates a MambaCache
+                 # and returns it as outputs.cache_params; its ssm_states hold
+                 # the final recurrent state per layer. Exact cache layout
+                 # varies across transformers versions, so access defensively.
+                 outputs = self.model(
+                     **inputs,
+                     use_cache=True,
+                     output_hidden_states=need_residual,
+                     return_dict=True,
+                 )
+                 cache = getattr(outputs, "cache_params", None)
+             else:
+                 outputs = self.model(
+                     **inputs, output_hidden_states=True, return_dict=True
+                 )
+                 cache = None
+
+         records = []
+         for layer_name in layers:
+             idx = int(layer_name.split("_")[1].split(".")[0])
+             if ".residual" in layer_name:
+                 records.append(ActivationRecord(
+                     layer_name=layer_name,
+                     activations=outputs.hidden_states[idx + 1].detach(),  # [0] is the embedding output
+                     meta={"kind": "residual", "arch": "mamba", "shape_meaning": "(B, T, hidden_size)"},
+                 ))
+             elif ".ssm_state" in layer_name:
+                 ssm_states = getattr(cache, "ssm_states", None)
+                 if ssm_states is None or idx >= len(ssm_states):
+                     continue
+                 ssm = ssm_states[idx]
+                 if ssm is None:
+                     continue
+                 records.append(ActivationRecord(
+                     layer_name=layer_name,
+                     activations=ssm.detach(),
+                     meta={
+                         "kind": "ssm_state",
+                         "arch": "mamba",
+                         "shape_meaning": "(B, intermediate_size, ssm_state_size)",
+                         "d_inner": ssm.shape[-2],
+                         "d_state": ssm.shape[-1],
+                     },
+                 ))
+         return records
+
+     def hidden_dim(self, layer_name: str) -> int:
+         if ".ssm_state" in layer_name:
+             # SSM state is (intermediate_size × ssm_state_size)
+             d_inner = getattr(self.model.config, "intermediate_size", None)
+             d_state = getattr(self.model.config, "state_size", None)
+             if d_inner and d_state:
+                 return d_inner * d_state
+             # Fallback: introspect from a block
+             mixer = self.model.backbone.layers[0].mixer
+             return mixer.intermediate_size * mixer.ssm_state_size
+         return getattr(self.model.config, "hidden_size", None) or getattr(self.model.config, "d_model", None)
+
+
+ @Backend.register("recurrent")
+ class RecurrentBackend(Backend):
+     """Generic recurrent backend — for custom RNN-family models (e.g., kazdov MoBE-BCN).
+
+     Expects the model to expose:
+     - .get_hidden_states(inputs) -> dict[str, tensor]
+     OR user-registered forward hooks.
+
+     Custom models should subclass and override `extract`.
+     """
+
+     def layer_names(self) -> list[str]:
+         # Default: try common RNN attribute names
+         if hasattr(self.model, "n_layer"):
+             return [f"layer_{i}.hidden" for i in range(self.model.n_layer)]
+         if hasattr(self.model, "num_layers"):
+             return [f"layer_{i}.hidden" for i in range(self.model.num_layers)]
+         return ["layer_0.hidden"]
+
+     def extract(self, inputs, layers=None):
+         # Generic — subclasses should override.
+         if hasattr(self.model, "get_hidden_states"):
+             hs = self.model.get_hidden_states(inputs)
+             return [
+                 ActivationRecord(layer_name=k, activations=v, meta={"kind": "hidden", "arch": "recurrent"})
+                 for k, v in hs.items()
+             ]
+         raise NotImplementedError(
+             f"RecurrentBackend default extract() is not implemented for {type(self.model).__name__}. "
+             "Subclass and override extract(), or call model.get_hidden_states() yourself."
+         )
+
+     def hidden_dim(self, layer_name: str) -> int:
+         for attr in ("d_model", "hidden_size", "d_hidden", "n_embd"):
+             if hasattr(self.model, attr):
+                 return getattr(self.model, attr)
+         raise ValueError("Cannot infer hidden_dim — override hidden_dim() in subclass.")
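
Registering a custom backend is the decorator plus the three abstract methods. A minimal sketch for a hypothetical hand-rolled RNN that exposes .cells, .d_hidden, and a get_hidden_states() helper (none of which archscope itself provides):

from archscope.backends import ActivationRecord, Backend

@Backend.register("toy_rnn")
class ToyRNNBackend(Backend):
    def layer_names(self):
        return [f"layer_{i}.hidden" for i in range(len(self.model.cells))]

    def extract(self, inputs, layers=None):
        layers = layers or self.layer_names()
        hs = self.model.get_hidden_states(inputs)  # assumed: dict[layer_name, Tensor]
        return [
            ActivationRecord(layer_name=ln, activations=hs[ln],
                             meta={"kind": "hidden", "arch": "toy_rnn"})
            for ln in layers
            if ln in hs
        ]

    def hidden_dim(self, layer_name):
        return self.model.d_hidden

# Explicit selection via the hint; without it, auto-detection would fall back
# to the generic RecurrentBackend:
# backend = Backend.for_model(my_rnn, hint="toy_rnn")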