interpkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- interpkit/__init__.py +15 -0
- interpkit/cli/__init__.py +0 -0
- interpkit/cli/main.py +337 -0
- interpkit/core/__init__.py +0 -0
- interpkit/core/discovery.py +228 -0
- interpkit/core/html.py +375 -0
- interpkit/core/inputs.py +117 -0
- interpkit/core/model.py +551 -0
- interpkit/core/plot.py +352 -0
- interpkit/core/registry.py +82 -0
- interpkit/core/render.py +465 -0
- interpkit/core/tl_compat.py +174 -0
- interpkit/ops/__init__.py +0 -0
- interpkit/ops/ablate.py +90 -0
- interpkit/ops/activations.py +67 -0
- interpkit/ops/attention.py +234 -0
- interpkit/ops/attribute.py +206 -0
- interpkit/ops/diff.py +79 -0
- interpkit/ops/inspect.py +14 -0
- interpkit/ops/lens.py +151 -0
- interpkit/ops/patch.py +112 -0
- interpkit/ops/probe.py +128 -0
- interpkit/ops/sae.py +212 -0
- interpkit/ops/steer.py +118 -0
- interpkit/ops/trace.py +182 -0
- interpkit-0.1.0.dist-info/METADATA +295 -0
- interpkit-0.1.0.dist-info/RECORD +31 -0
- interpkit-0.1.0.dist-info/WHEEL +5 -0
- interpkit-0.1.0.dist-info/entry_points.txt +2 -0
- interpkit-0.1.0.dist-info/licenses/LICENSE +21 -0
- interpkit-0.1.0.dist-info/top_level.txt +1 -0
interpkit/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""interpkit — mech interp for any HuggingFace model."""
|
|
2
|
+
|
|
3
|
+
from interpkit.core.model import load
|
|
4
|
+
from interpkit.core.registry import register
|
|
5
|
+
from interpkit.core.tl_compat import list_tl_hooks, to_native_name, to_tl_name
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def diff(model_a, model_b, input_data, *, save=None):
    """Compare activations between two models on the same input.

    Thin convenience wrapper around ``interpkit.ops.diff.run_diff``; the
    import is deferred so ``import interpkit`` stays fast.
    """
    # Deferred import: keeps heavy op modules off the package import path.
    from interpkit.ops.diff import run_diff

    return run_diff(model_a, model_b, input_data, save=save)


__all__ = ["load", "register", "diff", "to_tl_name", "to_native_name", "list_tl_hooks"]
|
|
File without changes
|
interpkit/cli/main.py
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
"""CLI entry point — Typer app with all interpkit commands."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
from rich.text import Text
|
|
12
|
+
|
|
13
|
+
# Top-level Typer application; every @app.command() below registers on it.
app = typer.Typer(
    name="interpkit",
    help="Mech interp for any HuggingFace model.",
    no_args_is_help=False,  # a bare `interpkit` shows the custom panel in main()
    add_completion=False,
    rich_markup_mode="rich",
)
# Shared Rich console used by every command for styled output.
console = Console()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _load_model(model_name: str, device: str | None = None):
    """Load *model_name* through interpkit, showing a spinner meanwhile."""
    # Deferred import: pulling in the model loader (and torch) is slow.
    from interpkit.core.model import load

    with console.status(f"Loading {model_name}..."):
        loaded = load(model_name, device=device)
    return loaded
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ══════════════════════════════════════════════════════════════════
|
|
31
|
+
# help — rich overview panel
|
|
32
|
+
# ══════════════════════════════════════════════════════════════════
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@app.callback(invoke_without_command=True)
def main(ctx: typer.Context) -> None:
    """Mech interp for any HuggingFace model."""
    # A named subcommand is dispatched by Typer; the overview panel below
    # renders only for a bare `interpkit` invocation.
    if ctx.invoked_subcommand is not None:
        return

    # NOTE(review): the ASCII-art spacing below was reconstructed from a
    # whitespace-mangled view of this file — verify the rendering by eye.
    logo = r"""
 ___       _                  _  ___ _
|_ _|_ __ | |_ ___ _ __ _ __ | |/ (_) |_
 | || '_ \| __/ _ \ '__| '_ \| ' /| | __|
 | || | | | ||  __/ |  | |_) | . \| | |_
|___|_| |_|\__\___|_|  | .__/|_|\_\_|\__|
                       |_|
"""
    console.print(f"[bold cyan]{logo}[/bold cyan]", highlight=False)

    # Three-column command overview: name, description, example invocation.
    table = Table(
        show_header=True, header_style="bold", show_lines=False,
        pad_edge=True, expand=True,
    )
    table.add_column("Command", style="cyan", no_wrap=True)
    table.add_column("Description")
    table.add_column("Example", style="dim")

    # Rows with an empty command act as section headers / spacers.
    rows = [
        ("", "[bold]Core Operations[/bold]", ""),
        ("inspect", "Module tree with types, params, roles", "interpkit inspect gpt2"),
        ("patch", "Activation patching at a module", "interpkit patch gpt2 --clean '...' --corrupted '...' --at transformer.h.8.mlp"),
        ("trace", "Causal tracing — rank modules by effect", "interpkit trace gpt2 --clean '...' --corrupted '...'"),
        ("lens", "Logit lens — project layers to vocab", "interpkit lens gpt2 'The capital of France is'"),
        ("attribute", "Gradient saliency over inputs", "interpkit attribute gpt2 'The capital of France is'"),
        ("", "", ""),
        ("", "[bold]Analysis Operations[/bold]", ""),
        ("activations", "Extract raw activation tensors", "interpkit activations gpt2 '...' --at transformer.h.8"),
        ("ablate", "Zero/mean ablate a component", "interpkit ablate gpt2 '...' --at transformer.h.8.mlp"),
        ("attention", "Visualize attention patterns", "interpkit attention gpt2 '...' --layer 8"),
        ("steer", "Apply a steering vector", "interpkit steer gpt2 '...' --positive Love --negative Hate --at transformer.h.8"),
        ("probe", "Linear probe on activations", "interpkit probe gpt2 --at transformer.h.8 --data data.json"),
        ("diff", "Compare two models' activations", "interpkit diff gpt2 my-finetuned-gpt2 '...'"),
        ("", "", ""),
        ("", "[bold]Advanced[/bold]", ""),
        ("features", "SAE feature decomposition", "interpkit features gpt2 '...' --at transformer.h.8 --sae jbloom/..."),
    ]

    for cmd, desc, example in rows:
        table.add_row(cmd, desc, example)

    panel = Panel(
        table,
        title="[bold cyan]Commands[/bold cyan]",
        subtitle="[dim]Mech interp for any HuggingFace model.[/dim]",
        border_style="cyan",
        padding=(1, 2),
    )
    console.print()
    console.print(panel)

    # Closing hints: flags shared by most commands, then per-command help.
    save_hint = Text.assemble(
        (" Tip: ", "bold"),
        ("Most commands accept ", ""),
        ("--save path.png", "bold green"),
        (" to export a matplotlib figure and ", ""),
        ("--html path.html", "bold green"),
        (" for interactive visualizations.\n", ""),
    )
    console.print(save_hint)
    console.print(" Run [bold cyan]interpkit <command> --help[/bold cyan] for detailed usage.\n")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ══════════════════════════════════════════════════════════════════
|
|
105
|
+
# inspect
|
|
106
|
+
# ══════════════════════════════════════════════════════════════════
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@app.command()
def inspect(
    model_name: str = typer.Argument(..., help="HuggingFace model ID (e.g. gpt2, microsoft/resnet-50)"),
    device: Optional[str] = typer.Option(None, help="Device (cpu, cuda, mps). Auto-detected if omitted."),
) -> None:
    """Print the model's module tree with types, param counts, and detected roles."""
    # Load, then delegate straight to the model wrapper's inspector.
    _load_model(model_name, device=device).inspect()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ══════════════════════════════════════════════════════════════════
|
|
120
|
+
# patch
|
|
121
|
+
# ══════════════════════════════════════════════════════════════════
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@app.command()
def patch(
    model_name: str = typer.Argument(..., help="HuggingFace model ID"),
    clean: str = typer.Option(..., "--clean", help="Clean input (text string or image path)"),
    corrupted: str = typer.Option(..., "--corrupted", help="Corrupted input (text string or image path)"),
    at: str = typer.Option(..., "--at", help="Module name to patch (e.g. transformer.h.8.mlp)"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Activation patching: swap one module's output from clean into corrupted run."""
    model = _load_model(model_name, device=device)
    model.patch(clean, corrupted, at=at)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# ══════════════════════════════════════════════════════════════════
|
|
138
|
+
# trace
|
|
139
|
+
# ══════════════════════════════════════════════════════════════════
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
@app.command()
def trace(
    model_name: str = typer.Argument(..., help="HuggingFace model ID"),
    clean: str = typer.Option(..., "--clean", help="Clean input"),
    corrupted: str = typer.Option(..., "--corrupted", help="Corrupted input"),
    top_k: int = typer.Option(20, "--top-k", help="Scan top-K modules by proxy score. 0 = scan all."),
    save: Optional[str] = typer.Option(None, "--save", help="Save bar chart to file (e.g. trace.png)"),
    html_path: Optional[str] = typer.Option(None, "--html", help="Save interactive HTML to file (e.g. trace.html)"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Causal tracing: rank modules by how much patching them restores clean output."""
    model = _load_model(model_name, device=device)
    # A non-positive --top-k means "scan everything" (None internally).
    model.trace(
        clean,
        corrupted,
        top_k=None if top_k <= 0 else top_k,
        save=save,
        html=html_path,
    )
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# ══════════════════════════════════════════════════════════════════
|
|
159
|
+
# lens
|
|
160
|
+
# ══════════════════════════════════════════════════════════════════
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@app.command()
def lens(
    model_name: str = typer.Argument(..., help="HuggingFace model ID"),
    text: str = typer.Argument(..., help="Input text"),
    save: Optional[str] = typer.Option(None, "--save", help="Save heatmap to file (e.g. lens.png)"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Logit lens: project each layer's hidden state to vocabulary space."""
    _load_model(model_name, device=device).lens(text, save=save)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# ══════════════════════════════════════════════════════════════════
|
|
176
|
+
# attribute
|
|
177
|
+
# ══════════════════════════════════════════════════════════════════
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@app.command()
def attribute(
    model_name: str = typer.Argument(..., help="HuggingFace model ID"),
    input_data: str = typer.Argument(..., help="Input text or image path"),
    target: Optional[int] = typer.Option(None, "--target", help="Target class/token index for attribution"),
    save: Optional[str] = typer.Option(None, "--save", help="Save figure to file (e.g. attribution.png)"),
    html_path: Optional[str] = typer.Option(None, "--html", help="Save interactive HTML to file (e.g. attribution.html)"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Gradient saliency over input tokens or pixels."""
    model = _load_model(model_name, device=device)
    model.attribute(input_data, target=target, save=save, html=html_path)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# ══════════════════════════════════════════════════════════════════
|
|
195
|
+
# activations
|
|
196
|
+
# ══════════════════════════════════════════════════════════════════
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@app.command()
def activations(
    model_name: str = typer.Argument(..., help="HuggingFace model ID"),
    input_data: str = typer.Argument(..., help="Input text or image path"),
    at: str = typer.Option(..., "--at", help="Module name(s) to extract, comma-separated"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Extract and display activation statistics at named modules."""
    model = _load_model(model_name, device=device)
    names = [part.strip() for part in at.split(",")]
    # A single module name is passed through as a bare string, several as a list.
    target = names[0] if len(names) == 1 else names
    model.activations(input_data, at=target)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# ══════════════════════════════════════════════════════════════════
|
|
216
|
+
# ablate
|
|
217
|
+
# ══════════════════════════════════════════════════════════════════
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
@app.command()
def ablate(
    model_name: str = typer.Argument(..., help="HuggingFace model ID"),
    input_data: str = typer.Argument(..., help="Input text or image path"),
    at: str = typer.Option(..., "--at", help="Module name to ablate"),
    method: str = typer.Option("zero", "--method", help="Ablation method: zero or mean"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Zero or mean ablate a module and measure the effect on output."""
    model = _load_model(model_name, device=device)
    model.ablate(input_data, at=at, method=method)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# ══════════════════════════════════════════════════════════════════
|
|
234
|
+
# attention
|
|
235
|
+
# ══════════════════════════════════════════════════════════════════
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
@app.command()
def attention(
    model_name: str = typer.Argument(..., help="HuggingFace model ID"),
    input_data: str = typer.Argument(..., help="Input text"),
    layer: Optional[int] = typer.Option(None, "--layer", help="Specific layer index"),
    head: Optional[int] = typer.Option(None, "--head", help="Specific head index"),
    save: Optional[str] = typer.Option(None, "--save", help="Save heatmap to file (e.g. attention.png)"),
    html_path: Optional[str] = typer.Option(None, "--html", help="Save interactive HTML to file (e.g. attention.html)"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Show attention patterns for transformer models."""
    model = _load_model(model_name, device=device)
    model.attention(input_data, layer=layer, head=head, save=save, html=html_path)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# ══════════════════════════════════════════════════════════════════
|
|
254
|
+
# steer
|
|
255
|
+
# ══════════════════════════════════════════════════════════════════
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
@app.command()
def steer(
    model_name: str = typer.Argument(..., help="HuggingFace model ID"),
    input_data: str = typer.Argument(..., help="Input text to steer"),
    positive: str = typer.Option(..., "--positive", help="Positive direction text"),
    negative: str = typer.Option(..., "--negative", help="Negative direction text"),
    at: str = typer.Option(..., "--at", help="Module name to apply steering at"),
    scale: float = typer.Option(2.0, "--scale", help="Steering vector scale factor"),
    save: Optional[str] = typer.Option(None, "--save", help="Save comparison chart to file"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Extract a steering vector and apply it during inference."""
    model = _load_model(model_name, device=device)
    # Direction = activations(positive) - activations(negative) at `at`.
    direction = model.steer_vector(positive, negative, at=at)
    model.steer(input_data, vector=direction, at=at, scale=scale, save=save)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# ══════════════════════════════════════════════════════════════════
|
|
276
|
+
# probe
|
|
277
|
+
# ══════════════════════════════════════════════════════════════════
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
@app.command()
def probe(
    model_name: str = typer.Argument(..., help="HuggingFace model ID"),
    at: str = typer.Option(..., "--at", help="Module name to probe"),
    data: str = typer.Option(..., "--data", help="JSON file with {texts: [...], labels: [...]}"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Train a linear probe on activations to test linear separability."""
    import json
    from pathlib import Path

    # Dataset file must contain "texts" and "labels" keys (parallel lists).
    dataset = json.loads(Path(data).read_text())
    model = _load_model(model_name, device=device)
    model.probe(texts=dataset["texts"], labels=dataset["labels"], at=at)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
# ══════════════════════════════════════════════════════════════════
|
|
297
|
+
# diff
|
|
298
|
+
# ══════════════════════════════════════════════════════════════════
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
@app.command()
def diff(
    model_a_name: str = typer.Argument(..., help="First model (e.g. gpt2)"),
    model_b_name: str = typer.Argument(..., help="Second model (e.g. my-finetuned-gpt2)"),
    input_data: str = typer.Argument(..., help="Input text to compare on"),
    save: Optional[str] = typer.Option(None, "--save", help="Save bar chart to file"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Compare activations between two models on the same input."""
    import interpkit

    first = _load_model(model_a_name, device=device)
    second = _load_model(model_b_name, device=device)
    interpkit.diff(first, second, input_data, save=save)
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
# ══════════════════════════════════════════════════════════════════
|
|
318
|
+
# features (SAE)
|
|
319
|
+
# ══════════════════════════════════════════════════════════════════
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
@app.command()
def features(
    model_name: str = typer.Argument(..., help="HuggingFace model ID (e.g. gpt2)"),
    input_data: str = typer.Argument(..., help="Input text"),
    at: str = typer.Option(..., "--at", help="Module name to decompose (e.g. transformer.h.8)"),
    sae: str = typer.Option(..., "--sae", help="HuggingFace repo ID for the SAE weights"),
    top_k: int = typer.Option(20, "--top-k", help="Number of top features to display"),
    device: Optional[str] = typer.Option(None, help="Device"),
) -> None:
    """Decompose activations through a Sparse Autoencoder into interpretable features."""
    model = _load_model(model_name, device=device)
    model.features(input_data, at=at, sae=sae, top_k=top_k)
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
if __name__ == "__main__":
    # Allow direct execution (python -m interpkit.cli.main) as well as the
    # console-script entry point declared in entry_points.txt.
    app()
|
|
File without changes
|
|
interpkit/core/discovery.py
ADDED
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""Auto-discover model structure from HF config, module name heuristics, and forward pass."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import torch
|
|
10
|
+
import torch.nn as nn
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# ---------------------------------------------------------------------------
|
|
14
|
+
# Heuristic patterns for semantic module role detection
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
# Each regex matches a dotted-module-name segment (anchored at start or ".")
# so e.g. "attn" matches "h.0.attn" but not "battn". Used by _classify_role.
_ATTENTION_PATTERNS = re.compile(
    r"(^|\.)(self_attn|attn|attention|mha|multi_head_attention)(\.|\b)", re.IGNORECASE
)
_MLP_PATTERNS = re.compile(
    r"(^|\.)(mlp|ffn|feed_forward|dense|fc[_\d]|intermediate)(\.|\b)", re.IGNORECASE
)
_HEAD_PATTERNS = re.compile(
    r"(^|\.)(lm_head|head|classifier|output_projection|qa_outputs)(\.|\b)", re.IGNORECASE
)
_NORM_PATTERNS = re.compile(
    r"(^|\.)(layer_?norm|rms_?norm|norm|ln_f|ln_\d)(\.|\b)", re.IGNORECASE
)
_EMBED_PATTERNS = re.compile(
    r"(^|\.)(embed|wte|wpe|embedding|token_embedding|position_embedding)(\.|\b)",
    re.IGNORECASE,
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class ModuleInfo:
    """Discovered information about a single named module."""

    name: str  # dotted path within the model, e.g. "transformer.h.0.mlp"
    type_name: str  # class name of the module, e.g. "GPT2MLP"
    param_count: int  # parameters owned directly (children excluded)
    output_shape: tuple[int, ...] | None = None  # filled by discover() when a dummy forward pass runs
    role: str | None = None  # "attention", "mlp", "head", "norm", "embed", or None
|
|
45
|
+
|
|
46
|
+
@dataclass
class ModelArchInfo:
    """Aggregated architecture information for a model."""

    arch_family: str | None = None  # e.g. "GPT2LMHeadModel", "MambaForCausalLM"
    num_layers: int | None = None  # from HF config, when present
    hidden_size: int | None = None
    num_attention_heads: int | None = None
    vocab_size: int | None = None
    has_lm_head: bool = False  # True when an unembedding module was found
    output_head_name: str | None = None  # same module as unembedding_name (see discover())
    unembedding_name: str | None = None
    modules: list[ModuleInfo] = field(default_factory=list)  # every named submodule
    layer_names: list[str] = field(default_factory=list)  # repeated blocks, e.g. ["transformer.h.0", ...]
    is_tl_model: bool = False  # set externally; discover() leaves the default

    @property
    def is_language_model(self) -> bool:
        # An LM needs both an output head and a known unembedding module.
        return self.has_lm_head and self.unembedding_name is not None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _classify_role(name: str) -> str | None:
    """Map a module name to its semantic role via the heuristic regexes.

    Order matters and mirrors the original precedence: "head" first so that
    e.g. ``lm_head`` is not swallowed by a broader pattern.
    """
    checks = (
        (_HEAD_PATTERNS, "head"),
        (_ATTENTION_PATTERNS, "attention"),
        (_MLP_PATTERNS, "mlp"),
        (_NORM_PATTERNS, "norm"),
        (_EMBED_PATTERNS, "embed"),
    )
    for pattern, role in checks:
        if pattern.search(name):
            return role
    return None
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _count_params(module: nn.Module) -> int:
|
|
82
|
+
return sum(p.numel() for p in module.parameters(recurse=False))
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _parse_hf_config(model: nn.Module) -> dict[str, Any]:
|
|
86
|
+
"""Extract architecture metadata from an HF model's config, if present."""
|
|
87
|
+
config = getattr(model, "config", None)
|
|
88
|
+
if config is None:
|
|
89
|
+
return {}
|
|
90
|
+
info: dict[str, Any] = {}
|
|
91
|
+
info["arch_family"] = type(model).__name__
|
|
92
|
+
|
|
93
|
+
for attr in ("num_hidden_layers", "n_layer", "num_layers", "n_layers"):
|
|
94
|
+
val = getattr(config, attr, None)
|
|
95
|
+
if val is not None:
|
|
96
|
+
info["num_layers"] = val
|
|
97
|
+
break
|
|
98
|
+
|
|
99
|
+
for attr in ("hidden_size", "n_embd", "d_model"):
|
|
100
|
+
val = getattr(config, attr, None)
|
|
101
|
+
if val is not None:
|
|
102
|
+
info["hidden_size"] = val
|
|
103
|
+
break
|
|
104
|
+
|
|
105
|
+
for attr in ("num_attention_heads", "n_head", "num_heads"):
|
|
106
|
+
val = getattr(config, attr, None)
|
|
107
|
+
if val is not None:
|
|
108
|
+
info["num_attention_heads"] = val
|
|
109
|
+
break
|
|
110
|
+
|
|
111
|
+
info["vocab_size"] = getattr(config, "vocab_size", None)
|
|
112
|
+
return info
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _find_unembedding(model: nn.Module) -> str | None:
    """Name of the first module that looks like an LM head and has weights."""
    candidates = (
        name
        for name, module in model.named_modules()
        if _HEAD_PATTERNS.search(name) and hasattr(module, "weight")
    )
    # named_modules() yields in registration order, so "first" is stable.
    return next(candidates, None)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _detect_layers(modules: list[ModuleInfo]) -> list[str]:
|
|
124
|
+
"""Identify repeated structural blocks that look like transformer/SSM layers.
|
|
125
|
+
|
|
126
|
+
Strategy: find modules whose names follow a pattern like ``something.N``
|
|
127
|
+
where N is a sequential integer, and whose siblings have identical structure.
|
|
128
|
+
We pick the longest such group.
|
|
129
|
+
"""
|
|
130
|
+
pattern = re.compile(r"^(.+)\.(\d+)$")
|
|
131
|
+
groups: dict[str, list[str]] = {}
|
|
132
|
+
for m in modules:
|
|
133
|
+
match = pattern.match(m.name)
|
|
134
|
+
if match:
|
|
135
|
+
prefix = match.group(1)
|
|
136
|
+
groups.setdefault(prefix, []).append(m.name)
|
|
137
|
+
|
|
138
|
+
if not groups:
|
|
139
|
+
return []
|
|
140
|
+
|
|
141
|
+
best_prefix = max(groups, key=lambda k: len(groups[k]))
|
|
142
|
+
layers = sorted(groups[best_prefix], key=lambda n: int(n.rsplit(".", 1)[-1]))
|
|
143
|
+
return layers
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def discover(
    model: nn.Module,
    dummy_input: Any | None = None,
) -> ModelArchInfo:
    """Run full auto-discovery on a model.

    Combines three sources: HF config metadata (_parse_hf_config), name
    heuristics (_classify_role, _find_unembedding, _detect_layers), and —
    when ``dummy_input`` is given — a hooked forward pass for output shapes.

    Parameters
    ----------
    model:
        Any ``nn.Module``, optionally with an HF ``.config`` attribute.
    dummy_input:
        If provided, used for a forward pass to capture output shapes.
        Can be a tensor, dict of tensors, or tuple of tensors.
    """
    hf_meta = _parse_hf_config(model)

    # Enumerate all named modules (the "" entry is the model itself — skip).
    module_infos: list[ModuleInfo] = []
    for name, mod in model.named_modules():
        if name == "":
            continue
        info = ModuleInfo(
            name=name,
            type_name=type(mod).__name__,
            param_count=_count_params(mod),
            role=_classify_role(name),
        )
        module_infos.append(info)

    # Output shape enumeration via hooks
    if dummy_input is not None:
        shapes: dict[str, tuple[int, ...]] = {}
        hooks = []

        # Factory avoids the late-binding-closure pitfall: each hook must
        # capture its own module name, not the loop variable.
        def _make_hook(mod_name: str):
            def hook_fn(_mod: nn.Module, _inp: Any, output: Any) -> None:
                if isinstance(output, torch.Tensor):
                    shapes[mod_name] = tuple(output.shape)
                elif isinstance(output, (tuple, list)) and len(output) > 0:
                    # Many HF modules return tuples; record the first tensor.
                    first = output[0]
                    if isinstance(first, torch.Tensor):
                        shapes[mod_name] = tuple(first.shape)
            return hook_fn

        for name, mod in model.named_modules():
            if name == "":
                continue
            hooks.append(mod.register_forward_hook(_make_hook(name)))

        # Hooks are always removed, even if the forward pass raises.
        try:
            with torch.no_grad():
                if isinstance(dummy_input, dict):
                    model(**dummy_input)
                elif isinstance(dummy_input, (tuple, list)):
                    model(*dummy_input)
                else:
                    model(dummy_input)
        finally:
            for h in hooks:
                h.remove()

        # Modules whose hook never fired (or returned no tensor) get None.
        for info in module_infos:
            info.output_shape = shapes.get(info.name)

    # Find unembedding
    unembed_name = _find_unembedding(model)
    has_lm_head = unembed_name is not None

    # Detect layer names
    layer_names = _detect_layers(module_infos)

    return ModelArchInfo(
        arch_family=hf_meta.get("arch_family"),
        num_layers=hf_meta.get("num_layers"),
        hidden_size=hf_meta.get("hidden_size"),
        num_attention_heads=hf_meta.get("num_attention_heads"),
        vocab_size=hf_meta.get("vocab_size"),
        has_lm_head=has_lm_head,
        output_head_name=unembed_name,
        unembedding_name=unembed_name,
        modules=module_infos,
        layer_names=layer_names,
    )
|