sae-lens 6.26.1__py3-none-any.whl → 6.28.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,141 @@
+ """
+ Utilities for training SAEs on synthetic data.
+
+ This module provides helpers for:
+
+ - Generating training data from feature dictionaries
+ - Training SAEs on synthetic data
+ - Evaluating SAEs against known ground truth features
+ - Initializing SAEs to match feature dictionaries
+ """
+
+ from dataclasses import dataclass
+
+ import torch
+ from scipy.optimize import linear_sum_assignment
+
+ from sae_lens.synthetic.activation_generator import ActivationGenerator
+ from sae_lens.synthetic.feature_dictionary import FeatureDictionary
+
+
+ def mean_correlation_coefficient(
+     features_a: torch.Tensor,
+     features_b: torch.Tensor,
+ ) -> float:
+     """
+     Compute the Mean Correlation Coefficient (MCC) between two sets of feature vectors.
+
+     MCC measures how well learned features align with ground truth features by finding
+     an optimal one-to-one matching using the Hungarian algorithm and computing the
+     mean absolute cosine similarity of matched pairs.
+
+     Reference: O'Neill et al., "Compute Optimal Inference and Provable Amortisation
+     Gap in Sparse Autoencoders" (arXiv:2411.13117)
+
+     Args:
+         features_a: Feature vectors of shape [num_features_a, hidden_dim]
+         features_b: Feature vectors of shape [num_features_b, hidden_dim]
+
+     Returns:
+         MCC score in the range [0, 1], where 1 indicates perfect alignment
+     """
+     # Normalize to unit vectors
+     a_norm = features_a / features_a.norm(dim=1, keepdim=True).clamp(min=1e-8)
+     b_norm = features_b / features_b.norm(dim=1, keepdim=True).clamp(min=1e-8)
+
+     # Compute absolute cosine similarity matrix
+     cos_sim = torch.abs(a_norm @ b_norm.T)
+
+     # Convert to a cost matrix for the Hungarian algorithm (which minimizes)
+     cost_matrix = 1 - cos_sim.cpu().numpy()
+
+     # Find optimal matching
+     row_ind, col_ind = linear_sum_assignment(cost_matrix)
+
+     # Compute mean of matched similarities
+     matched_similarities = cos_sim[row_ind, col_ind]
+     return matched_similarities.mean().item()
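Editorial note (not part of the diff): because the Hungarian matching pairs each learned vector with its best ground-truth counterpart and uses absolute cosine similarity, MCC is invariant to permutation and sign flips of the learned features. A minimal sketch, assuming the function is importable as shown; the module's filename is not visible in this diff:

import torch
from sae_lens.synthetic.evals import mean_correlation_coefficient  # assumed module path

gt = torch.randn(16, 64)                         # ground-truth feature directions
learned = -gt[torch.randperm(16)]                # permuted, sign-flipped copy of gt
print(mean_correlation_coefficient(learned, gt))              # ~1.0: perfect alignment
print(mean_correlation_coefficient(torch.randn(16, 64), gt))  # much lower for random vectors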
+
+
+ @dataclass
+ class SyntheticDataEvalResult:
+     """Results from evaluating an SAE on synthetic data."""
+
+     true_l0: float
+     """Average L0 of the true feature activations"""
+
+     sae_l0: float
+     """Average L0 of the SAE's latent activations"""
+
+     dead_latents: int
+     """Number of SAE latents that never fired"""
+
+     shrinkage: float
+     """Average ratio of SAE output norm to input norm (1.0 = no shrinkage)"""
+
+     mcc: float
+     """Mean Correlation Coefficient between SAE decoder and ground truth features"""
+
+
+ @torch.no_grad()
+ def eval_sae_on_synthetic_data(
+     sae: torch.nn.Module,
+     feature_dict: FeatureDictionary,
+     activations_generator: ActivationGenerator,
+     num_samples: int = 100_000,
+ ) -> SyntheticDataEvalResult:
+     """
+     Evaluate an SAE on synthetic data with known ground truth.
+
+     Args:
+         sae: The SAE to evaluate. Must have encode() and decode() methods and a W_dec attribute.
+         feature_dict: The feature dictionary used to generate activations
+         activations_generator: Generator that produces feature activations
+         num_samples: Number of samples to use for evaluation
+
+     Returns:
+         SyntheticDataEvalResult containing evaluation metrics
+     """
+     sae.eval()
+
+     # Generate samples
+     feature_acts = activations_generator.sample(num_samples)
+     true_l0 = (feature_acts > 0).float().sum(dim=-1).mean().item()
+     hidden_acts = feature_dict(feature_acts)
+
+     # Filter out entries where no features fire
+     non_zero_mask = hidden_acts.norm(dim=-1) > 0
+     hidden_acts_filtered = hidden_acts[non_zero_mask]
+
+     # Get SAE reconstructions
+     sae_latents = sae.encode(hidden_acts_filtered)  # type: ignore[attr-defined]
+     sae_output = sae.decode(sae_latents)  # type: ignore[attr-defined]
+
+     sae_l0 = (sae_latents > 0).float().sum(dim=-1).mean().item()
+     dead_latents = int(
+         ((sae_latents == 0).sum(dim=0) == sae_latents.shape[0]).sum().item()
+     )
+     if hidden_acts_filtered.shape[0] == 0:
+         shrinkage = 0.0
+     else:
+         shrinkage = (
+             (
+                 sae_output.norm(dim=-1)
+                 / hidden_acts_filtered.norm(dim=-1).clamp(min=1e-8)
+             )
+             .mean()
+             .item()
+         )
+
+     # Compute MCC between SAE decoder and ground truth features
+     sae_decoder: torch.Tensor = sae.W_dec  # type: ignore[attr-defined]
+     gt_features = feature_dict.feature_vectors
+     mcc = mean_correlation_coefficient(sae_decoder, gt_features)
+
+     return SyntheticDataEvalResult(
+         true_l0=true_l0,
+         sae_l0=sae_l0,
+         dead_latents=dead_latents,
+         shrinkage=shrinkage,
+         mcc=mcc,
+     )
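The evaluator above relies only on a small interface: the SAE must expose encode(), decode(), and a W_dec weight, and the generator must expose sample(n). A minimal, illustrative sketch with stand-in objects; IdentitySAE and BernoulliGenerator are hypothetical helpers written for this note, and the evals import path is an assumption, since the diff does not show the filename:

import torch
from torch import nn
from sae_lens.synthetic.feature_dictionary import FeatureDictionary
from sae_lens.synthetic.evals import eval_sae_on_synthetic_data  # assumed module path

class IdentitySAE(nn.Module):
    """Toy stand-in, not a real SAE: latents are the inputs, decoding is exact."""
    def __init__(self, hidden_dim: int):
        super().__init__()
        self.W_dec = nn.Parameter(torch.eye(hidden_dim))
    def encode(self, x: torch.Tensor) -> torch.Tensor:
        return x
    def decode(self, latents: torch.Tensor) -> torch.Tensor:
        return latents @ self.W_dec

class BernoulliGenerator:
    """Toy stand-in for ActivationGenerator: each feature fires independently with p=0.05."""
    def __init__(self, num_features: int):
        self.num_features = num_features
    def sample(self, n: int) -> torch.Tensor:
        return (torch.rand(n, self.num_features) < 0.05).float()

feature_dict = FeatureDictionary(num_features=128, hidden_dim=64)
result = eval_sae_on_synthetic_data(
    sae=IdentitySAE(hidden_dim=64),
    feature_dict=feature_dict,
    activations_generator=BernoulliGenerator(num_features=128),  # stand-in, not the real class
    num_samples=10_000,
)
print(result.sae_l0, result.dead_latents, result.shrinkage, result.mcc)

With an exact-reconstruction stand-in, shrinkage should come out near 1.0; the MCC here compares the stand-in's identity W_dec against the dictionary's feature vectors, so it will be well below 1.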
@@ -0,0 +1,138 @@
+ """
+ Feature dictionary for generating synthetic activations.
+
+ A FeatureDictionary maps feature activations (sparse coefficients) to dense hidden activations
+ by multiplying with a learned or constructed feature embedding matrix.
+ """
+
+ from typing import Callable
+
+ import torch
+ from torch import nn
+ from tqdm import tqdm
+
+ FeatureDictionaryInitializer = Callable[["FeatureDictionary"], None]
+
+
+ def orthogonalize_embeddings(
+     embeddings: torch.Tensor,
+     target_cos_sim: float = 0,
+     num_steps: int = 200,
+     lr: float = 0.01,
+     show_progress: bool = False,
+ ) -> torch.Tensor:
+     """Iteratively adjust `embeddings` with Adam so that pairwise dot products move
+     toward `target_cos_sim` while each vector is softly constrained to unit norm.
+     Returns a new detached tensor; the input is left unchanged."""
+     num_vectors = embeddings.shape[0]
+     # Create a detached copy and normalize, then enable gradients
+     embeddings = embeddings.detach().clone()
+     embeddings = embeddings / embeddings.norm(p=2, dim=1, keepdim=True).clamp(min=1e-8)
+     embeddings = embeddings.requires_grad_(True)
+
+     optimizer = torch.optim.Adam([embeddings], lr=lr)  # type: ignore[list-item]
+
+     # Create a mask to zero out diagonal elements (avoid in-place operations)
+     off_diagonal_mask = ~torch.eye(
+         num_vectors, dtype=torch.bool, device=embeddings.device
+     )
+
+     pbar = tqdm(
+         range(num_steps), desc="Orthogonalizing vectors", disable=not show_progress
+     )
+     for _ in pbar:
+         optimizer.zero_grad()
+
+         dot_products = embeddings @ embeddings.T
+         diff = dot_products - target_cos_sim
+         # Use masking instead of in-place fill_diagonal_
+         off_diagonal_diff = diff * off_diagonal_mask.float()
+         loss = off_diagonal_diff.pow(2).sum()
+         # Penalize diagonal entries drifting away from 1 to keep vectors near unit norm
+         loss = loss + num_vectors * (dot_products.diag() - 1).pow(2).sum()
+
+         loss.backward()
+         optimizer.step()
+         pbar.set_description(f"loss: {loss.item():.3f}")
+
+     with torch.no_grad():
+         embeddings = embeddings / embeddings.norm(p=2, dim=1, keepdim=True).clamp(
+             min=1e-8
+         )
+     return embeddings.detach().clone()
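An illustrative check of the helper above (a sketch, assuming orthogonalize_embeddings is in scope): it pushes pairwise similarities toward the target while re-normalizing the rows, so the maximum off-diagonal cosine similarity should drop noticeably.

import torch

vecs = torch.randn(50, 32)
vecs = vecs / vecs.norm(dim=1, keepdim=True)

def max_offdiag_cos(x: torch.Tensor) -> float:
    sims = x @ x.T
    sims.fill_diagonal_(0)
    return sims.abs().max().item()

print("before:", max_offdiag_cos(vecs))
ortho = orthogonalize_embeddings(vecs, target_cos_sim=0.0, num_steps=200)
# 50 vectors in 32 dimensions cannot be exactly orthogonal, but the maximum
# off-diagonal similarity should be noticeably smaller after optimization.
print("after:", max_offdiag_cos(ortho))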
+
+
+ def orthogonal_initializer(
+     num_steps: int = 200, lr: float = 0.01, show_progress: bool = False
+ ) -> FeatureDictionaryInitializer:
+     def initializer(feature_dict: "FeatureDictionary") -> None:
+         feature_dict.feature_vectors.data = orthogonalize_embeddings(
+             feature_dict.feature_vectors,
+             num_steps=num_steps,
+             lr=lr,
+             show_progress=show_progress,
+         )
+
+     return initializer
+
+
+ class FeatureDictionary(nn.Module):
+     """
+     A feature dictionary that maps sparse feature activations to dense hidden activations.
+
+     This class creates a set of feature vectors (the "dictionary") and provides methods
+     to generate hidden activations from feature activations via a linear transformation.
+
+     The feature vectors can be configured to have a specific pairwise cosine similarity,
+     which is useful for controlling the difficulty of sparse recovery.
+
+     Attributes:
+         feature_vectors: Parameter of shape [num_features, hidden_dim] containing the
+             feature embedding vectors
+         bias: Parameter of shape [hidden_dim] containing the bias term (zeros if bias=False)
+     """
+
+     feature_vectors: nn.Parameter
+     bias: nn.Parameter
+
+     def __init__(
+         self,
+         num_features: int,
+         hidden_dim: int,
+         bias: bool = False,
+         initializer: FeatureDictionaryInitializer | None = orthogonal_initializer(),
+     ):
+         """
+         Create a new FeatureDictionary.
+
+         Args:
+             num_features: Number of features in the dictionary
+             hidden_dim: Dimensionality of the hidden space
+             bias: Whether to include a bias term in the embedding
+             initializer: Initializer function applied to the new dictionary. Defaults to
+                 orthogonal_initializer(), which orthogonalizes the embeddings; if None,
+                 the embeddings are left as random unit vectors.
+         """
+         super().__init__()
+         self.num_features = num_features
+         self.hidden_dim = hidden_dim
+
+         # Initialize feature vectors as unit vectors
+         embeddings = torch.randn(num_features, hidden_dim)
+         embeddings = embeddings / embeddings.norm(p=2, dim=1, keepdim=True).clamp(
+             min=1e-8
+         )
+         self.feature_vectors = nn.Parameter(embeddings)
+
+         # Initialize bias (zeros if not using bias, but still a parameter for a consistent API)
+         self.bias = nn.Parameter(torch.zeros(hidden_dim), requires_grad=bias)
+
+         if initializer is not None:
+             initializer(self)
+
+     def forward(self, feature_activations: torch.Tensor) -> torch.Tensor:
+         """
+         Convert feature activations to hidden activations.
+
+         Args:
+             feature_activations: Tensor of shape [batch, num_features] containing
+                 sparse feature activation values
+
+         Returns:
+             Tensor of shape [batch, hidden_dim] containing dense hidden activations
+         """
+         return feature_activations @ self.feature_vectors + self.bias
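A short usage sketch for the class above (illustrative; the import path matches the one used by the evals file earlier in this diff):

import torch
from sae_lens.synthetic.feature_dictionary import FeatureDictionary, orthogonal_initializer

# Default construction drives pairwise similarities of the 512 feature vectors toward 0.
feature_dict = FeatureDictionary(num_features=512, hidden_dim=128)

# A sparse batch: roughly 2% of features active per sample, with positive magnitudes.
feature_acts = torch.rand(8, 512) * (torch.rand(8, 512) < 0.02).float()
hidden_acts = feature_dict(feature_acts)
print(hidden_acts.shape)  # torch.Size([8, 128])

# Skip orthogonalization entirely, or tune its optimization settings.
plain_dict = FeatureDictionary(512, 128, initializer=None)
tuned_dict = FeatureDictionary(
    512, 128, initializer=orthogonal_initializer(num_steps=500, lr=0.005)
)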
@@ -0,0 +1,104 @@
+ """
+ Helper functions for generating firing probability distributions.
+ """
+
+ import torch
+
+
+ def zipfian_firing_probabilities(
+     num_features: int,
+     exponent: float = 1.0,
+     max_prob: float = 0.3,
+     min_prob: float = 0.01,
+ ) -> torch.Tensor:
+     """
+     Generate firing probabilities following a Zipfian (power-law) distribution.
+
+     Creates probabilities where a few features fire frequently and most fire rarely,
+     which mirrors the distribution often observed in real neural network features.
+
+     Args:
+         num_features: Number of features to generate probabilities for
+         exponent: Zipf exponent (higher = steeper dropoff). Default 1.0.
+         max_prob: Maximum firing probability (for the most frequent feature)
+         min_prob: Minimum firing probability (for the least frequent feature)
+
+     Returns:
+         Tensor of shape [num_features] with firing probabilities in descending order
+     """
+     if num_features < 1:
+         raise ValueError("num_features must be at least 1")
+     if exponent <= 0:
+         raise ValueError("exponent must be positive")
+     if not 0 < min_prob < max_prob <= 1:
+         raise ValueError("Must have 0 < min_prob < max_prob <= 1")
+
+     ranks = torch.arange(1, num_features + 1, dtype=torch.float32)
+     probs = 1.0 / ranks**exponent
+
+     # Scale to [min_prob, max_prob]
+     if num_features == 1:
+         return torch.tensor([max_prob])
+
+     probs_min, probs_max = probs.min(), probs.max()
+     return min_prob + (max_prob - min_prob) * (probs - probs_min) / (
+         probs_max - probs_min
+     )
+
+
+ def linear_firing_probabilities(
+     num_features: int,
+     max_prob: float = 0.3,
+     min_prob: float = 0.01,
+ ) -> torch.Tensor:
+     """
+     Generate firing probabilities that decay linearly from max to min.
+
+     Args:
+         num_features: Number of features to generate probabilities for
+         max_prob: Firing probability for the first feature
+         min_prob: Firing probability for the last feature
+
+     Returns:
+         Tensor of shape [num_features] with linearly decaying probabilities
+     """
+     if num_features < 1:
+         raise ValueError("num_features must be at least 1")
+     if not 0 < min_prob <= max_prob <= 1:
+         raise ValueError("Must have 0 < min_prob <= max_prob <= 1")
+
+     if num_features == 1:
+         return torch.tensor([max_prob])
+
+     return torch.linspace(max_prob, min_prob, num_features)
+
+
+ def random_firing_probabilities(
+     num_features: int,
+     max_prob: float = 0.5,
+     min_prob: float = 0.01,
+     seed: int | None = None,
+ ) -> torch.Tensor:
+     """
+     Generate random firing probabilities uniformly sampled from a range.
+
+     Args:
+         num_features: Number of features to generate probabilities for
+         max_prob: Maximum firing probability
+         min_prob: Minimum firing probability
+         seed: Optional random seed for reproducibility
+
+     Returns:
+         Tensor of shape [num_features] with random firing probabilities
+     """
+     if num_features < 1:
+         raise ValueError("num_features must be at least 1")
+     if not 0 < min_prob < max_prob <= 1:
+         raise ValueError("Must have 0 < min_prob < max_prob <= 1")
+
+     generator = torch.Generator()
+     if seed is not None:
+         generator.manual_seed(seed)
+
+     probs = torch.rand(num_features, generator=generator, dtype=torch.float32)
+     return min_prob + (max_prob - min_prob) * probs
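The three helpers above all return a [num_features] tensor of per-feature firing probabilities; they differ only in how probability mass is spread across features. A brief comparison (illustrative, assuming the functions above are in scope):

zipf = zipfian_firing_probabilities(1000, exponent=1.0, max_prob=0.3, min_prob=0.01)
lin = linear_firing_probabilities(1000, max_prob=0.3, min_prob=0.01)
rand = random_firing_probabilities(1000, max_prob=0.5, min_prob=0.01, seed=0)

print(zipf[:4])   # steep drop-off: only the first few features fire often
print(lin[:4])    # even decay from 0.3 down toward 0.01
print(rand[:4])   # unordered uniform draws in [0.01, 0.5]
# Summing the probabilities gives the expected L0 if features fired independently.
print(zipf.sum().item(), lin.sum().item(), rand.sum().item())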