PyPI - sae-lens - Versions diffs - 6.28.1__tar.gz → 6.29.0__tar.gz - Mend

sae-lens 6.28.1__tar.gz → 6.29.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

{sae_lens-6.28.1 → sae_lens-6.29.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sae-lens
-Version: 6.28.1
+Version: 6.29.0
 Summary: Training and Analyzing Sparse Autoencoders (SAEs)
 License: MIT
 License-File: LICENSE
@@ -50,6 +50,8 @@ SAELens exists to help researchers:
 - Analyse sparse autoencoders / research mechanistic interpretability.
 - Generate insights which make it easier to create safe and aligned AI systems.
+SAELens inference works with any PyTorch-based model, not just TransformerLens. While we provide deep integration with TransformerLens via `HookedSAETransformer`, SAEs can be used with Hugging Face Transformers, NNsight, or any other framework by extracting activations and passing them to the SAE's `encode()` and `decode()` methods.
 Please refer to the [documentation](https://decoderesearch.github.io/SAELens/) for information on how to:
 - Download and Analyse pre-trained sparse autoencoders.
@@ -84,6 +86,14 @@ The new v6 update is a major refactor to SAELens and changes the way training co
 Feel free to join the [Open Source Mechanistic Interpretability Slack](https://join.slack.com/t/opensourcemechanistic/shared_invite/zt-375zalm04-GFd5tdBU1yLKlu_T_JSqZQ) for support!
+## Other SAE Projects
+- [dictionary-learning](https://github.com/saprmarks/dictionary_learning): An SAE training library that focuses on having hackable code.
+- [Sparsify](https://github.com/EleutherAI/sparsify): A lean SAE training library focused on TopK SAEs.
+- [Overcomplete](https://github.com/KempnerInstitute/overcomplete): SAE training library focused on vision models.
+- [SAE-Vis](https://github.com/callummcdougall/sae_vis): A library for visualizing SAE features, works with SAELens.
+- [SAEBench](https://github.com/adamkarvonen/SAEBench): A suite of LLM SAE benchmarks, works with SAELens.
 ## Citation
 Please cite the package as follows:

{sae_lens-6.28.1 → sae_lens-6.29.0}/README.md RENAMED Viewed

@@ -14,6 +14,8 @@ SAELens exists to help researchers:
 - Analyse sparse autoencoders / research mechanistic interpretability.
 - Generate insights which make it easier to create safe and aligned AI systems.
+SAELens inference works with any PyTorch-based model, not just TransformerLens. While we provide deep integration with TransformerLens via `HookedSAETransformer`, SAEs can be used with Hugging Face Transformers, NNsight, or any other framework by extracting activations and passing them to the SAE's `encode()` and `decode()` methods.
 Please refer to the [documentation](https://decoderesearch.github.io/SAELens/) for information on how to:
 - Download and Analyse pre-trained sparse autoencoders.
@@ -48,6 +50,14 @@ The new v6 update is a major refactor to SAELens and changes the way training co
 Feel free to join the [Open Source Mechanistic Interpretability Slack](https://join.slack.com/t/opensourcemechanistic/shared_invite/zt-375zalm04-GFd5tdBU1yLKlu_T_JSqZQ) for support!
+## Other SAE Projects
+- [dictionary-learning](https://github.com/saprmarks/dictionary_learning): An SAE training library that focuses on having hackable code.
+- [Sparsify](https://github.com/EleutherAI/sparsify): A lean SAE training library focused on TopK SAEs.
+- [Overcomplete](https://github.com/KempnerInstitute/overcomplete): SAE training library focused on vision models.
+- [SAE-Vis](https://github.com/callummcdougall/sae_vis): A library for visualizing SAE features, works with SAELens.
+- [SAEBench](https://github.com/adamkarvonen/SAEBench): A suite of LLM SAE benchmarks, works with SAELens.
 ## Citation
 Please cite the package as follows:

{sae_lens-6.28.1 → sae_lens-6.29.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sae-lens"
-version = "6.28.1"
+version = "6.29.0"
 description = "Training and Analyzing Sparse Autoencoders (SAEs)"
 authors = ["Joseph Bloom"]
 readme = "README.md"

{sae_lens-6.28.1 → sae_lens-6.29.0}/sae_lens/__init__.py RENAMED Viewed

@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.28.1"
+__version__ = "6.29.0"
 import logging

{sae_lens-6.28.1 → sae_lens-6.29.0}/sae_lens/pretrained_saes.yaml RENAMED Viewed

@@ -40631,7 +40631,7 @@ gemma-3-1b-res-matryoshka-dc:
   conversion_func: null
   links:
     model: https://huggingface.co/google/gemma-3-1b-pt
-  model: gemma-3-1b
+  model: google/gemma-3-1b-pt
   repo_id: chanind/gemma-3-1b-batch-topk-matryoshka-saes-w-32k-l0-40
   saes:
   - id: blocks.0.hook_resid_post

{sae_lens-6.28.1 → sae_lens-6.29.0}/sae_lens/synthetic/__init__.py RENAMED Viewed

@@ -17,11 +17,14 @@ from sae_lens.synthetic.activation_generator import (
     ActivationGenerator,
     ActivationsModifier,
     ActivationsModifierInput,
+    CorrelationMatrixInput,
 )
 from sae_lens.synthetic.correlation import (
+    LowRankCorrelationMatrix,
     create_correlation_matrix_from_correlations,
     generate_random_correlation_matrix,
     generate_random_correlations,
+    generate_random_low_rank_correlation_matrix,
 )
 from sae_lens.synthetic.evals import (
     SyntheticDataEvalResult,
@@ -66,6 +69,9 @@ __all__ = [
     "create_correlation_matrix_from_correlations",
     "generate_random_correlations",
     "generate_random_correlation_matrix",
+    "generate_random_low_rank_correlation_matrix",
+    "LowRankCorrelationMatrix",
+    "CorrelationMatrixInput",
     # Feature modifiers
     "ActivationsModifier",
     "ActivationsModifierInput",

{sae_lens-6.28.1 → sae_lens-6.29.0}/sae_lens/synthetic/activation_generator.py RENAMED Viewed

@@ -7,12 +7,16 @@ from collections.abc import Callable, Sequence
 import torch
 from scipy.stats import norm
 from torch import nn
-from torch.distributions import MultivariateNormal
+from torch.distributions import LowRankMultivariateNormal, MultivariateNormal
+from sae_lens.synthetic.correlation import LowRankCorrelationMatrix
 from sae_lens.util import str_to_dtype
 ActivationsModifier = Callable[[torch.Tensor], torch.Tensor]
 ActivationsModifierInput = ActivationsModifier | Sequence[ActivationsModifier] | None
+CorrelationMatrixInput = (
+    torch.Tensor | LowRankCorrelationMatrix | tuple[torch.Tensor, torch.Tensor]
+)
 class ActivationGenerator(nn.Module):
@@ -28,6 +32,7 @@ class ActivationGenerator(nn.Module):
     mean_firing_magnitudes: torch.Tensor
     modify_activations: ActivationsModifier | None
     correlation_matrix: torch.Tensor | None
+    low_rank_correlation: tuple[torch.Tensor, torch.Tensor] | None
     correlation_thresholds: torch.Tensor | None
     def __init__(
@@ -37,7 +42,7 @@ class ActivationGenerator(nn.Module):
         std_firing_magnitudes: torch.Tensor | float = 0.0,
         mean_firing_magnitudes: torch.Tensor | float = 1.0,
         modify_activations: ActivationsModifierInput = None,
-        correlation_matrix: torch.Tensor | None = None,
+        correlation_matrix: CorrelationMatrixInput | None = None,
         device: torch.device | str = "cpu",
         dtype: torch.dtype | str = "float32",
     ):
@@ -54,15 +59,32 @@ class ActivationGenerator(nn.Module):
         )
         self.modify_activations = _normalize_modifiers(modify_activations)
         self.correlation_thresholds = None
+        self.correlation_matrix = None
+        self.low_rank_correlation = None
         if correlation_matrix is not None:
-            _validate_correlation_matrix(correlation_matrix, num_features)
+            if isinstance(correlation_matrix, torch.Tensor):
+                # Full correlation matrix
+                _validate_correlation_matrix(correlation_matrix, num_features)
+                self.correlation_matrix = correlation_matrix
+            else:
+                # Low-rank correlation (tuple or LowRankCorrelationMatrix)
+                correlation_factor, correlation_diag = (
+                    correlation_matrix[0],
+                    correlation_matrix[1],
+                )
+                _validate_low_rank_correlation(
+                    correlation_factor, correlation_diag, num_features
+                )
+                self.low_rank_correlation = (correlation_factor, correlation_diag)
             self.correlation_thresholds = torch.tensor(
                 [norm.ppf(1 - p.item()) for p in self.firing_probabilities],
                 device=device,
                 dtype=self.firing_probabilities.dtype,
             )
-        self.correlation_matrix = correlation_matrix
+    @torch.no_grad()
     def sample(self, batch_size: int) -> torch.Tensor:
         """
         Generate a batch of feature activations with controlled properties.
@@ -89,6 +111,15 @@ class ActivationGenerator(nn.Module):
                 self.correlation_thresholds,
                 device,
             )
+        elif self.low_rank_correlation is not None:
+            assert self.correlation_thresholds is not None
+            firing_features = _generate_low_rank_correlated_features(
+                batch_size,
+                self.low_rank_correlation[0],
+                self.low_rank_correlation[1],
+                self.correlation_thresholds,
+                device,
+            )
         else:
             firing_features = torch.bernoulli(
                 self.firing_probabilities.unsqueeze(0).expand(batch_size, -1)
@@ -132,7 +163,7 @@ def _generate_correlated_features(
         device: Device to generate samples on
     Returns:
-        Binary feature matrix of shape [batch_size, num_features]
+        Binary feature matrix of shape (batch_size, num_features)
     """
     num_features = correlation_matrix.shape[0]
@@ -145,6 +176,41 @@ def _generate_correlated_features(
     return (gaussian_samples > thresholds.unsqueeze(0)).float()
+def _generate_low_rank_correlated_features(
+    batch_size: int,
+    correlation_factor: torch.Tensor,
+    correlation_diag: torch.Tensor,
+    thresholds: torch.Tensor,
+    device: torch.device,
+) -> torch.Tensor:
+    """
+    Generate correlated binary features using low-rank multivariate Gaussian sampling.
+    Uses the Gaussian copula approach with a low-rank covariance structure for scalability.
+    The covariance is represented as: cov = factor @ factor.T + diag(diag_term)
+    Args:
+        batch_size: Number of samples to generate
+        correlation_factor: Factor matrix of shape (num_features, rank)
+        correlation_diag: Diagonal term of shape (num_features,)
+        thresholds: Pre-computed thresholds for each feature (from inverse normal CDF)
+        device: Device to generate samples on
+    Returns:
+        Binary feature matrix of shape (batch_size, num_features)
+    """
+    mvn = LowRankMultivariateNormal(
+        loc=torch.zeros(
+            correlation_factor.shape[0], device=device, dtype=thresholds.dtype
+        ),
+        cov_factor=correlation_factor.to(device=device, dtype=thresholds.dtype),
+        cov_diag=correlation_diag.to(device=device, dtype=thresholds.dtype),
+    )
+    gaussian_samples = mvn.sample((batch_size,))
+    return (gaussian_samples > thresholds.unsqueeze(0)).float()
 def _to_tensor(
     value: torch.Tensor | float,
     num_features: int,
@@ -193,7 +259,7 @@ def _validate_correlation_matrix(
     Args:
         correlation_matrix: The matrix to validate
-        num_features: Expected number of features (matrix should be [num_features, num_features])
+        num_features: Expected number of features (matrix should be (num_features, num_features))
     Raises:
         ValueError: If the matrix has incorrect shape, non-unit diagonal, or is not positive definite
@@ -213,3 +279,36 @@ def _validate_correlation_matrix(
         torch.linalg.cholesky(correlation_matrix)
     except RuntimeError as e:
         raise ValueError("Correlation matrix must be positive definite") from e
+def _validate_low_rank_correlation(
+    correlation_factor: torch.Tensor,
+    correlation_diag: torch.Tensor,
+    num_features: int,
+) -> None:
+    """Validate that low-rank correlation parameters have correct properties.
+    Args:
+        correlation_factor: Factor matrix of shape (num_features, rank)
+        correlation_diag: Diagonal term of shape (num_features,)
+        num_features: Expected number of features
+    Raises:
+        ValueError: If shapes are incorrect or diagonal terms are not positive
+    """
+    if correlation_factor.ndim != 2:
+        raise ValueError(
+            f"correlation_factor must be 2D, got {correlation_factor.ndim}D"
+        )
+    if correlation_factor.shape[0] != num_features:
+        raise ValueError(
+            f"correlation_factor must have shape ({num_features}, rank), "
+            f"got {tuple(correlation_factor.shape)}"
+        )
+    if correlation_diag.shape != (num_features,):
+        raise ValueError(
+            f"correlation_diag must have shape ({num_features},), "
+            f"got {tuple(correlation_diag.shape)}"
+        )
+    if torch.any(correlation_diag <= 0):
+        raise ValueError("correlation_diag must have all positive values")

sae_lens-6.29.0/sae_lens/synthetic/correlation.py ADDED Viewed

@@ -0,0 +1,351 @@
+import random
+from typing import NamedTuple
+import torch
+from sae_lens.util import str_to_dtype
+class LowRankCorrelationMatrix(NamedTuple):
+    """
+    Low-rank representation of a correlation matrix for scalable correlated sampling.
+    The correlation structure is represented as:
+        correlation = correlation_factor @ correlation_factor.T + diag(correlation_diag)
+    This requires O(num_features * rank) storage instead of O(num_features^2),
+    making it suitable for very large numbers of features (e.g., 1M+).
+    Attributes:
+        correlation_factor: Factor matrix of shape (num_features, rank) that captures
+            correlations through shared latent factors.
+        correlation_diag: Diagonal variance term of shape (num_features,). Should be
+            chosen such that the diagonal of the full correlation matrix equals 1.
+            Typically: correlation_diag[i] = 1 - sum(correlation_factor[i, :]^2)
+    """
+    correlation_factor: torch.Tensor
+    correlation_diag: torch.Tensor
+def create_correlation_matrix_from_correlations(
+    num_features: int,
+    correlations: dict[tuple[int, int], float] | None = None,
+    default_correlation: float = 0.0,
+) -> torch.Tensor:
+    """
+    Create a correlation matrix with specified pairwise correlations.
+    Note: If the resulting matrix is not positive definite, it will be adjusted
+    to ensure validity. This adjustment may change the specified correlation
+    values. To minimize this effect, use smaller correlation magnitudes.
+    Args:
+        num_features: Number of features
+        correlations: Dict mapping (i, j) pairs to correlation values.
+            Pairs should have i < j. Pairs not specified will use default_correlation.
+        default_correlation: Default correlation for unspecified pairs
+    Returns:
+        Correlation matrix of shape (num_features, num_features)
+    """
+    matrix = torch.eye(num_features) + default_correlation * (
+        1 - torch.eye(num_features)
+    )
+    if correlations is not None:
+        for (i, j), corr in correlations.items():
+            matrix[i, j] = corr
+            matrix[j, i] = corr
+    # Ensure matrix is symmetric (numerical precision)
+    matrix = (matrix + matrix.T) / 2
+    # Check positive definiteness and fix if necessary
+    # Use eigvalsh for symmetric matrices (returns real eigenvalues)
+    eigenvals = torch.linalg.eigvalsh(matrix)
+    if torch.any(eigenvals < -1e-6):
+        matrix = _fix_correlation_matrix(matrix)
+    return matrix
+def _fix_correlation_matrix(
+    matrix: torch.Tensor, min_eigenval: float = 1e-6
+) -> torch.Tensor:
+    """Fix a correlation matrix to be positive semi-definite."""
+    eigenvals, eigenvecs = torch.linalg.eigh(matrix)
+    eigenvals = torch.clamp(eigenvals, min=min_eigenval)
+    fixed_matrix = eigenvecs @ torch.diag(eigenvals) @ eigenvecs.T
+    diag_vals = torch.diag(fixed_matrix)
+    diag_vals = torch.clamp(diag_vals, min=1e-8)  # Prevent division by zero
+    fixed_matrix = fixed_matrix / torch.sqrt(
+        diag_vals.unsqueeze(0) * diag_vals.unsqueeze(1)
+    )
+    fixed_matrix.fill_diagonal_(1.0)
+    return fixed_matrix
+def _validate_correlation_params(
+    positive_ratio: float,
+    uncorrelated_ratio: float,
+    min_correlation_strength: float,
+    max_correlation_strength: float,
+) -> None:
+    """Validate parameters for correlation generation."""
+    if not 0.0 <= positive_ratio <= 1.0:
+        raise ValueError("positive_ratio must be between 0.0 and 1.0")
+    if not 0.0 <= uncorrelated_ratio <= 1.0:
+        raise ValueError("uncorrelated_ratio must be between 0.0 and 1.0")
+    if min_correlation_strength < 0:
+        raise ValueError("min_correlation_strength must be non-negative")
+    if max_correlation_strength > 1.0:
+        raise ValueError("max_correlation_strength must be <= 1.0")
+    if min_correlation_strength > max_correlation_strength:
+        raise ValueError("min_correlation_strength must be <= max_correlation_strength")
+def generate_random_correlations(
+    num_features: int,
+    positive_ratio: float = 0.5,
+    uncorrelated_ratio: float = 0.3,
+    min_correlation_strength: float = 0.1,
+    max_correlation_strength: float = 0.8,
+    seed: int | None = None,
+) -> dict[tuple[int, int], float]:
+    """
+    Generate random correlations between features with specified constraints.
+    Args:
+        num_features: Number of features
+        positive_ratio: Fraction of correlated pairs that should be positive (0.0 to 1.0)
+        uncorrelated_ratio: Fraction of feature pairs that should have zero correlation
+            (0.0 to 1.0). These pairs are omitted from the returned dictionary.
+        min_correlation_strength: Minimum absolute correlation strength for correlated pairs
+        max_correlation_strength: Maximum absolute correlation strength for correlated pairs
+        seed: Random seed for reproducibility
+    Returns:
+        Dictionary mapping (i, j) pairs to correlation values. Pairs with zero
+        correlation (determined by uncorrelated_ratio) are not included.
+    """
+    # Use local random number generator to avoid side effects on global state
+    rng = random.Random(seed)
+    _validate_correlation_params(
+        positive_ratio,
+        uncorrelated_ratio,
+        min_correlation_strength,
+        max_correlation_strength,
+    )
+    # Generate all possible feature pairs (i, j) where i < j
+    all_pairs = [
+        (i, j) for i in range(num_features) for j in range(i + 1, num_features)
+    ]
+    total_pairs = len(all_pairs)
+    if total_pairs == 0:
+        return {}
+    # Determine how many pairs to correlate vs leave uncorrelated
+    num_uncorrelated = int(total_pairs * uncorrelated_ratio)
+    num_correlated = total_pairs - num_uncorrelated
+    # Randomly select which pairs to correlate
+    correlated_pairs = rng.sample(all_pairs, num_correlated)
+    # For correlated pairs, determine positive vs negative
+    num_positive = int(num_correlated * positive_ratio)
+    num_negative = num_correlated - num_positive
+    # Assign signs
+    signs = [1] * num_positive + [-1] * num_negative
+    rng.shuffle(signs)
+    # Generate correlation strengths
+    correlations = {}
+    for pair, sign in zip(correlated_pairs, signs):
+        # Sample correlation strength uniformly from range
+        strength = rng.uniform(min_correlation_strength, max_correlation_strength)
+        correlations[pair] = sign * strength
+    return correlations
+def generate_random_correlation_matrix(
+    num_features: int,
+    positive_ratio: float = 0.5,
+    uncorrelated_ratio: float = 0.3,
+    min_correlation_strength: float = 0.1,
+    max_correlation_strength: float = 0.8,
+    seed: int | None = None,
+    device: torch.device | str = "cpu",
+    dtype: torch.dtype | str = torch.float32,
+) -> torch.Tensor:
+    """
+    Generate a random correlation matrix with specified constraints.
+    Uses vectorized torch operations for efficiency with large numbers of features.
+    Note: If the randomly generated matrix is not positive definite, it will be
+    adjusted to ensure validity. This adjustment may change correlation values,
+    including turning some zero correlations into non-zero values. To minimize
+    this effect, use smaller correlation strengths (e.g., 0.01-0.1).
+    Args:
+        num_features: Number of features
+        positive_ratio: Fraction of correlated pairs that should be positive (0.0 to 1.0)
+        uncorrelated_ratio: Fraction of feature pairs that should have zero correlation
+            (0.0 to 1.0). Note that matrix fixing for positive definiteness may reduce
+            the actual number of zero correlations.
+        min_correlation_strength: Minimum absolute correlation strength for correlated pairs
+        max_correlation_strength: Maximum absolute correlation strength for correlated pairs
+        seed: Random seed for reproducibility
+        device: Device to create the matrix on
+        dtype: Data type for the matrix
+    Returns:
+        Random correlation matrix of shape (num_features, num_features)
+    """
+    dtype = str_to_dtype(dtype)
+    _validate_correlation_params(
+        positive_ratio,
+        uncorrelated_ratio,
+        min_correlation_strength,
+        max_correlation_strength,
+    )
+    if num_features <= 1:
+        return torch.eye(num_features, device=device, dtype=dtype)
+    # Set random seed if provided
+    generator = torch.Generator(device=device)
+    if seed is not None:
+        generator.manual_seed(seed)
+    # Get upper triangular indices (i < j)
+    row_idx, col_idx = torch.triu_indices(num_features, num_features, offset=1)
+    num_pairs = row_idx.shape[0]
+    # Generate random values for all pairs at once
+    # is_correlated: 1 if this pair should have a correlation, 0 otherwise
+    is_correlated = (
+        torch.rand(num_pairs, generator=generator, device=device) >= uncorrelated_ratio
+    )
+    # signs: +1 for positive correlation, -1 for negative
+    is_positive = (
+        torch.rand(num_pairs, generator=generator, device=device) < positive_ratio
+    )
+    signs = torch.where(is_positive, 1.0, -1.0)
+    # strengths: uniform in [min_strength, max_strength]
+    strengths = (
+        torch.rand(num_pairs, generator=generator, device=device, dtype=dtype)
+        * (max_correlation_strength - min_correlation_strength)
+        + min_correlation_strength
+    )
+    # Combine: correlation = is_correlated * sign * strength
+    correlations = is_correlated.to(dtype) * signs.to(dtype) * strengths
+    # Build the symmetric matrix
+    matrix = torch.eye(num_features, device=device, dtype=dtype)
+    matrix[row_idx, col_idx] = correlations
+    matrix[col_idx, row_idx] = correlations
+    # Check positive definiteness and fix if necessary
+    eigenvals = torch.linalg.eigvalsh(matrix)
+    if torch.any(eigenvals < -1e-6):
+        matrix = _fix_correlation_matrix(matrix)
+    return matrix
+def generate_random_low_rank_correlation_matrix(
+    num_features: int,
+    rank: int,
+    correlation_scale: float = 0.1,
+    seed: int | None = None,
+    device: torch.device | str = "cpu",
+    dtype: torch.dtype | str = torch.float32,
+) -> LowRankCorrelationMatrix:
+    """
+    Generate a random low-rank correlation structure for scalable correlated sampling.
+    The correlation structure is represented as:
+        correlation = factor @ factor.T + diag(diag_term)
+    This requires O(num_features * rank) storage instead of O(num_features^2),
+    making it suitable for very large numbers of features (e.g., 1M+).
+    The factor matrix is initialized with random values scaled by correlation_scale,
+    and the diagonal term is computed to ensure the implied correlation matrix has
+    unit diagonal.
+    Args:
+        num_features: Number of features
+        rank: Rank of the low-rank approximation. Higher rank allows more complex
+            correlation structures but uses more memory. Typical values: 10-100.
+        correlation_scale: Scale factor for random correlations. Larger values produce
+            stronger correlations between features. Use 0 for no correlations (identity
+            matrix). Should be small enough that rank * correlation_scale^2 < 1 to
+            ensure valid diagonal terms.
+        seed: Random seed for reproducibility
+        device: Device to create tensors on
+        dtype: Data type for tensors
+    Returns:
+        LowRankCorrelationMatrix containing the factor matrix and diagonal term
+    """
+    # Minimum diagonal value to ensure numerical stability in the covariance matrix.
+    # This limits how much variance can come from the low-rank factor.
+    _MIN_DIAG = 0.01
+    dtype = str_to_dtype(dtype)
+    device = torch.device(device)
+    if rank <= 0:
+        raise ValueError("rank must be positive")
+    if correlation_scale < 0:
+        raise ValueError("correlation_scale must be non-negative")
+    # Set random seed if provided
+    generator = torch.Generator(device=device)
+    if seed is not None:
+        generator.manual_seed(seed)
+    # Generate random factor matrix
+    # Each row has norm roughly sqrt(rank) * correlation_scale
+    factor = (
+        torch.randn(num_features, rank, generator=generator, device=device, dtype=dtype)
+        * correlation_scale
+    )
+    # Compute diagonal term to ensure unit diagonal in implied correlation matrix
+    # diag(factor @ factor.T) + diag_term = 1
+    # diag_term = 1 - sum(factor[i, :]^2)
+    factor_sq_sum = (factor**2).sum(dim=1)
+    diag_term = 1 - factor_sq_sum
+    # Ensure diagonal terms are at least _MIN_DIAG for numerical stability
+    # If any diagonal term is too small, scale down the factor matrix
+    if torch.any(diag_term < _MIN_DIAG):
+        # Scale factor so max row norm squared is at most (1 - _MIN_DIAG)
+        # This ensures all diagonal terms are >= _MIN_DIAG
+        max_factor_contribution = 1 - _MIN_DIAG
+        max_sq_sum = factor_sq_sum.max()
+        scale = torch.sqrt(
+            torch.tensor(max_factor_contribution, device=device, dtype=dtype)
+            / max_sq_sum
+        )
+        factor = factor * scale
+        factor_sq_sum = (factor**2).sum(dim=1)
+        diag_term = 1 - factor_sq_sum
+    return LowRankCorrelationMatrix(
+        correlation_factor=factor, correlation_diag=diag_term
+    )

sae-lens 6.28.1__tar.gz → 6.29.0__tar.gz

sae-lens 6.28.1__tar.gz → 6.29.0__tar.gz