sae-lens 6.26.1__tar.gz → 6.28.0__tar.gz
This diff shows the changes between the two package versions as they appear in the public registry.
- {sae_lens-6.26.1 → sae_lens-6.28.0}/PKG-INFO +3 -1
- {sae_lens-6.26.1 → sae_lens-6.28.0}/README.md +2 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/pyproject.toml +2 -1
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/__init__.py +3 -1
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/cache_activations_runner.py +12 -5
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/config.py +2 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/loading/pretrained_sae_loaders.py +2 -1
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/gated_sae.py +1 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/jumprelu_sae.py +3 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/standard_sae.py +2 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/temporal_sae.py +1 -0
- sae_lens-6.28.0/sae_lens/synthetic/__init__.py +89 -0
- sae_lens-6.28.0/sae_lens/synthetic/activation_generator.py +215 -0
- sae_lens-6.28.0/sae_lens/synthetic/correlation.py +170 -0
- sae_lens-6.28.0/sae_lens/synthetic/evals.py +141 -0
- sae_lens-6.28.0/sae_lens/synthetic/feature_dictionary.py +138 -0
- sae_lens-6.28.0/sae_lens/synthetic/firing_probabilities.py +104 -0
- sae_lens-6.28.0/sae_lens/synthetic/hierarchy.py +335 -0
- sae_lens-6.28.0/sae_lens/synthetic/initialization.py +40 -0
- sae_lens-6.28.0/sae_lens/synthetic/plotting.py +230 -0
- sae_lens-6.28.0/sae_lens/synthetic/training.py +145 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/training/activations_store.py +51 -91
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/training/mixing_buffer.py +14 -5
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/training/sae_trainer.py +1 -1
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/util.py +26 -1
- {sae_lens-6.26.1 → sae_lens-6.28.0}/LICENSE +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/analysis/__init__.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/analysis/hooked_sae_transformer.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/analysis/neuronpedia_integration.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/constants.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/evals.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/llm_sae_training_runner.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/load_model.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/loading/__init__.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/loading/pretrained_saes_directory.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/pretokenize_runner.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/pretrained_saes.yaml +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/registry.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/__init__.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/batchtopk_sae.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/matching_pursuit_sae.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/matryoshka_batchtopk_sae.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/sae.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/topk_sae.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/transcoder.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/tokenization_and_batching.py +1 -1
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/training/__init__.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/training/activation_scaler.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/training/optim.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/training/types.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/training/upload_saes_to_huggingface.py +0 -0
- {sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/tutorial/tsea.py +0 -0

{sae_lens-6.26.1 → sae_lens-6.28.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sae-lens
-Version: 6.26.1
+Version: 6.28.0
 Summary: Training and Analyzing Sparse Autoencoders (SAEs)
 License: MIT
 License-File: LICENSE
@@ -77,6 +77,8 @@ The new v6 update is a major refactor to SAELens and changes the way training co
   [](https://githubtocolab.com/decoderesearch/SAELens/blob/main/tutorials/logits_lens_with_features.ipynb)
 - [Training a Sparse Autoencoder](tutorials/training_a_sparse_autoencoder.ipynb)
   [](https://githubtocolab.com/decoderesearch/SAELens/blob/main/tutorials/training_a_sparse_autoencoder.ipynb)
+- [Training SAEs on Synthetic Data](tutorials/training_saes_on_synthetic_data.ipynb)
+  [](https://githubtocolab.com/decoderesearch/SAELens/blob/main/tutorials/training_saes_on_synthetic_data.ipynb)
 
 ## Join the Slack!
 

{sae_lens-6.26.1 → sae_lens-6.28.0}/README.md
@@ -41,6 +41,8 @@ The new v6 update is a major refactor to SAELens and changes the way training co
   [](https://githubtocolab.com/decoderesearch/SAELens/blob/main/tutorials/logits_lens_with_features.ipynb)
 - [Training a Sparse Autoencoder](tutorials/training_a_sparse_autoencoder.ipynb)
   [](https://githubtocolab.com/decoderesearch/SAELens/blob/main/tutorials/training_a_sparse_autoencoder.ipynb)
+- [Training SAEs on Synthetic Data](tutorials/training_saes_on_synthetic_data.ipynb)
+  [](https://githubtocolab.com/decoderesearch/SAELens/blob/main/tutorials/training_saes_on_synthetic_data.ipynb)
 
 ## Join the Slack!
 

{sae_lens-6.26.1 → sae_lens-6.28.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sae-lens"
-version = "6.26.1"
+version = "6.28.0"
 description = "Training and Analyzing Sparse Autoencoders (SAEs)"
 authors = ["Joseph Bloom"]
 readme = "README.md"
@@ -58,6 +58,7 @@ eai-sparsify = "^1.1.1"
 mike = "^2.0.0"
 trio = "^0.30.0"
 dictionary-learning = "^0.1.0"
+kaleido = "^1.2.0"
 
 [tool.poetry.extras]
 mamba = ["mamba-lens"]

{sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/__init__.py
@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.26.1"
+__version__ = "6.28.0"
 
 import logging
 
@@ -63,6 +63,7 @@ from .loading.pretrained_sae_loaders import (
 from .pretokenize_runner import PretokenizeRunner, pretokenize_runner
 from .registry import register_sae_class, register_sae_training_class
 from .training.activations_store import ActivationsStore
+from .training.sae_trainer import SAETrainer
 from .training.upload_saes_to_huggingface import upload_saes_to_huggingface
 
 __all__ = [
@@ -102,6 +103,7 @@ __all__ = [
     "JumpReLUTrainingSAE",
     "JumpReLUTrainingSAEConfig",
     "SAETrainingRunner",
+    "SAETrainer",
     "LoggingConfig",
     "BatchTopKTrainingSAE",
     "BatchTopKTrainingSAEConfig",
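The practical effect of the hunks above is that the trainer class is now importable from the package root, e.g.:

from sae_lens import SAETrainer  # new top-level export in 6.28.0
# the long-form path added above remains: sae_lens.training.sae_trainer.SAETrainer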

{sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/cache_activations_runner.py
@@ -263,14 +263,21 @@ class CacheActivationsRunner:
 
         for i in tqdm(range(self.cfg.n_buffers), desc="Caching activations"):
             try:
-
-
-                )
-
+                # Accumulate n_batches_in_buffer batches into one shard
+                buffers: list[tuple[torch.Tensor, torch.Tensor | None]] = []
+                for _ in range(self.cfg.n_batches_in_buffer):
+                    buffers.append(self.activations_store.get_raw_llm_batch())
+                # Concatenate all batches
+                acts = torch.cat([b[0] for b in buffers], dim=0)
+                token_ids: torch.Tensor | None = None
+                if buffers[0][1] is not None:
+                    # All batches have token_ids if the first one does
+                    token_ids = torch.cat([b[1] for b in buffers], dim=0)  # type: ignore[arg-type]
+                shard = self._create_shard((acts, token_ids))
                 shard.save_to_disk(
                     f"{tmp_cached_activation_path}/shard_{i:05d}", num_shards=1
                 )
-                del
+                del buffers, acts, token_ids, shard
             except StopIteration:
                 logger.warning(
                     f"Warning: Ran out of samples while filling the buffer at batch {i} before reaching {self.cfg.n_buffers} batches."
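The reworked loop above gathers several raw LLM batches and concatenates them into a single shard before writing. A self-contained sketch of that accumulate-then-concat step (illustrative only, not sae_lens internals; accumulate_batches is a hypothetical helper):

import torch

def accumulate_batches(
    batches: list[tuple[torch.Tensor, torch.Tensor | None]],
) -> tuple[torch.Tensor, torch.Tensor | None]:
    # Concatenate (activations, token_ids) pairs along the batch dimension,
    # mirroring the logic in the hunk above.
    acts = torch.cat([b[0] for b in batches], dim=0)
    token_ids = None
    if batches[0][1] is not None:
        token_ids = torch.cat([b[1] for b in batches], dim=0)  # type: ignore[arg-type]
    return acts, token_ids

# Two fake batches of 4 activations each, d_in = 8:
b1 = (torch.randn(4, 8), torch.arange(4))
b2 = (torch.randn(4, 8), torch.arange(4))
acts, token_ids = accumulate_batches([b1, b2])
assert acts.shape == (8, 8) and token_ids is not None and token_ids.shape == (8,)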

{sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/config.py
@@ -148,6 +148,7 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
         seqpos_slice (tuple[int | None, ...]): Determines slicing of activations when constructing batches during training. The slice should be (start_pos, end_pos, optional[step_size]), e.g. for Othello we sometimes use (5, -5). Note, step_size > 0.
         disable_concat_sequences (bool): Whether to disable concatenating sequences and ignore sequences shorter than the context size. If True, disables concatenating and ignores short sequences.
         sequence_separator_token (int | Literal["bos", "eos", "sep"] | None): If not `None`, this token will be placed between sentences in a batch to act as a separator. By default, this is the `<bos>` token.
+        activations_mixing_fraction (float): Fraction of the activation buffer to keep for mixing with new activations (default 0.5). Higher values mean more temporal shuffling but slower throughput. If 0, activations are served in order without shuffling (no temporal mixing).
         device (str): The device to use. Usually "cuda".
         act_store_device (str): The device to use for the activation store. "cpu" is advised in order to save VRAM. Defaults to "with_model" which uses the same device as the main model.
         seed (int): The seed to use.
@@ -217,6 +218,7 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
     sequence_separator_token: int | Literal["bos", "eos", "sep"] | None = (
         special_token_field(default="bos")
     )
+    activations_mixing_fraction: float = 0.5
 
     # Misc
     device: str = "cpu"
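A self-contained sketch of what the new activations_mixing_fraction controls, per the docstring above: a fraction of the buffer is held back and reshuffled with incoming activations (illustrative only, not the sae_lens mixing-buffer implementation):

import torch

def serve_with_mixing(
    new_acts: torch.Tensor, retained: torch.Tensor, fraction: float
) -> tuple[torch.Tensor, torch.Tensor]:
    # Pool the retained activations with the incoming batch, shuffle,
    # then hold back `fraction` of the pool for the next round.
    pool = torch.cat([retained, new_acts], dim=0)
    pool = pool[torch.randperm(pool.shape[0])]
    n_keep = int(pool.shape[0] * fraction)
    return pool[n_keep:], pool[:n_keep]  # (served, retained)

retained = torch.randn(64, 16)
served, retained = serve_with_mixing(torch.randn(64, 16), retained, fraction=0.5)
# fraction=0.0 would serve everything in arrival order: no temporal mixing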

{sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/loading/pretrained_sae_loaders.py
@@ -959,7 +959,7 @@ def get_dictionary_learning_config_1_from_hf(
     architecture = "standard"
     if trainer["dict_class"] == "GatedAutoEncoder":
         architecture = "gated"
-    elif trainer["dict_class"]
+    elif trainer["dict_class"] in ["MatryoshkaBatchTopKSAE", "BatchTopKSAE"]:
         architecture = "jumprelu"
 
     return {
@@ -1831,6 +1831,7 @@ def temporal_sae_huggingface_loader(
     Load TemporalSAE from canrager/temporalSAEs format (safetensors version).
 
     Expects folder_name to contain:
+
     - conf.yaml (configuration)
     - latest_ckpt.safetensors (model weights)
     """

{sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/gated_sae.py
@@ -118,6 +118,7 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
     """
     GatedTrainingSAE is a concrete implementation of BaseTrainingSAE for the "gated" SAE architecture.
     It implements:
+
     - initialize_weights: sets up gating parameters (as in GatedSAE) plus optional training-specific init.
     - encode: calls encode_with_hidden_pre (standard training approach).
     - decode: linear transformation + hooking, same as GatedSAE or StandardTrainingSAE.

{sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/jumprelu_sae.py
@@ -105,6 +105,7 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
     activation function (e.g., ReLU etc.).
 
     It implements:
+
     - initialize_weights: sets up parameters, including a threshold.
     - encode: computes the feature activations using JumpReLU.
     - decode: reconstructs the input from the feature activations.
@@ -216,10 +217,12 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
     JumpReLUTrainingSAE is a training-focused implementation of a SAE using a JumpReLU activation.
 
     Similar to the inference-only JumpReLUSAE, but with:
+
     - A learnable log-threshold parameter (instead of a raw threshold).
     - A specialized auxiliary loss term for sparsity (L0 or similar).
 
     Methods of interest include:
+
     - initialize_weights: sets up W_enc, b_enc, W_dec, b_dec, and log_threshold.
     - encode_with_hidden_pre_jumprelu: runs a forward pass for training.
     - training_forward_pass: calculates MSE and auxiliary losses, returning a TrainStepOutput.
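For reference, the JumpReLU nonlinearity with a learnable log-threshold mentioned in the docstring above, as a hedged sketch (not the library's exact forward pass):

import torch

log_threshold = torch.nn.Parameter(torch.zeros(4))  # one threshold per latent
pre_acts = torch.tensor([[-0.5, 0.2, 1.5, 3.0]])
threshold = log_threshold.exp()                      # exp keeps the threshold positive
acts = pre_acts * (pre_acts > threshold)             # pass through above threshold, zero below
# acts == [[0.0, 0.0, 1.5, 3.0]] here, since threshold = exp(0) = 1.0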

{sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/standard_sae.py
@@ -34,6 +34,7 @@ class StandardSAE(SAE[StandardSAEConfig]):
     using a simple linear encoder and decoder.
 
     It implements the required abstract methods from BaseSAE:
+
     - initialize_weights: sets up simple parameter initializations for W_enc, b_enc, W_dec, and b_dec.
     - encode: computes the feature activations from an input.
     - decode: reconstructs the input from the feature activations.
@@ -99,6 +100,7 @@ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
     """
     StandardTrainingSAE is a concrete implementation of BaseTrainingSAE using the "standard" SAE architecture.
     It implements:
+
     - initialize_weights: basic weight initialization for encoder/decoder.
     - encode: inference encoding (invokes encode_with_hidden_pre).
     - decode: a simple linear decoder.

{sae_lens-6.26.1 → sae_lens-6.28.0}/sae_lens/saes/temporal_sae.py
@@ -167,6 +167,7 @@ class TemporalSAE(SAE[TemporalSAEConfig]):
     """TemporalSAE: Sparse Autoencoder with temporal attention.
 
     This SAE decomposes each activation x_t into:
+
     - x_pred: Information aggregated from context {x_0, ..., x_{t-1}}
     - x_novel: Novel information at position t (encoded sparsely)
 

sae_lens-6.28.0/sae_lens/synthetic/__init__.py
@@ -0,0 +1,89 @@
+"""
+Synthetic data utilities for SAE experiments.
+
+This module provides tools for creating feature dictionaries and generating
+synthetic activations for testing and experimenting with SAEs.
+
+Main components:
+
+- FeatureDictionary: Maps sparse feature activations to dense hidden activations
+- ActivationGenerator: Generates batches of synthetic feature activations
+- HierarchyNode: Enforces hierarchical structure on feature activations
+- Training utilities: Helpers for training and evaluating SAEs on synthetic data
+- Plotting utilities: Visualization helpers for understanding SAE behavior
+"""
+
+from sae_lens.synthetic.activation_generator import (
+    ActivationGenerator,
+    ActivationsModifier,
+    ActivationsModifierInput,
+)
+from sae_lens.synthetic.correlation import (
+    create_correlation_matrix_from_correlations,
+    generate_random_correlation_matrix,
+    generate_random_correlations,
+)
+from sae_lens.synthetic.evals import (
+    SyntheticDataEvalResult,
+    eval_sae_on_synthetic_data,
+    mean_correlation_coefficient,
+)
+from sae_lens.synthetic.feature_dictionary import (
+    FeatureDictionary,
+    FeatureDictionaryInitializer,
+    orthogonal_initializer,
+    orthogonalize_embeddings,
+)
+from sae_lens.synthetic.firing_probabilities import (
+    linear_firing_probabilities,
+    random_firing_probabilities,
+    zipfian_firing_probabilities,
+)
+from sae_lens.synthetic.hierarchy import HierarchyNode, hierarchy_modifier
+from sae_lens.synthetic.initialization import init_sae_to_match_feature_dict
+from sae_lens.synthetic.plotting import (
+    find_best_feature_ordering,
+    find_best_feature_ordering_across_saes,
+    find_best_feature_ordering_from_sae,
+    plot_sae_feature_similarity,
+)
+from sae_lens.synthetic.training import (
+    SyntheticActivationIterator,
+    train_toy_sae,
+)
+from sae_lens.util import cosine_similarities
+
+__all__ = [
+    # Main classes
+    "FeatureDictionary",
+    "HierarchyNode",
+    "hierarchy_modifier",
+    "ActivationGenerator",
+    # Activation generation
+    "zipfian_firing_probabilities",
+    "linear_firing_probabilities",
+    "random_firing_probabilities",
+    "create_correlation_matrix_from_correlations",
+    "generate_random_correlations",
+    "generate_random_correlation_matrix",
+    # Feature modifiers
+    "ActivationsModifier",
+    "ActivationsModifierInput",
+    # Utilities
+    "orthogonalize_embeddings",
+    "orthogonal_initializer",
+    "FeatureDictionaryInitializer",
+    "cosine_similarities",
+    # Training utilities
+    "SyntheticActivationIterator",
+    "SyntheticDataEvalResult",
+    "train_toy_sae",
+    "eval_sae_on_synthetic_data",
+    "mean_correlation_coefficient",
+    "init_sae_to_match_feature_dict",
+    # Plotting utilities
+    "find_best_feature_ordering",
+    "find_best_feature_ordering_from_sae",
+    "find_best_feature_ordering_across_saes",
+    "plot_sae_feature_similarity",
+]
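Everything in __all__ above is importable directly from the new subpackage, e.g.:

from sae_lens.synthetic import (
    ActivationGenerator,
    FeatureDictionary,
    generate_random_correlation_matrix,
)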

sae_lens-6.28.0/sae_lens/synthetic/activation_generator.py
@@ -0,0 +1,215 @@
+"""
+Functions for generating synthetic feature activations.
+"""
+
+from collections.abc import Callable, Sequence
+
+import torch
+from scipy.stats import norm
+from torch import nn
+from torch.distributions import MultivariateNormal
+
+from sae_lens.util import str_to_dtype
+
+ActivationsModifier = Callable[[torch.Tensor], torch.Tensor]
+ActivationsModifierInput = ActivationsModifier | Sequence[ActivationsModifier] | None
+
+
+class ActivationGenerator(nn.Module):
+    """
+    Generator for synthetic feature activations.
+
+    This module provides a generator for synthetic feature activations with controlled properties.
+    """
+
+    num_features: int
+    firing_probabilities: torch.Tensor
+    std_firing_magnitudes: torch.Tensor
+    mean_firing_magnitudes: torch.Tensor
+    modify_activations: ActivationsModifier | None
+    correlation_matrix: torch.Tensor | None
+    correlation_thresholds: torch.Tensor | None
+
+    def __init__(
+        self,
+        num_features: int,
+        firing_probabilities: torch.Tensor | float,
+        std_firing_magnitudes: torch.Tensor | float = 0.0,
+        mean_firing_magnitudes: torch.Tensor | float = 1.0,
+        modify_activations: ActivationsModifierInput = None,
+        correlation_matrix: torch.Tensor | None = None,
+        device: torch.device | str = "cpu",
+        dtype: torch.dtype | str = "float32",
+    ):
+        super().__init__()
+        self.num_features = num_features
+        self.firing_probabilities = _to_tensor(
+            firing_probabilities, num_features, device, dtype
+        )
+        self.std_firing_magnitudes = _to_tensor(
+            std_firing_magnitudes, num_features, device, dtype
+        )
+        self.mean_firing_magnitudes = _to_tensor(
+            mean_firing_magnitudes, num_features, device, dtype
+        )
+        self.modify_activations = _normalize_modifiers(modify_activations)
+        self.correlation_thresholds = None
+        if correlation_matrix is not None:
+            _validate_correlation_matrix(correlation_matrix, num_features)
+            self.correlation_thresholds = torch.tensor(
+                [norm.ppf(1 - p.item()) for p in self.firing_probabilities],
+                device=device,
+                dtype=self.firing_probabilities.dtype,
+            )
+        self.correlation_matrix = correlation_matrix
+
+    def sample(self, batch_size: int) -> torch.Tensor:
+        """
+        Generate a batch of feature activations with controlled properties.
+
+        This is the main function for generating synthetic training data for SAEs.
+        Features fire independently according to their firing probabilities unless
+        a correlation matrix is provided.
+
+        Args:
+            batch_size: Number of samples to generate
+
+        Returns:
+            Tensor of shape [batch_size, num_features] with non-negative activations
+        """
+        # All tensors (firing_probabilities, std_firing_magnitudes, mean_firing_magnitudes)
+        # are on the same device from __init__ via _to_tensor()
+        device = self.firing_probabilities.device
+
+        if self.correlation_matrix is not None:
+            assert self.correlation_thresholds is not None
+            firing_features = _generate_correlated_features(
+                batch_size,
+                self.correlation_matrix,
+                self.correlation_thresholds,
+                device,
+            )
+        else:
+            firing_features = torch.bernoulli(
+                self.firing_probabilities.unsqueeze(0).expand(batch_size, -1)
+            )
+
+        firing_magnitude_delta = torch.normal(
+            torch.zeros_like(self.firing_probabilities)
+            .unsqueeze(0)
+            .expand(batch_size, -1),
+            self.std_firing_magnitudes.unsqueeze(0).expand(batch_size, -1),
+        )
+        firing_magnitude_delta[firing_features == 0] = 0
+        feature_activations = (
+            firing_features * self.mean_firing_magnitudes + firing_magnitude_delta
+        ).relu()
+
+        if self.modify_activations is not None:
+            feature_activations = self.modify_activations(feature_activations).relu()
+        return feature_activations
+
+    def forward(self, batch_size: int) -> torch.Tensor:
+        return self.sample(batch_size)
+
+
+def _generate_correlated_features(
+    batch_size: int,
+    correlation_matrix: torch.Tensor,
+    thresholds: torch.Tensor,
+    device: torch.device,
+) -> torch.Tensor:
+    """
+    Generate correlated binary features using multivariate Gaussian sampling.
+
+    Uses the Gaussian copula approach: sample from a multivariate normal
+    distribution, then threshold to get binary features.
+
+    Args:
+        batch_size: Number of samples to generate
+        correlation_matrix: Correlation matrix between features
+        thresholds: Pre-computed thresholds for each feature (from inverse normal CDF)
+        device: Device to generate samples on
+
+    Returns:
+        Binary feature matrix of shape [batch_size, num_features]
+    """
+    num_features = correlation_matrix.shape[0]
+
+    mvn = MultivariateNormal(
+        loc=torch.zeros(num_features, device=device, dtype=thresholds.dtype),
+        covariance_matrix=correlation_matrix.to(device=device, dtype=thresholds.dtype),
+    )
+
+    gaussian_samples = mvn.sample((batch_size,))
+    return (gaussian_samples > thresholds.unsqueeze(0)).float()
+
+
+def _to_tensor(
+    value: torch.Tensor | float,
+    num_features: int,
+    device: torch.device | str,
+    dtype: torch.dtype | str,
+) -> torch.Tensor:
+    dtype = str_to_dtype(dtype)
+    device = torch.device(device)
+    if not isinstance(value, torch.Tensor):
+        value = value * torch.ones(num_features, device=device, dtype=dtype)
+    if value.shape != (num_features,):
+        raise ValueError(
+            f"Value must be a tensor of shape ({num_features},) or a float. Got {value.shape}"
+        )
+    return value.to(device, dtype)
+
+
+def _normalize_modifiers(
+    modify_activations: ActivationsModifierInput,
+) -> ActivationsModifier | None:
+    """Convert modifier input to a single modifier or None."""
+    if modify_activations is None:
+        return None
+    if callable(modify_activations):
+        return modify_activations
+    # It's a sequence of modifiers - chain them
+    modifiers = list(modify_activations)
+    if len(modifiers) == 0:
+        return None
+    if len(modifiers) == 1:
+        return modifiers[0]
+
+    def chained(activations: torch.Tensor) -> torch.Tensor:
+        result = activations
+        for modifier in modifiers:
+            result = modifier(result)
+        return result
+
+    return chained
+
+
+def _validate_correlation_matrix(
+    correlation_matrix: torch.Tensor, num_features: int
+) -> None:
+    """Validate that a correlation matrix has correct properties.
+
+    Args:
+        correlation_matrix: The matrix to validate
+        num_features: Expected number of features (matrix should be [num_features, num_features])
+
+    Raises:
+        ValueError: If the matrix has incorrect shape, non-unit diagonal, or is not positive definite
+    """
+    expected_shape = (num_features, num_features)
+    if correlation_matrix.shape != expected_shape:
+        raise ValueError(
+            f"Correlation matrix must have shape {expected_shape}, "
+            f"got {tuple(correlation_matrix.shape)}"
+        )
+
+    diagonal = torch.diag(correlation_matrix)
+    if not torch.allclose(diagonal, torch.ones_like(diagonal)):
+        raise ValueError("Correlation matrix diagonal must be all 1s")
+
+    try:
+        torch.linalg.cholesky(correlation_matrix)
+    except RuntimeError as e:
+        raise ValueError("Correlation matrix must be positive definite") from e
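A minimal usage sketch based on the constructor and sample() signatures above (the numbers are illustrative):

import torch
from sae_lens.synthetic import ActivationGenerator

gen = ActivationGenerator(
    num_features=4,
    firing_probabilities=0.25,     # each feature fires ~25% of the time
    mean_firing_magnitudes=1.0,
    std_firing_magnitudes=0.1,
)
acts = gen.sample(batch_size=1000)      # shape [1000, 4], non-negative
print(acts.gt(0).float().mean(dim=0))   # empirical firing rates, ~0.25 each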

sae_lens-6.28.0/sae_lens/synthetic/correlation.py
@@ -0,0 +1,170 @@
+import random
+
+import torch
+
+
+def create_correlation_matrix_from_correlations(
+    num_features: int,
+    correlations: dict[tuple[int, int], float] | None = None,
+    default_correlation: float = 0.0,
+) -> torch.Tensor:
+    """
+    Create a correlation matrix with specified pairwise correlations.
+
+    Args:
+        num_features: Number of features
+        correlations: Dict mapping (i, j) pairs to correlation values.
+            Pairs should have i < j.
+        default_correlation: Default correlation for unspecified pairs
+
+    Returns:
+        Correlation matrix of shape [num_features, num_features]
+    """
+    matrix = torch.eye(num_features) + default_correlation * (
+        1 - torch.eye(num_features)
+    )
+
+    if correlations is not None:
+        for (i, j), corr in correlations.items():
+            matrix[i, j] = corr
+            matrix[j, i] = corr
+
+    # Ensure matrix is symmetric (numerical precision)
+    matrix = (matrix + matrix.T) / 2
+
+    # Check positive definiteness and fix if necessary
+    # Use eigvalsh for symmetric matrices (returns real eigenvalues)
+    eigenvals = torch.linalg.eigvalsh(matrix)
+    if torch.any(eigenvals < -1e-6):
+        matrix = _fix_correlation_matrix(matrix)
+
+    return matrix
+
+
+def _fix_correlation_matrix(
+    matrix: torch.Tensor, min_eigenval: float = 1e-6
+) -> torch.Tensor:
+    """Fix a correlation matrix to be positive semi-definite."""
+    eigenvals, eigenvecs = torch.linalg.eigh(matrix)
+    eigenvals = torch.clamp(eigenvals, min=min_eigenval)
+    fixed_matrix = eigenvecs @ torch.diag(eigenvals) @ eigenvecs.T
+
+    diag_vals = torch.diag(fixed_matrix)
+    fixed_matrix = fixed_matrix / torch.sqrt(
+        diag_vals.unsqueeze(0) * diag_vals.unsqueeze(1)
+    )
+    fixed_matrix.fill_diagonal_(1.0)
+
+    return fixed_matrix
+
+
+def generate_random_correlations(
+    num_features: int,
+    positive_ratio: float = 0.5,
+    uncorrelated_ratio: float = 0.3,
+    min_correlation_strength: float = 0.1,
+    max_correlation_strength: float = 0.8,
+    seed: int | None = None,
+) -> dict[tuple[int, int], float]:
+    """
+    Generate random correlations between features with specified constraints.
+
+    Args:
+        num_features: Number of features
+        positive_ratio: Fraction of correlations that should be positive (0.0 to 1.0)
+        uncorrelated_ratio: Fraction of feature pairs that should remain uncorrelated (0.0 to 1.0)
+        min_correlation_strength: Minimum absolute correlation strength
+        max_correlation_strength: Maximum absolute correlation strength
+        seed: Random seed for reproducibility
+
+    Returns:
+        Dictionary mapping (i, j) pairs to correlation values
+    """
+    # Use local random number generator to avoid side effects on global state
+    rng = random.Random(seed)
+
+    # Validate inputs
+    if not 0.0 <= positive_ratio <= 1.0:
+        raise ValueError("positive_ratio must be between 0.0 and 1.0")
+    if not 0.0 <= uncorrelated_ratio <= 1.0:
+        raise ValueError("uncorrelated_ratio must be between 0.0 and 1.0")
+    if min_correlation_strength < 0:
+        raise ValueError("min_correlation_strength must be non-negative")
+    if max_correlation_strength > 1.0:
+        raise ValueError("max_correlation_strength must be <= 1.0")
+    if min_correlation_strength > max_correlation_strength:
+        raise ValueError("min_correlation_strength must be <= max_correlation_strength")
+
+    # Generate all possible feature pairs (i, j) where i < j
+    all_pairs = [
+        (i, j) for i in range(num_features) for j in range(i + 1, num_features)
+    ]
+    total_pairs = len(all_pairs)
+
+    if total_pairs == 0:
+        return {}
+
+    # Determine how many pairs to correlate vs leave uncorrelated
+    num_uncorrelated = int(total_pairs * uncorrelated_ratio)
+    num_correlated = total_pairs - num_uncorrelated
+
+    # Randomly select which pairs to correlate
+    correlated_pairs = rng.sample(all_pairs, num_correlated)
+
+    # For correlated pairs, determine positive vs negative
+    num_positive = int(num_correlated * positive_ratio)
+    num_negative = num_correlated - num_positive
+
+    # Assign signs
+    signs = [1] * num_positive + [-1] * num_negative
+    rng.shuffle(signs)
+
+    # Generate correlation strengths
+    correlations = {}
+    for pair, sign in zip(correlated_pairs, signs):
+        # Sample correlation strength uniformly from range
+        strength = rng.uniform(min_correlation_strength, max_correlation_strength)
+        correlations[pair] = sign * strength
+
+    return correlations
+
+
+def generate_random_correlation_matrix(
+    num_features: int,
+    positive_ratio: float = 0.5,
+    uncorrelated_ratio: float = 0.3,
+    min_correlation_strength: float = 0.1,
+    max_correlation_strength: float = 0.8,
+    seed: int | None = None,
+) -> torch.Tensor:
+    """
+    Generate a random correlation matrix with specified constraints.
+
+    This is a convenience function that combines generate_random_correlations()
+    and create_correlation_matrix_from_correlations() into a single call.
+
+    Args:
+        num_features: Number of features
+        positive_ratio: Fraction of correlations that should be positive (0.0 to 1.0)
+        uncorrelated_ratio: Fraction of feature pairs that should remain uncorrelated (0.0 to 1.0)
+        min_correlation_strength: Minimum absolute correlation strength
+        max_correlation_strength: Maximum absolute correlation strength
+        seed: Random seed for reproducibility
+
+    Returns:
+        Random correlation matrix of shape [num_features, num_features]
+    """
+    # Generate random correlations
+    correlations = generate_random_correlations(
+        num_features=num_features,
+        positive_ratio=positive_ratio,
+        uncorrelated_ratio=uncorrelated_ratio,
+        min_correlation_strength=min_correlation_strength,
+        max_correlation_strength=max_correlation_strength,
+        seed=seed,
+    )
+
+    # Create and return correlation matrix
+    return create_correlation_matrix_from_correlations(
+        num_features=num_features, correlations=correlations
+    )