sae-lens 6.29.0__tar.gz → 6.30.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sae_lens-6.29.0 → sae_lens-6.30.1}/PKG-INFO +1 -1
- {sae_lens-6.29.0 → sae_lens-6.30.1}/pyproject.toml +1 -1
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/__init__.py +1 -1
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/loading/pretrained_sae_loaders.py +9 -3
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/pretrained_saes.yaml +36 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/activation_generator.py +110 -36
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/feature_dictionary.py +10 -1
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/hierarchy.py +314 -2
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/training.py +16 -3
- {sae_lens-6.29.0 → sae_lens-6.30.1}/LICENSE +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/README.md +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/analysis/__init__.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/analysis/hooked_sae_transformer.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/analysis/neuronpedia_integration.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/cache_activations_runner.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/config.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/constants.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/evals.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/llm_sae_training_runner.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/load_model.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/loading/__init__.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/loading/pretrained_saes_directory.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/pretokenize_runner.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/registry.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/__init__.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/batchtopk_sae.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/gated_sae.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/jumprelu_sae.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/matching_pursuit_sae.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/matryoshka_batchtopk_sae.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/sae.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/standard_sae.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/temporal_sae.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/topk_sae.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/saes/transcoder.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/__init__.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/correlation.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/evals.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/firing_probabilities.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/initialization.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/synthetic/plotting.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/tokenization_and_batching.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/training/__init__.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/training/activation_scaler.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/training/activations_store.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/training/mixing_buffer.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/training/optim.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/training/sae_trainer.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/training/types.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/training/upload_saes_to_huggingface.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/tutorial/tsea.py +0 -0
- {sae_lens-6.29.0 → sae_lens-6.30.1}/sae_lens/util.py +0 -0
sae_lens/loading/pretrained_sae_loaders.py
@@ -575,6 +575,8 @@ def _infer_gemma_3_raw_cfg_dict(repo_id: str, folder_name: str) -> dict[str, Any
         "model_name": model_name,
         "hf_hook_point_in": hf_hook_point_in,
     }
+    if "transcoder" in folder_name or "clt" in folder_name:
+        cfg["affine_connection"] = "affine" in folder_name
     if hf_hook_point_out is not None:
         cfg["hf_hook_point_out"] = hf_hook_point_out
 
@@ -614,11 +616,11 @@ def get_gemma_3_config_from_hf(
     if "resid_post" in folder_name:
         hook_name = f"blocks.{layer}.hook_resid_post"
     elif "attn_out" in folder_name:
-        hook_name = f"blocks.{layer}.
+        hook_name = f"blocks.{layer}.attn.hook_z"
     elif "mlp_out" in folder_name:
         hook_name = f"blocks.{layer}.hook_mlp_out"
     elif "transcoder" in folder_name or "clt" in folder_name:
-        hook_name = f"blocks.{layer}.
+        hook_name = f"blocks.{layer}.hook_mlp_in"
         hook_name_out = f"blocks.{layer}.hook_mlp_out"
     else:
         raise ValueError("Hook name not found in folder_name.")
@@ -643,7 +645,11 @@ def get_gemma_3_config_from_hf(
 
     architecture = "jumprelu"
     if "transcoder" in folder_name or "clt" in folder_name:
-        architecture =
+        architecture = (
+            "jumprelu_skip_transcoder"
+            if raw_cfg_dict.get("affine_connection", False)
+            else "jumprelu_transcoder"
+        )
     d_out = shapes_dict["w_dec"][-1]
 
     cfg = {
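Note: architecture selection for the Gemma 3 loaders is driven entirely by the checkpoint folder name plus the inferred affine_connection flag. A minimal standalone sketch of the combined logic from the two hunks above (the helper function is ours, for illustration only, not sae-lens API):

import re  # noqa: F401  (only the string checks below are needed)

def infer_architecture(folder_name: str) -> str:
    # Mirrors the diff: transcoder/CLT folders get a transcoder architecture,
    # and an "affine" folder name selects the skip-transcoder variant.
    architecture = "jumprelu"
    if "transcoder" in folder_name or "clt" in folder_name:
        affine_connection = "affine" in folder_name  # as in _infer_gemma_3_raw_cfg_dict
        architecture = (
            "jumprelu_skip_transcoder" if affine_connection else "jumprelu_transcoder"
        )
    return architecture

assert infer_architecture("resid_post/layer_9") == "jumprelu"
assert infer_architecture("affine_transcoder/layer_9") == "jumprelu_skip_transcoder"
assert infer_architecture("clt/layer_9") == "jumprelu_transcoder"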
sae_lens/pretrained_saes.yaml
@@ -4148,6 +4148,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_17_width_16k_l0_medium
     path: resid_post/layer_17_width_16k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-4b-it/17-gemmascope-2-res-16k
   - id: layer_17_width_16k_l0_small
     path: resid_post/layer_17_width_16k_l0_small
     l0: 20
@@ -4166,6 +4167,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_17_width_262k_l0_medium
     path: resid_post/layer_17_width_262k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-4b-it/17-gemmascope-2-res-262k
   - id: layer_17_width_262k_l0_medium_seed_1
     path: resid_post/layer_17_width_262k_l0_medium_seed_1
     l0: 60
@@ -4178,6 +4180,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_17_width_65k_l0_medium
     path: resid_post/layer_17_width_65k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-4b-it/17-gemmascope-2-res-65k
   - id: layer_17_width_65k_l0_small
     path: resid_post/layer_17_width_65k_l0_small
     l0: 20
@@ -4187,6 +4190,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_22_width_16k_l0_medium
     path: resid_post/layer_22_width_16k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-4b-it/22-gemmascope-2-res-16k
   - id: layer_22_width_16k_l0_small
     path: resid_post/layer_22_width_16k_l0_small
     l0: 20
@@ -4205,6 +4209,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_22_width_262k_l0_medium
     path: resid_post/layer_22_width_262k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-4b-it/22-gemmascope-2-res-262k
   - id: layer_22_width_262k_l0_medium_seed_1
     path: resid_post/layer_22_width_262k_l0_medium_seed_1
     l0: 60
@@ -4217,6 +4222,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_22_width_65k_l0_medium
     path: resid_post/layer_22_width_65k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-4b-it/22-gemmascope-2-res-65k
   - id: layer_22_width_65k_l0_small
     path: resid_post/layer_22_width_65k_l0_small
     l0: 20
@@ -4226,6 +4232,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_29_width_16k_l0_medium
     path: resid_post/layer_29_width_16k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-4b-it/29-gemmascope-2-res-16k
   - id: layer_29_width_16k_l0_small
     path: resid_post/layer_29_width_16k_l0_small
     l0: 20
@@ -4244,6 +4251,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_29_width_262k_l0_medium
     path: resid_post/layer_29_width_262k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-4b-it/29-gemmascope-2-res-262k
   - id: layer_29_width_262k_l0_medium_seed_1
     path: resid_post/layer_29_width_262k_l0_medium_seed_1
     l0: 60
@@ -4256,6 +4264,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_29_width_65k_l0_medium
     path: resid_post/layer_29_width_65k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-4b-it/29-gemmascope-2-res-65k
   - id: layer_29_width_65k_l0_small
     path: resid_post/layer_29_width_65k_l0_small
     l0: 20
@@ -4265,6 +4274,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_9_width_16k_l0_medium
     path: resid_post/layer_9_width_16k_l0_medium
     l0: 53
+    neuronpedia: gemma-3-4b-it/9-gemmascope-2-res-16k
   - id: layer_9_width_16k_l0_small
     path: resid_post/layer_9_width_16k_l0_small
     l0: 17
@@ -4283,6 +4293,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_9_width_262k_l0_medium
     path: resid_post/layer_9_width_262k_l0_medium
     l0: 53
+    neuronpedia: gemma-3-4b-it/9-gemmascope-2-res-262k
   - id: layer_9_width_262k_l0_medium_seed_1
     path: resid_post/layer_9_width_262k_l0_medium_seed_1
     l0: 53
@@ -4295,6 +4306,7 @@ gemma-scope-2-4b-it-res:
   - id: layer_9_width_65k_l0_medium
     path: resid_post/layer_9_width_65k_l0_medium
     l0: 53
+    neuronpedia: gemma-3-4b-it/9-gemmascope-2-res-65k
   - id: layer_9_width_65k_l0_small
     path: resid_post/layer_9_width_65k_l0_small
     l0: 17
@@ -14491,6 +14503,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_12_width_16k_l0_medium
     path: resid_post/layer_12_width_16k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-270m-it/12-gemmascope-2-res-16k
   - id: layer_12_width_16k_l0_small
     path: resid_post/layer_12_width_16k_l0_small
     l0: 20
@@ -14509,6 +14522,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_12_width_262k_l0_medium
     path: resid_post/layer_12_width_262k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-270m-it/12-gemmascope-2-res-262k
   - id: layer_12_width_262k_l0_medium_seed_1
     path: resid_post/layer_12_width_262k_l0_medium_seed_1
     l0: 60
@@ -14521,6 +14535,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_12_width_65k_l0_medium
     path: resid_post/layer_12_width_65k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-270m-it/12-gemmascope-2-res-65k
   - id: layer_12_width_65k_l0_small
     path: resid_post/layer_12_width_65k_l0_small
     l0: 20
@@ -14530,6 +14545,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_15_width_16k_l0_medium
     path: resid_post/layer_15_width_16k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-270m-it/15-gemmascope-2-res-16k
   - id: layer_15_width_16k_l0_small
     path: resid_post/layer_15_width_16k_l0_small
     l0: 20
@@ -14548,6 +14564,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_15_width_262k_l0_medium
     path: resid_post/layer_15_width_262k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-270m-it/15-gemmascope-2-res-262k
   - id: layer_15_width_262k_l0_medium_seed_1
     path: resid_post/layer_15_width_262k_l0_medium_seed_1
     l0: 60
@@ -14560,6 +14577,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_15_width_65k_l0_medium
     path: resid_post/layer_15_width_65k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-270m-it/15-gemmascope-2-res-65k
   - id: layer_15_width_65k_l0_small
     path: resid_post/layer_15_width_65k_l0_small
     l0: 20
@@ -14569,6 +14587,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_5_width_16k_l0_medium
     path: resid_post/layer_5_width_16k_l0_medium
     l0: 55
+    neuronpedia: gemma-3-270m-it/5-gemmascope-2-res-16k
   - id: layer_5_width_16k_l0_small
     path: resid_post/layer_5_width_16k_l0_small
     l0: 18
@@ -14587,6 +14606,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_5_width_262k_l0_medium
     path: resid_post/layer_5_width_262k_l0_medium
     l0: 55
+    neuronpedia: gemma-3-270m-it/5-gemmascope-2-res-262k
   - id: layer_5_width_262k_l0_medium_seed_1
     path: resid_post/layer_5_width_262k_l0_medium_seed_1
     l0: 55
@@ -14599,6 +14619,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_5_width_65k_l0_medium
     path: resid_post/layer_5_width_65k_l0_medium
     l0: 55
+    neuronpedia: gemma-3-270m-it/5-gemmascope-2-res-65k
   - id: layer_5_width_65k_l0_small
     path: resid_post/layer_5_width_65k_l0_small
     l0: 18
@@ -14608,6 +14629,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_9_width_16k_l0_medium
     path: resid_post/layer_9_width_16k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-270m-it/9-gemmascope-2-res-16k
   - id: layer_9_width_16k_l0_small
     path: resid_post/layer_9_width_16k_l0_small
     l0: 20
@@ -14626,6 +14648,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_9_width_262k_l0_medium
     path: resid_post/layer_9_width_262k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-270m-it/9-gemmascope-2-res-262k
   - id: layer_9_width_262k_l0_medium_seed_1
     path: resid_post/layer_9_width_262k_l0_medium_seed_1
     l0: 60
@@ -14638,6 +14661,7 @@ gemma-scope-2-270m-it-res:
   - id: layer_9_width_65k_l0_medium
     path: resid_post/layer_9_width_65k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-270m-it/9-gemmascope-2-res-65k
   - id: layer_9_width_65k_l0_small
     path: resid_post/layer_9_width_65k_l0_small
     l0: 20
@@ -18727,6 +18751,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_13_width_16k_l0_medium
     path: resid_post/layer_13_width_16k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-1b-it/13-gemmascope-2-res-16k
   - id: layer_13_width_16k_l0_small
     path: resid_post/layer_13_width_16k_l0_small
     l0: 20
@@ -18745,6 +18770,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_13_width_262k_l0_medium
     path: resid_post/layer_13_width_262k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-1b-it/13-gemmascope-2-res-262k
   - id: layer_13_width_262k_l0_medium_seed_1
     path: resid_post/layer_13_width_262k_l0_medium_seed_1
     l0: 60
@@ -18757,6 +18783,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_13_width_65k_l0_medium
     path: resid_post/layer_13_width_65k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-1b-it/13-gemmascope-2-res-65k
   - id: layer_13_width_65k_l0_small
     path: resid_post/layer_13_width_65k_l0_small
     l0: 20
@@ -18766,6 +18793,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_17_width_16k_l0_medium
     path: resid_post/layer_17_width_16k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-1b-it/17-gemmascope-2-res-16k
   - id: layer_17_width_16k_l0_small
     path: resid_post/layer_17_width_16k_l0_small
     l0: 20
@@ -18784,6 +18812,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_17_width_262k_l0_medium
     path: resid_post/layer_17_width_262k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-1b-it/17-gemmascope-2-res-262k
   - id: layer_17_width_262k_l0_medium_seed_1
     path: resid_post/layer_17_width_262k_l0_medium_seed_1
     l0: 60
@@ -18796,6 +18825,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_17_width_65k_l0_medium
     path: resid_post/layer_17_width_65k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-1b-it/17-gemmascope-2-res-65k
   - id: layer_17_width_65k_l0_small
     path: resid_post/layer_17_width_65k_l0_small
     l0: 20
@@ -18805,6 +18835,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_22_width_16k_l0_medium
     path: resid_post/layer_22_width_16k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-1b-it/22-gemmascope-2-res-16k
   - id: layer_22_width_16k_l0_small
     path: resid_post/layer_22_width_16k_l0_small
     l0: 20
@@ -18823,6 +18854,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_22_width_262k_l0_medium
     path: resid_post/layer_22_width_262k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-1b-it/22-gemmascope-2-res-262k
   - id: layer_22_width_262k_l0_medium_seed_1
     path: resid_post/layer_22_width_262k_l0_medium_seed_1
     l0: 60
@@ -18835,6 +18867,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_22_width_65k_l0_medium
     path: resid_post/layer_22_width_65k_l0_medium
     l0: 60
+    neuronpedia: gemma-3-1b-it/22-gemmascope-2-res-65k
   - id: layer_22_width_65k_l0_small
     path: resid_post/layer_22_width_65k_l0_small
     l0: 20
@@ -18844,6 +18877,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_7_width_16k_l0_medium
     path: resid_post/layer_7_width_16k_l0_medium
     l0: 54
+    neuronpedia: gemma-3-1b-it/7-gemmascope-2-res-16k
   - id: layer_7_width_16k_l0_small
     path: resid_post/layer_7_width_16k_l0_small
     l0: 18
@@ -18862,6 +18896,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_7_width_262k_l0_medium
     path: resid_post/layer_7_width_262k_l0_medium
     l0: 54
+    neuronpedia: gemma-3-1b-it/7-gemmascope-2-res-262k
   - id: layer_7_width_262k_l0_medium_seed_1
     path: resid_post/layer_7_width_262k_l0_medium_seed_1
     l0: 54
@@ -18874,6 +18909,7 @@ gemma-scope-2-1b-it-res:
   - id: layer_7_width_65k_l0_medium
     path: resid_post/layer_7_width_65k_l0_medium
     l0: 54
+    neuronpedia: gemma-3-1b-it/7-gemmascope-2-res-65k
   - id: layer_7_width_65k_l0_small
     path: resid_post/layer_7_width_65k_l0_small
     l0: 18
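Note: every added neuronpedia value follows the same pattern, <model>/<layer>-gemmascope-2-res-<width>, and only the l0_medium variant of each width receives one (the l0_small and seed_1 entries do not). A throwaway sketch of that mapping (the helper is ours, not part of sae-lens):

import re

def neuronpedia_slug(model: str, sae_id: str) -> str:
    # e.g. ("gemma-3-4b-it", "layer_17_width_16k_l0_medium")
    #   -> "gemma-3-4b-it/17-gemmascope-2-res-16k"
    m = re.match(r"layer_(\d+)_width_(\d+k)_l0_medium$", sae_id)
    if m is None:
        raise ValueError(f"no neuronpedia slug for {sae_id}")
    layer, width = m.groups()
    return f"{model}/{layer}-gemmascope-2-res-{width}"

assert (
    neuronpedia_slug("gemma-3-270m-it", "layer_12_width_262k_l0_medium")
    == "gemma-3-270m-it/12-gemmascope-2-res-262k"
)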
sae_lens/synthetic/activation_generator.py
@@ -2,12 +2,12 @@
 Functions for generating synthetic feature activations.
 """
 
+import math
 from collections.abc import Callable, Sequence
 
 import torch
-from scipy.stats import norm
 from torch import nn
-from torch.distributions import
+from torch.distributions import MultivariateNormal
 
 from sae_lens.synthetic.correlation import LowRankCorrelationMatrix
 from sae_lens.util import str_to_dtype
@@ -34,6 +34,7 @@ class ActivationGenerator(nn.Module):
     correlation_matrix: torch.Tensor | None
     low_rank_correlation: tuple[torch.Tensor, torch.Tensor] | None
     correlation_thresholds: torch.Tensor | None
+    use_sparse_tensors: bool
 
     def __init__(
         self,
@@ -45,7 +46,34 @@
         correlation_matrix: CorrelationMatrixInput | None = None,
         device: torch.device | str = "cpu",
         dtype: torch.dtype | str = "float32",
+        use_sparse_tensors: bool = False,
     ):
+        """
+        Create a new ActivationGenerator.
+
+        Args:
+            num_features: Number of features to generate activations for.
+            firing_probabilities: Probability of each feature firing. Can be a single
+                float (applied to all features) or a tensor of shape (num_features,).
+            std_firing_magnitudes: Standard deviation of firing magnitudes. Can be a
+                single float or a tensor of shape (num_features,). Defaults to 0.0
+                (deterministic magnitudes).
+            mean_firing_magnitudes: Mean firing magnitude when a feature fires. Can be
+                a single float or a tensor of shape (num_features,). Defaults to 1.0.
+            modify_activations: Optional function(s) to modify activations after
+                generation. Can be a single callable, a sequence of callables (applied
+                in order), or None. Useful for applying hierarchy constraints.
+            correlation_matrix: Optional correlation structure between features. Can be:
+
+                - A full correlation matrix tensor of shape (num_features, num_features)
+                - A LowRankCorrelationMatrix for memory-efficient large-scale correlations
+                - A tuple of (factor, diag) tensors representing low-rank structure
+
+            device: Device to place tensors on. Defaults to "cpu".
+            dtype: Data type for tensors. Defaults to "float32".
+            use_sparse_tensors: If True, return sparse COO tensors from sample().
+                Only recommended when using massive numbers of features. Defaults to False.
+        """
         super().__init__()
         self.num_features = num_features
         self.firing_probabilities = _to_tensor(
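Note: a hedged usage sketch of the new flag (import path and argument names as they appear in this diff; the values are illustrative):

import torch
from sae_lens.synthetic.activation_generator import ActivationGenerator

gen = ActivationGenerator(
    num_features=100_000,
    firing_probabilities=0.001,
    use_sparse_tensors=True,  # sample() returns sparse COO tensors
)
acts = gen(64)  # forward(batch_size) -> [64, 100_000] feature activations
print(acts.is_sparse)  # True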
@@ -61,6 +89,7 @@
         self.correlation_thresholds = None
         self.correlation_matrix = None
         self.low_rank_correlation = None
+        self.use_sparse_tensors = use_sparse_tensors
 
         if correlation_matrix is not None:
             if isinstance(correlation_matrix, torch.Tensor):
@@ -76,12 +105,15 @@
             _validate_low_rank_correlation(
                 correlation_factor, correlation_diag, num_features
             )
-
+            # Pre-compute sqrt for efficiency (used every sample call)
+            self.low_rank_correlation = (
+                correlation_factor,
+                correlation_diag.sqrt(),
+            )
 
-
-
-
-                dtype=self.firing_probabilities.dtype,
+        # Vectorized inverse normal CDF: norm.ppf(1-p) = sqrt(2) * erfinv(1 - 2*p)
+        self.correlation_thresholds = math.sqrt(2) * torch.erfinv(
+            1 - 2 * self.firing_probabilities
         )
 
     @torch.no_grad()
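Note: the deleted scipy call and the new torch expression are mathematically identical. Since Phi(x) = (1 + erf(x / sqrt(2))) / 2, solving Phi(t) = 1 - p gives erf(t / sqrt(2)) = 1 - 2p, hence t = sqrt(2) * erfinv(1 - 2p), exactly the line added above, which removes the scipy dependency from this module. A quick equivalence check (scipy used here only to verify):

import math
import torch
from scipy.stats import norm

p = torch.tensor([0.001, 0.01, 0.1, 0.5])
thresholds = math.sqrt(2) * torch.erfinv(1 - 2 * p)
expected = torch.tensor(norm.ppf(1 - p.numpy()), dtype=torch.float32)
assert torch.allclose(thresholds, expected, atol=1e-4)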
@@ -105,7 +137,7 @@
 
         if self.correlation_matrix is not None:
             assert self.correlation_thresholds is not None
-
+            firing_indices = _generate_correlated_features(
                 batch_size,
                 self.correlation_matrix,
                 self.correlation_thresholds,
@@ -113,7 +145,7 @@
             )
         elif self.low_rank_correlation is not None:
             assert self.correlation_thresholds is not None
-
+            firing_indices = _generate_low_rank_correlated_features(
                 batch_size,
                 self.low_rank_correlation[0],
                 self.low_rank_correlation[1],
@@ -121,23 +153,58 @@
                 device,
             )
         else:
-
+            firing_indices = torch.bernoulli(
                 self.firing_probabilities.unsqueeze(0).expand(batch_size, -1)
+            ).nonzero(as_tuple=True)
+
+        # Compute activations only at firing positions (sparse optimization)
+        feature_indices = firing_indices[1]
+        num_firing = feature_indices.shape[0]
+        mean_at_firing = self.mean_firing_magnitudes[feature_indices]
+        std_at_firing = self.std_firing_magnitudes[feature_indices]
+        random_deltas = (
+            torch.randn(
+                num_firing, device=device, dtype=self.mean_firing_magnitudes.dtype
             )
-
-        firing_magnitude_delta = torch.normal(
-            torch.zeros_like(self.firing_probabilities)
-            .unsqueeze(0)
-            .expand(batch_size, -1),
-            self.std_firing_magnitudes.unsqueeze(0).expand(batch_size, -1),
+            * std_at_firing
         )
-
-
-
-
+        activations_at_firing = (mean_at_firing + random_deltas).relu()
+
+        if self.use_sparse_tensors:
+            # Return sparse COO tensor
+            indices = torch.stack(firing_indices)  # [2, nnz]
+            feature_activations = torch.sparse_coo_tensor(
+                indices,
+                activations_at_firing,
+                size=(batch_size, self.num_features),
+                device=device,
+                dtype=self.mean_firing_magnitudes.dtype,
+            )
+        else:
+            # Dense tensor path
+            feature_activations = torch.zeros(
+                batch_size,
+                self.num_features,
+                device=device,
+                dtype=self.mean_firing_magnitudes.dtype,
+            )
+            feature_activations[firing_indices] = activations_at_firing
 
         if self.modify_activations is not None:
-            feature_activations = self.modify_activations(feature_activations)
+            feature_activations = self.modify_activations(feature_activations)
+            if feature_activations.is_sparse:
+                # Apply relu to sparse values
+                feature_activations = feature_activations.coalesce()
+                feature_activations = torch.sparse_coo_tensor(
+                    feature_activations.indices(),
+                    feature_activations.values().relu(),
+                    feature_activations.shape,
+                    device=feature_activations.device,
+                    dtype=feature_activations.dtype,
+                )
+            else:
+                feature_activations = feature_activations.relu()
+
         return feature_activations
 
     def forward(self, batch_size: int) -> torch.Tensor:
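Note: the rewrite above replaces a dense [batch, num_features] magnitude draw with draws only at the positions that actually fire, so the cost scales with the number of nonzeros rather than batch_size * num_features. A self-contained sketch of the pattern:

import torch

batch, n = 4, 8
probs = torch.full((n,), 0.25)
# Row/column indices of the firing positions only
rows, cols = torch.bernoulli(probs.expand(batch, n)).nonzero(as_tuple=True)

mean, std = torch.ones(n), torch.full((n,), 0.1)
vals = (mean[cols] + torch.randn(cols.shape[0]) * std[cols]).relu()

dense = torch.zeros(batch, n)
dense[rows, cols] = vals  # scatter back into a dense tensor
sparse = torch.sparse_coo_tensor(torch.stack([rows, cols]), vals, (batch, n))
assert torch.allclose(sparse.to_dense(), dense)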
@@ -149,7 +216,7 @@ def _generate_correlated_features(
     correlation_matrix: torch.Tensor,
     thresholds: torch.Tensor,
     device: torch.device,
-) -> torch.Tensor:
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Generate correlated binary features using multivariate Gaussian sampling.
 
@@ -163,7 +230,7 @@
         device: Device to generate samples on
 
     Returns:
-
+        Tuple of (row_indices, col_indices) for firing features
     """
     num_features = correlation_matrix.shape[0]
 
@@ -173,16 +240,17 @@
     )
 
     gaussian_samples = mvn.sample((batch_size,))
-
+    indices = (gaussian_samples > thresholds.unsqueeze(0)).nonzero(as_tuple=True)
+    return indices[0], indices[1]
 
 
 def _generate_low_rank_correlated_features(
     batch_size: int,
     correlation_factor: torch.Tensor,
-
+    cov_diag_sqrt: torch.Tensor,
     thresholds: torch.Tensor,
     device: torch.device,
-) -> torch.Tensor:
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Generate correlated binary features using low-rank multivariate Gaussian sampling.
 
@@ -192,23 +260,29 @@
     Args:
         batch_size: Number of samples to generate
         correlation_factor: Factor matrix of shape (num_features, rank)
-
+        cov_diag_sqrt: Pre-computed sqrt of diagonal term, shape (num_features,)
         thresholds: Pre-computed thresholds for each feature (from inverse normal CDF)
         device: Device to generate samples on
 
     Returns:
-
+        Tuple of (row_indices, col_indices) for firing features
     """
-
-
-
-
-
-
+    # Manual low-rank MVN sampling to enable autocast for the expensive matmul
+    # samples = eps @ cov_factor.T + eta * sqrt(cov_diag)
+    # where eps ~ N(0, I_rank) and eta ~ N(0, I_n)
+
+    num_features, rank = correlation_factor.shape
+
+    # Generate random samples in float32 for numerical stability
+    eps = torch.randn(batch_size, rank, device=device, dtype=correlation_factor.dtype)
+    eta = torch.randn(
+        batch_size, num_features, device=device, dtype=cov_diag_sqrt.dtype
     )
 
-    gaussian_samples =
-
+    gaussian_samples = eps @ correlation_factor.T + eta * cov_diag_sqrt
+
+    indices = (gaussian_samples > thresholds.unsqueeze(0)).nonzero(as_tuple=True)
+    return indices[0], indices[1]
 
 
 def _to_tensor(
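Note: the manual sampler is the standard low-rank MVN construction. With eps ~ N(0, I_rank) and eta ~ N(0, I_n), x = eps @ F.T + eta * sqrt(d) has Cov(x) = F F^T + diag(d), the same covariance that torch.distributions.LowRankMultivariateNormal parameterizes, but written as a single large matmul that autocast can accelerate. An empirical check of that identity:

import torch

torch.manual_seed(0)
n, rank, samples = 6, 2, 200_000
F = torch.randn(n, rank)
d = torch.rand(n) + 0.1

eps = torch.randn(samples, rank)
eta = torch.randn(samples, n)
x = eps @ F.T + eta * d.sqrt()  # same recipe as the function above

emp_cov = (x.T @ x) / samples           # x has mean zero by construction
target = F @ F.T + torch.diag(d)
assert torch.allclose(emp_cov, target, atol=0.1)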
sae_lens/synthetic/feature_dictionary.py
@@ -168,9 +168,18 @@ class FeatureDictionary(nn.Module):
 
         Args:
             feature_activations: Tensor of shape [batch, num_features] containing
-                sparse feature activation values
+                sparse feature activation values. Can be dense or sparse COO.
 
         Returns:
             Tensor of shape [batch, hidden_dim] containing dense hidden activations
         """
+        if feature_activations.is_sparse:
+            # autocast is disabled here because sparse matmul is not supported with bfloat16
+            with torch.autocast(
+                device_type=feature_activations.device.type, enabled=False
+            ):
+                return (
+                    torch.sparse.mm(feature_activations, self.feature_vectors)
+                    + self.bias
+                )
         return feature_activations @ self.feature_vectors + self.bias
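Note: torch.sparse.mm multiplies a sparse COO matrix by a dense one and matches the dense result; the autocast guard above exists because, per the code comment, this kernel lacks bfloat16 support. A small equivalence sketch:

import torch

b, n, h = 4, 16, 8
idx = torch.tensor([[0, 1, 3], [2, 5, 9]])  # [2, nnz]: (batch, feature) coords
vals = torch.tensor([1.0, 0.5, 2.0])
feats = torch.sparse_coo_tensor(idx, vals, (b, n))
W, bias = torch.randn(n, h), torch.zeros(h)

dense_out = feats.to_dense() @ W + bias
sparse_out = torch.sparse.mm(feats, W) + bias  # sparse input path
assert torch.allclose(dense_out, sparse_out, atol=1e-6)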
sae_lens/synthetic/hierarchy.py
@@ -147,6 +147,14 @@ class _SparseHierarchyData:
     # Total number of ME groups
     num_groups: int
 
+    # Sparse COO support: Feature-to-parent mapping
+    # feat_to_parent[f] = parent feature index, or -1 if root/no parent
+    feat_to_parent: torch.Tensor | None = None  # [num_features]
+
+    # Sparse COO support: Feature-to-ME-group mapping
+    # feat_to_me_group[f] = group index, or -1 if not in any ME group
+    feat_to_me_group: torch.Tensor | None = None  # [num_features]
+
 
 def _build_sparse_hierarchy(
     roots: Sequence[HierarchyNode],
@@ -232,7 +240,11 @@ def _build_sparse_hierarchy(
             me_indices = torch.empty(0, dtype=torch.long)
 
         level_data.append(
-            _LevelData(
+            _LevelData(
+                features=feats,
+                parents=parents,
+                me_group_indices=me_indices,
+            )
         )
 
     # Build group siblings and parents tensors
@@ -254,12 +266,30 @@ def _build_sparse_hierarchy(
         me_group_parents = torch.empty(0, dtype=torch.long)
         num_groups = 0
 
+    # Build sparse COO support: feat_to_parent and feat_to_me_group mappings
+    # First determine num_features (max feature index + 1)
+    all_features = [f for f, _, _ in feature_info]
+    num_features = max(all_features) + 1 if all_features else 0
+
+    # Build feature-to-parent mapping
+    feat_to_parent = torch.full((num_features,), -1, dtype=torch.long)
+    for feat, parent, _ in feature_info:
+        feat_to_parent[feat] = parent
+
+    # Build feature-to-ME-group mapping
+    feat_to_me_group = torch.full((num_features,), -1, dtype=torch.long)
+    for g_idx, (_, _, siblings) in enumerate(me_groups):
+        for sib in siblings:
+            feat_to_me_group[sib] = g_idx
+
     return _SparseHierarchyData(
         level_data=level_data,
         me_group_siblings=me_group_siblings,
         me_group_sizes=me_group_sizes,
         me_group_parents=me_group_parents,
         num_groups=num_groups,
+        feat_to_parent=feat_to_parent,
+        feat_to_me_group=feat_to_me_group,
     )
 
 
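Note: the two new lookup tables flatten the tree into arrays indexed by feature id, so the sparse path can answer "parent of feature f" and "ME group of feature f" with a single gather instead of Python-level traversal. Sketch mirroring the construction above (the (feat, parent, _) triples stand in for feature_info):

import torch

feature_info = [(0, -1, None), (1, 0, None), (2, 0, None)]
num_features = max(f for f, _, _ in feature_info) + 1
feat_to_parent = torch.full((num_features,), -1, dtype=torch.long)
for feat, parent, _ in feature_info:
    feat_to_parent[feat] = parent

active_feats = torch.tensor([1, 2])
print(feat_to_parent[active_feats])  # tensor([0, 0]): one gather, no loop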
@@ -396,8 +426,9 @@ def _apply_me_for_groups(
     # Random selection for winner
     # Use -1e9 instead of -inf to avoid creating a tensor (torch.tensor(-float("inf")))
     # on every call. Since random scores are in [0,1], -1e9 is effectively -inf for argmax.
+    _INACTIVE_SCORE = -1e9
     random_scores = torch.rand(num_conflicts, max_siblings, device=device)
-    random_scores[~conflict_active] =
+    random_scores[~conflict_active] = _INACTIVE_SCORE
 
     winner_idx = random_scores.argmax(dim=1)
 
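Note: hoisting -1e9 into _INACTIVE_SCORE just names the sentinel; the trick itself is that the scores are uniform in [0, 1], so any value below 0 behaves like -inf under argmax without allocating a fresh -inf tensor per call. A short demonstration:

import torch

scores = torch.rand(3, 4)
mask = torch.tensor([[True, False, True, True]] * 3)
scores[~mask] = -1e9  # masked slots can never win
assert bool(mask.gather(1, scores.argmax(dim=1, keepdim=True)).all())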
@@ -420,6 +451,275 @@
     activations[deact_batch, deact_feat] = 0
 
 
+# ---------------------------------------------------------------------------
+# Sparse COO hierarchy implementation
+# ---------------------------------------------------------------------------
+
+
+def _apply_hierarchy_sparse_coo(
+    sparse_tensor: torch.Tensor,
+    sparse_data: _SparseHierarchyData,
+) -> torch.Tensor:
+    """
+    Apply hierarchy constraints to a sparse COO tensor.
+
+    This is the sparse analog of _apply_hierarchy_sparse. It processes
+    level-by-level, applying parent deactivation then mutual exclusion.
+    """
+    if sparse_tensor._nnz() == 0:
+        return sparse_tensor
+
+    sparse_tensor = sparse_tensor.coalesce()
+
+    for level_data in sparse_data.level_data:
+        # Step 1: Apply parent deactivation for features at this level
+        if level_data.features.numel() > 0:
+            sparse_tensor = _apply_parent_deactivation_coo(
+                sparse_tensor, level_data, sparse_data
+            )
+
+        # Step 2: Apply ME for groups whose parent is at this level
+        if level_data.me_group_indices.numel() > 0:
+            sparse_tensor = _apply_me_coo(
+                sparse_tensor, level_data.me_group_indices, sparse_data
+            )
+
+    return sparse_tensor
+
+
+def _apply_parent_deactivation_coo(
+    sparse_tensor: torch.Tensor,
+    level_data: _LevelData,
+    sparse_data: _SparseHierarchyData,
+) -> torch.Tensor:
+    """
+    Remove children from sparse COO tensor when their parent is inactive.
+
+    Uses searchsorted for efficient membership testing of parent activity.
+    """
+    if sparse_tensor._nnz() == 0 or level_data.features.numel() == 0:
+        return sparse_tensor
+
+    sparse_tensor = sparse_tensor.coalesce()
+    indices = sparse_tensor.indices()  # [2, nnz]
+    values = sparse_tensor.values()  # [nnz]
+    batch_indices = indices[0]
+    feat_indices = indices[1]
+
+    _, num_features = sparse_tensor.shape
+    device = sparse_tensor.device
+    nnz = indices.shape[1]
+
+    # Build set of active (batch, feature) pairs for efficient lookup
+    # Encode as: batch_idx * num_features + feat_idx
+    active_pairs = batch_indices * num_features + feat_indices
+    active_pairs_sorted, _ = active_pairs.sort()
+
+    # Use the precomputed feat_to_parent mapping
+    assert sparse_data.feat_to_parent is not None
+    hierarchy_num_features = sparse_data.feat_to_parent.numel()
+
+    # Handle features outside the hierarchy (they have no parent, pass through)
+    in_hierarchy = feat_indices < hierarchy_num_features
+    parent_of_feat = torch.full((nnz,), -1, dtype=torch.long, device=device)
+    parent_of_feat[in_hierarchy] = sparse_data.feat_to_parent[
+        feat_indices[in_hierarchy]
+    ]
+
+    # Find entries that have a parent (parent >= 0 means this feature has a parent)
+    has_parent = parent_of_feat >= 0
+
+    if not has_parent.any():
+        return sparse_tensor
+
+    # For entries with parents, check if parent is active
+    child_entry_indices = torch.where(has_parent)[0]
+    child_batch = batch_indices[has_parent]
+    child_parents = parent_of_feat[has_parent]
+
+    # Look up parent activity using searchsorted
+    parent_pairs = child_batch * num_features + child_parents
+    search_pos = torch.searchsorted(active_pairs_sorted, parent_pairs)
+    search_pos = search_pos.clamp(max=active_pairs_sorted.numel() - 1)
+    parent_active = active_pairs_sorted[search_pos] == parent_pairs
+
+    # Handle empty case
+    if active_pairs_sorted.numel() == 0:
+        parent_active = torch.zeros_like(parent_pairs, dtype=torch.bool)
+
+    # Build keep mask: keep entry if it's a root OR its parent is active
+    keep_mask = torch.ones(nnz, dtype=torch.bool, device=device)
+    keep_mask[child_entry_indices[~parent_active]] = False
+
+    if keep_mask.all():
+        return sparse_tensor
+
+    return torch.sparse_coo_tensor(
+        indices[:, keep_mask],
+        values[keep_mask],
+        sparse_tensor.shape,
+        device=device,
+        dtype=sparse_tensor.dtype,
+    )
+
+
+def _apply_me_coo(
+    sparse_tensor: torch.Tensor,
+    group_indices: torch.Tensor,
+    sparse_data: _SparseHierarchyData,
+) -> torch.Tensor:
+    """
+    Apply mutual exclusion to sparse COO tensor.
+
+    For each ME group with multiple active siblings in the same batch,
+    randomly selects one winner and removes the rest.
+    """
+    if sparse_tensor._nnz() == 0 or group_indices.numel() == 0:
+        return sparse_tensor
+
+    sparse_tensor = sparse_tensor.coalesce()
+    indices = sparse_tensor.indices()  # [2, nnz]
+    values = sparse_tensor.values()  # [nnz]
+    batch_indices = indices[0]
+    feat_indices = indices[1]
+
+    _, num_features = sparse_tensor.shape
+    device = sparse_tensor.device
+    nnz = indices.shape[1]
+
+    # Use precomputed feat_to_me_group mapping
+    assert sparse_data.feat_to_me_group is not None
+    hierarchy_num_features = sparse_data.feat_to_me_group.numel()
+
+    # Handle features outside the hierarchy (they are not in any ME group)
+    in_hierarchy = feat_indices < hierarchy_num_features
+    me_group_of_feat = torch.full((nnz,), -1, dtype=torch.long, device=device)
+    me_group_of_feat[in_hierarchy] = sparse_data.feat_to_me_group[
+        feat_indices[in_hierarchy]
+    ]
+
+    # Find entries that belong to ME groups we're processing (vectorized)
+    in_relevant_group = torch.isin(me_group_of_feat, group_indices)
+
+    if not in_relevant_group.any():
+        return sparse_tensor
+
+    # Get the ME entries
+    me_entry_indices = torch.where(in_relevant_group)[0]
+    me_batch = batch_indices[in_relevant_group]
+    me_group = me_group_of_feat[in_relevant_group]
+
+    # Check parent activity for ME groups (only apply ME if parent is active)
+    me_group_parents = sparse_data.me_group_parents[me_group]
+    has_parent = me_group_parents >= 0
+
+    if has_parent.any():
+        # Build active pairs for parent lookup
+        active_pairs = batch_indices * num_features + feat_indices
+        active_pairs_sorted, _ = active_pairs.sort()
+
+        parent_pairs = (
+            me_batch[has_parent] * num_features + me_group_parents[has_parent]
+        )
+        search_pos = torch.searchsorted(active_pairs_sorted, parent_pairs)
+        search_pos = search_pos.clamp(max=active_pairs_sorted.numel() - 1)
+        parent_active_for_has_parent = active_pairs_sorted[search_pos] == parent_pairs
+
+        # Build full parent_active mask
+        parent_active = torch.ones(
+            me_entry_indices.numel(), dtype=torch.bool, device=device
+        )
+        parent_active[has_parent] = parent_active_for_has_parent
+
+        # Filter to only ME entries where parent is active
+        valid_me = parent_active
+        me_entry_indices = me_entry_indices[valid_me]
+        me_batch = me_batch[valid_me]
+        me_group = me_group[valid_me]
+
+    if me_entry_indices.numel() == 0:
+        return sparse_tensor
+
+    # Encode (batch, group) pairs
+    num_groups = sparse_data.num_groups
+    batch_group_pairs = me_batch * num_groups + me_group
+
+    # Find unique (batch, group) pairs and count occurrences
+    unique_bg, inverse, counts = torch.unique(
+        batch_group_pairs, return_inverse=True, return_counts=True
+    )
+
+    # Only process pairs with count > 1 (conflicts)
+    has_conflict = counts > 1
+
+    if not has_conflict.any():
+        return sparse_tensor
+
+    # For efficiency, we process all conflicts together
+    # Assign random scores to each ME entry
+    random_scores = torch.rand(me_entry_indices.numel(), device=device)
+
+    # For each (batch, group) pair, we want the entry with highest score to be winner
+    # Use scatter_reduce to find max score per (batch, group)
+    bg_to_dense = torch.zeros(unique_bg.numel(), dtype=torch.long, device=device)
+    bg_to_dense[has_conflict.nonzero(as_tuple=True)[0]] = torch.arange(
+        has_conflict.sum(), device=device
+    )
+
+    # Map each ME entry to its dense conflict index
+    entry_has_conflict = has_conflict[inverse]
+
+    if not entry_has_conflict.any():
+        return sparse_tensor
+
+    conflict_entries_mask = entry_has_conflict
+    conflict_entry_indices = me_entry_indices[conflict_entries_mask]
+    conflict_random_scores = random_scores[conflict_entries_mask]
+    conflict_inverse = inverse[conflict_entries_mask]
+    conflict_dense_idx = bg_to_dense[conflict_inverse]
+
+    # Vectorized winner selection using sorting
+    # Sort entries by (group_idx, -random_score) so highest score comes first per group
+    # Use group * 2 - score to sort by group ascending, then score descending
+    sort_keys = conflict_dense_idx.float() * 2.0 - conflict_random_scores
+    sorted_order = sort_keys.argsort()
+    sorted_dense_idx = conflict_dense_idx[sorted_order]
+
+    # Find first entry of each group in sorted order (these are winners)
+    group_starts = torch.cat(
+        [
+            torch.tensor([True], device=device),
+            sorted_dense_idx[1:] != sorted_dense_idx[:-1],
+        ]
+    )
+
+    # Winners are entries at group starts in sorted order
+    winner_positions_in_sorted = torch.where(group_starts)[0]
+    winner_original_positions = sorted_order[winner_positions_in_sorted]
+
+    # Create winner mask (vectorized)
+    is_winner = torch.zeros(
+        conflict_entry_indices.numel(), dtype=torch.bool, device=device
+    )
+    is_winner[winner_original_positions] = True
+
+    # Build keep mask (vectorized)
+    keep_mask = torch.ones(nnz, dtype=torch.bool, device=device)
+    loser_entry_indices = conflict_entry_indices[~is_winner]
+    keep_mask[loser_entry_indices] = False
+
+    if keep_mask.all():
+        return sparse_tensor
+
+    return torch.sparse_coo_tensor(
+        indices[:, keep_mask],
+        values[keep_mask],
+        sparse_tensor.shape,
+        device=device,
+        dtype=sparse_tensor.dtype,
+    )
+
+
 @torch.no_grad()
 def hierarchy_modifier(
     roots: Sequence[HierarchyNode] | HierarchyNode,
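Note: both COO helpers above test "is (batch, parent) active?" by encoding each pair as a single integer key, batch * num_features + feature, sorting the keys once, and probing with torch.searchsorted; the clamp keeps the probe index in range when a query is larger than every key. The same idea in isolation:

import torch

num_features = 10
batch_idx = torch.tensor([0, 0, 1])
feat_idx = torch.tensor([2, 7, 2])
active = (batch_idx * num_features + feat_idx).sort().values  # sorted keys

# Is feature 2 active in batch 1? Is feature 7 active in batch 1?
queries = torch.tensor([1 * num_features + 2, 1 * num_features + 7])
pos = torch.searchsorted(active, queries).clamp(max=active.numel() - 1)
print(active[pos] == queries)  # tensor([ True, False])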
@@ -475,12 +775,24 @@ def hierarchy_modifier(
             me_group_sizes=sparse_data.me_group_sizes.to(device),
             me_group_parents=sparse_data.me_group_parents.to(device),
             num_groups=sparse_data.num_groups,
+            feat_to_parent=(
+                sparse_data.feat_to_parent.to(device)
+                if sparse_data.feat_to_parent is not None
+                else None
+            ),
+            feat_to_me_group=(
+                sparse_data.feat_to_me_group.to(device)
+                if sparse_data.feat_to_me_group is not None
+                else None
+            ),
         )
         return device_cache[device]
 
     def modifier(activations: torch.Tensor) -> torch.Tensor:
        device = activations.device
         cached = _get_sparse_for_device(device)
+        if activations.is_sparse:
+            return _apply_hierarchy_sparse_coo(activations, cached)
         return _apply_hierarchy_sparse(activations, cached)
 
     return modifier
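Note: the modifier closure now dispatches on activations.is_sparse, so a single hierarchy_modifier serves both layouts. The same shape in miniature (a generic sketch, not the sae-lens constraint logic):

import torch

def make_modifier(zero_feature: int):
    def modifier(activations: torch.Tensor) -> torch.Tensor:
        if activations.is_sparse:
            acts = activations.coalesce()
            keep = acts.indices()[1] != zero_feature  # drop one feature column
            return torch.sparse_coo_tensor(
                acts.indices()[:, keep], acts.values()[keep], acts.shape
            )
        activations[:, zero_feature] = 0
        return activations
    return modifier

m = make_modifier(1)
dense, sparse = torch.ones(2, 3), torch.ones(2, 3).to_sparse()
assert torch.equal(m(dense), m(sparse).to_dense())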
sae_lens/synthetic/training.py
@@ -23,6 +23,8 @@ def train_toy_sae(
     device: str | torch.device = "cpu",
     n_snapshots: int = 0,
     snapshot_fn: Callable[[SAETrainer[Any, Any]], None] | None = None,
+    autocast_sae: bool = False,
+    autocast_data: bool = False,
 ) -> None:
     """
     Train an SAE on synthetic activations from a feature dictionary.
@@ -46,6 +48,8 @@ def train_toy_sae(
         snapshot_fn: Callback function called at each snapshot point. Receives
             the SAETrainer instance, allowing access to the SAE, training step,
             and other training state. Required if n_snapshots > 0.
+        autocast_sae: Whether to autocast the SAE to bfloat16. Only recommend for large SAEs on CUDA
+        autocast_data: Whether to autocast the activations generator and feature dictionary to bfloat16. Only recommend for large data on CUDA.
     """
 
     device_str = str(device) if isinstance(device, torch.device) else device
@@ -55,6 +59,7 @@ def train_toy_sae(
         feature_dict=feature_dict,
         activations_generator=activations_generator,
         batch_size=batch_size,
+        autocast=autocast_data,
     )
 
     # Create trainer config
@@ -64,7 +69,7 @@
         save_final_checkpoint=False,
         total_training_samples=training_samples,
         device=device_str,
-        autocast=
+        autocast=autocast_sae,
         lr=lr,
         lr_end=lr,
         lr_scheduler_name="constant",
@@ -119,6 +124,7 @@ class SyntheticActivationIterator(Iterator[torch.Tensor]):
     feature_dict: FeatureDictionary,
         activations_generator: ActivationGenerator,
         batch_size: int,
+        autocast: bool = False,
     ):
         """
         Create a new SyntheticActivationIterator.
@@ -127,16 +133,23 @@ class SyntheticActivationIterator(Iterator[torch.Tensor]):
             feature_dict: The feature dictionary to use for generating hidden activations
             activations_generator: Generator that produces feature activations
             batch_size: Number of samples per batch
+            autocast: Whether to autocast the activations generator and feature dictionary to bfloat16.
         """
         self.feature_dict = feature_dict
         self.activations_generator = activations_generator
         self.batch_size = batch_size
+        self.autocast = autocast
 
     @torch.no_grad()
     def next_batch(self) -> torch.Tensor:
         """Generate the next batch of hidden activations."""
-
-
+        with torch.autocast(
+            device_type=self.feature_dict.feature_vectors.device.type,
+            dtype=torch.bfloat16,
+            enabled=self.autocast,
+        ):
+            features = self.activations_generator(self.batch_size)
+            return self.feature_dict(features)
 
     def __iter__(self) -> "SyntheticActivationIterator":
         return self
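Note: both new flags funnel into torch.autocast(..., enabled=flag), which lets one code path serve both precisions; with enabled=False the context manager is a no-op. A minimal demonstration of the enabled switch:

import torch

x = torch.randn(8, 8)
with torch.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=True):
    on = (x @ x).dtype   # torch.bfloat16 under autocast
with torch.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=False):
    off = (x @ x).dtype  # torch.float32 when disabled
print(on, off)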