sae-lens 6.12.1__py3-none-any.whl → 6.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,35 @@
+temporal-sae-gemma-2-2b:
+  conversion_func: temporal
+  model: gemma-2-2b
+  repo_id: canrager/temporalSAEs
+  config_overrides:
+    model_name: gemma-2-2b
+    hook_name: blocks.12.hook_resid_post
+    dataset_path: monology/pile-uncopyrighted
+  saes:
+    - id: blocks.12.hook_resid_post
+      l0: 192
+      norm_scaling_factor: 0.00666666667
+      path: gemma-2-2B/layer_12/temporal
+      neuronpedia: gemma-2-2b/12-temporal-res
+temporal-sae-llama-3.1-8b:
+  conversion_func: temporal
+  model: meta-llama/Llama-3.1-8B
+  repo_id: canrager/temporalSAEs
+  config_overrides:
+    model_name: meta-llama/Llama-3.1-8B
+    dataset_path: monology/pile-uncopyrighted
+  saes:
+    - id: blocks.15.hook_resid_post
+      l0: 256
+      norm_scaling_factor: 0.029
+      path: llama-3.1-8B/layer_15/temporal
+      neuronpedia: llama3.1-8b/15-temporal-res
+    - id: blocks.26.hook_resid_post
+      l0: 256
+      norm_scaling_factor: 0.029
+      path: llama-3.1-8B/layer_26/temporal
+      neuronpedia: llama3.1-8b/26-temporal-res
 deepseek-r1-distill-llama-8b-qresearch:
   conversion_func: deepseek_r1
   model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
@@ -14882,4 +14914,46 @@ qwen2.5-7b-instruct-andyrdt:
       neuronpedia: qwen2.5-7b-it/23-resid-post-aa
     - id: resid_post_layer_27_trainer_1
       path: resid_post_layer_27/trainer_1
-      neuronpedia: qwen2.5-7b-it/27-resid-post-aa
+      neuronpedia: qwen2.5-7b-it/27-resid-post-aa
+
+gpt-oss-20b-andyrdt:
+  conversion_func: dictionary_learning_1
+  model: openai/gpt-oss-20b
+  repo_id: andyrdt/saes-gpt-oss-20b
+  saes:
+    - id: resid_post_layer_3_trainer_0
+      path: resid_post_layer_3/trainer_0
+      neuronpedia: gpt-oss-20b/3-resid-post-aa
+    - id: resid_post_layer_7_trainer_0
+      path: resid_post_layer_7/trainer_0
+      neuronpedia: gpt-oss-20b/7-resid-post-aa
+    - id: resid_post_layer_11_trainer_0
+      path: resid_post_layer_11/trainer_0
+      neuronpedia: gpt-oss-20b/11-resid-post-aa
+    - id: resid_post_layer_15_trainer_0
+      path: resid_post_layer_15/trainer_0
+      neuronpedia: gpt-oss-20b/15-resid-post-aa
+    - id: resid_post_layer_19_trainer_0
+      path: resid_post_layer_19/trainer_0
+      neuronpedia: gpt-oss-20b/19-resid-post-aa
+    - id: resid_post_layer_23_trainer_0
+      path: resid_post_layer_23/trainer_0
+      neuronpedia: gpt-oss-20b/23-resid-post-aa
+
+goodfire-llama-3.3-70b-instruct:
+  conversion_func: goodfire
+  model: meta-llama/Llama-3.3-70B-Instruct
+  repo_id: Goodfire/Llama-3.3-70B-Instruct-SAE-l50
+  saes:
+    - id: layer_50
+      path: Llama-3.3-70B-Instruct-SAE-l50.pt
+      l0: 121
+
+goodfire-llama-3.1-8b-instruct:
+  conversion_func: goodfire
+  model: meta-llama/Llama-3.1-8B-Instruct
+  repo_id: Goodfire/Llama-3.1-8B-Instruct-SAE-l19
+  saes:
+    - id: layer_19
+      path: Llama-3.1-8B-Instruct-SAE-l19.pth
+      l0: 91
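
These registry entries make the new SAEs loadable by release name through the standard loading path. A minimal sketch, assuming the usual `SAE.from_pretrained(release, sae_id)` interface (its signature appears later in this diff); the release and id strings come from the YAML entries above:

```python
# Minimal sketch: load one of the newly registered SAEs by release name and SAE id.
# The strings below are taken from the registry entries added in this version.
from sae_lens import SAE

sae = SAE.from_pretrained(
    release="temporal-sae-gemma-2-2b",   # new TemporalSAE registry entry
    sae_id="blocks.12.hook_resid_post",  # id listed under `saes:` for that entry
    device="cpu",
)
```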
sae_lens/saes/__init__.py CHANGED
@@ -14,6 +14,10 @@ from .jumprelu_sae import (
     JumpReLUTrainingSAE,
     JumpReLUTrainingSAEConfig,
 )
+from .matryoshka_batchtopk_sae import (
+    MatryoshkaBatchTopKTrainingSAE,
+    MatryoshkaBatchTopKTrainingSAEConfig,
+)
 from .sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
 from .standard_sae import (
     StandardSAE,
@@ -21,6 +25,7 @@ from .standard_sae import (
     StandardTrainingSAE,
     StandardTrainingSAEConfig,
 )
+from .temporal_sae import TemporalSAE, TemporalSAEConfig
 from .topk_sae import (
     TopKSAE,
     TopKSAEConfig,
@@ -65,4 +70,8 @@ __all__ = [
     "SkipTranscoderConfig",
     "JumpReLUTranscoder",
     "JumpReLUTranscoderConfig",
+    "MatryoshkaBatchTopKTrainingSAE",
+    "MatryoshkaBatchTopKTrainingSAEConfig",
+    "TemporalSAE",
+    "TemporalSAEConfig",
 ]
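
With these exports in place, the new classes are importable directly from the `sae_lens.saes` subpackage; a short sketch of the resulting public surface:

```python
# The new SAE variants added in this release, as exported from sae_lens.saes.
from sae_lens.saes import (
    MatryoshkaBatchTopKTrainingSAE,
    MatryoshkaBatchTopKTrainingSAEConfig,
    TemporalSAE,
    TemporalSAEConfig,
)
```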
sae_lens/saes/batchtopk_sae.py CHANGED
@@ -23,7 +23,9 @@ class BatchTopK(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         acts = x.relu()
         flat_acts = acts.flatten()
-        acts_topk_flat = torch.topk(flat_acts, int(self.k * acts.shape[0]), dim=-1)
+        # Calculate total number of samples across all non-feature dimensions
+        num_samples = acts.shape[:-1].numel()
+        acts_topk_flat = torch.topk(flat_acts, int(self.k * num_samples), dim=-1)
         return (
             torch.zeros_like(flat_acts)
             .scatter(-1, acts_topk_flat.indices, acts_topk_flat.values)
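
The fix above matters when the activations have more than one leading dimension. `acts.shape[0]` counts only the batch dimension, so for a `(batch, seq, d_sae)` input the batch-level top-k budget was far too small; `acts.shape[:-1].numel()` counts every non-feature position. An illustrative sketch (shapes are hypothetical):

```python
# Illustration of the BatchTopK budget fix with hypothetical shapes.
import torch

acts = torch.relu(torch.randn(8, 128, 4096))  # (batch=8, seq=128, d_sae=4096)
k = 64

num_samples_old = acts.shape[0]            # 8: ignores the sequence dimension
num_samples_new = acts.shape[:-1].numel()  # 8 * 128 = 1024 positions competing
# The corrected call keeps k active features per position on average:
topk = torch.topk(acts.flatten(), int(k * num_samples_new), dim=-1)
print(num_samples_old, num_samples_new, topk.values.shape)  # 8 1024 torch.Size([65536])
```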
@@ -35,6 +37,35 @@ class BatchTopK(nn.Module):
 class BatchTopKTrainingSAEConfig(TopKTrainingSAEConfig):
     """
     Configuration class for training a BatchTopKTrainingSAE.
+
+    BatchTopK SAEs maintain k active features on average across the entire batch,
+    rather than enforcing k features per sample like standard TopK SAEs. During training,
+    the SAE learns a global threshold that is updated based on the minimum positive
+    activation value. After training, BatchTopK SAEs are saved as JumpReLU SAEs.
+
+    Args:
+        k (float): Average number of features to keep active across the batch. Unlike
+            standard TopK SAEs where k is an integer per sample, this is a float
+            representing the average number of active features across all samples in
+            the batch. Defaults to 100.
+        topk_threshold_lr (float): Learning rate for updating the global topk threshold.
+            The threshold is updated using an exponential moving average of the minimum
+            positive activation value. Defaults to 0.01.
+        aux_loss_coefficient (float): Coefficient for the auxiliary loss that encourages
+            dead neurons to learn useful features. Inherited from TopKTrainingSAEConfig.
+            Defaults to 1.0.
+        rescale_acts_by_decoder_norm (bool): Treat the decoder as if it was already normalized.
+            Inherited from TopKTrainingSAEConfig. Defaults to True.
+        decoder_init_norm (float | None): Norm to initialize decoder weights to.
+            Inherited from TrainingSAEConfig. Defaults to 0.1.
+        d_in (int): Input dimension (dimensionality of the activations being encoded).
+            Inherited from SAEConfig.
+        d_sae (int): SAE latent dimension (number of features in the SAE).
+            Inherited from SAEConfig.
+        dtype (str): Data type for the SAE parameters. Inherited from SAEConfig.
+            Defaults to "float32".
+        device (str): Device to place the SAE on. Inherited from SAEConfig.
+            Defaults to "cpu".
     """
 
     k: float = 100  # type: ignore[assignment]
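
A minimal configuration sketch based on the fields documented above (values are illustrative, and keyword construction assumes the usual dataclass behaviour of these configs):

```python
# Illustrative BatchTopK training config using the documented fields.
from sae_lens.saes.batchtopk_sae import BatchTopKTrainingSAEConfig

cfg = BatchTopKTrainingSAEConfig(
    d_in=768,                # dimensionality of the hooked activations
    d_sae=16_384,            # number of SAE latents
    k=100.0,                 # average active features per sample across the batch
    topk_threshold_lr=0.01,  # EMA learning rate for the global threshold
)
```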
sae_lens/saes/matryoshka_batchtopk_sae.py ADDED
@@ -0,0 +1,137 @@
+import warnings
+from dataclasses import dataclass, field
+
+import torch
+from jaxtyping import Float
+from typing_extensions import override
+
+from sae_lens.saes.batchtopk_sae import (
+    BatchTopKTrainingSAE,
+    BatchTopKTrainingSAEConfig,
+)
+from sae_lens.saes.sae import TrainStepInput, TrainStepOutput
+from sae_lens.saes.topk_sae import _sparse_matmul_nd
+
+
+@dataclass
+class MatryoshkaBatchTopKTrainingSAEConfig(BatchTopKTrainingSAEConfig):
+    """
+    Configuration class for training a MatryoshkaBatchTopKTrainingSAE.
+
+    [Matryoshka SAEs](https://arxiv.org/pdf/2503.17547) use a series of nested reconstruction
+    losses of different widths during training to avoid feature absorption. This also has a
+    nice side-effect of encouraging higher-frequency features to be learned in earlier levels.
+    However, this SAE has more hyperparameters to tune than standard BatchTopK SAEs, and takes
+    longer to train due to requiring multiple forward passes per training step.
+
+    After training, MatryoshkaBatchTopK SAEs are saved as JumpReLU SAEs.
+
+    Args:
+        matryoshka_widths (list[int]): The widths of the matryoshka levels. Defaults to an empty list.
+        k (float): The number of features to keep active. Inherited from BatchTopKTrainingSAEConfig.
+            Defaults to 100.
+        topk_threshold_lr (float): Learning rate for updating the global topk threshold.
+            The threshold is updated using an exponential moving average of the minimum
+            positive activation value. Defaults to 0.01.
+        aux_loss_coefficient (float): Coefficient for the auxiliary loss that encourages
+            dead neurons to learn useful features. Inherited from TopKTrainingSAEConfig.
+            Defaults to 1.0.
+        rescale_acts_by_decoder_norm (bool): Treat the decoder as if it was already normalized.
+            Inherited from TopKTrainingSAEConfig. Defaults to True.
+        decoder_init_norm (float | None): Norm to initialize decoder weights to.
+            Inherited from TrainingSAEConfig. Defaults to 0.1.
+        d_in (int): Input dimension (dimensionality of the activations being encoded).
+            Inherited from SAEConfig.
+        d_sae (int): SAE latent dimension (number of features in the SAE).
+            Inherited from SAEConfig.
+        dtype (str): Data type for the SAE parameters. Inherited from SAEConfig.
+            Defaults to "float32".
+        device (str): Device to place the SAE on. Inherited from SAEConfig.
+            Defaults to "cpu".
+    """
+
+    matryoshka_widths: list[int] = field(default_factory=list)
+
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "matryoshka_batchtopk"
+
+
+class MatryoshkaBatchTopKTrainingSAE(BatchTopKTrainingSAE):
+    """
+    Global Batch TopK Training SAE
+
+    This SAE will maintain the k on average across the batch, rather than enforcing the k per-sample as in standard TopK.
+
+    BatchTopK SAEs are saved as JumpReLU SAEs after training.
+    """
+
+    cfg: MatryoshkaBatchTopKTrainingSAEConfig  # type: ignore[assignment]
+
+    def __init__(
+        self, cfg: MatryoshkaBatchTopKTrainingSAEConfig, use_error_term: bool = False
+    ):
+        super().__init__(cfg, use_error_term)
+        _validate_matryoshka_config(cfg)
+
+    @override
+    def training_forward_pass(self, step_input: TrainStepInput) -> TrainStepOutput:
+        base_output = super().training_forward_pass(step_input)
+        inv_W_dec_norm = 1 / self.W_dec.norm(dim=-1)
+        # the outer matryoshka level is the base SAE, so we don't need to add an extra loss for it
+        for width in self.cfg.matryoshka_widths[:-1]:
+            inner_reconstruction = self._decode_matryoshka_level(
+                base_output.feature_acts, width, inv_W_dec_norm
+            )
+            inner_mse_loss = (
+                self.mse_loss_fn(inner_reconstruction, step_input.sae_in)
+                .sum(dim=-1)
+                .mean()
+            )
+            base_output.losses[f"inner_mse_loss_{width}"] = inner_mse_loss
+            base_output.loss = base_output.loss + inner_mse_loss
+        return base_output
+
+    def _decode_matryoshka_level(
+        self,
+        feature_acts: Float[torch.Tensor, "... d_sae"],
+        width: int,
+        inv_W_dec_norm: torch.Tensor,
+    ) -> Float[torch.Tensor, "... d_in"]:
+        """
+        Decodes feature activations back into input space for a matryoshka level
+        """
+        inner_feature_acts = feature_acts[:, :width]
+        # Handle sparse tensors using efficient sparse matrix multiplication
+        if self.cfg.rescale_acts_by_decoder_norm:
+            # need to multiply by the inverse of the norm because division is illegal with sparse tensors
+            inner_feature_acts = inner_feature_acts * inv_W_dec_norm[:width]
+        if inner_feature_acts.is_sparse:
+            sae_out_pre = (
+                _sparse_matmul_nd(inner_feature_acts, self.W_dec[:width]) + self.b_dec
+            )
+        else:
+            sae_out_pre = inner_feature_acts @ self.W_dec[:width] + self.b_dec
+        sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
+        return self.reshape_fn_out(sae_out_pre, self.d_head)
+
+
+def _validate_matryoshka_config(cfg: MatryoshkaBatchTopKTrainingSAEConfig) -> None:
+    if cfg.matryoshka_widths[-1] != cfg.d_sae:
+        # warn the users that we will add a final matryoshka level
+        warnings.warn(
+            "WARNING: The final matryoshka level width is not set to cfg.d_sae. "
+            "A final matryoshka level of width=cfg.d_sae will be added."
+        )
+        cfg.matryoshka_widths.append(cfg.d_sae)
+
+    for prev_width, curr_width in zip(
+        cfg.matryoshka_widths[:-1], cfg.matryoshka_widths[1:]
+    ):
+        if prev_width >= curr_width:
+            raise ValueError("cfg.matryoshka_widths must be strictly increasing.")
+    if len(cfg.matryoshka_widths) == 1:
+        warnings.warn(
+            "WARNING: You have only set one matryoshka level. This is equivalent to using a standard BatchTopK SAE and is likely not what you want."
+        )
sae_lens/saes/sae.py CHANGED
@@ -14,7 +14,6 @@ from typing import (
     Generic,
     Literal,
     NamedTuple,
-    Type,
     TypeVar,
 )
 
@@ -22,7 +21,7 @@ import einops
 import torch
 from jaxtyping import Float
 from numpy.typing import NDArray
-from safetensors.torch import save_file
+from safetensors.torch import load_file, save_file
 from torch import nn
 from transformer_lens.hook_points import HookedRootModule, HookPoint
 from typing_extensions import deprecated, overload, override
@@ -156,9 +155,9 @@ class SAEConfig(ABC):
     dtype: str = "float32"
     device: str = "cpu"
     apply_b_dec_to_input: bool = True
-    normalize_activations: Literal[
-        "none", "expected_average_only_in", "constant_norm_rescale", "layer_norm"
-    ] = "none"  # none, expected_average_only_in (Anthropic April Update), constant_norm_rescale (Anthropic Feb Update)
+    normalize_activations: Literal["none", "expected_average_only_in", "layer_norm"] = (
+        "none"  # none, expected_average_only_in (Anthropic April Update)
+    )
     reshape_activations: Literal["none", "hook_z"] = "none"
     metadata: SAEMetadata = field(default_factory=SAEMetadata)
 
@@ -218,6 +217,7 @@ class TrainStepInput:
     sae_in: torch.Tensor
     coefficients: dict[str, float]
     dead_neuron_mask: torch.Tensor | None
+    n_training_steps: int
 
 
 class TrainCoefficientConfig(NamedTuple):
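
`TrainStepInput` now also carries the current training step, so a training forward pass can see where it is in the run. A hedged construction sketch (it assumes the container accepts its documented fields as keyword arguments; the tensor and coefficient values are illustrative):

```python
# Illustrative TrainStepInput with the new n_training_steps field.
import torch
from sae_lens.saes.sae import TrainStepInput

step_input = TrainStepInput(
    sae_in=torch.randn(32, 768),  # hypothetical batch of activations
    coefficients={"l1": 1e-3},    # illustrative coefficient name and value
    dead_neuron_mask=None,
    n_training_steps=1_000,       # new field added in this version
)
```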
@@ -245,7 +245,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
 
         self.cfg = cfg
 
-        if cfg.metadata and cfg.metadata:
+        if cfg.metadata and cfg.metadata.model_from_pretrained_kwargs:
             warnings.warn(
                 "\nThis SAE has non-empty model_from_pretrained_kwargs. "
                 "\nFor optimal performance, load the model like so:\n"
@@ -309,6 +309,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
 
             self.run_time_activation_norm_fn_in = run_time_activation_norm_fn_in
             self.run_time_activation_norm_fn_out = run_time_activation_norm_fn_out
+
         elif self.cfg.normalize_activations == "layer_norm":
             # we need to scale the norm of the input and store the scaling factor
             def run_time_activation_ln_in(
@@ -452,23 +453,14 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
     def process_sae_in(
         self, sae_in: Float[torch.Tensor, "... d_in"]
     ) -> Float[torch.Tensor, "... d_in"]:
-        # print(f"Input shape to process_sae_in: {sae_in.shape}")
-        # print(f"self.cfg.hook_name: {self.cfg.hook_name}")
-        # print(f"self.b_dec shape: {self.b_dec.shape}")
-        # print(f"Hook z reshaping mode: {getattr(self, 'hook_z_reshaping_mode', False)}")
-
         sae_in = sae_in.to(self.dtype)
-
-        # print(f"Shape before reshape_fn_in: {sae_in.shape}")
         sae_in = self.reshape_fn_in(sae_in)
-        # print(f"Shape after reshape_fn_in: {sae_in.shape}")
 
         sae_in = self.hook_sae_input(sae_in)
         sae_in = self.run_time_activation_norm_fn_in(sae_in)
 
         # Here's where the error happens
         bias_term = self.b_dec * self.cfg.apply_b_dec_to_input
-        # print(f"Bias term shape: {bias_term.shape}")
 
         return sae_in - bias_term
 
@@ -534,7 +526,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
     @classmethod
     @deprecated("Use load_from_disk instead")
     def load_from_pretrained(
-        cls: Type[T_SAE],
+        cls: type[T_SAE],
         path: str | Path,
         device: str = "cpu",
         dtype: str | None = None,
@@ -543,7 +535,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
 
     @classmethod
     def load_from_disk(
-        cls: Type[T_SAE],
+        cls: type[T_SAE],
         path: str | Path,
         device: str = "cpu",
         dtype: str | None = None,
@@ -564,7 +556,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
 
     @classmethod
     def from_pretrained(
-        cls: Type[T_SAE],
+        cls: type[T_SAE],
         release: str,
         sae_id: str,
         device: str = "cpu",
@@ -585,7 +577,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
 
     @classmethod
     def from_pretrained_with_cfg_and_sparsity(
-        cls: Type[T_SAE],
+        cls: type[T_SAE],
         release: str,
         sae_id: str,
         device: str = "cpu",
@@ -684,7 +676,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         return sae, cfg_dict, log_sparsities
 
     @classmethod
-    def from_dict(cls: Type[T_SAE], config_dict: dict[str, Any]) -> T_SAE:
+    def from_dict(cls: type[T_SAE], config_dict: dict[str, Any]) -> T_SAE:
         """Create an SAE from a config dictionary."""
         sae_cls = cls.get_sae_class_for_architecture(config_dict["architecture"])
         sae_config_cls = cls.get_sae_config_class_for_architecture(
@@ -694,8 +686,8 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
 
     @classmethod
     def get_sae_class_for_architecture(
-        cls: Type[T_SAE], architecture: str
-    ) -> Type[T_SAE]:
+        cls: type[T_SAE], architecture: str
+    ) -> type[T_SAE]:
         """Get the SAE class for a given architecture."""
         sae_cls, _ = get_sae_class(architecture)
         if not issubclass(sae_cls, cls):
@@ -1000,8 +992,8 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
 
     @classmethod
     def get_sae_class_for_architecture(
-        cls: Type[T_TRAINING_SAE], architecture: str
-    ) -> Type[T_TRAINING_SAE]:
+        cls: type[T_TRAINING_SAE], architecture: str
+    ) -> type[T_TRAINING_SAE]:
         """Get the SAE class for a given architecture."""
         sae_cls, _ = get_sae_training_class(architecture)
         if not issubclass(sae_cls, cls):
@@ -1018,6 +1010,12 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
     ) -> type[TrainingSAEConfig]:
         return get_sae_training_class(architecture)[1]
 
+    def load_weights_from_checkpoint(self, checkpoint_path: Path | str) -> None:
+        checkpoint_path = Path(checkpoint_path)
+        state_dict = load_file(checkpoint_path / SAE_WEIGHTS_FILENAME)
+        self.process_state_dict_for_loading(state_dict)
+        self.load_state_dict(state_dict)
+
 
 _blank_hook = nn.Identity()
 
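
The new `TrainingSAE.load_weights_from_checkpoint` helper reads the saved safetensors weights from a checkpoint directory and loads them into an existing training SAE. A short usage sketch (`training_sae` and the checkpoint path are hypothetical; the directory is expected to contain the file named by `SAE_WEIGHTS_FILENAME`):

```python
# Resume weights from a previously written checkpoint directory (sketch).
training_sae.load_weights_from_checkpoint("checkpoints/step_5000")
```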