sae-lens 6.14.1__tar.gz → 6.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {sae_lens-6.14.1 → sae_lens-6.15.0}/PKG-INFO +1 -1
  2. {sae_lens-6.14.1 → sae_lens-6.15.0}/pyproject.toml +1 -1
  3. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/__init__.py +10 -1
  4. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/evals.py +18 -14
  5. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/llm_sae_training_runner.py +8 -14
  6. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/saes/__init__.py +6 -0
  7. sae_lens-6.15.0/sae_lens/saes/matryoshka_batchtopk_sae.py +143 -0
  8. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/training/activations_store.py +5 -27
  9. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/util.py +27 -0
  10. {sae_lens-6.14.1 → sae_lens-6.15.0}/LICENSE +0 -0
  11. {sae_lens-6.14.1 → sae_lens-6.15.0}/README.md +0 -0
  12. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/analysis/__init__.py +0 -0
  13. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/analysis/hooked_sae_transformer.py +0 -0
  14. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/analysis/neuronpedia_integration.py +0 -0
  15. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/cache_activations_runner.py +0 -0
  16. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/config.py +0 -0
  17. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/constants.py +0 -0
  18. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/load_model.py +0 -0
  19. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/loading/__init__.py +0 -0
  20. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/loading/pretrained_sae_loaders.py +0 -0
  21. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/loading/pretrained_saes_directory.py +0 -0
  22. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/pretokenize_runner.py +0 -0
  23. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/pretrained_saes.yaml +0 -0
  24. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/registry.py +0 -0
  25. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/saes/batchtopk_sae.py +0 -0
  26. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/saes/gated_sae.py +0 -0
  27. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/saes/jumprelu_sae.py +0 -0
  28. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/saes/sae.py +0 -0
  29. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/saes/standard_sae.py +0 -0
  30. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/saes/topk_sae.py +0 -0
  31. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/saes/transcoder.py +0 -0
  32. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/tokenization_and_batching.py +0 -0
  33. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/training/__init__.py +0 -0
  34. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/training/activation_scaler.py +0 -0
  35. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/training/mixing_buffer.py +0 -0
  36. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/training/optim.py +0 -0
  37. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/training/sae_trainer.py +0 -0
  38. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/training/types.py +0 -0
  39. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/training/upload_saes_to_huggingface.py +0 -0
  40. {sae_lens-6.14.1 → sae_lens-6.15.0}/sae_lens/tutorial/tsea.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sae-lens
3
- Version: 6.14.1
3
+ Version: 6.15.0
4
4
  Summary: Training and Analyzing Sparse Autoencoders (SAEs)
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sae-lens"
3
- version = "6.14.1"
3
+ version = "6.15.0"
4
4
  description = "Training and Analyzing Sparse Autoencoders (SAEs)"
5
5
  authors = ["Joseph Bloom"]
6
6
  readme = "README.md"
@@ -1,5 +1,5 @@
1
1
  # ruff: noqa: E402
2
- __version__ = "6.14.1"
2
+ __version__ = "6.15.0"
3
3
 
4
4
  import logging
5
5
 
@@ -19,6 +19,8 @@ from sae_lens.saes import (
19
19
  JumpReLUTrainingSAEConfig,
20
20
  JumpReLUTranscoder,
21
21
  JumpReLUTranscoderConfig,
22
+ MatryoshkaBatchTopKTrainingSAE,
23
+ MatryoshkaBatchTopKTrainingSAEConfig,
22
24
  SAEConfig,
23
25
  SkipTranscoder,
24
26
  SkipTranscoderConfig,
@@ -101,6 +103,8 @@ __all__ = [
101
103
  "SkipTranscoderConfig",
102
104
  "JumpReLUTranscoder",
103
105
  "JumpReLUTranscoderConfig",
106
+ "MatryoshkaBatchTopKTrainingSAE",
107
+ "MatryoshkaBatchTopKTrainingSAEConfig",
104
108
  ]
105
109
 
106
110
 
@@ -115,6 +119,11 @@ register_sae_training_class("jumprelu", JumpReLUTrainingSAE, JumpReLUTrainingSAE
115
119
  register_sae_training_class(
116
120
  "batchtopk", BatchTopKTrainingSAE, BatchTopKTrainingSAEConfig
117
121
  )
122
+ register_sae_training_class(
123
+ "matryoshka_batchtopk",
124
+ MatryoshkaBatchTopKTrainingSAE,
125
+ MatryoshkaBatchTopKTrainingSAEConfig,
126
+ )
118
127
  register_sae_class("transcoder", Transcoder, TranscoderConfig)
119
128
  register_sae_class("skip_transcoder", SkipTranscoder, SkipTranscoderConfig)
120
129
  register_sae_class("jumprelu_transcoder", JumpReLUTranscoder, JumpReLUTranscoderConfig)
@@ -11,7 +11,7 @@ from dataclasses import dataclass, field
11
11
  from functools import partial
12
12
  from importlib.metadata import PackageNotFoundError, version
13
13
  from pathlib import Path
14
- from typing import Any
14
+ from typing import Any, Iterable
15
15
 
16
16
  import einops
17
17
  import pandas as pd
@@ -24,7 +24,10 @@ from sae_lens.loading.pretrained_saes_directory import get_pretrained_saes_direc
24
24
  from sae_lens.saes.sae import SAE, SAEConfig
25
25
  from sae_lens.training.activation_scaler import ActivationScaler
26
26
  from sae_lens.training.activations_store import ActivationsStore
27
- from sae_lens.util import extract_stop_at_layer_from_tlens_hook_name
27
+ from sae_lens.util import (
28
+ extract_stop_at_layer_from_tlens_hook_name,
29
+ get_special_token_ids,
30
+ )
28
31
 
29
32
 
30
33
  def get_library_version() -> str:
@@ -109,9 +112,15 @@ def run_evals(
109
112
  activation_scaler: ActivationScaler,
110
113
  eval_config: EvalConfig = EvalConfig(),
111
114
  model_kwargs: Mapping[str, Any] = {},
112
- ignore_tokens: set[int | None] = set(),
115
+ exclude_special_tokens: Iterable[int] | bool = True,
113
116
  verbose: bool = False,
114
117
  ) -> tuple[dict[str, Any], dict[str, Any]]:
118
+ ignore_tokens = None
119
+ if exclude_special_tokens is True:
120
+ ignore_tokens = list(get_special_token_ids(model.tokenizer)) # type: ignore
121
+ elif exclude_special_tokens:
122
+ ignore_tokens = list(exclude_special_tokens)
123
+
115
124
  hook_name = sae.cfg.metadata.hook_name
116
125
  actual_batch_size = (
117
126
  eval_config.batch_size_prompts or activation_store.store_batch_size_prompts
@@ -312,7 +321,7 @@ def get_downstream_reconstruction_metrics(
312
321
  compute_ce_loss: bool,
313
322
  n_batches: int,
314
323
  eval_batch_size_prompts: int,
315
- ignore_tokens: set[int | None] = set(),
324
+ ignore_tokens: list[int] | None = None,
316
325
  verbose: bool = False,
317
326
  ):
318
327
  metrics_dict = {}
@@ -339,7 +348,7 @@ def get_downstream_reconstruction_metrics(
339
348
  compute_ce_loss=compute_ce_loss,
340
349
  ignore_tokens=ignore_tokens,
341
350
  ).items():
342
- if len(ignore_tokens) > 0:
351
+ if ignore_tokens:
343
352
  mask = torch.logical_not(
344
353
  torch.any(
345
354
  torch.stack(
@@ -384,7 +393,7 @@ def get_sparsity_and_variance_metrics(
384
393
  compute_featurewise_density_statistics: bool,
385
394
  eval_batch_size_prompts: int,
386
395
  model_kwargs: Mapping[str, Any],
387
- ignore_tokens: set[int | None] = set(),
396
+ ignore_tokens: list[int] | None = None,
388
397
  verbose: bool = False,
389
398
  ) -> tuple[dict[str, Any], dict[str, Any]]:
390
399
  hook_name = sae.cfg.metadata.hook_name
@@ -426,7 +435,7 @@ def get_sparsity_and_variance_metrics(
426
435
  for _ in batch_iter:
427
436
  batch_tokens = activation_store.get_batch_tokens(eval_batch_size_prompts)
428
437
 
429
- if len(ignore_tokens) > 0:
438
+ if ignore_tokens:
430
439
  mask = torch.logical_not(
431
440
  torch.any(
432
441
  torch.stack(
@@ -596,7 +605,7 @@ def get_recons_loss(
596
605
  batch_tokens: torch.Tensor,
597
606
  compute_kl: bool,
598
607
  compute_ce_loss: bool,
599
- ignore_tokens: set[int | None] = set(),
608
+ ignore_tokens: list[int] | None = None,
600
609
  model_kwargs: Mapping[str, Any] = {},
601
610
  hook_name: str | None = None,
602
611
  ) -> dict[str, Any]:
@@ -610,7 +619,7 @@ def get_recons_loss(
610
619
  batch_tokens, return_type="both", loss_per_token=True, **model_kwargs
611
620
  )
612
621
 
613
- if len(ignore_tokens) > 0:
622
+ if ignore_tokens:
614
623
  mask = torch.logical_not(
615
624
  torch.any(
616
625
  torch.stack([batch_tokens == token for token in ignore_tokens], dim=0),
@@ -856,11 +865,6 @@ def multiple_evals(
856
865
  activation_scaler=ActivationScaler(),
857
866
  model=current_model,
858
867
  eval_config=eval_config,
859
- ignore_tokens={
860
- current_model.tokenizer.pad_token_id, # type: ignore
861
- current_model.tokenizer.eos_token_id, # type: ignore
862
- current_model.tokenizer.bos_token_id, # type: ignore
863
- },
864
868
  verbose=verbose,
865
869
  )
866
870
  eval_metrics["metrics"] = scalar_metrics
@@ -22,17 +22,13 @@ from sae_lens.constants import (
22
22
  )
23
23
  from sae_lens.evals import EvalConfig, run_evals
24
24
  from sae_lens.load_model import load_model
25
- from sae_lens.saes.batchtopk_sae import BatchTopKTrainingSAEConfig
26
- from sae_lens.saes.gated_sae import GatedTrainingSAEConfig
27
- from sae_lens.saes.jumprelu_sae import JumpReLUTrainingSAEConfig
25
+ from sae_lens.registry import SAE_TRAINING_CLASS_REGISTRY
28
26
  from sae_lens.saes.sae import (
29
27
  T_TRAINING_SAE,
30
28
  T_TRAINING_SAE_CONFIG,
31
29
  TrainingSAE,
32
30
  TrainingSAEConfig,
33
31
  )
34
- from sae_lens.saes.standard_sae import StandardTrainingSAEConfig
35
- from sae_lens.saes.topk_sae import TopKTrainingSAEConfig
36
32
  from sae_lens.training.activation_scaler import ActivationScaler
37
33
  from sae_lens.training.activations_store import ActivationsStore
38
34
  from sae_lens.training.sae_trainer import SAETrainer
@@ -61,9 +57,11 @@ class LLMSaeEvaluator(Generic[T_TRAINING_SAE]):
61
57
  data_provider: DataProvider,
62
58
  activation_scaler: ActivationScaler,
63
59
  ) -> dict[str, Any]:
64
- ignore_tokens = set()
60
+ exclude_special_tokens = False
65
61
  if self.activations_store.exclude_special_tokens is not None:
66
- ignore_tokens = set(self.activations_store.exclude_special_tokens.tolist())
62
+ exclude_special_tokens = (
63
+ self.activations_store.exclude_special_tokens.tolist()
64
+ )
67
65
 
68
66
  eval_config = EvalConfig(
69
67
  batch_size_prompts=self.eval_batch_size_prompts,
@@ -81,7 +79,7 @@ class LLMSaeEvaluator(Generic[T_TRAINING_SAE]):
81
79
  model=self.model,
82
80
  activation_scaler=activation_scaler,
83
81
  eval_config=eval_config,
84
- ignore_tokens=ignore_tokens,
82
+ exclude_special_tokens=exclude_special_tokens,
85
83
  model_kwargs=self.model_kwargs,
86
84
  ) # not calculating featurwise metrics here.
87
85
 
@@ -393,12 +391,8 @@ def _parse_cfg_args(
393
391
  )
394
392
 
395
393
  # Map architecture to concrete config class
396
- sae_config_map = {
397
- "standard": StandardTrainingSAEConfig,
398
- "gated": GatedTrainingSAEConfig,
399
- "jumprelu": JumpReLUTrainingSAEConfig,
400
- "topk": TopKTrainingSAEConfig,
401
- "batchtopk": BatchTopKTrainingSAEConfig,
394
+ sae_config_map: dict[str, type[TrainingSAEConfig]] = {
395
+ name: cfg for name, (_, cfg) in SAE_TRAINING_CLASS_REGISTRY.items()
402
396
  }
403
397
 
404
398
  sae_config_type = sae_config_map[architecture]
@@ -14,6 +14,10 @@ from .jumprelu_sae import (
14
14
  JumpReLUTrainingSAE,
15
15
  JumpReLUTrainingSAEConfig,
16
16
  )
17
+ from .matryoshka_batchtopk_sae import (
18
+ MatryoshkaBatchTopKTrainingSAE,
19
+ MatryoshkaBatchTopKTrainingSAEConfig,
20
+ )
17
21
  from .sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
18
22
  from .standard_sae import (
19
23
  StandardSAE,
@@ -65,4 +69,6 @@ __all__ = [
65
69
  "SkipTranscoderConfig",
66
70
  "JumpReLUTranscoder",
67
71
  "JumpReLUTranscoderConfig",
72
+ "MatryoshkaBatchTopKTrainingSAE",
73
+ "MatryoshkaBatchTopKTrainingSAEConfig",
68
74
  ]
@@ -0,0 +1,143 @@
1
+ import warnings
2
+ from dataclasses import dataclass, field
3
+
4
+ import torch
5
+ from jaxtyping import Float
6
+ from typing_extensions import override
7
+
8
+ from sae_lens.saes.batchtopk_sae import (
9
+ BatchTopKTrainingSAE,
10
+ BatchTopKTrainingSAEConfig,
11
+ )
12
+ from sae_lens.saes.sae import TrainStepInput, TrainStepOutput
13
+ from sae_lens.saes.topk_sae import _sparse_matmul_nd
14
+
15
+
16
+ @dataclass
17
+ class MatryoshkaBatchTopKTrainingSAEConfig(BatchTopKTrainingSAEConfig):
18
+ """
19
+ Configuration class for training a MatryoshkaBatchTopKTrainingSAE.
20
+
21
+ [Matryoshka SAEs](https://arxiv.org/pdf/2503.17547) use a series of nested reconstruction
22
+ losses of different widths during training to avoid feature absorption. This also has a
23
+ nice side-effect of encouraging higher-frequency features to be learned in earlier levels.
24
+ However, this SAE has more hyperparameters to tune than standard BatchTopK SAEs, and takes
25
+ longer to train due to requiring multiple forward passes per training step.
26
+
27
+ After training, MatryoshkaBatchTopK SAEs are saved as JumpReLU SAEs.
28
+
29
+ Args:
30
+ matryoshka_widths (list[int]): The widths of the matryoshka levels. Defaults to an empty list.
31
+ k (float): The number of features to keep active. Inherited from BatchTopKTrainingSAEConfig.
32
+ Defaults to 100.
33
+ topk_threshold_lr (float): Learning rate for updating the global topk threshold.
34
+ The threshold is updated using an exponential moving average of the minimum
35
+ positive activation value. Defaults to 0.01.
36
+ aux_loss_coefficient (float): Coefficient for the auxiliary loss that encourages
37
+ dead neurons to learn useful features. Inherited from TopKTrainingSAEConfig.
38
+ Defaults to 1.0.
39
+ rescale_acts_by_decoder_norm (bool): Treat the decoder as if it was already normalized.
40
+ Inherited from TopKTrainingSAEConfig. Defaults to True.
41
+ decoder_init_norm (float | None): Norm to initialize decoder weights to.
42
+ Inherited from TrainingSAEConfig. Defaults to 0.1.
43
+ d_in (int): Input dimension (dimensionality of the activations being encoded).
44
+ Inherited from SAEConfig.
45
+ d_sae (int): SAE latent dimension (number of features in the SAE).
46
+ Inherited from SAEConfig.
47
+ dtype (str): Data type for the SAE parameters. Inherited from SAEConfig.
48
+ Defaults to "float32".
49
+ device (str): Device to place the SAE on. Inherited from SAEConfig.
50
+ Defaults to "cpu".
51
+ """
52
+
53
+ matryoshka_widths: list[int] = field(default_factory=list)
54
+
55
+ @override
56
+ @classmethod
57
+ def architecture(cls) -> str:
58
+ return "matryoshka_batchtopk"
59
+
60
+
61
+ class MatryoshkaBatchTopKTrainingSAE(BatchTopKTrainingSAE):
62
+ """
63
+ Global Batch TopK Training SAE
64
+
65
+ This SAE will maintain the k on average across the batch, rather than enforcing the k per-sample as in standard TopK.
66
+
67
+ BatchTopK SAEs are saved as JumpReLU SAEs after training.
68
+ """
69
+
70
+ cfg: MatryoshkaBatchTopKTrainingSAEConfig # type: ignore[assignment]
71
+
72
+ def __init__(
73
+ self, cfg: MatryoshkaBatchTopKTrainingSAEConfig, use_error_term: bool = False
74
+ ):
75
+ super().__init__(cfg, use_error_term)
76
+ _validate_matryoshka_config(cfg)
77
+
78
+ @override
79
+ def training_forward_pass(self, step_input: TrainStepInput) -> TrainStepOutput:
80
+ base_output = super().training_forward_pass(step_input)
81
+ hidden_pre = base_output.hidden_pre
82
+ inv_W_dec_norm = 1 / self.W_dec.norm(dim=-1)
83
+ # the outer matryoshka level is the base SAE, so we don't need to add an extra loss for it
84
+ for width in self.cfg.matryoshka_widths[:-1]:
85
+ inner_hidden_pre = hidden_pre[:, :width]
86
+ inner_feat_acts = self.activation_fn(inner_hidden_pre)
87
+ inner_reconstruction = self._decode_matryoshka_level(
88
+ inner_feat_acts, width, inv_W_dec_norm
89
+ )
90
+ inner_mse_loss = (
91
+ self.mse_loss_fn(inner_reconstruction, step_input.sae_in)
92
+ .sum(dim=-1)
93
+ .mean()
94
+ )
95
+ base_output.losses[f"inner_mse_loss_{width}"] = inner_mse_loss
96
+ base_output.loss = base_output.loss + inner_mse_loss
97
+ return base_output
98
+
99
+ def _decode_matryoshka_level(
100
+ self,
101
+ feature_acts: Float[torch.Tensor, "... d_sae"],
102
+ width: int,
103
+ inv_W_dec_norm: torch.Tensor,
104
+ ) -> Float[torch.Tensor, "... d_in"]:
105
+ """
106
+ Decodes feature activations back into input space for a matryoshka level
107
+ """
108
+ # Handle sparse tensors using efficient sparse matrix multiplication
109
+ if self.cfg.rescale_acts_by_decoder_norm:
110
+ # need to multiply by the inverse of the norm because division is illegal with sparse tensors
111
+ feature_acts = feature_acts * inv_W_dec_norm[:width]
112
+ if feature_acts.is_sparse:
113
+ sae_out_pre = (
114
+ _sparse_matmul_nd(feature_acts, self.W_dec[:width]) + self.b_dec
115
+ )
116
+ else:
117
+ sae_out_pre = feature_acts @ self.W_dec[:width] + self.b_dec
118
+ sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
119
+ return self.reshape_fn_out(sae_out_pre, self.d_head)
120
+
121
+
122
+ def _validate_matryoshka_config(cfg: MatryoshkaBatchTopKTrainingSAEConfig) -> None:
123
+ if cfg.matryoshka_widths[-1] != cfg.d_sae:
124
+ # warn the users that we will add a final matryoshka level
125
+ warnings.warn(
126
+ "WARNING: The final matryoshka level width is not set to cfg.d_sae. "
127
+ "A final matryoshka level of width=cfg.d_sae will be added."
128
+ )
129
+ cfg.matryoshka_widths.append(cfg.d_sae)
130
+
131
+ for prev_width, curr_width in zip(
132
+ cfg.matryoshka_widths[:-1], cfg.matryoshka_widths[1:]
133
+ ):
134
+ if prev_width >= curr_width:
135
+ raise ValueError("cfg.matryoshka_widths must be strictly increasing.")
136
+ if len(cfg.matryoshka_widths) == 1:
137
+ warnings.warn(
138
+ "WARNING: You have only set one matryoshka level. This is equivalent to using a standard BatchTopK SAE and is likely not what you want."
139
+ )
140
+ if cfg.matryoshka_widths[0] < cfg.k:
141
+ raise ValueError(
142
+ "The smallest matryoshka level width cannot be smaller than cfg.k."
143
+ )
@@ -29,7 +29,10 @@ from sae_lens.pretokenize_runner import get_special_token_from_cfg
29
29
  from sae_lens.saes.sae import SAE, T_SAE_CONFIG, T_TRAINING_SAE_CONFIG
30
30
  from sae_lens.tokenization_and_batching import concat_and_batch_sequences
31
31
  from sae_lens.training.mixing_buffer import mixing_buffer
32
- from sae_lens.util import extract_stop_at_layer_from_tlens_hook_name
32
+ from sae_lens.util import (
33
+ extract_stop_at_layer_from_tlens_hook_name,
34
+ get_special_token_ids,
35
+ )
33
36
 
34
37
 
35
38
  # TODO: Make an activation store config class to be consistent with the rest of the code.
@@ -113,7 +116,7 @@ class ActivationsStore:
113
116
  if exclude_special_tokens is False:
114
117
  exclude_special_tokens = None
115
118
  if exclude_special_tokens is True:
116
- exclude_special_tokens = _get_special_token_ids(model.tokenizer) # type: ignore
119
+ exclude_special_tokens = get_special_token_ids(model.tokenizer) # type: ignore
117
120
  if exclude_special_tokens is not None:
118
121
  exclude_special_tokens = torch.tensor(
119
122
  exclude_special_tokens, dtype=torch.long, device=device
@@ -763,31 +766,6 @@ def _get_model_device(model: HookedRootModule) -> torch.device:
763
766
  return next(model.parameters()).device # type: ignore
764
767
 
765
768
 
766
- def _get_special_token_ids(tokenizer: PreTrainedTokenizerBase) -> list[int]:
767
- """Get all special token IDs from a tokenizer."""
768
- special_tokens = set()
769
-
770
- # Get special tokens from tokenizer attributes
771
- for attr in dir(tokenizer):
772
- if attr.endswith("_token_id"):
773
- token_id = getattr(tokenizer, attr)
774
- if token_id is not None:
775
- special_tokens.add(token_id)
776
-
777
- # Get any additional special tokens from the tokenizer's special tokens map
778
- if hasattr(tokenizer, "special_tokens_map"):
779
- for token in tokenizer.special_tokens_map.values():
780
- if isinstance(token, str):
781
- token_id = tokenizer.convert_tokens_to_ids(token) # type: ignore
782
- special_tokens.add(token_id)
783
- elif isinstance(token, list):
784
- for t in token:
785
- token_id = tokenizer.convert_tokens_to_ids(t) # type: ignore
786
- special_tokens.add(token_id)
787
-
788
- return list(special_tokens)
789
-
790
-
791
769
  def _filter_buffer_acts(
792
770
  buffer: tuple[torch.Tensor, torch.Tensor | None],
793
771
  exclude_tokens: torch.Tensor | None,
@@ -5,6 +5,8 @@ from dataclasses import asdict, fields, is_dataclass
5
5
  from pathlib import Path
6
6
  from typing import Sequence, TypeVar
7
7
 
8
+ from transformers import PreTrainedTokenizerBase
9
+
8
10
  K = TypeVar("K")
9
11
  V = TypeVar("V")
10
12
 
@@ -63,3 +65,28 @@ def path_or_tmp_dir(path: str | Path | None):
63
65
  yield Path(td)
64
66
  else:
65
67
  yield Path(path)
68
+
69
+
70
+ def get_special_token_ids(tokenizer: PreTrainedTokenizerBase) -> list[int]:
71
+ """Get all special token IDs from a tokenizer."""
72
+ special_tokens = set()
73
+
74
+ # Get special tokens from tokenizer attributes
75
+ for attr in dir(tokenizer):
76
+ if attr.endswith("_token_id"):
77
+ token_id = getattr(tokenizer, attr)
78
+ if token_id is not None:
79
+ special_tokens.add(token_id)
80
+
81
+ # Get any additional special tokens from the tokenizer's special tokens map
82
+ if hasattr(tokenizer, "special_tokens_map"):
83
+ for token in tokenizer.special_tokens_map.values():
84
+ if isinstance(token, str):
85
+ token_id = tokenizer.convert_tokens_to_ids(token) # type: ignore
86
+ special_tokens.add(token_id)
87
+ elif isinstance(token, list):
88
+ for t in token:
89
+ token_id = tokenizer.convert_tokens_to_ids(t) # type: ignore
90
+ special_tokens.add(token_id)
91
+
92
+ return list(special_tokens)
File without changes
File without changes
File without changes