sae-lens 6.0.0rc1__py3-none-any.whl → 6.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sae_lens/config.py CHANGED
@@ -3,7 +3,7 @@ import math
  import os
  from dataclasses import asdict, dataclass, field
  from pathlib import Path
- from typing import Any, Literal, cast
+ from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar, cast

  import simple_parsing
  import torch
@@ -17,24 +17,15 @@ from datasets import (
  )

  from sae_lens import __version__, logger
+ from sae_lens.constants import DTYPE_MAP
+ from sae_lens.saes.sae import TrainingSAEConfig

- DTYPE_MAP = {
- "float32": torch.float32,
- "float64": torch.float64,
- "float16": torch.float16,
- "bfloat16": torch.bfloat16,
- "torch.float32": torch.float32,
- "torch.float64": torch.float64,
- "torch.float16": torch.float16,
- "torch.bfloat16": torch.bfloat16,
- }
-
- HfDataset = DatasetDict | Dataset | IterableDatasetDict | IterableDataset
+ if TYPE_CHECKING:
+ pass

+ T_TRAINING_SAE_CONFIG = TypeVar("T_TRAINING_SAE_CONFIG", bound=TrainingSAEConfig)

- SPARSITY_FILENAME = "sparsity.safetensors"
- SAE_WEIGHTS_FILENAME = "sae_weights.safetensors"
- SAE_CFG_FILENAME = "cfg.json"
+ HfDataset = DatasetDict | Dataset | IterableDatasetDict | IterableDataset


  # calling this "json_dict" so error messages will reference "json_dict" being invalid
@@ -101,95 +92,68 @@ class LoggingConfig:


  @dataclass
- class LanguageModelSAERunnerConfig:
+ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
  """
  Configuration for training a sparse autoencoder on a language model.

  Args:
- architecture (str): The architecture to use, either "standard", "gated", "topk", or "jumprelu".
+ sae (T_TRAINING_SAE_CONFIG): The configuration for the SAE itself (e.g. StandardSAEConfig, GatedSAEConfig).
  model_name (str): The name of the model to use. This should be the name of the model in the Hugging Face model hub.
  model_class_name (str): The name of the class of the model to use. This should be either `HookedTransformer` or `HookedMamba`.
  hook_name (str): The name of the hook to use. This should be a valid TransformerLens hook.
  hook_eval (str): NOT CURRENTLY IN USE. The name of the hook to use for evaluation.
  hook_layer (int): The index of the layer to hook. Used to stop forward passes early and speed up processing.
- hook_head_index (int, optional): When the hook if for an activatio with a head index, we can specify a specific head to use here.
+ hook_head_index (int, optional): When the hook is for an activation with a head index, we can specify a specific head to use here.
  dataset_path (str): A Hugging Face dataset path.
  dataset_trust_remote_code (bool): Whether to trust remote code when loading datasets from Huggingface.
  streaming (bool): Whether to stream the dataset. Streaming large datasets is usually practical.
- is_dataset_tokenized (bool): NOT IN USE. We used to use this but now automatically detect if the dataset is tokenized.
+ is_dataset_tokenized (bool): Whether the dataset is already tokenized.
  context_size (int): The context size to use when generating activations on which to train the SAE.
  use_cached_activations (bool): Whether to use cached activations. This is useful when doing sweeps over the same activations.
- cached_activations_path (str, optional): The path to the cached activations.
- d_in (int): The input dimension of the SAE.
- d_sae (int, optional): The output dimension of the SAE. If None, defaults to `d_in * expansion_factor`.
- b_dec_init_method (str): The method to use to initialize the decoder bias. Zeros is likely fine.
- expansion_factor (int): The expansion factor. Larger is better but more computationally expensive. Default is 4.
- activation_fn (str): The activation function to use. Relu is standard.
- normalize_sae_decoder (bool): Whether to normalize the SAE decoder. Unit normed decoder weights used to be preferred.
- noise_scale (float): Using noise to induce sparsity is supported but not recommended.
+ cached_activations_path (str, optional): The path to the cached activations. Defaults to "activations/{dataset_path}/{model_name}/{hook_name}_{hook_head_index}".
  from_pretrained_path (str, optional): The path to a pretrained SAE. We can finetune an existing SAE if needed.
- apply_b_dec_to_input (bool): Whether to apply the decoder bias to the input. Not currently advised.
- decoder_orthogonal_init (bool): Whether to use orthogonal initialization for the decoder. Not currently advised.
- decoder_heuristic_init (bool): Whether to use heuristic initialization for the decoder. See Anthropic April Update.
- init_encoder_as_decoder_transpose (bool): Whether to initialize the encoder as the transpose of the decoder. See Anthropic April Update.
- n_batches_in_buffer (int): The number of batches in the buffer. When not using cached activations, a buffer in ram is used. The larger it is, the better shuffled the activations will be.
+ n_batches_in_buffer (int): The number of batches in the buffer. When not using cached activations, a buffer in RAM is used. The larger it is, the better shuffled the activations will be.
  training_tokens (int): The number of training tokens.
- finetuning_tokens (int): The number of finetuning tokens. See [here](https://www.lesswrong.com/posts/3JuSjTZyMzaSeTxKk/addressing-feature-suppression-in-saes)
- store_batch_size_prompts (int): The batch size for storing activations. This controls how many prompts are in the batch of the language model when generating actiations.
- train_batch_size_tokens (int): The batch size for training. This controls the batch size of the SAE Training loop.
- normalize_activations (str): Activation Normalization Strategy. Either none, expected_average_only_in (estimate the average activation norm and divide activations by it following Antrhopic April update -> this can be folded post training and set to None), or constant_norm_rescale (at runtime set activation norm to sqrt(d_in) and then scale up the SAE output).
- seqpos_slice (tuple): Determines slicing of activations when constructing batches during training. The slice should be (start_pos, end_pos, optional[step_size]), e.g. for Othello we sometimes use (5, -5). Note, step_size > 0.
- device (str): The device to use. Usually cuda.
- act_store_device (str): The device to use for the activation store. CPU is advised in order to save vram.
+ store_batch_size_prompts (int): The batch size for storing activations. This controls how many prompts are in the batch of the language model when generating activations.
+ seqpos_slice (tuple[int | None, ...]): Determines slicing of activations when constructing batches during training. The slice should be (start_pos, end_pos, optional[step_size]), e.g. for Othello we sometimes use (5, -5). Note, step_size > 0.
+ device (str): The device to use. Usually "cuda".
+ act_store_device (str): The device to use for the activation store. "cpu" is advised in order to save VRAM. Defaults to "with_model" which uses the same device as the main model.
  seed (int): The seed to use.
- dtype (str): The data type to use.
+ dtype (str): The data type to use for the SAE and activations.
  prepend_bos (bool): Whether to prepend the beginning of sequence token. You should use whatever the model was trained with.
- jumprelu_init_threshold (float): The threshold to initialize for training JumpReLU SAEs.
- jumprelu_bandwidth (float): Bandwidth for training JumpReLU SAEs.
- autocast (bool): Whether to use autocast during training. Saves vram.
- autocast_lm (bool): Whether to use autocast during activation fetching.
- compile_llm (bool): Whether to compile the LLM.
- llm_compilation_mode (str): The compilation mode to use for the LLM.
- compile_sae (bool): Whether to compile the SAE.
- sae_compilation_mode (str): The compilation mode to use for the SAE.
- adam_beta1 (float): The beta1 parameter for Adam.
- adam_beta2 (float): The beta2 parameter for Adam.
- mse_loss_normalization (str): The normalization to use for the MSE loss.
- l1_coefficient (float): The L1 coefficient.
- lp_norm (float): The Lp norm.
- scale_sparsity_penalty_by_decoder_norm (bool): Whether to scale the sparsity penalty by the decoder norm.
- l1_warm_up_steps (int): The number of warm-up steps for the L1 loss.
+ autocast (bool): Whether to use autocast (mixed-precision) during SAE training. Saves VRAM.
+ autocast_lm (bool): Whether to use autocast (mixed-precision) during activation fetching. Saves VRAM.
+ compile_llm (bool): Whether to compile the LLM using `torch.compile`.
+ llm_compilation_mode (str, optional): The compilation mode to use for the LLM if `compile_llm` is True.
+ compile_sae (bool): Whether to compile the SAE using `torch.compile`.
+ sae_compilation_mode (str, optional): The compilation mode to use for the SAE if `compile_sae` is True.
+ train_batch_size_tokens (int): The batch size for training, in tokens. This controls the batch size of the SAE training loop.
+ adam_beta1 (float): The beta1 parameter for the Adam optimizer.
+ adam_beta2 (float): The beta2 parameter for the Adam optimizer.
  lr (float): The learning rate.
- lr_scheduler_name (str): The name of the learning rate scheduler to use.
+ lr_scheduler_name (str): The name of the learning rate scheduler to use (e.g., "constant", "cosineannealing", "cosineannealingwarmrestarts").
  lr_warm_up_steps (int): The number of warm-up steps for the learning rate.
- lr_end (float): The end learning rate if lr_decay_steps is set. Default is lr / 10.
- lr_decay_steps (int): The number of decay steps for the learning rate.
- n_restart_cycles (int): The number of restart cycles for the cosine annealing warm restarts scheduler.
- finetuning_method (str): The method to use for finetuning.
- use_ghost_grads (bool): Whether to use ghost gradients.
- feature_sampling_window (int): The feature sampling window.
- dead_feature_window (int): The dead feature window.
- dead_feature_threshold (float): The dead feature threshold.
- n_eval_batches (int): The number of evaluation batches.
- eval_batch_size_prompts (int): The batch size for evaluation.
- log_to_wandb (bool): Whether to log to Weights & Biases.
- log_activations_store_to_wandb (bool): NOT CURRENTLY USED. Whether to log the activations store to Weights & Biases.
- log_optimizer_state_to_wandb (bool): NOT CURRENTLY USED. Whether to log the optimizer state to Weights & Biases.
- wandb_project (str): The Weights & Biases project to log to.
- wandb_id (str): The Weights & Biases ID.
- run_name (str): The name of the run.
- wandb_entity (str): The Weights & Biases entity.
- wandb_log_frequency (int): The frequency to log to Weights & Biases.
- eval_every_n_wandb_logs (int): The frequency to evaluate.
- resume (bool): Whether to resume training.
- n_checkpoints (int): The number of checkpoints.
- checkpoint_path (str): The path to save checkpoints.
+ lr_end (float, optional): The end learning rate if using a scheduler like cosine annealing. Defaults to `lr / 10`.
+ lr_decay_steps (int): The number of decay steps for the learning rate if using a scheduler with decay.
+ n_restart_cycles (int): The number of restart cycles for the cosine annealing with warm restarts scheduler.
+ dead_feature_window (int): The window size (in training steps) for detecting dead features.
+ feature_sampling_window (int): The window size (in training steps) for resampling features (e.g. dead features).
+ dead_feature_threshold (float): The threshold below which a feature's activation frequency is considered dead.
+ n_eval_batches (int): The number of batches to use for evaluation.
+ eval_batch_size_prompts (int, optional): The batch size for evaluation, in prompts. Useful if evals cause OOM.
+ logger (LoggingConfig): Configuration for logging (e.g. W&B).
+ n_checkpoints (int): The number of checkpoints to save during training. 0 means no checkpoints.
+ checkpoint_path (str): The path to save checkpoints. A unique ID will be appended to this path.
  verbose (bool): Whether to print verbose output.
- model_kwargs (dict[str, Any]): Additional keyword arguments for the model.
- model_from_pretrained_kwargs (dict[str, Any]): Additional keyword arguments for the model from pretrained.
- exclude_special_tokens (bool | list[int]): Whether to exclude special tokens from the activations.
+ model_kwargs (dict[str, Any]): Keyword arguments for `model.run_with_cache`
+ model_from_pretrained_kwargs (dict[str, Any], optional): Additional keyword arguments to pass to the model's `from_pretrained` method.
+ sae_lens_version (str): The version of the sae_lens library.
+ sae_lens_training_version (str): The version of the sae_lens training library.
+ exclude_special_tokens (bool | list[int]): Whether to exclude special tokens from the activations. If True, excludes all special tokens. If a list of ints, excludes those token IDs.
  """

+ sae: T_TRAINING_SAE_CONFIG
+
  # Data Generating Function (Model + Training Distibuion)
  model_name: str = "gelu-2l"
  model_class_name: str = "HookedTransformer"
@@ -208,29 +172,12 @@ class LanguageModelSAERunnerConfig:
  )

  # SAE Parameters
- architecture: Literal["standard", "gated", "jumprelu", "topk"] = "standard"
- d_in: int = 512
- d_sae: int | None = None
- b_dec_init_method: str = "geometric_median"
- expansion_factor: int | None = (
- None # defaults to 4 if d_sae and expansion_factor is None
- )
- activation_fn: str = None # relu, tanh-relu, topk. Default is relu. # type: ignore
- activation_fn_kwargs: dict[str, int] = dict_field(default=None) # for topk
- normalize_sae_decoder: bool = True
- noise_scale: float = 0.0
  from_pretrained_path: str | None = None
- apply_b_dec_to_input: bool = True
- decoder_orthogonal_init: bool = False
- decoder_heuristic_init: bool = False
- init_encoder_as_decoder_transpose: bool = False

  # Activation Store Parameters
  n_batches_in_buffer: int = 20
  training_tokens: int = 2_000_000
- finetuning_tokens: int = 0
  store_batch_size_prompts: int = 32
- normalize_activations: str = "none" # none, expected_average_only_in (Anthropic April Update), constant_norm_rescale (Anthropic Feb Update)
  seqpos_slice: tuple[int | None, ...] = (None,)

  # Misc
@@ -240,10 +187,6 @@ class LanguageModelSAERunnerConfig:
  dtype: str = "float32" # type: ignore #
  prepend_bos: bool = True

- # JumpReLU Parameters
- jumprelu_init_threshold: float = 0.001
- jumprelu_bandwidth: float = 0.001
-
  # Performance - see compilation section of lm_runner.py for info
  autocast: bool = False # autocast to autocast_dtype during training
  autocast_lm: bool = False # autocast lm during activation fetching
@@ -261,13 +204,6 @@ class LanguageModelSAERunnerConfig:
  adam_beta1: float = 0.0
  adam_beta2: float = 0.999

- ## Loss Function
- mse_loss_normalization: str | None = None
- l1_coefficient: float = 1e-3
- lp_norm: float = 1
- scale_sparsity_penalty_by_decoder_norm: bool = False
- l1_warm_up_steps: int = 0
-
  ## Learning Rate Schedule
  lr: float = 3e-4
  lr_scheduler_name: str = (
@@ -278,14 +214,9 @@ class LanguageModelSAERunnerConfig:
  lr_decay_steps: int = 0
  n_restart_cycles: int = 1 # used only for cosineannealingwarmrestarts

- ## FineTuning
- finetuning_method: str | None = None # scale, decoder or unrotated_decoder
-
  # Resampling protocol args
- use_ghost_grads: bool = False # want to change this to true on some timeline.
- feature_sampling_window: int = 2000
  dead_feature_window: int = 1000 # unless this window is larger feature sampling,
-
+ feature_sampling_window: int = 2000
  dead_feature_threshold: float = 1e-8

  # Evals
@@ -295,7 +226,6 @@ class LanguageModelSAERunnerConfig:
  logger: LoggingConfig = field(default_factory=LoggingConfig)

  # Misc
- resume: bool = False
  n_checkpoints: int = 0
  checkpoint_path: str = "checkpoints"
  verbose: bool = True
@@ -306,12 +236,6 @@ class LanguageModelSAERunnerConfig:
  exclude_special_tokens: bool | list[int] = False

  def __post_init__(self):
- if self.resume:
- raise ValueError(
- "Resuming is no longer supported. You can finetune a trained SAE using cfg.from_pretrained path."
- + "If you want to load an SAE with resume=True in the config, please manually set resume=False in that config."
- )
-
  if self.use_cached_activations and self.cached_activations_path is None:
  self.cached_activations_path = _default_cached_activations_path(
  self.dataset_path,
@@ -319,37 +243,12 @@ class LanguageModelSAERunnerConfig:
  self.hook_name,
  self.hook_head_index,
  )
-
- if self.activation_fn is None:
- self.activation_fn = "topk" if self.architecture == "topk" else "relu"
-
- if self.architecture == "topk" and self.activation_fn != "topk":
- raise ValueError("If using topk architecture, activation_fn must be topk.")
-
- if self.activation_fn_kwargs is None:
- self.activation_fn_kwargs = (
- {"k": 100} if self.activation_fn == "topk" else {}
- )
-
- if self.architecture == "topk" and self.activation_fn_kwargs.get("k") is None:
- raise ValueError(
- "activation_fn_kwargs.k must be provided for topk architecture."
- )
-
- if self.d_sae is not None and self.expansion_factor is not None:
- raise ValueError("You can't set both d_sae and expansion_factor.")
-
- if self.d_sae is None and self.expansion_factor is None:
- self.expansion_factor = 4
-
- if self.d_sae is None and self.expansion_factor is not None:
- self.d_sae = self.d_in * self.expansion_factor
  self.tokens_per_buffer = (
  self.train_batch_size_tokens * self.context_size * self.n_batches_in_buffer
  )

  if self.logger.run_name is None:
- self.logger.run_name = f"{self.d_sae}-L1-{self.l1_coefficient}-LR-{self.lr}-Tokens-{self.training_tokens:3.3e}"
+ self.logger.run_name = f"{self.sae.architecture()}-{self.sae.d_sae}-LR-{self.lr}-Tokens-{self.training_tokens:3.3e}"

  if self.model_from_pretrained_kwargs is None:
  if self.model_class_name == "HookedTransformer":
@@ -357,37 +256,6 @@ class LanguageModelSAERunnerConfig:
  else:
  self.model_from_pretrained_kwargs = {}

- if self.b_dec_init_method not in ["geometric_median", "mean", "zeros"]:
- raise ValueError(
- f"b_dec_init_method must be geometric_median, mean, or zeros. Got {self.b_dec_init_method}"
- )
-
- if self.normalize_sae_decoder and self.decoder_heuristic_init:
- raise ValueError(
- "You can't normalize the decoder and use heuristic initialization."
- )
-
- if self.normalize_sae_decoder and self.scale_sparsity_penalty_by_decoder_norm:
- raise ValueError(
- "Weighting loss by decoder norm makes no sense if you are normalizing the decoder weight norms to 1"
- )
-
- # if we use decoder fine tuning, we can't be applying b_dec to the input
- if (self.finetuning_method == "decoder") and (self.apply_b_dec_to_input):
- raise ValueError(
- "If we are fine tuning the decoder, we can't be applying b_dec to the input.\nSet apply_b_dec_to_input to False."
- )
-
- if self.normalize_activations not in [
- "none",
- "expected_average_only_in",
- "constant_norm_rescale",
- "layer_norm",
- ]:
- raise ValueError(
- f"normalize_activations must be none, layer_norm, expected_average_only_in, or constant_norm_rescale. Got {self.normalize_activations}"
- )
-
  if self.act_store_device == "with_model":
  self.act_store_device = self.device

@@ -403,7 +271,7 @@ class LanguageModelSAERunnerConfig:

  if self.verbose:
  logger.info(
- f"Run name: {self.d_sae}-L1-{self.l1_coefficient}-LR-{self.lr}-Tokens-{self.training_tokens:3.3e}"
+ f"Run name: {self.sae.architecture()}-{self.sae.d_sae}-LR-{self.lr}-Tokens-{self.training_tokens:3.3e}"
  )
  # Print out some useful info:
  n_tokens_per_buffer = (
@@ -422,7 +290,7 @@ class LanguageModelSAERunnerConfig:
  )

  total_training_steps = (
- self.training_tokens + self.finetuning_tokens
+ self.training_tokens
  ) // self.train_batch_size_tokens
  logger.info(f"Total training steps: {total_training_steps}")

@@ -450,9 +318,6 @@ class LanguageModelSAERunnerConfig:
  f"Number tokens in sparsity calculation window: {self.feature_sampling_window * self.train_batch_size_tokens:.2e}"
  )

- if self.use_ghost_grads:
- logger.info("Using Ghost Grads.")
-
  if self.context_size < 0:
  raise ValueError(
  f"The provided context_size is {self.context_size} is negative. Expecting positive context_size."
@@ -467,62 +332,21 @@ class LanguageModelSAERunnerConfig:

  @property
  def total_training_tokens(self) -> int:
- return self.training_tokens + self.finetuning_tokens
+ return self.training_tokens

  @property
  def total_training_steps(self) -> int:
  return self.total_training_tokens // self.train_batch_size_tokens

- def get_base_sae_cfg_dict(self) -> dict[str, Any]:
- return {
- # TEMP
- "architecture": self.architecture,
- "d_in": self.d_in,
- "d_sae": self.d_sae,
- "dtype": self.dtype,
- "device": self.device,
- "model_name": self.model_name,
- "hook_name": self.hook_name,
- "hook_layer": self.hook_layer,
- "hook_head_index": self.hook_head_index,
- "activation_fn": self.activation_fn,
- "apply_b_dec_to_input": self.apply_b_dec_to_input,
- "context_size": self.context_size,
- "prepend_bos": self.prepend_bos,
- "dataset_path": self.dataset_path,
- "dataset_trust_remote_code": self.dataset_trust_remote_code,
- "finetuning_scaling_factor": self.finetuning_method is not None,
- "sae_lens_training_version": self.sae_lens_training_version,
- "normalize_activations": self.normalize_activations,
- "activation_fn_kwargs": self.activation_fn_kwargs,
- "model_from_pretrained_kwargs": self.model_from_pretrained_kwargs,
- "seqpos_slice": self.seqpos_slice,
- }
-
  def get_training_sae_cfg_dict(self) -> dict[str, Any]:
- return {
- **self.get_base_sae_cfg_dict(),
- "l1_coefficient": self.l1_coefficient,
- "lp_norm": self.lp_norm,
- "use_ghost_grads": self.use_ghost_grads,
- "normalize_sae_decoder": self.normalize_sae_decoder,
- "noise_scale": self.noise_scale,
- "decoder_orthogonal_init": self.decoder_orthogonal_init,
- "mse_loss_normalization": self.mse_loss_normalization,
- "decoder_heuristic_init": self.decoder_heuristic_init,
- "init_encoder_as_decoder_transpose": self.init_encoder_as_decoder_transpose,
- "normalize_activations": self.normalize_activations,
- "jumprelu_init_threshold": self.jumprelu_init_threshold,
- "jumprelu_bandwidth": self.jumprelu_bandwidth,
- "scale_sparsity_penalty_by_decoder_norm": self.scale_sparsity_penalty_by_decoder_norm,
- }
+ return self.sae.to_dict()

  def to_dict(self) -> dict[str, Any]:
- # Make a shallow copy of configs dictionary
+ # Make a shallow copy of config's dictionary
  d = dict(self.__dict__)

  d["logger"] = asdict(self.logger)
-
+ d["sae"] = self.sae.to_dict()
  # Overwrite fields that might not be JSON-serializable
  d["dtype"] = str(self.dtype)
  d["device"] = str(self.device)
@@ -537,7 +361,7 @@ class LanguageModelSAERunnerConfig:
  json.dump(self.to_dict(), f, indent=2)

  @classmethod
- def from_json(cls, path: str) -> "LanguageModelSAERunnerConfig":
+ def from_json(cls, path: str) -> "LanguageModelSAERunnerConfig[Any]":
  with open(path + "cfg.json") as f:
  cfg = json.load(f)

@@ -720,6 +544,10 @@ def _validate_seqpos(seqpos: tuple[int | None, ...], context_size: int) -> None:

  @dataclass
  class PretokenizeRunnerConfig:
+ """
+ Configuration class for pretokenizing a dataset.
+ """
+
  tokenizer_name: str = "gpt2"
  dataset_path: str = ""
  dataset_name: str | None = None
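
The main behavioural change in this file is that SAE-specific hyperparameters move off the runner config and into a nested, architecture-specific `sae` config (the generic `T_TRAINING_SAE_CONFIG` parameter). A minimal sketch of the new shape follows; the class name `StandardTrainingSAEConfig`, its import path, and its `d_in`/`d_sae` arguments are assumptions based on the docstring above, not part of this diff:

```python
# Sketch only: illustrates the nested `sae` field introduced in this diff.
from sae_lens.config import LanguageModelSAERunnerConfig
from sae_lens.saes.standard_sae import StandardTrainingSAEConfig  # hypothetical import path

cfg = LanguageModelSAERunnerConfig(
    sae=StandardTrainingSAEConfig(d_in=512, d_sae=2048),  # hypothetical SAE config class
    model_name="gelu-2l",
    hook_name="blocks.0.hook_mlp_out",
    hook_layer=0,
    training_tokens=2_000_000,
    lr=3e-4,
)

# get_training_sae_cfg_dict() now simply delegates to the nested config:
print(cfg.get_training_sae_cfg_dict())  # equivalent to cfg.sae.to_dict()
```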
sae_lens/constants.py ADDED
@@ -0,0 +1,18 @@
+ import torch
+
+ DTYPE_MAP = {
+ "float32": torch.float32,
+ "float64": torch.float64,
+ "float16": torch.float16,
+ "bfloat16": torch.bfloat16,
+ "torch.float32": torch.float32,
+ "torch.float64": torch.float64,
+ "torch.float16": torch.float16,
+ "torch.bfloat16": torch.bfloat16,
+ }
+
+
+ SPARSITY_FILENAME = "sparsity.safetensors"
+ SAE_WEIGHTS_FILENAME = "sae_weights.safetensors"
+ SAE_CFG_FILENAME = "cfg.json"
+ RUNNER_CFG_FILENAME = "runner_cfg.json"
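
The dtype map and the weights/config filename constants, previously defined in `sae_lens/config.py` (see the removed block above), now live in this dedicated module, along with the new `RUNNER_CFG_FILENAME`. A minimal sketch of the updated import; whether the old `sae_lens.config` location still re-exports these names is not shown in this diff:

```python
# Updated import location for the shared constants added in this release.
import torch

from sae_lens.constants import DTYPE_MAP, SAE_CFG_FILENAME, SAE_WEIGHTS_FILENAME

assert DTYPE_MAP["bfloat16"] is torch.bfloat16
print(SAE_CFG_FILENAME, SAE_WEIGHTS_FILENAME)  # cfg.json sae_weights.safetensors
```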
sae_lens/evals.py CHANGED
@@ -20,7 +20,7 @@ from transformer_lens import HookedTransformer
  from transformer_lens.hook_points import HookedRootModule

  from sae_lens.loading.pretrained_saes_directory import get_pretrained_saes_directory
- from sae_lens.saes.sae import SAE
+ from sae_lens.saes.sae import SAE, SAEConfig
  from sae_lens.training.activations_store import ActivationsStore


@@ -100,7 +100,7 @@ def get_eval_everything_config(

  @torch.no_grad()
  def run_evals(
- sae: SAE,
+ sae: SAE[Any],
  activation_store: ActivationsStore,
  model: HookedRootModule,
  eval_config: EvalConfig = EvalConfig(),
@@ -108,7 +108,7 @@ def run_evals(
  ignore_tokens: set[int | None] = set(),
  verbose: bool = False,
  ) -> tuple[dict[str, Any], dict[str, Any]]:
- hook_name = sae.cfg.hook_name
+ hook_name = sae.cfg.metadata.hook_name
  actual_batch_size = (
  eval_config.batch_size_prompts or activation_store.store_batch_size_prompts
  )
@@ -274,7 +274,7 @@ def run_evals(
  return all_metrics, feature_metrics


- def get_featurewise_weight_based_metrics(sae: SAE) -> dict[str, Any]:
+ def get_featurewise_weight_based_metrics(sae: SAE[Any]) -> dict[str, Any]:
  unit_norm_encoders = (sae.W_enc / sae.W_enc.norm(dim=0, keepdim=True)).cpu()
  unit_norm_decoder = (sae.W_dec.T / sae.W_dec.T.norm(dim=0, keepdim=True)).cpu()

@@ -298,7 +298,7 @@ def get_featurewise_weight_based_metrics(sae: SAE) -> dict[str, Any]:


  def get_downstream_reconstruction_metrics(
- sae: SAE,
+ sae: SAE[Any],
  model: HookedRootModule,
  activation_store: ActivationsStore,
  compute_kl: bool,
@@ -366,7 +366,7 @@ def get_downstream_reconstruction_metrics(


  def get_sparsity_and_variance_metrics(
- sae: SAE,
+ sae: SAE[Any],
  model: HookedRootModule,
  activation_store: ActivationsStore,
  n_batches: int,
@@ -379,8 +379,8 @@ def get_sparsity_and_variance_metrics(
  ignore_tokens: set[int | None] = set(),
  verbose: bool = False,
  ) -> tuple[dict[str, Any], dict[str, Any]]:
- hook_name = sae.cfg.hook_name
- hook_head_index = sae.cfg.hook_head_index
+ hook_name = sae.cfg.metadata.hook_name
+ hook_head_index = sae.cfg.metadata.hook_head_index

  metric_dict = {}
  feature_metric_dict = {}
@@ -436,7 +436,7 @@ def get_sparsity_and_variance_metrics(
  batch_tokens,
  prepend_bos=False,
  names_filter=[hook_name],
- stop_at_layer=sae.cfg.hook_layer + 1,
+ stop_at_layer=sae.cfg.metadata.hook_layer + 1,
  **model_kwargs,
  )

@@ -580,7 +580,7 @@ def get_sparsity_and_variance_metrics(

  @torch.no_grad()
  def get_recons_loss(
- sae: SAE,
+ sae: SAE[SAEConfig],
  model: HookedRootModule,
  batch_tokens: torch.Tensor,
  activation_store: ActivationsStore,
@@ -588,9 +588,13 @@ def get_recons_loss(
  compute_ce_loss: bool,
  ignore_tokens: set[int | None] = set(),
  model_kwargs: Mapping[str, Any] = {},
+ hook_name: str | None = None,
  ) -> dict[str, Any]:
- hook_name = sae.cfg.hook_name
- head_index = sae.cfg.hook_head_index
+ hook_name = hook_name or sae.cfg.metadata.hook_name
+ head_index = sae.cfg.metadata.hook_head_index
+
+ if hook_name is None:
+ raise ValueError("hook_name must be provided")

  original_logits, original_ce_loss = model(
  batch_tokens, return_type="both", loss_per_token=True, **model_kwargs
@@ -806,7 +810,6 @@ def multiple_evals(

  current_model = None
  current_model_str = None
- print(filtered_saes)
  for sae_release_name, sae_id, _, _ in tqdm(filtered_saes):
  sae = SAE.from_pretrained(
  release=sae_release_name, # see other options in sae_lens/pretrained_saes.yaml
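
Across these eval helpers, hook information moves from the SAE config itself onto a nested `metadata` object, SAE type hints become generic (`SAE[Any]`, `SAE[SAEConfig]`), and `get_recons_loss` gains an optional `hook_name` override. A small sketch of adapting downstream code, using only the attributes visible in the hunks above:

```python
# Sketch: pre-6.0 code read sae.cfg.hook_name / sae.cfg.hook_layer directly; in
# 6.0.0rc2 those fields live on sae.cfg.metadata.
from typing import Any

from sae_lens.saes.sae import SAE


def describe_hook(sae: SAE[Any]) -> str:
    """Return a short description of where this SAE attaches to the model."""
    meta = sae.cfg.metadata
    head = "" if meta.hook_head_index is None else f", head {meta.hook_head_index}"
    return f"{meta.hook_name} (layer {meta.hook_layer}{head})"
```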
@@ -7,11 +7,12 @@ import numpy as np
  import torch
  from huggingface_hub import hf_hub_download
  from huggingface_hub.utils import EntryNotFoundError
+ from packaging.version import Version
  from safetensors import safe_open
  from safetensors.torch import load_file

  from sae_lens import logger
- from sae_lens.config import (
+ from sae_lens.constants import (
  DTYPE_MAP,
  SAE_CFG_FILENAME,
  SAE_WEIGHTS_FILENAME,
@@ -22,6 +23,8 @@ from sae_lens.loading.pretrained_saes_directory import (
  get_pretrained_saes_directory,
  get_repo_id_and_folder_name,
  )
+ from sae_lens.registry import get_sae_class
+ from sae_lens.util import filter_valid_dataclass_fields


  # loaders take in a release, sae_id, device, and whether to force download, and returns a tuple of config, state_dict, and log sparsity
@@ -174,6 +177,20 @@ def get_sae_lens_config_from_disk(


  def handle_config_defaulting(cfg_dict: dict[str, Any]) -> dict[str, Any]:
+ sae_lens_version = cfg_dict.get("sae_lens_version")
+ if not sae_lens_version and "metadata" in cfg_dict:
+ sae_lens_version = cfg_dict["metadata"].get("sae_lens_version")
+
+ if not sae_lens_version or Version(sae_lens_version) < Version("6.0.0-rc.0"):
+ cfg_dict = handle_pre_6_0_config(cfg_dict)
+ return cfg_dict
+
+
+ def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
+ """
+ Format a config dictionary for a Sparse Autoencoder (SAE) to be compatible with the new 6.0 format.
+ """
+
  rename_keys_map = {
  "hook_point": "hook_name",
  "hook_point_layer": "hook_layer",
@@ -202,10 +219,26 @@ def handle_config_defaulting(cfg_dict: dict[str, Any]) -> dict[str, Any]:
  else "expected_average_only_in"
  )

- new_cfg.setdefault("normalize_activations", "none")
+ if new_cfg.get("normalize_activations") is None:
+ new_cfg["normalize_activations"] = "none"
+
  new_cfg.setdefault("device", "cpu")

- return new_cfg
+ architecture = new_cfg.get("architecture", "standard")
+
+ config_class = get_sae_class(architecture)[1]
+
+ sae_cfg_dict = filter_valid_dataclass_fields(new_cfg, config_class)
+ if architecture == "topk":
+ sae_cfg_dict["k"] = new_cfg["activation_fn_kwargs"]["k"]
+
+ # import here to avoid circular import
+ from sae_lens.saes.sae import SAEMetadata
+
+ meta_dict = filter_valid_dataclass_fields(new_cfg, SAEMetadata)
+ sae_cfg_dict["metadata"] = meta_dict
+ sae_cfg_dict["architecture"] = architecture
+ return sae_cfg_dict


  def get_connor_rob_hook_z_config_from_hf(
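
`handle_config_defaulting` now dispatches on the stored `sae_lens_version`: configs written before 6.0.0-rc.0 (or with no version at all) are routed through `handle_pre_6_0_config`, which renames legacy keys, splits hook metadata out of the flat dict, and, for topk SAEs, lifts `k` out of `activation_fn_kwargs`. A rough sketch of that conversion for a legacy config; the input values are illustrative, and the module path in the import is an assumption since the file header for this hunk is not visible above:

```python
# Illustrative only: the exact output keys depend on the SAE config classes and
# SAEMetadata fields, which are not part of this diff.
from sae_lens.loading.pretrained_sae_loaders import handle_config_defaulting  # assumed module path

legacy_cfg = {
    "architecture": "topk",
    "d_in": 768,
    "d_sae": 24576,
    "activation_fn_kwargs": {"k": 64},
    "hook_point": "blocks.8.hook_resid_pre",  # legacy key, renamed to hook_name
    "hook_point_layer": 8,                    # legacy key, renamed to hook_layer
    "dtype": "float32",
    "device": "cpu",
}

new_cfg = handle_config_defaulting(legacy_cfg)  # no sae_lens_version -> pre-6.0 path
# Expect roughly: architecture and k at the top level, hook info under "metadata"
print(new_cfg["architecture"], new_cfg.get("k"), new_cfg["metadata"].get("hook_name"))
```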