sae-lens 5.10.3__py3-none-any.whl → 6.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sae_lens/__init__.py +56 -6
- sae_lens/analysis/hooked_sae_transformer.py +12 -12
- sae_lens/analysis/neuronpedia_integration.py +13 -11
- sae_lens/cache_activations_runner.py +2 -1
- sae_lens/config.py +121 -252
- sae_lens/constants.py +18 -0
- sae_lens/evals.py +32 -17
- sae_lens/{toolkit → loading}/pretrained_sae_loaders.py +68 -36
- sae_lens/pretrained_saes.yaml +0 -12
- sae_lens/registry.py +49 -0
- sae_lens/sae_training_runner.py +40 -54
- sae_lens/saes/__init__.py +48 -0
- sae_lens/saes/gated_sae.py +258 -0
- sae_lens/saes/jumprelu_sae.py +354 -0
- sae_lens/saes/sae.py +948 -0
- sae_lens/saes/standard_sae.py +185 -0
- sae_lens/saes/topk_sae.py +294 -0
- sae_lens/training/activations_store.py +32 -16
- sae_lens/training/optim.py +60 -36
- sae_lens/training/sae_trainer.py +55 -86
- sae_lens/training/upload_saes_to_huggingface.py +12 -6
- sae_lens/util.py +28 -0
- {sae_lens-5.10.3.dist-info → sae_lens-6.0.0rc2.dist-info}/METADATA +1 -1
- sae_lens-6.0.0rc2.dist-info/RECORD +35 -0
- sae_lens/sae.py +0 -747
- sae_lens/training/training_sae.py +0 -710
- sae_lens-5.10.3.dist-info/RECORD +0 -28
- /sae_lens/{toolkit → loading}/__init__.py +0 -0
- /sae_lens/{toolkit → loading}/pretrained_saes_directory.py +0 -0
- {sae_lens-5.10.3.dist-info → sae_lens-6.0.0rc2.dist-info}/LICENSE +0 -0
- {sae_lens-5.10.3.dist-info → sae_lens-6.0.0rc2.dist-info}/WHEEL +0 -0
sae_lens/evals.py
CHANGED
@@ -19,8 +19,8 @@ from tqdm import tqdm
 from transformer_lens import HookedTransformer
 from transformer_lens.hook_points import HookedRootModule

-from sae_lens.
-from sae_lens.
+from sae_lens.loading.pretrained_saes_directory import get_pretrained_saes_directory
+from sae_lens.saes.sae import SAE, SAEConfig
 from sae_lens.training.activations_store import ActivationsStore


@@ -100,7 +100,7 @@ def get_eval_everything_config(

 @torch.no_grad()
 def run_evals(
-    sae: SAE,
+    sae: SAE[Any],
     activation_store: ActivationsStore,
     model: HookedRootModule,
     eval_config: EvalConfig = EvalConfig(),
@@ -108,7 +108,7 @@ def run_evals(
     ignore_tokens: set[int | None] = set(),
     verbose: bool = False,
 ) -> tuple[dict[str, Any], dict[str, Any]]:
-    hook_name = sae.cfg.hook_name
+    hook_name = sae.cfg.metadata.hook_name
     actual_batch_size = (
         eval_config.batch_size_prompts or activation_store.store_batch_size_prompts
     )
@@ -274,12 +274,11 @@ def run_evals(
     return all_metrics, feature_metrics


-def get_featurewise_weight_based_metrics(sae: SAE) -> dict[str, Any]:
+def get_featurewise_weight_based_metrics(sae: SAE[Any]) -> dict[str, Any]:
     unit_norm_encoders = (sae.W_enc / sae.W_enc.norm(dim=0, keepdim=True)).cpu()
     unit_norm_decoder = (sae.W_dec.T / sae.W_dec.T.norm(dim=0, keepdim=True)).cpu()

     encoder_norms = sae.W_enc.norm(dim=-2).cpu().tolist()
-    encoder_bias = sae.b_enc.cpu().tolist()
     encoder_decoder_cosine_sim = (
         torch.nn.functional.cosine_similarity(
             unit_norm_decoder.T,
@@ -289,15 +288,17 @@ def get_featurewise_weight_based_metrics(sae: SAE) -> dict[str, Any]:
         .tolist()
     )

-
-        "encoder_bias": encoder_bias,
+    metrics = {
         "encoder_norm": encoder_norms,
         "encoder_decoder_cosine_sim": encoder_decoder_cosine_sim,
     }
+    if hasattr(sae, "b_enc") and sae.b_enc is not None:
+        metrics["encoder_bias"] = sae.b_enc.cpu().tolist()  # type: ignore
+    return metrics


 def get_downstream_reconstruction_metrics(
-    sae: SAE,
+    sae: SAE[Any],
     model: HookedRootModule,
     activation_store: ActivationsStore,
     compute_kl: bool,
@@ -365,7 +366,7 @@ def get_downstream_reconstruction_metrics(


 def get_sparsity_and_variance_metrics(
-    sae: SAE,
+    sae: SAE[Any],
     model: HookedRootModule,
     activation_store: ActivationsStore,
     n_batches: int,
@@ -378,8 +379,8 @@ def get_sparsity_and_variance_metrics(
     ignore_tokens: set[int | None] = set(),
     verbose: bool = False,
 ) -> tuple[dict[str, Any], dict[str, Any]]:
-    hook_name = sae.cfg.hook_name
-    hook_head_index = sae.cfg.hook_head_index
+    hook_name = sae.cfg.metadata.hook_name
+    hook_head_index = sae.cfg.metadata.hook_head_index

     metric_dict = {}
     feature_metric_dict = {}
@@ -435,7 +436,7 @@ def get_sparsity_and_variance_metrics(
             batch_tokens,
             prepend_bos=False,
             names_filter=[hook_name],
-            stop_at_layer=sae.cfg.hook_layer + 1,
+            stop_at_layer=sae.cfg.metadata.hook_layer + 1,
             **model_kwargs,
         )

@@ -579,7 +580,7 @@ def get_sparsity_and_variance_metrics(

 @torch.no_grad()
 def get_recons_loss(
-    sae: SAE,
+    sae: SAE[SAEConfig],
     model: HookedRootModule,
     batch_tokens: torch.Tensor,
     activation_store: ActivationsStore,
@@ -587,9 +588,13 @@ def get_recons_loss(
     compute_ce_loss: bool,
     ignore_tokens: set[int | None] = set(),
     model_kwargs: Mapping[str, Any] = {},
+    hook_name: str | None = None,
 ) -> dict[str, Any]:
-    hook_name = sae.cfg.hook_name
-    head_index = sae.cfg.hook_head_index
+    hook_name = hook_name or sae.cfg.metadata.hook_name
+    head_index = sae.cfg.metadata.hook_head_index
+
+    if hook_name is None:
+        raise ValueError("hook_name must be provided")

     original_logits, original_ce_loss = model(
         batch_tokens, return_type="both", loss_per_token=True, **model_kwargs
@@ -764,6 +769,17 @@ def nested_dict() -> defaultdict[Any, Any]:
     return defaultdict(nested_dict)


+def dict_to_nested(flat_dict: dict[str, Any]) -> defaultdict[Any, Any]:
+    nested = nested_dict()
+    for key, value in flat_dict.items():
+        parts = key.split("/")
+        d = nested
+        for part in parts[:-1]:
+            d = d[part]
+        d[parts[-1]] = value
+    return nested
+
+
 def multiple_evals(
     sae_regex_pattern: str,
     sae_block_pattern: str,
@@ -794,7 +810,6 @@ def multiple_evals(

     current_model = None
     current_model_str = None
-    print(filtered_saes)
     for sae_release_name, sae_id, _, _ in tqdm(filtered_saes):
         sae = SAE.from_pretrained(
             release=sae_release_name,  # see other options in sae_lens/pretrained_saes.yaml
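Two call-site notes on the evals.py changes: hook information now lives on sae.cfg.metadata (hook_name, hook_layer, hook_head_index) rather than directly on sae.cfg, and the new dict_to_nested helper turns flat, slash-separated keys into nested dictionaries. A minimal usage sketch for the helper (the metric keys are illustrative, not taken from this diff; requires sae-lens >= 6.0.0rc2):

from sae_lens.evals import dict_to_nested

# illustrative flat keys; dict_to_nested splits each key on "/" and nests the segments
flat = {"sparsity/l0": 40.0, "variance/explained": 0.87}
nested = dict_to_nested(flat)
assert nested["sparsity"]["l0"] == 40.0
assert nested["variance"]["explained"] == 0.87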
sae_lens/{toolkit → loading}/pretrained_sae_loaders.py
RENAMED

@@ -7,21 +7,24 @@ import numpy as np
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError
+from packaging.version import Version
 from safetensors import safe_open
 from safetensors.torch import load_file

 from sae_lens import logger
-from sae_lens.
+from sae_lens.constants import (
     DTYPE_MAP,
     SAE_CFG_FILENAME,
     SAE_WEIGHTS_FILENAME,
     SPARSITY_FILENAME,
 )
-from sae_lens.
+from sae_lens.loading.pretrained_saes_directory import (
     get_config_overrides,
     get_pretrained_saes_directory,
     get_repo_id_and_folder_name,
 )
+from sae_lens.registry import get_sae_class
+from sae_lens.util import filter_valid_dataclass_fields


 # loaders take in a release, sae_id, device, and whether to force download, and returns a tuple of config, state_dict, and log sparsity
@@ -174,30 +177,68 @@ def get_sae_lens_config_from_disk(


 def handle_config_defaulting(cfg_dict: dict[str, Any]) -> dict[str, Any]:
+    sae_lens_version = cfg_dict.get("sae_lens_version")
+    if not sae_lens_version and "metadata" in cfg_dict:
+        sae_lens_version = cfg_dict["metadata"].get("sae_lens_version")
+
+    if not sae_lens_version or Version(sae_lens_version) < Version("6.0.0-rc.0"):
+        cfg_dict = handle_pre_6_0_config(cfg_dict)
+    return cfg_dict
+
+
+def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
+    """
+    Format a config dictionary for a Sparse Autoencoder (SAE) to be compatible with the new 6.0 format.
+    """
+
+    rename_keys_map = {
+        "hook_point": "hook_name",
+        "hook_point_layer": "hook_layer",
+        "hook_point_head_index": "hook_head_index",
+        "activation_fn_str": "activation_fn",
+    }
+    new_cfg = {rename_keys_map.get(k, k): v for k, v in cfg_dict.items()}
+
     # Set default values for backwards compatibility
-
-
-
-
-
-
-
-
-
-    if "normalize_activations" in
-
+    new_cfg.setdefault("prepend_bos", True)
+    new_cfg.setdefault("dataset_trust_remote_code", True)
+    new_cfg.setdefault("apply_b_dec_to_input", True)
+    new_cfg.setdefault("finetuning_scaling_factor", False)
+    new_cfg.setdefault("sae_lens_training_version", None)
+    new_cfg.setdefault("activation_fn", new_cfg.get("activation_fn", "relu"))
+    new_cfg.setdefault("architecture", "standard")
+    new_cfg.setdefault("neuronpedia_id", None)
+
+    if "normalize_activations" in new_cfg and isinstance(
+        new_cfg["normalize_activations"], bool
     ):
         # backwards compatibility
-
+        new_cfg["normalize_activations"] = (
             "none"
-            if not
+            if not new_cfg["normalize_activations"]
             else "expected_average_only_in"
         )

-
-
+    if new_cfg.get("normalize_activations") is None:
+        new_cfg["normalize_activations"] = "none"

-
+    new_cfg.setdefault("device", "cpu")
+
+    architecture = new_cfg.get("architecture", "standard")
+
+    config_class = get_sae_class(architecture)[1]
+
+    sae_cfg_dict = filter_valid_dataclass_fields(new_cfg, config_class)
+    if architecture == "topk":
+        sae_cfg_dict["k"] = new_cfg["activation_fn_kwargs"]["k"]
+
+    # import here to avoid circular import
+    from sae_lens.saes.sae import SAEMetadata
+
+    meta_dict = filter_valid_dataclass_fields(new_cfg, SAEMetadata)
+    sae_cfg_dict["metadata"] = meta_dict
+    sae_cfg_dict["architecture"] = architecture
+    return sae_cfg_dict


 def get_connor_rob_hook_z_config_from_hf(
@@ -223,7 +264,7 @@ def get_connor_rob_hook_z_config_from_hf(
         "hook_name": old_cfg_dict["act_name"],
         "hook_layer": old_cfg_dict["layer"],
         "hook_head_index": None,
-        "
+        "activation_fn": "relu",
         "apply_b_dec_to_input": True,
         "finetuning_scaling_factor": False,
         "sae_lens_training_version": None,
@@ -372,7 +413,7 @@ def get_gemma_2_config_from_hf(
         "hook_name": hook_name,
         "hook_layer": layer,
         "hook_head_index": None,
-        "
+        "activation_fn": "relu",
         "finetuning_scaling_factor": False,
         "sae_lens_training_version": None,
         "prepend_bos": True,
@@ -473,20 +514,11 @@ def get_llama_scope_config_from_hf(
     # Model specific parameters
     model_name, d_in = "meta-llama/Llama-3.1-8B", old_cfg_dict["d_model"]

-    # Get norm scaling factor to rescale jumprelu threshold.
-    # We need this because sae.fold_activation_norm_scaling_factor folds scaling norm into W_enc.
-    # This requires jumprelu threshold to be scaled in the same way
-    norm_scaling_factor = (
-        d_in**0.5 / old_cfg_dict["dataset_average_activation_norm"]["in"]
-    )
-
     cfg_dict = {
         "architecture": "jumprelu",
-        "jump_relu_threshold": old_cfg_dict["jump_relu_threshold"]
-        * norm_scaling_factor,
+        "jump_relu_threshold": old_cfg_dict["jump_relu_threshold"],
         # We use a scalar jump_relu_threshold for all features
         # This is different from Gemma Scope JumpReLU SAEs.
-        # Scaled with norm_scaling_factor to match sae.fold_activation_norm_scaling_factor
         "d_in": d_in,
         "d_sae": old_cfg_dict["d_sae"],
         "dtype": "bfloat16",
@@ -494,7 +526,7 @@ def get_llama_scope_config_from_hf(
         "hook_name": old_cfg_dict["hook_point_in"],
         "hook_layer": int(old_cfg_dict["hook_point_in"].split(".")[1]),
         "hook_head_index": None,
-        "
+        "activation_fn": "relu",
         "finetuning_scaling_factor": False,
         "sae_lens_training_version": None,
         "prepend_bos": True,
@@ -606,8 +638,8 @@ def get_dictionary_learning_config_1_from_hf(

     hook_point_name = f"blocks.{trainer['layer']}.hook_resid_post"

-
-    activation_fn_kwargs = {"k": trainer["k"]} if
+    activation_fn = "topk" if trainer["dict_class"] == "AutoEncoderTopK" else "relu"
+    activation_fn_kwargs = {"k": trainer["k"]} if activation_fn == "topk" else {}

     return {
         "architecture": (
@@ -621,7 +653,7 @@ def get_dictionary_learning_config_1_from_hf(
         "hook_name": hook_point_name,
         "hook_layer": trainer["layer"],
         "hook_head_index": None,
-        "
+        "activation_fn": activation_fn,
         "activation_fn_kwargs": activation_fn_kwargs,
         "apply_b_dec_to_input": True,
         "finetuning_scaling_factor": False,
@@ -664,7 +696,7 @@ def get_deepseek_r1_config_from_hf(
         "dataset_path": "lmsys/lmsys-chat-1m",
         "dataset_trust_remote_code": True,
         "sae_lens_training_version": None,
-        "
+        "activation_fn": "relu",
         "normalize_activations": "none",
         "device": device,
         "apply_b_dec_to_input": False,
@@ -819,7 +851,7 @@ def get_llama_scope_r1_distill_config_from_hf(
         "hook_name": huggingface_cfg_dict["hook_point_in"],
         "hook_layer": int(huggingface_cfg_dict["hook_point_in"].split(".")[1]),
         "hook_head_index": None,
-        "
+        "activation_fn": "relu",
         "finetuning_scaling_factor": False,
         "sae_lens_training_version": None,
         "prepend_bos": True,
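handle_config_defaulting now gates old configs on their recorded sae_lens_version: anything saved before 6.0.0-rc.0 (or with no version at all) is routed through handle_pre_6_0_config, which renames the legacy keys and splits the result into an architecture config plus a metadata dict. A quick sketch of the version comparison it relies on:

from packaging.version import Version

for saved in (None, "5.10.3", "6.0.0rc2"):
    needs_upgrade = not saved or Version(saved) < Version("6.0.0-rc.0")
    print(saved, needs_upgrade)
# None True, 5.10.3 True  -> routed through handle_pre_6_0_config
# 6.0.0rc2 False          -> already in the 6.0 format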
sae_lens/pretrained_saes.yaml
CHANGED
@@ -13634,51 +13634,39 @@ gemma-2-2b-res-matryoshka-dc:
   - id: blocks.13.hook_resid_post
     path: standard/blocks.13.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/13-res-matryoshka-dc
   - id: blocks.14.hook_resid_post
     path: standard/blocks.14.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/14-res-matryoshka-dc
   - id: blocks.15.hook_resid_post
     path: standard/blocks.15.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/15-res-matryoshka-dc
   - id: blocks.16.hook_resid_post
     path: standard/blocks.16.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/16-res-matryoshka-dc
   - id: blocks.17.hook_resid_post
     path: standard/blocks.17.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/17-res-matryoshka-dc
   - id: blocks.18.hook_resid_post
     path: standard/blocks.18.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/18-res-matryoshka-dc
   - id: blocks.19.hook_resid_post
     path: standard/blocks.19.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/19-res-matryoshka-dc
   - id: blocks.20.hook_resid_post
     path: standard/blocks.20.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/20-res-matryoshka-dc
   - id: blocks.21.hook_resid_post
     path: standard/blocks.21.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/21-res-matryoshka-dc
   - id: blocks.22.hook_resid_post
     path: standard/blocks.22.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/22-res-matryoshka-dc
   - id: blocks.23.hook_resid_post
     path: standard/blocks.23.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/23-res-matryoshka-dc
   - id: blocks.24.hook_resid_post
     path: standard/blocks.24.hook_resid_post
     l0: 40.0
-    neuronpedia: gemma-2-2b/24-res-matryoshka-dc
 gemma-2-2b-res-snap-matryoshka-dc:
   conversion_func: null
   links:
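The only change to pretrained_saes.yaml is the removal of the per-SAE neuronpedia field from the matryoshka releases; id, path and l0 are untouched. Tooling that reads the YAML directly should treat the field as optional, as in this sketch (PyYAML assumed):

import yaml

# shape of one saes entry after this change; the neuronpedia key is simply absent
entry = yaml.safe_load(
    """
- id: blocks.13.hook_resid_post
  path: standard/blocks.13.hook_resid_post
  l0: 40.0
"""
)[0]
print(entry.get("neuronpedia"))  # None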
sae_lens/registry.py
ADDED
@@ -0,0 +1,49 @@
+from typing import TYPE_CHECKING, Any
+
+# avoid circular imports
+if TYPE_CHECKING:
+    from sae_lens.saes.sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
+
+SAE_CLASS_REGISTRY: dict[str, tuple["type[SAE[Any]]", "type[SAEConfig]"]] = {}
+SAE_TRAINING_CLASS_REGISTRY: dict[
+    str, tuple["type[TrainingSAE[Any]]", "type[TrainingSAEConfig]"]
+] = {}
+
+
+def register_sae_class(
+    architecture: str,
+    sae_class: "type[SAE[Any]]",
+    sae_config_class: "type[SAEConfig]",
+) -> None:
+    if architecture in SAE_CLASS_REGISTRY:
+        raise ValueError(
+            f"SAE class for architecture {architecture} already registered."
+        )
+    SAE_CLASS_REGISTRY[architecture] = (sae_class, sae_config_class)
+
+
+def register_sae_training_class(
+    architecture: str,
+    sae_training_class: "type[TrainingSAE[Any]]",
+    sae_training_config_class: "type[TrainingSAEConfig]",
+) -> None:
+    if architecture in SAE_TRAINING_CLASS_REGISTRY:
+        raise ValueError(
+            f"SAE training class for architecture {architecture} already registered."
+        )
+    SAE_TRAINING_CLASS_REGISTRY[architecture] = (
+        sae_training_class,
+        sae_training_config_class,
+    )
+
+
+def get_sae_class(
+    architecture: str,
+) -> tuple["type[SAE[Any]]", "type[SAEConfig]"]:
+    return SAE_CLASS_REGISTRY[architecture]
+
+
+def get_sae_training_class(
+    architecture: str,
+) -> tuple["type[TrainingSAE[Any]]", "type[TrainingSAEConfig]"]:
+    return SAE_TRAINING_CLASS_REGISTRY[architecture]
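The new registry maps an architecture string to its (SAE class, config class) pair, and likewise for training classes. A hedged usage sketch, assuming the built-in architectures register themselves when sae_lens.saes is imported and use the same architecture names ("standard", "jumprelu", "topk", and presumably "gated") seen elsewhere in this diff:

import sae_lens.saes  # assumed to trigger registration of the built-in architectures
from sae_lens.registry import get_sae_class, get_sae_training_class

sae_cls, sae_cfg_cls = get_sae_class("standard")
train_cls, train_cfg_cls = get_sae_training_class("topk")
print(sae_cls.__name__, train_cls.__name__)  # expected: StandardSAE, TopKTrainingSAE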
sae_lens/sae_training_runner.py
CHANGED
@@ -7,16 +7,18 @@ from typing import Any, cast

 import torch
 import wandb
+from safetensors.torch import save_file
 from simple_parsing import ArgumentParser
 from transformer_lens.hook_points import HookedRootModule

 from sae_lens import logger
 from sae_lens.config import HfDataset, LanguageModelSAERunnerConfig
+from sae_lens.constants import RUNNER_CFG_FILENAME, SPARSITY_FILENAME
 from sae_lens.load_model import load_model
+from sae_lens.saes.sae import T_TRAINING_SAE_CONFIG, TrainingSAE, TrainingSAEConfig
 from sae_lens.training.activations_store import ActivationsStore
 from sae_lens.training.geometric_median import compute_geometric_median
 from sae_lens.training.sae_trainer import SAETrainer
-from sae_lens.training.training_sae import TrainingSAE, TrainingSAEConfig


 class InterruptedException(Exception):
@@ -32,17 +34,17 @@ class SAETrainingRunner:
     Class to run the training of a Sparse Autoencoder (SAE) on a TransformerLens model.
     """

-    cfg: LanguageModelSAERunnerConfig
+    cfg: LanguageModelSAERunnerConfig[Any]
     model: HookedRootModule
-    sae: TrainingSAE
+    sae: TrainingSAE[Any]
     activations_store: ActivationsStore

     def __init__(
         self,
-        cfg: LanguageModelSAERunnerConfig,
+        cfg: LanguageModelSAERunnerConfig[T_TRAINING_SAE_CONFIG],
         override_dataset: HfDataset | None = None,
         override_model: HookedRootModule | None = None,
-        override_sae: TrainingSAE | None = None,
+        override_sae: TrainingSAE[Any] | None = None,
     ):
         if override_dataset is not None:
             logger.warning(
@@ -73,14 +75,14 @@ class SAETrainingRunner:

         if override_sae is None:
             if self.cfg.from_pretrained_path is not None:
-                self.sae = TrainingSAE.
+                self.sae = TrainingSAE.load_from_disk(
                     self.cfg.from_pretrained_path, self.cfg.device
                 )
             else:
-                self.sae = TrainingSAE(
+                self.sae = TrainingSAE.from_dict(
                     TrainingSAEConfig.from_dict(
                         self.cfg.get_training_sae_cfg_dict(),
-                    )
+                    ).to_dict()
                 )
                 self._init_sae_group_b_decs()
         else:
@@ -91,13 +93,13 @@ class SAETrainingRunner:
         Run the training of the SAE.
         """

-        if self.cfg.log_to_wandb:
+        if self.cfg.logger.log_to_wandb:
             wandb.init(
-                project=self.cfg.wandb_project,
-                entity=self.cfg.wandb_entity,
+                project=self.cfg.logger.wandb_project,
+                entity=self.cfg.logger.wandb_entity,
                 config=cast(Any, self.cfg),
-                name=self.cfg.run_name,
-                id=self.cfg.wandb_id,
+                name=self.cfg.logger.run_name,
+                id=self.cfg.logger.wandb_id,
             )

         trainer = SAETrainer(
@@ -111,7 +113,7 @@ class SAETrainingRunner:
         self._compile_if_needed()
         sae = self.run_trainer_with_interruption_handling(trainer)

-        if self.cfg.log_to_wandb:
+        if self.cfg.logger.log_to_wandb:
             wandb.finish()

         return sae
@@ -141,7 +143,9 @@ class SAETrainingRunner:
             backend=backend,
         )  # type: ignore

-    def run_trainer_with_interruption_handling(
+    def run_trainer_with_interruption_handling(
+        self, trainer: SAETrainer[TrainingSAE[TrainingSAEConfig], TrainingSAEConfig]
+    ):
         try:
             # signal handlers (if preempted)
             signal.signal(signal.SIGINT, interrupt_callback)
@@ -167,7 +171,7 @@ class SAETrainingRunner:
         extract all activations at a certain layer and use for sae b_dec initialization
         """

-        if self.cfg.b_dec_init_method == "geometric_median":
+        if self.cfg.sae.b_dec_init_method == "geometric_median":
             self.activations_store.set_norm_scaling_factor_if_needed()
             layer_acts = self.activations_store.storage_buffer.detach()[:, 0, :]
             # get geometric median of the activations if we're using those.
@@ -175,15 +179,15 @@ class SAETrainingRunner:
                 layer_acts,
                 maxiter=100,
             ).median
-            self.sae.initialize_b_dec_with_precalculated(median)
-        elif self.cfg.b_dec_init_method == "mean":
+            self.sae.initialize_b_dec_with_precalculated(median)
+        elif self.cfg.sae.b_dec_init_method == "mean":
             self.activations_store.set_norm_scaling_factor_if_needed()
             layer_acts = self.activations_store.storage_buffer.detach().cpu()[:, 0, :]
             self.sae.initialize_b_dec_with_mean(layer_acts)  # type: ignore

     @staticmethod
     def save_checkpoint(
-        trainer: SAETrainer,
+        trainer: SAETrainer[TrainingSAE[Any], Any],
         checkpoint_name: str,
         wandb_aliases: list[str] | None = None,
     ) -> None:
@@ -194,46 +198,28 @@ class SAETrainingRunner:
             str(base_path / "activations_store_state.safetensors")
         )

-
-        trainer.sae.set_decoder_norm_to_unit_norm()
+        weights_path, cfg_path = trainer.sae.save_model(str(base_path))

-
-
-            trainer.log_feature_sparsity,
-        )
+        sparsity_path = base_path / SPARSITY_FILENAME
+        save_file({"sparsity": trainer.log_feature_sparsity}, sparsity_path)

-
-
-
-
-
-
-
-
-
-
-
-            # save model weights and cfg
-            model_artifact = wandb.Artifact(
-                sae_name,
-                type="model",
-                metadata=dict(trainer.cfg.__dict__),
-            )
-            model_artifact.add_file(str(weights_path))
-            model_artifact.add_file(str(cfg_path))
-            wandb.log_artifact(model_artifact, aliases=wandb_aliases)
-
-            # save log feature sparsity
-            sparsity_artifact = wandb.Artifact(
-                f"{sae_name}_log_feature_sparsity",
-                type="log_feature_sparsity",
-                metadata=dict(trainer.cfg.__dict__),
+        runner_config = trainer.cfg.to_dict()
+        with open(base_path / RUNNER_CFG_FILENAME, "w") as f:
+            json.dump(runner_config, f)
+
+        if trainer.cfg.logger.log_to_wandb:
+            trainer.cfg.logger.log(
+                trainer,
+                weights_path,
+                cfg_path,
+                sparsity_path=sparsity_path,
+                wandb_aliases=wandb_aliases,
             )
-            sparsity_artifact.add_file(str(sparsity_path))
-            wandb.log_artifact(sparsity_artifact)


-def _parse_cfg_args(
+def _parse_cfg_args(
+    args: Sequence[str],
+) -> LanguageModelSAERunnerConfig[TrainingSAEConfig]:
     if len(args) == 0:
         args = ["--help"]
     parser = ArgumentParser(exit_on_error=False)
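save_checkpoint now writes the SAE via trainer.sae.save_model, the feature sparsity as a safetensors file, and the full runner config as JSON, instead of hand-rolling wandb artifacts (artifact logging moves behind cfg.logger.log). A hedged sketch of reading those files back from a checkpoint directory (the directory path is illustrative; the file names come from sae_lens.constants):

import json

from safetensors.torch import load_file

from sae_lens.constants import RUNNER_CFG_FILENAME, SPARSITY_FILENAME

checkpoint_dir = "checkpoints/final_123"  # illustrative path
sparsity = load_file(f"{checkpoint_dir}/{SPARSITY_FILENAME}")["sparsity"]
with open(f"{checkpoint_dir}/{RUNNER_CFG_FILENAME}") as f:
    runner_cfg = json.load(f)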
sae_lens/saes/__init__.py
ADDED

@@ -0,0 +1,48 @@
+from .gated_sae import (
+    GatedSAE,
+    GatedSAEConfig,
+    GatedTrainingSAE,
+    GatedTrainingSAEConfig,
+)
+from .jumprelu_sae import (
+    JumpReLUSAE,
+    JumpReLUSAEConfig,
+    JumpReLUTrainingSAE,
+    JumpReLUTrainingSAEConfig,
+)
+from .sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
+from .standard_sae import (
+    StandardSAE,
+    StandardSAEConfig,
+    StandardTrainingSAE,
+    StandardTrainingSAEConfig,
+)
+from .topk_sae import (
+    TopKSAE,
+    TopKSAEConfig,
+    TopKTrainingSAE,
+    TopKTrainingSAEConfig,
+)
+
+__all__ = [
+    "SAE",
+    "SAEConfig",
+    "TrainingSAE",
+    "TrainingSAEConfig",
+    "StandardSAE",
+    "StandardSAEConfig",
+    "StandardTrainingSAE",
+    "StandardTrainingSAEConfig",
+    "GatedSAE",
+    "GatedSAEConfig",
+    "GatedTrainingSAE",
+    "GatedTrainingSAEConfig",
+    "JumpReLUSAE",
+    "JumpReLUSAEConfig",
+    "JumpReLUTrainingSAE",
+    "JumpReLUTrainingSAEConfig",
+    "TopKSAE",
+    "TopKSAEConfig",
+    "TopKTrainingSAE",
+    "TopKTrainingSAEConfig",
+]
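sae_lens.saes now exposes a matched inference/training class pair per architecture (Standard, Gated, JumpReLU, TopK) alongside the SAE and TrainingSAE base classes, replacing the old monolithic sae_lens/sae.py and training_sae.py. A small hedged sketch of the new import surface; that the concrete classes subclass the generic bases is an assumption, though it is implied by the registry's type[SAE[Any]] / type[TrainingSAE[Any]] signatures:

from sae_lens.saes import SAE, StandardSAE, StandardTrainingSAE, TrainingSAE

# assumed subclass relationships between the per-architecture classes and the bases
assert issubclass(StandardSAE, SAE)
assert issubclass(StandardTrainingSAE, TrainingSAE)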