sae-lens 6.0.0rc4__tar.gz → 6.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/PKG-INFO +2 -2
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/README.md +1 -1
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/pyproject.toml +1 -1
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/__init__.py +8 -1
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/config.py +0 -23
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/constants.py +1 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/evals.py +0 -11
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/loading/pretrained_sae_loaders.py +154 -2
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/pretrained_saes.yaml +12 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/__init__.py +6 -0
- sae_lens-6.1.0/sae_lens/saes/batchtopk_sae.py +102 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/gated_sae.py +13 -20
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/jumprelu_sae.py +1 -37
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/sae.py +71 -49
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/standard_sae.py +1 -9
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/topk_sae.py +18 -48
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/activations_store.py +0 -15
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/optim.py +0 -33
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/sae_trainer.py +4 -2
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/LICENSE +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/analysis/__init__.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/analysis/hooked_sae_transformer.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/analysis/neuronpedia_integration.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/cache_activations_runner.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/llm_sae_training_runner.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/load_model.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/loading/__init__.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/loading/pretrained_saes_directory.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/pretokenize_runner.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/registry.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/tokenization_and_batching.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/__init__.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/activation_scaler.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/mixing_buffer.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/types.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/upload_saes_to_huggingface.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/tutorial/tsea.py +0 -0
- {sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/util.py +0 -0
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: sae-lens
-Version: 6.0.0rc4
+Version: 6.1.0
 Summary: Training and Analyzing Sparse Autoencoders (SAEs)
 License: MIT
 Keywords: deep-learning,sparse-autoencoders,mechanistic-interpretability,PyTorch
@@ -80,7 +80,7 @@ Pre-trained SAEs for various models can be imported via SAE Lens. See this [page

 ## Join the Slack!

-Feel free to join the [Open Source Mechanistic Interpretability Slack](https://join.slack.com/t/opensourcemechanistic/shared_invite/zt-
+Feel free to join the [Open Source Mechanistic Interpretability Slack](https://join.slack.com/t/opensourcemechanistic/shared_invite/zt-375zalm04-GFd5tdBU1yLKlu_T_JSqZQ) for support!

 ## Citation

{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/README.md

@@ -40,7 +40,7 @@ Pre-trained SAEs for various models can be imported via SAE Lens. See this [page

 ## Join the Slack!

-Feel free to join the [Open Source Mechanistic Interpretability Slack](https://join.slack.com/t/opensourcemechanistic/shared_invite/zt-
+Feel free to join the [Open Source Mechanistic Interpretability Slack](https://join.slack.com/t/opensourcemechanistic/shared_invite/zt-375zalm04-GFd5tdBU1yLKlu_T_JSqZQ) for support!

 ## Citation

{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/__init__.py

@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.0.0rc4"
+__version__ = "6.1.0"

 import logging

@@ -7,6 +7,8 @@ logger = logging.getLogger(__name__)

 from sae_lens.saes import (
     SAE,
+    BatchTopKTrainingSAE,
+    BatchTopKTrainingSAEConfig,
     GatedSAE,
     GatedSAEConfig,
     GatedTrainingSAE,
@@ -85,6 +87,8 @@ __all__ = [
     "JumpReLUTrainingSAEConfig",
     "SAETrainingRunner",
     "LoggingConfig",
+    "BatchTopKTrainingSAE",
+    "BatchTopKTrainingSAEConfig",
 ]


@@ -96,3 +100,6 @@ register_sae_class("topk", TopKSAE, TopKSAEConfig)
 register_sae_training_class("topk", TopKTrainingSAE, TopKTrainingSAEConfig)
 register_sae_class("jumprelu", JumpReLUSAE, JumpReLUSAEConfig)
 register_sae_training_class("jumprelu", JumpReLUTrainingSAE, JumpReLUTrainingSAEConfig)
+register_sae_training_class(
+    "batchtopk", BatchTopKTrainingSAE, BatchTopKTrainingSAEConfig
+)
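The new BatchTopK training classes are now part of the public API and are registered under the "batchtopk" training architecture. A minimal, illustrative import using only what the hunks above add:

```python
# Illustrative: new public exports in sae-lens 6.1.0
from sae_lens import BatchTopKTrainingSAE, BatchTopKTrainingSAEConfig
```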
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/config.py

@@ -1,6 +1,5 @@
 import json
 import math
-import os
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar, cast
@@ -353,28 +352,6 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
         d["act_store_device"] = str(self.act_store_device)
         return d

-    def to_json(self, path: str) -> None:
-        if not os.path.exists(os.path.dirname(path)):
-            os.makedirs(os.path.dirname(path))
-
-        with open(path + "cfg.json", "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
-
-    @classmethod
-    def from_json(cls, path: str) -> "LanguageModelSAERunnerConfig[Any]":
-        with open(path + "cfg.json") as f:
-            cfg = json.load(f)
-
-        # ensure that seqpos slices is a tuple
-        # Ensure seqpos_slice is a tuple
-        if "seqpos_slice" in cfg:
-            if isinstance(cfg["seqpos_slice"], list):
-                cfg["seqpos_slice"] = tuple(cfg["seqpos_slice"])
-            elif not isinstance(cfg["seqpos_slice"], tuple):
-                cfg["seqpos_slice"] = (cfg["seqpos_slice"],)
-
-        return cls(**cfg)
-
     def to_sae_trainer_config(self) -> "SAETrainerConfig":
         return SAETrainerConfig(
             n_checkpoints=self.n_checkpoints,
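With `to_json`/`from_json` gone from `LanguageModelSAERunnerConfig`, a config can still be round-tripped through `to_dict()`, which remains in 6.1.0. A minimal sketch, assuming only that `to_dict()` returns JSON-serializable values and mirroring the `seqpos_slice` tuple coercion the removed `from_json` performed; the helper names here are hypothetical:

```python
import json
from pathlib import Path

def save_runner_cfg(cfg, path: str) -> None:
    # cfg: a LanguageModelSAERunnerConfig; to_dict() still exists in 6.1.0
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as f:
        json.dump(cfg.to_dict(), f, indent=2)

def load_runner_cfg_dict(path: str) -> dict:
    with open(path) as f:
        cfg = json.load(f)
    # mirror the tuple coercion the removed from_json applied
    if isinstance(cfg.get("seqpos_slice"), list):
        cfg["seqpos_slice"] = tuple(cfg["seqpos_slice"])
    return cfg
```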
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/constants.py

@@ -16,5 +16,6 @@ SPARSITY_FILENAME = "sparsity.safetensors"
 SAE_WEIGHTS_FILENAME = "sae_weights.safetensors"
 SAE_CFG_FILENAME = "cfg.json"
 RUNNER_CFG_FILENAME = "runner_cfg.json"
+SPARSIFY_WEIGHTS_FILENAME = "sae.safetensors"
 ACTIVATIONS_STORE_STATE_FILENAME = "activations_store_state.safetensors"
 ACTIVATION_SCALER_CFG_FILENAME = "activation_scaler.json"
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/evals.py

@@ -769,17 +769,6 @@ def nested_dict() -> defaultdict[Any, Any]:
     return defaultdict(nested_dict)


-def dict_to_nested(flat_dict: dict[str, Any]) -> defaultdict[Any, Any]:
-    nested = nested_dict()
-    for key, value in flat_dict.items():
-        parts = key.split("/")
-        d = nested
-        for part in parts[:-1]:
-            d = d[part]
-        d[parts[-1]] = value
-    return nested
-
-
 def multiple_evals(
     sae_regex_pattern: str,
     sae_block_pattern: str,
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/loading/pretrained_sae_loaders.py

@@ -16,6 +16,7 @@ from sae_lens.constants import (
     DTYPE_MAP,
     SAE_CFG_FILENAME,
     SAE_WEIGHTS_FILENAME,
+    SPARSIFY_WEIGHTS_FILENAME,
     SPARSITY_FILENAME,
 )
 from sae_lens.loading.pretrained_saes_directory import (
@@ -248,7 +249,7 @@ def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
     config_class = get_sae_class(architecture)[1]

     sae_cfg_dict = filter_valid_dataclass_fields(new_cfg, config_class)
-    if architecture == "topk":
+    if architecture == "topk" and "activation_fn_kwargs" in new_cfg:
         sae_cfg_dict["k"] = new_cfg["activation_fn_kwargs"]["k"]

     sae_cfg_dict["metadata"] = {
@@ -530,11 +531,20 @@ def get_llama_scope_config_from_hf(
     # Model specific parameters
     model_name, d_in = "meta-llama/Llama-3.1-8B", old_cfg_dict["d_model"]

+    # Get norm scaling factor to rescale jumprelu threshold.
+    # We need this because sae.fold_activation_norm_scaling_factor folds scaling norm into W_enc.
+    # This requires jumprelu threshold to be scaled in the same way
+    norm_scaling_factor = (
+        d_in**0.5 / old_cfg_dict["dataset_average_activation_norm"]["in"]
+    )
+
     cfg_dict = {
         "architecture": "jumprelu",
-        "jump_relu_threshold": old_cfg_dict["jump_relu_threshold"]
+        "jump_relu_threshold": old_cfg_dict["jump_relu_threshold"]
+        * norm_scaling_factor,
         # We use a scalar jump_relu_threshold for all features
         # This is different from Gemma Scope JumpReLU SAEs.
+        # Scaled with norm_scaling_factor to match sae.fold_activation_norm_scaling_factor
         "d_in": d_in,
         "d_sae": old_cfg_dict["d_sae"],
         "dtype": "bfloat16",
@@ -942,6 +952,146 @@ def llama_scope_r1_distill_sae_huggingface_loader(
     return cfg_dict, state_dict, log_sparsity


+def get_sparsify_config_from_hf(
+    repo_id: str,
+    folder_name: str,
+    device: str,
+    force_download: bool = False,
+    cfg_overrides: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    cfg_filename = f"{folder_name}/{SAE_CFG_FILENAME}"
+    cfg_path = hf_hub_download(
+        repo_id,
+        filename=cfg_filename,
+        force_download=force_download,
+    )
+    sae_path = Path(cfg_path).parent
+    return get_sparsify_config_from_disk(
+        sae_path, device=device, cfg_overrides=cfg_overrides
+    )
+
+
+def get_sparsify_config_from_disk(
+    path: str | Path,
+    device: str | None = None,
+    cfg_overrides: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    path = Path(path)
+
+    with open(path / SAE_CFG_FILENAME) as f:
+        old_cfg_dict = json.load(f)
+
+    config_path = path.parent / "config.json"
+    if config_path.exists():
+        with open(config_path) as f:
+            config_dict = json.load(f)
+    else:
+        config_dict = {}
+
+    folder_name = path.name
+    if folder_name == "embed_tokens":
+        hook_name, layer = "hook_embed", 0
+    else:
+        match = re.search(r"layers[._](\d+)", folder_name)
+        if match is None:
+            raise ValueError(f"Unrecognized Sparsify folder: {folder_name}")
+        layer = int(match.group(1))
+        hook_name = f"blocks.{layer}.hook_resid_post"
+
+    cfg_dict: dict[str, Any] = {
+        "architecture": "standard",
+        "d_in": old_cfg_dict["d_in"],
+        "d_sae": old_cfg_dict["d_in"] * old_cfg_dict["expansion_factor"],
+        "dtype": "bfloat16",
+        "device": device or "cpu",
+        "model_name": config_dict.get("model", path.parts[-2]),
+        "hook_name": hook_name,
+        "hook_layer": layer,
+        "hook_head_index": None,
+        "activation_fn_str": "topk",
+        "activation_fn_kwargs": {
+            "k": old_cfg_dict["k"],
+            "signed": old_cfg_dict.get("signed", False),
+        },
+        "apply_b_dec_to_input": not old_cfg_dict.get("normalize_decoder", False),
+        "dataset_path": config_dict.get(
+            "dataset", "togethercomputer/RedPajama-Data-1T-Sample"
+        ),
+        "context_size": config_dict.get("ctx_len", 2048),
+        "finetuning_scaling_factor": False,
+        "sae_lens_training_version": None,
+        "prepend_bos": True,
+        "dataset_trust_remote_code": True,
+        "normalize_activations": "none",
+        "neuronpedia_id": None,
+    }
+
+    if cfg_overrides:
+        cfg_dict.update(cfg_overrides)
+
+    return cfg_dict
+
+
+def sparsify_huggingface_loader(
+    repo_id: str,
+    folder_name: str,
+    device: str = "cpu",
+    force_download: bool = False,
+    cfg_overrides: dict[str, Any] | None = None,
+) -> tuple[dict[str, Any], dict[str, torch.Tensor], None]:
+    weights_filename = f"{folder_name}/{SPARSIFY_WEIGHTS_FILENAME}"
+    sae_path = hf_hub_download(
+        repo_id,
+        filename=weights_filename,
+        force_download=force_download,
+    )
+    cfg_dict, state_dict = sparsify_disk_loader(
+        Path(sae_path).parent, device=device, cfg_overrides=cfg_overrides
+    )
+    return cfg_dict, state_dict, None
+
+
+def sparsify_disk_loader(
+    path: str | Path,
+    device: str = "cpu",
+    cfg_overrides: dict[str, Any] | None = None,
+) -> tuple[dict[str, Any], dict[str, torch.Tensor]]:
+    cfg_dict = get_sparsify_config_from_disk(path, device, cfg_overrides)
+
+    weight_path = Path(path) / SPARSIFY_WEIGHTS_FILENAME
+    state_dict_loaded = load_file(weight_path, device=device)
+
+    dtype = DTYPE_MAP[cfg_dict["dtype"]]
+
+    W_enc = (
+        state_dict_loaded["W_enc"]
+        if "W_enc" in state_dict_loaded
+        else state_dict_loaded["encoder.weight"].T
+    ).to(dtype)
+
+    if "W_dec" in state_dict_loaded:
+        W_dec = state_dict_loaded["W_dec"].T.to(dtype)
+    else:
+        W_dec = state_dict_loaded["decoder.weight"].T.to(dtype)
+
+    if "b_enc" in state_dict_loaded:
+        b_enc = state_dict_loaded["b_enc"].to(dtype)
+    elif "encoder.bias" in state_dict_loaded:
+        b_enc = state_dict_loaded["encoder.bias"].to(dtype)
+    else:
+        b_enc = torch.zeros(cfg_dict["d_sae"], dtype=dtype, device=device)
+
+    if "b_dec" in state_dict_loaded:
+        b_dec = state_dict_loaded["b_dec"].to(dtype)
+    elif "decoder.bias" in state_dict_loaded:
+        b_dec = state_dict_loaded["decoder.bias"].to(dtype)
+    else:
+        b_dec = torch.zeros(cfg_dict["d_in"], dtype=dtype, device=device)
+
+    state_dict = {"W_enc": W_enc, "b_enc": b_enc, "W_dec": W_dec, "b_dec": b_dec}
+    return cfg_dict, state_dict
+
+
 NAMED_PRETRAINED_SAE_LOADERS: dict[str, PretrainedSaeHuggingfaceLoader] = {
     "sae_lens": sae_lens_huggingface_loader,
     "connor_rob_hook_z": connor_rob_hook_z_huggingface_loader,
@@ -950,6 +1100,7 @@ NAMED_PRETRAINED_SAE_LOADERS: dict[str, PretrainedSaeHuggingfaceLoader] = {
     "llama_scope_r1_distill": llama_scope_r1_distill_sae_huggingface_loader,
     "dictionary_learning_1": dictionary_learning_sae_huggingface_loader_1,
     "deepseek_r1": deepseek_r1_sae_huggingface_loader,
+    "sparsify": sparsify_huggingface_loader,
 }


@@ -961,4 +1112,5 @@ NAMED_PRETRAINED_SAE_CONFIG_GETTERS: dict[str, PretrainedSaeConfigHuggingfaceLoa
     "llama_scope_r1_distill": get_llama_scope_r1_distill_config_from_hf,
     "dictionary_learning_1": get_dictionary_learning_config_1_from_hf,
     "deepseek_r1": get_deepseek_r1_config_from_hf,
+    "sparsify": get_sparsify_config_from_hf,
 }
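The new loader can also be called directly. A minimal sketch; the `repo_id` and `folder_name` below are hypothetical placeholders for a repository in the EleutherAI sparsify layout, and the call downloads from the Hugging Face Hub:

```python
# Illustrative use of the "sparsify" loader added in 6.1.0.
from sae_lens.loading.pretrained_sae_loaders import sparsify_huggingface_loader

cfg_dict, state_dict, _ = sparsify_huggingface_loader(
    repo_id="EleutherAI/example-sparsify-saes",  # hypothetical repo
    folder_name="layers.5",  # parsed into hook blocks.5.hook_resid_post
    device="cpu",
)
print(cfg_dict["architecture"], state_dict["W_enc"].shape)
```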
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/pretrained_saes.yaml

@@ -13634,39 +13634,51 @@ gemma-2-2b-res-matryoshka-dc:
   - id: blocks.13.hook_resid_post
     path: standard/blocks.13.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/13-res-matryoshka-dc
   - id: blocks.14.hook_resid_post
     path: standard/blocks.14.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/14-res-matryoshka-dc
   - id: blocks.15.hook_resid_post
     path: standard/blocks.15.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/15-res-matryoshka-dc
   - id: blocks.16.hook_resid_post
     path: standard/blocks.16.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/16-res-matryoshka-dc
   - id: blocks.17.hook_resid_post
     path: standard/blocks.17.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/17-res-matryoshka-dc
   - id: blocks.18.hook_resid_post
     path: standard/blocks.18.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/18-res-matryoshka-dc
   - id: blocks.19.hook_resid_post
     path: standard/blocks.19.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/19-res-matryoshka-dc
   - id: blocks.20.hook_resid_post
     path: standard/blocks.20.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/20-res-matryoshka-dc
   - id: blocks.21.hook_resid_post
     path: standard/blocks.21.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/21-res-matryoshka-dc
   - id: blocks.22.hook_resid_post
     path: standard/blocks.22.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/22-res-matryoshka-dc
   - id: blocks.23.hook_resid_post
     path: standard/blocks.23.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/23-res-matryoshka-dc
   - id: blocks.24.hook_resid_post
     path: standard/blocks.24.hook_resid_post
     l0: 40.0
+    neuronpedia: gemma-2-2b/24-res-matryoshka-dc
 gemma-2-2b-res-snap-matryoshka-dc:
   conversion_func: null
   links:
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/__init__.py

@@ -1,3 +1,7 @@
+from .batchtopk_sae import (
+    BatchTopKTrainingSAE,
+    BatchTopKTrainingSAEConfig,
+)
 from .gated_sae import (
     GatedSAE,
     GatedSAEConfig,
@@ -45,4 +49,6 @@ __all__ = [
     "TopKSAEConfig",
     "TopKTrainingSAE",
     "TopKTrainingSAEConfig",
+    "BatchTopKTrainingSAE",
+    "BatchTopKTrainingSAEConfig",
 ]
sae_lens-6.1.0/sae_lens/saes/batchtopk_sae.py (new file)

@@ -0,0 +1,102 @@
+from dataclasses import dataclass
+from typing import Any, Callable
+
+import torch
+import torch.nn as nn
+from typing_extensions import override
+
+from sae_lens.saes.jumprelu_sae import JumpReLUSAEConfig
+from sae_lens.saes.sae import SAEConfig, TrainStepInput, TrainStepOutput
+from sae_lens.saes.topk_sae import TopKTrainingSAE, TopKTrainingSAEConfig
+
+
+class BatchTopK(nn.Module):
+    """BatchTopK activation function"""
+
+    def __init__(
+        self,
+        k: int,
+    ):
+        super().__init__()
+        self.k = k
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        acts = x.relu()
+        flat_acts = acts.flatten()
+        acts_topk_flat = torch.topk(flat_acts, self.k * acts.shape[0], dim=-1)
+        return (
+            torch.zeros_like(flat_acts)
+            .scatter(-1, acts_topk_flat.indices, acts_topk_flat.values)
+            .reshape(acts.shape)
+        )
+
+
+@dataclass
+class BatchTopKTrainingSAEConfig(TopKTrainingSAEConfig):
+    """
+    Configuration class for training a BatchTopKTrainingSAE.
+    """
+
+    topk_threshold_lr: float = 0.01
+
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "batchtopk"
+
+    @override
+    def get_inference_config_class(self) -> type[SAEConfig]:
+        return JumpReLUSAEConfig
+
+
+class BatchTopKTrainingSAE(TopKTrainingSAE):
+    """
+    Global Batch TopK Training SAE
+
+    This SAE will maintain the k on average across the batch, rather than enforcing the k per-sample as in standard TopK.
+
+    BatchTopK SAEs are saved as JumpReLU SAEs after training.
+    """
+
+    topk_threshold: torch.Tensor
+    cfg: BatchTopKTrainingSAEConfig  # type: ignore[assignment]
+
+    def __init__(self, cfg: BatchTopKTrainingSAEConfig, use_error_term: bool = False):
+        super().__init__(cfg, use_error_term)
+
+        self.register_buffer(
+            "topk_threshold",
+            # use double precision as otherwise we can run into numerical issues
+            torch.tensor(0.0, dtype=torch.double, device=self.W_dec.device),
+        )
+
+    def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
+        return BatchTopK(self.cfg.k)
+
+    @override
+    def training_forward_pass(self, step_input: TrainStepInput) -> TrainStepOutput:
+        output = super().training_forward_pass(step_input)
+        self.update_topk_threshold(output.feature_acts)
+        output.metrics["topk_threshold"] = self.topk_threshold
+        return output
+
+    @torch.no_grad()
+    def update_topk_threshold(self, acts_topk: torch.Tensor) -> None:
+        positive_mask = acts_topk > 0
+        lr = self.cfg.topk_threshold_lr
+        # autocast can cause numerical issues with the threshold update
+        with torch.autocast(self.topk_threshold.device.type, enabled=False):
+            if positive_mask.any():
+                min_positive = (
+                    acts_topk[positive_mask].min().to(self.topk_threshold.dtype)
+                )
+                self.topk_threshold = (1 - lr) * self.topk_threshold + lr * min_positive
+
+    @override
+    def process_state_dict_for_saving_inference(
+        self, state_dict: dict[str, Any]
+    ) -> None:
+        super().process_state_dict_for_saving_inference(state_dict)
+        # turn the topk threshold into jumprelu threshold
+        topk_threshold = state_dict.pop("topk_threshold").item()
+        state_dict["threshold"] = torch.ones_like(self.b_enc) * topk_threshold
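The BatchTopK activation keeps roughly k active features per sample on average: it selects the k * batch_size largest post-ReLU values across the flattened batch, so individual rows may keep more or fewer than k. A small, self-contained sketch of that behaviour using the module defined above:

```python
import torch
from sae_lens.saes.batchtopk_sae import BatchTopK

acts = torch.randn(4, 16)       # batch of 4 samples, 16 features
out = BatchTopK(k=3)(acts)      # keeps the 3 * 4 = 12 largest values across the whole batch
print((out != 0).sum().item())  # at most 12 nonzeros in total; per-row counts can vary
```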
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/gated_sae.py

@@ -15,7 +15,6 @@ from sae_lens.saes.sae import (
     TrainingSAEConfig,
     TrainStepInput,
 )
-from sae_lens.util import filter_valid_dataclass_fields


 @dataclass
@@ -100,16 +99,10 @@ class GatedSAE(SAE[GatedSAEConfig]):
         self.W_enc.data = self.W_enc.data * W_dec_norms.T

         # Gated-specific parameters need special handling
-
+        # r_mag doesn't need scaling since W_enc scaling is sufficient for magnitude path
         self.b_gate.data = self.b_gate.data * W_dec_norms.squeeze()
         self.b_mag.data = self.b_mag.data * W_dec_norms.squeeze()

-    @torch.no_grad()
-    def initialize_decoder_norm_constant_norm(self, norm: float = 0.1):
-        """Initialize decoder with constant norm."""
-        self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
-        self.W_dec.data *= norm
-

 @dataclass
 class GatedTrainingSAEConfig(TrainingSAEConfig):
@@ -133,7 +126,7 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
     - initialize_weights: sets up gating parameters (as in GatedSAE) plus optional training-specific init.
     - encode: calls encode_with_hidden_pre (standard training approach).
     - decode: linear transformation + hooking, same as GatedSAE or StandardTrainingSAE.
-    - encode_with_hidden_pre: gating logic
+    - encode_with_hidden_pre: gating logic.
     - calculate_aux_loss: includes an auxiliary reconstruction path and gating-based sparsity penalty.
     - training_forward_pass: calls encode_with_hidden_pre, decode, and sums up MSE + gating losses.
     """
@@ -158,7 +151,6 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
     ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
         """
         Gated forward pass with pre-activation (for training).
-        We also inject noise if self.training is True.
         """
         sae_in = self.process_sae_in(x)

@@ -219,12 +211,6 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
             "weights/b_mag": b_mag_dist,
         }

-    @torch.no_grad()
-    def initialize_decoder_norm_constant_norm(self, norm: float = 0.1):
-        """Initialize decoder with constant norm"""
-        self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
-        self.W_dec.data *= norm
-
     def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
         return {
             "l1": TrainCoefficientConfig(
@@ -233,10 +219,17 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
             ),
         }

-
-
-
-    )
+    @torch.no_grad()
+    def fold_W_dec_norm(self):
+        """Override to handle gated-specific parameters."""
+        W_dec_norms = self.W_dec.norm(dim=-1).unsqueeze(1)
+        self.W_dec.data = self.W_dec.data / W_dec_norms
+        self.W_enc.data = self.W_enc.data * W_dec_norms.T
+
+        # Gated-specific parameters need special handling
+        # r_mag doesn't need scaling since W_enc scaling is sufficient for magnitude path
+        self.b_gate.data = self.b_gate.data * W_dec_norms.squeeze()
+        self.b_mag.data = self.b_mag.data * W_dec_norms.squeeze()


 def _init_weights_gated(
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/jumprelu_sae.py

@@ -14,9 +14,7 @@ from sae_lens.saes.sae import (
     TrainingSAE,
     TrainingSAEConfig,
     TrainStepInput,
-    TrainStepOutput,
 )
-from sae_lens.util import filter_valid_dataclass_fields


 def rectangle(x: torch.Tensor) -> torch.Tensor:
@@ -208,12 +206,11 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):

     Similar to the inference-only JumpReLUSAE, but with:
     - A learnable log-threshold parameter (instead of a raw threshold).
-    - Forward passes that add noise during training, if configured.
     - A specialized auxiliary loss term for sparsity (L0 or similar).

     Methods of interest include:
     - initialize_weights: sets up W_enc, b_enc, W_dec, b_dec, and log_threshold.
-    - encode_with_hidden_pre_jumprelu: runs a forward pass for training
+    - encode_with_hidden_pre_jumprelu: runs a forward pass for training.
     - training_forward_pass: calculates MSE and auxiliary losses, returning a TrainStepOutput.
     """

@@ -300,34 +297,6 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         # Fix: Use squeeze() instead of squeeze(-1) to match old behavior
         self.log_threshold.data = torch.log(current_thresh * W_dec_norms.squeeze())

-    def _create_train_step_output(
-        self,
-        sae_in: torch.Tensor,
-        sae_out: torch.Tensor,
-        feature_acts: torch.Tensor,
-        hidden_pre: torch.Tensor,
-        loss: torch.Tensor,
-        losses: dict[str, torch.Tensor],
-    ) -> TrainStepOutput:
-        """
-        Helper to produce a TrainStepOutput from the trainer.
-        The old code expects a method named _create_train_step_output().
-        """
-        return TrainStepOutput(
-            sae_in=sae_in,
-            sae_out=sae_out,
-            feature_acts=feature_acts,
-            hidden_pre=hidden_pre,
-            loss=loss,
-            losses=losses,
-        )
-
-    @torch.no_grad()
-    def initialize_decoder_norm_constant_norm(self, norm: float = 0.1):
-        """Initialize decoder with constant norm"""
-        self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
-        self.W_dec.data *= norm
-
     def process_state_dict_for_saving(self, state_dict: dict[str, Any]) -> None:
         """Convert log_threshold to threshold for saving"""
         if "log_threshold" in state_dict:
@@ -341,8 +310,3 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         threshold = state_dict["threshold"]
         del state_dict["threshold"]
         state_dict["log_threshold"] = torch.log(threshold).detach().contiguous()
-
-    def to_inference_config_dict(self) -> dict[str, Any]:
-        return filter_valid_dataclass_fields(
-            self.cfg.to_dict(), JumpReLUSAEConfig, ["architecture"]
-        )
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/sae.py

@@ -27,7 +27,7 @@ from torch import nn
 from transformer_lens.hook_points import HookedRootModule, HookPoint
 from typing_extensions import deprecated, overload, override

-from sae_lens import __version__
+from sae_lens import __version__
 from sae_lens.constants import (
     DTYPE_MAP,
     SAE_CFG_FILENAME,
@@ -207,6 +207,8 @@ class TrainStepOutput:
     hidden_pre: torch.Tensor
     loss: torch.Tensor  # we need to call backwards on this
     losses: dict[str, torch.Tensor]
+    # any extra metrics to log can be added here
+    metrics: dict[str, torch.Tensor | float | int] = field(default_factory=dict)


 @dataclass
@@ -528,28 +530,6 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

         return model_weights_path, cfg_path

-    ## Initialization Methods
-    @torch.no_grad()
-    def initialize_b_dec_with_precalculated(self, origin: torch.Tensor):
-        out = torch.tensor(origin, dtype=self.dtype, device=self.device)
-        self.b_dec.data = out
-
-    @torch.no_grad()
-    def initialize_b_dec_with_mean(self, all_activations: torch.Tensor):
-        previous_b_dec = self.b_dec.clone().cpu()
-        out = all_activations.mean(dim=0)
-
-        previous_distances = torch.norm(all_activations - previous_b_dec, dim=-1)
-        distances = torch.norm(all_activations - out, dim=-1)
-
-        logger.info("Reinitializing b_dec with mean of activations")
-        logger.debug(
-            f"Previous distances: {previous_distances.median(0).values.mean().item()}"
-        )
-        logger.debug(f"New distances: {distances.median(0).values.mean().item()}")
-
-        self.b_dec.data = out.to(self.dtype).to(self.device)
-
     # Class methods for loading models
     @classmethod
     @deprecated("Use load_from_disk instead")
@@ -732,6 +712,64 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
     ) -> type[SAEConfig]:
         return SAEConfig

+    ### Methods to support deprecated usage of SAE.from_pretrained() ###
+
+    def __getitem__(self, index: int) -> Any:
+        """
+        Support indexing for backward compatibility with tuple unpacking.
+        DEPRECATED: SAE.from_pretrained() no longer returns a tuple.
+        Use SAE.from_pretrained_with_cfg_and_sparsity() instead.
+        """
+        warnings.warn(
+            "Indexing SAE objects is deprecated. SAE.from_pretrained() now returns "
+            "only the SAE object. Use SAE.from_pretrained_with_cfg_and_sparsity() "
+            "to get the config dict and sparsity as well.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        if index == 0:
+            return self
+        if index == 1:
+            return self.cfg.to_dict()
+        if index == 2:
+            return None
+        raise IndexError(f"SAE tuple index {index} out of range")
+
+    def __iter__(self):
+        """
+        Support unpacking for backward compatibility with tuple unpacking.
+        DEPRECATED: SAE.from_pretrained() no longer returns a tuple.
+        Use SAE.from_pretrained_with_cfg_and_sparsity() instead.
+        """
+        warnings.warn(
+            "Unpacking SAE objects is deprecated. SAE.from_pretrained() now returns "
+            "only the SAE object. Use SAE.from_pretrained_with_cfg_and_sparsity() "
+            "to get the config dict and sparsity as well.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        yield self
+        yield self.cfg.to_dict()
+        yield None
+
+    def __len__(self) -> int:
+        """
+        Support len() for backward compatibility with tuple unpacking.
+        DEPRECATED: SAE.from_pretrained() no longer returns a tuple.
+        Use SAE.from_pretrained_with_cfg_and_sparsity() instead.
+        """
+        warnings.warn(
+            "Getting length of SAE objects is deprecated. SAE.from_pretrained() now returns "
+            "only the SAE object. Use SAE.from_pretrained_with_cfg_and_sparsity() "
+            "to get the config dict and sparsity as well.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        return 3
+

 @dataclass(kw_only=True)
 class TrainingSAEConfig(SAEConfig, ABC):
@@ -789,20 +827,26 @@ class TrainingSAEConfig(SAEConfig, ABC):
             "architecture": self.architecture(),
         }

+    def get_inference_config_class(self) -> type[SAEConfig]:
+        """
+        Get the architecture for inference.
+        """
+        return get_sae_class(self.architecture())[1]
+
     # this needs to exist so we can initialize the parent sae cfg without the training specific
     # parameters. Maybe there's a cleaner way to do this
-    def
+    def get_inference_sae_cfg_dict(self) -> dict[str, Any]:
         """
         Creates a dictionary containing attributes corresponding to the fields
         defined in the base SAEConfig class.
         """
-        base_sae_cfg_class =
+        base_sae_cfg_class = self.get_inference_config_class()
         base_config_field_names = {f.name for f in fields(base_sae_cfg_class)}
         result_dict = {
             field_name: getattr(self, field_name)
             for field_name in base_config_field_names
         }
-        result_dict["architecture"] =
+        result_dict["architecture"] = base_sae_cfg_class.architecture()
         result_dict["metadata"] = self.metadata.to_dict()
         return result_dict

@@ -930,18 +974,13 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
         save_file(state_dict, model_weights_path)

         # Save the config
-        config = self.
+        config = self.cfg.get_inference_sae_cfg_dict()
         cfg_path = path / SAE_CFG_FILENAME
         with open(cfg_path, "w") as f:
             json.dump(config, f)

         return model_weights_path, cfg_path

-    @abstractmethod
-    def to_inference_config_dict(self) -> dict[str, Any]:
-        """Convert the config into an inference SAE config dict."""
-        ...
-
     def process_state_dict_for_saving_inference(
         self, state_dict: dict[str, Any]
     ) -> None:
@@ -951,23 +990,6 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
         """
         return self.process_state_dict_for_saving(state_dict)

-    @torch.no_grad()
-    def remove_gradient_parallel_to_decoder_directions(self) -> None:
-        """Remove gradient components parallel to decoder directions."""
-        # Implement the original logic since this may not be in the base class
-        assert self.W_dec.grad is not None
-
-        parallel_component = einops.einsum(
-            self.W_dec.grad,
-            self.W_dec.data,
-            "d_sae d_in, d_sae d_in -> d_sae",
-        )
-        self.W_dec.grad -= einops.einsum(
-            parallel_component,
-            self.W_dec.data,
-            "d_sae, d_sae d_in -> d_sae d_in",
-        )
-
     @torch.no_grad()
     def log_histograms(self) -> dict[str, NDArray[Any]]:
         """Log histograms of the weights and biases."""
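Because `SAE` now implements `__getitem__`, `__iter__`, and `__len__`, code written against the old tuple return of `SAE.from_pretrained()` keeps working but emits a `DeprecationWarning`; the warnings point to `from_pretrained_with_cfg_and_sparsity()` instead. An illustrative sketch (the release and SAE id are examples, not part of this diff):

```python
from sae_lens import SAE

# New style: from_pretrained() returns just the SAE.
sae = SAE.from_pretrained("gpt2-small-res-jb", "blocks.8.hook_resid_pre")

# Old style still unpacks via the shims above, but warns.
sae, cfg_dict, sparsity = SAE.from_pretrained("gpt2-small-res-jb", "blocks.8.hook_resid_pre")

# Replacement named in the deprecation messages:
sae, cfg_dict, sparsity = SAE.from_pretrained_with_cfg_and_sparsity(
    "gpt2-small-res-jb", "blocks.8.hook_resid_pre"
)
```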
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/standard_sae.py

@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from typing import Any

 import numpy as np
 import torch
@@ -16,7 +15,6 @@ from sae_lens.saes.sae import (
     TrainingSAEConfig,
     TrainStepInput,
 )
-from sae_lens.util import filter_valid_dataclass_fields


 @dataclass
@@ -61,7 +59,6 @@ class StandardSAE(SAE[StandardSAEConfig]):
     ) -> Float[torch.Tensor, "... d_sae"]:
         """
         Encode the input tensor into the feature space.
-        For inference, no noise is added.
         """
         # Preprocess the SAE input (casting type, applying hooks, normalization)
         sae_in = self.process_sae_in(x)
@@ -110,7 +107,7 @@ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
     - initialize_weights: basic weight initialization for encoder/decoder.
     - encode: inference encoding (invokes encode_with_hidden_pre).
     - decode: a simple linear decoder.
-    - encode_with_hidden_pre: computes
+    - encode_with_hidden_pre: computes activations and pre-activations.
     - calculate_aux_loss: computes a sparsity penalty based on the (optionally scaled) p-norm of feature activations.
     """

@@ -164,11 +161,6 @@ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
             "weights/b_e": b_e_dist,
         }

-    def to_inference_config_dict(self) -> dict[str, Any]:
-        return filter_valid_dataclass_fields(
-            self.cfg.to_dict(), StandardSAEConfig, ["architecture"]
-        )
-

 def _init_weights_standard(
     sae: SAE[StandardSAEConfig] | TrainingSAE[StandardTrainingSAEConfig],
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/saes/topk_sae.py

@@ -1,7 +1,7 @@
 """Inference-only TopKSAE variant, similar in spirit to StandardSAE but using a TopK-based activation."""

 from dataclasses import dataclass
-from typing import
+from typing import Callable

 import torch
 from jaxtyping import Float
@@ -16,13 +16,12 @@ from sae_lens.saes.sae import (
     TrainingSAEConfig,
     TrainStepInput,
 )
-from sae_lens.util import filter_valid_dataclass_fields


 class TopK(nn.Module):
     """
     A simple TopK activation that zeroes out all but the top K elements along the last dimension,
-
+    and applies ReLU to the top K elements.
     """

     b_enc: nn.Parameter
@@ -30,20 +29,18 @@ class TopK(nn.Module):
     def __init__(
         self,
         k: int,
-        postact_fn: Callable[[torch.Tensor], torch.Tensor] = nn.ReLU(),
     ):
         super().__init__()
         self.k = k
-        self.postact_fn = postact_fn

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         1) Select top K elements along the last dimension.
-        2) Apply
+        2) Apply ReLU.
         3) Zero out all other entries.
         """
         topk = torch.topk(x, k=self.k, dim=-1)
-        values =
+        values = topk.values.relu()
         result = torch.zeros_like(x)
         result.scatter_(-1, topk.indices, values)
         return result
@@ -139,8 +136,7 @@ class TopKTrainingSAEConfig(TrainingSAEConfig):

 class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
     """
-    TopK variant with training functionality.
-    calculates a topk-related auxiliary loss, etc.
+    TopK variant with training functionality. Calculates a topk-related auxiliary loss, etc.
     """

     b_enc: nn.Parameter
@@ -157,7 +153,7 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
         self, x: Float[torch.Tensor, "... d_in"]
     ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
         """
-        Similar to the base training method:
+        Similar to the base training method: calculate pre-activations, then apply TopK.
         """
         sae_in = self.process_sae_in(x)
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
@@ -237,50 +233,24 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
         auxk_loss = (recons - residual).pow(2).sum(dim=-1).mean()
         return scale * auxk_loss

-    def _calculate_topk_aux_acts(
-        self,
-        k_aux: int,
-        hidden_pre: torch.Tensor,
-        dead_neuron_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Helper method to calculate activations for the auxiliary loss.
-
-        Args:
-            k_aux: Number of top dead neurons to select
-            hidden_pre: Pre-activation values from encoder
-            dead_neuron_mask: Boolean mask indicating which neurons are dead
-
-        Returns:
-            Tensor with activations for only the top-k dead neurons, zeros elsewhere
-        """
-        # Don't include living latents in this loss (set them to -inf so they won't be selected)
-        auxk_latents = torch.where(
-            dead_neuron_mask[None],
-            hidden_pre,
-            torch.tensor(-float("inf"), device=hidden_pre.device),
-        )
-
-        # Find topk values among dead neurons
-        auxk_topk = auxk_latents.topk(k_aux, dim=-1, sorted=False)
-
-        # Create a tensor of zeros, then place the topk values at their proper indices
-        auxk_acts = torch.zeros_like(hidden_pre)
-        auxk_acts.scatter_(-1, auxk_topk.indices, auxk_topk.values)
-
-        return auxk_acts
-
-    def to_inference_config_dict(self) -> dict[str, Any]:
-        return filter_valid_dataclass_fields(
-            self.cfg.to_dict(), TopKSAEConfig, ["architecture"]
-        )
-

 def _calculate_topk_aux_acts(
     k_aux: int,
     hidden_pre: torch.Tensor,
     dead_neuron_mask: torch.Tensor,
 ) -> torch.Tensor:
+    """
+    Helper method to calculate activations for the auxiliary loss.
+
+    Args:
+        k_aux: Number of top dead neurons to select
+        hidden_pre: Pre-activation values from encoder
+        dead_neuron_mask: Boolean mask indicating which neurons are dead
+
+    Returns:
+        Tensor with activations for only the top-k dead neurons, zeros elsewhere
+    """
+
     # Don't include living latents in this loss
     auxk_latents = torch.where(dead_neuron_mask[None], hidden_pre, -torch.inf)
     # Top-k dead latents
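After this change the `TopK` module always applies ReLU to the selected values rather than a configurable `postact_fn`. A small sketch of the resulting behaviour:

```python
import torch
from sae_lens.saes.topk_sae import TopK

x = torch.randn(2, 10)
out = TopK(k=3)(x)             # per-row top-3, with ReLU applied to the kept values
print((out != 0).sum(dim=-1))  # at most 3 nonzeros per row (fewer if kept values were negative)
```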
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/activations_store.py

@@ -7,7 +7,6 @@ from collections.abc import Generator, Iterator, Sequence
 from typing import Any, Literal, cast

 import datasets
-import numpy as np
 import torch
 from datasets import Dataset, DatasetDict, IterableDataset, load_dataset
 from huggingface_hub import hf_hub_download
@@ -420,20 +419,6 @@ class ActivationsStore:

         return activations_dataset

-    @torch.no_grad()
-    def estimate_norm_scaling_factor(self, n_batches_for_norm_estimate: int = int(1e3)):
-        norms_per_batch = []
-        for _ in tqdm(
-            range(n_batches_for_norm_estimate), desc="Estimating norm scaling factor"
-        ):
-            # temporalily set estimated_norm_scaling_factor to 1.0 so the dataloader works
-            self.estimated_norm_scaling_factor = 1.0
-            acts = self.next_batch()[0]
-            self.estimated_norm_scaling_factor = None
-            norms_per_batch.append(acts.norm(dim=-1).mean().item())
-        mean_norm = np.mean(norms_per_batch)
-        return np.sqrt(self.d_in) / mean_norm
-
     def shuffle_input_dataset(self, seed: int, buffer_size: int = 1):
         """
         This applies a shuffle to the huggingface dataset that is the input to the activations store. This
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/optim.py

@@ -2,8 +2,6 @@
 Took the LR scheduler from my previous work: https://github.com/jbloomAus/DecisionTransformerInterpretability/blob/ee55df35cdb92e81d689c72fb9dd5a7252893363/src/decision_transformer/utils.py#L425
 """

-from typing import Any
-
 import torch.optim as optim
 import torch.optim.lr_scheduler as lr_scheduler

@@ -152,34 +150,3 @@ class CoefficientScheduler:
     def value(self) -> float:
         """Returns the current scalar value."""
         return self.current_value
-
-    def state_dict(self) -> dict[str, Any]:
-        """State dict for serialization."""
-        return {
-            "warm_up_steps": self.warm_up_steps,
-            "final_value": self.final_value,
-            "current_step": self.current_step,
-            "current_value": self.current_value,
-        }
-
-    def load_state_dict(self, state_dict: dict[str, Any]):
-        """Loads the scheduler state."""
-        self.warm_up_steps = state_dict["warm_up_steps"]
-        self.final_value = state_dict["final_value"]
-        self.current_step = state_dict["current_step"]
-        # Maintain consistency: re-calculate current_value based on loaded step
-        # This handles resuming correctly if stopped mid-warmup.
-        if self.current_step <= self.warm_up_steps and self.warm_up_steps > 0:
-            # Use max(0, ...) to handle case where current_step might be loaded as -1 or similar before first step
-            step_for_calc = max(0, self.current_step)
-            # Recalculate based on the step *before* the one about to be taken
-            # Or simply use the saved current_value if available and consistent
-            if "current_value" in state_dict:
-                self.current_value = state_dict["current_value"]
-            else:  # Legacy state dicts might not have current_value
-                self.current_value = self.final_value * (
-                    step_for_calc / self.warm_up_steps
-                )
-
-        else:
-            self.current_value = self.final_value
{sae_lens-6.0.0rc4 → sae_lens-6.1.0}/sae_lens/training/sae_trainer.py

@@ -349,8 +349,10 @@ class SAETrainer(Generic[T_TRAINING_SAE, T_TRAINING_SAE_CONFIG]):
             },
         }
         for loss_name, loss_value in output.losses.items():
-
-
+            log_dict[f"losses/{loss_name}"] = _unwrap_item(loss_value)
+
+        for metric_name, metric_value in output.metrics.items():
+            log_dict[f"metrics/{metric_name}"] = _unwrap_item(metric_value)

         return log_dict

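With the new `TrainStepOutput.metrics` field, anything a training SAE adds there is logged by `SAETrainer` under a `metrics/` prefix, which is exactly how `BatchTopKTrainingSAE` surfaces its `topk_threshold`. A minimal sketch of the pattern; the subclass and metric name are hypothetical:

```python
from typing_extensions import override

from sae_lens.saes.sae import TrainStepInput, TrainStepOutput
from sae_lens.saes.standard_sae import StandardTrainingSAE

class MyTrainingSAE(StandardTrainingSAE):  # hypothetical subclass
    @override
    def training_forward_pass(self, step_input: TrainStepInput) -> TrainStepOutput:
        output = super().training_forward_pass(step_input)
        # will appear in the trainer's log dict as "metrics/mean_feature_act"
        output.metrics["mean_feature_act"] = output.feature_acts.mean()
        return output
```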