PyPI - sae-lens - Versions diffs - 6.0.0rc1__py3-none-any.whl → 6.0.0rc2__py3-none-any.whl - Mend

sae-lens 6.0.0rc1__py3-none-any.whl → 6.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

sae_lens/__init__.py +50 -16
sae_lens/analysis/hooked_sae_transformer.py +10 -10
sae_lens/analysis/neuronpedia_integration.py +13 -11
sae_lens/cache_activations_runner.py +2 -1
sae_lens/config.py +59 -231
sae_lens/constants.py +18 -0
sae_lens/evals.py +16 -13
sae_lens/loading/pretrained_sae_loaders.py +36 -3
sae_lens/registry.py +49 -0
sae_lens/sae_training_runner.py +22 -21
sae_lens/saes/__init__.py +48 -0
sae_lens/saes/gated_sae.py +70 -59
sae_lens/saes/jumprelu_sae.py +58 -72
sae_lens/saes/sae.py +250 -272
sae_lens/saes/standard_sae.py +75 -57
sae_lens/saes/topk_sae.py +72 -83
sae_lens/training/activations_store.py +31 -15
sae_lens/training/optim.py +60 -36
sae_lens/training/sae_trainer.py +44 -69
sae_lens/training/upload_saes_to_huggingface.py +11 -5
sae_lens/util.py +28 -0
{sae_lens-6.0.0rc1.dist-info → sae_lens-6.0.0rc2.dist-info}/METADATA +1 -1
sae_lens-6.0.0rc2.dist-info/RECORD +35 -0
{sae_lens-6.0.0rc1.dist-info → sae_lens-6.0.0rc2.dist-info}/WHEEL +1 -1
sae_lens/regsitry.py +0 -34
sae_lens-6.0.0rc1.dist-info/RECORD +0 -32
{sae_lens-6.0.0rc1.dist-info → sae_lens-6.0.0rc2.dist-info}/LICENSE +0 -0

sae_lens/registry.py ADDED Viewed

@@ -0,0 +1,49 @@
+from typing import TYPE_CHECKING, Any
+# avoid circular imports
+if TYPE_CHECKING:
+    from sae_lens.saes.sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
+SAE_CLASS_REGISTRY: dict[str, tuple["type[SAE[Any]]", "type[SAEConfig]"]] = {}
+SAE_TRAINING_CLASS_REGISTRY: dict[
+    str, tuple["type[TrainingSAE[Any]]", "type[TrainingSAEConfig]"]
+] = {}
+def register_sae_class(
+    architecture: str,
+    sae_class: "type[SAE[Any]]",
+    sae_config_class: "type[SAEConfig]",
+) -> None:
+    if architecture in SAE_CLASS_REGISTRY:
+        raise ValueError(
+            f"SAE class for architecture {architecture} already registered."
+        )
+    SAE_CLASS_REGISTRY[architecture] = (sae_class, sae_config_class)
+def register_sae_training_class(
+    architecture: str,
+    sae_training_class: "type[TrainingSAE[Any]]",
+    sae_training_config_class: "type[TrainingSAEConfig]",
+) -> None:
+    if architecture in SAE_TRAINING_CLASS_REGISTRY:
+        raise ValueError(
+            f"SAE training class for architecture {architecture} already registered."
+        )
+    SAE_TRAINING_CLASS_REGISTRY[architecture] = (
+        sae_training_class,
+        sae_training_config_class,
+    )
+def get_sae_class(
+    architecture: str,
+) -> tuple["type[SAE[Any]]", "type[SAEConfig]"]:
+    return SAE_CLASS_REGISTRY[architecture]
+def get_sae_training_class(
+    architecture: str,
+) -> tuple["type[TrainingSAE[Any]]", "type[TrainingSAEConfig]"]:
+    return SAE_TRAINING_CLASS_REGISTRY[architecture]

sae_lens/sae_training_runner.py CHANGED Viewed

@@ -7,13 +7,15 @@ from typing import Any, cast
 import torch
 import wandb
+from safetensors.torch import save_file
 from simple_parsing import ArgumentParser
 from transformer_lens.hook_points import HookedRootModule
 from sae_lens import logger
 from sae_lens.config import HfDataset, LanguageModelSAERunnerConfig
+from sae_lens.constants import RUNNER_CFG_FILENAME, SPARSITY_FILENAME
 from sae_lens.load_model import load_model
-from sae_lens.saes.sae import TrainingSAE, TrainingSAEConfig
+from sae_lens.saes.sae import T_TRAINING_SAE_CONFIG, TrainingSAE, TrainingSAEConfig
 from sae_lens.training.activations_store import ActivationsStore
 from sae_lens.training.geometric_median import compute_geometric_median
 from sae_lens.training.sae_trainer import SAETrainer
@@ -32,17 +34,17 @@ class SAETrainingRunner:
     Class to run the training of a Sparse Autoencoder (SAE) on a TransformerLens model.
     """
-    cfg: LanguageModelSAERunnerConfig
+    cfg: LanguageModelSAERunnerConfig[Any]
     model: HookedRootModule
-    sae: TrainingSAE
+    sae: TrainingSAE[Any]
     activations_store: ActivationsStore
     def __init__(
         self,
-        cfg: LanguageModelSAERunnerConfig,
+        cfg: LanguageModelSAERunnerConfig[T_TRAINING_SAE_CONFIG],
         override_dataset: HfDataset | None = None,
         override_model: HookedRootModule | None = None,
-        override_sae: TrainingSAE | None = None,
+        override_sae: TrainingSAE[Any] | None = None,
     ):
         if override_dataset is not None:
             logger.warning(
@@ -141,7 +143,9 @@ class SAETrainingRunner:
                 backend=backend,
             )  # type: ignore
-    def run_trainer_with_interruption_handling(self, trainer: SAETrainer):
+    def run_trainer_with_interruption_handling(
+        self, trainer: SAETrainer[TrainingSAE[TrainingSAEConfig], TrainingSAEConfig]
+    ):
         try:
             # signal handlers (if preempted)
             signal.signal(signal.SIGINT, interrupt_callback)
@@ -167,7 +171,7 @@ class SAETrainingRunner:
         extract all activations at a certain layer and use for sae b_dec initialization
         """
-        if self.cfg.b_dec_init_method == "geometric_median":
+        if self.cfg.sae.b_dec_init_method == "geometric_median":
             self.activations_store.set_norm_scaling_factor_if_needed()
             layer_acts = self.activations_store.storage_buffer.detach()[:, 0, :]
             # get geometric median of the activations if we're using those.
@@ -176,14 +180,14 @@ class SAETrainingRunner:
                 maxiter=100,
             ).median
             self.sae.initialize_b_dec_with_precalculated(median)
-        elif self.cfg.b_dec_init_method == "mean":
+        elif self.cfg.sae.b_dec_init_method == "mean":
             self.activations_store.set_norm_scaling_factor_if_needed()
             layer_acts = self.activations_store.storage_buffer.detach().cpu()[:, 0, :]
             self.sae.initialize_b_dec_with_mean(layer_acts)  # type: ignore
     @staticmethod
     def save_checkpoint(
-        trainer: SAETrainer,
+        trainer: SAETrainer[TrainingSAE[Any], Any],
         checkpoint_name: str,
         wandb_aliases: list[str] | None = None,
     ) -> None:
@@ -194,19 +198,14 @@ class SAETrainingRunner:
             str(base_path / "activations_store_state.safetensors")
         )
-        if trainer.sae.cfg.normalize_sae_decoder:
-            trainer.sae.set_decoder_norm_to_unit_norm()
+        weights_path, cfg_path = trainer.sae.save_model(str(base_path))
-        weights_path, cfg_path, sparsity_path = trainer.sae.save_model(
-            str(base_path),
-            trainer.log_feature_sparsity,
-        )
+        sparsity_path = base_path / SPARSITY_FILENAME
+        save_file({"sparsity": trainer.log_feature_sparsity}, sparsity_path)
-        # let's over write the cfg file with the trainer cfg, which is a super set of the original cfg.
-        # and should not cause issues but give us more info about SAEs we trained in SAE Lens.
-        config = trainer.cfg.to_dict()
-        with open(cfg_path, "w") as f:
-            json.dump(config, f)
+        runner_config = trainer.cfg.to_dict()
+        with open(base_path / RUNNER_CFG_FILENAME, "w") as f:
+            json.dump(runner_config, f)
         if trainer.cfg.logger.log_to_wandb:
             trainer.cfg.logger.log(
@@ -218,7 +217,9 @@ class SAETrainingRunner:
             )
-def _parse_cfg_args(args: Sequence[str]) -> LanguageModelSAERunnerConfig:
+def _parse_cfg_args(
+    args: Sequence[str],
+) -> LanguageModelSAERunnerConfig[TrainingSAEConfig]:
     if len(args) == 0:
         args = ["--help"]
     parser = ArgumentParser(exit_on_error=False)

sae_lens/saes/__init__.py ADDED Viewed

@@ -0,0 +1,48 @@
+from .gated_sae import (
+    GatedSAE,
+    GatedSAEConfig,
+    GatedTrainingSAE,
+    GatedTrainingSAEConfig,
+)
+from .jumprelu_sae import (
+    JumpReLUSAE,
+    JumpReLUSAEConfig,
+    JumpReLUTrainingSAE,
+    JumpReLUTrainingSAEConfig,
+)
+from .sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
+from .standard_sae import (
+    StandardSAE,
+    StandardSAEConfig,
+    StandardTrainingSAE,
+    StandardTrainingSAEConfig,
+)
+from .topk_sae import (
+    TopKSAE,
+    TopKSAEConfig,
+    TopKTrainingSAE,
+    TopKTrainingSAEConfig,
+)
+__all__ = [
+    "SAE",
+    "SAEConfig",
+    "TrainingSAE",
+    "TrainingSAEConfig",
+    "StandardSAE",
+    "StandardSAEConfig",
+    "StandardTrainingSAE",
+    "StandardTrainingSAEConfig",
+    "GatedSAE",
+    "GatedSAEConfig",
+    "GatedTrainingSAE",
+    "GatedTrainingSAEConfig",
+    "JumpReLUSAE",
+    "JumpReLUSAEConfig",
+    "JumpReLUTrainingSAE",
+    "JumpReLUTrainingSAEConfig",
+    "TopKSAE",
+    "TopKSAEConfig",
+    "TopKTrainingSAE",
+    "TopKTrainingSAEConfig",
+]

sae_lens/saes/gated_sae.py CHANGED Viewed

@@ -1,20 +1,36 @@
+from dataclasses import dataclass
 from typing import Any
 import torch
 from jaxtyping import Float
 from numpy.typing import NDArray
 from torch import nn
+from typing_extensions import override
 from sae_lens.saes.sae import (
     SAE,
     SAEConfig,
+    TrainCoefficientConfig,
     TrainingSAE,
     TrainingSAEConfig,
     TrainStepInput,
 )
+from sae_lens.util import filter_valid_dataclass_fields
-class GatedSAE(SAE):
+@dataclass
+class GatedSAEConfig(SAEConfig):
+    """
+    Configuration class for a GatedSAE.
+    """
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "gated"
+class GatedSAE(SAE[GatedSAEConfig]):
     """
     GatedSAE is an inference-only implementation of a Sparse Autoencoder (SAE)
     using a gated linear encoder and a standard linear decoder.
@@ -24,48 +40,15 @@ class GatedSAE(SAE):
     b_mag: nn.Parameter
     r_mag: nn.Parameter
-    def __init__(self, cfg: SAEConfig, use_error_term: bool = False):
+    def __init__(self, cfg: GatedSAEConfig, use_error_term: bool = False):
         super().__init__(cfg, use_error_term)
         # Ensure b_enc does not exist for the gated architecture
         self.b_enc = None
+    @override
     def initialize_weights(self) -> None:
-        """
-        Initialize weights exactly as in the original SAE class for gated architecture.
-        """
-        # Use the same initialization methods and values as in original SAE
-        self.W_enc = nn.Parameter(
-            torch.nn.init.kaiming_uniform_(
-                torch.empty(
-                    self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
-                )
-            )
-        )
-        self.b_gate = nn.Parameter(
-            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-        )
-        # Ensure r_mag is initialized to zero as in original
-        self.r_mag = nn.Parameter(
-            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-        )
-        self.b_mag = nn.Parameter(
-            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-        )
-        # Decoder parameters with same initialization as original
-        self.W_dec = nn.Parameter(
-            torch.nn.init.kaiming_uniform_(
-                torch.empty(
-                    self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-                )
-            )
-        )
-        self.b_dec = nn.Parameter(
-            torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
-        )
-        # after defining b_gate, b_mag, etc.:
-        self.b_enc = None
+        super().initialize_weights()
+        _init_weights_gated(self)
     def encode(
         self, x: Float[torch.Tensor, "... d_in"]
@@ -101,9 +84,8 @@ class GatedSAE(SAE):
           4) If the SAE was reshaping hook_z activations, reshape back.
         """
         # 1) optional finetuning scaling
-        scaled_features = self.apply_finetuning_scaling_factor(feature_acts)
         # 2) linear transform
-        sae_out_pre = scaled_features @ self.W_dec + self.b_dec
+        sae_out_pre = feature_acts @ self.W_dec + self.b_dec
         # 3) hooking and normalization
         sae_out_pre = self.hook_sae_recons(sae_out_pre)
         sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
@@ -129,7 +111,22 @@ class GatedSAE(SAE):
         self.W_dec.data *= norm
-class GatedTrainingSAE(TrainingSAE):
+@dataclass
+class GatedTrainingSAEConfig(TrainingSAEConfig):
+    """
+    Configuration class for training a GatedTrainingSAE.
+    """
+    l1_coefficient: float = 1.0
+    l1_warm_up_steps: int = 0
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "gated"
+class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
     """
     GatedTrainingSAE is a concrete implementation of BaseTrainingSAE for the "gated" SAE architecture.
     It implements:
@@ -145,7 +142,7 @@ class GatedTrainingSAE(TrainingSAE):
     b_mag: nn.Parameter  # type: ignore
     r_mag: nn.Parameter  # type: ignore
-    def __init__(self, cfg: TrainingSAEConfig, use_error_term: bool = False):
+    def __init__(self, cfg: GatedTrainingSAEConfig, use_error_term: bool = False):
         if use_error_term:
             raise ValueError(
                 "GatedSAE does not support `use_error_term`. Please set `use_error_term=False`."
@@ -153,22 +150,8 @@ class GatedTrainingSAE(TrainingSAE):
         super().__init__(cfg, use_error_term)
     def initialize_weights(self) -> None:
-        # Reuse the gating parameter initialization from GatedSAE:
-        GatedSAE.initialize_weights(self)  # type: ignore
-        # Additional training-specific logic, e.g. orthogonal init or heuristics:
-        if self.cfg.decoder_orthogonal_init:
-            self.W_dec.data = nn.init.orthogonal_(self.W_dec.data.T).T
-        elif self.cfg.decoder_heuristic_init:
-            self.W_dec.data = torch.rand(
-                self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-            )
-            self.initialize_decoder_norm_constant_norm()
-        if self.cfg.init_encoder_as_decoder_transpose:
-            self.W_enc.data = self.W_dec.data.T.clone().contiguous()
-        if self.cfg.normalize_sae_decoder:
-            with torch.no_grad():
-                self.set_decoder_norm_to_unit_norm()
+        super().initialize_weights()
+        _init_weights_gated(self)
     def encode_with_hidden_pre(
         self, x: Float[torch.Tensor, "... d_in"]
@@ -217,7 +200,7 @@ class GatedTrainingSAE(TrainingSAE):
         # L1-like penalty scaled by W_dec norms
         l1_loss = (
-            step_input.current_l1_coefficient
+            step_input.coefficients["l1"]
             * torch.sum(pi_gate_act * self.W_dec.norm(dim=1), dim=-1).mean()
         )
@@ -245,3 +228,31 @@ class GatedTrainingSAE(TrainingSAE):
         """Initialize decoder with constant norm"""
         self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
         self.W_dec.data *= norm
+    def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
+        return {
+            "l1": TrainCoefficientConfig(
+                value=self.cfg.l1_coefficient,
+                warm_up_steps=self.cfg.l1_warm_up_steps,
+            ),
+        }
+    def to_inference_config_dict(self) -> dict[str, Any]:
+        return filter_valid_dataclass_fields(
+            self.cfg.to_dict(), GatedSAEConfig, ["architecture"]
+        )
+def _init_weights_gated(
+    sae: SAE[GatedSAEConfig] | TrainingSAE[GatedTrainingSAEConfig],
+) -> None:
+    sae.b_gate = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )
+    # Ensure r_mag is initialized to zero as in original
+    sae.r_mag = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )
+    sae.b_mag = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )

sae_lens/saes/jumprelu_sae.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from typing import Any
 import numpy as np
@@ -9,11 +10,13 @@ from typing_extensions import override
 from sae_lens.saes.sae import (
     SAE,
     SAEConfig,
+    TrainCoefficientConfig,
     TrainingSAE,
     TrainingSAEConfig,
     TrainStepInput,
     TrainStepOutput,
 )
+from sae_lens.util import filter_valid_dataclass_fields
 def rectangle(x: torch.Tensor) -> torch.Tensor:
@@ -85,7 +88,19 @@ class JumpReLU(torch.autograd.Function):
         return x_grad, threshold_grad, None
-class JumpReLUSAE(SAE):
+@dataclass
+class JumpReLUSAEConfig(SAEConfig):
+    """
+    Configuration class for a JumpReLUSAE.
+    """
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "jumprelu"
+class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
     """
     JumpReLUSAE is an inference-only implementation of a Sparse Autoencoder (SAE)
     using a JumpReLU activation. For each unit, if its pre-activation is
@@ -104,42 +119,18 @@ class JumpReLUSAE(SAE):
     b_enc: nn.Parameter
     threshold: nn.Parameter
-    def __init__(self, cfg: SAEConfig, use_error_term: bool = False):
+    def __init__(self, cfg: JumpReLUSAEConfig, use_error_term: bool = False):
         super().__init__(cfg, use_error_term)
+    @override
     def initialize_weights(self) -> None:
-        """
-        Initialize encoder and decoder weights, as well as biases.
-        Additionally, include a learnable `threshold` parameter that
-        determines when units "turn on" for the JumpReLU.
-        """
-        # Biases
-        self.b_enc = nn.Parameter(
-            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-        )
-        self.b_dec = nn.Parameter(
-            torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
-        )
-        # Threshold for JumpReLU
-        # You can pick a default initialization (e.g., zeros means unit is off unless hidden_pre > 0)
-        # or see the training version for more advanced init with log_threshold, etc.
+        super().initialize_weights()
         self.threshold = nn.Parameter(
             torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
         )
-        # Encoder and Decoder weights
-        w_enc_data = torch.empty(
-            self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
-        )
-        nn.init.kaiming_uniform_(w_enc_data)
-        self.W_enc = nn.Parameter(w_enc_data)
-        w_dec_data = torch.empty(
-            self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
+        self.b_enc = nn.Parameter(
+            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
         )
-        nn.init.kaiming_uniform_(w_dec_data)
-        self.W_dec = nn.Parameter(w_dec_data)
     def encode(
         self, x: Float[torch.Tensor, "... d_in"]
@@ -168,8 +159,7 @@ class JumpReLUSAE(SAE):
         Decode the feature activations back to the input space.
         Follows the same steps as StandardSAE: apply scaling, transform, hook, and optionally reshape.
         """
-        scaled_features = self.apply_finetuning_scaling_factor(feature_acts)
-        sae_out_pre = scaled_features @ self.W_dec + self.b_dec
+        sae_out_pre = feature_acts @ self.W_dec + self.b_dec
         sae_out_pre = self.hook_sae_recons(sae_out_pre)
         sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
         return self.reshape_fn_out(sae_out_pre, self.d_head)
@@ -195,7 +185,24 @@ class JumpReLUSAE(SAE):
         self.threshold.data = current_thresh * W_dec_norms
-class JumpReLUTrainingSAE(TrainingSAE):
+@dataclass
+class JumpReLUTrainingSAEConfig(TrainingSAEConfig):
+    """
+    Configuration class for training a JumpReLUTrainingSAE.
+    """
+    jumprelu_init_threshold: float = 0.001
+    jumprelu_bandwidth: float = 0.001
+    l0_coefficient: float = 1.0
+    l0_warm_up_steps: int = 0
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "jumprelu"
+class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
     """
     JumpReLUTrainingSAE is a training-focused implementation of a SAE using a JumpReLU activation.
@@ -213,7 +220,7 @@ class JumpReLUTrainingSAE(TrainingSAE):
     b_enc: nn.Parameter
     log_threshold: nn.Parameter
-    def __init__(self, cfg: TrainingSAEConfig, use_error_term: bool = False):
+    def __init__(self, cfg: JumpReLUTrainingSAEConfig, use_error_term: bool = False):
         super().__init__(cfg, use_error_term)
         # We'll store a bandwidth for the training approach, if needed
@@ -225,51 +232,16 @@ class JumpReLUTrainingSAE(TrainingSAE):
             * np.log(cfg.jumprelu_init_threshold)
         )
+    @override
     def initialize_weights(self) -> None:
         """
         Initialize parameters like the base SAE, but also add log_threshold.
         """
+        super().initialize_weights()
         # Encoder Bias
         self.b_enc = nn.Parameter(
             torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
         )
-        # Decoder Bias
-        self.b_dec = nn.Parameter(
-            torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
-        )
-        # W_enc
-        w_enc_data = torch.nn.init.kaiming_uniform_(
-            torch.empty(
-                self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
-            )
-        )
-        self.W_enc = nn.Parameter(w_enc_data)
-        # W_dec
-        w_dec_data = torch.nn.init.kaiming_uniform_(
-            torch.empty(
-                self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-            )
-        )
-        self.W_dec = nn.Parameter(w_dec_data)
-        # Optionally apply orthogonal or heuristic init
-        if self.cfg.decoder_orthogonal_init:
-            self.W_dec.data = nn.init.orthogonal_(self.W_dec.data.T).T
-        elif self.cfg.decoder_heuristic_init:
-            self.W_dec.data = torch.rand(
-                self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-            )
-            self.initialize_decoder_norm_constant_norm()
-        # Optionally transpose
-        if self.cfg.init_encoder_as_decoder_transpose:
-            self.W_enc.data = self.W_dec.data.T.clone().contiguous()
-        # Optionally normalize columns of W_dec
-        if self.cfg.normalize_sae_decoder:
-            with torch.no_grad():
-                self.set_decoder_norm_to_unit_norm()
     @property
     def threshold(self) -> torch.Tensor:
@@ -305,9 +277,18 @@ class JumpReLUTrainingSAE(TrainingSAE):
     ) -> dict[str, torch.Tensor]:
         """Calculate architecture-specific auxiliary loss terms."""
         l0 = torch.sum(Step.apply(hidden_pre, self.threshold, self.bandwidth), dim=-1)  # type: ignore
-        l0_loss = (step_input.current_l1_coefficient * l0).mean()
+        l0_loss = (step_input.coefficients["l0"] * l0).mean()
         return {"l0_loss": l0_loss}
+    @override
+    def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
+        return {
+            "l0": TrainCoefficientConfig(
+                value=self.cfg.l0_coefficient,
+                warm_up_steps=self.cfg.l0_warm_up_steps,
+            ),
+        }
     @torch.no_grad()
     def fold_W_dec_norm(self):
         """
@@ -366,3 +347,8 @@ class JumpReLUTrainingSAE(TrainingSAE):
             threshold = state_dict["threshold"]
             del state_dict["threshold"]
             state_dict["log_threshold"] = torch.log(threshold).detach().contiguous()
+    def to_inference_config_dict(self) -> dict[str, Any]:
+        return filter_valid_dataclass_fields(
+            self.cfg.to_dict(), JumpReLUSAEConfig, ["architecture"]
+        )

sae-lens 6.0.0rc1__py3-none-any.whl → 6.0.0rc2__py3-none-any.whl

sae-lens 6.0.0rc1__py3-none-any.whl → 6.0.0rc2__py3-none-any.whl