PyPI - sae-lens - Versions diffs - 6.0.0rc1__py3-none-any.whl → 6.0.0rc3__py3-none-any.whl - Mend

sae-lens 6.0.0rc1__py3-none-any.whl → 6.0.0rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

sae_lens/__init__.py +55 -18
sae_lens/analysis/hooked_sae_transformer.py +10 -10
sae_lens/analysis/neuronpedia_integration.py +13 -11
sae_lens/cache_activations_runner.py +9 -7
sae_lens/config.py +105 -235
sae_lens/constants.py +20 -0
sae_lens/evals.py +34 -31
sae_lens/{sae_training_runner.py → llm_sae_training_runner.py} +103 -70
sae_lens/load_model.py +53 -5
sae_lens/loading/pretrained_sae_loaders.py +36 -10
sae_lens/registry.py +49 -0
sae_lens/saes/__init__.py +48 -0
sae_lens/saes/gated_sae.py +70 -59
sae_lens/saes/jumprelu_sae.py +58 -72
sae_lens/saes/sae.py +248 -273
sae_lens/saes/standard_sae.py +75 -57
sae_lens/saes/topk_sae.py +72 -83
sae_lens/training/activation_scaler.py +53 -0
sae_lens/training/activations_store.py +105 -184
sae_lens/training/mixing_buffer.py +56 -0
sae_lens/training/optim.py +60 -36
sae_lens/training/sae_trainer.py +134 -158
sae_lens/training/types.py +5 -0
sae_lens/training/upload_saes_to_huggingface.py +11 -5
sae_lens/util.py +47 -0
{sae_lens-6.0.0rc1.dist-info → sae_lens-6.0.0rc3.dist-info}/METADATA +1 -1
sae_lens-6.0.0rc3.dist-info/RECORD +38 -0
{sae_lens-6.0.0rc1.dist-info → sae_lens-6.0.0rc3.dist-info}/WHEEL +1 -1
sae_lens/regsitry.py +0 -34
sae_lens-6.0.0rc1.dist-info/RECORD +0 -32
{sae_lens-6.0.0rc1.dist-info → sae_lens-6.0.0rc3.dist-info}/LICENSE +0 -0

sae_lens/loading/pretrained_sae_loaders.py CHANGED Viewed

@@ -7,11 +7,12 @@ import numpy as np
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError
+from packaging.version import Version
 from safetensors import safe_open
 from safetensors.torch import load_file
 from sae_lens import logger
-from sae_lens.config import (
+from sae_lens.constants import (
     DTYPE_MAP,
     SAE_CFG_FILENAME,
     SAE_WEIGHTS_FILENAME,
@@ -22,6 +23,8 @@ from sae_lens.loading.pretrained_saes_directory import (
     get_pretrained_saes_directory,
     get_repo_id_and_folder_name,
 )
+from sae_lens.registry import get_sae_class
+from sae_lens.util import filter_valid_dataclass_fields
 # loaders take in a release, sae_id, device, and whether to force download, and returns a tuple of config, state_dict, and log sparsity
@@ -174,9 +177,22 @@ def get_sae_lens_config_from_disk(
 def handle_config_defaulting(cfg_dict: dict[str, Any]) -> dict[str, Any]:
+    sae_lens_version = cfg_dict.get("sae_lens_version")
+    if not sae_lens_version and "metadata" in cfg_dict:
+        sae_lens_version = cfg_dict["metadata"].get("sae_lens_version")
+    if not sae_lens_version or Version(sae_lens_version) < Version("6.0.0-rc.0"):
+        cfg_dict = handle_pre_6_0_config(cfg_dict)
+    return cfg_dict
+def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
+    """
+    Format a config dictionary for a Sparse Autoencoder (SAE) to be compatible with the new 6.0 format.
+    """
     rename_keys_map = {
         "hook_point": "hook_name",
-        "hook_point_layer": "hook_layer",
         "hook_point_head_index": "hook_head_index",
         "activation_fn_str": "activation_fn",
     }
@@ -202,10 +218,26 @@ def handle_config_defaulting(cfg_dict: dict[str, Any]) -> dict[str, Any]:
             else "expected_average_only_in"
         )
-    new_cfg.setdefault("normalize_activations", "none")
+    if new_cfg.get("normalize_activations") is None:
+        new_cfg["normalize_activations"] = "none"
     new_cfg.setdefault("device", "cpu")
-    return new_cfg
+    architecture = new_cfg.get("architecture", "standard")
+    config_class = get_sae_class(architecture)[1]
+    sae_cfg_dict = filter_valid_dataclass_fields(new_cfg, config_class)
+    if architecture == "topk":
+        sae_cfg_dict["k"] = new_cfg["activation_fn_kwargs"]["k"]
+    # import here to avoid circular import
+    from sae_lens.saes.sae import SAEMetadata
+    meta_dict = filter_valid_dataclass_fields(new_cfg, SAEMetadata)
+    sae_cfg_dict["metadata"] = meta_dict
+    sae_cfg_dict["architecture"] = architecture
+    return sae_cfg_dict
 def get_connor_rob_hook_z_config_from_hf(
@@ -229,7 +261,6 @@ def get_connor_rob_hook_z_config_from_hf(
         "device": device if device is not None else "cpu",
         "model_name": "gpt2-small",
         "hook_name": old_cfg_dict["act_name"],
-        "hook_layer": old_cfg_dict["layer"],
         "hook_head_index": None,
         "activation_fn": "relu",
         "apply_b_dec_to_input": True,
@@ -378,7 +409,6 @@ def get_gemma_2_config_from_hf(
         "dtype": "float32",
         "model_name": model_name,
         "hook_name": hook_name,
-        "hook_layer": layer,
         "hook_head_index": None,
         "activation_fn": "relu",
         "finetuning_scaling_factor": False,
@@ -491,7 +521,6 @@ def get_llama_scope_config_from_hf(
         "dtype": "bfloat16",
         "model_name": model_name,
         "hook_name": old_cfg_dict["hook_point_in"],
-        "hook_layer": int(old_cfg_dict["hook_point_in"].split(".")[1]),
         "hook_head_index": None,
         "activation_fn": "relu",
         "finetuning_scaling_factor": False,
@@ -618,7 +647,6 @@ def get_dictionary_learning_config_1_from_hf(
         "device": device,
         "model_name": trainer["lm_name"].split("/")[-1],
         "hook_name": hook_point_name,
-        "hook_layer": trainer["layer"],
         "hook_head_index": None,
         "activation_fn": activation_fn,
         "activation_fn_kwargs": activation_fn_kwargs,
@@ -657,7 +685,6 @@ def get_deepseek_r1_config_from_hf(
         "context_size": 1024,
         "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
         "hook_name": f"blocks.{layer}.hook_resid_post",
-        "hook_layer": layer,
         "hook_head_index": None,
         "prepend_bos": True,
         "dataset_path": "lmsys/lmsys-chat-1m",
@@ -816,7 +843,6 @@ def get_llama_scope_r1_distill_config_from_hf(
         "device": device,
         "model_name": model_name,
         "hook_name": huggingface_cfg_dict["hook_point_in"],
-        "hook_layer": int(huggingface_cfg_dict["hook_point_in"].split(".")[1]),
         "hook_head_index": None,
         "activation_fn": "relu",
         "finetuning_scaling_factor": False,

sae_lens/registry.py ADDED Viewed

@@ -0,0 +1,49 @@
+from typing import TYPE_CHECKING, Any
+# avoid circular imports
+if TYPE_CHECKING:
+    from sae_lens.saes.sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
+SAE_CLASS_REGISTRY: dict[str, tuple["type[SAE[Any]]", "type[SAEConfig]"]] = {}
+SAE_TRAINING_CLASS_REGISTRY: dict[
+    str, tuple["type[TrainingSAE[Any]]", "type[TrainingSAEConfig]"]
+] = {}
+def register_sae_class(
+    architecture: str,
+    sae_class: "type[SAE[Any]]",
+    sae_config_class: "type[SAEConfig]",
+) -> None:
+    if architecture in SAE_CLASS_REGISTRY:
+        raise ValueError(
+            f"SAE class for architecture {architecture} already registered."
+        )
+    SAE_CLASS_REGISTRY[architecture] = (sae_class, sae_config_class)
+def register_sae_training_class(
+    architecture: str,
+    sae_training_class: "type[TrainingSAE[Any]]",
+    sae_training_config_class: "type[TrainingSAEConfig]",
+) -> None:
+    if architecture in SAE_TRAINING_CLASS_REGISTRY:
+        raise ValueError(
+            f"SAE training class for architecture {architecture} already registered."
+        )
+    SAE_TRAINING_CLASS_REGISTRY[architecture] = (
+        sae_training_class,
+        sae_training_config_class,
+    )
+def get_sae_class(
+    architecture: str,
+) -> tuple["type[SAE[Any]]", "type[SAEConfig]"]:
+    return SAE_CLASS_REGISTRY[architecture]
+def get_sae_training_class(
+    architecture: str,
+) -> tuple["type[TrainingSAE[Any]]", "type[TrainingSAEConfig]"]:
+    return SAE_TRAINING_CLASS_REGISTRY[architecture]

sae_lens/saes/__init__.py ADDED Viewed

@@ -0,0 +1,48 @@
+from .gated_sae import (
+    GatedSAE,
+    GatedSAEConfig,
+    GatedTrainingSAE,
+    GatedTrainingSAEConfig,
+)
+from .jumprelu_sae import (
+    JumpReLUSAE,
+    JumpReLUSAEConfig,
+    JumpReLUTrainingSAE,
+    JumpReLUTrainingSAEConfig,
+)
+from .sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
+from .standard_sae import (
+    StandardSAE,
+    StandardSAEConfig,
+    StandardTrainingSAE,
+    StandardTrainingSAEConfig,
+)
+from .topk_sae import (
+    TopKSAE,
+    TopKSAEConfig,
+    TopKTrainingSAE,
+    TopKTrainingSAEConfig,
+)
+__all__ = [
+    "SAE",
+    "SAEConfig",
+    "TrainingSAE",
+    "TrainingSAEConfig",
+    "StandardSAE",
+    "StandardSAEConfig",
+    "StandardTrainingSAE",
+    "StandardTrainingSAEConfig",
+    "GatedSAE",
+    "GatedSAEConfig",
+    "GatedTrainingSAE",
+    "GatedTrainingSAEConfig",
+    "JumpReLUSAE",
+    "JumpReLUSAEConfig",
+    "JumpReLUTrainingSAE",
+    "JumpReLUTrainingSAEConfig",
+    "TopKSAE",
+    "TopKSAEConfig",
+    "TopKTrainingSAE",
+    "TopKTrainingSAEConfig",
+]

sae_lens/saes/gated_sae.py CHANGED Viewed

@@ -1,20 +1,36 @@
+from dataclasses import dataclass
 from typing import Any
 import torch
 from jaxtyping import Float
 from numpy.typing import NDArray
 from torch import nn
+from typing_extensions import override
 from sae_lens.saes.sae import (
     SAE,
     SAEConfig,
+    TrainCoefficientConfig,
     TrainingSAE,
     TrainingSAEConfig,
     TrainStepInput,
 )
+from sae_lens.util import filter_valid_dataclass_fields
-class GatedSAE(SAE):
+@dataclass
+class GatedSAEConfig(SAEConfig):
+    """
+    Configuration class for a GatedSAE.
+    """
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "gated"
+class GatedSAE(SAE[GatedSAEConfig]):
     """
     GatedSAE is an inference-only implementation of a Sparse Autoencoder (SAE)
     using a gated linear encoder and a standard linear decoder.
@@ -24,48 +40,15 @@ class GatedSAE(SAE):
     b_mag: nn.Parameter
     r_mag: nn.Parameter
-    def __init__(self, cfg: SAEConfig, use_error_term: bool = False):
+    def __init__(self, cfg: GatedSAEConfig, use_error_term: bool = False):
         super().__init__(cfg, use_error_term)
         # Ensure b_enc does not exist for the gated architecture
         self.b_enc = None
+    @override
     def initialize_weights(self) -> None:
-        """
-        Initialize weights exactly as in the original SAE class for gated architecture.
-        """
-        # Use the same initialization methods and values as in original SAE
-        self.W_enc = nn.Parameter(
-            torch.nn.init.kaiming_uniform_(
-                torch.empty(
-                    self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
-                )
-            )
-        )
-        self.b_gate = nn.Parameter(
-            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-        )
-        # Ensure r_mag is initialized to zero as in original
-        self.r_mag = nn.Parameter(
-            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-        )
-        self.b_mag = nn.Parameter(
-            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-        )
-        # Decoder parameters with same initialization as original
-        self.W_dec = nn.Parameter(
-            torch.nn.init.kaiming_uniform_(
-                torch.empty(
-                    self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-                )
-            )
-        )
-        self.b_dec = nn.Parameter(
-            torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
-        )
-        # after defining b_gate, b_mag, etc.:
-        self.b_enc = None
+        super().initialize_weights()
+        _init_weights_gated(self)
     def encode(
         self, x: Float[torch.Tensor, "... d_in"]
@@ -101,9 +84,8 @@ class GatedSAE(SAE):
           4) If the SAE was reshaping hook_z activations, reshape back.
         """
         # 1) optional finetuning scaling
-        scaled_features = self.apply_finetuning_scaling_factor(feature_acts)
         # 2) linear transform
-        sae_out_pre = scaled_features @ self.W_dec + self.b_dec
+        sae_out_pre = feature_acts @ self.W_dec + self.b_dec
         # 3) hooking and normalization
         sae_out_pre = self.hook_sae_recons(sae_out_pre)
         sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
@@ -129,7 +111,22 @@ class GatedSAE(SAE):
         self.W_dec.data *= norm
-class GatedTrainingSAE(TrainingSAE):
+@dataclass
+class GatedTrainingSAEConfig(TrainingSAEConfig):
+    """
+    Configuration class for training a GatedTrainingSAE.
+    """
+    l1_coefficient: float = 1.0
+    l1_warm_up_steps: int = 0
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "gated"
+class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
     """
     GatedTrainingSAE is a concrete implementation of BaseTrainingSAE for the "gated" SAE architecture.
     It implements:
@@ -145,7 +142,7 @@ class GatedTrainingSAE(TrainingSAE):
     b_mag: nn.Parameter  # type: ignore
     r_mag: nn.Parameter  # type: ignore
-    def __init__(self, cfg: TrainingSAEConfig, use_error_term: bool = False):
+    def __init__(self, cfg: GatedTrainingSAEConfig, use_error_term: bool = False):
         if use_error_term:
             raise ValueError(
                 "GatedSAE does not support `use_error_term`. Please set `use_error_term=False`."
@@ -153,22 +150,8 @@ class GatedTrainingSAE(TrainingSAE):
         super().__init__(cfg, use_error_term)
     def initialize_weights(self) -> None:
-        # Reuse the gating parameter initialization from GatedSAE:
-        GatedSAE.initialize_weights(self)  # type: ignore
-        # Additional training-specific logic, e.g. orthogonal init or heuristics:
-        if self.cfg.decoder_orthogonal_init:
-            self.W_dec.data = nn.init.orthogonal_(self.W_dec.data.T).T
-        elif self.cfg.decoder_heuristic_init:
-            self.W_dec.data = torch.rand(
-                self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-            )
-            self.initialize_decoder_norm_constant_norm()
-        if self.cfg.init_encoder_as_decoder_transpose:
-            self.W_enc.data = self.W_dec.data.T.clone().contiguous()
-        if self.cfg.normalize_sae_decoder:
-            with torch.no_grad():
-                self.set_decoder_norm_to_unit_norm()
+        super().initialize_weights()
+        _init_weights_gated(self)
     def encode_with_hidden_pre(
         self, x: Float[torch.Tensor, "... d_in"]
@@ -217,7 +200,7 @@ class GatedTrainingSAE(TrainingSAE):
         # L1-like penalty scaled by W_dec norms
         l1_loss = (
-            step_input.current_l1_coefficient
+            step_input.coefficients["l1"]
             * torch.sum(pi_gate_act * self.W_dec.norm(dim=1), dim=-1).mean()
         )
@@ -245,3 +228,31 @@ class GatedTrainingSAE(TrainingSAE):
         """Initialize decoder with constant norm"""
         self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
         self.W_dec.data *= norm
+    def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
+        return {
+            "l1": TrainCoefficientConfig(
+                value=self.cfg.l1_coefficient,
+                warm_up_steps=self.cfg.l1_warm_up_steps,
+            ),
+        }
+    def to_inference_config_dict(self) -> dict[str, Any]:
+        return filter_valid_dataclass_fields(
+            self.cfg.to_dict(), GatedSAEConfig, ["architecture"]
+        )
+def _init_weights_gated(
+    sae: SAE[GatedSAEConfig] | TrainingSAE[GatedTrainingSAEConfig],
+) -> None:
+    sae.b_gate = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )
+    # Ensure r_mag is initialized to zero as in original
+    sae.r_mag = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )
+    sae.b_mag = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )

sae_lens/saes/jumprelu_sae.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from typing import Any
 import numpy as np
@@ -9,11 +10,13 @@ from typing_extensions import override
 from sae_lens.saes.sae import (
     SAE,
     SAEConfig,
+    TrainCoefficientConfig,
     TrainingSAE,
     TrainingSAEConfig,
     TrainStepInput,
     TrainStepOutput,
 )
+from sae_lens.util import filter_valid_dataclass_fields
 def rectangle(x: torch.Tensor) -> torch.Tensor:
@@ -85,7 +88,19 @@ class JumpReLU(torch.autograd.Function):
         return x_grad, threshold_grad, None
-class JumpReLUSAE(SAE):
+@dataclass
+class JumpReLUSAEConfig(SAEConfig):
+    """
+    Configuration class for a JumpReLUSAE.
+    """
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "jumprelu"
+class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
     """
     JumpReLUSAE is an inference-only implementation of a Sparse Autoencoder (SAE)
     using a JumpReLU activation. For each unit, if its pre-activation is
@@ -104,42 +119,18 @@ class JumpReLUSAE(SAE):
     b_enc: nn.Parameter
     threshold: nn.Parameter
-    def __init__(self, cfg: SAEConfig, use_error_term: bool = False):
+    def __init__(self, cfg: JumpReLUSAEConfig, use_error_term: bool = False):
         super().__init__(cfg, use_error_term)
+    @override
     def initialize_weights(self) -> None:
-        """
-        Initialize encoder and decoder weights, as well as biases.
-        Additionally, include a learnable `threshold` parameter that
-        determines when units "turn on" for the JumpReLU.
-        """
-        # Biases
-        self.b_enc = nn.Parameter(
-            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-        )
-        self.b_dec = nn.Parameter(
-            torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
-        )
-        # Threshold for JumpReLU
-        # You can pick a default initialization (e.g., zeros means unit is off unless hidden_pre > 0)
-        # or see the training version for more advanced init with log_threshold, etc.
+        super().initialize_weights()
         self.threshold = nn.Parameter(
             torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
         )
-        # Encoder and Decoder weights
-        w_enc_data = torch.empty(
-            self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
-        )
-        nn.init.kaiming_uniform_(w_enc_data)
-        self.W_enc = nn.Parameter(w_enc_data)
-        w_dec_data = torch.empty(
-            self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
+        self.b_enc = nn.Parameter(
+            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
         )
-        nn.init.kaiming_uniform_(w_dec_data)
-        self.W_dec = nn.Parameter(w_dec_data)
     def encode(
         self, x: Float[torch.Tensor, "... d_in"]
@@ -168,8 +159,7 @@ class JumpReLUSAE(SAE):
         Decode the feature activations back to the input space.
         Follows the same steps as StandardSAE: apply scaling, transform, hook, and optionally reshape.
         """
-        scaled_features = self.apply_finetuning_scaling_factor(feature_acts)
-        sae_out_pre = scaled_features @ self.W_dec + self.b_dec
+        sae_out_pre = feature_acts @ self.W_dec + self.b_dec
         sae_out_pre = self.hook_sae_recons(sae_out_pre)
         sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
         return self.reshape_fn_out(sae_out_pre, self.d_head)
@@ -195,7 +185,24 @@ class JumpReLUSAE(SAE):
         self.threshold.data = current_thresh * W_dec_norms
-class JumpReLUTrainingSAE(TrainingSAE):
+@dataclass
+class JumpReLUTrainingSAEConfig(TrainingSAEConfig):
+    """
+    Configuration class for training a JumpReLUTrainingSAE.
+    """
+    jumprelu_init_threshold: float = 0.001
+    jumprelu_bandwidth: float = 0.001
+    l0_coefficient: float = 1.0
+    l0_warm_up_steps: int = 0
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "jumprelu"
+class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
     """
     JumpReLUTrainingSAE is a training-focused implementation of a SAE using a JumpReLU activation.
@@ -213,7 +220,7 @@ class JumpReLUTrainingSAE(TrainingSAE):
     b_enc: nn.Parameter
     log_threshold: nn.Parameter
-    def __init__(self, cfg: TrainingSAEConfig, use_error_term: bool = False):
+    def __init__(self, cfg: JumpReLUTrainingSAEConfig, use_error_term: bool = False):
         super().__init__(cfg, use_error_term)
         # We'll store a bandwidth for the training approach, if needed
@@ -225,51 +232,16 @@ class JumpReLUTrainingSAE(TrainingSAE):
             * np.log(cfg.jumprelu_init_threshold)
         )
+    @override
     def initialize_weights(self) -> None:
         """
         Initialize parameters like the base SAE, but also add log_threshold.
         """
+        super().initialize_weights()
         # Encoder Bias
         self.b_enc = nn.Parameter(
             torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
         )
-        # Decoder Bias
-        self.b_dec = nn.Parameter(
-            torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
-        )
-        # W_enc
-        w_enc_data = torch.nn.init.kaiming_uniform_(
-            torch.empty(
-                self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
-            )
-        )
-        self.W_enc = nn.Parameter(w_enc_data)
-        # W_dec
-        w_dec_data = torch.nn.init.kaiming_uniform_(
-            torch.empty(
-                self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-            )
-        )
-        self.W_dec = nn.Parameter(w_dec_data)
-        # Optionally apply orthogonal or heuristic init
-        if self.cfg.decoder_orthogonal_init:
-            self.W_dec.data = nn.init.orthogonal_(self.W_dec.data.T).T
-        elif self.cfg.decoder_heuristic_init:
-            self.W_dec.data = torch.rand(
-                self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-            )
-            self.initialize_decoder_norm_constant_norm()
-        # Optionally transpose
-        if self.cfg.init_encoder_as_decoder_transpose:
-            self.W_enc.data = self.W_dec.data.T.clone().contiguous()
-        # Optionally normalize columns of W_dec
-        if self.cfg.normalize_sae_decoder:
-            with torch.no_grad():
-                self.set_decoder_norm_to_unit_norm()
     @property
     def threshold(self) -> torch.Tensor:
@@ -305,9 +277,18 @@ class JumpReLUTrainingSAE(TrainingSAE):
     ) -> dict[str, torch.Tensor]:
         """Calculate architecture-specific auxiliary loss terms."""
         l0 = torch.sum(Step.apply(hidden_pre, self.threshold, self.bandwidth), dim=-1)  # type: ignore
-        l0_loss = (step_input.current_l1_coefficient * l0).mean()
+        l0_loss = (step_input.coefficients["l0"] * l0).mean()
         return {"l0_loss": l0_loss}
+    @override
+    def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
+        return {
+            "l0": TrainCoefficientConfig(
+                value=self.cfg.l0_coefficient,
+                warm_up_steps=self.cfg.l0_warm_up_steps,
+            ),
+        }
     @torch.no_grad()
     def fold_W_dec_norm(self):
         """
@@ -366,3 +347,8 @@ class JumpReLUTrainingSAE(TrainingSAE):
             threshold = state_dict["threshold"]
             del state_dict["threshold"]
             state_dict["log_threshold"] = torch.log(threshold).detach().contiguous()
+    def to_inference_config_dict(self) -> dict[str, Any]:
+        return filter_valid_dataclass_fields(
+            self.cfg.to_dict(), JumpReLUSAEConfig, ["architecture"]
+        )

sae-lens 6.0.0rc1__py3-none-any.whl → 6.0.0rc3__py3-none-any.whl

sae-lens 6.0.0rc1__py3-none-any.whl → 6.0.0rc3__py3-none-any.whl