sae-lens 5.11.0__py3-none-any.whl → 6.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sae_lens/__init__.py +60 -7
- sae_lens/analysis/hooked_sae_transformer.py +12 -12
- sae_lens/analysis/neuronpedia_integration.py +16 -14
- sae_lens/cache_activations_runner.py +9 -7
- sae_lens/config.py +170 -258
- sae_lens/constants.py +21 -0
- sae_lens/evals.py +59 -44
- sae_lens/llm_sae_training_runner.py +377 -0
- sae_lens/load_model.py +52 -4
- sae_lens/{toolkit → loading}/pretrained_sae_loaders.py +85 -32
- sae_lens/registry.py +49 -0
- sae_lens/saes/__init__.py +48 -0
- sae_lens/saes/gated_sae.py +254 -0
- sae_lens/saes/jumprelu_sae.py +348 -0
- sae_lens/saes/sae.py +1076 -0
- sae_lens/saes/standard_sae.py +178 -0
- sae_lens/saes/topk_sae.py +300 -0
- sae_lens/training/activation_scaler.py +53 -0
- sae_lens/training/activations_store.py +103 -184
- sae_lens/training/mixing_buffer.py +56 -0
- sae_lens/training/optim.py +60 -36
- sae_lens/training/sae_trainer.py +155 -177
- sae_lens/training/types.py +5 -0
- sae_lens/training/upload_saes_to_huggingface.py +13 -7
- sae_lens/util.py +47 -0
- {sae_lens-5.11.0.dist-info → sae_lens-6.0.0.dist-info}/METADATA +1 -1
- sae_lens-6.0.0.dist-info/RECORD +37 -0
- sae_lens/sae.py +0 -747
- sae_lens/sae_training_runner.py +0 -251
- sae_lens/training/geometric_median.py +0 -101
- sae_lens/training/training_sae.py +0 -710
- sae_lens-5.11.0.dist-info/RECORD +0 -28
- /sae_lens/{toolkit → loading}/__init__.py +0 -0
- /sae_lens/{toolkit → loading}/pretrained_saes_directory.py +0 -0
- {sae_lens-5.11.0.dist-info → sae_lens-6.0.0.dist-info}/LICENSE +0 -0
- {sae_lens-5.11.0.dist-info → sae_lens-6.0.0.dist-info}/WHEEL +0 -0
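The file listing above amounts to a package restructuring: the single modules sae_lens/sae.py and sae_lens/training/training_sae.py are replaced by a sae_lens.saes subpackage with one module per architecture, and sae_lens.toolkit becomes sae_lens.loading. Below is a hedged before/after import sketch; the old 5.x paths are inferred from the removed files, and the exact top-level re-exports of the new sae_lens/__init__.py are not shown in this diff section.

# Old 5.11.0 import paths (per the removed/renamed files above):
#   from sae_lens.toolkit.pretrained_saes_directory import get_pretrained_saes_directory
#   from sae_lens.sae import SAE
#
# New 6.0.0 paths (assumes sae-lens 6.0.0 is installed):
from sae_lens.loading.pretrained_saes_directory import get_pretrained_saes_directory
from sae_lens.saes import SAE, SAEConfig, TrainingSAE

print(SAE.__module__)  # sae_lens.saes.sae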
sae_lens/load_model.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Literal, cast
+from typing import Any, Callable, Literal, cast
 
 import torch
 from transformer_lens import HookedTransformer
@@ -77,6 +77,7 @@ class HookedProxyLM(HookedRootModule):
     # copied and modified from base HookedRootModule
     def setup(self):
         self.mod_dict = {}
+        self.named_modules_dict = {}
         self.hook_dict: dict[str, HookPoint] = {}
         for name, module in self.model.named_modules():
             if name == "":
@@ -89,14 +90,21 @@ class HookedProxyLM(HookedRootModule):
 
             self.hook_dict[name] = hook_point
             self.mod_dict[name] = hook_point
+            self.named_modules_dict[name] = module
+
+    def run_with_cache(self, *args: Any, **kwargs: Any):  # type: ignore
+        if "names_filter" in kwargs:
+            # hacky way to make sure that the names_filter is passed to our forward method
+            kwargs["_names_filter"] = kwargs["names_filter"]
+        return super().run_with_cache(*args, **kwargs)
 
     def forward(
         self,
         tokens: torch.Tensor,
         return_type: Literal["both", "logits"] = "logits",
         loss_per_token: bool = False,
-        # TODO: implement real support for stop_at_layer
        stop_at_layer: int | None = None,
+        _names_filter: list[str] | None = None,
        **kwargs: Any,
    ) -> Output | Loss:
        # This is just what's needed for evals, not everything that HookedTransformer has
@@ -107,8 +115,28 @@ class HookedProxyLM(HookedRootModule):
            raise NotImplementedError(
                "Only return_type supported is 'both' or 'logits' to match what's in evals.py and ActivationsStore"
            )
-
-
+
+        stop_hooks = []
+        if stop_at_layer is not None and _names_filter is not None:
+            if return_type != "logits":
+                raise NotImplementedError(
+                    "stop_at_layer is not supported for return_type='both'"
+                )
+            stop_manager = StopManager(_names_filter)
+
+            for hook_name in _names_filter:
+                module = self.named_modules_dict[hook_name]
+                stop_fn = stop_manager.get_stop_hook_fn(hook_name)
+                stop_hooks.append(module.register_forward_hook(stop_fn))
+        try:
+            output = self.model(tokens)
+            logits = _extract_logits_from_output(output)
+        except StopForward:
+            # If we stop early, we don't care about the return output
+            return None  # type: ignore
+        finally:
+            for stop_hook in stop_hooks:
+                stop_hook.remove()
 
        if return_type == "logits":
            return logits
@@ -183,3 +211,23 @@ def get_hook_fn(hook_point: HookPoint):
        return output
 
    return hook_fn
+
+
+class StopForward(Exception):
+    pass
+
+
+class StopManager:
+    def __init__(self, hook_names: list[str]):
+        self.hook_names = hook_names
+        self.total_hook_names = len(set(hook_names))
+        self.called_hook_names = set()
+
+    def get_stop_hook_fn(self, hook_name: str) -> Callable[[Any, Any, Any], Any]:
+        def stop_hook_fn(module: Any, input: Any, output: Any) -> Any:  # noqa: ARG001
+            self.called_hook_names.add(hook_name)
+            if len(self.called_hook_names) == self.total_hook_names:
+                raise StopForward()
+            return output
+
+        return stop_hook_fn
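The early-stopping mechanism added above relies only on standard PyTorch forward hooks, so its behaviour can be shown in isolation. The sketch below is a minimal, self-contained illustration of the same pattern; TinyModel and its layer names are made up for the example and are not part of sae-lens. Once the watched module has fired, a StopForward exception aborts the rest of the forward pass.

# Minimal, self-contained sketch of the StopForward / forward-hook pattern.
import torch
from torch import nn


class StopForward(Exception):
    pass


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(4, 4)
        self.layer2 = nn.Linear(4, 4)  # never reached once we stop at layer1

    def forward(self, x):
        return self.layer2(self.layer1(x))


model = TinyModel()
captured = {}


def stop_hook(module, inputs, output):
    captured["layer1"] = output
    raise StopForward()  # abort the remaining forward pass


handle = model.layer1.register_forward_hook(stop_hook)
try:
    model(torch.randn(2, 4))
except StopForward:
    pass  # expected: we only wanted layer1's output
finally:
    handle.remove()

print(captured["layer1"].shape)  # torch.Size([2, 4])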
sae_lens/{toolkit → loading}/pretrained_sae_loaders.py
CHANGED
@@ -7,22 +7,41 @@ import numpy as np
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError
+from packaging.version import Version
 from safetensors import safe_open
 from safetensors.torch import load_file
 
 from sae_lens import logger
-from sae_lens.
+from sae_lens.constants import (
     DTYPE_MAP,
     SAE_CFG_FILENAME,
     SAE_WEIGHTS_FILENAME,
     SPARSIFY_WEIGHTS_FILENAME,
     SPARSITY_FILENAME,
 )
-from sae_lens.
+from sae_lens.loading.pretrained_saes_directory import (
     get_config_overrides,
     get_pretrained_saes_directory,
     get_repo_id_and_folder_name,
 )
+from sae_lens.registry import get_sae_class
+from sae_lens.util import filter_valid_dataclass_fields
+
+LLM_METADATA_KEYS = {
+    "model_name",
+    "hook_name",
+    "model_class_name",
+    "hook_head_index",
+    "model_from_pretrained_kwargs",
+    "prepend_bos",
+    "exclude_special_tokens",
+    "neuronpedia_id",
+    "context_size",
+    "seqpos_slice",
+    "dataset_path",
+    "sae_lens_version",
+    "sae_lens_training_version",
+}
 
 
 # loaders take in a release, sae_id, device, and whether to force download, and returns a tuple of config, state_dict, and log sparsity
@@ -175,30 +194,69 @@ def get_sae_lens_config_from_disk(
 
 
 def handle_config_defaulting(cfg_dict: dict[str, Any]) -> dict[str, Any]:
+    sae_lens_version = cfg_dict.get("sae_lens_version")
+    if not sae_lens_version and "metadata" in cfg_dict:
+        sae_lens_version = cfg_dict["metadata"].get("sae_lens_version")
+
+    if not sae_lens_version or Version(sae_lens_version) < Version("6.0.0-rc.0"):
+        cfg_dict = handle_pre_6_0_config(cfg_dict)
+    return cfg_dict
+
+
+def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
+    """
+    Format a config dictionary for a Sparse Autoencoder (SAE) to be compatible with the new 6.0 format.
+    """
+
+    rename_keys_map = {
+        "hook_point": "hook_name",
+        "hook_point_head_index": "hook_head_index",
+        "activation_fn_str": "activation_fn",
+    }
+    new_cfg = {rename_keys_map.get(k, k): v for k, v in cfg_dict.items()}
+
     # Set default values for backwards compatibility
-
-
-
-
-
-
-
-
-
-
-
+    new_cfg.setdefault("prepend_bos", True)
+    new_cfg.setdefault("dataset_trust_remote_code", True)
+    new_cfg.setdefault("apply_b_dec_to_input", True)
+    new_cfg.setdefault("finetuning_scaling_factor", False)
+    new_cfg.setdefault("sae_lens_training_version", None)
+    new_cfg.setdefault("activation_fn", new_cfg.get("activation_fn", "relu"))
+    new_cfg.setdefault("architecture", "standard")
+    new_cfg.setdefault("neuronpedia_id", None)
+    new_cfg.setdefault(
+        "reshape_activations",
+        "hook_z" if "hook_z" in new_cfg.get("hook_name", "") else "none",
+    )
+
+    if "normalize_activations" in new_cfg and isinstance(
+        new_cfg["normalize_activations"], bool
    ):
        # backwards compatibility
-
+        new_cfg["normalize_activations"] = (
            "none"
-            if not
+            if not new_cfg["normalize_activations"]
            else "expected_average_only_in"
        )
 
-
-
+    if new_cfg.get("normalize_activations") is None:
+        new_cfg["normalize_activations"] = "none"
 
-
+    new_cfg.setdefault("device", "cpu")
+
+    architecture = new_cfg.get("architecture", "standard")
+
+    config_class = get_sae_class(architecture)[1]
+
+    sae_cfg_dict = filter_valid_dataclass_fields(new_cfg, config_class)
+    if architecture == "topk" and "activation_fn_kwargs" in new_cfg:
+        sae_cfg_dict["k"] = new_cfg["activation_fn_kwargs"]["k"]
+
+    sae_cfg_dict["metadata"] = {
+        k: v for k, v in new_cfg.items() if k in LLM_METADATA_KEYS
+    }
+    sae_cfg_dict["architecture"] = architecture
+    return sae_cfg_dict
 
 
 def get_connor_rob_hook_z_config_from_hf(
@@ -222,9 +280,8 @@ def get_connor_rob_hook_z_config_from_hf(
         "device": device if device is not None else "cpu",
         "model_name": "gpt2-small",
         "hook_name": old_cfg_dict["act_name"],
-        "hook_layer": old_cfg_dict["layer"],
         "hook_head_index": None,
-        "
+        "activation_fn": "relu",
         "apply_b_dec_to_input": True,
         "finetuning_scaling_factor": False,
         "sae_lens_training_version": None,
@@ -233,6 +290,7 @@ def get_connor_rob_hook_z_config_from_hf(
         "context_size": 128,
         "normalize_activations": "none",
         "dataset_trust_remote_code": True,
+        "reshape_activations": "hook_z",
         **(cfg_overrides or {}),
     }
 
@@ -371,9 +429,8 @@ def get_gemma_2_config_from_hf(
         "dtype": "float32",
         "model_name": model_name,
         "hook_name": hook_name,
-        "hook_layer": layer,
         "hook_head_index": None,
-        "
+        "activation_fn": "relu",
         "finetuning_scaling_factor": False,
         "sae_lens_training_version": None,
         "prepend_bos": True,
@@ -493,9 +550,8 @@ def get_llama_scope_config_from_hf(
         "dtype": "bfloat16",
         "model_name": model_name,
         "hook_name": old_cfg_dict["hook_point_in"],
-        "hook_layer": int(old_cfg_dict["hook_point_in"].split(".")[1]),
         "hook_head_index": None,
-        "
+        "activation_fn": "relu",
         "finetuning_scaling_factor": False,
         "sae_lens_training_version": None,
         "prepend_bos": True,
@@ -607,8 +663,8 @@ def get_dictionary_learning_config_1_from_hf(
 
     hook_point_name = f"blocks.{trainer['layer']}.hook_resid_post"
 
-
-    activation_fn_kwargs = {"k": trainer["k"]} if
+    activation_fn = "topk" if trainer["dict_class"] == "AutoEncoderTopK" else "relu"
+    activation_fn_kwargs = {"k": trainer["k"]} if activation_fn == "topk" else {}
 
     return {
         "architecture": (
@@ -620,9 +676,8 @@ def get_dictionary_learning_config_1_from_hf(
         "device": device,
         "model_name": trainer["lm_name"].split("/")[-1],
         "hook_name": hook_point_name,
-        "hook_layer": trainer["layer"],
         "hook_head_index": None,
-        "
+        "activation_fn": activation_fn,
         "activation_fn_kwargs": activation_fn_kwargs,
         "apply_b_dec_to_input": True,
         "finetuning_scaling_factor": False,
@@ -659,13 +714,12 @@ def get_deepseek_r1_config_from_hf(
         "context_size": 1024,
         "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
         "hook_name": f"blocks.{layer}.hook_resid_post",
-        "hook_layer": layer,
         "hook_head_index": None,
         "prepend_bos": True,
         "dataset_path": "lmsys/lmsys-chat-1m",
         "dataset_trust_remote_code": True,
         "sae_lens_training_version": None,
-        "
+        "activation_fn": "relu",
         "normalize_activations": "none",
         "device": device,
         "apply_b_dec_to_input": False,
@@ -818,9 +872,8 @@ def get_llama_scope_r1_distill_config_from_hf(
         "device": device,
         "model_name": model_name,
         "hook_name": huggingface_cfg_dict["hook_point_in"],
-        "hook_layer": int(huggingface_cfg_dict["hook_point_in"].split(".")[1]),
         "hook_head_index": None,
-        "
+        "activation_fn": "relu",
         "finetuning_scaling_factor": False,
         "sae_lens_training_version": None,
         "prepend_bos": True,
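handle_pre_6_0_config above captures the migration pattern for old configs: rename legacy keys, back-fill defaults, then split the result into SAE config fields plus a metadata block. The self-contained sketch below mirrors only the rename/setdefault step on a plain dict; the legacy config values are invented for illustration and it does not import sae-lens.

# Minimal sketch of the pre-6.0 config migration pattern (rename + defaults).
legacy_cfg = {
    "hook_point": "blocks.8.hook_resid_pre",
    "hook_point_head_index": None,
    "activation_fn_str": "relu",
    "d_in": 768,
    "d_sae": 24576,
}

rename_keys_map = {
    "hook_point": "hook_name",
    "hook_point_head_index": "hook_head_index",
    "activation_fn_str": "activation_fn",
}
new_cfg = {rename_keys_map.get(k, k): v for k, v in legacy_cfg.items()}

# Back-fill defaults that pre-6.0 configs may be missing.
new_cfg.setdefault("architecture", "standard")
new_cfg.setdefault("normalize_activations", "none")
new_cfg.setdefault("device", "cpu")

print(new_cfg["hook_name"])     # blocks.8.hook_resid_pre
print(new_cfg["architecture"])  # standard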
sae_lens/registry.py
ADDED
@@ -0,0 +1,49 @@
+from typing import TYPE_CHECKING, Any
+
+# avoid circular imports
+if TYPE_CHECKING:
+    from sae_lens.saes.sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
+
+SAE_CLASS_REGISTRY: dict[str, tuple["type[SAE[Any]]", "type[SAEConfig]"]] = {}
+SAE_TRAINING_CLASS_REGISTRY: dict[
+    str, tuple["type[TrainingSAE[Any]]", "type[TrainingSAEConfig]"]
+] = {}
+
+
+def register_sae_class(
+    architecture: str,
+    sae_class: "type[SAE[Any]]",
+    sae_config_class: "type[SAEConfig]",
+) -> None:
+    if architecture in SAE_CLASS_REGISTRY:
+        raise ValueError(
+            f"SAE class for architecture {architecture} already registered."
+        )
+    SAE_CLASS_REGISTRY[architecture] = (sae_class, sae_config_class)
+
+
+def register_sae_training_class(
+    architecture: str,
+    sae_training_class: "type[TrainingSAE[Any]]",
+    sae_training_config_class: "type[TrainingSAEConfig]",
+) -> None:
+    if architecture in SAE_TRAINING_CLASS_REGISTRY:
+        raise ValueError(
+            f"SAE training class for architecture {architecture} already registered."
+        )
+    SAE_TRAINING_CLASS_REGISTRY[architecture] = (
+        sae_training_class,
+        sae_training_config_class,
+    )
+
+
+def get_sae_class(
+    architecture: str,
+) -> tuple["type[SAE[Any]]", "type[SAEConfig]"]:
+    return SAE_CLASS_REGISTRY[architecture]
+
+
+def get_sae_training_class(
+    architecture: str,
+) -> tuple["type[TrainingSAE[Any]]", "type[TrainingSAEConfig]"]:
+    return SAE_TRAINING_CLASS_REGISTRY[architecture]
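A hedged usage sketch of the new registry follows. It assumes sae-lens 6.0.0 is installed and that the built-in architectures register themselves when the package is imported, which this diff section does not show explicitly. Lookups are plain dict reads, so an unknown architecture name raises KeyError.

# Hedged sketch: look up the class/config pair for a registered architecture.
from sae_lens.registry import get_sae_class, get_sae_training_class

sae_cls, sae_cfg_cls = get_sae_class("gated")
train_cls, train_cfg_cls = get_sae_training_class("gated")
print(sae_cls.__name__, sae_cfg_cls.__name__)  # expected: GatedSAE GatedSAEConfig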
sae_lens/saes/__init__.py
ADDED
@@ -0,0 +1,48 @@
+from .gated_sae import (
+    GatedSAE,
+    GatedSAEConfig,
+    GatedTrainingSAE,
+    GatedTrainingSAEConfig,
+)
+from .jumprelu_sae import (
+    JumpReLUSAE,
+    JumpReLUSAEConfig,
+    JumpReLUTrainingSAE,
+    JumpReLUTrainingSAEConfig,
+)
+from .sae import SAE, SAEConfig, TrainingSAE, TrainingSAEConfig
+from .standard_sae import (
+    StandardSAE,
+    StandardSAEConfig,
+    StandardTrainingSAE,
+    StandardTrainingSAEConfig,
+)
+from .topk_sae import (
+    TopKSAE,
+    TopKSAEConfig,
+    TopKTrainingSAE,
+    TopKTrainingSAEConfig,
+)
+
+__all__ = [
+    "SAE",
+    "SAEConfig",
+    "TrainingSAE",
+    "TrainingSAEConfig",
+    "StandardSAE",
+    "StandardSAEConfig",
+    "StandardTrainingSAE",
+    "StandardTrainingSAEConfig",
+    "GatedSAE",
+    "GatedSAEConfig",
+    "GatedTrainingSAE",
+    "GatedTrainingSAEConfig",
+    "JumpReLUSAE",
+    "JumpReLUSAEConfig",
+    "JumpReLUTrainingSAE",
+    "JumpReLUTrainingSAEConfig",
+    "TopKSAE",
+    "TopKSAEConfig",
+    "TopKTrainingSAE",
+    "TopKTrainingSAEConfig",
+]
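For orientation, the short sketch below exercises only what this diff section shows: the new sae_lens.saes package re-exports each architecture's SAE and config classes, and GatedSAEConfig reports its architecture string. It assumes sae-lens 6.0.0 is installed.

# Assumes sae-lens 6.0.0 is installed; uses only names exported above.
from sae_lens.saes import SAE, GatedSAE, GatedSAEConfig

print(GatedSAEConfig.architecture())  # "gated"
print(issubclass(GatedSAE, SAE))      # True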
sae_lens/saes/gated_sae.py
ADDED
@@ -0,0 +1,254 @@
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+from jaxtyping import Float
+from numpy.typing import NDArray
+from torch import nn
+from typing_extensions import override
+
+from sae_lens.saes.sae import (
+    SAE,
+    SAEConfig,
+    TrainCoefficientConfig,
+    TrainingSAE,
+    TrainingSAEConfig,
+    TrainStepInput,
+)
+from sae_lens.util import filter_valid_dataclass_fields
+
+
+@dataclass
+class GatedSAEConfig(SAEConfig):
+    """
+    Configuration class for a GatedSAE.
+    """
+
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "gated"
+
+
+class GatedSAE(SAE[GatedSAEConfig]):
+    """
+    GatedSAE is an inference-only implementation of a Sparse Autoencoder (SAE)
+    using a gated linear encoder and a standard linear decoder.
+    """
+
+    b_gate: nn.Parameter
+    b_mag: nn.Parameter
+    r_mag: nn.Parameter
+
+    def __init__(self, cfg: GatedSAEConfig, use_error_term: bool = False):
+        super().__init__(cfg, use_error_term)
+        # Ensure b_enc does not exist for the gated architecture
+        self.b_enc = None
+
+    @override
+    def initialize_weights(self) -> None:
+        super().initialize_weights()
+        _init_weights_gated(self)
+
+    def encode(
+        self, x: Float[torch.Tensor, "... d_in"]
+    ) -> Float[torch.Tensor, "... d_sae"]:
+        """
+        Encode the input tensor into the feature space using a gated encoder.
+        This must match the original encode_gated implementation from SAE class.
+        """
+        # Preprocess the SAE input (casting type, applying hooks, normalization)
+        sae_in = self.process_sae_in(x)
+
+        # Gating path exactly as in original SAE.encode_gated
+        gating_pre_activation = sae_in @ self.W_enc + self.b_gate
+        active_features = (gating_pre_activation > 0).to(self.dtype)
+
+        # Magnitude path (weight sharing with gated encoder)
+        magnitude_pre_activation = self.hook_sae_acts_pre(
+            sae_in @ (self.W_enc * self.r_mag.exp()) + self.b_mag
+        )
+        feature_magnitudes = self.activation_fn(magnitude_pre_activation)
+
+        # Combine gating and magnitudes
+        return self.hook_sae_acts_post(active_features * feature_magnitudes)
+
+    def decode(
+        self, feature_acts: Float[torch.Tensor, "... d_sae"]
+    ) -> Float[torch.Tensor, "... d_in"]:
+        """
+        Decode the feature activations back into the input space:
+        1) Apply optional finetuning scaling.
+        2) Linear transform plus bias.
+        3) Run any reconstruction hooks and out-normalization if configured.
+        4) If the SAE was reshaping hook_z activations, reshape back.
+        """
+        # 1) optional finetuning scaling
+        # 2) linear transform
+        sae_out_pre = feature_acts @ self.W_dec + self.b_dec
+        # 3) hooking and normalization
+        sae_out_pre = self.hook_sae_recons(sae_out_pre)
+        sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
+        # 4) reshape if needed (hook_z)
+        return self.reshape_fn_out(sae_out_pre, self.d_head)
+
+    @torch.no_grad()
+    def fold_W_dec_norm(self):
+        """Override to handle gated-specific parameters."""
+        W_dec_norms = self.W_dec.norm(dim=-1).unsqueeze(1)
+        self.W_dec.data = self.W_dec.data / W_dec_norms
+        self.W_enc.data = self.W_enc.data * W_dec_norms.T
+
+        # Gated-specific parameters need special handling
+        self.r_mag.data = self.r_mag.data * W_dec_norms.squeeze()
+        self.b_gate.data = self.b_gate.data * W_dec_norms.squeeze()
+        self.b_mag.data = self.b_mag.data * W_dec_norms.squeeze()
+
+    @torch.no_grad()
+    def initialize_decoder_norm_constant_norm(self, norm: float = 0.1):
+        """Initialize decoder with constant norm."""
+        self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
+        self.W_dec.data *= norm
+
+
+@dataclass
+class GatedTrainingSAEConfig(TrainingSAEConfig):
+    """
+    Configuration class for training a GatedTrainingSAE.
+    """
+
+    l1_coefficient: float = 1.0
+    l1_warm_up_steps: int = 0
+
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "gated"
+
+
+class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
+    """
+    GatedTrainingSAE is a concrete implementation of BaseTrainingSAE for the "gated" SAE architecture.
+    It implements:
+    - initialize_weights: sets up gating parameters (as in GatedSAE) plus optional training-specific init.
+    - encode: calls encode_with_hidden_pre (standard training approach).
+    - decode: linear transformation + hooking, same as GatedSAE or StandardTrainingSAE.
+    - encode_with_hidden_pre: gating logic + optional noise injection for training.
+    - calculate_aux_loss: includes an auxiliary reconstruction path and gating-based sparsity penalty.
+    - training_forward_pass: calls encode_with_hidden_pre, decode, and sums up MSE + gating losses.
+    """
+
+    b_gate: nn.Parameter  # type: ignore
+    b_mag: nn.Parameter  # type: ignore
+    r_mag: nn.Parameter  # type: ignore
+
+    def __init__(self, cfg: GatedTrainingSAEConfig, use_error_term: bool = False):
+        if use_error_term:
+            raise ValueError(
+                "GatedSAE does not support `use_error_term`. Please set `use_error_term=False`."
+            )
+        super().__init__(cfg, use_error_term)
+
+    def initialize_weights(self) -> None:
+        super().initialize_weights()
+        _init_weights_gated(self)
+
+    def encode_with_hidden_pre(
+        self, x: Float[torch.Tensor, "... d_in"]
+    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        """
+        Gated forward pass with pre-activation (for training).
+        We also inject noise if self.training is True.
+        """
+        sae_in = self.process_sae_in(x)
+
+        # Gating path
+        gating_pre_activation = sae_in @ self.W_enc + self.b_gate
+        active_features = (gating_pre_activation > 0).to(self.dtype)
+
+        # Magnitude path
+        magnitude_pre_activation = sae_in @ (self.W_enc * self.r_mag.exp()) + self.b_mag
+        magnitude_pre_activation = self.hook_sae_acts_pre(magnitude_pre_activation)
+
+        feature_magnitudes = self.activation_fn(magnitude_pre_activation)
+
+        # Combine gating path and magnitude path
+        feature_acts = self.hook_sae_acts_post(active_features * feature_magnitudes)
+
+        # Return both the final feature activations and the pre-activation (for logging or penalty)
+        return feature_acts, magnitude_pre_activation
+
+    def calculate_aux_loss(
+        self,
+        step_input: TrainStepInput,
+        feature_acts: torch.Tensor,
+        hidden_pre: torch.Tensor,
+        sae_out: torch.Tensor,
+    ) -> dict[str, torch.Tensor]:
+        # Re-center the input if apply_b_dec_to_input is set
+        sae_in_centered = step_input.sae_in - (
+            self.b_dec * self.cfg.apply_b_dec_to_input
+        )
+
+        # The gating pre-activation (pi_gate) for the auxiliary path
+        pi_gate = sae_in_centered @ self.W_enc + self.b_gate
+        pi_gate_act = torch.relu(pi_gate)
+
+        # L1-like penalty scaled by W_dec norms
+        l1_loss = (
+            step_input.coefficients["l1"]
+            * torch.sum(pi_gate_act * self.W_dec.norm(dim=1), dim=-1).mean()
+        )
+
+        # Aux reconstruction: reconstruct x purely from gating path
+        via_gate_reconstruction = pi_gate_act @ self.W_dec + self.b_dec
+        aux_recon_loss = (
+            (via_gate_reconstruction - step_input.sae_in).pow(2).sum(dim=-1).mean()
+        )
+
+        # Return both losses separately
+        return {"l1_loss": l1_loss, "auxiliary_reconstruction_loss": aux_recon_loss}
+
+    def log_histograms(self) -> dict[str, NDArray[Any]]:
+        """Log histograms of the weights and biases."""
+        b_gate_dist = self.b_gate.detach().float().cpu().numpy()
+        b_mag_dist = self.b_mag.detach().float().cpu().numpy()
+        return {
+            **super().log_histograms(),
+            "weights/b_gate": b_gate_dist,
+            "weights/b_mag": b_mag_dist,
+        }
+
+    @torch.no_grad()
+    def initialize_decoder_norm_constant_norm(self, norm: float = 0.1):
+        """Initialize decoder with constant norm"""
+        self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
+        self.W_dec.data *= norm
+
+    def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
+        return {
+            "l1": TrainCoefficientConfig(
+                value=self.cfg.l1_coefficient,
+                warm_up_steps=self.cfg.l1_warm_up_steps,
+            ),
+        }
+
+    def to_inference_config_dict(self) -> dict[str, Any]:
+        return filter_valid_dataclass_fields(
+            self.cfg.to_dict(), GatedSAEConfig, ["architecture"]
+        )
+
+
+def _init_weights_gated(
+    sae: SAE[GatedSAEConfig] | TrainingSAE[GatedTrainingSAEConfig],
+) -> None:
+    sae.b_gate = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )
+    # Ensure r_mag is initialized to zero as in original
+    sae.r_mag = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )
+    sae.b_mag = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )
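The gated encoder above combines two linear paths that share W_enc: a binary gate decides which features are active, and a magnitude path rescaled per feature by exp(r_mag) decides how strong they are. The standalone sketch below reproduces just that arithmetic with plain tensors; the dimensions and random weights are made up for the example, and the SAE hooks and normalization are omitted.

# Standalone illustration of the gated encode/decode arithmetic (no sae-lens dependency).
import torch

d_in, d_sae = 16, 64
x = torch.randn(8, d_in)

W_enc = torch.randn(d_in, d_sae) * 0.1
W_dec = torch.randn(d_sae, d_in) * 0.1
b_gate = torch.zeros(d_sae)
b_mag = torch.zeros(d_sae)
r_mag = torch.zeros(d_sae)
b_dec = torch.zeros(d_in)

# Gating path: decides which features are on.
gate_pre = x @ W_enc + b_gate
active = (gate_pre > 0).float()

# Magnitude path: shares W_enc, rescaled per-feature by exp(r_mag).
mag_pre = x @ (W_enc * r_mag.exp()) + b_mag
magnitudes = torch.relu(mag_pre)

feature_acts = active * magnitudes            # gated feature activations
reconstruction = feature_acts @ W_dec + b_dec  # decoder: linear map plus bias

print(feature_acts.shape, reconstruction.shape)  # torch.Size([8, 64]) torch.Size([8, 16])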