sae-lens 6.0.0rc5__py3-none-any.whl → 6.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sae_lens/__init__.py CHANGED
@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.0.0-rc.5"
+__version__ = "6.2.0"
 
 import logging
 
@@ -7,6 +7,8 @@ logger = logging.getLogger(__name__)
 
 from sae_lens.saes import (
     SAE,
+    BatchTopKTrainingSAE,
+    BatchTopKTrainingSAEConfig,
     GatedSAE,
     GatedSAEConfig,
     GatedTrainingSAE,
@@ -85,6 +87,8 @@ __all__ = [
     "JumpReLUTrainingSAEConfig",
     "SAETrainingRunner",
     "LoggingConfig",
+    "BatchTopKTrainingSAE",
+    "BatchTopKTrainingSAEConfig",
 ]
 
 
@@ -96,3 +100,6 @@ register_sae_class("topk", TopKSAE, TopKSAEConfig)
 register_sae_training_class("topk", TopKTrainingSAE, TopKTrainingSAEConfig)
 register_sae_class("jumprelu", JumpReLUSAE, JumpReLUSAEConfig)
 register_sae_training_class("jumprelu", JumpReLUTrainingSAE, JumpReLUTrainingSAEConfig)
+register_sae_training_class(
+    "batchtopk", BatchTopKTrainingSAE, BatchTopKTrainingSAEConfig
+)
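
For context (not part of the diff): with the "batchtopk" registration above, the new architecture is constructed like any other registered training SAE. A minimal sketch, assuming d_in and d_sae are fields on the base training config and the remaining fields have workable defaults; the sizes are illustrative:

    from sae_lens import BatchTopKTrainingSAE, BatchTopKTrainingSAEConfig

    # illustrative sizes; k and topk_threshold_lr are the fields this release adds
    cfg = BatchTopKTrainingSAEConfig(d_in=768, d_sae=16384, k=64, topk_threshold_lr=0.01)
    sae = BatchTopKTrainingSAE(cfg)
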
sae_lens/config.py CHANGED
@@ -1,6 +1,5 @@
 import json
 import math
-import os
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar, cast
@@ -353,28 +352,6 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
         d["act_store_device"] = str(self.act_store_device)
         return d
 
-    def to_json(self, path: str) -> None:
-        if not os.path.exists(os.path.dirname(path)):
-            os.makedirs(os.path.dirname(path))
-
-        with open(path + "cfg.json", "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
-
-    @classmethod
-    def from_json(cls, path: str) -> "LanguageModelSAERunnerConfig[Any]":
-        with open(path + "cfg.json") as f:
-            cfg = json.load(f)
-
-        # ensure that seqpos slices is a tuple
-        # Ensure seqpos_slice is a tuple
-        if "seqpos_slice" in cfg:
-            if isinstance(cfg["seqpos_slice"], list):
-                cfg["seqpos_slice"] = tuple(cfg["seqpos_slice"])
-            elif not isinstance(cfg["seqpos_slice"], tuple):
-                cfg["seqpos_slice"] = (cfg["seqpos_slice"],)
-
-        return cls(**cfg)
-
     def to_sae_trainer_config(self) -> "SAETrainerConfig":
         return SAETrainerConfig(
             n_checkpoints=self.n_checkpoints,
sae_lens/saes/__init__.py CHANGED
@@ -1,3 +1,7 @@
+from .batchtopk_sae import (
+    BatchTopKTrainingSAE,
+    BatchTopKTrainingSAEConfig,
+)
 from .gated_sae import (
     GatedSAE,
     GatedSAEConfig,
@@ -45,4 +49,6 @@ __all__ = [
     "TopKSAEConfig",
     "TopKTrainingSAE",
     "TopKTrainingSAEConfig",
+    "BatchTopKTrainingSAE",
+    "BatchTopKTrainingSAEConfig",
 ]
sae_lens/saes/batchtopk_sae.py ADDED
@@ -0,0 +1,102 @@
+from dataclasses import dataclass
+from typing import Any, Callable
+
+import torch
+import torch.nn as nn
+from typing_extensions import override
+
+from sae_lens.saes.jumprelu_sae import JumpReLUSAEConfig
+from sae_lens.saes.sae import SAEConfig, TrainStepInput, TrainStepOutput
+from sae_lens.saes.topk_sae import TopKTrainingSAE, TopKTrainingSAEConfig
+
+
+class BatchTopK(nn.Module):
+    """BatchTopK activation function"""
+
+    def __init__(
+        self,
+        k: int,
+    ):
+        super().__init__()
+        self.k = k
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        acts = x.relu()
+        flat_acts = acts.flatten()
+        acts_topk_flat = torch.topk(flat_acts, self.k * acts.shape[0], dim=-1)
+        return (
+            torch.zeros_like(flat_acts)
+            .scatter(-1, acts_topk_flat.indices, acts_topk_flat.values)
+            .reshape(acts.shape)
+        )
+
+
+@dataclass
+class BatchTopKTrainingSAEConfig(TopKTrainingSAEConfig):
+    """
+    Configuration class for training a BatchTopKTrainingSAE.
+    """
+
+    topk_threshold_lr: float = 0.01
+
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "batchtopk"
+
+    @override
+    def get_inference_config_class(self) -> type[SAEConfig]:
+        return JumpReLUSAEConfig
+
+
+class BatchTopKTrainingSAE(TopKTrainingSAE):
+    """
+    Global Batch TopK Training SAE
+
+    This SAE will maintain the k on average across the batch, rather than enforcing the k per-sample as in standard TopK.
+
+    BatchTopK SAEs are saved as JumpReLU SAEs after training.
+    """
+
+    topk_threshold: torch.Tensor
+    cfg: BatchTopKTrainingSAEConfig  # type: ignore[assignment]
+
+    def __init__(self, cfg: BatchTopKTrainingSAEConfig, use_error_term: bool = False):
+        super().__init__(cfg, use_error_term)
+
+        self.register_buffer(
+            "topk_threshold",
+            # use double precision as otherwise we can run into numerical issues
+            torch.tensor(0.0, dtype=torch.double, device=self.W_dec.device),
+        )
+
+    def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
+        return BatchTopK(self.cfg.k)
+
+    @override
+    def training_forward_pass(self, step_input: TrainStepInput) -> TrainStepOutput:
+        output = super().training_forward_pass(step_input)
+        self.update_topk_threshold(output.feature_acts)
+        output.metrics["topk_threshold"] = self.topk_threshold
+        return output
+
+    @torch.no_grad()
+    def update_topk_threshold(self, acts_topk: torch.Tensor) -> None:
+        positive_mask = acts_topk > 0
+        lr = self.cfg.topk_threshold_lr
+        # autocast can cause numerical issues with the threshold update
+        with torch.autocast(self.topk_threshold.device.type, enabled=False):
+            if positive_mask.any():
+                min_positive = (
+                    acts_topk[positive_mask].min().to(self.topk_threshold.dtype)
+                )
+                self.topk_threshold = (1 - lr) * self.topk_threshold + lr * min_positive
+
+    @override
+    def process_state_dict_for_saving_inference(
+        self, state_dict: dict[str, Any]
+    ) -> None:
+        super().process_state_dict_for_saving_inference(state_dict)
+        # turn the topk threshold into jumprelu threshold
+        topk_threshold = state_dict.pop("topk_threshold").item()
+        state_dict["threshold"] = torch.ones_like(self.b_enc) * topk_threshold
sae_lens/saes/gated_sae.py CHANGED
@@ -15,7 +15,6 @@ from sae_lens.saes.sae import (
     TrainingSAEConfig,
     TrainStepInput,
 )
-from sae_lens.util import filter_valid_dataclass_fields
 
 
 @dataclass
@@ -100,16 +99,10 @@ class GatedSAE(SAE[GatedSAEConfig]):
         self.W_enc.data = self.W_enc.data * W_dec_norms.T
 
         # Gated-specific parameters need special handling
-        self.r_mag.data = self.r_mag.data * W_dec_norms.squeeze()
+        # r_mag doesn't need scaling since W_enc scaling is sufficient for magnitude path
         self.b_gate.data = self.b_gate.data * W_dec_norms.squeeze()
         self.b_mag.data = self.b_mag.data * W_dec_norms.squeeze()
 
-    @torch.no_grad()
-    def initialize_decoder_norm_constant_norm(self, norm: float = 0.1):
-        """Initialize decoder with constant norm."""
-        self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
-        self.W_dec.data *= norm
-
 
 @dataclass
 class GatedTrainingSAEConfig(TrainingSAEConfig):
@@ -133,7 +126,7 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
     - initialize_weights: sets up gating parameters (as in GatedSAE) plus optional training-specific init.
     - encode: calls encode_with_hidden_pre (standard training approach).
     - decode: linear transformation + hooking, same as GatedSAE or StandardTrainingSAE.
-    - encode_with_hidden_pre: gating logic + optional noise injection for training.
+    - encode_with_hidden_pre: gating logic.
     - calculate_aux_loss: includes an auxiliary reconstruction path and gating-based sparsity penalty.
     - training_forward_pass: calls encode_with_hidden_pre, decode, and sums up MSE + gating losses.
     """
@@ -158,7 +151,6 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
     ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
         """
         Gated forward pass with pre-activation (for training).
-        We also inject noise if self.training is True.
         """
         sae_in = self.process_sae_in(x)
 
@@ -219,12 +211,6 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
             "weights/b_mag": b_mag_dist,
         }
 
-    @torch.no_grad()
-    def initialize_decoder_norm_constant_norm(self, norm: float = 0.1):
-        """Initialize decoder with constant norm"""
-        self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
-        self.W_dec.data *= norm
-
     def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
         return {
             "l1": TrainCoefficientConfig(
@@ -233,10 +219,17 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
             ),
         }
 
-    def to_inference_config_dict(self) -> dict[str, Any]:
-        return filter_valid_dataclass_fields(
-            self.cfg.to_dict(), GatedSAEConfig, ["architecture"]
-        )
+    @torch.no_grad()
+    def fold_W_dec_norm(self):
+        """Override to handle gated-specific parameters."""
+        W_dec_norms = self.W_dec.norm(dim=-1).unsqueeze(1)
+        self.W_dec.data = self.W_dec.data / W_dec_norms
+        self.W_enc.data = self.W_enc.data * W_dec_norms.T
+
+        # Gated-specific parameters need special handling
+        # r_mag doesn't need scaling since W_enc scaling is sufficient for magnitude path
+        self.b_gate.data = self.b_gate.data * W_dec_norms.squeeze()
+        self.b_mag.data = self.b_mag.data * W_dec_norms.squeeze()
 
 
 def _init_weights_gated(
sae_lens/saes/jumprelu_sae.py CHANGED
@@ -14,9 +14,7 @@ from sae_lens.saes.sae import (
     TrainingSAE,
     TrainingSAEConfig,
     TrainStepInput,
-    TrainStepOutput,
 )
-from sae_lens.util import filter_valid_dataclass_fields
 
 
 def rectangle(x: torch.Tensor) -> torch.Tensor:
@@ -208,12 +206,11 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
 
     Similar to the inference-only JumpReLUSAE, but with:
     - A learnable log-threshold parameter (instead of a raw threshold).
-    - Forward passes that add noise during training, if configured.
    - A specialized auxiliary loss term for sparsity (L0 or similar).
 
     Methods of interest include:
     - initialize_weights: sets up W_enc, b_enc, W_dec, b_dec, and log_threshold.
-    - encode_with_hidden_pre_jumprelu: runs a forward pass for training, optionally adding noise.
+    - encode_with_hidden_pre_jumprelu: runs a forward pass for training.
     - training_forward_pass: calculates MSE and auxiliary losses, returning a TrainStepOutput.
     """
 
@@ -300,34 +297,6 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         # Fix: Use squeeze() instead of squeeze(-1) to match old behavior
         self.log_threshold.data = torch.log(current_thresh * W_dec_norms.squeeze())
 
-    def _create_train_step_output(
-        self,
-        sae_in: torch.Tensor,
-        sae_out: torch.Tensor,
-        feature_acts: torch.Tensor,
-        hidden_pre: torch.Tensor,
-        loss: torch.Tensor,
-        losses: dict[str, torch.Tensor],
-    ) -> TrainStepOutput:
-        """
-        Helper to produce a TrainStepOutput from the trainer.
-        The old code expects a method named _create_train_step_output().
-        """
-        return TrainStepOutput(
-            sae_in=sae_in,
-            sae_out=sae_out,
-            feature_acts=feature_acts,
-            hidden_pre=hidden_pre,
-            loss=loss,
-            losses=losses,
-        )
-
-    @torch.no_grad()
-    def initialize_decoder_norm_constant_norm(self, norm: float = 0.1):
-        """Initialize decoder with constant norm"""
-        self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)
-        self.W_dec.data *= norm
-
     def process_state_dict_for_saving(self, state_dict: dict[str, Any]) -> None:
         """Convert log_threshold to threshold for saving"""
         if "log_threshold" in state_dict:
@@ -341,8 +310,3 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         threshold = state_dict["threshold"]
         del state_dict["threshold"]
         state_dict["log_threshold"] = torch.log(threshold).detach().contiguous()
-
-    def to_inference_config_dict(self) -> dict[str, Any]:
-        return filter_valid_dataclass_fields(
-            self.cfg.to_dict(), JumpReLUSAEConfig, ["architecture"]
-        )
sae_lens/saes/sae.py CHANGED
@@ -27,7 +27,7 @@ from torch import nn
 from transformer_lens.hook_points import HookedRootModule, HookPoint
 from typing_extensions import deprecated, overload, override
 
-from sae_lens import __version__, logger
+from sae_lens import __version__
 from sae_lens.constants import (
     DTYPE_MAP,
     SAE_CFG_FILENAME,
@@ -207,6 +207,8 @@ class TrainStepOutput:
     hidden_pre: torch.Tensor
     loss: torch.Tensor  # we need to call backwards on this
     losses: dict[str, torch.Tensor]
+    # any extra metrics to log can be added here
+    metrics: dict[str, torch.Tensor | float | int] = field(default_factory=dict)
 
 
 @dataclass
@@ -528,28 +530,6 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
 
         return model_weights_path, cfg_path
 
-    ## Initialization Methods
-    @torch.no_grad()
-    def initialize_b_dec_with_precalculated(self, origin: torch.Tensor):
-        out = torch.tensor(origin, dtype=self.dtype, device=self.device)
-        self.b_dec.data = out
-
-    @torch.no_grad()
-    def initialize_b_dec_with_mean(self, all_activations: torch.Tensor):
-        previous_b_dec = self.b_dec.clone().cpu()
-        out = all_activations.mean(dim=0)
-
-        previous_distances = torch.norm(all_activations - previous_b_dec, dim=-1)
-        distances = torch.norm(all_activations - out, dim=-1)
-
-        logger.info("Reinitializing b_dec with mean of activations")
-        logger.debug(
-            f"Previous distances: {previous_distances.median(0).values.mean().item()}"
-        )
-        logger.debug(f"New distances: {distances.median(0).values.mean().item()}")
-
-        self.b_dec.data = out.to(self.dtype).to(self.device)
-
     # Class methods for loading models
     @classmethod
     @deprecated("Use load_from_disk instead")
@@ -847,20 +827,26 @@ class TrainingSAEConfig(SAEConfig, ABC):
             "architecture": self.architecture(),
         }
 
+    def get_inference_config_class(self) -> type[SAEConfig]:
+        """
+        Get the architecture for inference.
+        """
+        return get_sae_class(self.architecture())[1]
+
     # this needs to exist so we can initialize the parent sae cfg without the training specific
     # parameters. Maybe there's a cleaner way to do this
-    def get_base_sae_cfg_dict(self) -> dict[str, Any]:
+    def get_inference_sae_cfg_dict(self) -> dict[str, Any]:
         """
         Creates a dictionary containing attributes corresponding to the fields
         defined in the base SAEConfig class.
         """
-        base_sae_cfg_class = get_sae_class(self.architecture())[1]
+        base_sae_cfg_class = self.get_inference_config_class()
         base_config_field_names = {f.name for f in fields(base_sae_cfg_class)}
         result_dict = {
             field_name: getattr(self, field_name)
             for field_name in base_config_field_names
         }
-        result_dict["architecture"] = self.architecture()
+        result_dict["architecture"] = base_sae_cfg_class.architecture()
         result_dict["metadata"] = self.metadata.to_dict()
         return result_dict
 
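
For context (not part of the diff): the get_inference_config_class hook lets a training config target a different architecture when saved for inference; BatchTopKTrainingSAEConfig overrides it to return JumpReLUSAEConfig, so the saved cfg.json reports architecture "jumprelu". A hedged sketch of the expected mapping, assuming the config can be constructed with illustrative sizes and sensible defaults elsewhere:

    from sae_lens import BatchTopKTrainingSAEConfig
    from sae_lens.saes.jumprelu_sae import JumpReLUSAEConfig

    cfg = BatchTopKTrainingSAEConfig(d_in=768, d_sae=16384, k=64)  # illustrative sizes
    assert cfg.architecture() == "batchtopk"
    assert cfg.get_inference_config_class() is JumpReLUSAEConfig
    assert cfg.get_inference_sae_cfg_dict()["architecture"] == "jumprelu"
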
@@ -988,18 +974,13 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
         save_file(state_dict, model_weights_path)
 
         # Save the config
-        config = self.to_inference_config_dict()
+        config = self.cfg.get_inference_sae_cfg_dict()
         cfg_path = path / SAE_CFG_FILENAME
         with open(cfg_path, "w") as f:
             json.dump(config, f)
 
         return model_weights_path, cfg_path
 
-    @abstractmethod
-    def to_inference_config_dict(self) -> dict[str, Any]:
-        """Convert the config into an inference SAE config dict."""
-        ...
-
     def process_state_dict_for_saving_inference(
         self, state_dict: dict[str, Any]
     ) -> None:
@@ -1009,23 +990,6 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
         """
         return self.process_state_dict_for_saving(state_dict)
 
-    @torch.no_grad()
-    def remove_gradient_parallel_to_decoder_directions(self) -> None:
-        """Remove gradient components parallel to decoder directions."""
-        # Implement the original logic since this may not be in the base class
-        assert self.W_dec.grad is not None
-
-        parallel_component = einops.einsum(
-            self.W_dec.grad,
-            self.W_dec.data,
-            "d_sae d_in, d_sae d_in -> d_sae",
-        )
-        self.W_dec.grad -= einops.einsum(
-            parallel_component,
-            self.W_dec.data,
-            "d_sae, d_sae d_in -> d_sae d_in",
-        )
-
     @torch.no_grad()
     def log_histograms(self) -> dict[str, NDArray[Any]]:
         """Log histograms of the weights and biases."""
sae_lens/saes/standard_sae.py CHANGED
@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from typing import Any
 
 import numpy as np
 import torch
@@ -16,7 +15,6 @@ from sae_lens.saes.sae import (
     TrainingSAEConfig,
     TrainStepInput,
 )
-from sae_lens.util import filter_valid_dataclass_fields
 
 
 @dataclass
@@ -61,7 +59,6 @@ class StandardSAE(SAE[StandardSAEConfig]):
     ) -> Float[torch.Tensor, "... d_sae"]:
         """
         Encode the input tensor into the feature space.
-        For inference, no noise is added.
         """
         # Preprocess the SAE input (casting type, applying hooks, normalization)
         sae_in = self.process_sae_in(x)
@@ -110,7 +107,7 @@ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
     - initialize_weights: basic weight initialization for encoder/decoder.
     - encode: inference encoding (invokes encode_with_hidden_pre).
     - decode: a simple linear decoder.
-    - encode_with_hidden_pre: computes pre-activations, adds noise when training, and then activates.
+    - encode_with_hidden_pre: computes activations and pre-activations.
     - calculate_aux_loss: computes a sparsity penalty based on the (optionally scaled) p-norm of feature activations.
     """
 
@@ -164,11 +161,6 @@ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
             "weights/b_e": b_e_dist,
         }
 
-    def to_inference_config_dict(self) -> dict[str, Any]:
-        return filter_valid_dataclass_fields(
-            self.cfg.to_dict(), StandardSAEConfig, ["architecture"]
-        )
-
 
 def _init_weights_standard(
     sae: SAE[StandardSAEConfig] | TrainingSAE[StandardTrainingSAEConfig],
sae_lens/saes/topk_sae.py CHANGED
@@ -1,7 +1,7 @@
 """Inference-only TopKSAE variant, similar in spirit to StandardSAE but using a TopK-based activation."""
 
 from dataclasses import dataclass
-from typing import Any, Callable
+from typing import Callable
 
 import torch
 from jaxtyping import Float
@@ -16,13 +16,12 @@ from sae_lens.saes.sae import (
     TrainingSAEConfig,
     TrainStepInput,
 )
-from sae_lens.util import filter_valid_dataclass_fields
 
 
 class TopK(nn.Module):
     """
     A simple TopK activation that zeroes out all but the top K elements along the last dimension,
-    then optionally applies a post-activation function (e.g., ReLU).
+    and applies ReLU to the top K elements.
     """
 
     b_enc: nn.Parameter
@@ -30,20 +29,18 @@ class TopK(nn.Module):
     def __init__(
         self,
         k: int,
-        postact_fn: Callable[[torch.Tensor], torch.Tensor] = nn.ReLU(),
     ):
         super().__init__()
         self.k = k
-        self.postact_fn = postact_fn
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         1) Select top K elements along the last dimension.
-        2) Apply post-activation (often ReLU).
+        2) Apply ReLU.
         3) Zero out all other entries.
         """
         topk = torch.topk(x, k=self.k, dim=-1)
-        values = self.postact_fn(topk.values)
+        values = topk.values.relu()
         result = torch.zeros_like(x)
         result.scatter_(-1, topk.indices, values)
         return result
@@ -130,6 +127,7 @@ class TopKTrainingSAEConfig(TrainingSAEConfig):
     """
 
     k: int = 100
+    aux_loss_coefficient: float = 1.0
 
     @override
     @classmethod
@@ -139,8 +137,7 @@ class TopKTrainingSAEConfig(TrainingSAEConfig):
 
 class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
     """
-    TopK variant with training functionality. Injects noise during training, optionally
-    calculates a topk-related auxiliary loss, etc.
+    TopK variant with training functionality. Calculates a topk-related auxiliary loss, etc.
     """
 
     b_enc: nn.Parameter
@@ -157,7 +154,7 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
         self, x: Float[torch.Tensor, "... d_in"]
     ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
         """
-        Similar to the base training method: cast input, optionally add noise, then apply TopK.
+        Similar to the base training method: calculate pre-activations, then apply TopK.
         """
         sae_in = self.process_sae_in(x)
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
@@ -235,45 +232,7 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
         # top k living latents
         recons = self.decode(auxk_acts)
         auxk_loss = (recons - residual).pow(2).sum(dim=-1).mean()
-        return scale * auxk_loss
-
-    def _calculate_topk_aux_acts(
-        self,
-        k_aux: int,
-        hidden_pre: torch.Tensor,
-        dead_neuron_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Helper method to calculate activations for the auxiliary loss.
-
-        Args:
-            k_aux: Number of top dead neurons to select
-            hidden_pre: Pre-activation values from encoder
-            dead_neuron_mask: Boolean mask indicating which neurons are dead
-
-        Returns:
-            Tensor with activations for only the top-k dead neurons, zeros elsewhere
-        """
-        # Don't include living latents in this loss (set them to -inf so they won't be selected)
-        auxk_latents = torch.where(
-            dead_neuron_mask[None],
-            hidden_pre,
-            torch.tensor(-float("inf"), device=hidden_pre.device),
-        )
-
-        # Find topk values among dead neurons
-        auxk_topk = auxk_latents.topk(k_aux, dim=-1, sorted=False)
-
-        # Create a tensor of zeros, then place the topk values at their proper indices
-        auxk_acts = torch.zeros_like(hidden_pre)
-        auxk_acts.scatter_(-1, auxk_topk.indices, auxk_topk.values)
-
-        return auxk_acts
-
-    def to_inference_config_dict(self) -> dict[str, Any]:
-        return filter_valid_dataclass_fields(
-            self.cfg.to_dict(), TopKSAEConfig, ["architecture"]
-        )
+        return self.cfg.aux_loss_coefficient * scale * auxk_loss
 
 
 def _calculate_topk_aux_acts(
@@ -281,6 +240,18 @@ def _calculate_topk_aux_acts(
     hidden_pre: torch.Tensor,
     dead_neuron_mask: torch.Tensor,
 ) -> torch.Tensor:
+    """
+    Helper method to calculate activations for the auxiliary loss.
+
+    Args:
+        k_aux: Number of top dead neurons to select
+        hidden_pre: Pre-activation values from encoder
+        dead_neuron_mask: Boolean mask indicating which neurons are dead
+
+    Returns:
+        Tensor with activations for only the top-k dead neurons, zeros elsewhere
+    """
+
     # Don't include living latents in this loss
     auxk_latents = torch.where(dead_neuron_mask[None], hidden_pre, -torch.inf)
     # Top-k dead latents
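
For context (not part of the diff): a minimal numeric sketch of what the module-level _calculate_topk_aux_acts computes, with made-up values. Living latents are masked to -inf so only dead latents compete for the top-k_aux slots:

    import torch

    hidden_pre = torch.tensor([[0.9, 0.4, 0.8, 0.7]])            # 1 sample, 4 latents
    dead_neuron_mask = torch.tensor([False, True, False, True])  # latents 1 and 3 are dead
    k_aux = 1
    auxk_latents = torch.where(dead_neuron_mask[None], hidden_pre, -torch.inf)
    auxk_topk = auxk_latents.topk(k_aux, dim=-1, sorted=False)
    auxk_acts = torch.zeros_like(hidden_pre).scatter_(-1, auxk_topk.indices, auxk_topk.values)
    # auxk_acts: [[0.0, 0.0, 0.0, 0.7]] -- only the strongest dead latent survives
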
sae_lens/training/activations_store.py CHANGED
@@ -7,7 +7,6 @@ from collections.abc import Generator, Iterator, Sequence
 from typing import Any, Literal, cast
 
 import datasets
-import numpy as np
 import torch
 from datasets import Dataset, DatasetDict, IterableDataset, load_dataset
 from huggingface_hub import hf_hub_download
@@ -420,20 +419,6 @@ class ActivationsStore:
 
         return activations_dataset
 
-    @torch.no_grad()
-    def estimate_norm_scaling_factor(self, n_batches_for_norm_estimate: int = int(1e3)):
-        norms_per_batch = []
-        for _ in tqdm(
-            range(n_batches_for_norm_estimate), desc="Estimating norm scaling factor"
-        ):
-            # temporalily set estimated_norm_scaling_factor to 1.0 so the dataloader works
-            self.estimated_norm_scaling_factor = 1.0
-            acts = self.next_batch()[:, 0]
-            self.estimated_norm_scaling_factor = None
-            norms_per_batch.append(acts.norm(dim=-1).mean().item())
-        mean_norm = np.mean(norms_per_batch)
-        return np.sqrt(self.d_in) / mean_norm
-
     def shuffle_input_dataset(self, seed: int, buffer_size: int = 1):
         """
         This applies a shuffle to the huggingface dataset that is the input to the activations store. This
sae_lens/training/optim.py CHANGED
@@ -2,8 +2,6 @@
 Took the LR scheduler from my previous work: https://github.com/jbloomAus/DecisionTransformerInterpretability/blob/ee55df35cdb92e81d689c72fb9dd5a7252893363/src/decision_transformer/utils.py#L425
 """
 
-from typing import Any
-
 import torch.optim as optim
 import torch.optim.lr_scheduler as lr_scheduler
 
@@ -152,34 +150,3 @@ class CoefficientScheduler:
     def value(self) -> float:
         """Returns the current scalar value."""
         return self.current_value
-
-    def state_dict(self) -> dict[str, Any]:
-        """State dict for serialization."""
-        return {
-            "warm_up_steps": self.warm_up_steps,
-            "final_value": self.final_value,
-            "current_step": self.current_step,
-            "current_value": self.current_value,
-        }
-
-    def load_state_dict(self, state_dict: dict[str, Any]):
-        """Loads the scheduler state."""
-        self.warm_up_steps = state_dict["warm_up_steps"]
-        self.final_value = state_dict["final_value"]
-        self.current_step = state_dict["current_step"]
-        # Maintain consistency: re-calculate current_value based on loaded step
-        # This handles resuming correctly if stopped mid-warmup.
-        if self.current_step <= self.warm_up_steps and self.warm_up_steps > 0:
-            # Use max(0, ...) to handle case where current_step might be loaded as -1 or similar before first step
-            step_for_calc = max(0, self.current_step)
-            # Recalculate based on the step *before* the one about to be taken
-            # Or simply use the saved current_value if available and consistent
-            if "current_value" in state_dict:
-                self.current_value = state_dict["current_value"]
-            else:  # Legacy state dicts might not have current_value
-                self.current_value = self.final_value * (
-                    step_for_calc / self.warm_up_steps
-                )
-
-        else:
-            self.current_value = self.final_value
sae_lens/training/sae_trainer.py CHANGED
@@ -349,8 +349,10 @@ class SAETrainer(Generic[T_TRAINING_SAE, T_TRAINING_SAE_CONFIG]):
             },
         }
         for loss_name, loss_value in output.losses.items():
-            loss_item = _unwrap_item(loss_value)
-            log_dict[f"losses/{loss_name}"] = loss_item
+            log_dict[f"losses/{loss_name}"] = _unwrap_item(loss_value)
+
+        for metric_name, metric_value in output.metrics.items():
+            log_dict[f"metrics/{metric_name}"] = _unwrap_item(metric_value)
 
         return log_dict
 
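
For context (not part of the diff): together with the new TrainStepOutput.metrics field in sae.py, this gives SAE subclasses a generic way to surface extra logging values; BatchTopKTrainingSAE uses it for its topk_threshold. A minimal sketch of the convention, with placeholder tensors:

    import torch
    from sae_lens.saes.sae import TrainStepOutput

    zeros = torch.zeros(1, 4)  # placeholder shapes for illustration
    output = TrainStepOutput(
        sae_in=zeros, sae_out=zeros, feature_acts=zeros, hidden_pre=zeros,
        loss=torch.tensor(0.0), losses={"mse_loss": torch.tensor(0.0)},
    )
    output.metrics["topk_threshold"] = 0.05  # trainer logs this as "metrics/topk_threshold"
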
{sae_lens-6.0.0rc5.dist-info → sae_lens-6.2.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: sae-lens
-Version: 6.0.0rc5
+Version: 6.2.0
 Summary: Training and Analyzing Sparse Autoencoders (SAEs)
 License: MIT
 Keywords: deep-learning,sparse-autoencoders,mechanistic-interpretability,PyTorch
@@ -68,6 +68,10 @@ This library is maintained by [Joseph Bloom](https://www.jbloomaus.com/), [Curt
 
 Pre-trained SAEs for various models can be imported via SAE Lens. See this [page](https://jbloomaus.github.io/SAELens/sae_table/) in the readme for a list of all SAEs.
 
+## Migrating to SAELens v6
+
+The new v6 update is a major refactor to SAELens and changes the way training code is structured. Check out the [migration guide](https://jbloomaus.github.io/SAELens/latest/migrating/) for more details.
+
 ## Tutorials
 
 - [SAE Lens + Neuronpedia](tutorials/tutorial_2_0.ipynb)[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/jbloomAus/SAELens/blob/main/tutorials/tutorial_2_0.ipynb)
{sae_lens-6.0.0rc5.dist-info → sae_lens-6.2.0.dist-info}/RECORD RENAMED
@@ -1,9 +1,9 @@
-sae_lens/__init__.py,sha256=hiHDLT9_1V7iVulw5hwqDqDj2HVxUR9I88xOfYx6X94,2861
+sae_lens/__init__.py,sha256=ByxdNdLeg_pvK89IX1lHa6iHgs2ab-UulX55Y0hUhY4,3073
 sae_lens/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sae_lens/analysis/hooked_sae_transformer.py,sha256=Eyg1Y2hVIHNuiiLOCTgzstOuW6iA-7hPHqaGR8y_vMs,13809
 sae_lens/analysis/neuronpedia_integration.py,sha256=MrENqc81Mc2SMbxGjbwHzpkGUCAFKSf0i4EdaUF2Oj4,18707
 sae_lens/cache_activations_runner.py,sha256=L5hhuU2-zPQr2S3L64GMKKLeMQfqXxwDl8NbuOtrybI,12567
-sae_lens/config.py,sha256=9Lg4HkQvj1t9QZJdmC071lyJMc_iqNQknosT7zOYfwM,27278
+sae_lens/config.py,sha256=qMMx9KuiXTD5lG3g0VzaekWOnvdAzGFSq8j1n-GObEQ,26467
 sae_lens/constants.py,sha256=CSjmiZ-bhjQeVLyRvWxAjBokCgkfM8mnvd7-vxLIWTY,639
 sae_lens/evals.py,sha256=kQyrzczKaVD9rHwfFa_DxL_gMXDxsoIVHmsFIPIU2bY,38696
 sae_lens/llm_sae_training_runner.py,sha256=58XbDylw2fPOD7C-ZfSAjeNqJLXB05uHGTuiYVVbXXY,13354
@@ -14,24 +14,25 @@ sae_lens/loading/pretrained_saes_directory.py,sha256=4Vn-Jex6SveD7EbxcSOBv8cx1gk
 sae_lens/pretokenize_runner.py,sha256=0nHQq3s_d80VS8iVK4-e6y_orAYVO8c4RrLGtIDfK_E,6885
 sae_lens/pretrained_saes.yaml,sha256=nhHW1auhyi4GHYrjUnHQqbNVhI5cMJv-HThzbzU1xG0,574145
 sae_lens/registry.py,sha256=nhy7BPSudSATqW4lo9H_k3Na7sfGHmAf9v-3wpnLL_o,1490
-sae_lens/saes/__init__.py,sha256=v6mfeDzyGYtT6x5SszAQtkldTXwPE-V_iwOlrT_pDwQ,1008
-sae_lens/saes/gated_sae.py,sha256=0zd66bH04nsaGk3bxHk10hsZofa2GrFbMo15LOsuqgU,9233
-sae_lens/saes/jumprelu_sae.py,sha256=iwmPQJ4XpIxzgosty680u8Zj7x1uVZhM75kPOT3obi0,12060
-sae_lens/saes/sae.py,sha256=ZEXEXFVtrtFrzuOV3nyweTBleNCV4EDGh1ImaF32uqg,39618
-sae_lens/saes/standard_sae.py,sha256=PfkGLsw_6La3PXHOQL0u7qQsaZsXCJqYCeCcRDj5n64,6274
-sae_lens/saes/topk_sae.py,sha256=kmry1FE1H06OvCfn84V-j2JfWGKcU5b2urwAq_Oq5j4,9893
+sae_lens/saes/__init__.py,sha256=RYqE1qkMws-kwQLmBZFhA_VCa69zVtBjGPIy_UAk2pw,1159
+sae_lens/saes/batchtopk_sae.py,sha256=CyaFG2hMyyDaEaXXrAMJC8wQDW1JoddTKF5mvxxBQKY,3395
+sae_lens/saes/gated_sae.py,sha256=qcmM9JwBA8aZR8z_IRHV1_gQX-q_63tKewWXRnhdXuo,8986
+sae_lens/saes/jumprelu_sae.py,sha256=3xkhBcCol2mEpIBLceymCpudocm2ypOjTeTXbpiXoA4,10794
+sae_lens/saes/sae.py,sha256=McpF4pTh70r6SQUbHFm0YQ9X2c2qPULBUSd_YmnEk4Y,38284
+sae_lens/saes/standard_sae.py,sha256=9UqYyYtQuThYxXKNaDjYcyowpOx2-7cShG-TeUP6JCQ,5940
+sae_lens/saes/topk_sae.py,sha256=CXMBI6CFvI5829bOhoQ350VXR9d8uFHUDlULTIWHXoU,8686
 sae_lens/tokenization_and_batching.py,sha256=oUAscjy_LPOrOb8_Ty6eLAcZ0B3HB_wiWjWktgolhG0,4314
 sae_lens/training/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sae_lens/training/activation_scaler.py,sha256=seEE-2Qd2JMHxqgnsNWPt-DGtYGZxWPnOwCGuVNSOtI,1719
-sae_lens/training/activations_store.py,sha256=z8erbiB6ODbsqlu-bwEWbyj4XZvgsVgjCRBuQovqp2Q,32612
+sae_lens/training/activations_store.py,sha256=HBN3oEib3PlPUDJb_yVFabQp0JcN9rWbnUN1s2DBMAs,31933
 sae_lens/training/mixing_buffer.py,sha256=vDpYG5ZE70szDvBsRKcNHEES3h_WTKJ16qDYk5jPOVA,2015
-sae_lens/training/optim.py,sha256=KXdOym-Ly3f2aFbndRc0JEH0Wa7u1BE5ljxGN3YtouQ,6836
-sae_lens/training/sae_trainer.py,sha256=9K0VudwSTJp9OlCVzaU_ngZ0WlYNrN6-ozTCCAxR9_k,15421
+sae_lens/training/optim.py,sha256=TiI9nbffzXNsI8WjcIsqa2uheW6suxqL_KDDmWXobWI,5312
+sae_lens/training/sae_trainer.py,sha256=2xcO-02OozFunob5vwoHud-hVMhVl9d28_F9gDCiL6o,15529
 sae_lens/training/types.py,sha256=qSjmGzXf3MLalygG0psnVjmhX_mpLmL47MQtZfe7qxg,81
 sae_lens/training/upload_saes_to_huggingface.py,sha256=r_WzI1zLtGZ5TzAxuG3xa_8T09j3zXJrWd_vzPsPGkQ,4469
 sae_lens/tutorial/tsea.py,sha256=fd1am_XXsf2KMbByDapJo-2qlxduKaa62Z2qcQZ3QKU,18145
 sae_lens/util.py,sha256=mCwLAilGMVo8Scm7CIsCafU7GsfmBvCcjwmloI4Ly7Y,1718
-sae_lens-6.0.0rc5.dist-info/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
-sae_lens-6.0.0rc5.dist-info/METADATA,sha256=ZrBaBFeIuM-ZJ9r0HHKakxnx3tGv7Zf6l_Z2OIdBxIU,5326
-sae_lens-6.0.0rc5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-sae_lens-6.0.0rc5.dist-info/RECORD,,
+sae_lens-6.2.0.dist-info/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
+sae_lens-6.2.0.dist-info/METADATA,sha256=Fqsq0scF5Uia0YBmeZQwVi4m4DX16_Ck-cKokbuch7U,5555
+sae_lens-6.2.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+sae_lens-6.2.0.dist-info/RECORD,,