PyPI - sae-lens - Versions diffs - 6.15.0__py3-none-any.whl → 6.24.1__py3-none-any.whl - Mend

sae-lens 6.15.0__py3-none-any.whl → 6.24.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

sae_lens/__init__.py +13 -1
sae_lens/analysis/hooked_sae_transformer.py +4 -13
sae_lens/cache_activations_runner.py +3 -4
sae_lens/config.py +39 -2
sae_lens/constants.py +1 -0
sae_lens/llm_sae_training_runner.py +9 -4
sae_lens/loading/pretrained_sae_loaders.py +430 -24
sae_lens/loading/pretrained_saes_directory.py +5 -3
sae_lens/pretokenize_runner.py +3 -3
sae_lens/pretrained_saes.yaml +26977 -65
sae_lens/saes/__init__.py +7 -0
sae_lens/saes/batchtopk_sae.py +3 -1
sae_lens/saes/gated_sae.py +6 -11
sae_lens/saes/jumprelu_sae.py +8 -13
sae_lens/saes/matryoshka_batchtopk_sae.py +8 -15
sae_lens/saes/sae.py +20 -32
sae_lens/saes/standard_sae.py +4 -9
sae_lens/saes/temporal_sae.py +365 -0
sae_lens/saes/topk_sae.py +8 -11
sae_lens/saes/transcoder.py +41 -0
sae_lens/training/activation_scaler.py +7 -0
sae_lens/training/activations_store.py +54 -12
sae_lens/training/optim.py +11 -0
sae_lens/training/sae_trainer.py +50 -11
{sae_lens-6.15.0.dist-info → sae_lens-6.24.1.dist-info}/METADATA +16 -16
sae_lens-6.24.1.dist-info/RECORD +41 -0
sae_lens-6.15.0.dist-info/RECORD +0 -40
{sae_lens-6.15.0.dist-info → sae_lens-6.24.1.dist-info}/WHEEL +0 -0
{sae_lens-6.15.0.dist-info → sae_lens-6.24.1.dist-info}/licenses/LICENSE +0 -0

sae_lens/saes/__init__.py CHANGED Viewed

@@ -25,6 +25,7 @@ from .standard_sae import (
     StandardTrainingSAE,
     StandardTrainingSAEConfig,
 )
+from .temporal_sae import TemporalSAE, TemporalSAEConfig
 from .topk_sae import (
     TopKSAE,
     TopKSAEConfig,
@@ -32,6 +33,8 @@ from .topk_sae import (
     TopKTrainingSAEConfig,
 )
 from .transcoder import (
+    JumpReLUSkipTranscoder,
+    JumpReLUSkipTranscoderConfig,
     JumpReLUTranscoder,
     JumpReLUTranscoderConfig,
     SkipTranscoder,
@@ -69,6 +72,10 @@ __all__ = [
     "SkipTranscoderConfig",
     "JumpReLUTranscoder",
     "JumpReLUTranscoderConfig",
+    "JumpReLUSkipTranscoder",
+    "JumpReLUSkipTranscoderConfig",
     "MatryoshkaBatchTopKTrainingSAE",
     "MatryoshkaBatchTopKTrainingSAEConfig",
+    "TemporalSAE",
+    "TemporalSAEConfig",
 ]

sae_lens/saes/batchtopk_sae.py CHANGED Viewed

@@ -23,7 +23,9 @@ class BatchTopK(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         acts = x.relu()
         flat_acts = acts.flatten()
-        acts_topk_flat = torch.topk(flat_acts, int(self.k * acts.shape[0]), dim=-1)
+        # Calculate total number of samples across all non-feature dimensions
+        num_samples = acts.shape[:-1].numel()
+        acts_topk_flat = torch.topk(flat_acts, int(self.k * num_samples), dim=-1)
         return (
             torch.zeros_like(flat_acts)
             .scatter(-1, acts_topk_flat.indices, acts_topk_flat.values)

sae_lens/saes/gated_sae.py CHANGED Viewed

@@ -2,7 +2,6 @@ from dataclasses import dataclass
 from typing import Any
 import torch
-from jaxtyping import Float
 from numpy.typing import NDArray
 from torch import nn
 from typing_extensions import override
@@ -49,9 +48,7 @@ class GatedSAE(SAE[GatedSAEConfig]):
         super().initialize_weights()
         _init_weights_gated(self)
-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """
         Encode the input tensor into the feature space using a gated encoder.
         This must match the original encode_gated implementation from SAE class.
@@ -72,9 +69,7 @@ class GatedSAE(SAE[GatedSAEConfig]):
         # Combine gating and magnitudes
         return self.hook_sae_acts_post(active_features * feature_magnitudes)
-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """
         Decode the feature activations back into the input space:
           1) Apply optional finetuning scaling.
@@ -94,7 +89,7 @@ class GatedSAE(SAE[GatedSAEConfig]):
     @torch.no_grad()
     def fold_W_dec_norm(self):
         """Override to handle gated-specific parameters."""
-        W_dec_norms = self.W_dec.norm(dim=-1).unsqueeze(1)
+        W_dec_norms = self.W_dec.norm(dim=-1).clamp(min=1e-8).unsqueeze(1)
         self.W_dec.data = self.W_dec.data / W_dec_norms
         self.W_enc.data = self.W_enc.data * W_dec_norms.T
@@ -147,8 +142,8 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
         _init_weights_gated(self)
     def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Gated forward pass with pre-activation (for training).
         """
@@ -222,7 +217,7 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
     @torch.no_grad()
     def fold_W_dec_norm(self):
         """Override to handle gated-specific parameters."""
-        W_dec_norms = self.W_dec.norm(dim=-1).unsqueeze(1)
+        W_dec_norms = self.W_dec.norm(dim=-1).clamp(min=1e-8).unsqueeze(1)
         self.W_dec.data = self.W_dec.data / W_dec_norms
         self.W_enc.data = self.W_enc.data * W_dec_norms.T

sae_lens/saes/jumprelu_sae.py CHANGED Viewed

@@ -3,7 +3,6 @@ from typing import Any, Literal
 import numpy as np
 import torch
-from jaxtyping import Float
 from torch import nn
 from typing_extensions import override
@@ -130,9 +129,7 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
             torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
         )
-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """
         Encode the input tensor into the feature space using JumpReLU.
         The threshold parameter determines which units remain active.
@@ -150,9 +147,7 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
         # 3) Multiply the normally activated units by that mask.
         return self.hook_sae_acts_post(base_acts * jump_relu_mask)
-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """
         Decode the feature activations back to the input space.
         Follows the same steps as StandardSAE: apply scaling, transform, hook, and optionally reshape.
@@ -172,8 +167,8 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
         # Save the current threshold before calling parent method
         current_thresh = self.threshold.clone()
-        # Get W_dec norms that will be used for scaling
-        W_dec_norms = self.W_dec.norm(dim=-1)
+        # Get W_dec norms that will be used for scaling (clamped to avoid division by zero)
+        W_dec_norms = self.W_dec.norm(dim=-1).clamp(min=1e-8)
         # Call parent implementation to handle W_enc, W_dec, and b_enc adjustment
         super().fold_W_dec_norm()
@@ -265,8 +260,8 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         return torch.exp(self.log_threshold)
     def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         sae_in = self.process_sae_in(x)
         hidden_pre = sae_in @ self.W_enc + self.b_enc
@@ -330,8 +325,8 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         # Save the current threshold before we call the parent method
         current_thresh = self.threshold.clone()
-        # Get W_dec norms
-        W_dec_norms = self.W_dec.norm(dim=-1).unsqueeze(1)
+        # Get W_dec norms (clamped to avoid division by zero)
+        W_dec_norms = self.W_dec.norm(dim=-1).clamp(min=1e-8).unsqueeze(1)
         # Call parent implementation to handle W_enc and W_dec adjustment
         super().fold_W_dec_norm()

sae_lens/saes/matryoshka_batchtopk_sae.py CHANGED Viewed

@@ -2,7 +2,6 @@ import warnings
 from dataclasses import dataclass, field
 import torch
-from jaxtyping import Float
 from typing_extensions import override
 from sae_lens.saes.batchtopk_sae import (
@@ -78,14 +77,11 @@ class MatryoshkaBatchTopKTrainingSAE(BatchTopKTrainingSAE):
     @override
     def training_forward_pass(self, step_input: TrainStepInput) -> TrainStepOutput:
         base_output = super().training_forward_pass(step_input)
-        hidden_pre = base_output.hidden_pre
         inv_W_dec_norm = 1 / self.W_dec.norm(dim=-1)
         # the outer matryoshka level is the base SAE, so we don't need to add an extra loss for it
         for width in self.cfg.matryoshka_widths[:-1]:
-            inner_hidden_pre = hidden_pre[:, :width]
-            inner_feat_acts = self.activation_fn(inner_hidden_pre)
             inner_reconstruction = self._decode_matryoshka_level(
-                inner_feat_acts, width, inv_W_dec_norm
+                base_output.feature_acts, width, inv_W_dec_norm
             )
             inner_mse_loss = (
                 self.mse_loss_fn(inner_reconstruction, step_input.sae_in)
@@ -98,23 +94,24 @@ class MatryoshkaBatchTopKTrainingSAE(BatchTopKTrainingSAE):
     def _decode_matryoshka_level(
         self,
-        feature_acts: Float[torch.Tensor, "... d_sae"],
+        feature_acts: torch.Tensor,
         width: int,
         inv_W_dec_norm: torch.Tensor,
-    ) -> Float[torch.Tensor, "... d_in"]:
+    ) -> torch.Tensor:
         """
         Decodes feature activations back into input space for a matryoshka level
         """
+        inner_feature_acts = feature_acts[:, :width]
         # Handle sparse tensors using efficient sparse matrix multiplication
         if self.cfg.rescale_acts_by_decoder_norm:
             # need to multiply by the inverse of the norm because division is illegal with sparse tensors
-            feature_acts = feature_acts * inv_W_dec_norm[:width]
-        if feature_acts.is_sparse:
+            inner_feature_acts = inner_feature_acts * inv_W_dec_norm[:width]
+        if inner_feature_acts.is_sparse:
             sae_out_pre = (
-                _sparse_matmul_nd(feature_acts, self.W_dec[:width]) + self.b_dec
+                _sparse_matmul_nd(inner_feature_acts, self.W_dec[:width]) + self.b_dec
             )
         else:
-            sae_out_pre = feature_acts @ self.W_dec[:width] + self.b_dec
+            sae_out_pre = inner_feature_acts @ self.W_dec[:width] + self.b_dec
         sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
         return self.reshape_fn_out(sae_out_pre, self.d_head)
@@ -137,7 +134,3 @@ def _validate_matryoshka_config(cfg: MatryoshkaBatchTopKTrainingSAEConfig) -> No
         warnings.warn(
             "WARNING: You have only set one matryoshka level. This is equivalent to using a standard BatchTopK SAE and is likely not what you want."
         )
-    if cfg.matryoshka_widths[0] < cfg.k:
-        raise ValueError(
-            "The smallest matryoshka level width cannot be smaller than cfg.k."
-        )

sae_lens/saes/sae.py CHANGED Viewed

@@ -19,9 +19,8 @@ from typing import (
 import einops
 import torch
-from jaxtyping import Float
 from numpy.typing import NDArray
-from safetensors.torch import save_file
+from safetensors.torch import load_file, save_file
 from torch import nn
 from transformer_lens.hook_points import HookedRootModule, HookPoint
 from typing_extensions import deprecated, overload, override
@@ -155,9 +154,9 @@ class SAEConfig(ABC):
     dtype: str = "float32"
     device: str = "cpu"
     apply_b_dec_to_input: bool = True
-    normalize_activations: Literal[
-        "none", "expected_average_only_in", "constant_norm_rescale", "layer_norm"
-    ] = "none"  # none, expected_average_only_in (Anthropic April Update), constant_norm_rescale (Anthropic Feb Update)
+    normalize_activations: Literal["none", "expected_average_only_in", "layer_norm"] = (
+        "none"  # none, expected_average_only_in (Anthropic April Update)
+    )
     reshape_activations: Literal["none", "hook_z"] = "none"
     metadata: SAEMetadata = field(default_factory=SAEMetadata)
@@ -217,6 +216,7 @@ class TrainStepInput:
     sae_in: torch.Tensor
     coefficients: dict[str, float]
     dead_neuron_mask: torch.Tensor | None
+    n_training_steps: int
 class TrainCoefficientConfig(NamedTuple):
@@ -308,6 +308,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
             self.run_time_activation_norm_fn_in = run_time_activation_norm_fn_in
             self.run_time_activation_norm_fn_out = run_time_activation_norm_fn_out
         elif self.cfg.normalize_activations == "layer_norm":
             #  we need to scale the norm of the input and store the scaling factor
             def run_time_activation_ln_in(
@@ -349,16 +350,12 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         self.W_enc = nn.Parameter(w_enc_data)
     @abstractmethod
-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """Encode input tensor to feature space."""
         pass
     @abstractmethod
-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """Decode feature activations back to input space."""
         pass
@@ -448,26 +445,15 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         return super().to(*args, **kwargs)
-    def process_sae_in(
-        self, sae_in: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_in"]:
-        # print(f"Input shape to process_sae_in: {sae_in.shape}")
-        # print(f"self.cfg.hook_name: {self.cfg.hook_name}")
-        # print(f"self.b_dec shape: {self.b_dec.shape}")
-        # print(f"Hook z reshaping mode: {getattr(self, 'hook_z_reshaping_mode', False)}")
+    def process_sae_in(self, sae_in: torch.Tensor) -> torch.Tensor:
         sae_in = sae_in.to(self.dtype)
-        # print(f"Shape before reshape_fn_in: {sae_in.shape}")
         sae_in = self.reshape_fn_in(sae_in)
-        # print(f"Shape after reshape_fn_in: {sae_in.shape}")
         sae_in = self.hook_sae_input(sae_in)
         sae_in = self.run_time_activation_norm_fn_in(sae_in)
         # Here's where the error happens
         bias_term = self.b_dec * self.cfg.apply_b_dec_to_input
-        # print(f"Bias term shape: {bias_term.shape}")
         return sae_in - bias_term
@@ -498,7 +484,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
     @torch.no_grad()
     def fold_W_dec_norm(self):
         """Fold decoder norms into encoder."""
-        W_dec_norms = self.W_dec.norm(dim=-1).unsqueeze(1)
+        W_dec_norms = self.W_dec.norm(dim=-1).clamp(min=1e-8).unsqueeze(1)
         self.W_dec.data = self.W_dec.data / W_dec_norms
         self.W_enc.data = self.W_enc.data * W_dec_norms.T
@@ -866,14 +852,12 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
     @abstractmethod
     def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """Encode with access to pre-activation values for training."""
         ...
-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """
         For inference, just encode without returning hidden_pre.
         (training_forward_pass calls encode_with_hidden_pre).
@@ -881,9 +865,7 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
         feature_acts, _ = self.encode_with_hidden_pre(x)
         return feature_acts
-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """
         Decodes feature activations back into input space,
         applying optional finetuning scale, hooking, out normalization, etc.
@@ -1017,6 +999,12 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
     ) -> type[TrainingSAEConfig]:
         return get_sae_training_class(architecture)[1]
+    def load_weights_from_checkpoint(self, checkpoint_path: Path | str) -> None:
+        checkpoint_path = Path(checkpoint_path)
+        state_dict = load_file(checkpoint_path / SAE_WEIGHTS_FILENAME)
+        self.process_state_dict_for_loading(state_dict)
+        self.load_state_dict(state_dict)
 _blank_hook = nn.Identity()

sae_lens/saes/standard_sae.py CHANGED Viewed

@@ -2,7 +2,6 @@ from dataclasses import dataclass
 import numpy as np
 import torch
-from jaxtyping import Float
 from numpy.typing import NDArray
 from torch import nn
 from typing_extensions import override
@@ -54,9 +53,7 @@ class StandardSAE(SAE[StandardSAEConfig]):
         super().initialize_weights()
         _init_weights_standard(self)
-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """
         Encode the input tensor into the feature space.
         """
@@ -67,9 +64,7 @@ class StandardSAE(SAE[StandardSAEConfig]):
         # Apply the activation function (e.g., ReLU, depending on config)
         return self.hook_sae_acts_post(self.activation_fn(hidden_pre))
-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """
         Decode the feature activations back to the input space.
         Now, if hook_z reshaping is turned on, we reverse the flattening.
@@ -127,8 +122,8 @@ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
         }
     def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # Process the input (including dtype conversion, hook call, and any activation normalization)
         sae_in = self.process_sae_in(x)
         # Compute the pre-activation (and allow for a hook if desired)

sae-lens 6.15.0__py3-none-any.whl → 6.24.1__py3-none-any.whl

sae-lens 6.15.0__py3-none-any.whl → 6.24.1__py3-none-any.whl