sae-lens 6.0.0rc1__py3-none-any.whl → 6.0.0rc3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,13 +1,37 @@
+ from dataclasses import dataclass
+ from typing import Any
+
  import numpy as np
  import torch
  from jaxtyping import Float
  from numpy.typing import NDArray
  from torch import nn
+ from typing_extensions import override
+
+ from sae_lens.saes.sae import (
+     SAE,
+     SAEConfig,
+     TrainCoefficientConfig,
+     TrainingSAE,
+     TrainingSAEConfig,
+     TrainStepInput,
+ )
+ from sae_lens.util import filter_valid_dataclass_fields
+
+
+ @dataclass
+ class StandardSAEConfig(SAEConfig):
+     """
+     Configuration class for a StandardSAE.
+     """

- from sae_lens.saes.sae import SAE, SAEConfig, TrainingSAE, TrainStepInput
+     @override
+     @classmethod
+     def architecture(cls) -> str:
+         return "standard"


- class StandardSAE(SAE):
+ class StandardSAE(SAE[StandardSAEConfig]):
      """
      StandardSAE is an inference-only implementation of a Sparse Autoencoder (SAE)
      using a simple linear encoder and decoder.
@@ -23,31 +47,14 @@ class StandardSAE(SAE):

      b_enc: nn.Parameter

-     def __init__(self, cfg: SAEConfig, use_error_term: bool = False):
+     def __init__(self, cfg: StandardSAEConfig, use_error_term: bool = False):
          super().__init__(cfg, use_error_term)

+     @override
      def initialize_weights(self) -> None:
          # Initialize encoder weights and bias.
-         self.b_enc = nn.Parameter(
-             torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-         )
-         self.b_dec = nn.Parameter(
-             torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
-         )
-
-         # Use Kaiming Uniform for W_enc
-         w_enc_data = torch.empty(
-             self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
-         )
-         nn.init.kaiming_uniform_(w_enc_data)
-         self.W_enc = nn.Parameter(w_enc_data)
-
-         # Use Kaiming Uniform for W_dec
-         w_dec_data = torch.empty(
-             self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-         )
-         nn.init.kaiming_uniform_(w_dec_data)
-         self.W_dec = nn.Parameter(w_dec_data)
+         super().initialize_weights()
+         _init_weights_standard(self)

      def encode(
          self, x: Float[torch.Tensor, "... d_in"]
@@ -70,11 +77,9 @@ class StandardSAE(SAE):
          Decode the feature activations back to the input space.
          Now, if hook_z reshaping is turned on, we reverse the flattening.
          """
-         # 1) apply finetuning scaling if configured.
-         scaled_features = self.apply_finetuning_scaling_factor(feature_acts)
-         # 2) linear transform
-         sae_out_pre = scaled_features @ self.W_dec + self.b_dec
-         # 3) hook reconstruction
+         # 1) linear transform
+         sae_out_pre = feature_acts @ self.W_dec + self.b_dec
+         # 2) hook reconstruction
          sae_out_pre = self.hook_sae_recons(sae_out_pre)
          # 4) optional out-normalization (e.g. constant_norm_rescale or layer_norm)
          sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
@@ -82,7 +87,23 @@ class StandardSAE(SAE):
          return self.reshape_fn_out(sae_out_pre, self.d_head)


- class StandardTrainingSAE(TrainingSAE):
+ @dataclass
+ class StandardTrainingSAEConfig(TrainingSAEConfig):
+     """
+     Configuration class for training a StandardTrainingSAE.
+     """
+
+     l1_coefficient: float = 1.0
+     lp_norm: float = 1.0
+     l1_warm_up_steps: int = 0
+
+     @override
+     @classmethod
+     def architecture(cls) -> str:
+         return "standard"
+
+
+ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
      """
      StandardTrainingSAE is a concrete implementation of BaseTrainingSAE using the "standard" SAE architecture.
      It implements:
@@ -96,31 +117,17 @@ class StandardTrainingSAE(TrainingSAE):
      b_enc: nn.Parameter

      def initialize_weights(self) -> None:
-         # Basic init
-         # In Python MRO, this calls StandardSAE.initialize_weights()
-         StandardSAE.initialize_weights(self)  # type: ignore
-
-         # Complex init logic from original TrainingSAE
-         if self.cfg.decoder_orthogonal_init:
-             self.W_dec.data = nn.init.orthogonal_(self.W_dec.data.T).T
-
-         elif self.cfg.decoder_heuristic_init:
-             self.W_dec.data = torch.rand(  # Changed from Parameter to data assignment
-                 self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-             )
-             self.initialize_decoder_norm_constant_norm()
-
-         if self.cfg.init_encoder_as_decoder_transpose:
-             self.W_enc.data = self.W_dec.data.T.clone().contiguous()  # type: ignore
+         super().initialize_weights()
+         _init_weights_standard(self)

-         if self.cfg.normalize_sae_decoder:
-             with torch.no_grad():
-                 self.set_decoder_norm_to_unit_norm()
-
-     @torch.no_grad()
-     def initialize_decoder_norm_constant_norm(self, norm: float = 0.1):
-         self.W_dec.data /= torch.norm(self.W_dec.data, dim=1, keepdim=True)  # type: ignore
-         self.W_dec.data *= norm
+     @override
+     def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
+         return {
+             "l1": TrainCoefficientConfig(
+                 value=self.cfg.l1_coefficient,
+                 warm_up_steps=self.cfg.l1_warm_up_steps,
+             ),
+         }

      def encode_with_hidden_pre(
          self, x: Float[torch.Tensor, "... d_in"]
@@ -148,13 +155,11 @@ class StandardTrainingSAE(TrainingSAE):
          sae_out: torch.Tensor,
      ) -> dict[str, torch.Tensor]:
          # The "standard" auxiliary loss is a sparsity penalty on the feature activations
-         weighted_feature_acts = feature_acts
-         if self.cfg.scale_sparsity_penalty_by_decoder_norm:
-             weighted_feature_acts = feature_acts * self.W_dec.norm(dim=1)
+         weighted_feature_acts = feature_acts * self.W_dec.norm(dim=1)

          # Compute the p-norm (set by cfg.lp_norm) over the feature dimension
          sparsity = weighted_feature_acts.norm(p=self.cfg.lp_norm, dim=-1)
-         l1_loss = (step_input.current_l1_coefficient * sparsity).mean()
+         l1_loss = (step_input.coefficients["l1"] * sparsity).mean()

          return {"l1_loss": l1_loss}

@@ -165,3 +170,16 @@ class StandardTrainingSAE(TrainingSAE):
              **super().log_histograms(),
              "weights/b_e": b_e_dist,
          }
+
+     def to_inference_config_dict(self) -> dict[str, Any]:
+         return filter_valid_dataclass_fields(
+             self.cfg.to_dict(), StandardSAEConfig, ["architecture"]
+         )
+
+
+ def _init_weights_standard(
+     sae: SAE[StandardSAEConfig] | TrainingSAE[StandardTrainingSAEConfig],
+ ) -> None:
+     sae.b_enc = nn.Parameter(
+         torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+     )
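
Usage sketch (not part of the diff): this release replaces the shared SAEConfig/TrainingSAEConfig with per-architecture config dataclasses. The snippet below shows how the new standard-architecture classes might fit together; the module path sae_lens.saes.standard_sae is assumed (the file header is not shown above), d_in/d_sae are assumed to be SAEConfig constructor fields (they are referenced as self.cfg.d_in and self.cfg.d_sae in the diff), and the numeric values are illustrative.

from sae_lens.saes.standard_sae import (  # assumed module path
    StandardSAE,
    StandardSAEConfig,
    StandardTrainingSAE,
    StandardTrainingSAEConfig,
)

# Illustrative sizes; d_in / d_sae are assumed SAEConfig fields.
inference_cfg = StandardSAEConfig(d_in=768, d_sae=16_384)
sae = StandardSAE(inference_cfg, use_error_term=False)

training_cfg = StandardTrainingSAEConfig(
    d_in=768,
    d_sae=16_384,
    l1_coefficient=5.0,
    lp_norm=1.0,
    l1_warm_up_steps=1_000,
)
training_sae = StandardTrainingSAE(training_cfg)

# The sparsity schedule is now exposed via get_coefficients() and consumed from
# TrainStepInput.coefficients["l1"] instead of the old current_l1_coefficient.
print(training_sae.get_coefficients())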
sae_lens/saes/topk_sae.py CHANGED
@@ -1,18 +1,22 @@
  """Inference-only TopKSAE variant, similar in spirit to StandardSAE but using a TopK-based activation."""

- from typing import Callable
+ from dataclasses import dataclass
+ from typing import Any, Callable

  import torch
  from jaxtyping import Float
  from torch import nn
+ from typing_extensions import override

  from sae_lens.saes.sae import (
      SAE,
      SAEConfig,
+     TrainCoefficientConfig,
      TrainingSAE,
      TrainingSAEConfig,
      TrainStepInput,
  )
+ from sae_lens.util import filter_valid_dataclass_fields


  class TopK(nn.Module):
@@ -45,14 +49,30 @@ class TopK(nn.Module):
          return result


- class TopKSAE(SAE):
+ @dataclass
+ class TopKSAEConfig(SAEConfig):
+     """
+     Configuration class for a TopKSAE.
+     """
+
+     k: int = 100
+
+     @override
+     @classmethod
+     def architecture(cls) -> str:
+         return "topk"
+
+
+ class TopKSAE(SAE[TopKSAEConfig]):
      """
      An inference-only sparse autoencoder using a "topk" activation function.
      It uses linear encoder and decoder layers, applying the TopK activation
      to the hidden pre-activation in its encode step.
      """

-     def __init__(self, cfg: SAEConfig, use_error_term: bool = False):
+     b_enc: nn.Parameter
+
+     def __init__(self, cfg: TopKSAEConfig, use_error_term: bool = False):
          """
          Args:
              cfg: SAEConfig defining model size and behavior.
@@ -60,38 +80,11 @@ class TopKSAE(SAE):
          """
          super().__init__(cfg, use_error_term)

-         if self.cfg.activation_fn != "topk":
-             raise ValueError("TopKSAE must use a TopK activation function.")
-
+     @override
      def initialize_weights(self) -> None:
-         """
-         Initializes weights and biases for encoder/decoder similarly to the standard SAE,
-         that is:
-         - b_enc, b_dec are zero-initialized
-         - W_enc, W_dec are Kaiming Uniform
-         """
-         # encoder bias
-         self.b_enc = nn.Parameter(
-             torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-         )
-         # decoder bias
-         self.b_dec = nn.Parameter(
-             torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
-         )
-
-         # encoder weight
-         w_enc_data = torch.empty(
-             self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
-         )
-         nn.init.kaiming_uniform_(w_enc_data)
-         self.W_enc = nn.Parameter(w_enc_data)
-
-         # decoder weight
-         w_dec_data = torch.empty(
-             self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-         )
-         nn.init.kaiming_uniform_(w_dec_data)
-         self.W_dec = nn.Parameter(w_dec_data)
+         # Initialize encoder weights and bias.
+         super().initialize_weights()
+         _init_weights_topk(self)

      def encode(
          self, x: Float[torch.Tensor, "... d_in"]
@@ -114,28 +107,31 @@ class TopKSAE(SAE):
          Applies optional finetuning scaling, hooking to recons, out normalization,
          and optional head reshaping.
          """
-         scaled_features = self.apply_finetuning_scaling_factor(feature_acts)
-         sae_out_pre = scaled_features @ self.W_dec + self.b_dec
+         sae_out_pre = feature_acts @ self.W_dec + self.b_dec
          sae_out_pre = self.hook_sae_recons(sae_out_pre)
          sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
          return self.reshape_fn_out(sae_out_pre, self.d_head)

-     def _get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
-         if self.cfg.activation_fn == "topk":
-             if "k" not in self.cfg.activation_fn_kwargs:
-                 raise ValueError("TopK activation function requires a k value.")
-             k = self.cfg.activation_fn_kwargs.get(
-                 "k", 1
-             )  # Default k to 1 if not provided
-             postact_fn = self.cfg.activation_fn_kwargs.get(
-                 "postact_fn", nn.ReLU()
-             )  # Default post-activation to ReLU if not provided
-             return TopK(k, postact_fn)
-         # Otherwise, return the "standard" handling from BaseSAE
-         return super()._get_activation_fn()
-
-
- class TopKTrainingSAE(TrainingSAE):
+     @override
+     def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
+         return TopK(self.cfg.k)
+
+
+ @dataclass
+ class TopKTrainingSAEConfig(TrainingSAEConfig):
+     """
+     Configuration class for training a TopKTrainingSAE.
+     """
+
+     k: int = 100
+
+     @override
+     @classmethod
+     def architecture(cls) -> str:
+         return "topk"
+
+
+ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
      """
      TopK variant with training functionality. Injects noise during training, optionally
      calculates a topk-related auxiliary loss, etc.
@@ -143,32 +139,13 @@ class TopKTrainingSAE(TrainingSAE):

      b_enc: nn.Parameter

-     def __init__(self, cfg: TrainingSAEConfig, use_error_term: bool = False):
+     def __init__(self, cfg: TopKTrainingSAEConfig, use_error_term: bool = False):
          super().__init__(cfg, use_error_term)

-         if self.cfg.activation_fn != "topk":
-             raise ValueError("TopKSAE must use a TopK activation function.")
-
+     @override
      def initialize_weights(self) -> None:
-         """Very similar to TopKSAE, using zero biases + Kaiming Uniform weights."""
-         self.b_enc = nn.Parameter(
-             torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
-         )
-         self.b_dec = nn.Parameter(
-             torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
-         )
-
-         w_enc_data = torch.empty(
-             self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
-         )
-         nn.init.kaiming_uniform_(w_enc_data)
-         self.W_enc = nn.Parameter(w_enc_data)
-
-         w_dec_data = torch.empty(
-             self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
-         )
-         nn.init.kaiming_uniform_(w_dec_data)
-         self.W_dec = nn.Parameter(w_dec_data)
+         super().initialize_weights()
+         _init_weights_topk(self)

      def encode_with_hidden_pre(
          self, x: Float[torch.Tensor, "... d_in"]
@@ -207,14 +184,13 @@ class TopKTrainingSAE(TrainingSAE):
          )
          return {"auxiliary_reconstruction_loss": topk_loss}

-     def _get_activation_fn(self):
-         if self.cfg.activation_fn == "topk":
-             if "k" not in self.cfg.activation_fn_kwargs:
-                 raise ValueError("TopK activation function requires a k value.")
-             k = self.cfg.activation_fn_kwargs.get("k", 1)
-             postact_fn = self.cfg.activation_fn_kwargs.get("postact_fn", nn.ReLU())
-             return TopK(k, postact_fn)
-         return super()._get_activation_fn()
+     @override
+     def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
+         return TopK(self.cfg.k)
+
+     @override
+     def get_coefficients(self) -> dict[str, TrainCoefficientConfig | float]:
+         return {}

      def calculate_topk_aux_loss(
          self,
@@ -288,6 +264,11 @@ class TopKTrainingSAE(TrainingSAE):

          return auxk_acts

+     def to_inference_config_dict(self) -> dict[str, Any]:
+         return filter_valid_dataclass_fields(
+             self.cfg.to_dict(), TopKSAEConfig, ["architecture"]
+         )
+


  def _calculate_topk_aux_acts(
      k_aux: int,
@@ -303,3 +284,11 @@ def _calculate_topk_aux_acts(
      auxk_acts.scatter_(-1, auxk_topk.indices, auxk_topk.values)
      # Set activations to zero for all but top k_aux dead latents
      return auxk_acts
+
+
+ def _init_weights_topk(
+     sae: SAE[TopKSAEConfig] | TrainingSAE[TopKTrainingSAEConfig],
+ ) -> None:
+     sae.b_enc = nn.Parameter(
+         torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+     )
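
Usage sketch (not part of the diff): the TopK classes now carry k directly on their own config instead of reading it from activation_fn_kwargs, and TopKTrainingSAE.get_coefficients() returns an empty dict because sparsity is enforced by the TopK activation rather than a penalty term. As in the previous sketch, d_in/d_sae are assumed SAEConfig fields and the values are illustrative.

import torch

from sae_lens.saes.topk_sae import TopKSAE, TopKSAEConfig

# k lives on the config now; d_in / d_sae are assumed SAEConfig fields.
cfg = TopKSAEConfig(d_in=768, d_sae=16_384, k=64)
sae = TopKSAE(cfg)

acts = torch.randn(4, 768)
feature_acts = sae.encode(acts)  # TopK keeps at most k active features per row
recon = sae.decode(feature_acts)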
@@ -0,0 +1,53 @@
+ import json
+ from dataclasses import dataclass
+ from statistics import mean
+
+ import torch
+ from tqdm import tqdm
+
+ from sae_lens.training.types import DataProvider
+
+
+ @dataclass
+ class ActivationScaler:
+     scaling_factor: float | None = None
+
+     def scale(self, acts: torch.Tensor) -> torch.Tensor:
+         return acts if self.scaling_factor is None else acts * self.scaling_factor
+
+     def unscale(self, acts: torch.Tensor) -> torch.Tensor:
+         return acts if self.scaling_factor is None else acts / self.scaling_factor
+
+     def __call__(self, acts: torch.Tensor) -> torch.Tensor:
+         return self.scale(acts)
+
+     @torch.no_grad()
+     def _calculate_mean_norm(
+         self, data_provider: DataProvider, n_batches_for_norm_estimate: int = int(1e3)
+     ) -> float:
+         norms_per_batch: list[float] = []
+         for _ in tqdm(
+             range(n_batches_for_norm_estimate), desc="Estimating norm scaling factor"
+         ):
+             acts = next(data_provider)
+             norms_per_batch.append(acts.norm(dim=-1).mean().item())
+         return mean(norms_per_batch)
+
+     def estimate_scaling_factor(
+         self,
+         d_in: int,
+         data_provider: DataProvider,
+         n_batches_for_norm_estimate: int = int(1e3),
+     ):
+         mean_norm = self._calculate_mean_norm(
+             data_provider, n_batches_for_norm_estimate
+         )
+         self.scaling_factor = (d_in**0.5) / mean_norm
+
+     def save(self, file_path: str):
+         """save the state dict to a file in json format"""
+         if not file_path.endswith(".json"):
+             raise ValueError("file_path must end with .json")
+
+         with open(file_path, "w") as f:
+             json.dump({"scaling_factor": self.scaling_factor}, f)
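
Usage sketch (not part of the diff): the new ActivationScaler estimates a scaling factor of sqrt(d_in) / mean activation norm from a stream of activation batches, then scales and unscales activations with it. The module path below is an assumption (the new file is unnamed in this diff), and the toy generator stands in for a real DataProvider.

import torch

# Assumed module path for the new file shown above.
from sae_lens.training.activation_scaler import ActivationScaler


def toy_provider():
    # Stand-in for a DataProvider: an endless stream of [batch, d_in] activations.
    while True:
        yield torch.randn(32, 768)


scaler = ActivationScaler()
scaler.estimate_scaling_factor(
    d_in=768, data_provider=toy_provider(), n_batches_for_norm_estimate=10
)

batch = torch.randn(32, 768)
scaled = scaler(batch)                  # __call__ is an alias for scale()
restored = scaler.unscale(scaled)       # inverse transform
scaler.save("activation_scaler.json")   # save() requires a .json path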