sae-lens 6.0.0rc2__py3-none-any.whl → 6.0.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sae_lens/saes/sae.py CHANGED
@@ -1,5 +1,6 @@
  """Base classes for Sparse Autoencoders (SAEs)."""

+ import copy
  import json
  import warnings
  from abc import ABC, abstractmethod
@@ -59,24 +60,91 @@ T_SAE = TypeVar("T_SAE", bound="SAE") # type: ignore
  T_TRAINING_SAE = TypeVar("T_TRAINING_SAE", bound="TrainingSAE") # type: ignore


- @dataclass
  class SAEMetadata:
      """Core metadata about how this SAE should be used, if known."""

-     model_name: str | None = None
-     hook_name: str | None = None
-     model_class_name: str | None = None
-     hook_layer: int | None = None
-     hook_head_index: int | None = None
-     model_from_pretrained_kwargs: dict[str, Any] | None = None
-     prepend_bos: bool | None = None
-     exclude_special_tokens: bool | list[int] | None = None
-     neuronpedia_id: str | None = None
-     context_size: int | None = None
-     seqpos_slice: tuple[int | None, ...] | None = None
-     dataset_path: str | None = None
-     sae_lens_version: str = field(default_factory=lambda: __version__)
-     sae_lens_training_version: str = field(default_factory=lambda: __version__)
+     def __init__(self, **kwargs: Any):
+         # Set default version fields with their current behavior
+         self.sae_lens_version = kwargs.pop("sae_lens_version", __version__)
+         self.sae_lens_training_version = kwargs.pop(
+             "sae_lens_training_version", __version__
+         )
+
+         # Set all other attributes dynamically
+         for key, value in kwargs.items():
+             setattr(self, key, value)
+
+     def __getattr__(self, name: str) -> None:
+         """Return None for any missing attribute (like defaultdict)"""
+         return
+
+     def __setattr__(self, name: str, value: Any) -> None:
+         """Allow setting any attribute"""
+         super().__setattr__(name, value)
+
+     def __getitem__(self, key: str) -> Any:
+         """Allow dictionary-style access: metadata['key'] - returns None for missing keys"""
+         return getattr(self, key)
+
+     def __setitem__(self, key: str, value: Any) -> None:
+         """Allow dictionary-style assignment: metadata['key'] = value"""
+         setattr(self, key, value)
+
+     def __contains__(self, key: str) -> bool:
+         """Allow 'in' operator: 'key' in metadata"""
+         # Only return True if the attribute was explicitly set (not just defaulting to None)
+         return key in self.__dict__
+
+     def get(self, key: str, default: Any = None) -> Any:
+         """Dictionary-style get with default"""
+         value = getattr(self, key)
+         # If the attribute wasn't explicitly set and we got None from __getattr__,
+         # use the provided default instead
+         if key not in self.__dict__ and value is None:
+             return default
+         return value
+
+     def keys(self):
+         """Return all explicitly set attribute names"""
+         return self.__dict__.keys()
+
+     def values(self):
+         """Return all explicitly set attribute values"""
+         return self.__dict__.values()
+
+     def items(self):
+         """Return all explicitly set attribute name-value pairs"""
+         return self.__dict__.items()
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization"""
+         return self.__dict__.copy()
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "SAEMetadata":
+         """Create from dictionary"""
+         return cls(**data)
+
+     def __repr__(self) -> str:
+         return f"SAEMetadata({self.__dict__})"
+
+     def __eq__(self, other: object) -> bool:
+         if not isinstance(other, SAEMetadata):
+             return False
+         return self.__dict__ == other.__dict__
+
+     def __deepcopy__(self, memo: dict[int, Any]) -> "SAEMetadata":
+         """Support for deep copying"""
+
+         return SAEMetadata(**copy.deepcopy(self.__dict__, memo))
+
+     def __getstate__(self) -> dict[str, Any]:
+         """Support for pickling"""
+         return self.__dict__
+
+     def __setstate__(self, state: dict[str, Any]) -> None:
+         """Support for unpickling"""
+         self.__dict__.update(state)


  @dataclass
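For reference, the rewritten SAEMetadata above behaves like a defaultdict-style attribute container rather than a fixed dataclass: unset attributes read as None, but only explicitly set keys count for `in`, `keys()`, and serialization. A minimal usage sketch of that behavior (the metadata values are illustrative, not taken from this diff):

```python
from sae_lens.saes.sae import SAEMetadata

metadata = SAEMetadata(model_name="gpt2", hook_name="blocks.0.hook_resid_pre")

print(metadata.model_name)                 # "gpt2" (attribute access)
print(metadata["hook_name"])               # "blocks.0.hook_resid_pre" (dict-style access)
print(metadata.hook_head_index)            # None -- any unset attribute defaults to None
print("hook_head_index" in metadata)       # False -- only explicitly set keys count
print(metadata.get("context_size", 128))   # 128 -- default used because the key was never set

metadata["context_size"] = 256             # dict-style assignment
print(sorted(metadata.keys()))
# ['context_size', 'hook_name', 'model_name', 'sae_lens_training_version', 'sae_lens_version']

# to_dict()/from_dict() round-trip, as used by the config serialization below.
assert SAEMetadata.from_dict(metadata.to_dict()) == metadata
```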
@@ -100,7 +168,7 @@ class SAEConfig(ABC):
      def to_dict(self) -> dict[str, Any]:
          res = {field.name: getattr(self, field.name) for field in fields(self)}
-         res["metadata"] = asdict(self.metadata)
+         res["metadata"] = self.metadata.to_dict()
          res["architecture"] = self.architecture()
          return res

@@ -125,7 +193,7 @@ class SAEConfig(ABC):
              "layer_norm",
          ]:
              raise ValueError(
-                 f"normalize_activations must be none, expected_average_only_in, constant_norm_rescale, or layer_norm. Got {self.normalize_activations}"
+                 f"normalize_activations must be none, expected_average_only_in, layer_norm, or constant_norm_rescale. Got {self.normalize_activations}"
              )


@@ -239,9 +307,8 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
              self.run_time_activation_norm_fn_in = run_time_activation_norm_fn_in
              self.run_time_activation_norm_fn_out = run_time_activation_norm_fn_out
-
          elif self.cfg.normalize_activations == "layer_norm":
-
+             # we need to scale the norm of the input and store the scaling factor
              def run_time_activation_ln_in(
                  x: torch.Tensor, eps: float = 1e-5
              ) -> torch.Tensor:
@@ -523,7 +590,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
          device: str = "cpu",
          force_download: bool = False,
          converter: PretrainedSaeHuggingfaceLoader | None = None,
-     ) -> tuple[T_SAE, dict[str, Any], torch.Tensor | None]:
+     ) -> T_SAE:
          """
          Load a pretrained SAE from the Hugging Face model hub.

@@ -531,7 +598,28 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
              release: The release name. This will be mapped to a huggingface repo id based on the pretrained_saes.yaml file.
              id: The id of the SAE to load. This will be mapped to a path in the huggingface repo.
              device: The device to load the SAE on.
-             return_sparsity_if_present: If True, will return the log sparsity tensor if it is present in the model directory in the Hugging Face model hub.
+         """
+         return cls.from_pretrained_with_cfg_and_sparsity(
+             release, sae_id, device, force_download, converter=converter
+         )[0]
+
+     @classmethod
+     def from_pretrained_with_cfg_and_sparsity(
+         cls: Type[T_SAE],
+         release: str,
+         sae_id: str,
+         device: str = "cpu",
+         force_download: bool = False,
+         converter: PretrainedSaeHuggingfaceLoader | None = None,
+     ) -> tuple[T_SAE, dict[str, Any], torch.Tensor | None]:
+         """
+         Load a pretrained SAE from the Hugging Face model hub, along with its config dict and sparsity, if present.
+         In SAELens <= 5.x.x, this was called SAE.from_pretrained().
+
+         Args:
+             release: The release name. This will be mapped to a huggingface repo id based on the pretrained_saes.yaml file.
+             id: The id of the SAE to load. This will be mapped to a path in the huggingface repo.
+             device: The device to load the SAE on.
          """

          # get sae directory
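The two hunks above split the old loader in two: `SAE.from_pretrained()` now returns only the SAE instance, while the previous tuple-returning behavior moves to `from_pretrained_with_cfg_and_sparsity()`. A minimal calling sketch (the release and SAE id below are illustrative placeholders, not taken from this diff):

```python
from sae_lens import SAE

# New in 6.x: from_pretrained() returns just the SAE instance.
sae = SAE.from_pretrained(
    release="gpt2-small-res-jb",       # placeholder release name
    sae_id="blocks.0.hook_resid_pre",  # placeholder SAE id
    device="cpu",
)

# The old tuple of (SAE, cfg dict, sparsity tensor or None) now lives here.
sae, cfg_dict, sparsity = SAE.from_pretrained_with_cfg_and_sparsity(
    release="gpt2-small-res-jb",
    sae_id="blocks.0.hook_resid_pre",
    device="cpu",
)
```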
@@ -647,9 +735,6 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

  @dataclass(kw_only=True)
  class TrainingSAEConfig(SAEConfig, ABC):
-     noise_scale: float = 0.0
-     mse_loss_normalization: str | None = None
-     b_dec_init_method: Literal["zeros", "geometric_median", "mean"] = "zeros"
      # https://transformer-circuits.pub/2024/april-update/index.html#training-saes
      # 0.1 corresponds to the "heuristic" initialization, use None to disable
      decoder_init_norm: float | None = 0.1
@@ -666,7 +751,6 @@ class TrainingSAEConfig(SAEConfig, ABC):
          metadata = SAEMetadata(
              model_name=cfg.model_name,
              hook_name=cfg.hook_name,
-             hook_layer=cfg.hook_layer,
              hook_head_index=cfg.hook_head_index,
              context_size=cfg.context_size,
              prepend_bos=cfg.prepend_bos,
@@ -683,9 +767,6 @@ class TrainingSAEConfig(SAEConfig, ABC):
      def from_dict(
          cls: type[T_TRAINING_SAE_CONFIG], config_dict: dict[str, Any]
      ) -> T_TRAINING_SAE_CONFIG:
-         # remove any keys that are not in the dataclass
-         # since we sometimes enhance the config with the whole LM runner config
-         valid_config_dict = filter_valid_dataclass_fields(config_dict, cls)
          cfg_class = cls
          if "architecture" in config_dict:
              cfg_class = get_sae_training_class(config_dict["architecture"])[1]
@@ -693,6 +774,9 @@ class TrainingSAEConfig(SAEConfig, ABC):
              raise ValueError(
                  f"SAE config class {cls} does not match dict config class {type(cfg_class)}"
              )
+         # remove any keys that are not in the dataclass
+         # since we sometimes enhance the config with the whole LM runner config
+         valid_config_dict = filter_valid_dataclass_fields(config_dict, cfg_class)
          if "metadata" in config_dict:
              valid_config_dict["metadata"] = SAEMetadata(**config_dict["metadata"])
          return cfg_class(**valid_config_dict)
@@ -701,6 +785,7 @@ class TrainingSAEConfig(SAEConfig, ABC):
          return {
              **super().to_dict(),
              **asdict(self),
+             "metadata": self.metadata.to_dict(),
              "architecture": self.architecture(),
          }

@@ -711,12 +796,14 @@ class TrainingSAEConfig(SAEConfig, ABC):
          Creates a dictionary containing attributes corresponding to the fields
          defined in the base SAEConfig class.
          """
-         base_config_field_names = {f.name for f in fields(SAEConfig)}
+         base_sae_cfg_class = get_sae_class(self.architecture())[1]
+         base_config_field_names = {f.name for f in fields(base_sae_cfg_class)}
          result_dict = {
              field_name: getattr(self, field_name)
              for field_name in base_config_field_names
          }
          result_dict["architecture"] = self.architecture()
+         result_dict["metadata"] = self.metadata.to_dict()
          return result_dict


@@ -729,7 +816,7 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
          # Turn off hook_z reshaping for training mode - the activation store
          # is expected to handle reshaping before passing data to the SAE
          self.turn_off_forward_pass_hook_z_reshaping()
-         self.mse_loss_fn = self._get_mse_loss_fn()
+         self.mse_loss_fn = mse_loss

      @abstractmethod
      def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]: ...
@@ -864,27 +951,6 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
          """
          return self.process_state_dict_for_saving(state_dict)

-     def _get_mse_loss_fn(self) -> Callable[[torch.Tensor, torch.Tensor], torch.Tensor]:
-         """Get the MSE loss function based on config."""
-
-         def standard_mse_loss_fn(
-             preds: torch.Tensor, target: torch.Tensor
-         ) -> torch.Tensor:
-             return torch.nn.functional.mse_loss(preds, target, reduction="none")
-
-         def batch_norm_mse_loss_fn(
-             preds: torch.Tensor, target: torch.Tensor
-         ) -> torch.Tensor:
-             target_centered = target - target.mean(dim=0, keepdim=True)
-             normalization = target_centered.norm(dim=-1, keepdim=True)
-             return torch.nn.functional.mse_loss(preds, target, reduction="none") / (
-                 normalization + 1e-6
-             )
-
-         if self.cfg.mse_loss_normalization == "dense_batch":
-             return batch_norm_mse_loss_fn
-         return standard_mse_loss_fn
-
      @torch.no_grad()
      def remove_gradient_parallel_to_decoder_directions(self) -> None:
          """Remove gradient components parallel to decoder directions."""
@@ -946,3 +1012,7 @@ def _disable_hooks(sae: SAE[Any]):
      finally:
          for hook_name, hook in sae.hook_dict.items():
              setattr(sae, hook_name, hook)
+
+
+ def mse_loss(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+     return torch.nn.functional.mse_loss(preds, target, reduction="none")
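With `_get_mse_loss_fn` removed (earlier hunk) and `mse_loss_fn` now bound to the module-level `mse_loss` added above, training always uses the unnormalized element-wise MSE; the old `mse_loss_normalization="dense_batch"` path is gone. A small sketch of what the new loss function returns (shapes are illustrative):

```python
import torch
import torch.nn.functional as F

# Same definition as the module-level mse_loss added in the hunk above.
def mse_loss(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    return F.mse_loss(preds, target, reduction="none")

preds = torch.zeros(4, 16)    # illustrative (batch, d_in) shapes
target = torch.ones(4, 16)
per_element = mse_loss(preds, target)
print(per_element.shape)          # torch.Size([4, 16]) -- unreduced, per-element errors
print(per_element.mean().item())  # 1.0 -- no batch-norm-style normalization applied
```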
@@ -67,7 +67,7 @@ class StandardSAE(SAE[StandardSAEConfig]):
          sae_in = self.process_sae_in(x)
          # Compute the pre-activation values
          hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
-         # Apply the activation function (e.g., ReLU, tanh-relu, depending on config)
+         # Apply the activation function (e.g., ReLU, depending on config)
          return self.hook_sae_acts_post(self.activation_fn(hidden_pre))

      def decode(
@@ -81,7 +81,7 @@ class StandardSAE(SAE[StandardSAEConfig]):
          sae_out_pre = feature_acts @ self.W_dec + self.b_dec
          # 2) hook reconstruction
          sae_out_pre = self.hook_sae_recons(sae_out_pre)
-         # 4) optional out-normalization (e.g. constant_norm_rescale or layer_norm)
+         # 4) optional out-normalization (e.g. constant_norm_rescale)
          sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
          # 5) if hook_z is enabled, rearrange back to (..., n_heads, d_head).
          return self.reshape_fn_out(sae_out_pre, self.d_head)
@@ -136,16 +136,9 @@ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
          sae_in = self.process_sae_in(x)
          # Compute the pre-activation (and allow for a hook if desired)
          hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc) # type: ignore
-         # Add noise during training for robustness (scaled by noise_scale from the configuration)
-         if self.training and self.cfg.noise_scale > 0:
-             hidden_pre_noised = (
-                 hidden_pre + torch.randn_like(hidden_pre) * self.cfg.noise_scale
-             )
-         else:
-             hidden_pre_noised = hidden_pre
          # Apply the activation function (and any post-activation hook)
-         feature_acts = self.hook_sae_acts_post(self.activation_fn(hidden_pre_noised))
-         return feature_acts, hidden_pre_noised
+         feature_acts = self.hook_sae_acts_post(self.activation_fn(hidden_pre))
+         return feature_acts, hidden_pre

      def calculate_aux_loss(
          self,
sae_lens/saes/topk_sae.py CHANGED
@@ -91,8 +91,7 @@ class TopKSAE(SAE[TopKSAEConfig]):
      ) -> Float[torch.Tensor, "... d_sae"]:
          """
          Converts input x into feature activations.
-         Uses topk activation from the config (cfg.activation_fn == "topk")
-         under the hood.
+         Uses topk activation under the hood.
          """
          sae_in = self.process_sae_in(x)
          hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
@@ -116,6 +115,13 @@ class TopKSAE(SAE[TopKSAEConfig]):
      def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
          return TopK(self.cfg.k)

+     @override
+     @torch.no_grad()
+     def fold_W_dec_norm(self) -> None:
+         raise NotImplementedError(
+             "Folding W_dec_norm is not safe for TopKSAEs, as this may change the topk activations"
+         )
+

  @dataclass
  class TopKTrainingSAEConfig(TrainingSAEConfig):
@@ -156,18 +162,11 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
          sae_in = self.process_sae_in(x)
          hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)

-         # Inject noise if training
-         if self.training and self.cfg.noise_scale > 0:
-             hidden_pre_noised = (
-                 hidden_pre + torch.randn_like(hidden_pre) * self.cfg.noise_scale
-             )
-         else:
-             hidden_pre_noised = hidden_pre
-
          # Apply the TopK activation function (already set in self.activation_fn if config is "topk")
-         feature_acts = self.hook_sae_acts_post(self.activation_fn(hidden_pre_noised))
-         return feature_acts, hidden_pre_noised
+         feature_acts = self.hook_sae_acts_post(self.activation_fn(hidden_pre))
+         return feature_acts, hidden_pre

+     @override
      def calculate_aux_loss(
          self,
          step_input: TrainStepInput,
@@ -184,6 +183,13 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
          )
          return {"auxiliary_reconstruction_loss": topk_loss}

+     @override
+     @torch.no_grad()
+     def fold_W_dec_norm(self) -> None:
+         raise NotImplementedError(
+             "Folding W_dec_norm is not safe for TopKSAEs, as this may change the topk activations"
+         )
+
      @override
      def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
          return TopK(self.cfg.k)
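Both TopKSAE and TopKTrainingSAE now override fold_W_dec_norm to refuse folding, since per-feature rescaling of the pre-activations could change which units land in the top k. A minimal sketch of the resulting behavior; the TopKSAEConfig constructor fields (d_in, d_sae) are assumptions here, as only cfg.k appears in this diff:

```python
# Sketch only: d_in/d_sae are assumed config fields; only cfg.k is shown in this diff.
from sae_lens.saes.topk_sae import TopKSAE, TopKSAEConfig

cfg = TopKSAEConfig(d_in=768, d_sae=16384, k=32)  # assumed constructor fields
sae = TopKSAE(cfg)

try:
    sae.fold_W_dec_norm()  # overridden above to block norm folding
except NotImplementedError as err:
    print(err)
    # Folding W_dec_norm is not safe for TopKSAEs, as this may change the topk activations
```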
@@ -0,0 +1,53 @@
+ import json
+ from dataclasses import dataclass
+ from statistics import mean
+
+ import torch
+ from tqdm.auto import tqdm
+
+ from sae_lens.training.types import DataProvider
+
+
+ @dataclass
+ class ActivationScaler:
+     scaling_factor: float | None = None
+
+     def scale(self, acts: torch.Tensor) -> torch.Tensor:
+         return acts if self.scaling_factor is None else acts * self.scaling_factor
+
+     def unscale(self, acts: torch.Tensor) -> torch.Tensor:
+         return acts if self.scaling_factor is None else acts / self.scaling_factor
+
+     def __call__(self, acts: torch.Tensor) -> torch.Tensor:
+         return self.scale(acts)
+
+     @torch.no_grad()
+     def _calculate_mean_norm(
+         self, data_provider: DataProvider, n_batches_for_norm_estimate: int = int(1e3)
+     ) -> float:
+         norms_per_batch: list[float] = []
+         for _ in tqdm(
+             range(n_batches_for_norm_estimate), desc="Estimating norm scaling factor"
+         ):
+             acts = next(data_provider)
+             norms_per_batch.append(acts.norm(dim=-1).mean().item())
+         return mean(norms_per_batch)
+
+     def estimate_scaling_factor(
+         self,
+         d_in: int,
+         data_provider: DataProvider,
+         n_batches_for_norm_estimate: int = int(1e3),
+     ):
+         mean_norm = self._calculate_mean_norm(
+             data_provider, n_batches_for_norm_estimate
+         )
+         self.scaling_factor = (d_in**0.5) / mean_norm
+
+     def save(self, file_path: str):
+         """save the state dict to a file in json format"""
+         if not file_path.endswith(".json"):
+             raise ValueError("file_path must end with .json")
+
+         with open(file_path, "w") as f:
+             json.dump({"scaling_factor": self.scaling_factor}, f)
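The hunk above adds a new ActivationScaler helper; the path of the new file is not shown in this section, so the import below is an assumption. A minimal usage sketch, with a toy iterator standing in for a real DataProvider:

```python
import itertools

import torch

# Assumed import path -- the new file's location is not shown in this diff section.
from sae_lens.training.activation_scaler import ActivationScaler

d_in = 16
torch.manual_seed(0)

# A DataProvider is consumed via next(...), so any iterator of activation batches works here.
data_provider = itertools.cycle([torch.randn(32, d_in) for _ in range(8)])

scaler = ActivationScaler()
scaler.estimate_scaling_factor(
    d_in=d_in, data_provider=data_provider, n_batches_for_norm_estimate=8
)

acts = next(data_provider)
scaled = scaler(acts)  # __call__ delegates to scale()
# After scaling, the mean activation norm is roughly sqrt(d_in).
print(scaled.norm(dim=-1).mean().item(), d_in**0.5)

assert torch.allclose(scaler.unscale(scaled), acts)  # unscale inverts scale
scaler.save("activation_scaler.json")  # writes {"scaling_factor": ...}
```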