sae-lens 6.0.0rc3__tar.gz → 6.0.0rc4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/PKG-INFO +1 -1
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/pyproject.toml +2 -1
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/__init__.py +1 -1
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/analysis/neuronpedia_integration.py +3 -3
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/config.py +5 -3
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/evals.py +20 -9
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/llm_sae_training_runner.py +113 -5
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/loading/pretrained_sae_loaders.py +24 -5
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/saes/gated_sae.py +0 -4
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/saes/jumprelu_sae.py +4 -10
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/saes/sae.py +121 -48
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/saes/standard_sae.py +4 -11
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/saes/topk_sae.py +18 -12
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/training/activation_scaler.py +1 -1
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/training/activations_store.py +0 -2
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/training/sae_trainer.py +11 -3
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/training/upload_saes_to_huggingface.py +1 -1
- sae_lens-6.0.0rc3/sae_lens/training/geometric_median.py +0 -101
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/LICENSE +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/README.md +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/analysis/__init__.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/analysis/hooked_sae_transformer.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/cache_activations_runner.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/constants.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/load_model.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/loading/__init__.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/loading/pretrained_saes_directory.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/pretokenize_runner.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/pretrained_saes.yaml +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/registry.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/saes/__init__.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/tokenization_and_batching.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/training/__init__.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/training/mixing_buffer.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/training/optim.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/training/types.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/tutorial/tsea.py +0 -0
- {sae_lens-6.0.0rc3 → sae_lens-6.0.0rc4}/sae_lens/util.py +0 -0
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sae-lens"
-version = "6.0.0-rc.3"
+version = "6.0.0-rc.4"
 description = "Training and Analyzing Sparse Autoencoders (SAEs)"
 authors = ["Joseph Bloom"]
 readme = "README.md"
@@ -60,6 +60,7 @@ tabulate = "^0.9.0"
 ruff = "^0.7.4"
 eai-sparsify = "^1.1.1"
 mike = "^2.0.0"
+trio = "^0.30.0"

 [tool.poetry.extras]
 mamba = ["mamba-lens"]
sae_lens/analysis/neuronpedia_integration.py
@@ -59,7 +59,7 @@ def NanAndInfReplacer(value: str):


 def open_neuronpedia_feature_dashboard(sae: SAE[Any], index: int):
-    sae_id = sae.cfg.neuronpedia_id
+    sae_id = sae.cfg.metadata.neuronpedia_id
     if sae_id is None:
         logger.warning(
             "SAE does not have a Neuronpedia ID. Either dashboards for this SAE do not exist (yet) on Neuronpedia, or the SAE was not loaded via the from_pretrained method"
@@ -74,7 +74,7 @@ def get_neuronpedia_quick_list(
     features: list[int],
     name: str = "temporary_list",
 ):
-    sae_id = sae.cfg.neuronpedia_id
+    sae_id = sae.cfg.metadata.neuronpedia_id
     if sae_id is None:
         logger.warning(
             "SAE does not have a Neuronpedia ID. Either dashboards for this SAE do not exist (yet) on Neuronpedia, or the SAE was not loaded via the from_pretrained method"
@@ -86,7 +86,7 @@ def get_neuronpedia_quick_list(
     url = url + "?name=" + name
     list_feature = [
         {
-            "modelId": sae.cfg.model_name,
+            "modelId": sae.cfg.metadata.model_name,
            "layer": sae_id.split("/")[1],
            "index": str(feature),
        }
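Illustrative note (not part of the diff): after this change, LLM-related fields such as `model_name` and `neuronpedia_id` are read from `sae.cfg.metadata` rather than directly from `sae.cfg`. A minimal sketch, assuming the release and id strings are placeholders chosen only for illustration:

```python
from sae_lens import SAE

# Hypothetical release / SAE id, used only to illustrate the attribute path.
sae = SAE.from_pretrained("gpt2-small-res-jb", "blocks.8.hook_resid_pre")

# 6.0.0rc4 style: LLM-specific fields live on cfg.metadata.
model_name = sae.cfg.metadata.model_name
neuronpedia_id = sae.cfg.metadata.neuronpedia_id  # may be None if not set
```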
sae_lens/config.py
@@ -201,7 +201,7 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
     train_batch_size_tokens: int = 4096

     ## Adam
-    adam_beta1: float = 0.
+    adam_beta1: float = 0.9
     adam_beta2: float = 0.999

     ## Learning Rate Schedule
@@ -390,7 +390,6 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
             adam_beta2=self.adam_beta2,
             lr_decay_steps=self.lr_decay_steps,
             n_restart_cycles=self.n_restart_cycles,
-            total_training_steps=self.total_training_steps,
             train_batch_size_samples=self.train_batch_size_tokens,
             dead_feature_window=self.dead_feature_window,
             feature_sampling_window=self.feature_sampling_window,
@@ -613,8 +612,11 @@ class SAETrainerConfig:
     adam_beta2: float
     lr_decay_steps: int
     n_restart_cycles: int
-    total_training_steps: int
     train_batch_size_samples: int
     dead_feature_window: int
     feature_sampling_window: int
     logger: LoggingConfig
+
+    @property
+    def total_training_steps(self) -> int:
+        return self.total_training_samples // self.train_batch_size_samples
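A quick arithmetic sketch of the derived property above; the sample values are made up, not taken from the package:

```python
# Hypothetical values, purely to show how total_training_steps is now derived.
total_training_samples = 2_000_000
train_batch_size_samples = 4096

total_training_steps = total_training_samples // train_batch_size_samples
print(total_training_steps)  # 488
```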
sae_lens/evals.py
@@ -4,6 +4,7 @@ import json
 import math
 import re
 import subprocess
+import sys
 from collections import defaultdict
 from collections.abc import Mapping
 from dataclasses import dataclass, field
@@ -15,7 +16,7 @@ from typing import Any
 import einops
 import pandas as pd
 import torch
-from tqdm import tqdm
+from tqdm.auto import tqdm
 from transformer_lens import HookedTransformer
 from transformer_lens.hook_points import HookedRootModule

@@ -814,16 +815,18 @@ def multiple_evals(
            release=sae_release_name,  # see other options in sae_lens/pretrained_saes.yaml
            sae_id=sae_id,  # won't always be a hook point
            device=device,
-        )
+        )

        # move SAE to device if not there already
        sae.to(device)

-        if current_model_str != sae.cfg.model_name:
+        if current_model_str != sae.cfg.metadata.model_name:
            del current_model  # potentially saves GPU memory
-            current_model_str = sae.cfg.model_name
+            current_model_str = sae.cfg.metadata.model_name
            current_model = HookedTransformer.from_pretrained_no_processing(
-                current_model_str,
+                current_model_str,
+                device=device,
+                **sae.cfg.metadata.model_from_pretrained_kwargs,
            )
            assert current_model is not None

@@ -941,7 +944,7 @@ def process_results(
    }


-
+def process_args(args: list[str]) -> argparse.Namespace:
    arg_parser = argparse.ArgumentParser(description="Run evaluations on SAEs")
    arg_parser.add_argument(
        "sae_regex_pattern",
@@ -1031,11 +1034,19 @@ if __name__ == "__main__":
        help="Enable verbose output with tqdm loaders.",
    )

-
-
-
+    return arg_parser.parse_args(args)
+
+
+def run_evals_cli(args: list[str]) -> None:
+    opts = process_args(args)
+    eval_results = run_evaluations(opts)
+    output_files = process_results(eval_results, opts.output_dir)

    print("Evaluation complete. Output files:")
    print(f"Individual JSONs: {len(output_files['individual_jsons'])}")  # type: ignore
    print(f"Combined JSON: {output_files['combined_json']}")
    print(f"CSV: {output_files['csv']}")
+
+
+if __name__ == "__main__":
+    run_evals_cli(sys.argv[1:])
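A minimal sketch of the new CLI entry point added above; it simply restates the `__main__` guard from the diff, so calling `run_evals_cli` programmatically is equivalent to running the module as a script (the actual flags are whatever `process_args` defines, e.g. the positional `sae_regex_pattern`):

```python
import sys

from sae_lens.evals import run_evals_cli

# Equivalent to running sae_lens/evals.py as a script with the same arguments.
run_evals_cli(sys.argv[1:])
```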
sae_lens/llm_sae_training_runner.py
@@ -4,7 +4,7 @@ import sys
 from collections.abc import Sequence
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Generic
+from typing import Any, Generic

 import torch
 import wandb
@@ -17,12 +17,16 @@ from sae_lens.config import HfDataset, LanguageModelSAERunnerConfig
 from sae_lens.constants import ACTIVATIONS_STORE_STATE_FILENAME, RUNNER_CFG_FILENAME
 from sae_lens.evals import EvalConfig, run_evals
 from sae_lens.load_model import load_model
+from sae_lens.saes.gated_sae import GatedTrainingSAEConfig
+from sae_lens.saes.jumprelu_sae import JumpReLUTrainingSAEConfig
 from sae_lens.saes.sae import (
     T_TRAINING_SAE,
     T_TRAINING_SAE_CONFIG,
     TrainingSAE,
     TrainingSAEConfig,
 )
+from sae_lens.saes.standard_sae import StandardTrainingSAEConfig
+from sae_lens.saes.topk_sae import TopKTrainingSAEConfig
 from sae_lens.training.activation_scaler import ActivationScaler
 from sae_lens.training.activations_store import ActivationsStore
 from sae_lens.training.sae_trainer import SAETrainer
@@ -145,17 +149,18 @@ class LanguageModelSAETrainingRunner:
            )
        else:
            self.sae = override_sae
+            self.sae.to(self.cfg.device)

    def run(self):
        """
        Run the training of the SAE.
        """
-
+        self._set_sae_metadata()
        if self.cfg.logger.log_to_wandb:
            wandb.init(
                project=self.cfg.logger.wandb_project,
                entity=self.cfg.logger.wandb_entity,
-                config=
+                config=self.cfg.to_dict(),
                name=self.cfg.logger.run_name,
                id=self.cfg.logger.wandb_id,
            )
@@ -184,6 +189,20 @@ class LanguageModelSAETrainingRunner:

        return sae

+    def _set_sae_metadata(self):
+        self.sae.cfg.metadata.dataset_path = self.cfg.dataset_path
+        self.sae.cfg.metadata.hook_name = self.cfg.hook_name
+        self.sae.cfg.metadata.model_name = self.cfg.model_name
+        self.sae.cfg.metadata.model_class_name = self.cfg.model_class_name
+        self.sae.cfg.metadata.hook_head_index = self.cfg.hook_head_index
+        self.sae.cfg.metadata.context_size = self.cfg.context_size
+        self.sae.cfg.metadata.seqpos_slice = self.cfg.seqpos_slice
+        self.sae.cfg.metadata.model_from_pretrained_kwargs = (
+            self.cfg.model_from_pretrained_kwargs
+        )
+        self.sae.cfg.metadata.prepend_bos = self.cfg.prepend_bos
+        self.sae.cfg.metadata.exclude_special_tokens = self.cfg.exclude_special_tokens
+
    def _compile_if_needed(self):
        # Compile model and SAE
        # torch.compile can provide significant speedups (10-20% in testing)
@@ -247,11 +266,100 @@ class LanguageModelSAETrainingRunner:
 def _parse_cfg_args(
     args: Sequence[str],
 ) -> LanguageModelSAERunnerConfig[TrainingSAEConfig]:
+    """
+    Parse command line arguments into a LanguageModelSAERunnerConfig.
+
+    This function first parses the architecture argument to determine which
+    concrete SAE config class to use, then parses the full configuration
+    with that concrete type.
+    """
     if len(args) == 0:
         args = ["--help"]
+
+    # First, parse only the architecture to determine which concrete class to use
+    architecture_parser = ArgumentParser(
+        description="Parse architecture to determine SAE config class",
+        exit_on_error=False,
+        add_help=False,  # Don't add help to avoid conflicts
+    )
+    architecture_parser.add_argument(
+        "--architecture",
+        type=str,
+        choices=["standard", "gated", "jumprelu", "topk"],
+        default="standard",
+        help="SAE architecture to use",
+    )
+
+    # Parse known args to extract architecture, ignore unknown args for now
+    arch_args, remaining_args = architecture_parser.parse_known_args(args)
+    architecture = arch_args.architecture
+
+    # Remove architecture from remaining args if it exists
+    filtered_args = []
+    skip_next = False
+    for arg in remaining_args:
+        if skip_next:
+            skip_next = False
+            continue
+        if arg == "--architecture":
+            skip_next = True  # Skip the next argument (the architecture value)
+            continue
+        filtered_args.append(arg)
+
+    # Create a custom wrapper class that simple_parsing can handle
+    def create_config_class(
+        sae_config_type: type[TrainingSAEConfig],
+    ) -> type[LanguageModelSAERunnerConfig[TrainingSAEConfig]]:
+        """Create a concrete config class for the given SAE config type."""
+
+        # Create the base config without the sae field
+        from dataclasses import field as dataclass_field
+        from dataclasses import fields, make_dataclass
+
+        # Get all fields from LanguageModelSAERunnerConfig except the generic sae field
+        base_fields = []
+        for field_obj in fields(LanguageModelSAERunnerConfig):
+            if field_obj.name != "sae":
+                base_fields.append((field_obj.name, field_obj.type, field_obj))
+
+        # Add the concrete sae field
+        base_fields.append(
+            (
+                "sae",
+                sae_config_type,
+                dataclass_field(
+                    default_factory=lambda: sae_config_type(d_in=512, d_sae=1024)
+                ),
+            )
+        )
+
+        # Create the concrete class
+        return make_dataclass(
+            f"{sae_config_type.__name__}RunnerConfig",
+            base_fields,
+            bases=(LanguageModelSAERunnerConfig,),
+        )
+
+    # Map architecture to concrete config class
+    sae_config_map = {
+        "standard": StandardTrainingSAEConfig,
+        "gated": GatedTrainingSAEConfig,
+        "jumprelu": JumpReLUTrainingSAEConfig,
+        "topk": TopKTrainingSAEConfig,
+    }
+
+    sae_config_type = sae_config_map[architecture]
+    concrete_config_class = create_config_class(sae_config_type)
+
+    # Now parse the full configuration with the concrete type
     parser = ArgumentParser(exit_on_error=False)
-    parser.add_arguments(
-
+    parser.add_arguments(concrete_config_class, dest="cfg")
+
+    # Parse the filtered arguments (without --architecture)
+    parsed_args = parser.parse_args(filtered_args)
+
+    # Return the parsed configuration
+    return parsed_args.cfg


 # moved into its own function to make it easier to test
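A hedged sketch of how the two-stage parsing above can be exercised. `_parse_cfg_args` and the `--architecture` flag come from the diff, but the other flag shown below is an assumption about what simple_parsing derives from the runner config field names, not a value taken from this diff:

```python
from sae_lens.llm_sae_training_runner import _parse_cfg_args

# "--architecture topk" is consumed by the first-pass parser; the rest is parsed
# against the TopK-specific runner config. "--model_name" is a guessed flag name.
cfg = _parse_cfg_args(["--architecture", "topk", "--model_name", "gpt2"])
print(type(cfg.sae).__name__)  # expected: TopKTrainingSAEConfig
```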
sae_lens/loading/pretrained_sae_loaders.py
@@ -26,6 +26,22 @@ from sae_lens.loading.pretrained_saes_directory import (
 from sae_lens.registry import get_sae_class
 from sae_lens.util import filter_valid_dataclass_fields

+LLM_METADATA_KEYS = {
+    "model_name",
+    "hook_name",
+    "model_class_name",
+    "hook_head_index",
+    "model_from_pretrained_kwargs",
+    "prepend_bos",
+    "exclude_special_tokens",
+    "neuronpedia_id",
+    "context_size",
+    "seqpos_slice",
+    "dataset_path",
+    "sae_lens_version",
+    "sae_lens_training_version",
+}
+

 # loaders take in a release, sae_id, device, and whether to force download, and returns a tuple of config, state_dict, and log sparsity
 class PretrainedSaeHuggingfaceLoader(Protocol):
@@ -207,6 +223,10 @@ def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
     new_cfg.setdefault("activation_fn", new_cfg.get("activation_fn", "relu"))
     new_cfg.setdefault("architecture", "standard")
     new_cfg.setdefault("neuronpedia_id", None)
+    new_cfg.setdefault(
+        "reshape_activations",
+        "hook_z" if "hook_z" in new_cfg.get("hook_name", "") else "none",
+    )

     if "normalize_activations" in new_cfg and isinstance(
         new_cfg["normalize_activations"], bool
@@ -231,11 +251,9 @@ def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
     if architecture == "topk":
         sae_cfg_dict["k"] = new_cfg["activation_fn_kwargs"]["k"]

-
-
-
-    meta_dict = filter_valid_dataclass_fields(new_cfg, SAEMetadata)
-    sae_cfg_dict["metadata"] = meta_dict
+    sae_cfg_dict["metadata"] = {
+        k: v for k, v in new_cfg.items() if k in LLM_METADATA_KEYS
+    }
     sae_cfg_dict["architecture"] = architecture
     return sae_cfg_dict

@@ -271,6 +289,7 @@ def get_connor_rob_hook_z_config_from_hf(
         "context_size": 128,
         "normalize_activations": "none",
         "dataset_trust_remote_code": True,
+        "reshape_activations": "hook_z",
         **(cfg_overrides or {}),
     }

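A small self-contained sketch of what the metadata split above does to a legacy (pre-6.0) config dict; the legacy values and the reduced key set are made up for illustration:

```python
# Subset of the LLM_METADATA_KEYS set from the diff, plus a fake legacy config.
LLM_METADATA_KEYS = {"model_name", "hook_name", "context_size", "dataset_path"}

legacy_cfg = {
    "d_in": 768,
    "d_sae": 24576,
    "model_name": "gpt2",
    "hook_name": "blocks.8.hook_resid_pre",
    "context_size": 128,
}

# Non-SAE fields are routed into the new metadata dict instead of the SAE config.
metadata = {k: v for k, v in legacy_cfg.items() if k in LLM_METADATA_KEYS}
print(metadata)
# {'model_name': 'gpt2', 'hook_name': 'blocks.8.hook_resid_pre', 'context_size': 128}
```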
sae_lens/saes/gated_sae.py
@@ -168,10 +168,6 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):

         # Magnitude path
         magnitude_pre_activation = sae_in @ (self.W_enc * self.r_mag.exp()) + self.b_mag
-        if self.training and self.cfg.noise_scale > 0:
-            magnitude_pre_activation += (
-                torch.randn_like(magnitude_pre_activation) * self.cfg.noise_scale
-            )
         magnitude_pre_activation = self.hook_sae_acts_pre(magnitude_pre_activation)

         feature_magnitudes = self.activation_fn(magnitude_pre_activation)
sae_lens/saes/jumprelu_sae.py
@@ -105,7 +105,7 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
     JumpReLUSAE is an inference-only implementation of a Sparse Autoencoder (SAE)
     using a JumpReLU activation. For each unit, if its pre-activation is
     <= threshold, that unit is zeroed out; otherwise, it follows a user-specified
-    activation function (e.g., ReLU
+    activation function (e.g., ReLU etc.).

     It implements:
       - initialize_weights: sets up parameters, including a threshold.
@@ -142,7 +142,7 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
         sae_in = self.process_sae_in(x)
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)

-        # 1) Apply the base "activation_fn" from config (e.g., ReLU
+        # 1) Apply the base "activation_fn" from config (e.g., ReLU).
         base_acts = self.activation_fn(hidden_pre)

         # 2) Zero out any unit whose (hidden_pre <= threshold).
@@ -191,8 +191,8 @@ class JumpReLUTrainingSAEConfig(TrainingSAEConfig):
     Configuration class for training a JumpReLUTrainingSAE.
     """

-    jumprelu_init_threshold: float = 0.
-    jumprelu_bandwidth: float = 0.
+    jumprelu_init_threshold: float = 0.01
+    jumprelu_bandwidth: float = 0.05
     l0_coefficient: float = 1.0
     l0_warm_up_steps: int = 0

@@ -257,12 +257,6 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         sae_in = self.process_sae_in(x)

         hidden_pre = sae_in @ self.W_enc + self.b_enc
-
-        if self.training and self.cfg.noise_scale > 0:
-            hidden_pre = (
-                hidden_pre + torch.randn_like(hidden_pre) * self.cfg.noise_scale
-            )
-
         feature_acts = JumpReLU.apply(hidden_pre, self.threshold, self.bandwidth)

         return feature_acts, hidden_pre  # type: ignore
sae_lens/saes/sae.py
@@ -1,5 +1,6 @@
 """Base classes for Sparse Autoencoders (SAEs)."""

+import copy
 import json
 import warnings
 from abc import ABC, abstractmethod
@@ -59,23 +60,91 @@ T_SAE = TypeVar("T_SAE", bound="SAE")  # type: ignore
 T_TRAINING_SAE = TypeVar("T_TRAINING_SAE", bound="TrainingSAE")  # type: ignore


-@dataclass
 class SAEMetadata:
     """Core metadata about how this SAE should be used, if known."""

-
-
-
-
-
-
-
-
-
-
-
-
-
+    def __init__(self, **kwargs: Any):
+        # Set default version fields with their current behavior
+        self.sae_lens_version = kwargs.pop("sae_lens_version", __version__)
+        self.sae_lens_training_version = kwargs.pop(
+            "sae_lens_training_version", __version__
+        )
+
+        # Set all other attributes dynamically
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    def __getattr__(self, name: str) -> None:
+        """Return None for any missing attribute (like defaultdict)"""
+        return
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        """Allow setting any attribute"""
+        super().__setattr__(name, value)
+
+    def __getitem__(self, key: str) -> Any:
+        """Allow dictionary-style access: metadata['key'] - returns None for missing keys"""
+        return getattr(self, key)
+
+    def __setitem__(self, key: str, value: Any) -> None:
+        """Allow dictionary-style assignment: metadata['key'] = value"""
+        setattr(self, key, value)
+
+    def __contains__(self, key: str) -> bool:
+        """Allow 'in' operator: 'key' in metadata"""
+        # Only return True if the attribute was explicitly set (not just defaulting to None)
+        return key in self.__dict__
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """Dictionary-style get with default"""
+        value = getattr(self, key)
+        # If the attribute wasn't explicitly set and we got None from __getattr__,
+        # use the provided default instead
+        if key not in self.__dict__ and value is None:
+            return default
+        return value
+
+    def keys(self):
+        """Return all explicitly set attribute names"""
+        return self.__dict__.keys()
+
+    def values(self):
+        """Return all explicitly set attribute values"""
+        return self.__dict__.values()
+
+    def items(self):
+        """Return all explicitly set attribute name-value pairs"""
+        return self.__dict__.items()
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for serialization"""
+        return self.__dict__.copy()
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "SAEMetadata":
+        """Create from dictionary"""
+        return cls(**data)
+
+    def __repr__(self) -> str:
+        return f"SAEMetadata({self.__dict__})"
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, SAEMetadata):
+            return False
+        return self.__dict__ == other.__dict__
+
+    def __deepcopy__(self, memo: dict[int, Any]) -> "SAEMetadata":
+        """Support for deep copying"""
+
+        return SAEMetadata(**copy.deepcopy(self.__dict__, memo))
+
+    def __getstate__(self) -> dict[str, Any]:
+        """Support for pickling"""
+        return self.__dict__
+
+    def __setstate__(self, state: dict[str, Any]) -> None:
+        """Support for unpickling"""
+        self.__dict__.update(state)


 @dataclass
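A short sketch of the behaviour the new `SAEMetadata` above provides (attribute-style and dict-style access, with `None` for anything not explicitly set); constructed standalone here purely for illustration:

```python
from sae_lens.saes.sae import SAEMetadata

meta = SAEMetadata(model_name="gpt2", hook_name="blocks.8.hook_resid_pre")

print(meta.model_name)                # "gpt2" (attribute access)
print(meta["hook_name"])              # "blocks.8.hook_resid_pre" (dict-style access)
print(meta.neuronpedia_id)            # None: unset attributes fall back to None
print("model_name" in meta)           # True: only explicitly set keys count
print(meta.get("context_size", 128))  # 128: dict-style get with a default

round_trip = SAEMetadata.from_dict(meta.to_dict())
print(round_trip == meta)             # True: to_dict/from_dict round-trips
```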
@@ -99,7 +168,7 @@ class SAEConfig(ABC):

     def to_dict(self) -> dict[str, Any]:
         res = {field.name: getattr(self, field.name) for field in fields(self)}
-        res["metadata"] =
+        res["metadata"] = self.metadata.to_dict()
         res["architecture"] = self.architecture()
         return res

@@ -124,7 +193,7 @@ class SAEConfig(ABC):
            "layer_norm",
        ]:
            raise ValueError(
-                f"normalize_activations must be none, expected_average_only_in,
+                f"normalize_activations must be none, expected_average_only_in, layer_norm, or constant_norm_rescale. Got {self.normalize_activations}"
            )


@@ -238,9 +307,8 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

            self.run_time_activation_norm_fn_in = run_time_activation_norm_fn_in
            self.run_time_activation_norm_fn_out = run_time_activation_norm_fn_out
-
        elif self.cfg.normalize_activations == "layer_norm":
-
+            # we need to scale the norm of the input and store the scaling factor
            def run_time_activation_ln_in(
                x: torch.Tensor, eps: float = 1e-5
            ) -> torch.Tensor:
@@ -522,7 +590,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
        device: str = "cpu",
        force_download: bool = False,
        converter: PretrainedSaeHuggingfaceLoader | None = None,
-    ) ->
+    ) -> T_SAE:
        """
        Load a pretrained SAE from the Hugging Face model hub.

@@ -530,7 +598,28 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
            release: The release name. This will be mapped to a huggingface repo id based on the pretrained_saes.yaml file.
            id: The id of the SAE to load. This will be mapped to a path in the huggingface repo.
            device: The device to load the SAE on.
-
+        """
+        return cls.from_pretrained_with_cfg_and_sparsity(
+            release, sae_id, device, force_download, converter=converter
+        )[0]
+
+    @classmethod
+    def from_pretrained_with_cfg_and_sparsity(
+        cls: Type[T_SAE],
+        release: str,
+        sae_id: str,
+        device: str = "cpu",
+        force_download: bool = False,
+        converter: PretrainedSaeHuggingfaceLoader | None = None,
+    ) -> tuple[T_SAE, dict[str, Any], torch.Tensor | None]:
+        """
+        Load a pretrained SAE from the Hugging Face model hub, along with its config dict and sparsity, if present.
+        In SAELens <= 5.x.x, this was called SAE.from_pretrained().
+
+        Args:
+            release: The release name. This will be mapped to a huggingface repo id based on the pretrained_saes.yaml file.
+            id: The id of the SAE to load. This will be mapped to a path in the huggingface repo.
+            device: The device to load the SAE on.
        """

        # get sae directory
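Illustrative usage of the split above: `from_pretrained` now returns just the SAE, while the old 3-tuple behaviour lives in `from_pretrained_with_cfg_and_sparsity`. The release and id strings are placeholders, not values taken from this diff:

```python
from sae_lens import SAE

# New-style: just the SAE object.
sae = SAE.from_pretrained("gpt2-small-res-jb", "blocks.8.hook_resid_pre", device="cpu")

# Old-style 3-tuple (what SAE.from_pretrained returned in SAELens <= 5.x):
sae2, cfg_dict, sparsity = SAE.from_pretrained_with_cfg_and_sparsity(
    "gpt2-small-res-jb", "blocks.8.hook_resid_pre", device="cpu"
)
```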
@@ -646,8 +735,6 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

 @dataclass(kw_only=True)
 class TrainingSAEConfig(SAEConfig, ABC):
-    noise_scale: float = 0.0
-    mse_loss_normalization: str | None = None
     # https://transformer-circuits.pub/2024/april-update/index.html#training-saes
     # 0.1 corresponds to the "heuristic" initialization, use None to disable
     decoder_init_norm: float | None = 0.1
@@ -680,9 +767,6 @@ class TrainingSAEConfig(SAEConfig, ABC):
     def from_dict(
         cls: type[T_TRAINING_SAE_CONFIG], config_dict: dict[str, Any]
     ) -> T_TRAINING_SAE_CONFIG:
-        # remove any keys that are not in the dataclass
-        # since we sometimes enhance the config with the whole LM runner config
-        valid_config_dict = filter_valid_dataclass_fields(config_dict, cls)
         cfg_class = cls
         if "architecture" in config_dict:
             cfg_class = get_sae_training_class(config_dict["architecture"])[1]
@@ -690,6 +774,9 @@ class TrainingSAEConfig(SAEConfig, ABC):
            raise ValueError(
                f"SAE config class {cls} does not match dict config class {type(cfg_class)}"
            )
+        # remove any keys that are not in the dataclass
+        # since we sometimes enhance the config with the whole LM runner config
+        valid_config_dict = filter_valid_dataclass_fields(config_dict, cfg_class)
        if "metadata" in config_dict:
            valid_config_dict["metadata"] = SAEMetadata(**config_dict["metadata"])
        return cfg_class(**valid_config_dict)
@@ -698,6 +785,7 @@ class TrainingSAEConfig(SAEConfig, ABC):
        return {
            **super().to_dict(),
            **asdict(self),
+            "metadata": self.metadata.to_dict(),
            "architecture": self.architecture(),
        }

@@ -708,12 +796,14 @@ class TrainingSAEConfig(SAEConfig, ABC):
        Creates a dictionary containing attributes corresponding to the fields
        defined in the base SAEConfig class.
        """
-
+        base_sae_cfg_class = get_sae_class(self.architecture())[1]
+        base_config_field_names = {f.name for f in fields(base_sae_cfg_class)}
        result_dict = {
            field_name: getattr(self, field_name)
            for field_name in base_config_field_names
        }
        result_dict["architecture"] = self.architecture()
+        result_dict["metadata"] = self.metadata.to_dict()
        return result_dict


@@ -726,7 +816,7 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
        # Turn off hook_z reshaping for training mode - the activation store
        # is expected to handle reshaping before passing data to the SAE
        self.turn_off_forward_pass_hook_z_reshaping()
-        self.mse_loss_fn =
+        self.mse_loss_fn = mse_loss

    @abstractmethod
    def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]: ...
@@ -861,27 +951,6 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
        """
        return self.process_state_dict_for_saving(state_dict)

-    def _get_mse_loss_fn(self) -> Callable[[torch.Tensor, torch.Tensor], torch.Tensor]:
-        """Get the MSE loss function based on config."""
-
-        def standard_mse_loss_fn(
-            preds: torch.Tensor, target: torch.Tensor
-        ) -> torch.Tensor:
-            return torch.nn.functional.mse_loss(preds, target, reduction="none")
-
-        def batch_norm_mse_loss_fn(
-            preds: torch.Tensor, target: torch.Tensor
-        ) -> torch.Tensor:
-            target_centered = target - target.mean(dim=0, keepdim=True)
-            normalization = target_centered.norm(dim=-1, keepdim=True)
-            return torch.nn.functional.mse_loss(preds, target, reduction="none") / (
-                normalization + 1e-6
-            )
-
-        if self.cfg.mse_loss_normalization == "dense_batch":
-            return batch_norm_mse_loss_fn
-        return standard_mse_loss_fn
-
    @torch.no_grad()
    def remove_gradient_parallel_to_decoder_directions(self) -> None:
        """Remove gradient components parallel to decoder directions."""
@@ -943,3 +1012,7 @@ def _disable_hooks(sae: SAE[Any]):
    finally:
        for hook_name, hook in sae.hook_dict.items():
            setattr(sae, hook_name, hook)
+
+
+def mse_loss(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+    return torch.nn.functional.mse_loss(preds, target, reduction="none")
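The module-level `mse_loss` added above is a plain unreduced (element-wise) MSE; a tiny sanity check, written directly against torch rather than the package:

```python
import torch
import torch.nn.functional as F

preds = torch.tensor([[1.0, 2.0]])
target = torch.tensor([[0.0, 2.0]])

# reduction="none" keeps per-element squared errors instead of averaging them.
print(F.mse_loss(preds, target, reduction="none"))  # tensor([[1., 0.]])
```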
sae_lens/saes/standard_sae.py
@@ -67,7 +67,7 @@ class StandardSAE(SAE[StandardSAEConfig]):
         sae_in = self.process_sae_in(x)
         # Compute the pre-activation values
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
-        # Apply the activation function (e.g., ReLU,
+        # Apply the activation function (e.g., ReLU, depending on config)
         return self.hook_sae_acts_post(self.activation_fn(hidden_pre))

     def decode(
@@ -81,7 +81,7 @@ class StandardSAE(SAE[StandardSAEConfig]):
         sae_out_pre = feature_acts @ self.W_dec + self.b_dec
         # 2) hook reconstruction
         sae_out_pre = self.hook_sae_recons(sae_out_pre)
-        # 4) optional out-normalization (e.g. constant_norm_rescale
+        # 4) optional out-normalization (e.g. constant_norm_rescale)
         sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
         # 5) if hook_z is enabled, rearrange back to (..., n_heads, d_head).
         return self.reshape_fn_out(sae_out_pre, self.d_head)
@@ -136,16 +136,9 @@ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
         sae_in = self.process_sae_in(x)
         # Compute the pre-activation (and allow for a hook if desired)
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)  # type: ignore
-        # Add noise during training for robustness (scaled by noise_scale from the configuration)
-        if self.training and self.cfg.noise_scale > 0:
-            hidden_pre_noised = (
-                hidden_pre + torch.randn_like(hidden_pre) * self.cfg.noise_scale
-            )
-        else:
-            hidden_pre_noised = hidden_pre
         # Apply the activation function (and any post-activation hook)
-        feature_acts = self.hook_sae_acts_post(self.activation_fn(
-        return feature_acts,
+        feature_acts = self.hook_sae_acts_post(self.activation_fn(hidden_pre))
+        return feature_acts, hidden_pre

     def calculate_aux_loss(
         self,
sae_lens/saes/topk_sae.py
@@ -91,8 +91,7 @@ class TopKSAE(SAE[TopKSAEConfig]):
     ) -> Float[torch.Tensor, "... d_sae"]:
         """
         Converts input x into feature activations.
-        Uses topk activation
-        under the hood.
+        Uses topk activation under the hood.
         """
         sae_in = self.process_sae_in(x)
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
@@ -116,6 +115,13 @@ class TopKSAE(SAE[TopKSAEConfig]):
     def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
         return TopK(self.cfg.k)

+    @override
+    @torch.no_grad()
+    def fold_W_dec_norm(self) -> None:
+        raise NotImplementedError(
+            "Folding W_dec_norm is not safe for TopKSAEs, as this may change the topk activations"
+        )
+

 @dataclass
 class TopKTrainingSAEConfig(TrainingSAEConfig):
@@ -156,18 +162,11 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
         sae_in = self.process_sae_in(x)
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)

-        # Inject noise if training
-        if self.training and self.cfg.noise_scale > 0:
-            hidden_pre_noised = (
-                hidden_pre + torch.randn_like(hidden_pre) * self.cfg.noise_scale
-            )
-        else:
-            hidden_pre_noised = hidden_pre
-
         # Apply the TopK activation function (already set in self.activation_fn if config is "topk")
-        feature_acts = self.hook_sae_acts_post(self.activation_fn(
-        return feature_acts,
+        feature_acts = self.hook_sae_acts_post(self.activation_fn(hidden_pre))
+        return feature_acts, hidden_pre

+    @override
     def calculate_aux_loss(
         self,
         step_input: TrainStepInput,
@@ -184,6 +183,13 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
         )
         return {"auxiliary_reconstruction_loss": topk_loss}

+    @override
+    @torch.no_grad()
+    def fold_W_dec_norm(self) -> None:
+        raise NotImplementedError(
+            "Folding W_dec_norm is not safe for TopKSAEs, as this may change the topk activations"
+        )
+
     @override
     def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
         return TopK(self.cfg.k)
sae_lens/training/activations_store.py
@@ -161,8 +161,6 @@ class ActivationsStore:
     ) -> ActivationsStore:
         if sae.cfg.metadata.hook_name is None:
             raise ValueError("hook_name is required")
-        if sae.cfg.metadata.hook_head_index is None:
-            raise ValueError("hook_head_index is required")
         if sae.cfg.metadata.context_size is None:
             raise ValueError("context_size is required")
         if sae.cfg.metadata.prepend_bos is None:
sae_lens/training/sae_trainer.py
@@ -7,7 +7,7 @@ import torch
 import wandb
 from safetensors.torch import save_file
 from torch.optim import Adam
-from tqdm import tqdm
+from tqdm.auto import tqdm

 from sae_lens import __version__
 from sae_lens.config import SAETrainerConfig
@@ -161,6 +161,7 @@ class SAETrainer(Generic[T_TRAINING_SAE, T_TRAINING_SAE_CONFIG]):
         return (self.n_forward_passes_since_fired > self.cfg.dead_feature_window).bool()

     def fit(self) -> T_TRAINING_SAE:
+        self.sae.to(self.cfg.device)
         pbar = tqdm(total=self.cfg.total_training_samples, desc="Training SAE")

         if self.sae.cfg.normalize_activations == "expected_average_only_in":
@@ -194,10 +195,11 @@ class SAETrainer(Generic[T_TRAINING_SAE, T_TRAINING_SAE_CONFIG]):
            )
            self.activation_scaler.scaling_factor = None

-        # save final sae group to checkpoints folder
+        # save final inference sae group to checkpoints folder
        self.save_checkpoint(
            checkpoint_name=f"final_{self.n_training_samples}",
            wandb_aliases=["final_model"],
+            save_inference_model=True,
        )

        pbar.close()
@@ -207,11 +209,17 @@ class SAETrainer(Generic[T_TRAINING_SAE, T_TRAINING_SAE_CONFIG]):
        self,
        checkpoint_name: str,
        wandb_aliases: list[str] | None = None,
+        save_inference_model: bool = False,
    ) -> None:
        checkpoint_path = Path(self.cfg.checkpoint_path) / checkpoint_name
        checkpoint_path.mkdir(exist_ok=True, parents=True)

-
+        save_fn = (
+            self.sae.save_inference_model
+            if save_inference_model
+            else self.sae.save_model
+        )
+        weights_path, cfg_path = save_fn(str(checkpoint_path))

        sparsity_path = checkpoint_path / SPARSITY_FILENAME
        save_file({"sparsity": self.log_feature_sparsity}, sparsity_path)
sae_lens/training/upload_saes_to_huggingface.py
@@ -88,7 +88,7 @@ def _create_default_readme(repo_id: str, sae_ids: Iterable[str]) -> str:
         ```python
         from sae_lens import SAE

-        sae
+        sae = SAE.from_pretrained("{repo_id}", "<sae_id>")
         ```
         """
     )
sae_lens-6.0.0rc3/sae_lens/training/geometric_median.py (removed in 6.0.0rc4)
@@ -1,101 +0,0 @@
-from types import SimpleNamespace
-
-import torch
-import tqdm
-
-
-def weighted_average(points: torch.Tensor, weights: torch.Tensor):
-    weights = weights / weights.sum()
-    return (points * weights.view(-1, 1)).sum(dim=0)
-
-
-@torch.no_grad()
-def geometric_median_objective(
-    median: torch.Tensor, points: torch.Tensor, weights: torch.Tensor
-) -> torch.Tensor:
-    norms = torch.linalg.norm(points - median.view(1, -1), dim=1)  # type: ignore
-
-    return (norms * weights).sum()
-
-
-def compute_geometric_median(
-    points: torch.Tensor,
-    weights: torch.Tensor | None = None,
-    eps: float = 1e-6,
-    maxiter: int = 100,
-    ftol: float = 1e-20,
-    do_log: bool = False,
-):
-    """
-    :param points: ``torch.Tensor`` of shape ``(n, d)``
-    :param weights: Optional ``torch.Tensor`` of shape :math:``(n,)``.
-    :param eps: Smallest allowed value of denominator, to avoid divide by zero.
-        Equivalently, this is a smoothing parameter. Default 1e-6.
-    :param maxiter: Maximum number of Weiszfeld iterations. Default 100
-    :param ftol: If objective value does not improve by at least this `ftol` fraction, terminate the algorithm. Default 1e-20.
-    :param do_log: If true will return a log of function values encountered through the course of the algorithm
-    :return: SimpleNamespace object with fields
-        - `median`: estimate of the geometric median, which is a ``torch.Tensor`` object of shape :math:``(d,)``
-        - `termination`: string explaining how the algorithm terminated.
-        - `logs`: function values encountered through the course of the algorithm in a list (None if do_log is false).
-    """
-    with torch.no_grad():
-        if weights is None:
-            weights = torch.ones((points.shape[0],), device=points.device)
-        # initialize median estimate at mean
-        new_weights = weights
-        median = weighted_average(points, weights)
-        objective_value = geometric_median_objective(median, points, weights)
-        logs = [objective_value] if do_log else None
-
-        # Weiszfeld iterations
-        early_termination = False
-        pbar = tqdm.tqdm(range(maxiter))
-        for _ in pbar:
-            prev_obj_value = objective_value
-
-            norms = torch.linalg.norm(points - median.view(1, -1), dim=1)  # type: ignore
-            new_weights = weights / torch.clamp(norms, min=eps)
-            median = weighted_average(points, new_weights)
-            objective_value = geometric_median_objective(median, points, weights)
-
-            if logs is not None:
-                logs.append(objective_value)
-            if abs(prev_obj_value - objective_value) <= ftol * objective_value:
-                early_termination = True
-                break
-
-            pbar.set_description(f"Objective value: {objective_value:.4f}")
-
-    median = weighted_average(points, new_weights)  # allow autodiff to track it
-    return SimpleNamespace(
-        median=median,
-        new_weights=new_weights,
-        termination=(
-            "function value converged within tolerance"
-            if early_termination
-            else "maximum iterations reached"
-        ),
-        logs=logs,
-    )
-
-
-if __name__ == "__main__":
-    import time
-
-    TOLERANCE = 1e-2
-
-    dim1 = 10000
-    dim2 = 768
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    sample = (
-        torch.randn((dim1, dim2), device=device) * 100
-    )  # seems to be the order of magnitude of the actual use case
-    weights = torch.randn((dim1,), device=device)
-
-    torch.tensor(weights, device=device)
-
-    tic = time.perf_counter()
-    new = compute_geometric_median(sample, weights=weights, maxiter=100)
-    print(f"new code takes {time.perf_counter()-tic} seconds!")  # noqa: T201