sae-lens 6.0.0rc2__py3-none-any.whl → 6.0.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sae_lens/__init__.py CHANGED
@@ -1,5 +1,5 @@
  # ruff: noqa: E402
- __version__ = "6.0.0-rc.2"
+ __version__ = "6.0.0-rc.4"

  import logging

@@ -33,16 +33,17 @@ from .cache_activations_runner import CacheActivationsRunner
  from .config import (
      CacheActivationsRunnerConfig,
      LanguageModelSAERunnerConfig,
+     LoggingConfig,
      PretokenizeRunnerConfig,
  )
  from .evals import run_evals
+ from .llm_sae_training_runner import LanguageModelSAETrainingRunner, SAETrainingRunner
  from .loading.pretrained_sae_loaders import (
      PretrainedSaeDiskLoader,
      PretrainedSaeHuggingfaceLoader,
  )
  from .pretokenize_runner import PretokenizeRunner, pretokenize_runner
  from .registry import register_sae_class, register_sae_training_class
- from .sae_training_runner import SAETrainingRunner
  from .training.activations_store import ActivationsStore
  from .training.upload_saes_to_huggingface import upload_saes_to_huggingface

@@ -54,7 +55,7 @@ __all__ = [
      "HookedSAETransformer",
      "ActivationsStore",
      "LanguageModelSAERunnerConfig",
-     "SAETrainingRunner",
+     "LanguageModelSAETrainingRunner",
      "CacheActivationsRunnerConfig",
      "CacheActivationsRunner",
      "PretokenizeRunnerConfig",
@@ -82,6 +83,8 @@ __all__ = [
      "JumpReLUSAEConfig",
      "JumpReLUTrainingSAE",
      "JumpReLUTrainingSAEConfig",
+     "SAETrainingRunner",
+     "LoggingConfig",
  ]


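The training runner exports change here: `LanguageModelSAETrainingRunner` is the new name, and both it and `SAETrainingRunner` are now imported from `sae_lens.llm_sae_training_runner`, with `LoggingConfig` also re-exported at the top level. A minimal import sketch under the new layout (it only shows which names resolve, nothing is executed):

    # Sketch only: top-level imports available in 6.0.0rc4 per the diff above.
    from sae_lens import (
        LanguageModelSAERunnerConfig,
        LanguageModelSAETrainingRunner,
        LoggingConfig,
        SAETrainingRunner,  # still exported, now sourced from llm_sae_training_runner
    )
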
@@ -59,7 +59,7 @@ def NanAndInfReplacer(value: str):


  def open_neuronpedia_feature_dashboard(sae: SAE[Any], index: int):
-     sae_id = sae.cfg.neuronpedia_id
+     sae_id = sae.cfg.metadata.neuronpedia_id
      if sae_id is None:
          logger.warning(
              "SAE does not have a Neuronpedia ID. Either dashboards for this SAE do not exist (yet) on Neuronpedia, or the SAE was not loaded via the from_pretrained method"
@@ -74,7 +74,7 @@ def get_neuronpedia_quick_list(
      features: list[int],
      name: str = "temporary_list",
  ):
-     sae_id = sae.cfg.neuronpedia_id
+     sae_id = sae.cfg.metadata.neuronpedia_id
      if sae_id is None:
          logger.warning(
              "SAE does not have a Neuronpedia ID. Either dashboards for this SAE do not exist (yet) on Neuronpedia, or the SAE was not loaded via the from_pretrained method"
@@ -86,7 +86,7 @@ def get_neuronpedia_quick_list(
      url = url + "?name=" + name
      list_feature = [
          {
-             "modelId": sae.cfg.model_name,
+             "modelId": sae.cfg.metadata.model_name,
              "layer": sae_id.split("/")[1],
              "index": str(feature),
          }
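These hunks move SAE-level metadata behind `sae.cfg.metadata`. A hedged sketch of the new access pattern; the release and SAE id strings below are placeholders rather than values taken from this diff:

    # Sketch only: reading metadata fields after the cfg -> cfg.metadata move.
    from sae_lens import SAE

    sae = SAE.from_pretrained(
        release="gpt2-small-res-jb",       # placeholder release name
        sae_id="blocks.8.hook_resid_pre",  # placeholder SAE id
    )
    print(sae.cfg.metadata.model_name)
    print(sae.cfg.metadata.neuronpedia_id)  # None if no dashboard exists for this SAE
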
@@ -34,7 +34,6 @@ def _mk_activations_store(
          dataset=override_dataset or cfg.dataset_path,
          streaming=cfg.streaming,
          hook_name=cfg.hook_name,
-         hook_layer=cfg.hook_layer,
          hook_head_index=None,
          context_size=cfg.context_size,
          d_in=cfg.d_in,
@@ -265,7 +264,7 @@ class CacheActivationsRunner:

          for i in tqdm(range(self.cfg.n_buffers), desc="Caching activations"):
              try:
-                 buffer = self.activations_store.get_buffer(
+                 buffer = self.activations_store.get_raw_buffer(
                      self.cfg.n_batches_in_buffer, shuffle=False
                  )
                  shard = self._create_shard(buffer)
@@ -319,7 +318,7 @@ class CacheActivationsRunner:
      def _create_shard(
          self,
          buffer: tuple[
-             Float[torch.Tensor, "(bs context_size) num_layers d_in"],
+             Float[torch.Tensor, "(bs context_size) d_in"],
              Int[torch.Tensor, "(bs context_size)"] | None,
          ],
      ) -> Dataset:
@@ -327,13 +326,15 @@ class CacheActivationsRunner:
          acts, token_ids = buffer
          acts = einops.rearrange(
              acts,
-             "(bs context_size) num_layers d_in -> num_layers bs context_size d_in",
+             "(bs context_size) d_in -> bs context_size d_in",
              bs=self.cfg.n_seq_in_buffer,
              context_size=self.context_size,
              d_in=self.cfg.d_in,
-             num_layers=len(hook_names),
          )
-         shard_dict = {hook_name: act for hook_name, act in zip(hook_names, acts)}
+         shard_dict: dict[str, object] = {
+             hook_name: act_batch
+             for hook_name, act_batch in zip(hook_names, [acts], strict=True)
+         }

          if token_ids is not None:
              token_ids = einops.rearrange(
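`_create_shard` now assumes a single hook: the `num_layers` axis disappears from both the buffer type and the rearrange pattern. A self-contained sketch of the new reshape with made-up sizes, just to make the shape contract concrete:

    # Sketch only: the single-hook reshape now used when building a shard.
    # The sizes below are arbitrary examples.
    import einops
    import torch

    n_seq_in_buffer, context_size, d_in = 4, 8, 16
    acts = torch.randn(n_seq_in_buffer * context_size, d_in)  # "(bs context_size) d_in"

    acts = einops.rearrange(
        acts,
        "(bs context_size) d_in -> bs context_size d_in",
        bs=n_seq_in_buffer,
        context_size=context_size,
        d_in=d_in,
    )
    assert acts.shape == (n_seq_in_buffer, context_size, d_in)
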
sae_lens/config.py CHANGED
@@ -23,7 +23,9 @@ from sae_lens.saes.sae import TrainingSAEConfig
  if TYPE_CHECKING:
      pass

- T_TRAINING_SAE_CONFIG = TypeVar("T_TRAINING_SAE_CONFIG", bound=TrainingSAEConfig)
+ T_TRAINING_SAE_CONFIG = TypeVar(
+     "T_TRAINING_SAE_CONFIG", bound=TrainingSAEConfig, covariant=True
+ )

  HfDataset = DatasetDict | Dataset | IterableDatasetDict | IterableDataset

@@ -102,7 +104,6 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
          model_class_name (str): The name of the class of the model to use. This should be either `HookedTransformer` or `HookedMamba`.
          hook_name (str): The name of the hook to use. This should be a valid TransformerLens hook.
          hook_eval (str): NOT CURRENTLY IN USE. The name of the hook to use for evaluation.
-         hook_layer (int): The index of the layer to hook. Used to stop forward passes early and speed up processing.
          hook_head_index (int, optional): When the hook is for an activation with a head index, we can specify a specific head to use here.
          dataset_path (str): A Hugging Face dataset path.
          dataset_trust_remote_code (bool): Whether to trust remote code when loading datasets from Huggingface.
@@ -159,7 +160,6 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
      model_class_name: str = "HookedTransformer"
      hook_name: str = "blocks.0.hook_mlp_out"
      hook_eval: str = "NOT_IN_USE"
-     hook_layer: int = 0
      hook_head_index: int | None = None
      dataset_path: str = ""
      dataset_trust_remote_code: bool = True
@@ -201,7 +201,7 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
      train_batch_size_tokens: int = 4096

      ## Adam
-     adam_beta1: float = 0.0
+     adam_beta1: float = 0.9
      adam_beta2: float = 0.999

      ## Learning Rate Schedule
@@ -375,6 +375,27 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):

          return cls(**cfg)

+     def to_sae_trainer_config(self) -> "SAETrainerConfig":
+         return SAETrainerConfig(
+             n_checkpoints=self.n_checkpoints,
+             checkpoint_path=self.checkpoint_path,
+             total_training_samples=self.total_training_tokens,
+             device=self.device,
+             autocast=self.autocast,
+             lr=self.lr,
+             lr_end=self.lr_end,
+             lr_scheduler_name=self.lr_scheduler_name,
+             lr_warm_up_steps=self.lr_warm_up_steps,
+             adam_beta1=self.adam_beta1,
+             adam_beta2=self.adam_beta2,
+             lr_decay_steps=self.lr_decay_steps,
+             n_restart_cycles=self.n_restart_cycles,
+             train_batch_size_samples=self.train_batch_size_tokens,
+             dead_feature_window=self.dead_feature_window,
+             feature_sampling_window=self.feature_sampling_window,
+             logger=self.logger,
+         )
+

  @dataclass
  class CacheActivationsRunnerConfig:
@@ -386,7 +407,6 @@ class CacheActivationsRunnerConfig:
          model_name (str): The name of the model to use.
          model_batch_size (int): How many prompts are in the batch of the language model when generating activations.
          hook_name (str): The name of the hook to use.
-         hook_layer (int): The layer of the final hook. Currently only support a single hook, so this should be the same as hook_name.
          d_in (int): Dimension of the model.
          total_training_tokens (int): Total number of tokens to process.
          context_size (int): Context size to process. Can be left as -1 if the dataset is tokenized.
@@ -416,7 +436,6 @@ class CacheActivationsRunnerConfig:
      model_name: str
      model_batch_size: int
      hook_name: str
-     hook_layer: int
      d_in: int
      training_tokens: int

@@ -576,3 +595,28 @@ class PretokenizeRunnerConfig:
      hf_num_shards: int = 64
      hf_revision: str = "main"
      hf_is_private_repo: bool = False
+
+
+ @dataclass
+ class SAETrainerConfig:
+     n_checkpoints: int
+     checkpoint_path: str
+     total_training_samples: int
+     device: str
+     autocast: bool
+     lr: float
+     lr_end: float | None
+     lr_scheduler_name: str
+     lr_warm_up_steps: int
+     adam_beta1: float
+     adam_beta2: float
+     lr_decay_steps: int
+     n_restart_cycles: int
+     train_batch_size_samples: int
+     dead_feature_window: int
+     feature_sampling_window: int
+     logger: LoggingConfig
+
+     @property
+     def total_training_steps(self) -> int:
+         return self.total_training_samples // self.train_batch_size_samples
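The new `SAETrainerConfig` collects the optimizer and scheduler fields that `LanguageModelSAERunnerConfig.to_sae_trainer_config()` copies over, mapping `total_training_tokens` onto `total_training_samples` and `train_batch_size_tokens` onto `train_batch_size_samples`. The derived step count is a plain floor division; a worked example with illustrative numbers:

    # Sketch only: the step count implied by SAETrainerConfig.total_training_steps.
    total_training_samples = 2_000_000  # illustrative value
    train_batch_size_samples = 4096     # illustrative value

    total_training_steps = total_training_samples // train_batch_size_samples
    print(total_training_steps)  # 488
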
sae_lens/constants.py CHANGED
@@ -16,3 +16,5 @@ SPARSITY_FILENAME = "sparsity.safetensors"
  SAE_WEIGHTS_FILENAME = "sae_weights.safetensors"
  SAE_CFG_FILENAME = "cfg.json"
  RUNNER_CFG_FILENAME = "runner_cfg.json"
+ ACTIVATIONS_STORE_STATE_FILENAME = "activations_store_state.safetensors"
+ ACTIVATION_SCALER_CFG_FILENAME = "activation_scaler.json"
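Two filenames are added for checkpoint artifacts: the activations-store state and the activation-scaler config. A small sketch of joining them onto a checkpoint directory (the directory name is made up, not taken from this diff):

    # Sketch only: building paths to the new checkpoint artifacts.
    import os

    from sae_lens.constants import (
        ACTIVATION_SCALER_CFG_FILENAME,
        ACTIVATIONS_STORE_STATE_FILENAME,
    )

    checkpoint_dir = "checkpoints/my_run"  # hypothetical path
    print(os.path.join(checkpoint_dir, ACTIVATIONS_STORE_STATE_FILENAME))
    print(os.path.join(checkpoint_dir, ACTIVATION_SCALER_CFG_FILENAME))
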
sae_lens/evals.py CHANGED
@@ -4,6 +4,7 @@ import json
  import math
  import re
  import subprocess
+ import sys
  from collections import defaultdict
  from collections.abc import Mapping
  from dataclasses import dataclass, field
@@ -15,13 +16,15 @@ from typing import Any
  import einops
  import pandas as pd
  import torch
- from tqdm import tqdm
+ from tqdm.auto import tqdm
  from transformer_lens import HookedTransformer
  from transformer_lens.hook_points import HookedRootModule

  from sae_lens.loading.pretrained_saes_directory import get_pretrained_saes_directory
  from sae_lens.saes.sae import SAE, SAEConfig
+ from sae_lens.training.activation_scaler import ActivationScaler
  from sae_lens.training.activations_store import ActivationsStore
+ from sae_lens.util import extract_stop_at_layer_from_tlens_hook_name


  def get_library_version() -> str:
@@ -103,6 +106,7 @@ def run_evals(
      sae: SAE[Any],
      activation_store: ActivationsStore,
      model: HookedRootModule,
+     activation_scaler: ActivationScaler,
      eval_config: EvalConfig = EvalConfig(),
      model_kwargs: Mapping[str, Any] = {},
      ignore_tokens: set[int | None] = set(),
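`run_evals` now requires an `ActivationScaler` alongside the SAE, activation store, and model. A hedged call sketch, assuming the remaining keyword arguments keep their defaults and that the inputs are constructed elsewhere:

    # Sketch only: calling run_evals with the new required activation_scaler.
    from typing import Any

    from transformer_lens.hook_points import HookedRootModule

    from sae_lens.evals import run_evals
    from sae_lens.saes.sae import SAE
    from sae_lens.training.activation_scaler import ActivationScaler
    from sae_lens.training.activations_store import ActivationsStore


    def evaluate(
        sae: SAE[Any],
        activation_store: ActivationsStore,
        model: HookedRootModule,
    ):
        # A fresh ActivationScaler() mirrors how multiple_evals constructs one below.
        return run_evals(
            sae=sae,
            activation_store=activation_store,
            model=model,
            activation_scaler=ActivationScaler(),
        )
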
@@ -140,6 +144,7 @@ def run_evals(
          sae,
          model,
          activation_store,
+         activation_scaler,
          compute_kl=eval_config.compute_kl,
          compute_ce_loss=eval_config.compute_ce_loss,
          n_batches=eval_config.n_eval_reconstruction_batches,
@@ -189,6 +194,7 @@ def run_evals(
          sae,
          model,
          activation_store,
+         activation_scaler,
          compute_l2_norms=eval_config.compute_l2_norms,
          compute_sparsity_metrics=eval_config.compute_sparsity_metrics,
          compute_variance_metrics=eval_config.compute_variance_metrics,
@@ -301,6 +307,7 @@ def get_downstream_reconstruction_metrics(
      sae: SAE[Any],
      model: HookedRootModule,
      activation_store: ActivationsStore,
+     activation_scaler: ActivationScaler,
      compute_kl: bool,
      compute_ce_loss: bool,
      n_batches: int,
@@ -326,8 +333,8 @@ def get_downstream_reconstruction_metrics(
          for metric_name, metric_value in get_recons_loss(
              sae,
              model,
+             activation_scaler,
              batch_tokens,
-             activation_store,
              compute_kl=compute_kl,
              compute_ce_loss=compute_ce_loss,
              ignore_tokens=ignore_tokens,
@@ -369,6 +376,7 @@ def get_sparsity_and_variance_metrics(
      sae: SAE[Any],
      model: HookedRootModule,
      activation_store: ActivationsStore,
+     activation_scaler: ActivationScaler,
      n_batches: int,
      compute_l2_norms: bool,
      compute_sparsity_metrics: bool,
@@ -436,7 +444,7 @@ def get_sparsity_and_variance_metrics(
              batch_tokens,
              prepend_bos=False,
              names_filter=[hook_name],
-             stop_at_layer=sae.cfg.metadata.hook_layer + 1,
+             stop_at_layer=extract_stop_at_layer_from_tlens_hook_name(hook_name),
              **model_kwargs,
          )

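With `hook_layer` gone from the configs, the early-stop layer for `run_with_cache` is derived from the hook name via `extract_stop_at_layer_from_tlens_hook_name`, whose implementation is not shown in this diff. A rough sketch of the likely idea (parse the block index out of a TransformerLens hook name and stop one layer later), not sae_lens's actual code:

    # Rough sketch of the idea only -- not sae_lens's actual implementation.
    import re


    def stop_at_layer_from_hook_name(hook_name: str) -> int | None:
        """Return block_index + 1 for hook names like 'blocks.8.hook_resid_pre'."""
        match = re.search(r"blocks\.(\d+)\.", hook_name)
        return int(match.group(1)) + 1 if match else None


    print(stop_at_layer_from_hook_name("blocks.8.hook_resid_pre"))  # 9
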
@@ -451,16 +459,14 @@ def get_sparsity_and_variance_metrics(
          original_act = cache[hook_name]

          # normalise if necessary (necessary in training only, otherwise we should fold the scaling in)
-         if activation_store.normalize_activations == "expected_average_only_in":
-             original_act = activation_store.apply_norm_scaling_factor(original_act)
+         original_act = activation_scaler.scale(original_act)

          # send the (maybe normalised) activations into the SAE
          sae_feature_activations = sae.encode(original_act.to(sae.device))
          sae_out = sae.decode(sae_feature_activations).to(original_act.device)
          del cache

-         if activation_store.normalize_activations == "expected_average_only_in":
-             sae_out = activation_store.unscale(sae_out)
+         sae_out = activation_scaler.unscale(sae_out)

          flattened_sae_input = einops.rearrange(original_act, "b ctx d -> (b ctx) d")
          flattened_sae_feature_acts = einops.rearrange(
@@ -582,8 +588,8 @@ def get_sparsity_and_variance_metrics(
  def get_recons_loss(
      sae: SAE[SAEConfig],
      model: HookedRootModule,
+     activation_scaler: ActivationScaler,
      batch_tokens: torch.Tensor,
-     activation_store: ActivationsStore,
      compute_kl: bool,
      compute_ce_loss: bool,
      ignore_tokens: set[int | None] = set(),
@@ -618,15 +624,13 @@ def get_recons_loss(
          activations = activations.to(sae.device)

          # Handle rescaling if SAE expects it
-         if activation_store.normalize_activations == "expected_average_only_in":
-             activations = activation_store.apply_norm_scaling_factor(activations)
+         activations = activation_scaler.scale(activations)

          # SAE class agnost forward forward pass.
          new_activations = sae.decode(sae.encode(activations)).to(activations.dtype)

          # Unscale if activations were scaled prior to going into the SAE
-         if activation_store.normalize_activations == "expected_average_only_in":
-             new_activations = activation_store.unscale(new_activations)
+         new_activations = activation_scaler.unscale(new_activations)

          new_activations = torch.where(mask[..., None], new_activations, activations)

@@ -637,8 +641,7 @@ def get_recons_loss(
          activations = activations.to(sae.device)

          # Handle rescaling if SAE expects it
-         if activation_store.normalize_activations == "expected_average_only_in":
-             activations = activation_store.apply_norm_scaling_factor(activations)
+         activations = activation_scaler.scale(activations)

          # SAE class agnost forward forward pass.
          new_activations = sae.decode(sae.encode(activations.flatten(-2, -1))).to(
@@ -650,8 +653,7 @@ def get_recons_loss(
          )  # reshape to match original shape

          # Unscale if activations were scaled prior to going into the SAE
-         if activation_store.normalize_activations == "expected_average_only_in":
-             new_activations = activation_store.unscale(new_activations)
+         new_activations = activation_scaler.unscale(new_activations)

          return new_activations.to(original_device)

@@ -660,8 +662,7 @@ def get_recons_loss(
          activations = activations.to(sae.device)

          # Handle rescaling if SAE expects it
-         if activation_store.normalize_activations == "expected_average_only_in":
-             activations = activation_store.apply_norm_scaling_factor(activations)
+         activations = activation_scaler.scale(activations)

          new_activations = sae.decode(sae.encode(activations[:, :, head_index])).to(
              activations.dtype
@@ -669,8 +670,7 @@ def get_recons_loss(
          activations[:, :, head_index] = new_activations

          # Unscale if activations were scaled prior to going into the SAE
-         if activation_store.normalize_activations == "expected_average_only_in":
-             activations = activation_store.unscale(activations)
+         activations = activation_scaler.unscale(activations)

          return activations.to(original_device)

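Across the reconstruction hooks above, the conditional `normalize_activations` checks are replaced by unconditional `activation_scaler.scale(...)` / `unscale(...)` calls around the SAE forward pass. A minimal sketch of that wrapping pattern, assuming an `SAE` and an `ActivationScaler` are already available:

    # Sketch only: the scale -> encode/decode -> unscale pattern used in the hooks above.
    import torch

    from sae_lens.saes.sae import SAE, SAEConfig
    from sae_lens.training.activation_scaler import ActivationScaler


    def reconstruct(
        sae: SAE[SAEConfig],
        activation_scaler: ActivationScaler,
        activations: torch.Tensor,
    ) -> torch.Tensor:
        scaled = activation_scaler.scale(activations)
        sae_out = sae.decode(sae.encode(scaled)).to(scaled.dtype)
        return activation_scaler.unscale(sae_out)
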
@@ -815,16 +815,18 @@ def multiple_evals(
              release=sae_release_name,  # see other options in sae_lens/pretrained_saes.yaml
              sae_id=sae_id,  # won't always be a hook point
              device=device,
-         )[0]
+         )

          # move SAE to device if not there already
          sae.to(device)

-         if current_model_str != sae.cfg.model_name:
+         if current_model_str != sae.cfg.metadata.model_name:
              del current_model  # potentially saves GPU memory
-             current_model_str = sae.cfg.model_name
+             current_model_str = sae.cfg.metadata.model_name
              current_model = HookedTransformer.from_pretrained_no_processing(
-                 current_model_str, device=device, **sae.cfg.model_from_pretrained_kwargs
+                 current_model_str,
+                 device=device,
+                 **sae.cfg.metadata.model_from_pretrained_kwargs,
              )
          assert current_model is not None

@@ -849,6 +851,7 @@ def multiple_evals(
              scalar_metrics, feature_metrics = run_evals(
                  sae=sae,
                  activation_store=activation_store,
+                 activation_scaler=ActivationScaler(),
                  model=current_model,
                  eval_config=eval_config,
                  ignore_tokens={
@@ -941,7 +944,7 @@ def process_results(
      }


- if __name__ == "__main__":
+ def process_args(args: list[str]) -> argparse.Namespace:
      arg_parser = argparse.ArgumentParser(description="Run evaluations on SAEs")
      arg_parser.add_argument(
          "sae_regex_pattern",
@@ -1031,11 +1034,19 @@ if __name__ == "__main__":
          help="Enable verbose output with tqdm loaders.",
      )

-     args = arg_parser.parse_args()
-     eval_results = run_evaluations(args)
-     output_files = process_results(eval_results, args.output_dir)
+     return arg_parser.parse_args(args)
+
+
+ def run_evals_cli(args: list[str]) -> None:
+     opts = process_args(args)
+     eval_results = run_evaluations(opts)
+     output_files = process_results(eval_results, opts.output_dir)

      print("Evaluation complete. Output files:")
      print(f"Individual JSONs: {len(output_files['individual_jsons'])}")  # type: ignore
      print(f"Combined JSON: {output_files['combined_json']}")
      print(f"CSV: {output_files['csv']}")
+
+
+ if __name__ == "__main__":
+     run_evals_cli(sys.argv[1:])