sae-lens 6.0.0rc2__py3-none-any.whl → 6.0.0rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sae_lens/__init__.py +6 -3
- sae_lens/cache_activations_runner.py +7 -6
- sae_lens/config.py +47 -5
- sae_lens/constants.py +2 -0
- sae_lens/evals.py +19 -19
- sae_lens/{sae_training_runner.py → llm_sae_training_runner.py} +92 -60
- sae_lens/load_model.py +53 -5
- sae_lens/loading/pretrained_sae_loaders.py +0 -7
- sae_lens/saes/sae.py +0 -3
- sae_lens/training/activation_scaler.py +53 -0
- sae_lens/training/activations_store.py +77 -172
- sae_lens/training/mixing_buffer.py +56 -0
- sae_lens/training/sae_trainer.py +96 -95
- sae_lens/training/types.py +5 -0
- sae_lens/util.py +19 -0
- {sae_lens-6.0.0rc2.dist-info → sae_lens-6.0.0rc3.dist-info}/METADATA +1 -1
- {sae_lens-6.0.0rc2.dist-info → sae_lens-6.0.0rc3.dist-info}/RECORD +19 -16
- {sae_lens-6.0.0rc2.dist-info → sae_lens-6.0.0rc3.dist-info}/LICENSE +0 -0
- {sae_lens-6.0.0rc2.dist-info → sae_lens-6.0.0rc3.dist-info}/WHEEL +0 -0
sae_lens/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.0.0-rc.2"
+__version__ = "6.0.0-rc.3"

 import logging

@@ -33,16 +33,17 @@ from .cache_activations_runner import CacheActivationsRunner
 from .config import (
     CacheActivationsRunnerConfig,
     LanguageModelSAERunnerConfig,
+    LoggingConfig,
     PretokenizeRunnerConfig,
 )
 from .evals import run_evals
+from .llm_sae_training_runner import LanguageModelSAETrainingRunner, SAETrainingRunner
 from .loading.pretrained_sae_loaders import (
     PretrainedSaeDiskLoader,
     PretrainedSaeHuggingfaceLoader,
 )
 from .pretokenize_runner import PretokenizeRunner, pretokenize_runner
 from .registry import register_sae_class, register_sae_training_class
-from .sae_training_runner import SAETrainingRunner
 from .training.activations_store import ActivationsStore
 from .training.upload_saes_to_huggingface import upload_saes_to_huggingface

@@ -54,7 +55,7 @@ __all__ = [
     "HookedSAETransformer",
     "ActivationsStore",
     "LanguageModelSAERunnerConfig",
-    "SAETrainingRunner",
+    "LanguageModelSAETrainingRunner",
     "CacheActivationsRunnerConfig",
     "CacheActivationsRunner",
     "PretokenizeRunnerConfig",
@@ -82,6 +83,8 @@ __all__ = [
     "JumpReLUSAEConfig",
     "JumpReLUTrainingSAE",
     "JumpReLUTrainingSAEConfig",
+    "SAETrainingRunner",
+    "LoggingConfig",
 ]

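Taken together, the `__init__.py` changes re-export the renamed runner while keeping the old name importable. A minimal sketch of the new import surface (assumes sae-lens 6.0.0rc3 is installed):

```python
# Assumes sae-lens 6.0.0rc3 is installed.
from sae_lens import (
    LanguageModelSAETrainingRunner,
    LoggingConfig,
    SAETrainingRunner,
)

# The old name is now a thin, deprecated subclass of the renamed runner
# (see llm_sae_training_runner.py below), so existing imports keep working.
assert issubclass(SAETrainingRunner, LanguageModelSAETrainingRunner)
print(LoggingConfig)  # newly re-exported alongside the other config classes
```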
sae_lens/cache_activations_runner.py
CHANGED

@@ -34,7 +34,6 @@ def _mk_activations_store(
         dataset=override_dataset or cfg.dataset_path,
         streaming=cfg.streaming,
         hook_name=cfg.hook_name,
-        hook_layer=cfg.hook_layer,
         hook_head_index=None,
         context_size=cfg.context_size,
         d_in=cfg.d_in,
@@ -265,7 +264,7 @@ class CacheActivationsRunner:

         for i in tqdm(range(self.cfg.n_buffers), desc="Caching activations"):
             try:
-                buffer = self.activations_store.
+                buffer = self.activations_store.get_raw_buffer(
                     self.cfg.n_batches_in_buffer, shuffle=False
                 )
                 shard = self._create_shard(buffer)
@@ -319,7 +318,7 @@ class CacheActivationsRunner:
     def _create_shard(
         self,
         buffer: tuple[
-            Float[torch.Tensor, "(bs context_size)
+            Float[torch.Tensor, "(bs context_size) d_in"],
             Int[torch.Tensor, "(bs context_size)"] | None,
         ],
     ) -> Dataset:
@@ -327,13 +326,15 @@
         acts, token_ids = buffer
         acts = einops.rearrange(
             acts,
-            "(bs context_size)
+            "(bs context_size) d_in -> bs context_size d_in",
             bs=self.cfg.n_seq_in_buffer,
             context_size=self.context_size,
             d_in=self.cfg.d_in,
-            num_layers=len(hook_names),
         )
-        shard_dict
+        shard_dict: dict[str, object] = {
+            hook_name: act_batch
+            for hook_name, act_batch in zip(hook_names, [acts], strict=True)
+        }

         if token_ids is not None:
             token_ids = einops.rearrange(
sae_lens/config.py
CHANGED
@@ -23,7 +23,9 @@ from sae_lens.saes.sae import TrainingSAEConfig
 if TYPE_CHECKING:
     pass

-T_TRAINING_SAE_CONFIG = TypeVar(
+T_TRAINING_SAE_CONFIG = TypeVar(
+    "T_TRAINING_SAE_CONFIG", bound=TrainingSAEConfig, covariant=True
+)

 HfDataset = DatasetDict | Dataset | IterableDatasetDict | IterableDataset

@@ -102,7 +104,6 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
         model_class_name (str): The name of the class of the model to use. This should be either `HookedTransformer` or `HookedMamba`.
         hook_name (str): The name of the hook to use. This should be a valid TransformerLens hook.
         hook_eval (str): NOT CURRENTLY IN USE. The name of the hook to use for evaluation.
-        hook_layer (int): The index of the layer to hook. Used to stop forward passes early and speed up processing.
         hook_head_index (int, optional): When the hook is for an activation with a head index, we can specify a specific head to use here.
         dataset_path (str): A Hugging Face dataset path.
         dataset_trust_remote_code (bool): Whether to trust remote code when loading datasets from Huggingface.
@@ -159,7 +160,6 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
     model_class_name: str = "HookedTransformer"
     hook_name: str = "blocks.0.hook_mlp_out"
     hook_eval: str = "NOT_IN_USE"
-    hook_layer: int = 0
     hook_head_index: int | None = None
     dataset_path: str = ""
     dataset_trust_remote_code: bool = True
@@ -375,6 +375,28 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):

         return cls(**cfg)

+    def to_sae_trainer_config(self) -> "SAETrainerConfig":
+        return SAETrainerConfig(
+            n_checkpoints=self.n_checkpoints,
+            checkpoint_path=self.checkpoint_path,
+            total_training_samples=self.total_training_tokens,
+            device=self.device,
+            autocast=self.autocast,
+            lr=self.lr,
+            lr_end=self.lr_end,
+            lr_scheduler_name=self.lr_scheduler_name,
+            lr_warm_up_steps=self.lr_warm_up_steps,
+            adam_beta1=self.adam_beta1,
+            adam_beta2=self.adam_beta2,
+            lr_decay_steps=self.lr_decay_steps,
+            n_restart_cycles=self.n_restart_cycles,
+            total_training_steps=self.total_training_steps,
+            train_batch_size_samples=self.train_batch_size_tokens,
+            dead_feature_window=self.dead_feature_window,
+            feature_sampling_window=self.feature_sampling_window,
+            logger=self.logger,
+        )
+

 @dataclass
 class CacheActivationsRunnerConfig:
@@ -386,7 +408,6 @@ class CacheActivationsRunnerConfig:
         model_name (str): The name of the model to use.
         model_batch_size (int): How many prompts are in the batch of the language model when generating activations.
         hook_name (str): The name of the hook to use.
-        hook_layer (int): The layer of the final hook. Currently only support a single hook, so this should be the same as hook_name.
         d_in (int): Dimension of the model.
         total_training_tokens (int): Total number of tokens to process.
         context_size (int): Context size to process. Can be left as -1 if the dataset is tokenized.
@@ -416,7 +437,6 @@ class CacheActivationsRunnerConfig:
     model_name: str
     model_batch_size: int
     hook_name: str
-    hook_layer: int
     d_in: int
     training_tokens: int

@@ -576,3 +596,25 @@ class PretokenizeRunnerConfig:
     hf_num_shards: int = 64
     hf_revision: str = "main"
     hf_is_private_repo: bool = False
+
+
+@dataclass
+class SAETrainerConfig:
+    n_checkpoints: int
+    checkpoint_path: str
+    total_training_samples: int
+    device: str
+    autocast: bool
+    lr: float
+    lr_end: float | None
+    lr_scheduler_name: str
+    lr_warm_up_steps: int
+    adam_beta1: float
+    adam_beta2: float
+    lr_decay_steps: int
+    n_restart_cycles: int
+    total_training_steps: int
+    train_batch_size_samples: int
+    dead_feature_window: int
+    feature_sampling_window: int
+    logger: LoggingConfig
sae_lens/constants.py
CHANGED
@@ -16,3 +16,5 @@ SPARSITY_FILENAME = "sparsity.safetensors"
 SAE_WEIGHTS_FILENAME = "sae_weights.safetensors"
 SAE_CFG_FILENAME = "cfg.json"
 RUNNER_CFG_FILENAME = "runner_cfg.json"
+ACTIVATIONS_STORE_STATE_FILENAME = "activations_store_state.safetensors"
+ACTIVATION_SCALER_CFG_FILENAME = "activation_scaler.json"
sae_lens/evals.py
CHANGED
@@ -21,7 +21,9 @@ from transformer_lens.hook_points import HookedRootModule

 from sae_lens.loading.pretrained_saes_directory import get_pretrained_saes_directory
 from sae_lens.saes.sae import SAE, SAEConfig
+from sae_lens.training.activation_scaler import ActivationScaler
 from sae_lens.training.activations_store import ActivationsStore
+from sae_lens.util import extract_stop_at_layer_from_tlens_hook_name


 def get_library_version() -> str:
@@ -103,6 +105,7 @@ def run_evals(
     sae: SAE[Any],
     activation_store: ActivationsStore,
     model: HookedRootModule,
+    activation_scaler: ActivationScaler,
     eval_config: EvalConfig = EvalConfig(),
     model_kwargs: Mapping[str, Any] = {},
     ignore_tokens: set[int | None] = set(),
@@ -140,6 +143,7 @@
         sae,
         model,
         activation_store,
+        activation_scaler,
         compute_kl=eval_config.compute_kl,
         compute_ce_loss=eval_config.compute_ce_loss,
         n_batches=eval_config.n_eval_reconstruction_batches,
@@ -189,6 +193,7 @@
         sae,
         model,
         activation_store,
+        activation_scaler,
         compute_l2_norms=eval_config.compute_l2_norms,
         compute_sparsity_metrics=eval_config.compute_sparsity_metrics,
         compute_variance_metrics=eval_config.compute_variance_metrics,
@@ -301,6 +306,7 @@ def get_downstream_reconstruction_metrics(
     sae: SAE[Any],
     model: HookedRootModule,
     activation_store: ActivationsStore,
+    activation_scaler: ActivationScaler,
     compute_kl: bool,
     compute_ce_loss: bool,
     n_batches: int,
@@ -326,8 +332,8 @@
     for metric_name, metric_value in get_recons_loss(
         sae,
         model,
+        activation_scaler,
         batch_tokens,
-        activation_store,
         compute_kl=compute_kl,
         compute_ce_loss=compute_ce_loss,
         ignore_tokens=ignore_tokens,
@@ -369,6 +375,7 @@ def get_sparsity_and_variance_metrics(
     sae: SAE[Any],
     model: HookedRootModule,
     activation_store: ActivationsStore,
+    activation_scaler: ActivationScaler,
     n_batches: int,
     compute_l2_norms: bool,
     compute_sparsity_metrics: bool,
@@ -436,7 +443,7 @@
             batch_tokens,
             prepend_bos=False,
             names_filter=[hook_name],
-            stop_at_layer=
+            stop_at_layer=extract_stop_at_layer_from_tlens_hook_name(hook_name),
             **model_kwargs,
         )

@@ -451,16 +458,14 @@
         original_act = cache[hook_name]

         # normalise if necessary (necessary in training only, otherwise we should fold the scaling in)
-
-        original_act = activation_store.apply_norm_scaling_factor(original_act)
+        original_act = activation_scaler.scale(original_act)

         # send the (maybe normalised) activations into the SAE
         sae_feature_activations = sae.encode(original_act.to(sae.device))
         sae_out = sae.decode(sae_feature_activations).to(original_act.device)
         del cache

-
-        sae_out = activation_store.unscale(sae_out)
+        sae_out = activation_scaler.unscale(sae_out)

         flattened_sae_input = einops.rearrange(original_act, "b ctx d -> (b ctx) d")
         flattened_sae_feature_acts = einops.rearrange(
@@ -582,8 +587,8 @@
 def get_recons_loss(
     sae: SAE[SAEConfig],
     model: HookedRootModule,
+    activation_scaler: ActivationScaler,
     batch_tokens: torch.Tensor,
-    activation_store: ActivationsStore,
     compute_kl: bool,
     compute_ce_loss: bool,
     ignore_tokens: set[int | None] = set(),
@@ -618,15 +623,13 @@
         activations = activations.to(sae.device)

         # Handle rescaling if SAE expects it
-
-        activations = activation_store.apply_norm_scaling_factor(activations)
+        activations = activation_scaler.scale(activations)

         # SAE class agnost forward forward pass.
         new_activations = sae.decode(sae.encode(activations)).to(activations.dtype)

         # Unscale if activations were scaled prior to going into the SAE
-
-        new_activations = activation_store.unscale(new_activations)
+        new_activations = activation_scaler.unscale(new_activations)

         new_activations = torch.where(mask[..., None], new_activations, activations)

@@ -637,8 +640,7 @@
         activations = activations.to(sae.device)

         # Handle rescaling if SAE expects it
-
-        activations = activation_store.apply_norm_scaling_factor(activations)
+        activations = activation_scaler.scale(activations)

         # SAE class agnost forward forward pass.
         new_activations = sae.decode(sae.encode(activations.flatten(-2, -1))).to(
@@ -650,8 +652,7 @@
         ) # reshape to match original shape

         # Unscale if activations were scaled prior to going into the SAE
-
-        new_activations = activation_store.unscale(new_activations)
+        new_activations = activation_scaler.unscale(new_activations)

         return new_activations.to(original_device)

@@ -660,8 +661,7 @@
         activations = activations.to(sae.device)

         # Handle rescaling if SAE expects it
-
-        activations = activation_store.apply_norm_scaling_factor(activations)
+        activations = activation_scaler.scale(activations)

         new_activations = sae.decode(sae.encode(activations[:, :, head_index])).to(
             activations.dtype
@@ -669,8 +669,7 @@
         activations[:, :, head_index] = new_activations

         # Unscale if activations were scaled prior to going into the SAE
-
-        activations = activation_store.unscale(activations)
+        activations = activation_scaler.unscale(activations)

         return activations.to(original_device)

@@ -849,6 +848,7 @@ def multiple_evals(
         scalar_metrics, feature_metrics = run_evals(
             sae=sae,
             activation_store=activation_store,
+            activation_scaler=ActivationScaler(),
             model=current_model,
             eval_config=eval_config,
             ignore_tokens={
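Throughout evals.py, activation normalization now goes through the new `ActivationScaler` passed in by the caller rather than through `ActivationsStore` (multiple_evals simply passes a default `ActivationScaler()`). The diff only shows the `scale`/`unscale` call sites, so the class below is a hypothetical stand-in that illustrates the round-trip pattern, not the real implementation in `sae_lens/training/activation_scaler.py`:

```python
import torch


class ToyActivationScaler:
    """Hypothetical stand-in for sae-lens' ActivationScaler (internals not shown in this diff)."""

    def __init__(self, scaling_factor: float | None = None):
        self.scaling_factor = scaling_factor

    def scale(self, acts: torch.Tensor) -> torch.Tensor:
        # Acts as a no-op until a scaling factor has been estimated.
        return acts if self.scaling_factor is None else acts * self.scaling_factor

    def unscale(self, acts: torch.Tensor) -> torch.Tensor:
        return acts if self.scaling_factor is None else acts / self.scaling_factor


scaler = ToyActivationScaler(scaling_factor=2.0)
original = torch.randn(4, 16)
scaled = scaler.scale(original)    # what would be fed to sae.encode(...)
restored = scaler.unscale(scaled)  # applied to the SAE output before computing metrics
assert torch.allclose(restored, original)
```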
sae_lens/{sae_training_runner.py → llm_sae_training_runner.py}
CHANGED

@@ -2,23 +2,31 @@ import json
 import signal
 import sys
 from collections.abc import Sequence
+from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, cast
+from typing import Any, Generic, cast

 import torch
 import wandb
-from safetensors.torch import save_file
 from simple_parsing import ArgumentParser
 from transformer_lens.hook_points import HookedRootModule
+from typing_extensions import deprecated

 from sae_lens import logger
 from sae_lens.config import HfDataset, LanguageModelSAERunnerConfig
-from sae_lens.constants import
+from sae_lens.constants import ACTIVATIONS_STORE_STATE_FILENAME, RUNNER_CFG_FILENAME
+from sae_lens.evals import EvalConfig, run_evals
 from sae_lens.load_model import load_model
-from sae_lens.saes.sae import
+from sae_lens.saes.sae import (
+    T_TRAINING_SAE,
+    T_TRAINING_SAE_CONFIG,
+    TrainingSAE,
+    TrainingSAEConfig,
+)
+from sae_lens.training.activation_scaler import ActivationScaler
 from sae_lens.training.activations_store import ActivationsStore
-from sae_lens.training.geometric_median import compute_geometric_median
 from sae_lens.training.sae_trainer import SAETrainer
+from sae_lens.training.types import DataProvider


 class InterruptedException(Exception):
@@ -29,7 +37,58 @@ def interrupt_callback(sig_num: Any, stack_frame: Any): # noqa: ARG001
     raise InterruptedException()


-
+@dataclass
+class LLMSaeEvaluator(Generic[T_TRAINING_SAE]):
+    model: HookedRootModule
+    activations_store: ActivationsStore
+    eval_batch_size_prompts: int | None
+    n_eval_batches: int
+    model_kwargs: dict[str, Any]
+
+    def __call__(
+        self,
+        sae: T_TRAINING_SAE,
+        data_provider: DataProvider,
+        activation_scaler: ActivationScaler,
+    ) -> dict[str, Any]:
+        ignore_tokens = set()
+        if self.activations_store.exclude_special_tokens is not None:
+            ignore_tokens = set(self.activations_store.exclude_special_tokens.tolist())
+
+        eval_config = EvalConfig(
+            batch_size_prompts=self.eval_batch_size_prompts,
+            n_eval_reconstruction_batches=self.n_eval_batches,
+            n_eval_sparsity_variance_batches=self.n_eval_batches,
+            compute_ce_loss=True,
+            compute_l2_norms=True,
+            compute_sparsity_metrics=True,
+            compute_variance_metrics=True,
+        )
+
+        eval_metrics, _ = run_evals(
+            sae=sae,
+            activation_store=self.activations_store,
+            model=self.model,
+            activation_scaler=activation_scaler,
+            eval_config=eval_config,
+            ignore_tokens=ignore_tokens,
+            model_kwargs=self.model_kwargs,
+        ) # not calculating featurwise metrics here.
+
+        # Remove eval metrics that are already logged during training
+        eval_metrics.pop("metrics/explained_variance", None)
+        eval_metrics.pop("metrics/explained_variance_std", None)
+        eval_metrics.pop("metrics/l0", None)
+        eval_metrics.pop("metrics/l1", None)
+        eval_metrics.pop("metrics/mse", None)
+
+        # Remove metrics that are not useful for wandb logging
+        eval_metrics.pop("metrics/total_tokens_evaluated", None)
+
+        return eval_metrics
+
+
+class LanguageModelSAETrainingRunner:
     """
     Class to run the training of a Sparse Autoencoder (SAE) on a TransformerLens model.
     """
@@ -84,7 +143,6 @@ class SAETrainingRunner:
                     self.cfg.get_training_sae_cfg_dict(),
                 ).to_dict()
             )
-            self._init_sae_group_b_decs()
         else:
             self.sae = override_sae

@@ -102,12 +160,20 @@ class SAETrainingRunner:
             id=self.cfg.logger.wandb_id,
         )

-
+        evaluator = LLMSaeEvaluator(
             model=self.model,
+            activations_store=self.activations_store,
+            eval_batch_size_prompts=self.cfg.eval_batch_size_prompts,
+            n_eval_batches=self.cfg.n_eval_batches,
+            model_kwargs=self.cfg.model_kwargs,
+        )
+
+        trainer = SAETrainer(
             sae=self.sae,
-
+            data_provider=self.activations_store,
+            evaluator=evaluator,
             save_checkpoint_fn=self.save_checkpoint,
-            cfg=self.cfg,
+            cfg=self.cfg.to_sae_trainer_config(),
         )

         self._compile_if_needed()
@@ -156,66 +222,27 @@ class SAETrainingRunner:

         except (KeyboardInterrupt, InterruptedException):
             logger.warning("interrupted, saving progress")
-
-
+            checkpoint_path = Path(self.cfg.checkpoint_path) / str(
+                trainer.n_training_samples
+            )
+            self.save_checkpoint(checkpoint_path)
             logger.info("done saving")
             raise

         return sae

-    # TODO: move this into the SAE trainer or Training SAE class
-    def _init_sae_group_b_decs(
-        self,
-    ) -> None:
-        """
-        extract all activations at a certain layer and use for sae b_dec initialization
-        """
-
-        if self.cfg.sae.b_dec_init_method == "geometric_median":
-            self.activations_store.set_norm_scaling_factor_if_needed()
-            layer_acts = self.activations_store.storage_buffer.detach()[:, 0, :]
-            # get geometric median of the activations if we're using those.
-            median = compute_geometric_median(
-                layer_acts,
-                maxiter=100,
-            ).median
-            self.sae.initialize_b_dec_with_precalculated(median)
-        elif self.cfg.sae.b_dec_init_method == "mean":
-            self.activations_store.set_norm_scaling_factor_if_needed()
-            layer_acts = self.activations_store.storage_buffer.detach().cpu()[:, 0, :]
-            self.sae.initialize_b_dec_with_mean(layer_acts) # type: ignore
-
-    @staticmethod
     def save_checkpoint(
-
-
-        wandb_aliases: list[str] | None = None,
+        self,
+        checkpoint_path: Path,
     ) -> None:
-
-
-
-        trainer.activations_store.save(
-            str(base_path / "activations_store_state.safetensors")
+        self.activations_store.save(
+            str(checkpoint_path / ACTIVATIONS_STORE_STATE_FILENAME)
         )

-
-
-        sparsity_path = base_path / SPARSITY_FILENAME
-        save_file({"sparsity": trainer.log_feature_sparsity}, sparsity_path)
-
-        runner_config = trainer.cfg.to_dict()
-        with open(base_path / RUNNER_CFG_FILENAME, "w") as f:
+        runner_config = self.cfg.to_dict()
+        with open(checkpoint_path / RUNNER_CFG_FILENAME, "w") as f:
             json.dump(runner_config, f)

-        if trainer.cfg.logger.log_to_wandb:
-            trainer.cfg.logger.log(
-                trainer,
-                weights_path,
-                cfg_path,
-                sparsity_path=sparsity_path,
-                wandb_aliases=wandb_aliases,
-            )
-

 def _parse_cfg_args(
     args: Sequence[str],
@@ -230,8 +257,13 @@
 # moved into its own function to make it easier to test
 def _run_cli(args: Sequence[str]):
     cfg = _parse_cfg_args(args)
-
+    LanguageModelSAETrainingRunner(cfg=cfg).run()


 if __name__ == "__main__":
     _run_cli(args=sys.argv[1:])
+
+
+@deprecated("Use LanguageModelSAETrainingRunner instead")
+class SAETrainingRunner(LanguageModelSAETrainingRunner):
+    pass
sae_lens/load_model.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Literal, cast
+from typing import Any, Callable, Literal, cast

 import torch
 from transformer_lens import HookedTransformer
@@ -77,6 +77,7 @@ class HookedProxyLM(HookedRootModule):
     # copied and modified from base HookedRootModule
     def setup(self):
         self.mod_dict = {}
+        self.named_modules_dict = {}
         self.hook_dict: dict[str, HookPoint] = {}
         for name, module in self.model.named_modules():
             if name == "":
@@ -89,14 +90,21 @@

             self.hook_dict[name] = hook_point
             self.mod_dict[name] = hook_point
+            self.named_modules_dict[name] = module
+
+    def run_with_cache(self, *args: Any, **kwargs: Any): # type: ignore
+        if "names_filter" in kwargs:
+            # hacky way to make sure that the names_filter is passed to our forward method
+            kwargs["_names_filter"] = kwargs["names_filter"]
+        return super().run_with_cache(*args, **kwargs)

     def forward(
         self,
         tokens: torch.Tensor,
         return_type: Literal["both", "logits"] = "logits",
         loss_per_token: bool = False,
-        # TODO: implement real support for stop_at_layer
         stop_at_layer: int | None = None,
+        _names_filter: list[str] | None = None,
         **kwargs: Any,
     ) -> Output | Loss:
         # This is just what's needed for evals, not everything that HookedTransformer has
@@ -107,8 +115,28 @@
             raise NotImplementedError(
                 "Only return_type supported is 'both' or 'logits' to match what's in evals.py and ActivationsStore"
             )
-
-
+
+        stop_hooks = []
+        if stop_at_layer is not None and _names_filter is not None:
+            if return_type != "logits":
+                raise NotImplementedError(
+                    "stop_at_layer is not supported for return_type='both'"
+                )
+            stop_manager = StopManager(_names_filter)
+
+            for hook_name in _names_filter:
+                module = self.named_modules_dict[hook_name]
+                stop_fn = stop_manager.get_stop_hook_fn(hook_name)
+                stop_hooks.append(module.register_forward_hook(stop_fn))
+        try:
+            output = self.model(tokens)
+            logits = _extract_logits_from_output(output)
+        except StopForward:
+            # If we stop early, we don't care about the return output
+            return None # type: ignore
+        finally:
+            for stop_hook in stop_hooks:
+                stop_hook.remove()

         if return_type == "logits":
             return logits
@@ -159,7 +187,7 @@

         # We don't want to prepend bos but the tokenizer does it automatically, so we remove it manually
         if hasattr(self.tokenizer, "add_bos_token") and self.tokenizer.add_bos_token: # type: ignore
-            tokens = get_tokens_with_bos_removed(self.tokenizer, tokens)
+            tokens = get_tokens_with_bos_removed(self.tokenizer, tokens) # type: ignore
         return tokens # type: ignore


@@ -183,3 +211,23 @@ def get_hook_fn(hook_point: HookPoint):
         return output

     return hook_fn
+
+
+class StopForward(Exception):
+    pass
+
+
+class StopManager:
+    def __init__(self, hook_names: list[str]):
+        self.hook_names = hook_names
+        self.total_hook_names = len(set(hook_names))
+        self.called_hook_names = set()
+
+    def get_stop_hook_fn(self, hook_name: str) -> Callable[[Any, Any, Any], Any]:
+        def stop_hook_fn(module: Any, input: Any, output: Any) -> Any: # noqa: ARG001
+            self.called_hook_names.add(hook_name)
+            if len(self.called_hook_names) == self.total_hook_names:
+                raise StopForward()
+            return output
+
+        return stop_hook_fn