PyPI - sae-lens - Versions diffs - 5.11.0__py3-none-any.whl → 6.0.0__py3-none-any.whl - Mend

sae-lens 5.11.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

sae_lens/__init__.py +60 -7
sae_lens/analysis/hooked_sae_transformer.py +12 -12
sae_lens/analysis/neuronpedia_integration.py +16 -14
sae_lens/cache_activations_runner.py +9 -7
sae_lens/config.py +170 -258
sae_lens/constants.py +21 -0
sae_lens/evals.py +59 -44
sae_lens/llm_sae_training_runner.py +377 -0
sae_lens/load_model.py +52 -4
sae_lens/{toolkit → loading}/pretrained_sae_loaders.py +85 -32
sae_lens/registry.py +49 -0
sae_lens/saes/__init__.py +48 -0
sae_lens/saes/gated_sae.py +254 -0
sae_lens/saes/jumprelu_sae.py +348 -0
sae_lens/saes/sae.py +1076 -0
sae_lens/saes/standard_sae.py +178 -0
sae_lens/saes/topk_sae.py +300 -0
sae_lens/training/activation_scaler.py +53 -0
sae_lens/training/activations_store.py +103 -184
sae_lens/training/mixing_buffer.py +56 -0
sae_lens/training/optim.py +60 -36
sae_lens/training/sae_trainer.py +155 -177
sae_lens/training/types.py +5 -0
sae_lens/training/upload_saes_to_huggingface.py +13 -7
sae_lens/util.py +47 -0
{sae_lens-5.11.0.dist-info → sae_lens-6.0.0.dist-info}/METADATA +1 -1
sae_lens-6.0.0.dist-info/RECORD +37 -0
sae_lens/sae.py +0 -747
sae_lens/sae_training_runner.py +0 -251
sae_lens/training/geometric_median.py +0 -101
sae_lens/training/training_sae.py +0 -710
sae_lens-5.11.0.dist-info/RECORD +0 -28
/sae_lens/{toolkit → loading}/__init__.py +0 -0
/sae_lens/{toolkit → loading}/pretrained_saes_directory.py +0 -0
{sae_lens-5.11.0.dist-info → sae_lens-6.0.0.dist-info}/LICENSE +0 -0
{sae_lens-5.11.0.dist-info → sae_lens-6.0.0.dist-info}/WHEEL +0 -0

sae_lens/evals.py CHANGED Viewed

@@ -4,6 +4,7 @@ import json
 import math
 import re
 import subprocess
+import sys
 from collections import defaultdict
 from collections.abc import Mapping
 from dataclasses import dataclass, field
@@ -15,13 +16,15 @@ from typing import Any
 import einops
 import pandas as pd
 import torch
-from tqdm import tqdm
+from tqdm.auto import tqdm
 from transformer_lens import HookedTransformer
 from transformer_lens.hook_points import HookedRootModule
-from sae_lens.sae import SAE
-from sae_lens.toolkit.pretrained_saes_directory import get_pretrained_saes_directory
+from sae_lens.loading.pretrained_saes_directory import get_pretrained_saes_directory
+from sae_lens.saes.sae import SAE, SAEConfig
+from sae_lens.training.activation_scaler import ActivationScaler
 from sae_lens.training.activations_store import ActivationsStore
+from sae_lens.util import extract_stop_at_layer_from_tlens_hook_name
 def get_library_version() -> str:
@@ -100,15 +103,16 @@ def get_eval_everything_config(
 @torch.no_grad()
 def run_evals(
-    sae: SAE,
+    sae: SAE[Any],
     activation_store: ActivationsStore,
     model: HookedRootModule,
+    activation_scaler: ActivationScaler,
     eval_config: EvalConfig = EvalConfig(),
     model_kwargs: Mapping[str, Any] = {},
     ignore_tokens: set[int | None] = set(),
     verbose: bool = False,
 ) -> tuple[dict[str, Any], dict[str, Any]]:
-    hook_name = sae.cfg.hook_name
+    hook_name = sae.cfg.metadata.hook_name
     actual_batch_size = (
         eval_config.batch_size_prompts or activation_store.store_batch_size_prompts
     )
@@ -140,6 +144,7 @@ def run_evals(
             sae,
             model,
             activation_store,
+            activation_scaler,
             compute_kl=eval_config.compute_kl,
             compute_ce_loss=eval_config.compute_ce_loss,
             n_batches=eval_config.n_eval_reconstruction_batches,
@@ -189,6 +194,7 @@ def run_evals(
             sae,
             model,
             activation_store,
+            activation_scaler,
             compute_l2_norms=eval_config.compute_l2_norms,
             compute_sparsity_metrics=eval_config.compute_sparsity_metrics,
             compute_variance_metrics=eval_config.compute_variance_metrics,
@@ -274,12 +280,11 @@ def run_evals(
     return all_metrics, feature_metrics
-def get_featurewise_weight_based_metrics(sae: SAE) -> dict[str, Any]:
+def get_featurewise_weight_based_metrics(sae: SAE[Any]) -> dict[str, Any]:
     unit_norm_encoders = (sae.W_enc / sae.W_enc.norm(dim=0, keepdim=True)).cpu()
     unit_norm_decoder = (sae.W_dec.T / sae.W_dec.T.norm(dim=0, keepdim=True)).cpu()
     encoder_norms = sae.W_enc.norm(dim=-2).cpu().tolist()
-    encoder_bias = sae.b_enc.cpu().tolist()
     encoder_decoder_cosine_sim = (
         torch.nn.functional.cosine_similarity(
             unit_norm_decoder.T,
@@ -289,17 +294,20 @@ def get_featurewise_weight_based_metrics(sae: SAE) -> dict[str, Any]:
         .tolist()
     )
-    return {
-        "encoder_bias": encoder_bias,
+    metrics = {
         "encoder_norm": encoder_norms,
         "encoder_decoder_cosine_sim": encoder_decoder_cosine_sim,
     }
+    if hasattr(sae, "b_enc") and sae.b_enc is not None:
+        metrics["encoder_bias"] = sae.b_enc.cpu().tolist()  # type: ignore
+    return metrics
 def get_downstream_reconstruction_metrics(
-    sae: SAE,
+    sae: SAE[Any],
     model: HookedRootModule,
     activation_store: ActivationsStore,
+    activation_scaler: ActivationScaler,
     compute_kl: bool,
     compute_ce_loss: bool,
     n_batches: int,
@@ -325,8 +333,8 @@ def get_downstream_reconstruction_metrics(
         for metric_name, metric_value in get_recons_loss(
             sae,
             model,
+            activation_scaler,
             batch_tokens,
-            activation_store,
             compute_kl=compute_kl,
             compute_ce_loss=compute_ce_loss,
             ignore_tokens=ignore_tokens,
@@ -365,9 +373,10 @@ def get_downstream_reconstruction_metrics(
 def get_sparsity_and_variance_metrics(
-    sae: SAE,
+    sae: SAE[Any],
     model: HookedRootModule,
     activation_store: ActivationsStore,
+    activation_scaler: ActivationScaler,
     n_batches: int,
     compute_l2_norms: bool,
     compute_sparsity_metrics: bool,
@@ -378,8 +387,8 @@ def get_sparsity_and_variance_metrics(
     ignore_tokens: set[int | None] = set(),
     verbose: bool = False,
 ) -> tuple[dict[str, Any], dict[str, Any]]:
-    hook_name = sae.cfg.hook_name
-    hook_head_index = sae.cfg.hook_head_index
+    hook_name = sae.cfg.metadata.hook_name
+    hook_head_index = sae.cfg.metadata.hook_head_index
     metric_dict = {}
     feature_metric_dict = {}
@@ -435,7 +444,7 @@ def get_sparsity_and_variance_metrics(
             batch_tokens,
             prepend_bos=False,
             names_filter=[hook_name],
-            stop_at_layer=sae.cfg.hook_layer + 1,
+            stop_at_layer=extract_stop_at_layer_from_tlens_hook_name(hook_name),
             **model_kwargs,
         )
@@ -450,16 +459,14 @@ def get_sparsity_and_variance_metrics(
             original_act = cache[hook_name]
         # normalise if necessary (necessary in training only, otherwise we should fold the scaling in)
-        if activation_store.normalize_activations == "expected_average_only_in":
-            original_act = activation_store.apply_norm_scaling_factor(original_act)
+        original_act = activation_scaler.scale(original_act)
         # send the (maybe normalised) activations into the SAE
         sae_feature_activations = sae.encode(original_act.to(sae.device))
         sae_out = sae.decode(sae_feature_activations).to(original_act.device)
         del cache
-        if activation_store.normalize_activations == "expected_average_only_in":
-            sae_out = activation_store.unscale(sae_out)
+        sae_out = activation_scaler.unscale(sae_out)
         flattened_sae_input = einops.rearrange(original_act, "b ctx d -> (b ctx) d")
         flattened_sae_feature_acts = einops.rearrange(
@@ -579,17 +586,21 @@ def get_sparsity_and_variance_metrics(
 @torch.no_grad()
 def get_recons_loss(
-    sae: SAE,
+    sae: SAE[SAEConfig],
     model: HookedRootModule,
+    activation_scaler: ActivationScaler,
     batch_tokens: torch.Tensor,
-    activation_store: ActivationsStore,
     compute_kl: bool,
     compute_ce_loss: bool,
     ignore_tokens: set[int | None] = set(),
     model_kwargs: Mapping[str, Any] = {},
+    hook_name: str | None = None,
 ) -> dict[str, Any]:
-    hook_name = sae.cfg.hook_name
-    head_index = sae.cfg.hook_head_index
+    hook_name = hook_name or sae.cfg.metadata.hook_name
+    head_index = sae.cfg.metadata.hook_head_index
+    if hook_name is None:
+        raise ValueError("hook_name must be provided")
     original_logits, original_ce_loss = model(
         batch_tokens, return_type="both", loss_per_token=True, **model_kwargs
@@ -613,15 +624,13 @@ def get_recons_loss(
         activations = activations.to(sae.device)
         # Handle rescaling if SAE expects it
-        if activation_store.normalize_activations == "expected_average_only_in":
-            activations = activation_store.apply_norm_scaling_factor(activations)
+        activations = activation_scaler.scale(activations)
         # SAE class agnost forward forward pass.
         new_activations = sae.decode(sae.encode(activations)).to(activations.dtype)
         # Unscale if activations were scaled prior to going into the SAE
-        if activation_store.normalize_activations == "expected_average_only_in":
-            new_activations = activation_store.unscale(new_activations)
+        new_activations = activation_scaler.unscale(new_activations)
         new_activations = torch.where(mask[..., None], new_activations, activations)
@@ -632,8 +641,7 @@ def get_recons_loss(
         activations = activations.to(sae.device)
         # Handle rescaling if SAE expects it
-        if activation_store.normalize_activations == "expected_average_only_in":
-            activations = activation_store.apply_norm_scaling_factor(activations)
+        activations = activation_scaler.scale(activations)
         # SAE class agnost forward forward pass.
         new_activations = sae.decode(sae.encode(activations.flatten(-2, -1))).to(
@@ -645,8 +653,7 @@ def get_recons_loss(
         )  # reshape to match original shape
         # Unscale if activations were scaled prior to going into the SAE
-        if activation_store.normalize_activations == "expected_average_only_in":
-            new_activations = activation_store.unscale(new_activations)
+        new_activations = activation_scaler.unscale(new_activations)
         return new_activations.to(original_device)
@@ -655,8 +662,7 @@ def get_recons_loss(
         activations = activations.to(sae.device)
         # Handle rescaling if SAE expects it
-        if activation_store.normalize_activations == "expected_average_only_in":
-            activations = activation_store.apply_norm_scaling_factor(activations)
+        activations = activation_scaler.scale(activations)
         new_activations = sae.decode(sae.encode(activations[:, :, head_index])).to(
             activations.dtype
@@ -664,8 +670,7 @@ def get_recons_loss(
         activations[:, :, head_index] = new_activations
         # Unscale if activations were scaled prior to going into the SAE
-        if activation_store.normalize_activations == "expected_average_only_in":
-            activations = activation_store.unscale(activations)
+        activations = activation_scaler.unscale(activations)
         return activations.to(original_device)
@@ -794,22 +799,23 @@ def multiple_evals(
     current_model = None
     current_model_str = None
-    print(filtered_saes)
     for sae_release_name, sae_id, _, _ in tqdm(filtered_saes):
         sae = SAE.from_pretrained(
             release=sae_release_name,  # see other options in sae_lens/pretrained_saes.yaml
             sae_id=sae_id,  # won't always be a hook point
             device=device,
-        )[0]
+        )
         # move SAE to device if not there already
         sae.to(device)
-        if current_model_str != sae.cfg.model_name:
+        if current_model_str != sae.cfg.metadata.model_name:
             del current_model  # potentially saves GPU memory
-            current_model_str = sae.cfg.model_name
+            current_model_str = sae.cfg.metadata.model_name
             current_model = HookedTransformer.from_pretrained_no_processing(
-                current_model_str, device=device, **sae.cfg.model_from_pretrained_kwargs
+                current_model_str,
+                device=device,
+                **sae.cfg.metadata.model_from_pretrained_kwargs,
             )
         assert current_model is not None
@@ -834,6 +840,7 @@ def multiple_evals(
                 scalar_metrics, feature_metrics = run_evals(
                     sae=sae,
                     activation_store=activation_store,
+                    activation_scaler=ActivationScaler(),
                     model=current_model,
                     eval_config=eval_config,
                     ignore_tokens={
@@ -926,7 +933,7 @@ def process_results(
     }
-if __name__ == "__main__":
+def process_args(args: list[str]) -> argparse.Namespace:
     arg_parser = argparse.ArgumentParser(description="Run evaluations on SAEs")
     arg_parser.add_argument(
         "sae_regex_pattern",
@@ -1016,11 +1023,19 @@ if __name__ == "__main__":
         help="Enable verbose output with tqdm loaders.",
     )
-    args = arg_parser.parse_args()
-    eval_results = run_evaluations(args)
-    output_files = process_results(eval_results, args.output_dir)
+    return arg_parser.parse_args(args)
+def run_evals_cli(args: list[str]) -> None:
+    opts = process_args(args)
+    eval_results = run_evaluations(opts)
+    output_files = process_results(eval_results, opts.output_dir)
     print("Evaluation complete. Output files:")
     print(f"Individual JSONs: {len(output_files['individual_jsons'])}")  # type: ignore
     print(f"Combined JSON: {output_files['combined_json']}")
     print(f"CSV: {output_files['csv']}")
+if __name__ == "__main__":
+    run_evals_cli(sys.argv[1:])

sae_lens/llm_sae_training_runner.py ADDED Viewed

@@ -0,0 +1,377 @@
+import json
+import signal
+import sys
+from collections.abc import Sequence
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Generic
+import torch
+import wandb
+from simple_parsing import ArgumentParser
+from transformer_lens.hook_points import HookedRootModule
+from typing_extensions import deprecated
+from sae_lens import logger
+from sae_lens.config import HfDataset, LanguageModelSAERunnerConfig
+from sae_lens.constants import ACTIVATIONS_STORE_STATE_FILENAME, RUNNER_CFG_FILENAME
+from sae_lens.evals import EvalConfig, run_evals
+from sae_lens.load_model import load_model
+from sae_lens.saes.gated_sae import GatedTrainingSAEConfig
+from sae_lens.saes.jumprelu_sae import JumpReLUTrainingSAEConfig
+from sae_lens.saes.sae import (
+    T_TRAINING_SAE,
+    T_TRAINING_SAE_CONFIG,
+    TrainingSAE,
+    TrainingSAEConfig,
+)
+from sae_lens.saes.standard_sae import StandardTrainingSAEConfig
+from sae_lens.saes.topk_sae import TopKTrainingSAEConfig
+from sae_lens.training.activation_scaler import ActivationScaler
+from sae_lens.training.activations_store import ActivationsStore
+from sae_lens.training.sae_trainer import SAETrainer
+from sae_lens.training.types import DataProvider
+class InterruptedException(Exception):
+    pass
+def interrupt_callback(sig_num: Any, stack_frame: Any):  # noqa: ARG001
+    raise InterruptedException()
+@dataclass
+class LLMSaeEvaluator(Generic[T_TRAINING_SAE]):
+    model: HookedRootModule
+    activations_store: ActivationsStore
+    eval_batch_size_prompts: int | None
+    n_eval_batches: int
+    model_kwargs: dict[str, Any]
+    def __call__(
+        self,
+        sae: T_TRAINING_SAE,
+        data_provider: DataProvider,
+        activation_scaler: ActivationScaler,
+    ) -> dict[str, Any]:
+        ignore_tokens = set()
+        if self.activations_store.exclude_special_tokens is not None:
+            ignore_tokens = set(self.activations_store.exclude_special_tokens.tolist())
+        eval_config = EvalConfig(
+            batch_size_prompts=self.eval_batch_size_prompts,
+            n_eval_reconstruction_batches=self.n_eval_batches,
+            n_eval_sparsity_variance_batches=self.n_eval_batches,
+            compute_ce_loss=True,
+            compute_l2_norms=True,
+            compute_sparsity_metrics=True,
+            compute_variance_metrics=True,
+        )
+        eval_metrics, _ = run_evals(
+            sae=sae,
+            activation_store=self.activations_store,
+            model=self.model,
+            activation_scaler=activation_scaler,
+            eval_config=eval_config,
+            ignore_tokens=ignore_tokens,
+            model_kwargs=self.model_kwargs,
+        )  # not calculating featurwise metrics here.
+        # Remove eval metrics that are already logged during training
+        eval_metrics.pop("metrics/explained_variance", None)
+        eval_metrics.pop("metrics/explained_variance_std", None)
+        eval_metrics.pop("metrics/l0", None)
+        eval_metrics.pop("metrics/l1", None)
+        eval_metrics.pop("metrics/mse", None)
+        # Remove metrics that are not useful for wandb logging
+        eval_metrics.pop("metrics/total_tokens_evaluated", None)
+        return eval_metrics
+class LanguageModelSAETrainingRunner:
+    """
+    Class to run the training of a Sparse Autoencoder (SAE) on a TransformerLens model.
+    """
+    cfg: LanguageModelSAERunnerConfig[Any]
+    model: HookedRootModule
+    sae: TrainingSAE[Any]
+    activations_store: ActivationsStore
+    def __init__(
+        self,
+        cfg: LanguageModelSAERunnerConfig[T_TRAINING_SAE_CONFIG],
+        override_dataset: HfDataset | None = None,
+        override_model: HookedRootModule | None = None,
+        override_sae: TrainingSAE[Any] | None = None,
+    ):
+        if override_dataset is not None:
+            logger.warning(
+                f"You just passed in a dataset which will override the one specified in your configuration: {cfg.dataset_path}. As a consequence this run will not be reproducible via configuration alone."
+            )
+        if override_model is not None:
+            logger.warning(
+                f"You just passed in a model which will override the one specified in your configuration: {cfg.model_name}. As a consequence this run will not be reproducible via configuration alone."
+            )
+        self.cfg = cfg
+        if override_model is None:
+            self.model = load_model(
+                self.cfg.model_class_name,
+                self.cfg.model_name,
+                device=self.cfg.device,
+                model_from_pretrained_kwargs=self.cfg.model_from_pretrained_kwargs,
+            )
+        else:
+            self.model = override_model
+        self.activations_store = ActivationsStore.from_config(
+            self.model,
+            self.cfg,
+            override_dataset=override_dataset,
+        )
+        if override_sae is None:
+            if self.cfg.from_pretrained_path is not None:
+                self.sae = TrainingSAE.load_from_disk(
+                    self.cfg.from_pretrained_path, self.cfg.device
+                )
+            else:
+                self.sae = TrainingSAE.from_dict(
+                    TrainingSAEConfig.from_dict(
+                        self.cfg.get_training_sae_cfg_dict(),
+                    ).to_dict()
+                )
+        else:
+            self.sae = override_sae
+        self.sae.to(self.cfg.device)
+    def run(self):
+        """
+        Run the training of the SAE.
+        """
+        self._set_sae_metadata()
+        if self.cfg.logger.log_to_wandb:
+            wandb.init(
+                project=self.cfg.logger.wandb_project,
+                entity=self.cfg.logger.wandb_entity,
+                config=self.cfg.to_dict(),
+                name=self.cfg.logger.run_name,
+                id=self.cfg.logger.wandb_id,
+            )
+        evaluator = LLMSaeEvaluator(
+            model=self.model,
+            activations_store=self.activations_store,
+            eval_batch_size_prompts=self.cfg.eval_batch_size_prompts,
+            n_eval_batches=self.cfg.n_eval_batches,
+            model_kwargs=self.cfg.model_kwargs,
+        )
+        trainer = SAETrainer(
+            sae=self.sae,
+            data_provider=self.activations_store,
+            evaluator=evaluator,
+            save_checkpoint_fn=self.save_checkpoint,
+            cfg=self.cfg.to_sae_trainer_config(),
+        )
+        self._compile_if_needed()
+        sae = self.run_trainer_with_interruption_handling(trainer)
+        if self.cfg.logger.log_to_wandb:
+            wandb.finish()
+        return sae
+    def _set_sae_metadata(self):
+        self.sae.cfg.metadata.dataset_path = self.cfg.dataset_path
+        self.sae.cfg.metadata.hook_name = self.cfg.hook_name
+        self.sae.cfg.metadata.model_name = self.cfg.model_name
+        self.sae.cfg.metadata.model_class_name = self.cfg.model_class_name
+        self.sae.cfg.metadata.hook_head_index = self.cfg.hook_head_index
+        self.sae.cfg.metadata.context_size = self.cfg.context_size
+        self.sae.cfg.metadata.seqpos_slice = self.cfg.seqpos_slice
+        self.sae.cfg.metadata.model_from_pretrained_kwargs = (
+            self.cfg.model_from_pretrained_kwargs
+        )
+        self.sae.cfg.metadata.prepend_bos = self.cfg.prepend_bos
+        self.sae.cfg.metadata.exclude_special_tokens = self.cfg.exclude_special_tokens
+    def _compile_if_needed(self):
+        # Compile model and SAE
+        #  torch.compile can provide significant speedups (10-20% in testing)
+        # using max-autotune gives the best speedups but:
+        # (a) increases VRAM usage,
+        # (b) can't be used on both SAE and LM (some issue with cudagraphs), and
+        # (c) takes some time to compile
+        # optimal settings seem to be:
+        # use max-autotune on SAE and max-autotune-no-cudagraphs on LM
+        # (also pylance seems to really hate this)
+        if self.cfg.compile_llm:
+            self.model = torch.compile(
+                self.model,
+                mode=self.cfg.llm_compilation_mode,
+            )  # type: ignore
+        if self.cfg.compile_sae:
+            backend = "aot_eager" if self.cfg.device == "mps" else "inductor"
+            self.sae.training_forward_pass = torch.compile(  # type: ignore
+                self.sae.training_forward_pass,
+                mode=self.cfg.sae_compilation_mode,
+                backend=backend,
+            )  # type: ignore
+    def run_trainer_with_interruption_handling(
+        self, trainer: SAETrainer[TrainingSAE[TrainingSAEConfig], TrainingSAEConfig]
+    ):
+        try:
+            # signal handlers (if preempted)
+            signal.signal(signal.SIGINT, interrupt_callback)
+            signal.signal(signal.SIGTERM, interrupt_callback)
+            # train SAE
+            sae = trainer.fit()
+        except (KeyboardInterrupt, InterruptedException):
+            logger.warning("interrupted, saving progress")
+            checkpoint_path = Path(self.cfg.checkpoint_path) / str(
+                trainer.n_training_samples
+            )
+            self.save_checkpoint(checkpoint_path)
+            logger.info("done saving")
+            raise
+        return sae
+    def save_checkpoint(
+        self,
+        checkpoint_path: Path,
+    ) -> None:
+        self.activations_store.save(
+            str(checkpoint_path / ACTIVATIONS_STORE_STATE_FILENAME)
+        )
+        runner_config = self.cfg.to_dict()
+        with open(checkpoint_path / RUNNER_CFG_FILENAME, "w") as f:
+            json.dump(runner_config, f)
+def _parse_cfg_args(
+    args: Sequence[str],
+) -> LanguageModelSAERunnerConfig[TrainingSAEConfig]:
+    """
+    Parse command line arguments into a LanguageModelSAERunnerConfig.
+    This function first parses the architecture argument to determine which
+    concrete SAE config class to use, then parses the full configuration
+    with that concrete type.
+    """
+    if len(args) == 0:
+        args = ["--help"]
+    # First, parse only the architecture to determine which concrete class to use
+    architecture_parser = ArgumentParser(
+        description="Parse architecture to determine SAE config class",
+        exit_on_error=False,
+        add_help=False,  # Don't add help to avoid conflicts
+    )
+    architecture_parser.add_argument(
+        "--architecture",
+        type=str,
+        choices=["standard", "gated", "jumprelu", "topk"],
+        default="standard",
+        help="SAE architecture to use",
+    )
+    # Parse known args to extract architecture, ignore unknown args for now
+    arch_args, remaining_args = architecture_parser.parse_known_args(args)
+    architecture = arch_args.architecture
+    # Remove architecture from remaining args if it exists
+    filtered_args = []
+    skip_next = False
+    for arg in remaining_args:
+        if skip_next:
+            skip_next = False
+            continue
+        if arg == "--architecture":
+            skip_next = True  # Skip the next argument (the architecture value)
+            continue
+        filtered_args.append(arg)
+    # Create a custom wrapper class that simple_parsing can handle
+    def create_config_class(
+        sae_config_type: type[TrainingSAEConfig],
+    ) -> type[LanguageModelSAERunnerConfig[TrainingSAEConfig]]:
+        """Create a concrete config class for the given SAE config type."""
+        # Create the base config without the sae field
+        from dataclasses import field as dataclass_field
+        from dataclasses import fields, make_dataclass
+        # Get all fields from LanguageModelSAERunnerConfig except the generic sae field
+        base_fields = []
+        for field_obj in fields(LanguageModelSAERunnerConfig):
+            if field_obj.name != "sae":
+                base_fields.append((field_obj.name, field_obj.type, field_obj))
+        # Add the concrete sae field
+        base_fields.append(
+            (
+                "sae",
+                sae_config_type,
+                dataclass_field(
+                    default_factory=lambda: sae_config_type(d_in=512, d_sae=1024)
+                ),
+            )
+        )
+        # Create the concrete class
+        return make_dataclass(
+            f"{sae_config_type.__name__}RunnerConfig",
+            base_fields,
+            bases=(LanguageModelSAERunnerConfig,),
+        )
+    # Map architecture to concrete config class
+    sae_config_map = {
+        "standard": StandardTrainingSAEConfig,
+        "gated": GatedTrainingSAEConfig,
+        "jumprelu": JumpReLUTrainingSAEConfig,
+        "topk": TopKTrainingSAEConfig,
+    }
+    sae_config_type = sae_config_map[architecture]
+    concrete_config_class = create_config_class(sae_config_type)
+    # Now parse the full configuration with the concrete type
+    parser = ArgumentParser(exit_on_error=False)
+    parser.add_arguments(concrete_config_class, dest="cfg")
+    # Parse the filtered arguments (without --architecture)
+    parsed_args = parser.parse_args(filtered_args)
+    # Return the parsed configuration
+    return parsed_args.cfg
+# moved into its own function to make it easier to test
+def _run_cli(args: Sequence[str]):
+    cfg = _parse_cfg_args(args)
+    LanguageModelSAETrainingRunner(cfg=cfg).run()
+if __name__ == "__main__":
+    _run_cli(args=sys.argv[1:])
+@deprecated("Use LanguageModelSAETrainingRunner instead")
+class SAETrainingRunner(LanguageModelSAETrainingRunner):
+    pass

sae-lens 5.11.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

sae-lens 5.11.0__py3-none-any.whl → 6.0.0__py3-none-any.whl