sae-lens 6.0.0rc2__py3-none-any.whl → 6.0.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sae_lens/__init__.py +6 -3
- sae_lens/analysis/neuronpedia_integration.py +3 -3
- sae_lens/cache_activations_runner.py +7 -6
- sae_lens/config.py +50 -6
- sae_lens/constants.py +2 -0
- sae_lens/evals.py +39 -28
- sae_lens/llm_sae_training_runner.py +377 -0
- sae_lens/load_model.py +53 -5
- sae_lens/loading/pretrained_sae_loaders.py +24 -12
- sae_lens/saes/gated_sae.py +0 -4
- sae_lens/saes/jumprelu_sae.py +4 -10
- sae_lens/saes/sae.py +121 -51
- sae_lens/saes/standard_sae.py +4 -11
- sae_lens/saes/topk_sae.py +18 -12
- sae_lens/training/activation_scaler.py +53 -0
- sae_lens/training/activations_store.py +77 -174
- sae_lens/training/mixing_buffer.py +56 -0
- sae_lens/training/sae_trainer.py +107 -98
- sae_lens/training/types.py +5 -0
- sae_lens/training/upload_saes_to_huggingface.py +1 -1
- sae_lens/util.py +19 -0
- {sae_lens-6.0.0rc2.dist-info → sae_lens-6.0.0rc4.dist-info}/METADATA +1 -1
- sae_lens-6.0.0rc4.dist-info/RECORD +37 -0
- sae_lens/sae_training_runner.py +0 -237
- sae_lens/training/geometric_median.py +0 -101
- sae_lens-6.0.0rc2.dist-info/RECORD +0 -35
- {sae_lens-6.0.0rc2.dist-info → sae_lens-6.0.0rc4.dist-info}/LICENSE +0 -0
- {sae_lens-6.0.0rc2.dist-info → sae_lens-6.0.0rc4.dist-info}/WHEEL +0 -0
sae_lens/llm_sae_training_runner.py
ADDED
@@ -0,0 +1,377 @@
+import json
+import signal
+import sys
+from collections.abc import Sequence
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Generic
+
+import torch
+import wandb
+from simple_parsing import ArgumentParser
+from transformer_lens.hook_points import HookedRootModule
+from typing_extensions import deprecated
+
+from sae_lens import logger
+from sae_lens.config import HfDataset, LanguageModelSAERunnerConfig
+from sae_lens.constants import ACTIVATIONS_STORE_STATE_FILENAME, RUNNER_CFG_FILENAME
+from sae_lens.evals import EvalConfig, run_evals
+from sae_lens.load_model import load_model
+from sae_lens.saes.gated_sae import GatedTrainingSAEConfig
+from sae_lens.saes.jumprelu_sae import JumpReLUTrainingSAEConfig
+from sae_lens.saes.sae import (
+    T_TRAINING_SAE,
+    T_TRAINING_SAE_CONFIG,
+    TrainingSAE,
+    TrainingSAEConfig,
+)
+from sae_lens.saes.standard_sae import StandardTrainingSAEConfig
+from sae_lens.saes.topk_sae import TopKTrainingSAEConfig
+from sae_lens.training.activation_scaler import ActivationScaler
+from sae_lens.training.activations_store import ActivationsStore
+from sae_lens.training.sae_trainer import SAETrainer
+from sae_lens.training.types import DataProvider
+
+
+class InterruptedException(Exception):
+    pass
+
+
+def interrupt_callback(sig_num: Any, stack_frame: Any):  # noqa: ARG001
+    raise InterruptedException()
+
+
+@dataclass
+class LLMSaeEvaluator(Generic[T_TRAINING_SAE]):
+    model: HookedRootModule
+    activations_store: ActivationsStore
+    eval_batch_size_prompts: int | None
+    n_eval_batches: int
+    model_kwargs: dict[str, Any]
+
+    def __call__(
+        self,
+        sae: T_TRAINING_SAE,
+        data_provider: DataProvider,
+        activation_scaler: ActivationScaler,
+    ) -> dict[str, Any]:
+        ignore_tokens = set()
+        if self.activations_store.exclude_special_tokens is not None:
+            ignore_tokens = set(self.activations_store.exclude_special_tokens.tolist())
+
+        eval_config = EvalConfig(
+            batch_size_prompts=self.eval_batch_size_prompts,
+            n_eval_reconstruction_batches=self.n_eval_batches,
+            n_eval_sparsity_variance_batches=self.n_eval_batches,
+            compute_ce_loss=True,
+            compute_l2_norms=True,
+            compute_sparsity_metrics=True,
+            compute_variance_metrics=True,
+        )
+
+        eval_metrics, _ = run_evals(
+            sae=sae,
+            activation_store=self.activations_store,
+            model=self.model,
+            activation_scaler=activation_scaler,
+            eval_config=eval_config,
+            ignore_tokens=ignore_tokens,
+            model_kwargs=self.model_kwargs,
+        )  # not calculating featurwise metrics here.
+
+        # Remove eval metrics that are already logged during training
+        eval_metrics.pop("metrics/explained_variance", None)
+        eval_metrics.pop("metrics/explained_variance_std", None)
+        eval_metrics.pop("metrics/l0", None)
+        eval_metrics.pop("metrics/l1", None)
+        eval_metrics.pop("metrics/mse", None)
+
+        # Remove metrics that are not useful for wandb logging
+        eval_metrics.pop("metrics/total_tokens_evaluated", None)
+
+        return eval_metrics
+
+
+class LanguageModelSAETrainingRunner:
+    """
+    Class to run the training of a Sparse Autoencoder (SAE) on a TransformerLens model.
+    """
+
+    cfg: LanguageModelSAERunnerConfig[Any]
+    model: HookedRootModule
+    sae: TrainingSAE[Any]
+    activations_store: ActivationsStore
+
+    def __init__(
+        self,
+        cfg: LanguageModelSAERunnerConfig[T_TRAINING_SAE_CONFIG],
+        override_dataset: HfDataset | None = None,
+        override_model: HookedRootModule | None = None,
+        override_sae: TrainingSAE[Any] | None = None,
+    ):
+        if override_dataset is not None:
+            logger.warning(
+                f"You just passed in a dataset which will override the one specified in your configuration: {cfg.dataset_path}. As a consequence this run will not be reproducible via configuration alone."
+            )
+        if override_model is not None:
+            logger.warning(
+                f"You just passed in a model which will override the one specified in your configuration: {cfg.model_name}. As a consequence this run will not be reproducible via configuration alone."
+            )
+
+        self.cfg = cfg
+
+        if override_model is None:
+            self.model = load_model(
+                self.cfg.model_class_name,
+                self.cfg.model_name,
+                device=self.cfg.device,
+                model_from_pretrained_kwargs=self.cfg.model_from_pretrained_kwargs,
+            )
+        else:
+            self.model = override_model
+
+        self.activations_store = ActivationsStore.from_config(
+            self.model,
+            self.cfg,
+            override_dataset=override_dataset,
+        )
+
+        if override_sae is None:
+            if self.cfg.from_pretrained_path is not None:
+                self.sae = TrainingSAE.load_from_disk(
+                    self.cfg.from_pretrained_path, self.cfg.device
+                )
+            else:
+                self.sae = TrainingSAE.from_dict(
+                    TrainingSAEConfig.from_dict(
+                        self.cfg.get_training_sae_cfg_dict(),
+                    ).to_dict()
+                )
+        else:
+            self.sae = override_sae
+        self.sae.to(self.cfg.device)
+
+    def run(self):
+        """
+        Run the training of the SAE.
+        """
+        self._set_sae_metadata()
+        if self.cfg.logger.log_to_wandb:
+            wandb.init(
+                project=self.cfg.logger.wandb_project,
+                entity=self.cfg.logger.wandb_entity,
+                config=self.cfg.to_dict(),
+                name=self.cfg.logger.run_name,
+                id=self.cfg.logger.wandb_id,
+            )
+
+        evaluator = LLMSaeEvaluator(
+            model=self.model,
+            activations_store=self.activations_store,
+            eval_batch_size_prompts=self.cfg.eval_batch_size_prompts,
+            n_eval_batches=self.cfg.n_eval_batches,
+            model_kwargs=self.cfg.model_kwargs,
+        )
+
+        trainer = SAETrainer(
+            sae=self.sae,
+            data_provider=self.activations_store,
+            evaluator=evaluator,
+            save_checkpoint_fn=self.save_checkpoint,
+            cfg=self.cfg.to_sae_trainer_config(),
+        )
+
+        self._compile_if_needed()
+        sae = self.run_trainer_with_interruption_handling(trainer)
+
+        if self.cfg.logger.log_to_wandb:
+            wandb.finish()
+
+        return sae
+
+    def _set_sae_metadata(self):
+        self.sae.cfg.metadata.dataset_path = self.cfg.dataset_path
+        self.sae.cfg.metadata.hook_name = self.cfg.hook_name
+        self.sae.cfg.metadata.model_name = self.cfg.model_name
+        self.sae.cfg.metadata.model_class_name = self.cfg.model_class_name
+        self.sae.cfg.metadata.hook_head_index = self.cfg.hook_head_index
+        self.sae.cfg.metadata.context_size = self.cfg.context_size
+        self.sae.cfg.metadata.seqpos_slice = self.cfg.seqpos_slice
+        self.sae.cfg.metadata.model_from_pretrained_kwargs = (
+            self.cfg.model_from_pretrained_kwargs
+        )
+        self.sae.cfg.metadata.prepend_bos = self.cfg.prepend_bos
+        self.sae.cfg.metadata.exclude_special_tokens = self.cfg.exclude_special_tokens
+
+    def _compile_if_needed(self):
+        # Compile model and SAE
+        # torch.compile can provide significant speedups (10-20% in testing)
+        # using max-autotune gives the best speedups but:
+        # (a) increases VRAM usage,
+        # (b) can't be used on both SAE and LM (some issue with cudagraphs), and
+        # (c) takes some time to compile
+        # optimal settings seem to be:
+        # use max-autotune on SAE and max-autotune-no-cudagraphs on LM
+        # (also pylance seems to really hate this)
+        if self.cfg.compile_llm:
+            self.model = torch.compile(
+                self.model,
+                mode=self.cfg.llm_compilation_mode,
+            )  # type: ignore
+
+        if self.cfg.compile_sae:
+            backend = "aot_eager" if self.cfg.device == "mps" else "inductor"
+
+            self.sae.training_forward_pass = torch.compile(  # type: ignore
+                self.sae.training_forward_pass,
+                mode=self.cfg.sae_compilation_mode,
+                backend=backend,
+            )  # type: ignore
+
+    def run_trainer_with_interruption_handling(
+        self, trainer: SAETrainer[TrainingSAE[TrainingSAEConfig], TrainingSAEConfig]
+    ):
+        try:
+            # signal handlers (if preempted)
+            signal.signal(signal.SIGINT, interrupt_callback)
+            signal.signal(signal.SIGTERM, interrupt_callback)
+
+            # train SAE
+            sae = trainer.fit()
+
+        except (KeyboardInterrupt, InterruptedException):
+            logger.warning("interrupted, saving progress")
+            checkpoint_path = Path(self.cfg.checkpoint_path) / str(
+                trainer.n_training_samples
+            )
+            self.save_checkpoint(checkpoint_path)
+            logger.info("done saving")
+            raise
+
+        return sae
+
+    def save_checkpoint(
+        self,
+        checkpoint_path: Path,
+    ) -> None:
+        self.activations_store.save(
+            str(checkpoint_path / ACTIVATIONS_STORE_STATE_FILENAME)
+        )
+
+        runner_config = self.cfg.to_dict()
+        with open(checkpoint_path / RUNNER_CFG_FILENAME, "w") as f:
+            json.dump(runner_config, f)
+
+
+def _parse_cfg_args(
+    args: Sequence[str],
+) -> LanguageModelSAERunnerConfig[TrainingSAEConfig]:
+    """
+    Parse command line arguments into a LanguageModelSAERunnerConfig.
+
+    This function first parses the architecture argument to determine which
+    concrete SAE config class to use, then parses the full configuration
+    with that concrete type.
+    """
+    if len(args) == 0:
+        args = ["--help"]
+
+    # First, parse only the architecture to determine which concrete class to use
+    architecture_parser = ArgumentParser(
+        description="Parse architecture to determine SAE config class",
+        exit_on_error=False,
+        add_help=False,  # Don't add help to avoid conflicts
+    )
+    architecture_parser.add_argument(
+        "--architecture",
+        type=str,
+        choices=["standard", "gated", "jumprelu", "topk"],
+        default="standard",
+        help="SAE architecture to use",
+    )
+
+    # Parse known args to extract architecture, ignore unknown args for now
+    arch_args, remaining_args = architecture_parser.parse_known_args(args)
+    architecture = arch_args.architecture
+
+    # Remove architecture from remaining args if it exists
+    filtered_args = []
+    skip_next = False
+    for arg in remaining_args:
+        if skip_next:
+            skip_next = False
+            continue
+        if arg == "--architecture":
+            skip_next = True  # Skip the next argument (the architecture value)
+            continue
+        filtered_args.append(arg)
+
+    # Create a custom wrapper class that simple_parsing can handle
+    def create_config_class(
+        sae_config_type: type[TrainingSAEConfig],
+    ) -> type[LanguageModelSAERunnerConfig[TrainingSAEConfig]]:
+        """Create a concrete config class for the given SAE config type."""
+
+        # Create the base config without the sae field
+        from dataclasses import field as dataclass_field
+        from dataclasses import fields, make_dataclass
+
+        # Get all fields from LanguageModelSAERunnerConfig except the generic sae field
+        base_fields = []
+        for field_obj in fields(LanguageModelSAERunnerConfig):
+            if field_obj.name != "sae":
+                base_fields.append((field_obj.name, field_obj.type, field_obj))
+
+        # Add the concrete sae field
+        base_fields.append(
+            (
+                "sae",
+                sae_config_type,
+                dataclass_field(
+                    default_factory=lambda: sae_config_type(d_in=512, d_sae=1024)
+                ),
+            )
+        )
+
+        # Create the concrete class
+        return make_dataclass(
+            f"{sae_config_type.__name__}RunnerConfig",
+            base_fields,
+            bases=(LanguageModelSAERunnerConfig,),
+        )
+
+    # Map architecture to concrete config class
+    sae_config_map = {
+        "standard": StandardTrainingSAEConfig,
+        "gated": GatedTrainingSAEConfig,
+        "jumprelu": JumpReLUTrainingSAEConfig,
+        "topk": TopKTrainingSAEConfig,
+    }
+
+    sae_config_type = sae_config_map[architecture]
+    concrete_config_class = create_config_class(sae_config_type)
+
+    # Now parse the full configuration with the concrete type
+    parser = ArgumentParser(exit_on_error=False)
+    parser.add_arguments(concrete_config_class, dest="cfg")
+
+    # Parse the filtered arguments (without --architecture)
+    parsed_args = parser.parse_args(filtered_args)
+
+    # Return the parsed configuration
+    return parsed_args.cfg
+
+
+# moved into its own function to make it easier to test
+def _run_cli(args: Sequence[str]):
+    cfg = _parse_cfg_args(args)
+    LanguageModelSAETrainingRunner(cfg=cfg).run()
+
+
+if __name__ == "__main__":
+    _run_cli(args=sys.argv[1:])
+
+
+@deprecated("Use LanguageModelSAETrainingRunner instead")
+class SAETrainingRunner(LanguageModelSAETrainingRunner):
+    pass
sae_lens/load_model.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Literal, cast
+from typing import Any, Callable, Literal, cast
 
 import torch
 from transformer_lens import HookedTransformer
@@ -77,6 +77,7 @@ class HookedProxyLM(HookedRootModule):
     # copied and modified from base HookedRootModule
     def setup(self):
         self.mod_dict = {}
+        self.named_modules_dict = {}
         self.hook_dict: dict[str, HookPoint] = {}
         for name, module in self.model.named_modules():
             if name == "":
@@ -89,14 +90,21 @@ class HookedProxyLM(HookedRootModule):
 
             self.hook_dict[name] = hook_point
             self.mod_dict[name] = hook_point
+            self.named_modules_dict[name] = module
+
+    def run_with_cache(self, *args: Any, **kwargs: Any):  # type: ignore
+        if "names_filter" in kwargs:
+            # hacky way to make sure that the names_filter is passed to our forward method
+            kwargs["_names_filter"] = kwargs["names_filter"]
+        return super().run_with_cache(*args, **kwargs)
 
     def forward(
         self,
         tokens: torch.Tensor,
         return_type: Literal["both", "logits"] = "logits",
         loss_per_token: bool = False,
-        # TODO: implement real support for stop_at_layer
         stop_at_layer: int | None = None,
+        _names_filter: list[str] | None = None,
        **kwargs: Any,
    ) -> Output | Loss:
        # This is just what's needed for evals, not everything that HookedTransformer has
@@ -107,8 +115,28 @@ class HookedProxyLM(HookedRootModule):
             raise NotImplementedError(
                 "Only return_type supported is 'both' or 'logits' to match what's in evals.py and ActivationsStore"
             )
-
-
+
+        stop_hooks = []
+        if stop_at_layer is not None and _names_filter is not None:
+            if return_type != "logits":
+                raise NotImplementedError(
+                    "stop_at_layer is not supported for return_type='both'"
+                )
+            stop_manager = StopManager(_names_filter)
+
+            for hook_name in _names_filter:
+                module = self.named_modules_dict[hook_name]
+                stop_fn = stop_manager.get_stop_hook_fn(hook_name)
+                stop_hooks.append(module.register_forward_hook(stop_fn))
+        try:
+            output = self.model(tokens)
+            logits = _extract_logits_from_output(output)
+        except StopForward:
+            # If we stop early, we don't care about the return output
+            return None  # type: ignore
+        finally:
+            for stop_hook in stop_hooks:
+                stop_hook.remove()
 
         if return_type == "logits":
             return logits
@@ -159,7 +187,7 @@
 
         # We don't want to prepend bos but the tokenizer does it automatically, so we remove it manually
         if hasattr(self.tokenizer, "add_bos_token") and self.tokenizer.add_bos_token:  # type: ignore
-            tokens = get_tokens_with_bos_removed(self.tokenizer, tokens)
+            tokens = get_tokens_with_bos_removed(self.tokenizer, tokens)  # type: ignore
         return tokens  # type: ignore
 
 
@@ -183,3 +211,23 @@ def get_hook_fn(hook_point: HookPoint):
         return output
 
     return hook_fn
+
+
+class StopForward(Exception):
+    pass
+
+
+class StopManager:
+    def __init__(self, hook_names: list[str]):
+        self.hook_names = hook_names
+        self.total_hook_names = len(set(hook_names))
+        self.called_hook_names = set()
+
+    def get_stop_hook_fn(self, hook_name: str) -> Callable[[Any, Any, Any], Any]:
+        def stop_hook_fn(module: Any, input: Any, output: Any) -> Any:  # noqa: ARG001
+            self.called_hook_names.add(hook_name)
+            if len(self.called_hook_names) == self.total_hook_names:
+                raise StopForward()
+            return output
+
+        return stop_hook_fn
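The StopForward/StopManager additions above are a standard PyTorch early-exit trick: forward hooks are registered on just the modules whose activations are needed, and once every one of them has fired, an exception aborts the rest of the forward pass. A self-contained toy version of the same pattern (assumed example model and names, not sae_lens code):

import torch
import torch.nn as nn


class StopForward(Exception):
    pass


model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4), nn.Linear(4, 4))
captured = {}


def stop_hook(module, inputs, output):
    # Grab the activation we care about, then abort the remaining layers.
    captured["layer0"] = output
    raise StopForward()


handle = model[0].register_forward_hook(stop_hook)
try:
    model(torch.randn(2, 4))
except StopForward:
    pass  # expected: we stopped the forward pass on purpose
finally:
    handle.remove()

print(captured["layer0"].shape)  # torch.Size([2, 4])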
sae_lens/loading/pretrained_sae_loaders.py
CHANGED
@@ -26,6 +26,22 @@ from sae_lens.loading.pretrained_saes_directory import (
 from sae_lens.registry import get_sae_class
 from sae_lens.util import filter_valid_dataclass_fields
 
+LLM_METADATA_KEYS = {
+    "model_name",
+    "hook_name",
+    "model_class_name",
+    "hook_head_index",
+    "model_from_pretrained_kwargs",
+    "prepend_bos",
+    "exclude_special_tokens",
+    "neuronpedia_id",
+    "context_size",
+    "seqpos_slice",
+    "dataset_path",
+    "sae_lens_version",
+    "sae_lens_training_version",
+}
+
 
 # loaders take in a release, sae_id, device, and whether to force download, and returns a tuple of config, state_dict, and log sparsity
 class PretrainedSaeHuggingfaceLoader(Protocol):
@@ -193,7 +209,6 @@ def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
 
     rename_keys_map = {
         "hook_point": "hook_name",
-        "hook_point_layer": "hook_layer",
         "hook_point_head_index": "hook_head_index",
         "activation_fn_str": "activation_fn",
     }
@@ -208,6 +223,10 @@ def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
     new_cfg.setdefault("activation_fn", new_cfg.get("activation_fn", "relu"))
     new_cfg.setdefault("architecture", "standard")
     new_cfg.setdefault("neuronpedia_id", None)
+    new_cfg.setdefault(
+        "reshape_activations",
+        "hook_z" if "hook_z" in new_cfg.get("hook_name", "") else "none",
+    )
 
     if "normalize_activations" in new_cfg and isinstance(
         new_cfg["normalize_activations"], bool
@@ -232,11 +251,9 @@ def handle_pre_6_0_config(cfg_dict: dict[str, Any]) -> dict[str, Any]:
     if architecture == "topk":
         sae_cfg_dict["k"] = new_cfg["activation_fn_kwargs"]["k"]
 
-
-
-
-    meta_dict = filter_valid_dataclass_fields(new_cfg, SAEMetadata)
-    sae_cfg_dict["metadata"] = meta_dict
+    sae_cfg_dict["metadata"] = {
+        k: v for k, v in new_cfg.items() if k in LLM_METADATA_KEYS
+    }
     sae_cfg_dict["architecture"] = architecture
     return sae_cfg_dict
 
@@ -262,7 +279,6 @@ def get_connor_rob_hook_z_config_from_hf(
         "device": device if device is not None else "cpu",
         "model_name": "gpt2-small",
         "hook_name": old_cfg_dict["act_name"],
-        "hook_layer": old_cfg_dict["layer"],
         "hook_head_index": None,
         "activation_fn": "relu",
         "apply_b_dec_to_input": True,
@@ -273,6 +289,7 @@ def get_connor_rob_hook_z_config_from_hf(
         "context_size": 128,
         "normalize_activations": "none",
         "dataset_trust_remote_code": True,
+        "reshape_activations": "hook_z",
         **(cfg_overrides or {}),
     }
 
@@ -411,7 +428,6 @@ def get_gemma_2_config_from_hf(
         "dtype": "float32",
         "model_name": model_name,
         "hook_name": hook_name,
-        "hook_layer": layer,
         "hook_head_index": None,
         "activation_fn": "relu",
         "finetuning_scaling_factor": False,
@@ -524,7 +540,6 @@ def get_llama_scope_config_from_hf(
         "dtype": "bfloat16",
         "model_name": model_name,
         "hook_name": old_cfg_dict["hook_point_in"],
-        "hook_layer": int(old_cfg_dict["hook_point_in"].split(".")[1]),
         "hook_head_index": None,
         "activation_fn": "relu",
         "finetuning_scaling_factor": False,
@@ -651,7 +666,6 @@ def get_dictionary_learning_config_1_from_hf(
         "device": device,
         "model_name": trainer["lm_name"].split("/")[-1],
         "hook_name": hook_point_name,
-        "hook_layer": trainer["layer"],
         "hook_head_index": None,
         "activation_fn": activation_fn,
         "activation_fn_kwargs": activation_fn_kwargs,
@@ -690,7 +704,6 @@ def get_deepseek_r1_config_from_hf(
         "context_size": 1024,
         "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
         "hook_name": f"blocks.{layer}.hook_resid_post",
-        "hook_layer": layer,
         "hook_head_index": None,
         "prepend_bos": True,
         "dataset_path": "lmsys/lmsys-chat-1m",
@@ -849,7 +862,6 @@ def get_llama_scope_r1_distill_config_from_hf(
         "device": device,
         "model_name": model_name,
         "hook_name": huggingface_cfg_dict["hook_point_in"],
-        "hook_layer": int(huggingface_cfg_dict["hook_point_in"].split(".")[1]),
         "hook_head_index": None,
         "activation_fn": "relu",
         "finetuning_scaling_factor": False,
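In practical terms, handle_pre_6_0_config now builds the metadata sub-dict with a plain key filter instead of filter_valid_dataclass_fields. A toy illustration with made-up config values (and a trimmed key set for brevity):

# Hypothetical values; only the filtering pattern matches the diff above.
LLM_METADATA_KEYS = {"model_name", "hook_name", "context_size"}  # trimmed for the example

old_cfg = {
    "model_name": "gpt2",
    "hook_name": "blocks.0.hook_resid_post",
    "context_size": 128,
    "d_in": 768,  # non-metadata keys stay in the SAE config itself
}
sae_cfg_dict = {"d_in": old_cfg["d_in"]}
sae_cfg_dict["metadata"] = {k: v for k, v in old_cfg.items() if k in LLM_METADATA_KEYS}
print(sae_cfg_dict["metadata"])
# {'model_name': 'gpt2', 'hook_name': 'blocks.0.hook_resid_post', 'context_size': 128}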
sae_lens/saes/gated_sae.py
CHANGED
@@ -168,10 +168,6 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
 
         # Magnitude path
         magnitude_pre_activation = sae_in @ (self.W_enc * self.r_mag.exp()) + self.b_mag
-        if self.training and self.cfg.noise_scale > 0:
-            magnitude_pre_activation += (
-                torch.randn_like(magnitude_pre_activation) * self.cfg.noise_scale
-            )
         magnitude_pre_activation = self.hook_sae_acts_pre(magnitude_pre_activation)
 
         feature_magnitudes = self.activation_fn(magnitude_pre_activation)
sae_lens/saes/jumprelu_sae.py
CHANGED
@@ -105,7 +105,7 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
     JumpReLUSAE is an inference-only implementation of a Sparse Autoencoder (SAE)
     using a JumpReLU activation. For each unit, if its pre-activation is
     <= threshold, that unit is zeroed out; otherwise, it follows a user-specified
-    activation function (e.g., ReLU
+    activation function (e.g., ReLU etc.).
 
     It implements:
     - initialize_weights: sets up parameters, including a threshold.
@@ -142,7 +142,7 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
         sae_in = self.process_sae_in(x)
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
 
-        # 1) Apply the base "activation_fn" from config (e.g., ReLU
+        # 1) Apply the base "activation_fn" from config (e.g., ReLU).
         base_acts = self.activation_fn(hidden_pre)
 
         # 2) Zero out any unit whose (hidden_pre <= threshold).
@@ -191,8 +191,8 @@ class JumpReLUTrainingSAEConfig(TrainingSAEConfig):
     Configuration class for training a JumpReLUTrainingSAE.
     """
 
-    jumprelu_init_threshold: float = 0.
-    jumprelu_bandwidth: float = 0.
+    jumprelu_init_threshold: float = 0.01
+    jumprelu_bandwidth: float = 0.05
     l0_coefficient: float = 1.0
     l0_warm_up_steps: int = 0
 
@@ -257,12 +257,6 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         sae_in = self.process_sae_in(x)
 
         hidden_pre = sae_in @ self.W_enc + self.b_enc
-
-        if self.training and self.cfg.noise_scale > 0:
-            hidden_pre = (
-                hidden_pre + torch.randn_like(hidden_pre) * self.cfg.noise_scale
-            )
-
         feature_acts = JumpReLU.apply(hidden_pre, self.threshold, self.bandwidth)
 
         return feature_acts, hidden_pre  # type: ignore
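As the updated docstring above describes, JumpReLU inference keeps a unit's base activation only when its pre-activation exceeds the threshold. A standalone sketch of that behaviour with made-up values (the real training implementation uses JumpReLU.apply with a bandwidth for the gradient estimator):

import torch

hidden_pre = torch.tensor([-0.5, 0.005, 0.02, 1.0])
threshold = torch.tensor(0.01)  # cf. the new jumprelu_init_threshold default

base_acts = torch.relu(hidden_pre)                   # base activation_fn (e.g. ReLU)
feature_acts = base_acts * (hidden_pre > threshold)  # zero out units at or below threshold
print(feature_acts)  # tensor([0.0000, 0.0000, 0.0200, 1.0000])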
|