sae-lens 5.10.3__py3-none-any.whl → 6.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sae_lens/saes/sae.py ADDED
@@ -0,0 +1,948 @@
1
+ """Base classes for Sparse Autoencoders (SAEs)."""
2
+
3
+ import json
4
+ import warnings
5
+ from abc import ABC, abstractmethod
6
+ from contextlib import contextmanager
7
+ from dataclasses import asdict, dataclass, field, fields, replace
8
+ from pathlib import Path
9
+ from typing import (
10
+ TYPE_CHECKING,
11
+ Any,
12
+ Callable,
13
+ Generic,
14
+ Literal,
15
+ NamedTuple,
16
+ Type,
17
+ TypeVar,
18
+ )
19
+
20
+ import einops
21
+ import torch
22
+ from jaxtyping import Float
23
+ from numpy.typing import NDArray
24
+ from safetensors.torch import save_file
25
+ from torch import nn
26
+ from transformer_lens.hook_points import HookedRootModule, HookPoint
27
+ from typing_extensions import deprecated, overload, override
28
+
29
+ from sae_lens import __version__, logger
30
+ from sae_lens.constants import (
31
+ DTYPE_MAP,
32
+ SAE_CFG_FILENAME,
33
+ SAE_WEIGHTS_FILENAME,
34
+ )
35
+ from sae_lens.util import filter_valid_dataclass_fields
36
+
37
+ if TYPE_CHECKING:
38
+ from sae_lens.config import LanguageModelSAERunnerConfig
39
+
40
+ from sae_lens.loading.pretrained_sae_loaders import (
41
+ NAMED_PRETRAINED_SAE_LOADERS,
42
+ PretrainedSaeDiskLoader,
43
+ PretrainedSaeHuggingfaceLoader,
44
+ get_conversion_loader_name,
45
+ handle_config_defaulting,
46
+ sae_lens_disk_loader,
47
+ )
48
+ from sae_lens.loading.pretrained_saes_directory import (
49
+ get_config_overrides,
50
+ get_norm_scaling_factor,
51
+ get_pretrained_saes_directory,
52
+ get_repo_id_and_folder_name,
53
+ )
54
+ from sae_lens.registry import get_sae_class, get_sae_training_class
55
+
56
+ T_SAE_CONFIG = TypeVar("T_SAE_CONFIG", bound="SAEConfig")
57
+ T_TRAINING_SAE_CONFIG = TypeVar("T_TRAINING_SAE_CONFIG", bound="TrainingSAEConfig")
58
+ T_SAE = TypeVar("T_SAE", bound="SAE") # type: ignore
59
+ T_TRAINING_SAE = TypeVar("T_TRAINING_SAE", bound="TrainingSAE") # type: ignore
60
+
61
+
62
+ @dataclass
63
+ class SAEMetadata:
64
+ """Core metadata about how this SAE should be used, if known."""
65
+
66
+ model_name: str | None = None
67
+ hook_name: str | None = None
68
+ model_class_name: str | None = None
69
+ hook_layer: int | None = None
70
+ hook_head_index: int | None = None
71
+ model_from_pretrained_kwargs: dict[str, Any] | None = None
72
+ prepend_bos: bool | None = None
73
+ exclude_special_tokens: bool | list[int] | None = None
74
+ neuronpedia_id: str | None = None
75
+ context_size: int | None = None
76
+ seqpos_slice: tuple[int | None, ...] | None = None
77
+ dataset_path: str | None = None
78
+ sae_lens_version: str = field(default_factory=lambda: __version__)
79
+ sae_lens_training_version: str = field(default_factory=lambda: __version__)
80
+
81
+
82
+ @dataclass
83
+ class SAEConfig(ABC):
84
+ """Base configuration for SAE models."""
85
+
86
+ d_in: int
87
+ d_sae: int
88
+ dtype: str = "float32"
89
+ device: str = "cpu"
90
+ apply_b_dec_to_input: bool = True
91
+ normalize_activations: Literal[
92
+ "none", "expected_average_only_in", "constant_norm_rescale", "layer_norm"
93
+ ] = "none" # none, expected_average_only_in (Anthropic April Update), constant_norm_rescale (Anthropic Feb Update)
94
+ reshape_activations: Literal["none", "hook_z"] = "none"
95
+ metadata: SAEMetadata = field(default_factory=SAEMetadata)
96
+
97
+ @classmethod
98
+ @abstractmethod
99
+ def architecture(cls) -> str: ...
100
+
101
+ def to_dict(self) -> dict[str, Any]:
102
+ res = {field.name: getattr(self, field.name) for field in fields(self)}
103
+ res["metadata"] = asdict(self.metadata)
104
+ res["architecture"] = self.architecture()
105
+ return res
106
+
107
+ @classmethod
108
+ def from_dict(cls: type[T_SAE_CONFIG], config_dict: dict[str, Any]) -> T_SAE_CONFIG:
109
+ cfg_class = get_sae_class(config_dict["architecture"])[1]
110
+ filtered_config_dict = filter_valid_dataclass_fields(config_dict, cfg_class)
111
+ res = cfg_class(**filtered_config_dict)
112
+ if "metadata" in config_dict:
113
+ res.metadata = SAEMetadata(**config_dict["metadata"])
114
+ if not isinstance(res, cls):
115
+ raise ValueError(
116
+ f"SAE config class {cls} does not match dict config class {type(res)}"
117
+ )
118
+ return res
119
+
120
+ def __post_init__(self):
121
+ if self.normalize_activations not in [
122
+ "none",
123
+ "expected_average_only_in",
124
+ "constant_norm_rescale",
125
+ "layer_norm",
126
+ ]:
127
+ raise ValueError(
128
+ f"normalize_activations must be none, expected_average_only_in, constant_norm_rescale, or layer_norm. Got {self.normalize_activations}"
129
+ )
130
+
131
+
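To make the abstract config concrete: a subclass only has to name its architecture, and from_dict() then dispatches on that string via the registry. A minimal sketch, assuming a hypothetical architecture name and class that are not part of this release:

from dataclasses import dataclass

from sae_lens.saes.sae import SAEConfig


@dataclass
class ExampleSAEConfig(SAEConfig):
    """Hypothetical concrete config for a plain ReLU SAE (illustrative only)."""

    @classmethod
    def architecture(cls) -> str:
        # from_dict() resolves this string through sae_lens.registry, so the
        # architecture is assumed to be registered there before round-tripping.
        return "example"


cfg = ExampleSAEConfig(d_in=768, d_sae=768 * 16)
assert cfg.to_dict()["architecture"] == "example"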
132
+ @dataclass
133
+ class TrainStepOutput:
134
+ """Output from a training step."""
135
+
136
+ sae_in: torch.Tensor
137
+ sae_out: torch.Tensor
138
+ feature_acts: torch.Tensor
139
+ hidden_pre: torch.Tensor
140
+ loss: torch.Tensor  # total loss; the training loop calls backward() on this
141
+ losses: dict[str, torch.Tensor]
142
+
143
+
144
+ @dataclass
145
+ class TrainStepInput:
146
+ """Input to a training step."""
147
+
148
+ sae_in: torch.Tensor
149
+ coefficients: dict[str, float]
150
+ dead_neuron_mask: torch.Tensor | None
151
+
152
+
153
+ class TrainCoefficientConfig(NamedTuple):
154
+ value: float
155
+ warm_up_steps: int
156
+
157
+
158
+ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
159
+ """Abstract base class for all SAE architectures."""
160
+
161
+ cfg: T_SAE_CONFIG
162
+ dtype: torch.dtype
163
+ device: torch.device
164
+ use_error_term: bool
165
+
166
+ # For type checking only - don't provide default values
167
+ # These will be initialized by subclasses
168
+ W_enc: nn.Parameter
169
+ W_dec: nn.Parameter
170
+ b_dec: nn.Parameter
171
+
172
+ def __init__(self, cfg: T_SAE_CONFIG, use_error_term: bool = False):
173
+ """Initialize the SAE."""
174
+ super().__init__()
175
+
176
+ self.cfg = cfg
177
+
178
+ if cfg.metadata and cfg.metadata.model_from_pretrained_kwargs:
179
+ warnings.warn(
180
+ "\nThis SAE has non-empty model_from_pretrained_kwargs. "
181
+ "\nFor optimal performance, load the model like so:\n"
182
+ "model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)",
183
+ category=UserWarning,
184
+ stacklevel=1,
185
+ )
186
+
187
+ self.dtype = DTYPE_MAP[cfg.dtype]
188
+ self.device = torch.device(cfg.device)
189
+ self.use_error_term = use_error_term
190
+
191
+ # Set up activation function
192
+ self.activation_fn = self.get_activation_fn()
193
+
194
+ # Initialize weights
195
+ self.initialize_weights()
196
+
197
+ # Set up hooks
198
+ self.hook_sae_input = HookPoint()
199
+ self.hook_sae_acts_pre = HookPoint()
200
+ self.hook_sae_acts_post = HookPoint()
201
+ self.hook_sae_output = HookPoint()
202
+ self.hook_sae_recons = HookPoint()
203
+ self.hook_sae_error = HookPoint()
204
+
205
+ # handle hook_z reshaping if needed.
206
+ if self.cfg.reshape_activations == "hook_z":
207
+ self.turn_on_forward_pass_hook_z_reshaping()
208
+ else:
209
+ self.turn_off_forward_pass_hook_z_reshaping()
210
+
211
+ # Set up activation normalization
212
+ self._setup_activation_normalization()
213
+
214
+ self.setup() # Required for HookedRootModule
215
+
216
+ @torch.no_grad()
217
+ def fold_activation_norm_scaling_factor(self, scaling_factor: float):
218
+ self.W_enc.data *= scaling_factor # type: ignore
219
+ self.W_dec.data /= scaling_factor # type: ignore
220
+ self.b_dec.data /= scaling_factor # type: ignore
221
+ self.cfg.normalize_activations = "none"
222
+
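The folding above works because scaling the encoder weights is equivalent to scaling the incoming activations, while dividing the decoder weights and bias undoes the scaling on the output side. A standalone check of the encoder half of that identity (a sketch, not package code):

import torch

k = 3.0                      # stand-in for the activation norm scaling factor
x = torch.randn(4, 16)       # unscaled activations
W_enc = torch.randn(16, 64)
# Encoding activations that were pre-scaled by k with the original weights equals
# encoding the raw activations with the folded weights (W_enc * k).
torch.testing.assert_close((k * x) @ W_enc, x @ (W_enc * k), rtol=1e-5, atol=1e-5)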
223
+ def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
224
+ """Get the activation function specified in config."""
225
+ return nn.ReLU()
226
+
227
+ def _setup_activation_normalization(self):
228
+ """Set up activation normalization functions based on config."""
229
+ if self.cfg.normalize_activations == "constant_norm_rescale":
230
+
231
+ def run_time_activation_norm_fn_in(x: torch.Tensor) -> torch.Tensor:
232
+ self.x_norm_coeff = (self.cfg.d_in**0.5) / x.norm(dim=-1, keepdim=True)
233
+ return x * self.x_norm_coeff
234
+
235
+ def run_time_activation_norm_fn_out(x: torch.Tensor) -> torch.Tensor:
236
+ x = x / self.x_norm_coeff # type: ignore
237
+ del self.x_norm_coeff
238
+ return x
239
+
240
+ self.run_time_activation_norm_fn_in = run_time_activation_norm_fn_in
241
+ self.run_time_activation_norm_fn_out = run_time_activation_norm_fn_out
242
+
243
+ elif self.cfg.normalize_activations == "layer_norm":
244
+
245
+ def run_time_activation_ln_in(
246
+ x: torch.Tensor, eps: float = 1e-5
247
+ ) -> torch.Tensor:
248
+ mu = x.mean(dim=-1, keepdim=True)
249
+ x = x - mu
250
+ std = x.std(dim=-1, keepdim=True)
251
+ x = x / (std + eps)
252
+ self.ln_mu = mu
253
+ self.ln_std = std
254
+ return x
255
+
256
+ def run_time_activation_ln_out(
257
+ x: torch.Tensor,
258
+ eps: float = 1e-5, # noqa: ARG001
259
+ ) -> torch.Tensor:
260
+ return x * self.ln_std + self.ln_mu # type: ignore
261
+
262
+ self.run_time_activation_norm_fn_in = run_time_activation_ln_in
263
+ self.run_time_activation_norm_fn_out = run_time_activation_ln_out
264
+ else:
265
+ self.run_time_activation_norm_fn_in = lambda x: x
266
+ self.run_time_activation_norm_fn_out = lambda x: x
267
+
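The "constant_norm_rescale" pair above rescales each input to norm sqrt(d_in) before encoding, and the matching output function divides by the same coefficient so reconstructions return to the original scale. A standalone round-trip check (illustrative only):

import torch

d_in = 16
x = torch.randn(4, d_in)
coeff = (d_in**0.5) / x.norm(dim=-1, keepdim=True)  # run_time_activation_norm_fn_in's coefficient
x_scaled = x * coeff                                # every row now has norm sqrt(d_in)
torch.testing.assert_close(x_scaled.norm(dim=-1), torch.full((4,), d_in**0.5))
torch.testing.assert_close(x_scaled / coeff, x)     # the *_out function restores the input scale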
268
+ def initialize_weights(self):
269
+ """Initialize model weights."""
270
+ self.b_dec = nn.Parameter(
271
+ torch.zeros(self.cfg.d_in, dtype=self.dtype, device=self.device)
272
+ )
273
+
274
+ w_dec_data = torch.empty(
275
+ self.cfg.d_sae, self.cfg.d_in, dtype=self.dtype, device=self.device
276
+ )
277
+ nn.init.kaiming_uniform_(w_dec_data)
278
+ self.W_dec = nn.Parameter(w_dec_data)
279
+
280
+ w_enc_data = self.W_dec.data.T.clone().detach().contiguous()
281
+ self.W_enc = nn.Parameter(w_enc_data)
282
+
283
+ @abstractmethod
284
+ def encode(
285
+ self, x: Float[torch.Tensor, "... d_in"]
286
+ ) -> Float[torch.Tensor, "... d_sae"]:
287
+ """Encode input tensor to feature space."""
288
+ pass
289
+
290
+ @abstractmethod
291
+ def decode(
292
+ self, feature_acts: Float[torch.Tensor, "... d_sae"]
293
+ ) -> Float[torch.Tensor, "... d_in"]:
294
+ """Decode feature activations back to input space."""
295
+ pass
296
+
297
+ def turn_on_forward_pass_hook_z_reshaping(self):
298
+ if (
299
+ self.cfg.metadata.hook_name is not None
300
+ and not self.cfg.metadata.hook_name.endswith("_z")
301
+ ):
302
+ raise ValueError("This method should only be called for hook_z SAEs.")
303
+
304
+ # print(f"Turning on hook_z reshaping for {self.cfg.hook_name}")
305
+
306
+ def reshape_fn_in(x: torch.Tensor):
307
+ # print(f"reshape_fn_in input shape: {x.shape}")
308
+ self.d_head = x.shape[-1]
309
+ # print(f"Setting d_head to: {self.d_head}")
310
+ self.reshape_fn_in = lambda x: einops.rearrange(
311
+ x, "... n_heads d_head -> ... (n_heads d_head)"
312
+ )
313
+ return einops.rearrange(x, "... n_heads d_head -> ... (n_heads d_head)")
314
+
315
+ self.reshape_fn_in = reshape_fn_in
316
+ self.reshape_fn_out = lambda x, d_head: einops.rearrange(
317
+ x, "... (n_heads d_head) -> ... n_heads d_head", d_head=d_head
318
+ )
319
+ self.hook_z_reshaping_mode = True
320
+ # print(f"hook_z reshaping turned on, self.d_head={getattr(self, 'd_head', None)}")
321
+
322
+ def turn_off_forward_pass_hook_z_reshaping(self):
323
+ self.reshape_fn_in = lambda x: x
324
+ self.reshape_fn_out = lambda x, d_head: x # noqa: ARG005
325
+ self.d_head = None
326
+ self.hook_z_reshaping_mode = False
327
+
328
+ @overload
329
+ def to(
330
+ self: T_SAE,
331
+ device: torch.device | str | None = ...,
332
+ dtype: torch.dtype | None = ...,
333
+ non_blocking: bool = ...,
334
+ ) -> T_SAE: ...
335
+
336
+ @overload
337
+ def to(self: T_SAE, dtype: torch.dtype, non_blocking: bool = ...) -> T_SAE: ...
338
+
339
+ @overload
340
+ def to(self: T_SAE, tensor: torch.Tensor, non_blocking: bool = ...) -> T_SAE: ...
341
+
342
+ def to(self: T_SAE, *args: Any, **kwargs: Any) -> T_SAE: # type: ignore
343
+ device_arg = None
344
+ dtype_arg = None
345
+
346
+ # Check args
347
+ for arg in args:
348
+ if isinstance(arg, (torch.device, str)):
349
+ device_arg = arg
350
+ elif isinstance(arg, torch.dtype):
351
+ dtype_arg = arg
352
+ elif isinstance(arg, torch.Tensor):
353
+ device_arg = arg.device
354
+ dtype_arg = arg.dtype
355
+
356
+ # Check kwargs
357
+ device_arg = kwargs.get("device", device_arg)
358
+ dtype_arg = kwargs.get("dtype", dtype_arg)
359
+
360
+ # Update device in config if provided
361
+ if device_arg is not None:
362
+ # Convert device to torch.device if it's a string
363
+ device = (
364
+ torch.device(device_arg) if isinstance(device_arg, str) else device_arg
365
+ )
366
+
367
+ # Update the cfg.device
368
+ self.cfg.device = str(device)
369
+
370
+ # Update the device property
371
+ self.device = device
372
+
373
+ # Update dtype in config if provided
374
+ if dtype_arg is not None:
375
+ # Update the cfg.dtype
376
+ self.cfg.dtype = str(dtype_arg)
377
+
378
+ # Update the dtype property
379
+ self.dtype = dtype_arg
380
+
381
+ return super().to(*args, **kwargs)
382
+
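The override above keeps cfg.device and cfg.dtype in sync with wherever the parameters are actually moved. A brief usage sketch, where my_sae stands in for any concrete SAE instance and an available CUDA device is assumed:

import torch

my_sae = my_sae.to("cuda", torch.float16)   # moves parameters and updates the config
print(my_sae.cfg.device, my_sae.cfg.dtype)  # now reflects the new device and dtype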
383
+ def process_sae_in(
384
+ self, sae_in: Float[torch.Tensor, "... d_in"]
385
+ ) -> Float[torch.Tensor, "... d_in"]:
386
+ # print(f"Input shape to process_sae_in: {sae_in.shape}")
387
+ # print(f"self.cfg.hook_name: {self.cfg.hook_name}")
388
+ # print(f"self.b_dec shape: {self.b_dec.shape}")
389
+ # print(f"Hook z reshaping mode: {getattr(self, 'hook_z_reshaping_mode', False)}")
390
+
391
+ sae_in = sae_in.to(self.dtype)
392
+
393
+ # print(f"Shape before reshape_fn_in: {sae_in.shape}")
394
+ sae_in = self.reshape_fn_in(sae_in)
395
+ # print(f"Shape after reshape_fn_in: {sae_in.shape}")
396
+
397
+ sae_in = self.hook_sae_input(sae_in)
398
+ sae_in = self.run_time_activation_norm_fn_in(sae_in)
399
+
400
+ # Subtract the decoder bias from the input if apply_b_dec_to_input is set
401
+ bias_term = self.b_dec * self.cfg.apply_b_dec_to_input
402
+ # print(f"Bias term shape: {bias_term.shape}")
403
+
404
+ return sae_in - bias_term
405
+
406
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
407
+ """Forward pass through the SAE."""
408
+ feature_acts = self.encode(x)
409
+ sae_out = self.decode(feature_acts)
410
+
411
+ if self.use_error_term:
412
+ with torch.no_grad():
413
+ # Recompute without hooks for true error term
414
+ with _disable_hooks(self):
415
+ feature_acts_clean = self.encode(x)
416
+ x_reconstruct_clean = self.decode(feature_acts_clean)
417
+ sae_error = self.hook_sae_error(x - x_reconstruct_clean)
418
+ sae_out = sae_out + sae_error
419
+
420
+ return self.hook_sae_output(sae_out)
421
+
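With use_error_term enabled, the forward pass returns the reconstruction plus the reconstruction error, so its output is numerically identical to its input; this is what makes error-term SAEs transparent when spliced into a model while still exposing feature activations through the hooks. A tiny standalone illustration of the identity (the tensors are stand-ins, not package code):

import torch

x = torch.randn(8, 16)        # stand-in for the SAE input
recons = torch.randn(8, 16)   # stand-in for decode(encode(x))
sae_error = x - recons        # the quantity seen by hook_sae_error
torch.testing.assert_close(recons + sae_error, x)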
422
+ # override this in subclasses to modify the state_dict in-place before saving
423
+ def process_state_dict_for_saving(self, state_dict: dict[str, Any]) -> None:
424
+ pass
425
+
426
+ # override this in subclasses to modify the state_dict in-place after loading
427
+ def process_state_dict_for_loading(self, state_dict: dict[str, Any]) -> None:
428
+ pass
429
+
430
+ @torch.no_grad()
431
+ def fold_W_dec_norm(self):
432
+ """Fold decoder norms into encoder."""
433
+ W_dec_norms = self.W_dec.norm(dim=-1).unsqueeze(1)
434
+ self.W_dec.data = self.W_dec.data / W_dec_norms
435
+ self.W_enc.data = self.W_enc.data * W_dec_norms.T
436
+
437
+ # Only update b_enc if it exists (standard/jumprelu architectures)
438
+ if hasattr(self, "b_enc") and isinstance(self.b_enc, nn.Parameter):
439
+ self.b_enc.data = self.b_enc.data * W_dec_norms.squeeze()
440
+
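Folding the decoder norms into the encoder leaves the composed encoder-decoder map unchanged, because scaling encoder column j by ||W_dec[j]|| cancels against dividing decoder row j by the same norm. A standalone check of that identity for the linear part (illustrative only):

import torch

W_dec = torch.randn(64, 16)                 # (d_sae, d_in)
W_enc = torch.randn(16, 64)                 # (d_in, d_sae)
norms = W_dec.norm(dim=-1, keepdim=True)    # (d_sae, 1)
W_dec_folded = W_dec / norms
W_enc_folded = W_enc * norms.T
x = torch.randn(4, 16)
torch.testing.assert_close(
    x @ W_enc_folded @ W_dec_folded, x @ W_enc @ W_dec, rtol=1e-4, atol=1e-4
)

For ReLU-style encoders the bias is scaled the same way (as in the method above), and because the norms are positive the nonlinearity commutes with the per-feature scaling, so feature activations rescale consistently as well.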
441
+ def get_name(self):
442
+ """Generate a name for this SAE."""
443
+ return f"sae_{self.cfg.metadata.model_name}_{self.cfg.metadata.hook_name}_{self.cfg.d_sae}"
444
+
445
+ def save_model(self, path: str | Path) -> tuple[Path, Path]:
446
+ """Save model weights and config to disk."""
447
+ path = Path(path)
448
+ path.mkdir(parents=True, exist_ok=True)
449
+
450
+ # Generate the weights
451
+ state_dict = self.state_dict() # Use internal SAE state dict
452
+ self.process_state_dict_for_saving(state_dict)
453
+ model_weights_path = path / SAE_WEIGHTS_FILENAME
454
+ save_file(state_dict, model_weights_path)
455
+
456
+ # Save the config
457
+ config = self.cfg.to_dict()
458
+ cfg_path = path / SAE_CFG_FILENAME
459
+ with open(cfg_path, "w") as f:
460
+ json.dump(config, f)
461
+
462
+ return model_weights_path, cfg_path
463
+
464
+ ## Initialization Methods
465
+ @torch.no_grad()
466
+ def initialize_b_dec_with_precalculated(self, origin: torch.Tensor):
467
+ out = origin.clone().detach().to(dtype=self.dtype, device=self.device)
468
+ self.b_dec.data = out
469
+
470
+ @torch.no_grad()
471
+ def initialize_b_dec_with_mean(self, all_activations: torch.Tensor):
472
+ previous_b_dec = self.b_dec.clone().cpu()
473
+ out = all_activations.mean(dim=0)
474
+
475
+ previous_distances = torch.norm(all_activations - previous_b_dec, dim=-1)
476
+ distances = torch.norm(all_activations - out, dim=-1)
477
+
478
+ logger.info("Reinitializing b_dec with mean of activations")
479
+ logger.debug(
480
+ f"Previous distances: {previous_distances.median(0).values.mean().item()}"
481
+ )
482
+ logger.debug(f"New distances: {distances.median(0).values.mean().item()}")
483
+
484
+ self.b_dec.data = out.to(self.dtype).to(self.device)
485
+
486
+ # Class methods for loading models
487
+ @classmethod
488
+ @deprecated("Use load_from_disk instead")
489
+ def load_from_pretrained(
490
+ cls: Type[T_SAE],
491
+ path: str | Path,
492
+ device: str = "cpu",
493
+ dtype: str | None = None,
494
+ ) -> T_SAE:
495
+ return cls.load_from_disk(path, device=device, dtype=dtype)
496
+
497
+ @classmethod
498
+ def load_from_disk(
499
+ cls: Type[T_SAE],
500
+ path: str | Path,
501
+ device: str = "cpu",
502
+ dtype: str | None = None,
503
+ converter: PretrainedSaeDiskLoader = sae_lens_disk_loader,
504
+ ) -> T_SAE:
505
+ overrides = {"dtype": dtype} if dtype is not None else None
506
+ cfg_dict, state_dict = converter(path, device, cfg_overrides=overrides)
507
+ cfg_dict = handle_config_defaulting(cfg_dict)
508
+ sae_config_cls = cls.get_sae_config_class_for_architecture(
509
+ cfg_dict["architecture"]
510
+ )
511
+ sae_cfg = sae_config_cls.from_dict(cfg_dict)
512
+ sae_cls = cls.get_sae_class_for_architecture(sae_cfg.architecture())
513
+ sae = sae_cls(sae_cfg)
514
+ sae.process_state_dict_for_loading(state_dict)
515
+ sae.load_state_dict(state_dict)
516
+ return sae
517
+
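A round trip through save_model and load_from_disk might look like the following, where my_sae stands in for any concrete SAE instance and the directory path is arbitrary (a usage sketch, not package code):

from pathlib import Path

save_dir = Path("./checkpoints/my_sae")
weights_path, cfg_path = my_sae.save_model(save_dir)            # writes the weights and config files
reloaded = type(my_sae).load_from_disk(save_dir, device="cpu")
assert reloaded.cfg.d_in == my_sae.cfg.d_in and reloaded.cfg.d_sae == my_sae.cfg.d_sae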
518
+ @classmethod
519
+ def from_pretrained(
520
+ cls: Type[T_SAE],
521
+ release: str,
522
+ sae_id: str,
523
+ device: str = "cpu",
524
+ force_download: bool = False,
525
+ converter: PretrainedSaeHuggingfaceLoader | None = None,
526
+ ) -> tuple[T_SAE, dict[str, Any], torch.Tensor | None]:
527
+ """
528
+ Load a pretrained SAE from the Hugging Face model hub. Returns the SAE together with its config dict and, if present in the repo, the log sparsity tensor.
529
+
530
+ Args:
531
+ release: The release name. This will be mapped to a huggingface repo id based on the pretrained_saes.yaml file.
532
+ sae_id: The id of the SAE to load. This will be mapped to a path in the huggingface repo.
533
+ device: The device to load the SAE on.
534
+ force_download: If True, re-download the SAE files from the Hugging Face Hub even if they are cached locally.
535
+ """
536
+
537
+ # get sae directory
538
+ sae_directory = get_pretrained_saes_directory()
539
+
540
+ # Validate release and sae_id
541
+ if release not in sae_directory:
542
+ if "/" not in release:
543
+ raise ValueError(
544
+ f"Release {release} not found in pretrained SAEs directory, and is not a valid huggingface repo."
545
+ )
546
+ elif sae_id not in sae_directory[release].saes_map:
547
+ # Handle special cases like Gemma Scope
548
+ if (
549
+ "gemma-scope" in release
550
+ and "canonical" not in release
551
+ and f"{release}-canonical" in sae_directory
552
+ ):
553
+ canonical_ids = list(
554
+ sae_directory[release + "-canonical"].saes_map.keys()
555
+ )
556
+ # Shorten the lengthy string of valid IDs
557
+ if len(canonical_ids) > 5:
558
+ str_canonical_ids = str(canonical_ids[:5])[:-1] + ", ...]"
559
+ else:
560
+ str_canonical_ids = str(canonical_ids)
561
+ value_suffix = f" If you don't want to specify an L0 value, consider using release {release}-canonical which has valid IDs {str_canonical_ids}"
562
+ else:
563
+ value_suffix = ""
564
+
565
+ valid_ids = list(sae_directory[release].saes_map.keys())
566
+ # Shorten the lengthy string of valid IDs
567
+ if len(valid_ids) > 5:
568
+ str_valid_ids = str(valid_ids[:5])[:-1] + ", ...]"
569
+ else:
570
+ str_valid_ids = str(valid_ids)
571
+
572
+ raise ValueError(
573
+ f"ID {sae_id} not found in release {release}. Valid IDs are {str_valid_ids}."
574
+ + value_suffix
575
+ )
576
+
577
+ conversion_loader = (
578
+ converter
579
+ or NAMED_PRETRAINED_SAE_LOADERS[get_conversion_loader_name(release)]
580
+ )
581
+ repo_id, folder_name = get_repo_id_and_folder_name(release, sae_id)
582
+ config_overrides = get_config_overrides(release, sae_id)
583
+ config_overrides["device"] = device
584
+
585
+ # Load config and weights
586
+ cfg_dict, state_dict, log_sparsities = conversion_loader(
587
+ repo_id=repo_id,
588
+ folder_name=folder_name,
589
+ device=device,
590
+ force_download=force_download,
591
+ cfg_overrides=config_overrides,
592
+ )
593
+ cfg_dict = handle_config_defaulting(cfg_dict)
594
+
595
+ # Create SAE with appropriate architecture
596
+ sae_config_cls = cls.get_sae_config_class_for_architecture(
597
+ cfg_dict["architecture"]
598
+ )
599
+ sae_cfg = sae_config_cls.from_dict(cfg_dict)
600
+ sae_cls = cls.get_sae_class_for_architecture(sae_cfg.architecture())
601
+ sae = sae_cls(sae_cfg)
602
+ sae.process_state_dict_for_loading(state_dict)
603
+ sae.load_state_dict(state_dict)
604
+
605
+ # Apply normalization if needed
606
+ if cfg_dict.get("normalize_activations") == "expected_average_only_in":
607
+ norm_scaling_factor = get_norm_scaling_factor(release, sae_id)
608
+ if norm_scaling_factor is not None:
609
+ sae.fold_activation_norm_scaling_factor(norm_scaling_factor)
610
+ cfg_dict["normalize_activations"] = "none"
611
+ else:
612
+ warnings.warn(
613
+ f"norm_scaling_factor not found for {release} and {sae_id}, but normalize_activations is 'expected_average_only_in'. Skipping normalization folding."
614
+ )
615
+
616
+ return sae, cfg_dict, log_sparsities
617
+
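Typical usage of the loader above; the release and SAE id strings are examples of entries in the pretrained SAE directory, and the top-level SAE re-export is assumed to match earlier sae-lens releases:

from sae_lens import SAE

sae, cfg_dict, log_sparsities = SAE.from_pretrained(
    release="gpt2-small-res-jb",          # example release name from pretrained_saes.yaml
    sae_id="blocks.8.hook_resid_pre",     # example SAE id within that release
    device="cpu",
)
print(sae.cfg.d_in, sae.cfg.d_sae, sae.cfg.metadata.hook_name)

log_sparsities is None for releases that do not ship a log sparsity tensor.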
618
+ @classmethod
619
+ def from_dict(cls: Type[T_SAE], config_dict: dict[str, Any]) -> T_SAE:
620
+ """Create an SAE from a config dictionary."""
621
+ sae_cls = cls.get_sae_class_for_architecture(config_dict["architecture"])
622
+ sae_config_cls = cls.get_sae_config_class_for_architecture(
623
+ config_dict["architecture"]
624
+ )
625
+ return sae_cls(sae_config_cls.from_dict(config_dict))
626
+
627
+ @classmethod
628
+ def get_sae_class_for_architecture(
629
+ cls: Type[T_SAE], architecture: str
630
+ ) -> Type[T_SAE]:
631
+ """Get the SAE class for a given architecture."""
632
+ sae_cls, _ = get_sae_class(architecture)
633
+ if not issubclass(sae_cls, cls):
634
+ raise ValueError(
635
+ f"Loaded SAE is not of type {cls.__name__}. Use {sae_cls.__name__} instead"
636
+ )
637
+ return sae_cls
638
+
639
+ # in the future, this can be used to load different config classes for different architectures
640
+ @classmethod
641
+ def get_sae_config_class_for_architecture(
642
+ cls,
643
+ architecture: str, # noqa: ARG003
644
+ ) -> type[SAEConfig]:
645
+ return SAEConfig
646
+
647
+
648
+ @dataclass(kw_only=True)
649
+ class TrainingSAEConfig(SAEConfig, ABC):
650
+ noise_scale: float = 0.0
651
+ mse_loss_normalization: str | None = None
652
+ b_dec_init_method: Literal["zeros", "geometric_median", "mean"] = "zeros"
653
+ # https://transformer-circuits.pub/2024/april-update/index.html#training-saes
654
+ # 0.1 corresponds to the "heuristic" initialization, use None to disable
655
+ decoder_init_norm: float | None = 0.1
656
+
657
+ @classmethod
658
+ @abstractmethod
659
+ def architecture(cls) -> str: ...
660
+
661
+ @classmethod
662
+ def from_sae_runner_config(
663
+ cls: type[T_TRAINING_SAE_CONFIG],
664
+ cfg: "LanguageModelSAERunnerConfig[T_TRAINING_SAE_CONFIG]",
665
+ ) -> T_TRAINING_SAE_CONFIG:
666
+ metadata = SAEMetadata(
667
+ model_name=cfg.model_name,
668
+ hook_name=cfg.hook_name,
669
+ hook_layer=cfg.hook_layer,
670
+ hook_head_index=cfg.hook_head_index,
671
+ context_size=cfg.context_size,
672
+ prepend_bos=cfg.prepend_bos,
673
+ seqpos_slice=cfg.seqpos_slice,
674
+ model_from_pretrained_kwargs=cfg.model_from_pretrained_kwargs or {},
675
+ )
676
+ if not isinstance(cfg.sae, cls):
677
+ raise ValueError(
678
+ f"SAE config class {cls} does not match SAE runner config class {type(cfg.sae)}"
679
+ )
680
+ return replace(cfg.sae, metadata=metadata)
681
+
682
+ @classmethod
683
+ def from_dict(
684
+ cls: type[T_TRAINING_SAE_CONFIG], config_dict: dict[str, Any]
685
+ ) -> T_TRAINING_SAE_CONFIG:
686
+ # remove any keys that are not in the dataclass
687
+ # since we sometimes enhance the config with the whole LM runner config
688
+ valid_config_dict = filter_valid_dataclass_fields(config_dict, cls)
689
+ cfg_class = cls
690
+ if "architecture" in config_dict:
691
+ cfg_class = get_sae_training_class(config_dict["architecture"])[1]
692
+ if not issubclass(cfg_class, cls):
693
+ raise ValueError(
694
+ f"SAE config class {cls} does not match dict config class {type(cfg_class)}"
695
+ )
696
+ if "metadata" in config_dict:
697
+ valid_config_dict["metadata"] = SAEMetadata(**config_dict["metadata"])
698
+ return cfg_class(**valid_config_dict)
699
+
700
+ def to_dict(self) -> dict[str, Any]:
701
+ return {
702
+ **super().to_dict(),
703
+ **asdict(self),
704
+ "architecture": self.architecture(),
705
+ }
706
+
707
+ # this needs to exist so we can initialize the parent sae cfg without the training specific
708
+ # parameters. Maybe there's a cleaner way to do this
709
+ def get_base_sae_cfg_dict(self) -> dict[str, Any]:
710
+ """
711
+ Creates a dictionary containing attributes corresponding to the fields
712
+ defined in the base SAEConfig class.
713
+ """
714
+ base_config_field_names = {f.name for f in fields(SAEConfig)}
715
+ result_dict = {
716
+ field_name: getattr(self, field_name)
717
+ for field_name in base_config_field_names
718
+ }
719
+ result_dict["architecture"] = self.architecture()
720
+ return result_dict
721
+
722
+
723
+ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
724
+ """Abstract base class for training versions of SAEs."""
725
+
726
+ def __init__(self, cfg: T_TRAINING_SAE_CONFIG, use_error_term: bool = False):
727
+ super().__init__(cfg, use_error_term)
728
+
729
+ # Turn off hook_z reshaping for training mode - the activation store
730
+ # is expected to handle reshaping before passing data to the SAE
731
+ self.turn_off_forward_pass_hook_z_reshaping()
732
+ self.mse_loss_fn = self._get_mse_loss_fn()
733
+
734
+ @abstractmethod
735
+ def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]: ...
736
+
737
+ @abstractmethod
738
+ def encode_with_hidden_pre(
739
+ self, x: Float[torch.Tensor, "... d_in"]
740
+ ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
741
+ """Encode with access to pre-activation values for training."""
742
+ ...
743
+
744
+ def encode(
745
+ self, x: Float[torch.Tensor, "... d_in"]
746
+ ) -> Float[torch.Tensor, "... d_sae"]:
747
+ """
748
+ For inference, just encode without returning hidden_pre.
749
+ (training_forward_pass calls encode_with_hidden_pre).
750
+ """
751
+ feature_acts, _ = self.encode_with_hidden_pre(x)
752
+ return feature_acts
753
+
754
+ def decode(
755
+ self, feature_acts: Float[torch.Tensor, "... d_sae"]
756
+ ) -> Float[torch.Tensor, "... d_in"]:
757
+ """
758
+ Decodes feature activations back into input space,
759
+ applying the reconstruction hook, output activation denormalization, and any output reshaping.
760
+ """
761
+ sae_out_pre = feature_acts @ self.W_dec + self.b_dec
762
+ sae_out_pre = self.hook_sae_recons(sae_out_pre)
763
+ sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
764
+ return self.reshape_fn_out(sae_out_pre, self.d_head)
765
+
766
+ @override
767
+ def initialize_weights(self):
768
+ super().initialize_weights()
769
+ if self.cfg.decoder_init_norm is not None:
770
+ with torch.no_grad():
771
+ self.W_dec.data /= self.W_dec.norm(dim=-1, keepdim=True)
772
+ self.W_dec.data *= self.cfg.decoder_init_norm
773
+ self.W_enc.data = self.W_dec.data.T.clone().detach().contiguous()
774
+
775
+ @abstractmethod
776
+ def calculate_aux_loss(
777
+ self,
778
+ step_input: TrainStepInput,
779
+ feature_acts: torch.Tensor,
780
+ hidden_pre: torch.Tensor,
781
+ sae_out: torch.Tensor,
782
+ ) -> torch.Tensor | dict[str, torch.Tensor]:
783
+ """Calculate architecture-specific auxiliary loss terms."""
784
+ ...
785
+
786
+ def training_forward_pass(
787
+ self,
788
+ step_input: TrainStepInput,
789
+ ) -> TrainStepOutput:
790
+ """Forward pass during training."""
791
+ feature_acts, hidden_pre = self.encode_with_hidden_pre(step_input.sae_in)
792
+ sae_out = self.decode(feature_acts)
793
+
794
+ # Calculate MSE loss
795
+ per_item_mse_loss = self.mse_loss_fn(sae_out, step_input.sae_in)
796
+ mse_loss = per_item_mse_loss.sum(dim=-1).mean()
797
+
798
+ # Calculate architecture-specific auxiliary losses
799
+ aux_losses = self.calculate_aux_loss(
800
+ step_input=step_input,
801
+ feature_acts=feature_acts,
802
+ hidden_pre=hidden_pre,
803
+ sae_out=sae_out,
804
+ )
805
+
806
+ # Total loss is MSE plus all auxiliary losses
807
+ total_loss = mse_loss
808
+
809
+ # Create losses dictionary with mse_loss
810
+ losses = {"mse_loss": mse_loss}
811
+
812
+ # Add architecture-specific losses to the dictionary
813
+ # aux_losses may be a dict of named loss tensors or a single tensor
814
+ if isinstance(aux_losses, dict):
815
+ losses.update(aux_losses)
816
+
817
+ # Sum all losses for total_loss
818
+ if isinstance(aux_losses, dict):
819
+ for loss_value in aux_losses.values():
820
+ total_loss = total_loss + loss_value
821
+ else:
822
+ # Handle case where aux_losses is a tensor
823
+ total_loss = total_loss + aux_losses
824
+
825
+ return TrainStepOutput(
826
+ sae_in=step_input.sae_in,
827
+ sae_out=sae_out,
828
+ feature_acts=feature_acts,
829
+ hidden_pre=hidden_pre,
830
+ loss=total_loss,
831
+ losses=losses,
832
+ )
833
+
834
+ def save_inference_model(self, path: str | Path) -> tuple[Path, Path]:
835
+ """Save inference version of model weights and config to disk."""
836
+ path = Path(path)
837
+ path.mkdir(parents=True, exist_ok=True)
838
+
839
+ # Generate the weights
840
+ state_dict = self.state_dict() # Use internal SAE state dict
841
+ self.process_state_dict_for_saving_inference(state_dict)
842
+ model_weights_path = path / SAE_WEIGHTS_FILENAME
843
+ save_file(state_dict, model_weights_path)
844
+
845
+ # Save the config
846
+ config = self.to_inference_config_dict()
847
+ cfg_path = path / SAE_CFG_FILENAME
848
+ with open(cfg_path, "w") as f:
849
+ json.dump(config, f)
850
+
851
+ return model_weights_path, cfg_path
852
+
853
+ @abstractmethod
854
+ def to_inference_config_dict(self) -> dict[str, Any]:
855
+ """Convert the config into an inference SAE config dict."""
856
+ ...
857
+
858
+ def process_state_dict_for_saving_inference(
859
+ self, state_dict: dict[str, Any]
860
+ ) -> None:
861
+ """
862
+ Process the state dict for saving the inference model.
863
+ This is a hook that can be overridden to change how the state dict is processed for the inference model.
864
+ """
865
+ return self.process_state_dict_for_saving(state_dict)
866
+
867
+ def _get_mse_loss_fn(self) -> Callable[[torch.Tensor, torch.Tensor], torch.Tensor]:
868
+ """Get the MSE loss function based on config."""
869
+
870
+ def standard_mse_loss_fn(
871
+ preds: torch.Tensor, target: torch.Tensor
872
+ ) -> torch.Tensor:
873
+ return torch.nn.functional.mse_loss(preds, target, reduction="none")
874
+
875
+ def batch_norm_mse_loss_fn(
876
+ preds: torch.Tensor, target: torch.Tensor
877
+ ) -> torch.Tensor:
878
+ target_centered = target - target.mean(dim=0, keepdim=True)
879
+ normalization = target_centered.norm(dim=-1, keepdim=True)
880
+ return torch.nn.functional.mse_loss(preds, target, reduction="none") / (
881
+ normalization + 1e-6
882
+ )
883
+
884
+ if self.cfg.mse_loss_normalization == "dense_batch":
885
+ return batch_norm_mse_loss_fn
886
+ return standard_mse_loss_fn
887
+
888
+ @torch.no_grad()
889
+ def remove_gradient_parallel_to_decoder_directions(self) -> None:
890
+ """Remove gradient components parallel to decoder directions."""
891
+ # Project each row's gradient onto the subspace orthogonal to its decoder direction
892
+ assert self.W_dec.grad is not None
893
+
894
+ parallel_component = einops.einsum(
895
+ self.W_dec.grad,
896
+ self.W_dec.data,
897
+ "d_sae d_in, d_sae d_in -> d_sae",
898
+ )
899
+ self.W_dec.grad -= einops.einsum(
900
+ parallel_component,
901
+ self.W_dec.data,
902
+ "d_sae, d_sae d_in -> d_sae d_in",
903
+ )
904
+
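The projection above removes the full component of the gradient parallel to each decoder direction exactly when the decoder rows have unit norm (as after fold_W_dec_norm, or when the training loop keeps W_dec row-normalized). A standalone check of the orthogonality it enforces (illustrative only):

import einops
import torch

W_dec = torch.nn.functional.normalize(torch.randn(64, 16), dim=-1)  # unit-norm decoder rows
grad = torch.randn(64, 16)                                          # stand-in for W_dec.grad
parallel = einops.einsum(grad, W_dec, "d_sae d_in, d_sae d_in -> d_sae")
grad = grad - einops.einsum(parallel, W_dec, "d_sae, d_sae d_in -> d_sae d_in")
# Each gradient row is now orthogonal to its decoder direction.
torch.testing.assert_close((grad * W_dec).sum(dim=-1), torch.zeros(64), rtol=0, atol=1e-5)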
905
+ @torch.no_grad()
906
+ def log_histograms(self) -> dict[str, NDArray[Any]]:
907
+ """Log histograms of the weights and biases."""
908
+ W_dec_norm_dist = self.W_dec.detach().float().norm(dim=1).cpu().numpy()
909
+ return {
910
+ "weights/W_dec_norms": W_dec_norm_dist,
911
+ }
912
+
913
+ @classmethod
914
+ def get_sae_class_for_architecture(
915
+ cls: Type[T_TRAINING_SAE], architecture: str
916
+ ) -> Type[T_TRAINING_SAE]:
917
+ """Get the SAE class for a given architecture."""
918
+ sae_cls, _ = get_sae_training_class(architecture)
919
+ if not issubclass(sae_cls, cls):
920
+ raise ValueError(
921
+ f"Loaded SAE is not of type {cls.__name__}. Use {sae_cls.__name__} instead"
922
+ )
923
+ return sae_cls
924
+
925
+ # in the future, this can be used to load different config classes for different architectures
926
+ @classmethod
927
+ def get_sae_config_class_for_architecture(
928
+ cls,
929
+ architecture: str, # noqa: ARG003
930
+ ) -> type[TrainingSAEConfig]:
931
+ return get_sae_training_class(architecture)[1]
932
+
933
+
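To show how the abstract pieces fit together, here is a minimal hypothetical TrainingSAE subclass and a single training step. The class, its field names, and the "l1" coefficient key are illustrative assumptions, not part of this release:

from dataclasses import dataclass

import torch

from sae_lens.saes.sae import (
    TrainCoefficientConfig,
    TrainStepInput,
    TrainingSAE,
    TrainingSAEConfig,
)


@dataclass(kw_only=True)
class ToyTrainingSAEConfig(TrainingSAEConfig):
    l1_coefficient: float = 1.0

    @classmethod
    def architecture(cls) -> str:
        return "toy"  # would need registering for from_dict()/from_pretrained() dispatch


class ToyTrainingSAE(TrainingSAE[ToyTrainingSAEConfig]):
    def initialize_weights(self) -> None:
        super().initialize_weights()
        self.b_enc = torch.nn.Parameter(
            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
        )

    def encode_with_hidden_pre(self, x):
        sae_in = self.process_sae_in(x)
        hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
        return self.hook_sae_acts_post(self.activation_fn(hidden_pre)), hidden_pre

    def calculate_aux_loss(self, step_input, feature_acts, hidden_pre, sae_out):
        l1 = feature_acts.abs().sum(dim=-1).mean()
        return {"l1_loss": step_input.coefficients["l1"] * l1}

    def get_coefficients(self):
        return {"l1": TrainCoefficientConfig(value=self.cfg.l1_coefficient, warm_up_steps=0)}

    def to_inference_config_dict(self):
        return self.get_base_sae_cfg_dict()


sae = ToyTrainingSAE(ToyTrainingSAEConfig(d_in=16, d_sae=64))
step = TrainStepInput(sae_in=torch.randn(8, 16), coefficients={"l1": 5.0}, dead_neuron_mask=None)
output = sae.training_forward_pass(step)
output.loss.backward()  # total loss = mse_loss + l1_loss

A real architecture would also register its SAE and config classes in sae_lens.registry so that the "toy" architecture string can be resolved when loading from disk or from the hub.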
934
+ _blank_hook = nn.Identity()
935
+
936
+
937
+ @contextmanager
938
+ def _disable_hooks(sae: SAE[Any]):
939
+ """
940
+ Temporarily disable hooks for the SAE by swapping each hook out for a no-op module.
941
+ """
942
+ try:
943
+ for hook_name in sae.hook_dict:
944
+ setattr(sae, hook_name, _blank_hook)
945
+ yield
946
+ finally:
947
+ for hook_name, hook in sae.hook_dict.items():
948
+ setattr(sae, hook_name, hook)