sae-lens 6.25.0__tar.gz → 6.26.0__tar.gz
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- {sae_lens-6.25.0 → sae_lens-6.26.0}/PKG-INFO +1 -1
- {sae_lens-6.25.0 → sae_lens-6.26.0}/pyproject.toml +1 -1
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/__init__.py +13 -1
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/cache_activations_runner.py +2 -2
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/config.py +7 -2
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/constants.py +8 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/loading/pretrained_sae_loaders.py +66 -57
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/__init__.py +10 -0
- sae_lens-6.26.0/sae_lens/saes/matching_pursuit_sae.py +334 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/sae.py +52 -12
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/training/activations_store.py +3 -2
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/util.py +21 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/LICENSE +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/README.md +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/analysis/__init__.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/analysis/hooked_sae_transformer.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/analysis/neuronpedia_integration.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/evals.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/llm_sae_training_runner.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/load_model.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/loading/__init__.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/loading/pretrained_saes_directory.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/pretokenize_runner.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/pretrained_saes.yaml +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/registry.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/batchtopk_sae.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/gated_sae.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/jumprelu_sae.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/matryoshka_batchtopk_sae.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/standard_sae.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/temporal_sae.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/topk_sae.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/transcoder.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/tokenization_and_batching.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/training/__init__.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/training/activation_scaler.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/training/mixing_buffer.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/training/optim.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/training/sae_trainer.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/training/types.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/training/upload_saes_to_huggingface.py +0 -0
- {sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/tutorial/tsea.py +0 -0

{sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/__init__.py

@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.25.0"
+__version__ = "6.26.0"
 
 import logging
 
@@ -21,6 +21,10 @@ from sae_lens.saes import (
     JumpReLUTrainingSAEConfig,
     JumpReLUTranscoder,
     JumpReLUTranscoderConfig,
+    MatchingPursuitSAE,
+    MatchingPursuitSAEConfig,
+    MatchingPursuitTrainingSAE,
+    MatchingPursuitTrainingSAEConfig,
     MatryoshkaBatchTopKTrainingSAE,
     MatryoshkaBatchTopKTrainingSAEConfig,
     SAEConfig,
@@ -113,6 +117,10 @@ __all__ = [
     "MatryoshkaBatchTopKTrainingSAEConfig",
     "TemporalSAE",
     "TemporalSAEConfig",
+    "MatchingPursuitSAE",
+    "MatchingPursuitTrainingSAE",
+    "MatchingPursuitSAEConfig",
+    "MatchingPursuitTrainingSAEConfig",
 ]
 
 
@@ -139,3 +147,7 @@ register_sae_class(
     "jumprelu_skip_transcoder", JumpReLUSkipTranscoder, JumpReLUSkipTranscoderConfig
 )
 register_sae_class("temporal", TemporalSAE, TemporalSAEConfig)
+register_sae_class("matching_pursuit", MatchingPursuitSAE, MatchingPursuitSAEConfig)
+register_sae_training_class(
+    "matching_pursuit", MatchingPursuitTrainingSAE, MatchingPursuitTrainingSAEConfig
+)

{sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/cache_activations_runner.py

@@ -14,9 +14,9 @@ from transformer_lens.HookedTransformer import HookedRootModule
 
 from sae_lens import logger
 from sae_lens.config import CacheActivationsRunnerConfig
-from sae_lens.constants import DTYPE_MAP
 from sae_lens.load_model import load_model
 from sae_lens.training.activations_store import ActivationsStore
+from sae_lens.util import str_to_dtype
 
 
 def _mk_activations_store(
@@ -97,7 +97,7 @@ class CacheActivationsRunner:
         bytes_per_token = (
             self.cfg.d_in * self.cfg.dtype.itemsize
             if isinstance(self.cfg.dtype, torch.dtype)
-            else DTYPE_MAP[self.cfg.dtype].itemsize
+            else str_to_dtype(self.cfg.dtype).itemsize
         )
         total_training_tokens = self.cfg.n_seq_in_dataset * self.context_size
         total_disk_space_gb = total_training_tokens * bytes_per_token / 10**9

{sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/config.py

@@ -17,9 +17,14 @@ from datasets import (
 )
 
 from sae_lens import __version__, logger
-
+
+# keeping this unused import since some SAELens deps import DTYPE_MAP from config
+from sae_lens.constants import (
+    DTYPE_MAP,  # noqa: F401 # pyright: ignore[reportUnusedImport]
+)
 from sae_lens.registry import get_sae_training_class
 from sae_lens.saes.sae import TrainingSAEConfig
+from sae_lens.util import str_to_dtype
 
 if TYPE_CHECKING:
     pass
@@ -563,7 +568,7 @@ class CacheActivationsRunnerConfig:
 
     @property
    def bytes_per_token(self) -> int:
-        return self.d_in * DTYPE_MAP[self.dtype].itemsize
+        return self.d_in * str_to_dtype(self.dtype).itemsize
 
     @property
     def n_tokens_in_buffer(self) -> int:
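
As a quick sanity check of the new bytes_per_token path, the sketch below recomputes the same quantity outside the config class, assuming sae_lens 6.26.0 is installed. The dimensions and token count are made-up illustrative numbers, not values taken from this diff.

from sae_lens.util import str_to_dtype

# hypothetical cache settings, for illustration only
d_in = 768
dtype = "float32"
n_tokens = 1_000_000_000

bytes_per_token = d_in * str_to_dtype(dtype).itemsize  # 768 * 4 = 3072 bytes
total_disk_space_gb = n_tokens * bytes_per_token / 10**9  # ~3072 GB of cached activations
print(bytes_per_token, total_disk_space_gb)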

{sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/constants.py

@@ -11,6 +11,14 @@ DTYPE_MAP = {
     "torch.bfloat16": torch.bfloat16,
 }
 
+# Reverse mapping from torch.dtype to canonical string format
+DTYPE_TO_STR = {
+    torch.float32: "float32",
+    torch.float64: "float64",
+    torch.float16: "float16",
+    torch.bfloat16: "bfloat16",
+}
+
 
 SPARSITY_FILENAME = "sparsity.safetensors"
 SAE_WEIGHTS_FILENAME = "sae_weights.safetensors"

{sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/loading/pretrained_sae_loaders.py

@@ -12,11 +12,9 @@ from huggingface_hub import hf_hub_download, hf_hub_url
 from huggingface_hub.utils import EntryNotFoundError, build_hf_headers
 from packaging.version import Version
 from safetensors import safe_open
-from safetensors.torch import load_file
 
 from sae_lens import logger
 from sae_lens.constants import (
-    DTYPE_MAP,
     SAE_CFG_FILENAME,
     SAE_WEIGHTS_FILENAME,
     SPARSIFY_WEIGHTS_FILENAME,
@@ -28,7 +26,7 @@ from sae_lens.loading.pretrained_saes_directory import (
     get_repo_id_and_folder_name,
 )
 from sae_lens.registry import get_sae_class
-from sae_lens.util import filter_valid_dataclass_fields
+from sae_lens.util import filter_valid_dataclass_fields, str_to_dtype
 
 LLM_METADATA_KEYS = {
     "model_name",
@@ -51,6 +49,21 @@ LLM_METADATA_KEYS = {
 }
 
 
+def load_safetensors_weights(
+    path: str | Path, device: str = "cpu", dtype: torch.dtype | str | None = None
+) -> dict[str, torch.Tensor]:
+    """Load safetensors weights and optionally convert to a different dtype"""
+    loaded_weights = {}
+    dtype = str_to_dtype(dtype) if isinstance(dtype, str) else dtype
+    with safe_open(path, framework="pt", device=device) as f:
+        for k in f.keys():  # noqa: SIM118
+            weight = f.get_tensor(k)
+            if dtype is not None:
+                weight = weight.to(dtype=dtype)
+            loaded_weights[k] = weight
+    return loaded_weights
+
+
 # loaders take in a release, sae_id, device, and whether to force download, and returns a tuple of config, state_dict, and log sparsity
 class PretrainedSaeHuggingfaceLoader(Protocol):
     def __call__(
@@ -341,7 +354,7 @@ def read_sae_components_from_disk(
     Given a loaded dictionary and a path to a weight file, load the weights and return the state_dict.
     """
     if dtype is None:
-        dtype = DTYPE_MAP[cfg_dict["dtype"]]
+        dtype = str_to_dtype(cfg_dict["dtype"])
 
     state_dict = {}
     with safe_open(weight_path, framework="pt", device=device) as f:  # type: ignore
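
A minimal usage sketch of the new load_safetensors_weights helper, assuming sae_lens 6.26.0 is installed; the tiny file written here is a throwaway example, not part of the package.

import torch
from safetensors.torch import save_file

from sae_lens.loading.pretrained_sae_loaders import load_safetensors_weights

# write a small safetensors file so there is something to load back (illustration only)
save_file({"W_dec": torch.randn(16, 8), "b_dec": torch.zeros(8)}, "tiny_sae.safetensors")

# load on CPU and down-cast every tensor to bfloat16 in a single pass
weights = load_safetensors_weights("tiny_sae.safetensors", device="cpu", dtype="bfloat16")
print({k: v.dtype for k, v in weights.items()})  # all torch.bfloat16
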
@@ -695,7 +708,9 @@ def gemma_3_sae_huggingface_loader(
         force_download=force_download,
     )
 
-    raw_state_dict =
+    raw_state_dict = load_safetensors_weights(
+        sae_path, device=device, dtype=cfg_dict.get("dtype")
+    )
 
     with torch.no_grad():
         w_dec = raw_state_dict["w_dec"]
@@ -782,11 +797,13 @@ def get_goodfire_huggingface_loader(
     )
     raw_state_dict = torch.load(sae_path, map_location=device)
 
+    target_dtype = str_to_dtype(cfg_dict.get("dtype", "float32"))
+
     state_dict = {
-        "W_enc": raw_state_dict["encoder_linear.weight"].T,
-        "W_dec": raw_state_dict["decoder_linear.weight"].T,
-        "b_enc": raw_state_dict["encoder_linear.bias"],
-        "b_dec": raw_state_dict["decoder_linear.bias"],
+        "W_enc": raw_state_dict["encoder_linear.weight"].T.to(dtype=target_dtype),
+        "W_dec": raw_state_dict["decoder_linear.weight"].T.to(dtype=target_dtype),
+        "b_enc": raw_state_dict["encoder_linear.bias"].to(dtype=target_dtype),
+        "b_dec": raw_state_dict["decoder_linear.bias"].to(dtype=target_dtype),
     }
 
     return cfg_dict, state_dict, None
@@ -889,26 +906,19 @@ def llama_scope_sae_huggingface_loader(
         force_download=force_download,
     )
 
-
-
+    state_dict_loaded = load_safetensors_weights(
+        sae_path, device=device, dtype=cfg_dict.get("dtype")
+    )
 
     # Convert and organize the weights
     state_dict = {
-        "W_enc": state_dict_loaded["encoder.weight"]
-        .to(dtype=DTYPE_MAP[cfg_dict["dtype"]])
-        .T,
-        "W_dec": state_dict_loaded["decoder.weight"]
-        .to(dtype=DTYPE_MAP[cfg_dict["dtype"]])
-        .T,
-        "b_enc": state_dict_loaded["encoder.bias"].to(
-            dtype=DTYPE_MAP[cfg_dict["dtype"]]
-        ),
-        "b_dec": state_dict_loaded["decoder.bias"].to(
-            dtype=DTYPE_MAP[cfg_dict["dtype"]]
-        ),
+        "W_enc": state_dict_loaded["encoder.weight"].T,
+        "W_dec": state_dict_loaded["decoder.weight"].T,
+        "b_enc": state_dict_loaded["encoder.bias"],
+        "b_dec": state_dict_loaded["decoder.bias"],
         "threshold": torch.ones(
            cfg_dict["d_sae"],
-            dtype=DTYPE_MAP[cfg_dict["dtype"]],
+            dtype=str_to_dtype(cfg_dict["dtype"]),
             device=cfg_dict["device"],
         )
         * cfg_dict["jump_relu_threshold"],
@@ -1219,26 +1229,17 @@ def llama_scope_r1_distill_sae_huggingface_loader(
         force_download=force_download,
     )
 
-
-
+    state_dict_loaded = load_safetensors_weights(
+        sae_path, device=device, dtype=cfg_dict.get("dtype")
+    )
 
     # Convert and organize the weights
     state_dict = {
-        "W_enc": state_dict_loaded["encoder.weight"]
-        .to(dtype=DTYPE_MAP[cfg_dict["dtype"]])
-        .T,
-        "W_dec": state_dict_loaded["decoder.weight"]
-        .to(dtype=DTYPE_MAP[cfg_dict["dtype"]])
-        .T,
-        "b_enc": state_dict_loaded["encoder.bias"].to(
-            dtype=DTYPE_MAP[cfg_dict["dtype"]]
-        ),
-        "b_dec": state_dict_loaded["decoder.bias"].to(
-            dtype=DTYPE_MAP[cfg_dict["dtype"]]
-        ),
-        "threshold": state_dict_loaded["log_jumprelu_threshold"]
-        .to(dtype=DTYPE_MAP[cfg_dict["dtype"]])
-        .exp(),
+        "W_enc": state_dict_loaded["encoder.weight"].T,
+        "W_dec": state_dict_loaded["decoder.weight"].T,
+        "b_enc": state_dict_loaded["encoder.bias"],
+        "b_dec": state_dict_loaded["decoder.bias"],
+        "threshold": state_dict_loaded["log_jumprelu_threshold"].exp(),
     }
 
     # No sparsity tensor for Llama Scope SAEs
@@ -1358,34 +1359,34 @@ def sparsify_disk_loader(
     cfg_dict = get_sparsify_config_from_disk(path, device, cfg_overrides)
 
     weight_path = Path(path) / SPARSIFY_WEIGHTS_FILENAME
-    state_dict_loaded =
-
-
+    state_dict_loaded = load_safetensors_weights(
+        weight_path, device=device, dtype=cfg_dict.get("dtype")
+    )
 
     W_enc = (
         state_dict_loaded["W_enc"]
         if "W_enc" in state_dict_loaded
         else state_dict_loaded["encoder.weight"].T
-    )
+    )
 
     if "W_dec" in state_dict_loaded:
-        W_dec = state_dict_loaded["W_dec"].T
+        W_dec = state_dict_loaded["W_dec"].T
     else:
-        W_dec = state_dict_loaded["decoder.weight"].T
+        W_dec = state_dict_loaded["decoder.weight"].T
 
     if "b_enc" in state_dict_loaded:
-        b_enc = state_dict_loaded["b_enc"]
+        b_enc = state_dict_loaded["b_enc"]
     elif "encoder.bias" in state_dict_loaded:
-        b_enc = state_dict_loaded["encoder.bias"]
+        b_enc = state_dict_loaded["encoder.bias"]
     else:
-        b_enc = torch.zeros(cfg_dict["d_sae"], dtype=dtype, device=device)
+        b_enc = torch.zeros(cfg_dict["d_sae"], dtype=W_dec.dtype, device=device)
 
     if "b_dec" in state_dict_loaded:
-        b_dec = state_dict_loaded["b_dec"]
+        b_dec = state_dict_loaded["b_dec"]
     elif "decoder.bias" in state_dict_loaded:
-        b_dec = state_dict_loaded["decoder.bias"]
+        b_dec = state_dict_loaded["decoder.bias"]
     else:
-        b_dec = torch.zeros(cfg_dict["d_in"], dtype=dtype, device=device)
+        b_dec = torch.zeros(cfg_dict["d_in"], dtype=W_dec.dtype, device=device)
 
     state_dict = {"W_enc": W_enc, "b_enc": b_enc, "W_dec": W_dec, "b_dec": b_dec}
     return cfg_dict, state_dict
@@ -1616,7 +1617,9 @@ def mwhanna_transcoder_huggingface_loader(
     )
 
     # Load weights from safetensors
-    state_dict =
+    state_dict = load_safetensors_weights(
+        file_path, device=device, dtype=cfg_dict.get("dtype")
+    )
     state_dict["W_enc"] = state_dict["W_enc"].T
 
     return cfg_dict, state_dict, None
@@ -1700,8 +1703,12 @@ def mntss_clt_layer_huggingface_loader(
         force_download=force_download,
     )
 
-    encoder_state_dict =
-
+    encoder_state_dict = load_safetensors_weights(
+        encoder_path, device=device, dtype=cfg_dict.get("dtype")
+    )
+    decoder_state_dict = load_safetensors_weights(
+        decoder_path, device=device, dtype=cfg_dict.get("dtype")
+    )
 
     with torch.no_grad():
         state_dict = {
@@ -1844,7 +1851,9 @@ def temporal_sae_huggingface_loader(
     )
 
     # Load checkpoint from safetensors
-    state_dict_raw =
+    state_dict_raw = load_safetensors_weights(
+        ckpt_path, device=device, dtype=cfg_dict.get("dtype")
+    )
 
     # Convert to SAELens naming convention
     # TemporalSAE uses: D (decoder), E (encoder), b (bias), attn_layers.*

{sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/__init__.py

@@ -14,6 +14,12 @@ from .jumprelu_sae import (
     JumpReLUTrainingSAE,
     JumpReLUTrainingSAEConfig,
 )
+from .matching_pursuit_sae import (
+    MatchingPursuitSAE,
+    MatchingPursuitSAEConfig,
+    MatchingPursuitTrainingSAE,
+    MatchingPursuitTrainingSAEConfig,
+)
 from .matryoshka_batchtopk_sae import (
     MatryoshkaBatchTopKTrainingSAE,
     MatryoshkaBatchTopKTrainingSAEConfig,
@@ -78,4 +84,8 @@ __all__ = [
     "MatryoshkaBatchTopKTrainingSAEConfig",
     "TemporalSAE",
     "TemporalSAEConfig",
+    "MatchingPursuitSAE",
+    "MatchingPursuitTrainingSAE",
+    "MatchingPursuitSAEConfig",
+    "MatchingPursuitTrainingSAEConfig",
 ]

sae_lens-6.26.0/sae_lens/saes/matching_pursuit_sae.py

@@ -0,0 +1,334 @@
+"""Matching Pursuit SAE"""
+
+import warnings
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+from typing_extensions import override
+
+from sae_lens.saes.sae import (
+    SAE,
+    SAEConfig,
+    TrainCoefficientConfig,
+    TrainingSAE,
+    TrainingSAEConfig,
+    TrainStepInput,
+    TrainStepOutput,
+)
+
+# --- inference ---
+
+
+@dataclass
+class MatchingPursuitSAEConfig(SAEConfig):
+    """
+    Configuration class for MatchingPursuitSAE inference.
+
+    Args:
+        residual_threshold (float): residual error at which to stop selecting latents. Default 1e-2.
+        max_iterations (int | None): Maximum iterations (default: d_in if set to None).
+            Defaults to None.
+        stop_on_duplicate_support (bool): Whether to stop selecting latents if the support set has not changed from the previous iteration. Defaults to True.
+        d_in (int): Input dimension (dimensionality of the activations being encoded).
+            Inherited from SAEConfig.
+        d_sae (int): SAE latent dimension (number of features in the SAE).
+            Inherited from SAEConfig.
+        dtype (str): Data type for the SAE parameters. Inherited from SAEConfig.
+            Defaults to "float32".
+        device (str): Device to place the SAE on. Inherited from SAEConfig.
+            Defaults to "cpu".
+        apply_b_dec_to_input (bool): Whether to apply decoder bias to the input
+            before encoding. Inherited from SAEConfig. Defaults to True.
+        normalize_activations (Literal["none", "expected_average_only_in", "constant_norm_rescale", "layer_norm"]):
+            Normalization strategy for input activations. Inherited from SAEConfig.
+            Defaults to "none".
+        reshape_activations (Literal["none", "hook_z"]): How to reshape activations
+            (useful for attention head outputs). Inherited from SAEConfig.
+            Defaults to "none".
+        metadata (SAEMetadata): Metadata about the SAE (model name, hook name, etc.).
+            Inherited from SAEConfig.
+    """
+
+    residual_threshold: float = 1e-2
+    max_iterations: int | None = None
+    stop_on_duplicate_support: bool = True
+
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "matching_pursuit"
+
+
+class MatchingPursuitSAE(SAE[MatchingPursuitSAEConfig]):
+    """
+    An inference-only sparse autoencoder using a "matching pursuit" activation function.
+    """
+
+    # Matching pursuit is a tied SAE, so we use W_enc as the decoder transposed
+    @property
+    def W_enc(self) -> torch.Tensor:  # pyright: ignore[reportIncompatibleVariableOverride]
+        return self.W_dec.T
+
+    # hacky way to get around the base class having W_enc.
+    # TODO: harmonize with the base class in next major release
+    @override
+    def __setattr__(self, name: str, value: Any):
+        if name == "W_enc":
+            return
+        super().__setattr__(name, value)
+
+    @override
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Converts input x into feature activations.
+        """
+        sae_in = self.process_sae_in(x)
+        return _encode_matching_pursuit(
+            sae_in,
+            self.W_dec,
+            self.cfg.residual_threshold,
+            max_iterations=self.cfg.max_iterations,
+            stop_on_duplicate_support=self.cfg.stop_on_duplicate_support,
+        )
+
+    @override
+    @torch.no_grad()
+    def fold_W_dec_norm(self) -> None:
+        raise NotImplementedError(
+            "Folding W_dec_norm is not safe for MatchingPursuit SAEs, as this may change the resulting activations"
+        )
+
+    @override
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
+        """
+        Decode the feature activations back to the input space.
+        Now, if hook_z reshaping is turned on, we reverse the flattening.
+        """
+        sae_out_pre = feature_acts @ self.W_dec
+        # since this is a tied SAE, we need to make sure b_dec is only applied if applied at input
+        if self.cfg.apply_b_dec_to_input:
+            sae_out_pre = sae_out_pre + self.b_dec
+        sae_out_pre = self.hook_sae_recons(sae_out_pre)
+        sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
+        return self.reshape_fn_out(sae_out_pre, self.d_head)
+
+
+# --- training ---
+
+
+@dataclass
+class MatchingPursuitTrainingSAEConfig(TrainingSAEConfig):
+    """
+    Configuration class for training a MatchingPursuitTrainingSAE.
+
+    Args:
+        residual_threshold (float): residual error at which to stop selecting latents. Default 1e-2.
+        max_iterations (int | None): Maximum iterations (default: d_in if set to None).
+            Defaults to None.
+        stop_on_duplicate_support (bool): Whether to stop selecting latents if the support set has not changed from the previous iteration. Defaults to True.
+        decoder_init_norm (float | None): Norm to initialize decoder weights to.
+            0.1 corresponds to the "heuristic" initialization from Anthropic's April update.
+            Use None to disable. Inherited from TrainingSAEConfig. Defaults to 0.1.
+        d_in (int): Input dimension (dimensionality of the activations being encoded).
+            Inherited from SAEConfig.
+        d_sae (int): SAE latent dimension (number of features in the SAE).
+            Inherited from SAEConfig.
+        dtype (str): Data type for the SAE parameters. Inherited from SAEConfig.
+            Defaults to "float32".
+        device (str): Device to place the SAE on. Inherited from SAEConfig.
+            Defaults to "cpu".
+        apply_b_dec_to_input (bool): Whether to apply decoder bias to the input
+            before encoding. Inherited from SAEConfig. Defaults to True.
+        normalize_activations (Literal["none", "expected_average_only_in", "constant_norm_rescale", "layer_norm"]):
+            Normalization strategy for input activations. Inherited from SAEConfig.
+            Defaults to "none".
+        reshape_activations (Literal["none", "hook_z"]): How to reshape activations
+            (useful for attention head outputs). Inherited from SAEConfig.
+            Defaults to "none".
+        metadata (SAEMetadata): Metadata about the SAE training (model name, hook name, etc.).
+            Inherited from SAEConfig.
+    """
+
+    residual_threshold: float = 1e-2
+    max_iterations: int | None = None
+    stop_on_duplicate_support: bool = True
+
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "matching_pursuit"
+
+    @override
+    def __post_init__(self):
+        super().__post_init__()
+        if self.decoder_init_norm != 1.0:
+            self.decoder_init_norm = 1.0
+            warnings.warn(
+                "decoder_init_norm must be set to 1.0 for MatchingPursuitTrainingSAE, setting to 1.0"
+            )
+
+
+class MatchingPursuitTrainingSAE(TrainingSAE[MatchingPursuitTrainingSAEConfig]):
+    # Matching pursuit is a tied SAE, so we use W_enc as the decoder transposed
+    @property
+    def W_enc(self) -> torch.Tensor:  # pyright: ignore[reportIncompatibleVariableOverride]
+        return self.W_dec.T
+
+    # hacky way to get around the base class having W_enc.
+    # TODO: harmonize with the base class in next major release
+    @override
+    def __setattr__(self, name: str, value: Any):
+        if name == "W_enc":
+            return
+        super().__setattr__(name, value)
+
+    @override
+    def encode_with_hidden_pre(
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        hidden_pre doesn't make sense for matching pursuit, since there is not a single pre-activation.
+        We just return zeros for the hidden_pre.
+        """
+
+        sae_in = self.process_sae_in(x)
+        acts = _encode_matching_pursuit(
+            sae_in,
+            self.W_dec,
+            self.cfg.residual_threshold,
+            max_iterations=self.cfg.max_iterations,
+            stop_on_duplicate_support=self.cfg.stop_on_duplicate_support,
+        )
+        return acts, torch.zeros_like(acts)
+
+    @override
+    @torch.no_grad()
+    def fold_W_dec_norm(self) -> None:
+        raise NotImplementedError(
+            "Folding W_dec_norm is not safe for MatchingPursuit SAEs, as this may change the resulting activations"
+        )
+
+    @override
+    def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
+        return {}
+
+    @override
+    def calculate_aux_loss(
+        self,
+        step_input: TrainStepInput,
+        feature_acts: torch.Tensor,
+        hidden_pre: torch.Tensor,
+        sae_out: torch.Tensor,
+    ) -> dict[str, torch.Tensor]:
+        return {}
+
+    @override
+    def training_forward_pass(self, step_input: TrainStepInput) -> TrainStepOutput:
+        output = super().training_forward_pass(step_input)
+        l0 = output.feature_acts.bool().float().sum(-1).to_dense()
+        residual_norm = (step_input.sae_in - output.sae_out).norm(dim=-1)
+        output.metrics["max_l0"] = l0.max()
+        output.metrics["min_l0"] = l0.min()
+        output.metrics["residual_norm"] = residual_norm.mean()
+        output.metrics["residual_threshold_converged_portion"] = (
+            (residual_norm < self.cfg.residual_threshold).float().mean()
+        )
+        return output
+
+    @override
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
+        """
+        Decode the feature activations back to the input space.
+        Now, if hook_z reshaping is turned on, we reverse the flattening.
+        """
+        sae_out_pre = feature_acts @ self.W_dec
+        # since this is a tied SAE, we need to make sure b_dec is only applied if applied at input
+        if self.cfg.apply_b_dec_to_input:
+            sae_out_pre = sae_out_pre + self.b_dec
+        sae_out_pre = self.hook_sae_recons(sae_out_pre)
+        sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
+        return self.reshape_fn_out(sae_out_pre, self.d_head)
+
+
+# --- shared ---
+
+
+def _encode_matching_pursuit(
+    sae_in_centered: torch.Tensor,
+    W_dec: torch.Tensor,
+    residual_threshold: float,
+    max_iterations: int | None,
+    stop_on_duplicate_support: bool,
+) -> torch.Tensor:
+    """
+    Matching pursuit encoding.
+
+    Args:
+        sae_in_centered: Input activations, centered by b_dec. Shape [..., d_in].
+        W_dec: Decoder weight matrix. Shape [d_sae, d_in].
+        residual_threshold: Stop when residual norm falls below this.
+        max_iterations: Maximum iterations (default: d_in). Prevents infinite loops.
+        stop_on_duplicate_support: Whether to stop selecting latents if the support set has not changed from the previous iteration.
+    """
+    residual = sae_in_centered.clone()
+
+    stop_on_residual_threshold = residual_threshold > 0
+
+    # Handle multi-dimensional inputs by flattening all but the last dimension
+    original_shape = residual.shape
+    if residual.ndim > 2:
+        residual = residual.reshape(-1, residual.shape[-1])
+
+    batch_size = residual.shape[0]
+    d_sae, d_in = W_dec.shape
+
+    if max_iterations is None:
+        max_iterations = d_in  # Sensible upper bound
+
+    acts = torch.zeros(batch_size, d_sae, device=W_dec.device, dtype=residual.dtype)
+    prev_support = torch.zeros(batch_size, d_sae, dtype=torch.bool, device=W_dec.device)
+    done = torch.zeros(batch_size, dtype=torch.bool, device=W_dec.device)
+
+    for _ in range(max_iterations):
+        # Find indices without gradients - the full [batch, d_sae] matmul result
+        # doesn't need to be saved for backward since max indices don't need gradients
+        with torch.no_grad():
+            indices = (residual @ W_dec.T).relu().max(dim=1, keepdim=True).indices
+        indices_flat = indices.squeeze(1)  # [batch_size]
+
+        # Compute values with gradients using only the selected decoder rows.
+        # This stores [batch, d_in] for backward instead of [batch, d_sae].
+        selected_dec = W_dec[indices_flat]  # [batch_size, d_in]
+        values = (residual * selected_dec).sum(dim=-1, keepdim=True).relu()
+
+        # Mask values for samples that are already done
+        active_mask = (~done).unsqueeze(1)
+        masked_values = (values * active_mask.to(values.dtype)).to(acts.dtype)
+
+        acts.scatter_add_(1, indices, masked_values)
+
+        # Update residual
+        residual = residual - masked_values * selected_dec
+
+        if stop_on_duplicate_support or stop_on_residual_threshold:
+            with torch.no_grad():
+                support = acts != 0
+
+                # A sample is considered converged if:
+                # (1) the support set hasn't changed from the previous iteration (stability), or
+                # (2) the residual norm is below a given threshold (good enough reconstruction)
+                if stop_on_duplicate_support:
+                    done = done | (support == prev_support).all(dim=1)
+                    prev_support = support
+                if stop_on_residual_threshold:
+                    done = done | (residual.norm(dim=-1) < residual_threshold)
+
+            if done.all():
+                break
+
+    # Reshape acts back to original shape (replacing last dimension with d_sae)
+    if len(original_shape) > 2:
+        acts = acts.reshape(*original_shape[:-1], acts.shape[-1])
+
+    return acts
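
For intuition about what _encode_matching_pursuit computes, here is a self-contained toy version of the same greedy loop on a random dictionary. It is a simplified sketch (single stopping rule, no per-sample masking or gradient bookkeeping), not the library implementation above.

import torch

def toy_matching_pursuit(x, W_dec, max_iters=32, residual_threshold=1e-2):
    # x: [batch, d_in]; W_dec: [d_sae, d_in] with roughly unit-norm rows
    residual = x.clone()
    acts = torch.zeros(x.shape[0], W_dec.shape[0])
    for _ in range(max_iters):
        scores = (residual @ W_dec.T).relu()  # match every latent against the residual
        values, indices = scores.max(dim=1, keepdim=True)  # greedily pick the best latent per sample
        acts.scatter_add_(1, indices, values)  # accumulate the chosen coefficient
        residual = residual - values * W_dec[indices.squeeze(1)]  # remove the explained component
        if (residual.norm(dim=-1) < residual_threshold).all():
            break
    return acts

W_dec = torch.nn.functional.normalize(torch.randn(256, 64), dim=-1)
x = torch.randn(4, 64)
print((toy_matching_pursuit(x, W_dec) != 0).sum(-1))  # latents selected per sample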

{sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/saes/sae.py

@@ -27,11 +27,10 @@ from typing_extensions import deprecated, overload, override
 
 from sae_lens import __version__
 from sae_lens.constants import (
-    DTYPE_MAP,
     SAE_CFG_FILENAME,
     SAE_WEIGHTS_FILENAME,
 )
-from sae_lens.util import filter_valid_dataclass_fields
+from sae_lens.util import dtype_to_str, filter_valid_dataclass_fields, str_to_dtype
 
 if TYPE_CHECKING:
     from sae_lens.config import LanguageModelSAERunnerConfig
@@ -253,7 +252,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
             stacklevel=1,
         )
 
-        self.dtype = DTYPE_MAP[cfg.dtype]
+        self.dtype = str_to_dtype(cfg.dtype)
         self.device = torch.device(cfg.device)
         self.use_error_term = use_error_term
 
@@ -437,8 +436,8 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
 
         # Update dtype in config if provided
         if dtype_arg is not None:
-            # Update the cfg.dtype
-            self.cfg.dtype =
+            # Update the cfg.dtype (use canonical short form like "float32")
+            self.cfg.dtype = dtype_to_str(dtype_arg)
 
             # Update the dtype property
             self.dtype = dtype_arg
@@ -534,6 +533,15 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         dtype: str | None = None,
         converter: PretrainedSaeDiskLoader = sae_lens_disk_loader,
     ) -> T_SAE:
+        """
+        Load a SAE from disk.
+
+        Args:
+            path: The path to the SAE weights and config.
+            device: The device to load the SAE on, defaults to "cpu".
+            dtype: The dtype to load the SAE on, defaults to None. If None, the dtype will be inferred from the SAE config.
+            converter: The converter to use to load the SAE, defaults to sae_lens_disk_loader.
+        """
         overrides = {"dtype": dtype} if dtype is not None else None
         cfg_dict, state_dict = converter(path, device, cfg_overrides=overrides)
         cfg_dict = handle_config_defaulting(cfg_dict)
@@ -542,10 +550,17 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         )
         sae_cfg = sae_config_cls.from_dict(cfg_dict)
         sae_cls = cls.get_sae_class_for_architecture(sae_cfg.architecture())
+        # hack to avoid using double memory when loading the SAE.
+        # first put the SAE on the meta device, then load the weights.
+        device = sae_cfg.device
+        sae_cfg.device = "meta"
         sae = sae_cls(sae_cfg)
+        sae.cfg.device = device
         sae.process_state_dict_for_loading(state_dict)
-        sae.load_state_dict(state_dict)
-
+        sae.load_state_dict(state_dict, assign=True)
+        # the loaders should already handle the dtype / device conversion
+        # but this is a fallback to guarantee the SAE is on the correct device and dtype
+        return sae.to(dtype=str_to_dtype(sae_cfg.dtype), device=device)
 
     @classmethod
     def from_pretrained(
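
The meta-device trick used in load_from_disk above is plain PyTorch rather than anything SAELens-specific; a generic sketch of the pattern (assuming PyTorch 2.1+ for the assign flag) looks like this:

import torch
from torch import nn

state_dict = {"weight": torch.randn(16, 32), "bias": torch.zeros(16)}

# build the module on the meta device: parameters are shape-only and allocate no memory
with torch.device("meta"):
    layer = nn.Linear(32, 16)

# assign=True swaps the loaded tensors in instead of copying into the (meta) parameters,
# so only one real copy of the weights ever exists
layer.load_state_dict(state_dict, assign=True)
print(layer.weight.device, layer.weight.dtype)
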
@@ -553,6 +568,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         release: str,
         sae_id: str,
         device: str = "cpu",
+        dtype: str = "float32",
         force_download: bool = False,
         converter: PretrainedSaeHuggingfaceLoader | None = None,
     ) -> T_SAE:
@@ -562,10 +578,18 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         Args:
             release: The release name. This will be mapped to a huggingface repo id based on the pretrained_saes.yaml file.
             id: The id of the SAE to load. This will be mapped to a path in the huggingface repo.
-            device: The device to load the SAE on.
+            device: The device to load the SAE on, defaults to "cpu".
+            dtype: The dtype to load the SAE on, defaults to "float32".
+            force_download: Whether to force download the SAE weights and config, defaults to False.
+            converter: The converter to use to load the SAE, defaults to None. If None, the converter will be inferred from the release.
         """
         return cls.from_pretrained_with_cfg_and_sparsity(
-            release,
+            release,
+            sae_id,
+            device,
+            force_download=force_download,
+            dtype=dtype,
+            converter=converter,
         )[0]
 
     @classmethod
@@ -574,6 +598,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         release: str,
         sae_id: str,
         device: str = "cpu",
+        dtype: str = "float32",
         force_download: bool = False,
         converter: PretrainedSaeHuggingfaceLoader | None = None,
     ) -> tuple[T_SAE, dict[str, Any], torch.Tensor | None]:
@@ -584,7 +609,10 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         Args:
             release: The release name. This will be mapped to a huggingface repo id based on the pretrained_saes.yaml file.
             id: The id of the SAE to load. This will be mapped to a path in the huggingface repo.
-            device: The device to load the SAE on.
+            device: The device to load the SAE on, defaults to "cpu".
+            dtype: The dtype to load the SAE on, defaults to "float32".
+            force_download: Whether to force download the SAE weights and config, defaults to False.
+            converter: The converter to use to load the SAE, defaults to None. If None, the converter will be inferred from the release.
         """
 
         # get sae directory
@@ -634,6 +662,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         repo_id, folder_name = get_repo_id_and_folder_name(release, sae_id)
         config_overrides = get_config_overrides(release, sae_id)
         config_overrides["device"] = device
+        config_overrides["dtype"] = dtype
 
         # Load config and weights
         cfg_dict, state_dict, log_sparsities = conversion_loader(
@@ -651,9 +680,14 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         )
         sae_cfg = sae_config_cls.from_dict(cfg_dict)
         sae_cls = cls.get_sae_class_for_architecture(sae_cfg.architecture())
+        # hack to avoid using double memory when loading the SAE.
+        # first put the SAE on the meta device, then load the weights.
+        device = sae_cfg.device
+        sae_cfg.device = "meta"
         sae = sae_cls(sae_cfg)
+        sae.cfg.device = device
         sae.process_state_dict_for_loading(state_dict)
-        sae.load_state_dict(state_dict)
+        sae.load_state_dict(state_dict, assign=True)
 
         # Apply normalization if needed
         if cfg_dict.get("normalize_activations") == "expected_average_only_in":
@@ -666,7 +700,13 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
                 f"norm_scaling_factor not found for {release} and {sae_id}, but normalize_activations is 'expected_average_only_in'. Skipping normalization folding."
             )
 
-
+        # the loaders should already handle the dtype / device conversion
+        # but this is a fallback to guarantee the SAE is on the correct device and dtype
+        return (
+            sae.to(dtype=str_to_dtype(dtype), device=device),
+            cfg_dict,
+            log_sparsities,
+        )
 
     @classmethod
     def from_dict(cls: type[T_SAE], config_dict: dict[str, Any]) -> T_SAE:
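
With dtype threaded through from_pretrained, loading a pretrained SAE in reduced precision becomes a one-liner. The release and SAE id below are illustrative placeholders; substitute any entry from pretrained_saes.yaml.

from sae_lens import SAE

sae = SAE.from_pretrained(
    release="gpt2-small-res-jb",       # placeholder release name
    sae_id="blocks.8.hook_resid_pre",  # placeholder SAE id
    device="cpu",
    dtype="bfloat16",                  # new in 6.26.0
)
print(sae.W_dec.dtype)  # torch.bfloat16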

{sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/training/activations_store.py

@@ -24,7 +24,7 @@ from sae_lens.config import (
     HfDataset,
     LanguageModelSAERunnerConfig,
 )
-from sae_lens.constants import ACTIVATIONS_STORE_STATE_FILENAME
+from sae_lens.constants import ACTIVATIONS_STORE_STATE_FILENAME
 from sae_lens.pretokenize_runner import get_special_token_from_cfg
 from sae_lens.saes.sae import SAE, T_SAE_CONFIG, T_TRAINING_SAE_CONFIG
 from sae_lens.tokenization_and_batching import concat_and_batch_sequences
@@ -32,6 +32,7 @@ from sae_lens.training.mixing_buffer import mixing_buffer
 from sae_lens.util import (
     extract_stop_at_layer_from_tlens_hook_name,
     get_special_token_ids,
+    str_to_dtype,
 )
 
 
@@ -258,7 +259,7 @@ class ActivationsStore:
         self.prepend_bos = prepend_bos
         self.normalize_activations = normalize_activations
         self.device = torch.device(device)
-        self.dtype =
+        self.dtype = str_to_dtype(dtype)
         self.cached_activations_path = cached_activations_path
         self.autocast_lm = autocast_lm
         self.seqpos_slice = seqpos_slice

{sae_lens-6.25.0 → sae_lens-6.26.0}/sae_lens/util.py

@@ -5,8 +5,11 @@ from dataclasses import asdict, fields, is_dataclass
 from pathlib import Path
 from typing import Sequence, TypeVar
 
+import torch
 from transformers import PreTrainedTokenizerBase
 
+from sae_lens.constants import DTYPE_MAP, DTYPE_TO_STR
+
 K = TypeVar("K")
 V = TypeVar("V")
 
@@ -90,3 +93,21 @@ def get_special_token_ids(tokenizer: PreTrainedTokenizerBase) -> list[int]:
             special_tokens.add(token_id)
 
     return list(special_tokens)
+
+
+def str_to_dtype(dtype: str) -> torch.dtype:
+    """Convert a string to a torch.dtype."""
+    if dtype not in DTYPE_MAP:
+        raise ValueError(
+            f"Invalid dtype: {dtype}. Must be one of {list(DTYPE_MAP.keys())}"
+        )
+    return DTYPE_MAP[dtype]
+
+
+def dtype_to_str(dtype: torch.dtype) -> str:
+    """Convert a torch.dtype to a string."""
+    if dtype not in DTYPE_TO_STR:
+        raise ValueError(
+            f"Invalid dtype: {dtype}. Must be one of {list(DTYPE_TO_STR.keys())}"
+        )
+    return DTYPE_TO_STR[dtype]
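
A quick round trip through the two new dtype helpers, assuming sae_lens 6.26.0 is installed:

import torch

from sae_lens.util import dtype_to_str, str_to_dtype

assert str_to_dtype("float32") is torch.float32
assert dtype_to_str(torch.bfloat16) == "bfloat16"

# unknown strings raise instead of silently defaulting
try:
    str_to_dtype("not_a_dtype")
except ValueError as err:
    print(err)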