sae-lens 6.12.1__py3-none-any.whl → 6.21.0__py3-none-any.whl
- sae_lens/__init__.py +15 -1
- sae_lens/cache_activations_runner.py +1 -1
- sae_lens/config.py +39 -2
- sae_lens/constants.py +1 -0
- sae_lens/evals.py +20 -14
- sae_lens/llm_sae_training_runner.py +17 -18
- sae_lens/loading/pretrained_sae_loaders.py +194 -0
- sae_lens/loading/pretrained_saes_directory.py +5 -3
- sae_lens/pretokenize_runner.py +2 -1
- sae_lens/pretrained_saes.yaml +75 -1
- sae_lens/saes/__init__.py +9 -0
- sae_lens/saes/batchtopk_sae.py +32 -1
- sae_lens/saes/matryoshka_batchtopk_sae.py +137 -0
- sae_lens/saes/sae.py +22 -24
- sae_lens/saes/temporal_sae.py +372 -0
- sae_lens/saes/topk_sae.py +287 -17
- sae_lens/tokenization_and_batching.py +21 -6
- sae_lens/training/activation_scaler.py +7 -0
- sae_lens/training/activations_store.py +52 -31
- sae_lens/training/optim.py +11 -0
- sae_lens/training/sae_trainer.py +57 -16
- sae_lens/training/types.py +1 -1
- sae_lens/util.py +27 -0
- {sae_lens-6.12.1.dist-info → sae_lens-6.21.0.dist-info}/METADATA +19 -17
- sae_lens-6.21.0.dist-info/RECORD +41 -0
- {sae_lens-6.12.1.dist-info → sae_lens-6.21.0.dist-info}/WHEEL +1 -1
- sae_lens-6.12.1.dist-info/RECORD +0 -39
- {sae_lens-6.12.1.dist-info → sae_lens-6.21.0.dist-info/licenses}/LICENSE +0 -0
sae_lens/saes/topk_sae.py
CHANGED
@@ -1,11 +1,12 @@
 """Inference-only TopKSAE variant, similar in spirit to StandardSAE but using a TopK-based activation."""
 
 from dataclasses import dataclass
-from typing import Callable
+from typing import Any, Callable
 
 import torch
 from jaxtyping import Float
 from torch import nn
+from transformer_lens.hook_points import HookPoint
 from typing_extensions import override
 
 from sae_lens.saes.sae import (
@@ -15,44 +16,138 @@ from sae_lens.saes.sae import (
     TrainingSAE,
     TrainingSAEConfig,
     TrainStepInput,
+    _disable_hooks,
 )
 
 
+class SparseHookPoint(HookPoint):
+    """
+    A HookPoint that takes in a sparse tensor.
+    Overrides TransformerLens's HookPoint.
+    """
+
+    def __init__(self, d_sae: int):
+        super().__init__()
+        self.d_sae = d_sae
+
+    @override
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        using_hooks = (
+            self._forward_hooks is not None and len(self._forward_hooks) > 0
+        ) or (self._backward_hooks is not None and len(self._backward_hooks) > 0)
+        if using_hooks and x.is_sparse:
+            return x.to_dense()
+        return x  # if no hooks are being used, use passthrough
+
+
 class TopK(nn.Module):
     """
     A simple TopK activation that zeroes out all but the top K elements along the last dimension,
     and applies ReLU to the top K elements.
     """
 
-
+    use_sparse_activations: bool
 
     def __init__(
         self,
         k: int,
+        use_sparse_activations: bool = False,
     ):
         super().__init__()
         self.k = k
+        self.use_sparse_activations = use_sparse_activations
 
-    def forward(
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
         """
         1) Select top K elements along the last dimension.
         2) Apply ReLU.
         3) Zero out all other entries.
         """
-
-        values =
+        topk_values, topk_indices = torch.topk(x, k=self.k, dim=-1, sorted=False)
+        values = topk_values.relu()
+        if self.use_sparse_activations:
+            # Produce a COO sparse tensor (use sparse matrix multiply in decode)
+            original_shape = x.shape
+
+            # Create indices for all dimensions
+            # For each element in topk_indices, we need to map it back to the original tensor coordinates
+            batch_dims = original_shape[:-1]  # All dimensions except the last one
+            num_batch_elements = torch.prod(torch.tensor(batch_dims)).item()
+
+            # Create batch indices - each batch element repeated k times
+            batch_indices_flat = torch.arange(
+                num_batch_elements, device=x.device
+            ).repeat_interleave(self.k)
+
+            # Convert flat batch indices back to multi-dimensional indices
+            if len(batch_dims) == 1:
+                # 2D case: [batch, features]
+                sparse_indices = torch.stack(
+                    [
+                        batch_indices_flat,
+                        topk_indices.flatten(),
+                    ]
+                )
+            else:
+                # 3D+ case: need to unravel the batch indices
+                batch_indices_multi = []
+                remaining = batch_indices_flat
+                for dim_size in reversed(batch_dims):
+                    batch_indices_multi.append(remaining % dim_size)
+                    remaining = remaining // dim_size
+                batch_indices_multi.reverse()
+
+                sparse_indices = torch.stack(
+                    [
+                        *batch_indices_multi,
+                        topk_indices.flatten(),
+                    ]
+                )
+
+            return torch.sparse_coo_tensor(
+                sparse_indices, values.flatten(), original_shape
+            )
         result = torch.zeros_like(x)
-        result.scatter_(-1,
+        result.scatter_(-1, topk_indices, values)
        return result
 
 
 @dataclass
 class TopKSAEConfig(SAEConfig):
     """
-    Configuration class for
+    Configuration class for TopKSAE inference.
+
+    Args:
+        k (int): Number of top features to keep active during inference. Only the top k
+            features with the highest pre-activations will be non-zero. Defaults to 100.
+        rescale_acts_by_decoder_norm (bool): Whether to treat the decoder as if it was
+            already normalized. This affects the topk selection by rescaling pre-activations
+            by decoder norms. Requires that the SAE was trained this way. Defaults to False.
+        d_in (int): Input dimension (dimensionality of the activations being encoded).
+            Inherited from SAEConfig.
+        d_sae (int): SAE latent dimension (number of features in the SAE).
+            Inherited from SAEConfig.
+        dtype (str): Data type for the SAE parameters. Inherited from SAEConfig.
+            Defaults to "float32".
+        device (str): Device to place the SAE on. Inherited from SAEConfig.
+            Defaults to "cpu".
+        apply_b_dec_to_input (bool): Whether to apply decoder bias to the input
+            before encoding. Inherited from SAEConfig. Defaults to True.
+        normalize_activations (Literal["none", "expected_average_only_in", "constant_norm_rescale", "layer_norm"]):
+            Normalization strategy for input activations. Inherited from SAEConfig.
+            Defaults to "none".
+        reshape_activations (Literal["none", "hook_z"]): How to reshape activations
+            (useful for attention head outputs). Inherited from SAEConfig.
+            Defaults to "none".
+        metadata (SAEMetadata): Metadata about the SAE (model name, hook name, etc.).
+            Inherited from SAEConfig.
     """
 
     k: int = 100
+    rescale_acts_by_decoder_norm: bool = False
 
     @override
     @classmethod
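Note on the sparse path added to `TopK.forward` above: when `use_sparse_activations` is enabled, each (flattened) batch element's top-k feature indices become COO coordinates and the ReLU'd top-k values become the stored entries. A minimal standalone sketch of the 2D branch, checked against the dense scatter path; `topk_to_sparse_2d` is a hypothetical helper for illustration, not part of the package:

```python
import torch


def topk_to_sparse_2d(x: torch.Tensor, k: int) -> torch.Tensor:
    """Hypothetical standalone version of the 2D sparse branch above."""
    topk_values, topk_indices = torch.topk(x, k=k, dim=-1, sorted=False)
    values = topk_values.relu()
    # Each row index is repeated k times and paired with that row's top-k columns.
    row_indices = torch.arange(x.shape[0], device=x.device).repeat_interleave(k)
    indices = torch.stack([row_indices, topk_indices.flatten()])
    return torch.sparse_coo_tensor(indices, values.flatten(), x.shape)


x = torch.randn(4, 16)
sparse_acts = topk_to_sparse_2d(x, k=3)

# Dense reference: scatter the ReLU'd top-k values back into a zero tensor.
top = torch.topk(x, k=3, dim=-1)
dense_acts = torch.zeros_like(x).scatter_(-1, top.indices, top.values.relu())
assert torch.allclose(sparse_acts.to_dense(), dense_acts)
```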
@@ -60,6 +155,63 @@ class TopKSAEConfig(SAEConfig):
         return "topk"
 
 
+def _sparse_matmul_nd(
+    sparse_tensor: torch.Tensor, dense_matrix: torch.Tensor
+) -> torch.Tensor:
+    """
+    Multiply a sparse tensor of shape [..., d_sae] with a dense matrix of shape [d_sae, d_out]
+    to get a result of shape [..., d_out].
+
+    This function handles sparse tensors with arbitrary batch dimensions by flattening
+    the batch dimensions, performing 2D sparse matrix multiplication, and reshaping back.
+    """
+    original_shape = sparse_tensor.shape
+    batch_dims = original_shape[:-1]
+    d_sae = original_shape[-1]
+    d_out = dense_matrix.shape[-1]
+
+    if sparse_tensor.ndim == 2:
+        # Simple 2D case - use torch.sparse.mm directly
+        # sparse.mm errors with bfloat16 :(
+        with torch.autocast(device_type=sparse_tensor.device.type, enabled=False):
+            return torch.sparse.mm(sparse_tensor, dense_matrix)
+
+    # For 3D+ case, reshape to 2D, multiply, then reshape back
+    batch_size = int(torch.prod(torch.tensor(batch_dims)).item())
+
+    # Ensure tensor is coalesced for efficient access to indices/values
+    if not sparse_tensor.is_coalesced():
+        sparse_tensor = sparse_tensor.coalesce()
+
+    # Get indices and values
+    indices = sparse_tensor.indices()  # [ndim, nnz]
+    values = sparse_tensor.values()  # [nnz]
+
+    # Convert multi-dimensional batch indices to flat indices
+    flat_batch_indices = torch.zeros_like(indices[0])
+    multiplier = 1
+    for i in reversed(range(len(batch_dims))):
+        flat_batch_indices += indices[i] * multiplier
+        multiplier *= batch_dims[i]
+
+    # Create 2D sparse tensor indices [batch_flat, feature]
+    sparse_2d_indices = torch.stack([flat_batch_indices, indices[-1]])
+
+    # Create 2D sparse tensor
+    sparse_2d = torch.sparse_coo_tensor(
+        sparse_2d_indices, values, (batch_size, d_sae)
+    ).coalesce()
+
+    # sparse.mm errors with bfloat16 :(
+    with torch.autocast(device_type=sparse_tensor.device.type, enabled=False):
+        # Do the matrix multiplication
+        result_2d = torch.sparse.mm(sparse_2d, dense_matrix)  # [batch_size, d_out]
+
+    # Reshape back to original batch dimensions
+    result_shape = tuple(batch_dims) + (d_out,)
+    return result_2d.view(result_shape)
+
+
 class TopKSAE(SAE[TopKSAEConfig]):
     """
     An inference-only sparse autoencoder using a "topk" activation function.
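`_sparse_matmul_nd` (added above) flattens all leading batch dimensions into a single row axis, multiplies with `torch.sparse.mm`, and reshapes back. A hedged sanity check of that flatten-multiply-reshape route against a plain dense matmul, using only stock PyTorch:

```python
import torch

batch, seq, d_sae, d_in = 2, 3, 8, 4
dense_acts = torch.randn(batch, seq, d_sae)
dense_acts[dense_acts < 1.0] = 0.0  # make the activations mostly zero
W_dec = torch.randn(d_sae, d_in)

sparse_acts = dense_acts.to_sparse().coalesce()
idx, vals = sparse_acts.indices(), sparse_acts.values()

# Flatten (batch, seq) into a single row index, mirroring _sparse_matmul_nd.
flat_rows = idx[0] * seq + idx[1]
sparse_2d = torch.sparse_coo_tensor(
    torch.stack([flat_rows, idx[2]]), vals, (batch * seq, d_sae)
).coalesce()

out = torch.sparse.mm(sparse_2d, W_dec).view(batch, seq, d_in)
assert torch.allclose(out, dense_acts @ W_dec, atol=1e-5)
```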
@@ -92,42 +244,91 @@ class TopKSAE(SAE[TopKSAEConfig]):
         """
         sae_in = self.process_sae_in(x)
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
+        if self.cfg.rescale_acts_by_decoder_norm:
+            hidden_pre = hidden_pre * self.W_dec.norm(dim=-1)
         # The BaseSAE already sets self.activation_fn to TopK(...) if config requests topk.
         return self.hook_sae_acts_post(self.activation_fn(hidden_pre))
 
     def decode(
-        self,
+        self,
+        feature_acts: Float[torch.Tensor, "... d_sae"],
     ) -> Float[torch.Tensor, "... d_in"]:
         """
         Reconstructs the input from topk feature activations.
         Applies optional finetuning scaling, hooking to recons, out normalization,
         and optional head reshaping.
         """
-
+        # Handle sparse tensors using efficient sparse matrix multiplication
+        if self.cfg.rescale_acts_by_decoder_norm:
+            feature_acts = feature_acts / self.W_dec.norm(dim=-1)
+        if feature_acts.is_sparse:
+            sae_out_pre = _sparse_matmul_nd(feature_acts, self.W_dec) + self.b_dec
+        else:
+            sae_out_pre = feature_acts @ self.W_dec + self.b_dec
         sae_out_pre = self.hook_sae_recons(sae_out_pre)
         sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
         return self.reshape_fn_out(sae_out_pre, self.d_head)
 
     @override
     def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
-        return TopK(self.cfg.k)
+        return TopK(self.cfg.k, use_sparse_activations=False)
 
     @override
     @torch.no_grad()
     def fold_W_dec_norm(self) -> None:
-
-
-
+        if not self.cfg.rescale_acts_by_decoder_norm:
+            raise NotImplementedError(
+                "Folding W_dec_norm is not safe for TopKSAEs when rescale_acts_by_decoder_norm is False, as this may change the topk activations"
+            )
+        _fold_norm_topk(W_dec=self.W_dec, b_enc=self.b_enc, W_enc=self.W_enc)
 
 
 @dataclass
 class TopKTrainingSAEConfig(TrainingSAEConfig):
     """
     Configuration class for training a TopKTrainingSAE.
+
+    Args:
+        k (int): Number of top features to keep active. Only the top k features
+            with the highest pre-activations will be non-zero. Defaults to 100.
+        use_sparse_activations (bool): Whether to use sparse tensor representations
+            for activations during training. This can reduce memory usage and improve
+            performance when k is small relative to d_sae, but is only worthwhile if
+            using float32 and not using autocast. Defaults to False.
+        aux_loss_coefficient (float): Coefficient for the auxiliary loss that encourages
+            dead neurons to learn useful features. This loss helps prevent neuron death
+            in TopK SAEs by having dead neurons reconstruct the residual error from
+            live neurons. Defaults to 1.0.
+        rescale_acts_by_decoder_norm (bool): Treat the decoder as if it was already normalized.
+            This is a good idea since decoder norm can randomly drift during training, and this
+            affects what the topk activations will be. Defaults to True.
+        decoder_init_norm (float | None): Norm to initialize decoder weights to.
+            0.1 corresponds to the "heuristic" initialization from Anthropic's April update.
+            Use None to disable. Inherited from TrainingSAEConfig. Defaults to 0.1.
+        d_in (int): Input dimension (dimensionality of the activations being encoded).
+            Inherited from SAEConfig.
+        d_sae (int): SAE latent dimension (number of features in the SAE).
+            Inherited from SAEConfig.
+        dtype (str): Data type for the SAE parameters. Inherited from SAEConfig.
+            Defaults to "float32".
+        device (str): Device to place the SAE on. Inherited from SAEConfig.
+            Defaults to "cpu".
+        apply_b_dec_to_input (bool): Whether to apply decoder bias to the input
+            before encoding. Inherited from SAEConfig. Defaults to True.
+        normalize_activations (Literal["none", "expected_average_only_in", "constant_norm_rescale", "layer_norm"]):
+            Normalization strategy for input activations. Inherited from SAEConfig.
+            Defaults to "none".
+        reshape_activations (Literal["none", "hook_z"]): How to reshape activations
+            (useful for attention head outputs). Inherited from SAEConfig.
+            Defaults to "none".
+        metadata (SAEMetadata): Metadata about the SAE training (model name, hook name, etc.).
+            Inherited from SAEConfig.
     """
 
     k: int = 100
+    use_sparse_activations: bool = False
     aux_loss_coefficient: float = 1.0
+    rescale_acts_by_decoder_norm: bool = True
 
     @override
     @classmethod
@@ -144,6 +345,8 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
 
     def __init__(self, cfg: TopKTrainingSAEConfig, use_error_term: bool = False):
         super().__init__(cfg, use_error_term)
+        self.hook_sae_acts_post = SparseHookPoint(self.cfg.d_sae)
+        self.setup()
 
     @override
     def initialize_weights(self) -> None:
@@ -159,10 +362,51 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
         sae_in = self.process_sae_in(x)
         hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
 
+        if self.cfg.rescale_acts_by_decoder_norm:
+            hidden_pre = hidden_pre * self.W_dec.norm(dim=-1)
+
         # Apply the TopK activation function (already set in self.activation_fn if config is "topk")
         feature_acts = self.hook_sae_acts_post(self.activation_fn(hidden_pre))
         return feature_acts, hidden_pre
 
+    @override
+    def decode(
+        self,
+        feature_acts: Float[torch.Tensor, "... d_sae"],
+    ) -> Float[torch.Tensor, "... d_in"]:
+        """
+        Decodes feature activations back into input space,
+        applying optional finetuning scale, hooking, out normalization, etc.
+        """
+        # Handle sparse tensors using efficient sparse matrix multiplication
+        if self.cfg.rescale_acts_by_decoder_norm:
+            # need to multiply by the inverse of the norm because division is illegal with sparse tensors
+            feature_acts = feature_acts * (1 / self.W_dec.norm(dim=-1))
+        if feature_acts.is_sparse:
+            sae_out_pre = _sparse_matmul_nd(feature_acts, self.W_dec) + self.b_dec
+        else:
+            sae_out_pre = feature_acts @ self.W_dec + self.b_dec
+        sae_out_pre = self.hook_sae_recons(sae_out_pre)
+        sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
+        return self.reshape_fn_out(sae_out_pre, self.d_head)
+
+    @override
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass through the SAE."""
+        feature_acts = self.encode(x)
+        sae_out = self.decode(feature_acts)
+
+        if self.use_error_term:
+            with torch.no_grad():
+                # Recompute without hooks for true error term
+                with _disable_hooks(self):
+                    feature_acts_clean = self.encode(x)
+                    x_reconstruct_clean = self.decode(feature_acts_clean)
+                sae_error = self.hook_sae_error(x - x_reconstruct_clean)
+            sae_out = sae_out + sae_error
+
+        return self.hook_sae_output(sae_out)
+
     @override
     def calculate_aux_loss(
         self,
@@ -183,13 +427,15 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
     @override
     @torch.no_grad()
     def fold_W_dec_norm(self) -> None:
-
-
-
+        if not self.cfg.rescale_acts_by_decoder_norm:
+            raise NotImplementedError(
+                "Folding W_dec_norm is not safe for TopKSAEs when rescale_acts_by_decoder_norm is False, as this may change the topk activations"
+            )
+        _fold_norm_topk(W_dec=self.W_dec, b_enc=self.b_enc, W_enc=self.W_enc)
 
     @override
     def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
-        return TopK(self.cfg.k)
+        return TopK(self.cfg.k, use_sparse_activations=self.cfg.use_sparse_activations)
 
     @override
     def get_coefficients(self) -> dict[str, TrainCoefficientConfig | float]:
@@ -234,6 +480,18 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
         auxk_loss = (recons - residual).pow(2).sum(dim=-1).mean()
         return self.cfg.aux_loss_coefficient * scale * auxk_loss
 
+    @override
+    def process_state_dict_for_saving_inference(
+        self, state_dict: dict[str, Any]
+    ) -> None:
+        super().process_state_dict_for_saving_inference(state_dict)
+        if self.cfg.rescale_acts_by_decoder_norm:
+            _fold_norm_topk(
+                W_enc=state_dict["W_enc"],
+                b_enc=state_dict["b_enc"],
+                W_dec=state_dict["W_dec"],
+            )
+
 
 def _calculate_topk_aux_acts(
     k_aux: int,
@@ -269,3 +527,15 @@ def _init_weights_topk(
     sae.b_enc = nn.Parameter(
         torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
     )
+
+
+def _fold_norm_topk(
+    W_enc: torch.Tensor,
+    b_enc: torch.Tensor,
+    W_dec: torch.Tensor,
+) -> None:
+    W_dec_norm = W_dec.norm(dim=-1)
+    b_enc.data = b_enc.data * W_dec_norm
+    W_dec_norms = W_dec_norm.unsqueeze(1)
+    W_dec.data = W_dec.data / W_dec_norms
+    W_enc.data = W_enc.data * W_dec_norms.T
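`_fold_norm_topk` moves each decoder row's norm into the matching encoder column and encoder bias entry, leaving a unit-norm decoder. A short sketch (assuming the standard shapes `W_enc: [d_in, d_sae]`, `W_dec: [d_sae, d_in]`) of why the folded encoder reproduces the `rescale_acts_by_decoder_norm` pre-activations:

```python
import torch

# Hedged sketch: folding decoder row norms into the encoder reproduces the
# rescaled pre-activations while making the decoder rows unit-norm.
d_in, d_sae = 8, 16
x = torch.randn(3, d_in)
W_enc, b_enc = torch.randn(d_in, d_sae), torch.randn(d_sae)
W_dec = torch.randn(d_sae, d_in)

norms = W_dec.norm(dim=-1)
rescaled_pre = (x @ W_enc + b_enc) * norms   # what encode() computes before TopK

# The fold, as in _fold_norm_topk:
W_enc_folded = W_enc * norms.unsqueeze(0)    # scale each encoder column
b_enc_folded = b_enc * norms
W_dec_folded = W_dec / norms.unsqueeze(1)    # unit-norm decoder rows

folded_pre = x @ W_enc_folded + b_enc_folded
assert torch.allclose(rescaled_pre, folded_pre, atol=1e-5)
assert torch.allclose(W_dec_folded.norm(dim=-1), torch.ones(d_sae), atol=1e-5)
```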
sae_lens/tokenization_and_batching.py
CHANGED
@@ -1,4 +1,4 @@
-from
+from collections.abc import Generator, Iterator
 
 import torch
 
@@ -68,7 +68,7 @@ def concat_and_batch_sequences(
 ) -> Generator[torch.Tensor, None, None]:
     """
     Generator to concat token sequences together from the tokens_interator, yielding
-
+    sequences of size `context_size`. Batching across the batch dimension is handled by the caller.
 
     Args:
         tokens_iterator: An iterator which returns a 1D tensors of tokens
@@ -76,13 +76,28 @@ def concat_and_batch_sequences(
         begin_batch_token_id: If provided, this token will be at position 0 of each batch
         begin_sequence_token_id: If provided, this token will be the first token of each sequence
         sequence_separator_token_id: If provided, this token will be inserted between concatenated sequences
-        disable_concat_sequences: If True, disable concatenating sequences and ignore sequences shorter than context_size
+        disable_concat_sequences: If True, disable concatenating sequences and ignore sequences shorter than context_size (including BOS token if present)
         max_batches: If not provided, the iterator will be run to completion.
     """
     if disable_concat_sequences:
-
-
-
+        if begin_batch_token_id and not begin_sequence_token_id:
+            begin_sequence_token_id = begin_batch_token_id
+        for sequence in tokens_iterator:
+            if (
+                begin_sequence_token_id is not None
+                and sequence[0] != begin_sequence_token_id
+                and len(sequence) >= context_size - 1
+            ):
+                begin_sequence_token_id_tensor = torch.tensor(
+                    [begin_sequence_token_id],
+                    dtype=torch.long,
+                    device=sequence.device,
+                )
+                sequence = torch.cat(
+                    [begin_sequence_token_id_tensor, sequence[: context_size - 1]]
+                )
+            if len(sequence) >= context_size:
+                yield sequence[:context_size]
         return
 
     batch: torch.Tensor | None = None
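With `disable_concat_sequences=True`, sequences are no longer dropped just because they lack a leading BOS token: the configured begin-of-sequence token is prepended when missing (if the sequence is long enough) and the result is truncated to `context_size`. A hedged sketch of that branch in isolation; `batch_without_concat` is an illustrative stand-in, not the package function:

```python
import torch


def batch_without_concat(sequences, context_size: int, bos_id: int):
    """Illustrative stand-in for the disable_concat_sequences branch above."""
    for seq in sequences:
        if seq[0] != bos_id and len(seq) >= context_size - 1:
            # Prepend BOS and keep room for it within context_size.
            seq = torch.cat([torch.tensor([bos_id]), seq[: context_size - 1]])
        if len(seq) >= context_size:
            yield seq[:context_size]


seqs = [torch.arange(10, 16), torch.arange(10, 13)]  # lengths 6 and 3
out = list(batch_without_concat(iter(seqs), context_size=5, bos_id=1))
# The 6-token sequence becomes [1, 10, 11, 12, 13]; the 3-token one is dropped.
assert [t.tolist() for t in out] == [[1, 10, 11, 12, 13]]
```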
sae_lens/training/activation_scaler.py
CHANGED
@@ -1,5 +1,6 @@
 import json
 from dataclasses import dataclass
+from pathlib import Path
 from statistics import mean
 
 import torch
@@ -51,3 +52,9 @@ class ActivationScaler:
 
         with open(file_path, "w") as f:
             json.dump({"scaling_factor": self.scaling_factor}, f)
+
+    def load(self, file_path: str | Path):
+        """load the state dict from a file in json format"""
+        with open(file_path) as f:
+            data = json.load(f)
+        self.scaling_factor = data["scaling_factor"]
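The new `ActivationScaler.load` reads back the same `{"scaling_factor": ...}` JSON layout that the class writes, so a checkpointed scaling factor can be restored directly. A small usage sketch, assuming `ActivationScaler` is constructible with its defaults:

```python
import json
from pathlib import Path

from sae_lens.training.activation_scaler import ActivationScaler

# Write the JSON layout shown in the diff, then read it back via load().
path = Path("activation_scaler.json")
path.write_text(json.dumps({"scaling_factor": 2.5}))

scaler = ActivationScaler()  # assumed default-constructible
scaler.load(path)            # load() accepts str | Path
assert scaler.scaling_factor == 2.5
```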
sae_lens/training/activations_store.py
CHANGED
@@ -4,6 +4,7 @@ import json
 import os
 import warnings
 from collections.abc import Generator, Iterator, Sequence
+from pathlib import Path
 from typing import Any, Literal, cast
 
 import datasets
@@ -13,8 +14,8 @@ from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import HfHubHTTPError
 from jaxtyping import Float, Int
 from requests import HTTPError
-from safetensors.torch import save_file
-from tqdm import tqdm
+from safetensors.torch import load_file, save_file
+from tqdm.auto import tqdm
 from transformer_lens.hook_points import HookedRootModule
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
@@ -24,12 +25,15 @@ from sae_lens.config import (
     HfDataset,
     LanguageModelSAERunnerConfig,
 )
-from sae_lens.constants import DTYPE_MAP
+from sae_lens.constants import ACTIVATIONS_STORE_STATE_FILENAME, DTYPE_MAP
 from sae_lens.pretokenize_runner import get_special_token_from_cfg
 from sae_lens.saes.sae import SAE, T_SAE_CONFIG, T_TRAINING_SAE_CONFIG
 from sae_lens.tokenization_and_batching import concat_and_batch_sequences
 from sae_lens.training.mixing_buffer import mixing_buffer
-from sae_lens.util import
+from sae_lens.util import (
+    extract_stop_at_layer_from_tlens_hook_name,
+    get_special_token_ids,
+)
 
 
 # TODO: Make an activation store config class to be consistent with the rest of the code.
@@ -113,7 +117,7 @@ class ActivationsStore:
         if exclude_special_tokens is False:
             exclude_special_tokens = None
         if exclude_special_tokens is True:
-            exclude_special_tokens =
+            exclude_special_tokens = get_special_token_ids(model.tokenizer)  # type: ignore
         if exclude_special_tokens is not None:
             exclude_special_tokens = torch.tensor(
                 exclude_special_tokens, dtype=torch.long, device=device
@@ -315,7 +319,7 @@ class ActivationsStore:
             )
         else:
             warnings.warn(
-                "Dataset is not tokenized. Pre-tokenizing will improve performance and allows for more control over special tokens. See https://
+                "Dataset is not tokenized. Pre-tokenizing will improve performance and allows for more control over special tokens. See https://decoderesearch.github.io/SAELens/training_saes/#pretokenizing-datasets for more info."
             )
 
         self.iterable_sequences = self._iterate_tokenized_sequences()
@@ -726,6 +730,48 @@ class ActivationsStore:
         """save the state dict to a file in safetensors format"""
         save_file(self.state_dict(), file_path)
 
+    def save_to_checkpoint(self, checkpoint_path: str | Path):
+        """Save the state dict to a checkpoint path"""
+        self.save(str(Path(checkpoint_path) / ACTIVATIONS_STORE_STATE_FILENAME))
+
+    def load_from_checkpoint(self, checkpoint_path: str | Path):
+        """Load the state dict from a checkpoint path"""
+        self.load(str(Path(checkpoint_path) / ACTIVATIONS_STORE_STATE_FILENAME))
+
+    def load(self, file_path: str):
+        """Load the state dict from a file in safetensors format"""
+
+        state_dict = load_file(file_path)
+
+        if "n_dataset_processed" in state_dict:
+            target_n_dataset_processed = state_dict["n_dataset_processed"].item()
+
+            # Only fast-forward if needed
+
+            if target_n_dataset_processed > self.n_dataset_processed:
+                logger.info(
+                    "Fast-forwarding through dataset samples to match checkpoint position"
+                )
+                samples_to_skip = target_n_dataset_processed - self.n_dataset_processed
+
+                pbar = tqdm(
+                    total=samples_to_skip,
+                    desc="Fast-forwarding through dataset",
+                    leave=False,
+                )
+                while target_n_dataset_processed > self.n_dataset_processed:
+                    start = self.n_dataset_processed
+                    try:
+                        # Just consume and ignore the values to fast-forward
+                        next(self.iterable_sequences)
+                    except StopIteration:
+                        logger.warning(
+                            "Dataset exhausted during fast-forward. Resetting dataset."
+                        )
+                        self.iterable_sequences = self._iterate_tokenized_sequences()
+                    pbar.update(self.n_dataset_processed - start)
+                pbar.close()
+
 
 def validate_pretokenized_dataset_tokenizer(
     dataset_path: str, model_tokenizer: PreTrainedTokenizerBase
@@ -763,31 +809,6 @@ def _get_model_device(model: HookedRootModule) -> torch.device:
     return next(model.parameters()).device  # type: ignore
 
 
-def _get_special_token_ids(tokenizer: PreTrainedTokenizerBase) -> list[int]:
-    """Get all special token IDs from a tokenizer."""
-    special_tokens = set()
-
-    # Get special tokens from tokenizer attributes
-    for attr in dir(tokenizer):
-        if attr.endswith("_token_id"):
-            token_id = getattr(tokenizer, attr)
-            if token_id is not None:
-                special_tokens.add(token_id)
-
-    # Get any additional special tokens from the tokenizer's special tokens map
-    if hasattr(tokenizer, "special_tokens_map"):
-        for token in tokenizer.special_tokens_map.values():
-            if isinstance(token, str):
-                token_id = tokenizer.convert_tokens_to_ids(token)  # type: ignore
-                special_tokens.add(token_id)
-            elif isinstance(token, list):
-                for t in token:
-                    token_id = tokenizer.convert_tokens_to_ids(t)  # type: ignore
-                    special_tokens.add(token_id)
-
-    return list(special_tokens)
-
-
 def _filter_buffer_acts(
     buffer: tuple[torch.Tensor, torch.Tensor | None],
     exclude_tokens: torch.Tensor | None,
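The new checkpoint helpers write and read a single safetensors state file named by `ACTIVATIONS_STORE_STATE_FILENAME` inside a checkpoint directory, and `load` fast-forwards the token iterator to the saved `n_dataset_processed` position. A hedged sketch of how a training script might wire them up; the store itself must already be constructed against the same dataset and is passed in here rather than built:

```python
from pathlib import Path

from sae_lens.constants import ACTIVATIONS_STORE_STATE_FILENAME
from sae_lens.training.activations_store import ActivationsStore


def checkpoint_store(store: ActivationsStore, checkpoint_path: str | Path) -> None:
    """Write the store's state file into a checkpoint directory."""
    path = Path(checkpoint_path)
    path.mkdir(parents=True, exist_ok=True)
    store.save_to_checkpoint(path)
    assert (path / ACTIVATIONS_STORE_STATE_FILENAME).exists()


def resume_store(store: ActivationsStore, checkpoint_path: str | Path) -> None:
    """Fast-forward a freshly built store to the checkpointed dataset position."""
    store.load_from_checkpoint(Path(checkpoint_path))
```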
sae_lens/training/optim.py
CHANGED
@@ -2,6 +2,8 @@
 Took the LR scheduler from my previous work: https://github.com/jbloomAus/DecisionTransformerInterpretability/blob/ee55df35cdb92e81d689c72fb9dd5a7252893363/src/decision_transformer/utils.py#L425
 """
 
+from typing import Any
+
 import torch.optim as optim
 import torch.optim.lr_scheduler as lr_scheduler
 
@@ -150,3 +152,12 @@ class CoefficientScheduler:
     def value(self) -> float:
         """Returns the current scalar value."""
        return self.current_value
+
+    def state_dict(self) -> dict[str, Any]:
+        return {
+            "current_step": self.current_step,
+        }
+
+    def load_state_dict(self, state_dict: dict[str, Any]):
+        for k in state_dict:
+            setattr(self, k, state_dict[k])