sae-lens 6.12.3__tar.gz → 6.13.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sae_lens-6.12.3 → sae_lens-6.13.1}/PKG-INFO +1 -1
- {sae_lens-6.12.3 → sae_lens-6.13.1}/pyproject.toml +1 -1
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/__init__.py +1 -1
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/evals.py +2 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/pretokenize_runner.py +2 -1
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/saes/sae.py +9 -10
- sae_lens-6.13.1/sae_lens/saes/topk_sae.py +473 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/tokenization_and_batching.py +21 -6
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/training/sae_trainer.py +7 -5
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/training/types.py +1 -1
- sae_lens-6.12.3/sae_lens/saes/topk_sae.py +0 -271
- {sae_lens-6.12.3 → sae_lens-6.13.1}/LICENSE +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/README.md +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/analysis/__init__.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/analysis/hooked_sae_transformer.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/analysis/neuronpedia_integration.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/cache_activations_runner.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/config.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/constants.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/llm_sae_training_runner.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/load_model.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/loading/__init__.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/loading/pretrained_sae_loaders.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/loading/pretrained_saes_directory.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/pretrained_saes.yaml +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/registry.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/saes/__init__.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/saes/batchtopk_sae.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/saes/gated_sae.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/saes/jumprelu_sae.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/saes/standard_sae.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/saes/transcoder.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/training/__init__.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/training/activation_scaler.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/training/activations_store.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/training/mixing_buffer.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/training/optim.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/training/upload_saes_to_huggingface.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/tutorial/tsea.py +0 -0
- {sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/util.py +0 -0
{sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/evals.py

@@ -466,6 +466,8 @@ def get_sparsity_and_variance_metrics(
         sae_out_scaled = sae.decode(sae_feature_activations).to(
             original_act_scaled.device
         )
+        if sae_feature_activations.is_sparse:
+            sae_feature_activations = sae_feature_activations.to_dense()
         del cache

         sae_out = activation_scaler.unscale(sae_out_scaled)
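Note: this guard densifies sparse TopK activations before the elementwise sparsity/variance math that follows. A minimal sketch of the same pattern, assuming only standard PyTorch sparse-COO behaviour (the tensor and variable names are illustrative, not taken from evals.py):

import torch

feature_acts = torch.tensor([[0.0, 2.0, 0.0], [1.0, 0.0, 3.0]]).to_sparse()

# Downstream elementwise metric code expects a strided (dense) tensor,
# so densify only when the encoder actually returned a sparse tensor.
if feature_acts.is_sparse:
    feature_acts = feature_acts.to_dense()

l0_per_token = (feature_acts != 0).sum(dim=-1)  # tensor([1, 2])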
{sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/pretokenize_runner.py

@@ -1,9 +1,10 @@
 import io
 import json
 import sys
+from collections.abc import Iterator
 from dataclasses import dataclass
 from pathlib import Path
-from typing import
+from typing import Literal, cast

 import torch
 from datasets import Dataset, DatasetDict, load_dataset
{sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/saes/sae.py

@@ -14,7 +14,6 @@ from typing import (
     Generic,
     Literal,
     NamedTuple,
-    Type,
     TypeVar,
 )

@@ -534,7 +533,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
     @classmethod
     @deprecated("Use load_from_disk instead")
     def load_from_pretrained(
-        cls:
+        cls: type[T_SAE],
         path: str | Path,
         device: str = "cpu",
         dtype: str | None = None,
@@ -543,7 +542,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

     @classmethod
     def load_from_disk(
-        cls:
+        cls: type[T_SAE],
         path: str | Path,
         device: str = "cpu",
         dtype: str | None = None,
@@ -564,7 +563,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

     @classmethod
     def from_pretrained(
-        cls:
+        cls: type[T_SAE],
         release: str,
         sae_id: str,
         device: str = "cpu",
@@ -585,7 +584,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

     @classmethod
     def from_pretrained_with_cfg_and_sparsity(
-        cls:
+        cls: type[T_SAE],
         release: str,
         sae_id: str,
         device: str = "cpu",
@@ -684,7 +683,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         return sae, cfg_dict, log_sparsities

     @classmethod
-    def from_dict(cls:
+    def from_dict(cls: type[T_SAE], config_dict: dict[str, Any]) -> T_SAE:
         """Create an SAE from a config dictionary."""
         sae_cls = cls.get_sae_class_for_architecture(config_dict["architecture"])
         sae_config_cls = cls.get_sae_config_class_for_architecture(
@@ -694,8 +693,8 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

     @classmethod
     def get_sae_class_for_architecture(
-        cls:
-    ) ->
+        cls: type[T_SAE], architecture: str
+    ) -> type[T_SAE]:
         """Get the SAE class for a given architecture."""
         sae_cls, _ = get_sae_class(architecture)
         if not issubclass(sae_cls, cls):
@@ -1000,8 +999,8 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):

     @classmethod
     def get_sae_class_for_architecture(
-        cls:
-    ) ->
+        cls: type[T_TRAINING_SAE], architecture: str
+    ) -> type[T_TRAINING_SAE]:
         """Get the SAE class for a given architecture."""
         sae_cls, _ = get_sae_training_class(architecture)
         if not issubclass(sae_cls, cls):
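Note: these hunks give the loader classmethods proper generic annotations (`cls: type[T_SAE]` / `type[T_TRAINING_SAE]`), so the return value is typed as the class the method is called on. A hedged usage sketch, assuming the 6.13.1 signatures shown above; the release, SAE id, and path strings are placeholders for illustration only:

from sae_lens import SAE
from sae_lens.saes.topk_sae import TopKSAE

# Called on the base class, the result is typed as SAE.
sae = SAE.from_pretrained(
    release="gpt2-small-res-jb",        # placeholder release name
    sae_id="blocks.8.hook_resid_pre",   # placeholder SAE id
    device="cpu",
)

# Called on a subclass, `cls: type[T_SAE]` lets type checkers infer TopKSAE,
# so no cast is needed after loading from disk.
topk_sae = TopKSAE.load_from_disk("path/to/topk_sae", device="cpu")  # placeholder path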
sae_lens-6.13.1/sae_lens/saes/topk_sae.py

@@ -0,0 +1,473 @@
+"""Inference-only TopKSAE variant, similar in spirit to StandardSAE but using a TopK-based activation."""
+
+from dataclasses import dataclass
+from typing import Callable
+
+import torch
+from jaxtyping import Float
+from torch import nn
+from transformer_lens.hook_points import HookPoint
+from typing_extensions import override
+
+from sae_lens.saes.sae import (
+    SAE,
+    SAEConfig,
+    TrainCoefficientConfig,
+    TrainingSAE,
+    TrainingSAEConfig,
+    TrainStepInput,
+    _disable_hooks,
+)
+
+
+class SparseHookPoint(HookPoint):
+    """
+    A HookPoint that takes in a sparse tensor.
+    Overrides TransformerLens's HookPoint.
+    """
+
+    def __init__(self, d_sae: int):
+        super().__init__()
+        self.d_sae = d_sae
+
+    @override
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        using_hooks = (
+            self._forward_hooks is not None and len(self._forward_hooks) > 0
+        ) or (self._backward_hooks is not None and len(self._backward_hooks) > 0)
+        if using_hooks and x.is_sparse:
+            return x.to_dense()
+        return x  # if no hooks are being used, use passthrough
+
+
+class TopK(nn.Module):
+    """
+    A simple TopK activation that zeroes out all but the top K elements along the last dimension,
+    and applies ReLU to the top K elements.
+    """
+
+    use_sparse_activations: bool
+
+    def __init__(
+        self,
+        k: int,
+        use_sparse_activations: bool = False,
+    ):
+        super().__init__()
+        self.k = k
+        self.use_sparse_activations = use_sparse_activations
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        1) Select top K elements along the last dimension.
+        2) Apply ReLU.
+        3) Zero out all other entries.
+        """
+        topk_values, topk_indices = torch.topk(x, k=self.k, dim=-1, sorted=False)
+        values = topk_values.relu()
+        if self.use_sparse_activations:
+            # Produce a COO sparse tensor (use sparse matrix multiply in decode)
+            original_shape = x.shape
+
+            # Create indices for all dimensions
+            # For each element in topk_indices, we need to map it back to the original tensor coordinates
+            batch_dims = original_shape[:-1]  # All dimensions except the last one
+            num_batch_elements = torch.prod(torch.tensor(batch_dims)).item()
+
+            # Create batch indices - each batch element repeated k times
+            batch_indices_flat = torch.arange(
+                num_batch_elements, device=x.device
+            ).repeat_interleave(self.k)
+
+            # Convert flat batch indices back to multi-dimensional indices
+            if len(batch_dims) == 1:
+                # 2D case: [batch, features]
+                sparse_indices = torch.stack(
+                    [
+                        batch_indices_flat,
+                        topk_indices.flatten(),
+                    ]
+                )
+            else:
+                # 3D+ case: need to unravel the batch indices
+                batch_indices_multi = []
+                remaining = batch_indices_flat
+                for dim_size in reversed(batch_dims):
+                    batch_indices_multi.append(remaining % dim_size)
+                    remaining = remaining // dim_size
+                batch_indices_multi.reverse()
+
+                sparse_indices = torch.stack(
+                    [
+                        *batch_indices_multi,
+                        topk_indices.flatten(),
+                    ]
+                )
+
+            return torch.sparse_coo_tensor(
+                sparse_indices, values.flatten(), original_shape
+            )
+        result = torch.zeros_like(x)
+        result.scatter_(-1, topk_indices, values)
+        return result
+
+
+@dataclass
+class TopKSAEConfig(SAEConfig):
+    """
+    Configuration class for a TopKSAE.
+    """
+
+    k: int = 100
+
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "topk"
+
+
+def _sparse_matmul_nd(
+    sparse_tensor: torch.Tensor, dense_matrix: torch.Tensor
+) -> torch.Tensor:
+    """
+    Multiply a sparse tensor of shape [..., d_sae] with a dense matrix of shape [d_sae, d_out]
+    to get a result of shape [..., d_out].
+
+    This function handles sparse tensors with arbitrary batch dimensions by flattening
+    the batch dimensions, performing 2D sparse matrix multiplication, and reshaping back.
+    """
+    original_shape = sparse_tensor.shape
+    batch_dims = original_shape[:-1]
+    d_sae = original_shape[-1]
+    d_out = dense_matrix.shape[-1]
+
+    if sparse_tensor.ndim == 2:
+        # Simple 2D case - use torch.sparse.mm directly
+        # sparse.mm errors with bfloat16 :(
+        with torch.autocast(device_type=sparse_tensor.device.type, enabled=False):
+            return torch.sparse.mm(sparse_tensor, dense_matrix)
+
+    # For 3D+ case, reshape to 2D, multiply, then reshape back
+    batch_size = int(torch.prod(torch.tensor(batch_dims)).item())
+
+    # Ensure tensor is coalesced for efficient access to indices/values
+    if not sparse_tensor.is_coalesced():
+        sparse_tensor = sparse_tensor.coalesce()
+
+    # Get indices and values
+    indices = sparse_tensor.indices()  # [ndim, nnz]
+    values = sparse_tensor.values()  # [nnz]
+
+    # Convert multi-dimensional batch indices to flat indices
+    flat_batch_indices = torch.zeros_like(indices[0])
+    multiplier = 1
+    for i in reversed(range(len(batch_dims))):
+        flat_batch_indices += indices[i] * multiplier
+        multiplier *= batch_dims[i]
+
+    # Create 2D sparse tensor indices [batch_flat, feature]
+    sparse_2d_indices = torch.stack([flat_batch_indices, indices[-1]])
+
+    # Create 2D sparse tensor
+    sparse_2d = torch.sparse_coo_tensor(
+        sparse_2d_indices, values, (batch_size, d_sae)
+    ).coalesce()
+
+    # sparse.mm errors with bfloat16 :(
+    with torch.autocast(device_type=sparse_tensor.device.type, enabled=False):
+        # Do the matrix multiplication
+        result_2d = torch.sparse.mm(sparse_2d, dense_matrix)  # [batch_size, d_out]
+
+    # Reshape back to original batch dimensions
+    result_shape = tuple(batch_dims) + (d_out,)
+    return result_2d.view(result_shape)
+
+
+class TopKSAE(SAE[TopKSAEConfig]):
+    """
+    An inference-only sparse autoencoder using a "topk" activation function.
+    It uses linear encoder and decoder layers, applying the TopK activation
+    to the hidden pre-activation in its encode step.
+    """
+
+    b_enc: nn.Parameter
+
+    def __init__(self, cfg: TopKSAEConfig, use_error_term: bool = False):
+        """
+        Args:
+            cfg: SAEConfig defining model size and behavior.
+            use_error_term: Whether to apply the error-term approach in the forward pass.
+        """
+        super().__init__(cfg, use_error_term)
+
+    @override
+    def initialize_weights(self) -> None:
+        # Initialize encoder weights and bias.
+        super().initialize_weights()
+        _init_weights_topk(self)
+
+    def encode(
+        self, x: Float[torch.Tensor, "... d_in"]
+    ) -> Float[torch.Tensor, "... d_sae"]:
+        """
+        Converts input x into feature activations.
+        Uses topk activation under the hood.
+        """
+        sae_in = self.process_sae_in(x)
+        hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
+        # The BaseSAE already sets self.activation_fn to TopK(...) if config requests topk.
+        return self.hook_sae_acts_post(self.activation_fn(hidden_pre))
+
+    def decode(
+        self,
+        feature_acts: Float[torch.Tensor, "... d_sae"],
+    ) -> Float[torch.Tensor, "... d_in"]:
+        """
+        Reconstructs the input from topk feature activations.
+        Applies optional finetuning scaling, hooking to recons, out normalization,
+        and optional head reshaping.
+        """
+        # Handle sparse tensors using efficient sparse matrix multiplication
+        if feature_acts.is_sparse:
+            sae_out_pre = _sparse_matmul_nd(feature_acts, self.W_dec) + self.b_dec
+        else:
+            sae_out_pre = feature_acts @ self.W_dec + self.b_dec
+        sae_out_pre = self.hook_sae_recons(sae_out_pre)
+        sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
+        return self.reshape_fn_out(sae_out_pre, self.d_head)
+
+    @override
+    def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
+        return TopK(self.cfg.k, use_sparse_activations=False)
+
+    @override
+    @torch.no_grad()
+    def fold_W_dec_norm(self) -> None:
+        raise NotImplementedError(
+            "Folding W_dec_norm is not safe for TopKSAEs, as this may change the topk activations"
+        )
+
+
+@dataclass
+class TopKTrainingSAEConfig(TrainingSAEConfig):
+    """
+    Configuration class for training a TopKTrainingSAE.
+
+    Args:
+        k (int): Number of top features to keep active. Only the top k features
+            with the highest pre-activations will be non-zero. Defaults to 100.
+        use_sparse_activations (bool): Whether to use sparse tensor representations
+            for activations during training. This can reduce memory usage and improve
+            performance when k is small relative to d_sae, but is only worthwhile if
+            using float32 and not using autocast. Defaults to False.
+        aux_loss_coefficient (float): Coefficient for the auxiliary loss that encourages
+            dead neurons to learn useful features. This loss helps prevent neuron death
+            in TopK SAEs by having dead neurons reconstruct the residual error from
+            live neurons. Defaults to 1.0.
+        decoder_init_norm (float | None): Norm to initialize decoder weights to.
+            0.1 corresponds to the "heuristic" initialization from Anthropic's April update.
+            Use None to disable. Inherited from TrainingSAEConfig. Defaults to 0.1.
+        d_in (int): Input dimension (dimensionality of the activations being encoded).
+            Inherited from SAEConfig.
+        d_sae (int): SAE latent dimension (number of features in the SAE).
+            Inherited from SAEConfig.
+        dtype (str): Data type for the SAE parameters. Inherited from SAEConfig.
+            Defaults to "float32".
+        device (str): Device to place the SAE on. Inherited from SAEConfig.
+            Defaults to "cpu".
+        apply_b_dec_to_input (bool): Whether to apply decoder bias to the input
+            before encoding. Inherited from SAEConfig. Defaults to True.
+        normalize_activations (Literal["none", "expected_average_only_in", "constant_norm_rescale", "layer_norm"]):
+            Normalization strategy for input activations. Inherited from SAEConfig.
+            Defaults to "none".
+        reshape_activations (Literal["none", "hook_z"]): How to reshape activations
+            (useful for attention head outputs). Inherited from SAEConfig.
+            Defaults to "none".
+        metadata (SAEMetadata): Metadata about the SAE training (model name, hook name, etc.).
+            Inherited from SAEConfig.
+    """
+
+    k: int = 100
+    use_sparse_activations: bool = False
+    aux_loss_coefficient: float = 1.0
+
+    @override
+    @classmethod
+    def architecture(cls) -> str:
+        return "topk"
+
+
+class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
+    """
+    TopK variant with training functionality. Calculates a topk-related auxiliary loss, etc.
+    """
+
+    b_enc: nn.Parameter
+
+    def __init__(self, cfg: TopKTrainingSAEConfig, use_error_term: bool = False):
+        super().__init__(cfg, use_error_term)
+        self.hook_sae_acts_post = SparseHookPoint(self.cfg.d_sae)
+        self.setup()
+
+    @override
+    def initialize_weights(self) -> None:
+        super().initialize_weights()
+        _init_weights_topk(self)
+
+    def encode_with_hidden_pre(
+        self, x: Float[torch.Tensor, "... d_in"]
+    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        """
+        Similar to the base training method: calculate pre-activations, then apply TopK.
+        """
+        sae_in = self.process_sae_in(x)
+        hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
+
+        # Apply the TopK activation function (already set in self.activation_fn if config is "topk")
+        feature_acts = self.hook_sae_acts_post(self.activation_fn(hidden_pre))
+        return feature_acts, hidden_pre
+
+    @override
+    def decode(
+        self,
+        feature_acts: Float[torch.Tensor, "... d_sae"],
+    ) -> Float[torch.Tensor, "... d_in"]:
+        """
+        Decodes feature activations back into input space,
+        applying optional finetuning scale, hooking, out normalization, etc.
+        """
+        # Handle sparse tensors using efficient sparse matrix multiplication
+        if feature_acts.is_sparse:
+            sae_out_pre = _sparse_matmul_nd(feature_acts, self.W_dec) + self.b_dec
+        else:
+            sae_out_pre = feature_acts @ self.W_dec + self.b_dec
+        sae_out_pre = self.hook_sae_recons(sae_out_pre)
+        sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
+        return self.reshape_fn_out(sae_out_pre, self.d_head)
+
+    @override
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass through the SAE."""
+        feature_acts = self.encode(x)
+        sae_out = self.decode(feature_acts)
+
+        if self.use_error_term:
+            with torch.no_grad():
+                # Recompute without hooks for true error term
+                with _disable_hooks(self):
+                    feature_acts_clean = self.encode(x)
+                    x_reconstruct_clean = self.decode(feature_acts_clean)
+                sae_error = self.hook_sae_error(x - x_reconstruct_clean)
+            sae_out = sae_out + sae_error
+
+        return self.hook_sae_output(sae_out)
+
+    @override
+    def calculate_aux_loss(
+        self,
+        step_input: TrainStepInput,
+        feature_acts: torch.Tensor,
+        hidden_pre: torch.Tensor,
+        sae_out: torch.Tensor,
+    ) -> dict[str, torch.Tensor]:
+        # Calculate the auxiliary loss for dead neurons
+        topk_loss = self.calculate_topk_aux_loss(
+            sae_in=step_input.sae_in,
+            sae_out=sae_out,
+            hidden_pre=hidden_pre,
+            dead_neuron_mask=step_input.dead_neuron_mask,
+        )
+        return {"auxiliary_reconstruction_loss": topk_loss}
+
+    @override
+    @torch.no_grad()
+    def fold_W_dec_norm(self) -> None:
+        raise NotImplementedError(
+            "Folding W_dec_norm is not safe for TopKSAEs, as this may change the topk activations"
+        )
+
+    @override
+    def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
+        return TopK(self.cfg.k, use_sparse_activations=self.cfg.use_sparse_activations)
+
+    @override
+    def get_coefficients(self) -> dict[str, TrainCoefficientConfig | float]:
+        return {}
+
+    def calculate_topk_aux_loss(
+        self,
+        sae_in: torch.Tensor,
+        sae_out: torch.Tensor,
+        hidden_pre: torch.Tensor,
+        dead_neuron_mask: torch.Tensor | None,
+    ) -> torch.Tensor:
+        """
+        Calculate TopK auxiliary loss.
+
+        This auxiliary loss encourages dead neurons to learn useful features by having
+        them reconstruct the residual error from the live neurons. It's a key part of
+        preventing neuron death in TopK SAEs.
+        """
+        # Mostly taken from https://github.com/EleutherAI/sae/blob/main/sae/sae.py, except without variance normalization
+        # NOTE: checking the number of dead neurons will force a GPU sync, so performance can likely be improved here
+        if dead_neuron_mask is None or (num_dead := int(dead_neuron_mask.sum())) == 0:
+            return sae_out.new_tensor(0.0)
+        residual = (sae_in - sae_out).detach()
+
+        # Heuristic from Appendix B.1 in the paper
+        k_aux = sae_in.shape[-1] // 2
+
+        # Reduce the scale of the loss if there are a small number of dead latents
+        scale = min(num_dead / k_aux, 1.0)
+        k_aux = min(k_aux, num_dead)
+
+        auxk_acts = _calculate_topk_aux_acts(
+            k_aux=k_aux,
+            hidden_pre=hidden_pre,
+            dead_neuron_mask=dead_neuron_mask,
+        )
+
+        # Encourage the top ~50% of dead latents to predict the residual of the
+        # top k living latents
+        recons = self.decode(auxk_acts)
+        auxk_loss = (recons - residual).pow(2).sum(dim=-1).mean()
+        return self.cfg.aux_loss_coefficient * scale * auxk_loss
+
+
+def _calculate_topk_aux_acts(
+    k_aux: int,
+    hidden_pre: torch.Tensor,
+    dead_neuron_mask: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Helper method to calculate activations for the auxiliary loss.
+
+    Args:
+        k_aux: Number of top dead neurons to select
+        hidden_pre: Pre-activation values from encoder
+        dead_neuron_mask: Boolean mask indicating which neurons are dead
+
+    Returns:
+        Tensor with activations for only the top-k dead neurons, zeros elsewhere
+    """
+
+    # Don't include living latents in this loss
+    auxk_latents = torch.where(dead_neuron_mask[None], hidden_pre, -torch.inf)
+    # Top-k dead latents
+    auxk_topk = auxk_latents.topk(k_aux, sorted=False)
+    # Set the activations to zero for all but the top k_aux dead latents
+    auxk_acts = torch.zeros_like(hidden_pre)
+    auxk_acts.scatter_(-1, auxk_topk.indices, auxk_topk.values)
+    # Set activations to zero for all but top k_aux dead latents
+    return auxk_acts
+
+
+def _init_weights_topk(
+    sae: SAE[TopKSAEConfig] | TrainingSAE[TopKTrainingSAEConfig],
+) -> None:
+    sae.b_enc = nn.Parameter(
+        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
+    )
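Note: the central addition in this file is the `use_sparse_activations` path of `TopK`, which returns a COO sparse tensor so `decode` can fall back to `torch.sparse.mm` via `_sparse_matmul_nd`. A small sketch exercising the module exactly as defined above (assuming the new import path `sae_lens.saes.topk_sae`), checking that the sparse and dense paths agree:

import torch
from sae_lens.saes.topk_sae import TopK

torch.manual_seed(0)
x = torch.randn(2, 16)

dense_topk = TopK(k=4, use_sparse_activations=False)
sparse_topk = TopK(k=4, use_sparse_activations=True)

dense_out = dense_topk(x)    # strided tensor, at most 4 nonzeros per row
sparse_out = sparse_topk(x)  # sparse COO tensor with 2 * 4 stored values

assert not dense_out.is_sparse and sparse_out.is_sparse
assert torch.allclose(sparse_out.to_dense(), dense_out)
assert int((dense_out != 0).sum(dim=-1).max()) <= 4

For the auxiliary loss, the arithmetic is unchanged from the old file: with example numbers, `d_in = 768` gives the heuristic `k_aux = 768 // 2 = 384`, and 96 dead latents give `scale = min(96 / 384, 1.0) = 0.25` before multiplying by `aux_loss_coefficient`.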
{sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/tokenization_and_batching.py

@@ -1,4 +1,4 @@
-from
+from collections.abc import Generator, Iterator

 import torch

@@ -68,7 +68,7 @@ def concat_and_batch_sequences(
 ) -> Generator[torch.Tensor, None, None]:
     """
     Generator to concat token sequences together from the tokens_interator, yielding
-
+    sequences of size `context_size`. Batching across the batch dimension is handled by the caller.

     Args:
         tokens_iterator: An iterator which returns a 1D tensors of tokens
@@ -76,13 +76,28 @@ def concat_and_batch_sequences(
         begin_batch_token_id: If provided, this token will be at position 0 of each batch
         begin_sequence_token_id: If provided, this token will be the first token of each sequence
         sequence_separator_token_id: If provided, this token will be inserted between concatenated sequences
-        disable_concat_sequences: If True, disable concatenating sequences and ignore sequences shorter than context_size
+        disable_concat_sequences: If True, disable concatenating sequences and ignore sequences shorter than context_size (including BOS token if present)
         max_batches: If not provided, the iterator will be run to completion.
     """
     if disable_concat_sequences:
-
-
-
+        if begin_batch_token_id and not begin_sequence_token_id:
+            begin_sequence_token_id = begin_batch_token_id
+        for sequence in tokens_iterator:
+            if (
+                begin_sequence_token_id is not None
+                and sequence[0] != begin_sequence_token_id
+                and len(sequence) >= context_size - 1
+            ):
+                begin_sequence_token_id_tensor = torch.tensor(
+                    [begin_sequence_token_id],
+                    dtype=torch.long,
+                    device=sequence.device,
+                )
+                sequence = torch.cat(
+                    [begin_sequence_token_id_tensor, sequence[: context_size - 1]]
+                )
+            if len(sequence) >= context_size:
+                yield sequence[:context_size]
         return

     batch: torch.Tensor | None = None
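Note: the rewritten `disable_concat_sequences` branch now yields sequences of exactly `context_size` tokens instead of the previous (collapsed) body: a sequence of at least `context_size - 1` tokens that lacks the begin-of-sequence token gets it prepended, and anything still shorter than `context_size` is skipped. A standalone sketch of that logic with toy values (the token ids and `context_size` are made up for illustration, not taken from the library):

import torch

context_size = 5
bos_token_id = 0  # stands in for begin_sequence_token_id / begin_batch_token_id

sequences = [
    torch.tensor([0, 11, 12, 13, 14, 15]),  # has BOS and is long enough -> truncated
    torch.tensor([21, 22, 23, 24]),         # no BOS, len == context_size - 1 -> BOS prepended
    torch.tensor([31, 32]),                 # too short -> skipped
]

kept = []
for sequence in sequences:
    if sequence[0] != bos_token_id and len(sequence) >= context_size - 1:
        bos = torch.tensor([bos_token_id], dtype=torch.long, device=sequence.device)
        sequence = torch.cat([bos, sequence[: context_size - 1]])
    if len(sequence) >= context_size:
        kept.append(sequence[:context_size])

# kept == [tensor([0, 11, 12, 13, 14]), tensor([0, 21, 22, 23, 24])]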
{sae_lens-6.12.3 → sae_lens-6.13.1}/sae_lens/training/sae_trainer.py

@@ -253,12 +253,14 @@ class SAETrainer(Generic[T_TRAINING_SAE, T_TRAINING_SAE_CONFIG]):
         )

         with torch.no_grad():
-
+            # calling .bool() should be equivalent to .abs() > 0, and work with coo tensors
+            firing_feats = train_step_output.feature_acts.bool().float()
+            did_fire = firing_feats.sum(-2).bool()
+            if did_fire.is_sparse:
+                did_fire = did_fire.to_dense()
             self.n_forward_passes_since_fired += 1
             self.n_forward_passes_since_fired[did_fire] = 0
-            self.act_freq_scores += (
-                (train_step_output.feature_acts.abs() > 0).float().sum(0)
-            )
+            self.act_freq_scores += firing_feats.sum(0)
             self.n_frac_active_samples += self.cfg.train_batch_size_samples

         # Grad scaler will rescale gradients if autocast is enabled
@@ -310,7 +312,7 @@ class SAETrainer(Generic[T_TRAINING_SAE, T_TRAINING_SAE_CONFIG]):
         loss = output.loss.item()

         # metrics for currents acts
-        l0 = (
+        l0 = feature_acts.bool().float().sum(-1).to_dense().mean()
         current_learning_rate = self.optimizer.param_groups[0]["lr"]

         per_token_l2_loss = (sae_out - sae_in).pow(2).sum(dim=-1).squeeze()
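Note: the firing statistics are now computed via `.bool()` so they also work when `feature_acts` is a sparse COO tensor from the TopK sparse path. A minimal check of the equivalence the new code relies on, using plain PyTorch and toy values (the trainer itself keeps the intermediate tensor sparse and only densifies `did_fire`; this sketch densifies earlier for simplicity):

import torch

dense_acts = torch.tensor([[0.0, 1.5, 0.0], [2.0, 0.0, 0.0]])
sparse_acts = dense_acts.to_sparse()  # COO layout, as produced by TopK(use_sparse_activations=True)

firing_dense = (dense_acts.abs() > 0).float()
firing_sparse = sparse_acts.bool().float()  # same firing mask, computed without densifying first

assert torch.equal(firing_sparse.to_dense(), firing_dense)

# Mean L0 over the batch, as in the updated metrics logging.
l0 = firing_sparse.to_dense().sum(-1).mean()
assert l0.item() == 1.0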
sae_lens-6.12.3/sae_lens/saes/topk_sae.py

@@ -1,271 +0,0 @@
-"""Inference-only TopKSAE variant, similar in spirit to StandardSAE but using a TopK-based activation."""
-
-from dataclasses import dataclass
-from typing import Callable
-
-import torch
-from jaxtyping import Float
-from torch import nn
-from typing_extensions import override
-
-from sae_lens.saes.sae import (
-    SAE,
-    SAEConfig,
-    TrainCoefficientConfig,
-    TrainingSAE,
-    TrainingSAEConfig,
-    TrainStepInput,
-)
-
-
-class TopK(nn.Module):
-    """
-    A simple TopK activation that zeroes out all but the top K elements along the last dimension,
-    and applies ReLU to the top K elements.
-    """
-
-    b_enc: nn.Parameter
-
-    def __init__(
-        self,
-        k: int,
-    ):
-        super().__init__()
-        self.k = k
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        1) Select top K elements along the last dimension.
-        2) Apply ReLU.
-        3) Zero out all other entries.
-        """
-        topk = torch.topk(x, k=self.k, dim=-1)
-        values = topk.values.relu()
-        result = torch.zeros_like(x)
-        result.scatter_(-1, topk.indices, values)
-        return result
-
-
-@dataclass
-class TopKSAEConfig(SAEConfig):
-    """
-    Configuration class for a TopKSAE.
-    """
-
-    k: int = 100
-
-    @override
-    @classmethod
-    def architecture(cls) -> str:
-        return "topk"
-
-
-class TopKSAE(SAE[TopKSAEConfig]):
-    """
-    An inference-only sparse autoencoder using a "topk" activation function.
-    It uses linear encoder and decoder layers, applying the TopK activation
-    to the hidden pre-activation in its encode step.
-    """
-
-    b_enc: nn.Parameter
-
-    def __init__(self, cfg: TopKSAEConfig, use_error_term: bool = False):
-        """
-        Args:
-            cfg: SAEConfig defining model size and behavior.
-            use_error_term: Whether to apply the error-term approach in the forward pass.
-        """
-        super().__init__(cfg, use_error_term)
-
-    @override
-    def initialize_weights(self) -> None:
-        # Initialize encoder weights and bias.
-        super().initialize_weights()
-        _init_weights_topk(self)
-
-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
-        """
-        Converts input x into feature activations.
-        Uses topk activation under the hood.
-        """
-        sae_in = self.process_sae_in(x)
-        hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
-        # The BaseSAE already sets self.activation_fn to TopK(...) if config requests topk.
-        return self.hook_sae_acts_post(self.activation_fn(hidden_pre))
-
-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
-        """
-        Reconstructs the input from topk feature activations.
-        Applies optional finetuning scaling, hooking to recons, out normalization,
-        and optional head reshaping.
-        """
-        sae_out_pre = feature_acts @ self.W_dec + self.b_dec
-        sae_out_pre = self.hook_sae_recons(sae_out_pre)
-        sae_out_pre = self.run_time_activation_norm_fn_out(sae_out_pre)
-        return self.reshape_fn_out(sae_out_pre, self.d_head)
-
-    @override
-    def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
-        return TopK(self.cfg.k)
-
-    @override
-    @torch.no_grad()
-    def fold_W_dec_norm(self) -> None:
-        raise NotImplementedError(
-            "Folding W_dec_norm is not safe for TopKSAEs, as this may change the topk activations"
-        )
-
-
-@dataclass
-class TopKTrainingSAEConfig(TrainingSAEConfig):
-    """
-    Configuration class for training a TopKTrainingSAE.
-    """
-
-    k: int = 100
-    aux_loss_coefficient: float = 1.0
-
-    @override
-    @classmethod
-    def architecture(cls) -> str:
-        return "topk"
-
-
-class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
-    """
-    TopK variant with training functionality. Calculates a topk-related auxiliary loss, etc.
-    """
-
-    b_enc: nn.Parameter
-
-    def __init__(self, cfg: TopKTrainingSAEConfig, use_error_term: bool = False):
-        super().__init__(cfg, use_error_term)
-
-    @override
-    def initialize_weights(self) -> None:
-        super().initialize_weights()
-        _init_weights_topk(self)
-
-    def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
-        """
-        Similar to the base training method: calculate pre-activations, then apply TopK.
-        """
-        sae_in = self.process_sae_in(x)
-        hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
-
-        # Apply the TopK activation function (already set in self.activation_fn if config is "topk")
-        feature_acts = self.hook_sae_acts_post(self.activation_fn(hidden_pre))
-        return feature_acts, hidden_pre
-
-    @override
-    def calculate_aux_loss(
-        self,
-        step_input: TrainStepInput,
-        feature_acts: torch.Tensor,
-        hidden_pre: torch.Tensor,
-        sae_out: torch.Tensor,
-    ) -> dict[str, torch.Tensor]:
-        # Calculate the auxiliary loss for dead neurons
-        topk_loss = self.calculate_topk_aux_loss(
-            sae_in=step_input.sae_in,
-            sae_out=sae_out,
-            hidden_pre=hidden_pre,
-            dead_neuron_mask=step_input.dead_neuron_mask,
-        )
-        return {"auxiliary_reconstruction_loss": topk_loss}
-
-    @override
-    @torch.no_grad()
-    def fold_W_dec_norm(self) -> None:
-        raise NotImplementedError(
-            "Folding W_dec_norm is not safe for TopKSAEs, as this may change the topk activations"
-        )
-
-    @override
-    def get_activation_fn(self) -> Callable[[torch.Tensor], torch.Tensor]:
-        return TopK(self.cfg.k)
-
-    @override
-    def get_coefficients(self) -> dict[str, TrainCoefficientConfig | float]:
-        return {}
-
-    def calculate_topk_aux_loss(
-        self,
-        sae_in: torch.Tensor,
-        sae_out: torch.Tensor,
-        hidden_pre: torch.Tensor,
-        dead_neuron_mask: torch.Tensor | None,
-    ) -> torch.Tensor:
-        """
-        Calculate TopK auxiliary loss.
-
-        This auxiliary loss encourages dead neurons to learn useful features by having
-        them reconstruct the residual error from the live neurons. It's a key part of
-        preventing neuron death in TopK SAEs.
-        """
-        # Mostly taken from https://github.com/EleutherAI/sae/blob/main/sae/sae.py, except without variance normalization
-        # NOTE: checking the number of dead neurons will force a GPU sync, so performance can likely be improved here
-        if dead_neuron_mask is None or (num_dead := int(dead_neuron_mask.sum())) == 0:
-            return sae_out.new_tensor(0.0)
-        residual = (sae_in - sae_out).detach()
-
-        # Heuristic from Appendix B.1 in the paper
-        k_aux = sae_in.shape[-1] // 2
-
-        # Reduce the scale of the loss if there are a small number of dead latents
-        scale = min(num_dead / k_aux, 1.0)
-        k_aux = min(k_aux, num_dead)
-
-        auxk_acts = _calculate_topk_aux_acts(
-            k_aux=k_aux,
-            hidden_pre=hidden_pre,
-            dead_neuron_mask=dead_neuron_mask,
-        )
-
-        # Encourage the top ~50% of dead latents to predict the residual of the
-        # top k living latents
-        recons = self.decode(auxk_acts)
-        auxk_loss = (recons - residual).pow(2).sum(dim=-1).mean()
-        return self.cfg.aux_loss_coefficient * scale * auxk_loss
-
-
-def _calculate_topk_aux_acts(
-    k_aux: int,
-    hidden_pre: torch.Tensor,
-    dead_neuron_mask: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Helper method to calculate activations for the auxiliary loss.
-
-    Args:
-        k_aux: Number of top dead neurons to select
-        hidden_pre: Pre-activation values from encoder
-        dead_neuron_mask: Boolean mask indicating which neurons are dead
-
-    Returns:
-        Tensor with activations for only the top-k dead neurons, zeros elsewhere
-    """
-
-    # Don't include living latents in this loss
-    auxk_latents = torch.where(dead_neuron_mask[None], hidden_pre, -torch.inf)
-    # Top-k dead latents
-    auxk_topk = auxk_latents.topk(k_aux, sorted=False)
-    # Set the activations to zero for all but the top k_aux dead latents
-    auxk_acts = torch.zeros_like(hidden_pre)
-    auxk_acts.scatter_(-1, auxk_topk.indices, auxk_topk.values)
-    # Set activations to zero for all but top k_aux dead latents
-    return auxk_acts
-
-
-def _init_weights_topk(
-    sae: SAE[TopKSAEConfig] | TrainingSAE[TopKTrainingSAEConfig],
-) -> None:
-    sae.b_enc = nn.Parameter(
-        torch.zeros(sae.cfg.d_sae, dtype=sae.dtype, device=sae.device)
-    )
The remaining 29 files listed above with +0 -0 are unchanged between 6.12.3 and 6.13.1.