sae-lens 6.3.1__py3-none-any.whl → 6.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sae_lens/__init__.py +16 -1
- sae_lens/config.py +10 -1
- sae_lens/loading/pretrained_sae_loaders.py +141 -0
- sae_lens/pretrained_saes.yaml +110 -0
- sae_lens/saes/__init__.py +14 -0
- sae_lens/saes/sae.py +1 -1
- sae_lens/saes/transcoder.py +365 -0
- {sae_lens-6.3.1.dist-info → sae_lens-6.5.0.dist-info}/METADATA +1 -1
- {sae_lens-6.3.1.dist-info → sae_lens-6.5.0.dist-info}/RECORD +11 -10
- {sae_lens-6.3.1.dist-info → sae_lens-6.5.0.dist-info}/LICENSE +0 -0
- {sae_lens-6.3.1.dist-info → sae_lens-6.5.0.dist-info}/WHEEL +0 -0
sae_lens/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.3.1"
+__version__ = "6.5.0"
 
 import logging
 
@@ -17,7 +17,11 @@ from sae_lens.saes import (
     JumpReLUSAEConfig,
     JumpReLUTrainingSAE,
     JumpReLUTrainingSAEConfig,
+    JumpReLUTranscoder,
+    JumpReLUTranscoderConfig,
     SAEConfig,
+    SkipTranscoder,
+    SkipTranscoderConfig,
     StandardSAE,
     StandardSAEConfig,
     StandardTrainingSAE,
@@ -28,6 +32,8 @@ from sae_lens.saes import (
     TopKTrainingSAEConfig,
     TrainingSAE,
     TrainingSAEConfig,
+    Transcoder,
+    TranscoderConfig,
 )
 
 from .analysis.hooked_sae_transformer import HookedSAETransformer
@@ -89,6 +95,12 @@ __all__ = [
     "LoggingConfig",
     "BatchTopKTrainingSAE",
     "BatchTopKTrainingSAEConfig",
+    "Transcoder",
+    "TranscoderConfig",
+    "SkipTranscoder",
+    "SkipTranscoderConfig",
+    "JumpReLUTranscoder",
+    "JumpReLUTranscoderConfig",
 ]
 
 
@@ -103,3 +115,6 @@ register_sae_training_class("jumprelu", JumpReLUTrainingSAE, JumpReLUTrainingSAE
 register_sae_training_class(
     "batchtopk", BatchTopKTrainingSAE, BatchTopKTrainingSAEConfig
 )
+register_sae_class("transcoder", Transcoder, TranscoderConfig)
+register_sae_class("skip_transcoder", SkipTranscoder, SkipTranscoderConfig)
+register_sae_class("jumprelu_transcoder", JumpReLUTranscoder, JumpReLUTranscoderConfig)
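With these registrations in place, the new transcoder variants load through the same entry point as any other SAE. A minimal sketch (the release and SAE ids come from the pretrained_saes.yaml entry added later in this diff; the exact return convention of SAE.from_pretrained in 6.x is assumed, not shown here):

    from sae_lens import SAE

    # Release and sae_id are defined in the pretrained_saes.yaml entry below.
    transcoder = SAE.from_pretrained(
        release="gemma-scope-2b-pt-transcoders",
        sae_id="layer_0/width_16k/average_l0_76",
        device="cpu",
    )
    print(type(transcoder).__name__)  # JumpReLUTranscoder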
sae_lens/config.py
CHANGED
@@ -1,5 +1,6 @@
 import json
 import math
+import warnings
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar, cast
@@ -125,7 +126,7 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
         model_name (str): The name of the model to use. This should be the name of the model in the Hugging Face model hub.
         model_class_name (str): The name of the class of the model to use. This should be either `HookedTransformer` or `HookedMamba`.
         hook_name (str): The name of the hook to use. This should be a valid TransformerLens hook.
-        hook_eval (str): NOT CURRENTLY IN USE. The name of the hook to use for evaluation.
+        hook_eval (str): DEPRECATED: Will be removed in v7.0.0. NOT CURRENTLY IN USE. The name of the hook to use for evaluation.
         hook_head_index (int, optional): When the hook is for an activation with a head index, we can specify a specific head to use here.
         dataset_path (str): A Hugging Face dataset path.
         dataset_trust_remote_code (bool): Whether to trust remote code when loading datasets from Huggingface.
@@ -264,6 +265,14 @@ class LanguageModelSAERunnerConfig(Generic[T_TRAINING_SAE_CONFIG]):
     exclude_special_tokens: bool | list[int] = False
 
     def __post_init__(self):
+        if self.hook_eval != "NOT_IN_USE":
+            warnings.warn(
+                "The 'hook_eval' field is deprecated and will be removed in v7.0.0. "
+                "It is not currently used and can be safely removed from your config.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
         if self.use_cached_activations and self.cached_activations_path is None:
            self.cached_activations_path = _default_cached_activations_path(
                self.dataset_path,
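A sketch of the new __post_init__ behavior: any value other than the "NOT_IN_USE" sentinel now emits a DeprecationWarning. The sae= keyword and the StandardTrainingSAEConfig arguments below are assumptions about the surrounding 6.x config API, which this diff does not show:

    import warnings

    from sae_lens import LanguageModelSAERunnerConfig, StandardTrainingSAEConfig

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        LanguageModelSAERunnerConfig(
            sae=StandardTrainingSAEConfig(d_in=512, d_sae=4096),  # assumed required field
            hook_eval="blocks.0.hook_resid_post",  # anything but "NOT_IN_USE" warns
        )

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)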
sae_lens/loading/pretrained_sae_loaders.py
CHANGED
@@ -41,6 +41,8 @@ LLM_METADATA_KEYS = {
     "dataset_path",
     "sae_lens_version",
     "sae_lens_training_version",
+    "hook_name_out",
+    "hook_head_index_out",
 }
 
 
@@ -1092,6 +1094,143 @@ def sparsify_disk_loader(
     return cfg_dict, state_dict
 
 
+def get_gemma_2_transcoder_config_from_hf(
+    repo_id: str,
+    folder_name: str,
+    device: str | None = None,
+    force_download: bool = False,  # noqa: ARG001
+    cfg_overrides: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Get config for Gemma-2 transcoders"""
+    width_map = {
+        "width_4k": 4096,
+        "width_16k": 16384,
+        "width_65k": 65536,
+        "width_262k": 262144,
+        "width_524k": 524288,
+        "width_1m": 1048576,
+    }
+
+    # Extract width from folder name
+    d_sae = None
+    for width_key, width_value in width_map.items():
+        if width_key in folder_name:
+            d_sae = width_value
+            break
+
+    if d_sae is None:
+        # Try to extract from pattern like "width_16k"
+        match = re.search(r"width_(\d+)k", folder_name)
+        if match:
+            d_sae = int(match.group(1)) * 1024
+        else:
+            raise ValueError(
+                f"Could not extract dictionary size from folder name: {folder_name}"
+            )
+
+    # Extract layer
+    layer_match = re.search(r"layer_(\d+)", folder_name)
+    if layer_match:
+        layer = int(layer_match.group(1))
+    else:
+        layer_match = re.search(r"layer_(\d+)", repo_id)
+        if layer_match:
+            layer = int(layer_match.group(1))
+        else:
+            raise ValueError("Could not extract layer index")
+
+    # Determine model and dimensions from repo_id
+    model_configs = {
+        "2b-it": ("gemma-2-2b-it", 2304),
+        "2b": ("gemma-2-2b", 2304),
+        "9b-it": ("gemma-2-9b-it", 3584),
+        "9b": ("gemma-2-9b", 3584),
+        "27b-it": ("gemma-2-27b-it", 4608),
+        "27b": ("gemma-2-27b", 4608),
+    }
+
+    model_name = None
+    d_model = None
+    for model_key, (name, dim) in model_configs.items():
+        if model_key in repo_id:
+            model_name = name
+            d_model = dim
+            break
+
+    if model_name is None:
+        raise ValueError(f"Could not determine model from repo_id: {repo_id}")
+
+    return {
+        "architecture": "jumprelu_transcoder",
+        "d_in": d_model,
+        "d_out": d_model,
+        "d_sae": d_sae,
+        "dtype": "float32",
+        "device": device if device is not None else "cpu",
+        "activation_fn": "relu",
+        "normalize_activations": "none",
+        "model_name": model_name,
+        "hook_name": f"blocks.{layer}.ln2.hook_normalized",
+        "hook_name_out": f"blocks.{layer}.hook_mlp_out",
+        "hook_head_index": None,
+        "hook_head_index_out": None,
+        "prepend_bos": True,
+        "dataset_path": "monology/pile-uncopyrighted",
+        "context_size": 1024,
+        **(cfg_overrides or {}),
+    }
+
+
+def gemma_2_transcoder_huggingface_loader(
+    repo_id: str,
+    folder_name: str,
+    device: str = "cpu",
+    force_download: bool = False,
+    cfg_overrides: dict[str, Any] | None = None,
+) -> tuple[dict[str, Any], dict[str, torch.Tensor], torch.Tensor | None]:
+    """Load Gemma-2 transcoders from HuggingFace"""
+    cfg_dict = get_gemma_2_transcoder_config_from_hf(
+        repo_id,
+        folder_name,
+        device,
+        force_download,
+        cfg_overrides,
+    )
+
+    # Download the npz file
+    revision = cfg_overrides.get("revision", None) if cfg_overrides else None
+    params_filename = f"{folder_name}/params.npz"
+
+    file_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=params_filename,
+        force_download=force_download,
+        revision=revision,
+    )
+
+    # Load weights from npz file
+    params = np.load(file_path)
+
+    # Convert to state dict with proper naming
+    state_dict = {}
+    for key in params.files:
+        tensor = torch.tensor(params[key], dtype=torch.float32, device=device)
+        # Handle various naming conventions
+        key_lower = key.lower()
+        if key_lower in ["w_enc", "wenc", "w_e"]:
+            state_dict["W_enc"] = tensor
+        elif key_lower in ["w_dec", "wdec", "w_d"]:
+            state_dict["W_dec"] = tensor
+        elif key_lower in ["b_enc", "benc", "b_e"]:
+            state_dict["b_enc"] = tensor
+        elif key_lower in ["b_dec", "bdec", "b_d"]:
+            state_dict["b_dec"] = tensor
+        if key_lower in ["threshold"]:
+            state_dict["threshold"] = tensor
+
+    return cfg_dict, state_dict, None
+
+
 NAMED_PRETRAINED_SAE_LOADERS: dict[str, PretrainedSaeHuggingfaceLoader] = {
     "sae_lens": sae_lens_huggingface_loader,
     "connor_rob_hook_z": connor_rob_hook_z_huggingface_loader,
@@ -1101,6 +1240,7 @@ NAMED_PRETRAINED_SAE_LOADERS: dict[str, PretrainedSaeHuggingfaceLoader] = {
     "dictionary_learning_1": dictionary_learning_sae_huggingface_loader_1,
     "deepseek_r1": deepseek_r1_sae_huggingface_loader,
     "sparsify": sparsify_huggingface_loader,
+    "gemma_2_transcoder": gemma_2_transcoder_huggingface_loader,
 }
 
 
@@ -1113,4 +1253,5 @@ NAMED_PRETRAINED_SAE_CONFIG_GETTERS: dict[str, PretrainedSaeConfigHuggingfaceLoa
     "dictionary_learning_1": get_dictionary_learning_config_1_from_hf,
     "deepseek_r1": get_deepseek_r1_config_from_hf,
     "sparsify": get_sparsify_config_from_hf,
+    "gemma_2_transcoder": get_gemma_2_transcoder_config_from_hf,
 }
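To see the parsing logic concretely, here is what the new config getter returns for one of the GemmaScope folder names; every value below follows directly from the code above, assuming only that sae-lens is importable:

    from sae_lens.loading.pretrained_sae_loaders import (
        get_gemma_2_transcoder_config_from_hf,
    )

    cfg = get_gemma_2_transcoder_config_from_hf(
        repo_id="google/gemma-scope-2b-pt-transcoders",
        folder_name="layer_5/width_16k/average_l0_87",
    )
    assert cfg["architecture"] == "jumprelu_transcoder"
    assert cfg["d_sae"] == 16384                               # from "width_16k"
    assert cfg["d_in"] == cfg["d_out"] == 2304                 # gemma-2-2b width
    assert cfg["hook_name"] == "blocks.5.ln2.hook_normalized"  # MLP input
    assert cfg["hook_name_out"] == "blocks.5.hook_mlp_out"     # MLP output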
sae_lens/pretrained_saes.yaml
CHANGED
@@ -13974,3 +13974,113 @@ gemma-3-1b-res-matryoshka-dc:
   - id: blocks.24.hook_resid_post
     path: blocks.24.hook_resid_post
     l0: 40.0
+
+gemma-scope-2b-pt-transcoders:
+  conversion_func: gemma_2_transcoder
+  model: gemma-2-2b
+  repo_id: google/gemma-scope-2b-pt-transcoders
+  saes:
+  - id: layer_0/width_16k/average_l0_76
+    neuronpedia: gemma-2-2b/0-gemmascope-transcoder-16k
+    l0: 76
+    path: layer_0/width_16k/average_l0_76
+  - id: layer_1/width_16k/average_l0_65
+    neuronpedia: gemma-2-2b/1-gemmascope-transcoder-16k
+    l0: 65
+    path: layer_1/width_16k/average_l0_65
+  - id: layer_2/width_16k/average_l0_49
+    neuronpedia: gemma-2-2b/2-gemmascope-transcoder-16k
+    l0: 49
+    path: layer_2/width_16k/average_l0_49
+  - id: layer_3/width_16k/average_l0_54
+    neuronpedia: gemma-2-2b/3-gemmascope-transcoder-16k
+    l0: 54
+    path: layer_3/width_16k/average_l0_54
+  - id: layer_4/width_16k/average_l0_88
+    neuronpedia: gemma-2-2b/4-gemmascope-transcoder-16k
+    l0: 88
+    path: layer_4/width_16k/average_l0_88
+  - id: layer_5/width_16k/average_l0_87
+    neuronpedia: gemma-2-2b/5-gemmascope-transcoder-16k
+    l0: 87
+    path: layer_5/width_16k/average_l0_87
+  - id: layer_6/width_16k/average_l0_95
+    neuronpedia: gemma-2-2b/6-gemmascope-transcoder-16k
+    l0: 95
+    path: layer_6/width_16k/average_l0_95
+  - id: layer_7/width_16k/average_l0_70
+    neuronpedia: gemma-2-2b/7-gemmascope-transcoder-16k
+    l0: 70
+    path: layer_7/width_16k/average_l0_70
+  - id: layer_8/width_16k/average_l0_52
+    neuronpedia: gemma-2-2b/8-gemmascope-transcoder-16k
+    l0: 52
+    path: layer_8/width_16k/average_l0_52
+  - id: layer_9/width_16k/average_l0_72
+    neuronpedia: gemma-2-2b/9-gemmascope-transcoder-16k
+    l0: 72
+    path: layer_9/width_16k/average_l0_72
+  - id: layer_10/width_16k/average_l0_88
+    neuronpedia: gemma-2-2b/10-gemmascope-transcoder-16k
+    l0: 88
+    path: layer_10/width_16k/average_l0_88
+  - id: layer_11/width_16k/average_l0_5
+    neuronpedia: gemma-2-2b/11-gemmascope-transcoder-16k
+    l0: 5
+    path: layer_11/width_16k/average_l0_5
+  - id: layer_12/width_16k/average_l0_6
+    neuronpedia: gemma-2-2b/12-gemmascope-transcoder-16k
+    l0: 6
+    path: layer_12/width_16k/average_l0_6
+  - id: layer_13/width_16k/average_l0_8
+    neuronpedia: gemma-2-2b/13-gemmascope-transcoder-16k
+    l0: 8
+    path: layer_13/width_16k/average_l0_8
+  - id: layer_14/width_16k/average_l0_8
+    neuronpedia: gemma-2-2b/14-gemmascope-transcoder-16k
+    l0: 8
+    path: layer_14/width_16k/average_l0_8
+  - id: layer_15/width_16k/average_l0_8
+    neuronpedia: gemma-2-2b/15-gemmascope-transcoder-16k
+    l0: 8
+    path: layer_15/width_16k/average_l0_8
+  - id: layer_16/width_16k/average_l0_10
+    neuronpedia: gemma-2-2b/16-gemmascope-transcoder-16k
+    l0: 10
+    path: layer_16/width_16k/average_l0_10
+  - id: layer_17/width_16k/average_l0_12
+    neuronpedia: gemma-2-2b/17-gemmascope-transcoder-16k
+    l0: 12
+    path: layer_17/width_16k/average_l0_12
+  - id: layer_18/width_16k/average_l0_13
+    neuronpedia: gemma-2-2b/18-gemmascope-transcoder-16k
+    l0: 13
+    path: layer_18/width_16k/average_l0_13
+  - id: layer_19/width_16k/average_l0_12
+    neuronpedia: gemma-2-2b/19-gemmascope-transcoder-16k
+    l0: 12
+    path: layer_19/width_16k/average_l0_12
+  - id: layer_20/width_16k/average_l0_11
+    neuronpedia: gemma-2-2b/20-gemmascope-transcoder-16k
+    l0: 11
+    path: layer_20/width_16k/average_l0_11
+  - id: layer_21/width_16k/average_l0_13
+    neuronpedia: gemma-2-2b/21-gemmascope-transcoder-16k
+    l0: 13
+    path: layer_21/width_16k/average_l0_13
+  - id: layer_22/width_16k/average_l0_15
+    neuronpedia: gemma-2-2b/22-gemmascope-transcoder-16k
+    l0: 15
+    path: layer_22/width_16k/average_l0_15
+  - id: layer_23/width_16k/average_l0_25
+    neuronpedia: gemma-2-2b/23-gemmascope-transcoder-16k
+    l0: 25
+    path: layer_23/width_16k/average_l0_25
+  - id: layer_24/width_16k/average_l0_37
+    neuronpedia: gemma-2-2b/24-gemmascope-transcoder-16k
+    l0: 37
+    path: layer_24/width_16k/average_l0_37
+  - id: layer_25/width_16k/average_l0_41
+    neuronpedia: gemma-2-2b/25-gemmascope-transcoder-16k
+    l0: 41
+    path: layer_25/width_16k/average_l0_41
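The new release is then discoverable through the pretrained SAE directory. A sketch, assuming the get_pretrained_saes_directory helper and its saes_map attribute from the existing sae_lens/loading/pretrained_saes_directory.py module (unchanged in this diff):

    from sae_lens.loading.pretrained_saes_directory import (
        get_pretrained_saes_directory,
    )

    directory = get_pretrained_saes_directory()
    release = directory["gemma-scope-2b-pt-transcoders"]
    print(release.repo_id)        # google/gemma-scope-2b-pt-transcoders
    print(len(release.saes_map))  # 26 transcoders, one per layer 0-25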
sae_lens/saes/__init__.py
CHANGED
@@ -27,6 +27,14 @@ from .topk_sae import (
     TopKTrainingSAE,
     TopKTrainingSAEConfig,
 )
+from .transcoder import (
+    JumpReLUTranscoder,
+    JumpReLUTranscoderConfig,
+    SkipTranscoder,
+    SkipTranscoderConfig,
+    Transcoder,
+    TranscoderConfig,
+)
 
 __all__ = [
     "SAE",
@@ -51,4 +59,10 @@ __all__ = [
     "TopKTrainingSAEConfig",
     "BatchTopKTrainingSAE",
     "BatchTopKTrainingSAEConfig",
+    "Transcoder",
+    "TranscoderConfig",
+    "SkipTranscoder",
+    "SkipTranscoderConfig",
+    "JumpReLUTranscoder",
+    "JumpReLUTranscoderConfig",
 ]
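As a result, the transcoder classes are importable from either the subpackage or the package root:

    # Both import paths are exported as of 6.5.0:
    from sae_lens.saes import Transcoder, TranscoderConfig
    from sae_lens import JumpReLUTranscoder, SkipTranscoder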
sae_lens/saes/sae.py
CHANGED
@@ -245,7 +245,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
 
         self.cfg = cfg
 
-        if cfg.metadata and cfg.metadata
+        if cfg.metadata and cfg.metadata:
             warnings.warn(
                 "\nThis SAE has non-empty model_from_pretrained_kwargs. "
                 "\nFor optimal performance, load the model like so:\n"
sae_lens/saes/transcoder.py
ADDED
@@ -0,0 +1,365 @@
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+from torch import nn
+
+from sae_lens.saes.sae import (
+    SAE,
+    SAEConfig,
+    SAEMetadata,
+)
+from sae_lens.util import filter_valid_dataclass_fields
+
+# pyright: reportIncompatibleVariableOverride=false
+
+
+@dataclass
+class TranscoderConfig(SAEConfig):
+    # Output dimension fields
+    d_out: int = 768
+    # hook_name_out: str = ""
+    # hook_layer_out: int = 0
+    # hook_head_index_out: int | None = None
+
+    @classmethod
+    def architecture(cls) -> str:
+        """Return the architecture name for this config."""
+        return "transcoder"
+
+    @classmethod
+    def from_dict(cls, config_dict: dict[str, Any]) -> "TranscoderConfig":
+        """Create a TranscoderConfig from a dictionary."""
+        # Filter to only include valid dataclass fields
+        filtered_config_dict = filter_valid_dataclass_fields(config_dict, cls)
+
+        # Create the config instance
+        res = cls(**filtered_config_dict)
+
+        # Handle metadata if present
+        if "metadata" in config_dict:
+            res.metadata = SAEMetadata(**config_dict["metadata"])
+
+        return res
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary, including parent fields."""
+        # Get the base dictionary from parent
+        res = super().to_dict()
+
+        # Add transcoder-specific fields
+        res.update({"d_out": self.d_out})
+
+        return res
+
+
+class Transcoder(SAE[TranscoderConfig]):
+    """
+    A transcoder maps activations from one hook point to another with
+    potentially different dimensions. It extends the standard SAE but with a
+    decoder that maps to a different output dimension.
+    """
+
+    cfg: TranscoderConfig
+    W_enc: nn.Parameter
+    b_enc: nn.Parameter
+    W_dec: nn.Parameter
+    b_dec: nn.Parameter
+
+    def __init__(self, cfg: TranscoderConfig):
+        super().__init__(cfg)
+        self.cfg = cfg
+
+    def initialize_weights(self):
+        """Initialize transcoder weights with proper dimensions."""
+        # Initialize b_dec with output dimension
+        self.b_dec = nn.Parameter(
+            torch.zeros(self.cfg.d_out, dtype=self.dtype, device=self.device)
+        )
+
+        # Initialize W_dec with shape [d_sae, d_out]
+        w_dec_data = torch.empty(
+            self.cfg.d_sae, self.cfg.d_out, dtype=self.dtype, device=self.device
+        )
+        nn.init.kaiming_uniform_(w_dec_data)
+        self.W_dec = nn.Parameter(w_dec_data)
+
+        # Initialize W_enc with shape [d_in, d_sae]
+        w_enc_data = torch.empty(
+            self.cfg.d_in, self.cfg.d_sae, dtype=self.dtype, device=self.device
+        )
+        nn.init.kaiming_uniform_(w_enc_data)
+        self.W_enc = nn.Parameter(w_enc_data)
+
+        # Initialize b_enc
+        self.b_enc = nn.Parameter(
+            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
+        )
+
+    def process_sae_in(self, sae_in: torch.Tensor) -> torch.Tensor:
+        """
+        Process input without applying decoder bias (which has wrong dimension
+        for transcoder).
+
+        Overrides the parent method to skip the bias subtraction since b_dec
+        has dimension d_out which doesn't match the input dimension d_in.
+        """
+        # Don't apply b_dec since it has different dimension
+        # Just handle dtype conversion and hooks
+        sae_in = sae_in.to(self.dtype)
+        sae_in = self.hook_sae_input(sae_in)
+        return self.run_time_activation_norm_fn_in(sae_in)
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Encode the input tensor into the feature space.
+        """
+        # Preprocess the SAE input (casting type, applying hooks, normalization)
+        sae_in = self.process_sae_in(x)
+        # Compute the pre-activation values
+        hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
+        # Apply the activation function (e.g., ReLU)
+        return self.hook_sae_acts_post(self.activation_fn(hidden_pre))
+
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
+        """Decode to output dimension."""
+        # W_dec has shape [d_sae, d_out], feature_acts has shape
+        # [batch, d_sae]
+        sae_out = feature_acts @ self.W_dec + self.b_dec
+        # Apply hooks
+        # Note: We don't apply run_time_activation_norm_fn_out since the
+        # output dimension is different from the input dimension
+        return self.hook_sae_recons(sae_out)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass for transcoder.
+
+        Args:
+            x: Input activations from the input hook point [batch, d_in]
+
+        Returns:
+            sae_out: Reconstructed activations for the output hook point
+                [batch, d_out]
+        """
+        feature_acts = self.encode(x)
+        return self.decode(feature_acts)
+
+    def forward_with_activations(
+        self,
+        x: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward pass returning both output and feature activations.
+
+        Args:
+            x: Input activations from the input hook point [batch, d_in]
+
+        Returns:
+            sae_out: Reconstructed activations for the output hook point
+                [batch, d_out]
+            feature_acts: Hidden activations [batch, d_sae]
+        """
+        feature_acts = self.encode(x)
+        sae_out = self.decode(feature_acts)
+        return sae_out, feature_acts
+
+    @property
+    def d_out(self) -> int:
+        """Output dimension of the transcoder."""
+        return self.cfg.d_out
+
+    @classmethod
+    def from_dict(cls, config_dict: dict[str, Any]) -> "Transcoder":
+        cfg = TranscoderConfig.from_dict(config_dict)
+        return cls(cfg)
+
+
+@dataclass
+class SkipTranscoderConfig(TranscoderConfig):
+    @classmethod
+    def architecture(cls) -> str:
+        """Return the architecture name for this config."""
+        return "skip_transcoder"
+
+    @classmethod
+    def from_dict(cls, config_dict: dict[str, Any]) -> "SkipTranscoderConfig":
+        """Create a SkipTranscoderConfig from a dictionary."""
+        # Filter to only include valid dataclass fields
+        filtered_config_dict = filter_valid_dataclass_fields(config_dict, cls)
+
+        # Create the config instance
+        res = cls(**filtered_config_dict)
+
+        # Handle metadata if present
+        if "metadata" in config_dict:
+            res.metadata = SAEMetadata(**config_dict["metadata"])
+
+        return res
+
+
+class SkipTranscoder(Transcoder):
+    """
+    A transcoder with a learnable skip connection.
+
+    Implements: f(x) = W_dec @ relu(W_enc @ x + b_enc) + W_skip @ x + b_dec
+    where W_skip is initialized to zeros.
+    """
+
+    cfg: SkipTranscoderConfig  # type: ignore[assignment]
+    W_skip: nn.Parameter
+
+    def __init__(self, cfg: SkipTranscoderConfig):
+        super().__init__(cfg)
+        self.cfg = cfg
+
+        # Initialize skip connection matrix
+        # Shape: [d_out, d_in] to map from input to output dimension
+        self.W_skip = nn.Parameter(torch.zeros(self.cfg.d_out, self.cfg.d_in))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass for skip transcoder.
+
+        Args:
+            x: Input activations from the input hook point [batch, d_in]
+
+        Returns:
+            sae_out: Reconstructed activations for the output hook point
+                [batch, d_out]
+        """
+        feature_acts = self.encode(x)
+        sae_out = self.decode(feature_acts)
+
+        # Add skip connection: W_skip @ x
+        # x has shape [batch, d_in], W_skip has shape [d_out, d_in]
+        skip_out = x @ self.W_skip.T.to(x.device)
+        return sae_out + skip_out
+
+    def forward_with_activations(
+        self,
+        x: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward pass returning both output and feature activations.
+
+        Args:
+            x: Input activations from the input hook point [batch, d_in]
+
+        Returns:
+            sae_out: Reconstructed activations for the output hook point
+                [batch, d_out]
+            feature_acts: Hidden activations [batch, d_sae]
+        """
+        feature_acts = self.encode(x)
+        sae_out = self.decode(feature_acts)
+
+        # Add skip connection: W_skip @ x
+        # x has shape [batch, d_in], W_skip has shape [d_out, d_in]
+        skip_out = x @ self.W_skip.T.to(x.device)
+        sae_out = sae_out + skip_out
+
+        return sae_out, feature_acts
+
+    @classmethod
+    def from_dict(cls, config_dict: dict[str, Any]) -> "SkipTranscoder":
+        cfg = SkipTranscoderConfig.from_dict(config_dict)
+        return cls(cfg)
+
+
+# JumpReLU Transcoder Classes
+@dataclass
+class JumpReLUTranscoderConfig(TranscoderConfig):
+    """Configuration for JumpReLU transcoder."""
+
+    @classmethod
+    def architecture(cls) -> str:
+        """Return the architecture name for this config."""
+        return "jumprelu_transcoder"
+
+    @classmethod
+    def from_dict(cls, config_dict: dict[str, Any]) -> "JumpReLUTranscoderConfig":
+        """Create a JumpReLUTranscoderConfig from a dictionary."""
+        # Filter to only include valid dataclass fields
+        filtered_config_dict = filter_valid_dataclass_fields(config_dict, cls)
+
+        # Create the config instance
+        res = cls(**filtered_config_dict)
+
+        # Handle metadata if present
+        if "metadata" in config_dict:
+            res.metadata = SAEMetadata(**config_dict["metadata"])
+
+        return res
+
+
+class JumpReLUTranscoder(Transcoder):
+    """
+    A transcoder with JumpReLU activation function.
+
+    JumpReLU applies a threshold to activations: if pre-activation <=
+    threshold, the unit is zeroed out; otherwise, it follows the base
+    activation function.
+    """
+
+    cfg: JumpReLUTranscoderConfig  # type: ignore[assignment]
+    threshold: nn.Parameter
+
+    def __init__(self, cfg: JumpReLUTranscoderConfig):
+        super().__init__(cfg)
+        self.cfg = cfg
+
+    def initialize_weights(self):
+        """Initialize transcoder weights including threshold parameter."""
+        super().initialize_weights()
+
+        # Initialize threshold parameter for JumpReLU
+        self.threshold = nn.Parameter(
+            torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
+        )
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Encode using JumpReLU activation.
+
+        Applies base activation function (ReLU) then masks based on threshold.
+        """
+        # Preprocess the SAE input
+        sae_in = self.process_sae_in(x)
+
+        # Compute pre-activation values
+        hidden_pre = self.hook_sae_acts_pre(sae_in @ self.W_enc + self.b_enc)
+
+        # Apply base activation function (ReLU)
+        feature_acts = self.activation_fn(hidden_pre)
+
+        # Apply JumpReLU threshold
+        # During training, use detached threshold to prevent gradient flow
+        threshold = self.threshold.detach() if self.training else self.threshold
+        jump_relu_mask = (hidden_pre > threshold).to(self.dtype)
+
+        # Apply mask and hook
+        return self.hook_sae_acts_post(feature_acts * jump_relu_mask)
+
+    def fold_W_dec_norm(self) -> None:
+        """
+        Fold the decoder weight norm into the threshold parameter.
+
+        This is important for JumpReLU as the threshold needs to be scaled
+        along with the decoder weights.
+        """
+        # Get the decoder weight norms before normalizing
+        with torch.no_grad():
+            W_dec_norms = self.W_dec.norm(dim=1)
+
+        # Fold the decoder norms as in the parent class
+        super().fold_W_dec_norm()
+
+        # Scale the threshold by the decoder weight norms
+        with torch.no_grad():
+            self.threshold.data = self.threshold.data * W_dec_norms
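To make the shape semantics concrete, a minimal sketch of the skip variant (assuming the inherited SAEConfig fields beyond d_in and d_sae have usable defaults):

    import torch

    from sae_lens.saes.transcoder import SkipTranscoder, SkipTranscoderConfig

    # The decoder targets d_out, which need not equal d_in; sizes are illustrative.
    cfg = SkipTranscoderConfig(d_in=2304, d_sae=16384, d_out=2304)
    tc = SkipTranscoder(cfg)

    x = torch.randn(8, 2304)
    out, feats = tc.forward_with_activations(x)
    assert out.shape == (8, 2304)     # reconstruction at the output hook point
    assert feats.shape == (8, 16384)  # sparse feature activations

    # W_skip starts at zeros, so the skip path initially contributes nothing.
    assert torch.equal(tc.W_skip, torch.zeros(2304, 2304))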
{sae_lens-6.3.1.dist-info → sae_lens-6.5.0.dist-info}/RECORD
CHANGED
@@ -1,26 +1,27 @@
-sae_lens/__init__.py,sha256=
+sae_lens/__init__.py,sha256=bVByXIUMDNWvGYcmDJ9cY0Me0iDR5c5TzEdB7yNkv6I,3588
 sae_lens/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sae_lens/analysis/hooked_sae_transformer.py,sha256=Eyg1Y2hVIHNuiiLOCTgzstOuW6iA-7hPHqaGR8y_vMs,13809
 sae_lens/analysis/neuronpedia_integration.py,sha256=MrENqc81Mc2SMbxGjbwHzpkGUCAFKSf0i4EdaUF2Oj4,18707
 sae_lens/cache_activations_runner.py,sha256=L5hhuU2-zPQr2S3L64GMKKLeMQfqXxwDl8NbuOtrybI,12567
-sae_lens/config.py,sha256=
+sae_lens/config.py,sha256=IrjbsKBbaZoFXYrsPJ5xBwIqi9uZJIIFXjV_uoErJaE,28176
 sae_lens/constants.py,sha256=CSjmiZ-bhjQeVLyRvWxAjBokCgkfM8mnvd7-vxLIWTY,639
 sae_lens/evals.py,sha256=kQyrzczKaVD9rHwfFa_DxL_gMXDxsoIVHmsFIPIU2bY,38696
 sae_lens/llm_sae_training_runner.py,sha256=exxNX_OEhdiUrlgmBP9bjX9DOf0HUcNQGO4unKeDjKM,13713
 sae_lens/load_model.py,sha256=dBB_9gO6kWyQ4sXHq7qB8T3YUlXm3PGwYcpR4UVW4QY,8633
 sae_lens/loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sae_lens/loading/pretrained_sae_loaders.py,sha256=
+sae_lens/loading/pretrained_sae_loaders.py,sha256=RfZhE8l2y5_ZW5-fW-iN-SJGJfHRaZo4d6AiooK1Xuc,40890
 sae_lens/loading/pretrained_saes_directory.py,sha256=4Vn-Jex6SveD7EbxcSOBv8cx1gkPfUMLU1QOP-ww1ZE,3752
 sae_lens/pretokenize_runner.py,sha256=w0f6SfZLAxbp5eAAKnet8RqUB_DKofZ9RGsoJwFnYbA,7058
-sae_lens/pretrained_saes.yaml,sha256=
+sae_lens/pretrained_saes.yaml,sha256=BncZpkf-NhVOWV1FhrDPr_TPxAINP4-eiPZBw-HO4N4,578197
 sae_lens/registry.py,sha256=nhy7BPSudSATqW4lo9H_k3Na7sfGHmAf9v-3wpnLL_o,1490
-sae_lens/saes/__init__.py,sha256=
+sae_lens/saes/__init__.py,sha256=jVwazK8Q6dW5J6_zFXPoNAuBvSxgziQ8eMOjGM3t-X8,1475
 sae_lens/saes/batchtopk_sae.py,sha256=CyaFG2hMyyDaEaXXrAMJC8wQDW1JoddTKF5mvxxBQKY,3395
 sae_lens/saes/gated_sae.py,sha256=qcmM9JwBA8aZR8z_IRHV1_gQX-q_63tKewWXRnhdXuo,8986
 sae_lens/saes/jumprelu_sae.py,sha256=3xkhBcCol2mEpIBLceymCpudocm2ypOjTeTXbpiXoA4,10794
-sae_lens/saes/sae.py,sha256=
+sae_lens/saes/sae.py,sha256=gdUZuLaOHQrPjbDj-nZI813B6-_mNAnV9i9z4qTnpHk,38255
 sae_lens/saes/standard_sae.py,sha256=9UqYyYtQuThYxXKNaDjYcyowpOx2-7cShG-TeUP6JCQ,5940
 sae_lens/saes/topk_sae.py,sha256=CXMBI6CFvI5829bOhoQ350VXR9d8uFHUDlULTIWHXoU,8686
+sae_lens/saes/transcoder.py,sha256=qbsvIeRy7M1nuHapDut0URNg9fR-UtfBvwW4_hgsAac,11963
 sae_lens/tokenization_and_batching.py,sha256=now7caLbU3p-iGokNwmqZDyIvxYoXgnG1uklhgiLZN4,4656
 sae_lens/training/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sae_lens/training/activation_scaler.py,sha256=seEE-2Qd2JMHxqgnsNWPt-DGtYGZxWPnOwCGuVNSOtI,1719
@@ -32,7 +33,7 @@ sae_lens/training/types.py,sha256=qSjmGzXf3MLalygG0psnVjmhX_mpLmL47MQtZfe7qxg,81
 sae_lens/training/upload_saes_to_huggingface.py,sha256=r_WzI1zLtGZ5TzAxuG3xa_8T09j3zXJrWd_vzPsPGkQ,4469
 sae_lens/tutorial/tsea.py,sha256=fd1am_XXsf2KMbByDapJo-2qlxduKaa62Z2qcQZ3QKU,18145
 sae_lens/util.py,sha256=mCwLAilGMVo8Scm7CIsCafU7GsfmBvCcjwmloI4Ly7Y,1718
-sae_lens-6.3.1.dist-info/LICENSE,sha256=
-sae_lens-6.3.1.dist-info/METADATA,sha256=
-sae_lens-6.3.1.dist-info/WHEEL,sha256=
-sae_lens-6.3.1.dist-info/RECORD,,
+sae_lens-6.5.0.dist-info/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
+sae_lens-6.5.0.dist-info/METADATA,sha256=kzfWl_gUihYKkxaGbDBk3dLPNGAbjJ4NgnlcJRhUf0M,5555
+sae_lens-6.5.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+sae_lens-6.5.0.dist-info/RECORD,,

{sae_lens-6.3.1.dist-info → sae_lens-6.5.0.dist-info}/LICENSE
File without changes

{sae_lens-6.3.1.dist-info → sae_lens-6.5.0.dist-info}/WHEEL
File without changes