sae-lens 6.20.1__tar.gz → 6.22.2__tar.gz

This diff shows the contents of two publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (41)
  1. {sae_lens-6.20.1 → sae_lens-6.22.2}/PKG-INFO +1 -1
  2. {sae_lens-6.20.1 → sae_lens-6.22.2}/pyproject.toml +1 -1
  3. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/__init__.py +1 -1
  4. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/analysis/hooked_sae_transformer.py +4 -13
  5. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/cache_activations_runner.py +2 -3
  6. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/pretrained_saes.yaml +26 -0
  7. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/gated_sae.py +4 -9
  8. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/jumprelu_sae.py +4 -9
  9. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/matryoshka_batchtopk_sae.py +2 -3
  10. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/sae.py +7 -18
  11. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/standard_sae.py +4 -9
  12. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/temporal_sae.py +5 -12
  13. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/topk_sae.py +7 -10
  14. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/training/activations_store.py +6 -7
  15. {sae_lens-6.20.1 → sae_lens-6.22.2}/LICENSE +0 -0
  16. {sae_lens-6.20.1 → sae_lens-6.22.2}/README.md +0 -0
  17. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/analysis/__init__.py +0 -0
  18. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/analysis/neuronpedia_integration.py +0 -0
  19. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/config.py +0 -0
  20. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/constants.py +0 -0
  21. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/evals.py +0 -0
  22. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/llm_sae_training_runner.py +0 -0
  23. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/load_model.py +0 -0
  24. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/loading/__init__.py +0 -0
  25. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/loading/pretrained_sae_loaders.py +0 -0
  26. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/loading/pretrained_saes_directory.py +0 -0
  27. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/pretokenize_runner.py +0 -0
  28. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/registry.py +0 -0
  29. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/__init__.py +0 -0
  30. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/batchtopk_sae.py +0 -0
  31. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/transcoder.py +0 -0
  32. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/tokenization_and_batching.py +0 -0
  33. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/training/__init__.py +0 -0
  34. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/training/activation_scaler.py +0 -0
  35. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/training/mixing_buffer.py +0 -0
  36. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/training/optim.py +0 -0
  37. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/training/sae_trainer.py +0 -0
  38. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/training/types.py +0 -0
  39. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/training/upload_saes_to_huggingface.py +0 -0
  40. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/tutorial/tsea.py +0 -0
  41. {sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/util.py +0 -0

{sae_lens-6.20.1 → sae_lens-6.22.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sae-lens
-Version: 6.20.1
+Version: 6.22.2
 Summary: Training and Analyzing Sparse Autoencoders (SAEs)
 License: MIT
 License-File: LICENSE

{sae_lens-6.20.1 → sae_lens-6.22.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sae-lens"
-version = "6.20.1"
+version = "6.22.2"
 description = "Training and Analyzing Sparse Autoencoders (SAEs)"
 authors = ["Joseph Bloom"]
 readme = "README.md"

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/__init__.py
@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.20.1"
+__version__ = "6.22.2"

 import logging


{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/analysis/hooked_sae_transformer.py
@@ -3,7 +3,6 @@ from contextlib import contextmanager
 from typing import Any, Callable

 import torch
-from jaxtyping import Float
 from transformer_lens.ActivationCache import ActivationCache
 from transformer_lens.components.mlps.can_be_used_as_mlp import CanBeUsedAsMLP
 from transformer_lens.hook_points import HookPoint  # Hooking utilities
@@ -11,8 +10,8 @@ from transformer_lens.HookedTransformer import HookedTransformer

 from sae_lens.saes.sae import SAE

-SingleLoss = Float[torch.Tensor, ""]  # Type alias for a single element tensor
-LossPerToken = Float[torch.Tensor, "batch pos-1"]
+SingleLoss = torch.Tensor  # Type alias for a single element tensor
+LossPerToken = torch.Tensor
 Loss = SingleLoss | LossPerToken


@@ -171,12 +170,7 @@ class HookedSAETransformer(HookedTransformer):
         reset_saes_end: bool = True,
         use_error_term: bool | None = None,
         **model_kwargs: Any,
-    ) -> (
-        None
-        | Float[torch.Tensor, "batch pos d_vocab"]
-        | Loss
-        | tuple[Float[torch.Tensor, "batch pos d_vocab"], Loss]
-    ):
+    ) -> None | torch.Tensor | Loss | tuple[torch.Tensor, Loss]:
         """Wrapper around HookedTransformer forward pass.

         Runs the model with the given SAEs attached for one forward pass, then removes them. By default, will reset all SAEs to original state after.
@@ -203,10 +197,7 @@ class HookedSAETransformer(HookedTransformer):
         remove_batch_dim: bool = False,
         **kwargs: Any,
     ) -> tuple[
-        None
-        | Float[torch.Tensor, "batch pos d_vocab"]
-        | Loss
-        | tuple[Float[torch.Tensor, "batch pos d_vocab"], Loss],
+        None | torch.Tensor | Loss | tuple[torch.Tensor, Loss],
         ActivationCache | dict[str, torch.Tensor],
     ]:
         """Wrapper around 'run_with_cache' in HookedTransformer.

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/cache_activations_runner.py
@@ -9,7 +9,6 @@ import torch
 from datasets import Array2D, Dataset, Features, Sequence, Value
 from datasets.fingerprint import generate_fingerprint
 from huggingface_hub import HfApi
-from jaxtyping import Float, Int
 from tqdm.auto import tqdm
 from transformer_lens.HookedTransformer import HookedRootModule

@@ -318,8 +317,8 @@ class CacheActivationsRunner:
     def _create_shard(
         self,
         buffer: tuple[
-            Float[torch.Tensor, "(bs context_size) d_in"],
-            Int[torch.Tensor, "(bs context_size)"] | None,
+            torch.Tensor,  # shape: (bs context_size) d_in
+            torch.Tensor | None,  # shape: (bs context_size) or None
         ],
     ) -> Dataset:
         hook_names = [self.cfg.hook_name]
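
Most of this release is the same mechanical change repeated across modules: `jaxtyping` shape annotations (`Float[...]`, `Int[...]`) are replaced with plain `torch.Tensor` hints, with the expected shape kept as a comment where it is still useful, as in the `_create_shard` buffer above. A minimal sketch of that convention, using a hypothetical helper rather than code from the package:

    import torch

    # Hypothetical helper illustrating the annotation style after this change:
    # plain torch.Tensor hints, with expected shapes documented in comments.
    def summarize_buffer(
        activations: torch.Tensor,       # shape: (bs context_size) d_in
        token_ids: torch.Tensor | None,  # shape: (bs context_size) or None
    ) -> tuple[int, int]:
        """Return (number of rows, d_in) for an activation buffer."""
        rows, d_in = activations.shape
        return rows, d_in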

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/pretrained_saes.yaml
@@ -14916,6 +14916,30 @@ qwen2.5-7b-instruct-andyrdt:
     path: resid_post_layer_27/trainer_1
     neuronpedia: qwen2.5-7b-it/27-resid-post-aa

+gpt-oss-20b-andyrdt:
+  conversion_func: dictionary_learning_1
+  model: openai/gpt-oss-20b
+  repo_id: andyrdt/saes-gpt-oss-20b
+  saes:
+  - id: resid_post_layer_3_trainer_0
+    path: resid_post_layer_3/trainer_0
+    neuronpedia: gpt-oss-20b/3-resid-post-aa
+  - id: resid_post_layer_7_trainer_0
+    path: resid_post_layer_7/trainer_0
+    neuronpedia: gpt-oss-20b/7-resid-post-aa
+  - id: resid_post_layer_11_trainer_0
+    path: resid_post_layer_11/trainer_0
+    neuronpedia: gpt-oss-20b/11-resid-post-aa
+  - id: resid_post_layer_15_trainer_0
+    path: resid_post_layer_15/trainer_0
+    neuronpedia: gpt-oss-20b/15-resid-post-aa
+  - id: resid_post_layer_19_trainer_0
+    path: resid_post_layer_19/trainer_0
+    neuronpedia: gpt-oss-20b/19-resid-post-aa
+  - id: resid_post_layer_23_trainer_0
+    path: resid_post_layer_23/trainer_0
+    neuronpedia: gpt-oss-20b/23-resid-post-aa
+
 goodfire-llama-3.3-70b-instruct:
   conversion_func: goodfire
   model: meta-llama/Llama-3.3-70B-Instruct
@@ -14924,6 +14948,7 @@ goodfire-llama-3.3-70b-instruct:
   - id: layer_50
     path: Llama-3.3-70B-Instruct-SAE-l50.pt
     l0: 121
+    neuronpedia: llama3.3-70b-it/50-resid-post-gf

 goodfire-llama-3.1-8b-instruct:
   conversion_func: goodfire
@@ -14933,3 +14958,4 @@ goodfire-llama-3.1-8b-instruct:
   - id: layer_19
     path: Llama-3.1-8B-Instruct-SAE-l19.pth
     l0: 91
+    neuronpedia: llama3.1-8b-it/19-resid-post-gf
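
The new `gpt-oss-20b-andyrdt` block registers six residual-stream SAEs for `openai/gpt-oss-20b` hosted at `andyrdt/saes-gpt-oss-20b`, and the two Goodfire Llama releases gain Neuronpedia links. A minimal loading sketch follows, using the release name and one `sae_id` exactly as added above; it assumes the standard `SAE.from_pretrained` loader and that, as in other 6.x releases, it returns the SAE instance:

    from sae_lens import SAE

    # Sketch: load one of the newly registered gpt-oss-20b SAEs.
    # The release name and sae_id come from the YAML entries added above;
    # the return value of SAE.from_pretrained is assumed to be the SAE itself.
    sae = SAE.from_pretrained(
        release="gpt-oss-20b-andyrdt",
        sae_id="resid_post_layer_3_trainer_0",
        device="cpu",
    )
    print(sae.cfg.d_in, sae.cfg.d_sae)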

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/gated_sae.py
@@ -2,7 +2,6 @@ from dataclasses import dataclass
 from typing import Any

 import torch
-from jaxtyping import Float
 from numpy.typing import NDArray
 from torch import nn
 from typing_extensions import override
@@ -49,9 +48,7 @@ class GatedSAE(SAE[GatedSAEConfig]):
         super().initialize_weights()
         _init_weights_gated(self)

-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """
         Encode the input tensor into the feature space using a gated encoder.
         This must match the original encode_gated implementation from SAE class.
@@ -72,9 +69,7 @@ class GatedSAE(SAE[GatedSAEConfig]):
         # Combine gating and magnitudes
         return self.hook_sae_acts_post(active_features * feature_magnitudes)

-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """
         Decode the feature activations back into the input space:
         1) Apply optional finetuning scaling.
@@ -147,8 +142,8 @@ class GatedTrainingSAE(TrainingSAE[GatedTrainingSAEConfig]):
         _init_weights_gated(self)

     def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Gated forward pass with pre-activation (for training).
         """

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/jumprelu_sae.py
@@ -3,7 +3,6 @@ from typing import Any, Literal

 import numpy as np
 import torch
-from jaxtyping import Float
 from torch import nn
 from typing_extensions import override

@@ -130,9 +129,7 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
             torch.zeros(self.cfg.d_sae, dtype=self.dtype, device=self.device)
         )

-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """
         Encode the input tensor into the feature space using JumpReLU.
         The threshold parameter determines which units remain active.
@@ -150,9 +147,7 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
         # 3) Multiply the normally activated units by that mask.
         return self.hook_sae_acts_post(base_acts * jump_relu_mask)

-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """
         Decode the feature activations back to the input space.
         Follows the same steps as StandardSAE: apply scaling, transform, hook, and optionally reshape.
@@ -265,8 +260,8 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         return torch.exp(self.log_threshold)

     def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         sae_in = self.process_sae_in(x)

         hidden_pre = sae_in @ self.W_enc + self.b_enc

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/matryoshka_batchtopk_sae.py
@@ -2,7 +2,6 @@ import warnings
 from dataclasses import dataclass, field

 import torch
-from jaxtyping import Float
 from typing_extensions import override

 from sae_lens.saes.batchtopk_sae import (
@@ -95,10 +94,10 @@ class MatryoshkaBatchTopKTrainingSAE(BatchTopKTrainingSAE):

     def _decode_matryoshka_level(
         self,
-        feature_acts: Float[torch.Tensor, "... d_sae"],
+        feature_acts: torch.Tensor,
         width: int,
         inv_W_dec_norm: torch.Tensor,
-    ) -> Float[torch.Tensor, "... d_in"]:
+    ) -> torch.Tensor:
         """
         Decodes feature activations back into input space for a matryoshka level
         """

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/sae.py
@@ -19,7 +19,6 @@ from typing import (

 import einops
 import torch
-from jaxtyping import Float
 from numpy.typing import NDArray
 from safetensors.torch import load_file, save_file
 from torch import nn
@@ -351,16 +350,12 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
         self.W_enc = nn.Parameter(w_enc_data)

     @abstractmethod
-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """Encode input tensor to feature space."""
         pass

     @abstractmethod
-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """Decode feature activations back to input space."""
         pass

@@ -450,9 +445,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

         return super().to(*args, **kwargs)

-    def process_sae_in(
-        self, sae_in: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def process_sae_in(self, sae_in: torch.Tensor) -> torch.Tensor:
         sae_in = sae_in.to(self.dtype)
         sae_in = self.reshape_fn_in(sae_in)

@@ -859,14 +852,12 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):

     @abstractmethod
     def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """Encode with access to pre-activation values for training."""
         ...

-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """
         For inference, just encode without returning hidden_pre.
         (training_forward_pass calls encode_with_hidden_pre).
@@ -874,9 +865,7 @@ class TrainingSAE(SAE[T_TRAINING_SAE_CONFIG], ABC):
         feature_acts, _ = self.encode_with_hidden_pre(x)
         return feature_acts

-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """
         Decodes feature activations back into input space,
         applying optional finetuning scale, hooking, out normalization, etc.
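
With the `jaxtyping` wrappers gone, the abstract `encode`/`decode` pair on `SAE` now takes and returns plain `torch.Tensor`, so the `... d_in` / `... d_sae` shapes live only in docstrings and comments. Below is a toy module mirroring just those two signatures; it is not a real `SAE` subclass (which would also need a config class and the hook machinery), only an illustration of the new hints:

    import torch
    from torch import nn

    class ToyAutoencoder(nn.Module):
        """Toy encode/decode pair with the new plain-torch.Tensor signatures."""

        def __init__(self, d_in: int, d_sae: int):
            super().__init__()
            self.W_enc = nn.Parameter(torch.randn(d_in, d_sae) * 0.01)
            self.b_enc = nn.Parameter(torch.zeros(d_sae))
            self.W_dec = nn.Parameter(torch.randn(d_sae, d_in) * 0.01)
            self.b_dec = nn.Parameter(torch.zeros(d_in))

        def encode(self, x: torch.Tensor) -> torch.Tensor:  # x: ... d_in
            return torch.relu(x @ self.W_enc + self.b_enc)  # -> ... d_sae

        def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:  # ... d_sae
            return feature_acts @ self.W_dec + self.b_dec   # -> ... d_in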

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/standard_sae.py
@@ -2,7 +2,6 @@ from dataclasses import dataclass

 import numpy as np
 import torch
-from jaxtyping import Float
 from numpy.typing import NDArray
 from torch import nn
 from typing_extensions import override
@@ -54,9 +53,7 @@ class StandardSAE(SAE[StandardSAEConfig]):
         super().initialize_weights()
         _init_weights_standard(self)

-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """
         Encode the input tensor into the feature space.
         """
@@ -67,9 +64,7 @@ class StandardSAE(SAE[StandardSAEConfig]):
         # Apply the activation function (e.g., ReLU, depending on config)
         return self.hook_sae_acts_post(self.activation_fn(hidden_pre))

-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """
         Decode the feature activations back to the input space.
         Now, if hook_z reshaping is turned on, we reverse the flattening.
@@ -127,8 +122,8 @@ class StandardTrainingSAE(TrainingSAE[StandardTrainingSAEConfig]):
         }

     def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # Process the input (including dtype conversion, hook call, and any activation normalization)
         sae_in = self.process_sae_in(x)
         # Compute the pre-activation (and allow for a hook if desired)

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/temporal_sae.py
@@ -13,7 +13,6 @@ from typing import Literal

 import torch
 import torch.nn.functional as F
-from jaxtyping import Float
 from torch import nn
 from typing_extensions import override

@@ -250,8 +249,8 @@ class TemporalSAE(SAE[TemporalSAEConfig]):
         )

     def encode_with_predictions(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """Encode input to novel codes only.

         Returns only the sparse novel codes (not predicted codes).
@@ -312,14 +311,10 @@ class TemporalSAE(SAE[TemporalSAEConfig]):
         # Return only novel codes (these are the interpretable features)
         return z_novel, z_pred

-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         return self.encode_with_predictions(x)[0]

-    def decode(
-        self, feature_acts: Float[torch.Tensor, "... d_sae"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def decode(self, feature_acts: torch.Tensor) -> torch.Tensor:
         """Decode novel codes to reconstruction.

         Note: This only decodes the novel codes. For full reconstruction,
@@ -342,9 +337,7 @@ class TemporalSAE(SAE[TemporalSAEConfig]):
         return sae_out

     @override
-    def forward(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_in"]:
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Full forward pass through TemporalSAE.

         Returns complete reconstruction (predicted + novel).

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/saes/topk_sae.py
@@ -4,7 +4,6 @@ from dataclasses import dataclass
 from typing import Any, Callable

 import torch
-from jaxtyping import Float
 from torch import nn
 from transformer_lens.hook_points import HookPoint
 from typing_extensions import override
@@ -235,9 +234,7 @@ class TopKSAE(SAE[TopKSAEConfig]):
         super().initialize_weights()
         _init_weights_topk(self)

-    def encode(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> Float[torch.Tensor, "... d_sae"]:
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
         """
         Converts input x into feature activations.
         Uses topk activation under the hood.
@@ -251,8 +248,8 @@ class TopKSAE(SAE[TopKSAEConfig]):

     def decode(
         self,
-        feature_acts: Float[torch.Tensor, "... d_sae"],
-    ) -> Float[torch.Tensor, "... d_in"]:
+        feature_acts: torch.Tensor,
+    ) -> torch.Tensor:
         """
         Reconstructs the input from topk feature activations.
         Applies optional finetuning scaling, hooking to recons, out normalization,
@@ -354,8 +351,8 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
         _init_weights_topk(self)

     def encode_with_hidden_pre(
-        self, x: Float[torch.Tensor, "... d_in"]
-    ) -> tuple[Float[torch.Tensor, "... d_sae"], Float[torch.Tensor, "... d_sae"]]:
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Similar to the base training method: calculate pre-activations, then apply TopK.
         """
@@ -372,8 +369,8 @@ class TopKTrainingSAE(TrainingSAE[TopKTrainingSAEConfig]):
     @override
     def decode(
         self,
-        feature_acts: Float[torch.Tensor, "... d_sae"],
-    ) -> Float[torch.Tensor, "... d_in"]:
+        feature_acts: torch.Tensor,
+    ) -> torch.Tensor:
         """
         Decodes feature activations back into input space,
         applying optional finetuning scale, hooking, out normalization, etc.

{sae_lens-6.20.1 → sae_lens-6.22.2}/sae_lens/training/activations_store.py
@@ -12,7 +12,6 @@ import torch
 from datasets import Dataset, DatasetDict, IterableDataset, load_dataset
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import HfHubHTTPError
-from jaxtyping import Float, Int
 from requests import HTTPError
 from safetensors.torch import load_file, save_file
 from tqdm.auto import tqdm
@@ -167,9 +166,11 @@ class ActivationsStore:
         disable_concat_sequences: bool = False,
         sequence_separator_token: int | Literal["bos", "eos", "sep"] | None = "bos",
     ) -> ActivationsStore:
+        if context_size is None:
+            context_size = sae.cfg.metadata.context_size
         if sae.cfg.metadata.hook_name is None:
             raise ValueError("hook_name is required")
-        if sae.cfg.metadata.context_size is None:
+        if context_size is None:
             raise ValueError("context_size is required")
         if sae.cfg.metadata.prepend_bos is None:
             raise ValueError("prepend_bos is required")
@@ -179,9 +180,7 @@ class ActivationsStore:
             d_in=sae.cfg.d_in,
             hook_name=sae.cfg.metadata.hook_name,
             hook_head_index=sae.cfg.metadata.hook_head_index,
-            context_size=sae.cfg.metadata.context_size
-            if context_size is None
-            else context_size,
+            context_size=context_size,
             prepend_bos=sae.cfg.metadata.prepend_bos,
             streaming=streaming,
             store_batch_size_prompts=store_batch_size_prompts,
@@ -542,8 +541,8 @@ class ActivationsStore:
         d_in: int,
         raise_on_epoch_end: bool,
     ) -> tuple[
-        Float[torch.Tensor, "(total_size context_size) num_layers d_in"],
-        Int[torch.Tensor, "(total_size context_size)"] | None,
+        torch.Tensor,
+        torch.Tensor | None,
     ]:
         """
         Loads `total_size` activations from `cached_activation_dataset`
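
Besides the annotation cleanup, the `ActivationsStore` change reorders how `context_size` is resolved: an explicitly passed value now takes precedence, the SAE's `cfg.metadata.context_size` is only a fallback, and the "context_size is required" check runs on the resolved value rather than on the metadata field. A standalone sketch of that resolve-then-validate flow (hypothetical helper, not the package's API):

    # Hypothetical helper mirroring the resolve-then-validate flow shown above.
    def resolve_context_size(explicit: int | None, metadata_value: int | None) -> int:
        context_size = explicit if explicit is not None else metadata_value
        if context_size is None:
            raise ValueError("context_size is required")
        return context_size

    assert resolve_context_size(128, 1024) == 128    # explicit argument wins
    assert resolve_context_size(None, 1024) == 1024  # falls back to SAE metadata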