sae-lens 6.0.0rc2__py3-none-any.whl → 6.0.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,5 @@
  from __future__ import annotations

- import contextlib
  import json
  import os
  import warnings
@@ -16,7 +15,6 @@ from huggingface_hub.utils import HfHubHTTPError
  from jaxtyping import Float, Int
  from requests import HTTPError
  from safetensors.torch import save_file
- from torch.utils.data import DataLoader
  from tqdm import tqdm
  from transformer_lens.hook_points import HookedRootModule
  from transformers import AutoTokenizer, PreTrainedTokenizerBase
@@ -30,6 +28,8 @@ from sae_lens.config import (
  from sae_lens.constants import DTYPE_MAP
  from sae_lens.saes.sae import SAE, T_SAE_CONFIG, T_TRAINING_SAE_CONFIG
  from sae_lens.tokenization_and_batching import concat_and_batch_sequences
+ from sae_lens.training.mixing_buffer import mixing_buffer
+ from sae_lens.util import extract_stop_at_layer_from_tlens_hook_name


  # TODO: Make an activation store config class to be consistent with the rest of the code.
@@ -45,10 +45,8 @@ class ActivationsStore:
  cached_activation_dataset: Dataset | None = None
  tokens_column: Literal["tokens", "input_ids", "text", "problem"]
  hook_name: str
- hook_layer: int
  hook_head_index: int | None
  _dataloader: Iterator[Any] | None = None
- _storage_buffer: torch.Tensor | None = None
  exclude_special_tokens: torch.Tensor | None = None
  device: torch.device

@@ -65,7 +63,6 @@ class ActivationsStore:
  cached_activations_path=cfg.new_cached_activations_path,
  dtype=cfg.dtype,
  hook_name=cfg.hook_name,
- hook_layer=cfg.hook_layer,
  context_size=cfg.context_size,
  d_in=cfg.d_in,
  n_batches_in_buffer=cfg.n_batches_in_buffer,
@@ -126,7 +123,6 @@ class ActivationsStore:
  dataset=override_dataset or cfg.dataset_path,
  streaming=cfg.streaming,
  hook_name=cfg.hook_name,
- hook_layer=cfg.hook_layer,
  hook_head_index=cfg.hook_head_index,
  context_size=cfg.context_size,
  d_in=cfg.d_in
@@ -165,10 +161,6 @@ class ActivationsStore:
  ) -> ActivationsStore:
  if sae.cfg.metadata.hook_name is None:
  raise ValueError("hook_name is required")
- if sae.cfg.metadata.hook_layer is None:
- raise ValueError("hook_layer is required")
- if sae.cfg.metadata.hook_head_index is None:
- raise ValueError("hook_head_index is required")
  if sae.cfg.metadata.context_size is None:
  raise ValueError("context_size is required")
  if sae.cfg.metadata.prepend_bos is None:
@@ -178,7 +170,6 @@ class ActivationsStore:
  dataset=dataset,
  d_in=sae.cfg.d_in,
  hook_name=sae.cfg.metadata.hook_name,
- hook_layer=sae.cfg.metadata.hook_layer,
  hook_head_index=sae.cfg.metadata.hook_head_index,
  context_size=sae.cfg.metadata.context_size
  if context_size is None
@@ -202,7 +193,6 @@ class ActivationsStore:
  dataset: HfDataset | str,
  streaming: bool,
  hook_name: str,
- hook_layer: int,
  hook_head_index: int | None,
  context_size: int,
  d_in: int,
@@ -246,7 +236,6 @@ class ActivationsStore:
  )

  self.hook_name = hook_name
- self.hook_layer = hook_layer
  self.hook_head_index = hook_head_index
  self.context_size = context_size
  self.d_in = d_in
@@ -262,12 +251,11 @@ class ActivationsStore:
  self.cached_activations_path = cached_activations_path
  self.autocast_lm = autocast_lm
  self.seqpos_slice = seqpos_slice
+ self.training_context_size = len(range(context_size)[slice(*seqpos_slice)])
  self.exclude_special_tokens = exclude_special_tokens

  self.n_dataset_processed = 0

- self.estimated_norm_scaling_factor = None
-
  # Check if dataset is tokenized
  dataset_sample = next(iter(self.dataset))

@@ -432,30 +420,6 @@ class ActivationsStore:

  return activations_dataset

- def set_norm_scaling_factor_if_needed(self):
- if (
- self.normalize_activations == "expected_average_only_in"
- and self.estimated_norm_scaling_factor is None
- ):
- self.estimated_norm_scaling_factor = self.estimate_norm_scaling_factor()
-
- def apply_norm_scaling_factor(self, activations: torch.Tensor) -> torch.Tensor:
- if self.estimated_norm_scaling_factor is None:
- raise ValueError(
- "estimated_norm_scaling_factor is not set, call set_norm_scaling_factor_if_needed() first"
- )
- return activations * self.estimated_norm_scaling_factor
-
- def unscale(self, activations: torch.Tensor) -> torch.Tensor:
- if self.estimated_norm_scaling_factor is None:
- raise ValueError(
- "estimated_norm_scaling_factor is not set, call set_norm_scaling_factor_if_needed() first"
- )
- return activations / self.estimated_norm_scaling_factor
-
- def get_norm_scaling_factor(self, activations: torch.Tensor) -> torch.Tensor:
- return (self.d_in**0.5) / activations.norm(dim=-1).mean()
-
  @torch.no_grad()
  def estimate_norm_scaling_factor(self, n_batches_for_norm_estimate: int = int(1e3)):
  norms_per_batch = []
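This hunk drops the store's norm-scaling plumbing (`set_norm_scaling_factor_if_needed`, `apply_norm_scaling_factor`, `unscale`, `get_norm_scaling_factor`) while keeping `estimate_norm_scaling_factor`. A hedged sketch of how a caller could reproduce the old behaviour on its own, assuming `store` is an `ActivationsStore` and `activations` a batch it produced (both placeholder names, not part of the diff):

```python
# Hypothetical caller-side replacement for the removed helpers:
# estimate_norm_scaling_factor() still returns the scalar that the old
# apply_norm_scaling_factor()/unscale() multiplied and divided by.
scaling_factor = store.estimate_norm_scaling_factor(n_batches_for_norm_estimate=100)
scaled = activations * scaling_factor    # formerly apply_norm_scaling_factor(activations)
unscaled = scaled / scaling_factor       # formerly unscale(scaled)
```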
@@ -490,21 +454,6 @@ class ActivationsStore:
  """
  self.iterable_dataset = iter(self.dataset)

- @property
- def storage_buffer(self) -> torch.Tensor:
- if self._storage_buffer is None:
- self._storage_buffer = _filter_buffer_acts(
- self.get_buffer(self.half_buffer_size), self.exclude_special_tokens
- )
-
- return self._storage_buffer
-
- @property
- def dataloader(self) -> Iterator[Any]:
- if self._dataloader is None:
- self._dataloader = self.get_data_loader()
- return self._dataloader
-
  def get_batch_tokens(
  self, batch_size: int | None = None, raise_at_epoch_end: bool = False
  ):
@@ -537,22 +486,17 @@ class ActivationsStore:

  d_in may result from a concatenated head dimension.
  """
-
- # Setup autocast if using
- if self.autocast_lm:
- autocast_if_enabled = torch.autocast(
- device_type="cuda",
- dtype=torch.bfloat16,
- enabled=self.autocast_lm,
- )
- else:
- autocast_if_enabled = contextlib.nullcontext()
-
- with autocast_if_enabled:
+ with torch.autocast(
+ device_type="cuda",
+ dtype=torch.bfloat16,
+ enabled=self.autocast_lm,
+ ):
  layerwise_activations_cache = self.model.run_with_cache(
  batch_tokens,
  names_filter=[self.hook_name],
- stop_at_layer=self.hook_layer + 1,
+ stop_at_layer=extract_stop_at_layer_from_tlens_hook_name(
+ self.hook_name
+ ),
  prepend_bos=False,
  **self.model_kwargs,
  )[1]
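Two changes land here: the `contextlib.nullcontext()` branch is gone since `torch.autocast(..., enabled=self.autocast_lm)` already handles the disabled case, and the stop layer is now parsed from the hook name instead of read from a stored `hook_layer`. The real helper lives in `sae_lens.util` and is not shown in this diff; a minimal sketch of the idea, assuming TransformerLens hook names like `blocks.<n>.hook_resid_pre` and preserving the old `hook_layer + 1` behaviour, might look like:

```python
import re


def extract_stop_at_layer_from_tlens_hook_name(hook_name: str) -> int | None:
    # Hypothetical sketch only: pull the block index out of names such as
    # "blocks.5.hook_resid_pre" and stop one layer past it; fall back to None
    # (run the whole model) for hooks without a block index, e.g. "hook_embed".
    match = re.search(r"blocks\.(\d+)\.", hook_name)
    return int(match.group(1)) + 1 if match else None
```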
@@ -563,25 +507,25 @@ class ActivationsStore:

  n_batches, n_context = layerwise_activations.shape[:2]

- stacked_activations = torch.zeros((n_batches, n_context, 1, self.d_in))
+ stacked_activations = torch.zeros((n_batches, n_context, self.d_in))

  if self.hook_head_index is not None:
- stacked_activations[:, :, 0] = layerwise_activations[
+ stacked_activations[:, :] = layerwise_activations[
  :, :, self.hook_head_index
  ]
  elif layerwise_activations.ndim > 3: # if we have a head dimension
  try:
- stacked_activations[:, :, 0] = layerwise_activations.view(
+ stacked_activations[:, :] = layerwise_activations.view(
  n_batches, n_context, -1
  )
  except RuntimeError as e:
  logger.error(f"Error during view operation: {e}")
  logger.info("Attempting to use reshape instead...")
- stacked_activations[:, :, 0] = layerwise_activations.reshape(
+ stacked_activations[:, :] = layerwise_activations.reshape(
  n_batches, n_context, -1
  )
  else:
- stacked_activations[:, :, 0] = layerwise_activations
+ stacked_activations[:, :] = layerwise_activations

  return stacked_activations

@@ -589,7 +533,6 @@ class ActivationsStore:
  self,
  total_size: int,
  context_size: int,
- num_layers: int,
  d_in: int,
  raise_on_epoch_end: bool,
  ) -> tuple[
@@ -606,10 +549,9 @@ class ActivationsStore:
  """
  assert self.cached_activation_dataset is not None
  # In future, could be a list of multiple hook names
- hook_names = [self.hook_name]
- if not set(hook_names).issubset(self.cached_activation_dataset.column_names):
+ if self.hook_name not in self.cached_activation_dataset.column_names:
  raise ValueError(
- f"Missing columns in dataset. Expected {hook_names}, "
+ f"Missing columns in dataset. Expected {self.hook_name}, "
  f"got {self.cached_activation_dataset.column_names}."
  )

@@ -622,28 +564,17 @@ class ActivationsStore:
  ds_slice = self.cached_activation_dataset[
  self.current_row_idx : self.current_row_idx + total_size
  ]
- for hook_name in hook_names:
- # Load activations for each hook.
- # Usually faster to first slice dataset then pick column
- _hook_buffer = ds_slice[hook_name]
- if _hook_buffer.shape != (total_size, context_size, d_in):
- raise ValueError(
- f"_hook_buffer has shape {_hook_buffer.shape}, "
- f"but expected ({total_size}, {context_size}, {d_in})."
- )
- new_buffer.append(_hook_buffer)
-
- # Stack across num_layers dimension
- # list of num_layers; shape: (total_size, context_size, d_in) -> (total_size, context_size, num_layers, d_in)
- new_buffer = torch.stack(new_buffer, dim=2)
- if new_buffer.shape != (total_size, context_size, num_layers, d_in):
+ # Load activations for each hook.
+ # Usually faster to first slice dataset then pick column
+ new_buffer = ds_slice[self.hook_name]
+ if new_buffer.shape != (total_size, context_size, d_in):
  raise ValueError(
  f"new_buffer has shape {new_buffer.shape}, "
- f"but expected ({total_size}, {context_size}, {num_layers}, {d_in})."
+ f"but expected ({total_size}, {context_size}, {d_in})."
  )

  self.current_row_idx += total_size
- acts_buffer = new_buffer.reshape(total_size * context_size, num_layers, d_in)
+ acts_buffer = new_buffer.reshape(total_size * context_size, d_in)

  if "token_ids" not in self.cached_activation_dataset.column_names:
  return acts_buffer, None
@@ -658,7 +589,7 @@ class ActivationsStore:
  return acts_buffer, token_ids_buffer

  @torch.no_grad()
- def get_buffer(
+ def get_raw_buffer(
  self,
  n_batches_in_buffer: int,
  raise_on_epoch_end: bool = False,
@@ -672,26 +603,24 @@ class ActivationsStore:
  If raise_on_epoch_end is True, when the dataset it exhausted it will automatically refill the dataset and then raise a StopIteration so that the caller has a chance to react.
  """
  context_size = self.context_size
- training_context_size = len(range(context_size)[slice(*self.seqpos_slice)])
  batch_size = self.store_batch_size_prompts
  d_in = self.d_in
  total_size = batch_size * n_batches_in_buffer
- num_layers = 1

  if self.cached_activation_dataset is not None:
  return self._load_buffer_from_cached(
- total_size, context_size, num_layers, d_in, raise_on_epoch_end
+ total_size, context_size, d_in, raise_on_epoch_end
  )

  refill_iterator = range(0, total_size, batch_size)
  # Initialize empty tensor buffer of the maximum required size with an additional dimension for layers
  new_buffer_activations = torch.zeros(
- (total_size, training_context_size, num_layers, d_in),
+ (total_size, self.training_context_size, d_in),
  dtype=self.dtype, # type: ignore
  device=self.device,
  )
  new_buffer_token_ids = torch.zeros(
- (total_size, training_context_size),
+ (total_size, self.training_context_size),
  dtype=torch.long,
  device=self.device,
  )
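`training_context_size` (now precomputed in `__init__` as `len(range(context_size)[slice(*seqpos_slice)])`) replaces the inline computation that used to live here, and the buffer loses its singleton layer dimension. A quick illustration of both, with purely illustrative numbers rather than package defaults:

```python
context_size = 128
seqpos_slice = (32, None)        # illustrative: keep only positions 32..127
training_context_size = len(range(context_size)[slice(*seqpos_slice)])
assert training_context_size == 96

store_batch_size_prompts = 16    # illustrative store config
n_batches_in_buffer = 8
d_in = 768
total_size = store_batch_size_prompts * n_batches_in_buffer       # 128 prompts

buffer_shape = (total_size, training_context_size, d_in)           # was (..., 1, d_in) in rc2
flat_shape = (total_size * training_context_size, d_in)            # (12288, 768) after reshape
```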
@@ -716,106 +645,80 @@ class ActivationsStore:
  refill_batch_idx_start : refill_batch_idx_start + batch_size, ...
  ] = refill_batch_tokens

- new_buffer_activations = new_buffer_activations.reshape(-1, num_layers, d_in)
+ new_buffer_activations = new_buffer_activations.reshape(-1, d_in)
  new_buffer_token_ids = new_buffer_token_ids.reshape(-1)
  if shuffle:
  new_buffer_activations, new_buffer_token_ids = permute_together(
  [new_buffer_activations, new_buffer_token_ids]
  )

- # every buffer should be normalized:
- if self.normalize_activations == "expected_average_only_in":
- new_buffer_activations = self.apply_norm_scaling_factor(
- new_buffer_activations
- )
-
  return (
  new_buffer_activations,
  new_buffer_token_ids,
  )

- def get_data_loader(
+ def get_filtered_buffer(
  self,
- ) -> Iterator[Any]:
- """
- Return a torch.utils.dataloader which you can get batches from.
-
- Should automatically refill the buffer when it gets to n % full.
- (better mixing if you refill and shuffle regularly).
+ n_batches_in_buffer: int,
+ raise_on_epoch_end: bool = False,
+ shuffle: bool = True,
+ ) -> torch.Tensor:
+ return _filter_buffer_acts(
+ self.get_raw_buffer(
+ n_batches_in_buffer=n_batches_in_buffer,
+ raise_on_epoch_end=raise_on_epoch_end,
+ shuffle=shuffle,
+ ),
+ self.exclude_special_tokens,
+ )

+ def _iterate_filtered_activations(self) -> Generator[torch.Tensor, None, None]:
  """
-
- batch_size = self.train_batch_size_tokens
-
- try:
- new_samples = _filter_buffer_acts(
- self.get_buffer(self.half_buffer_size, raise_on_epoch_end=True),
- self.exclude_special_tokens,
- )
- except StopIteration:
- warnings.warn(
- "All samples in the training dataset have been exhausted, we are now beginning a new epoch with the same samples."
- )
- self._storage_buffer = (
- None # dump the current buffer so samples do not leak between epochs
- )
+ Iterate over the filtered tokens in the buffer.
+ """
+ while True:
  try:
- new_samples = _filter_buffer_acts(
- self.get_buffer(self.half_buffer_size),
- self.exclude_special_tokens,
+ yield self.get_filtered_buffer(
+ self.half_buffer_size, raise_on_epoch_end=True
  )
  except StopIteration:
- raise ValueError(
- "We were unable to fill up the buffer directly after starting a new epoch. This could indicate that there are less samples in the dataset than are required to fill up the buffer. Consider reducing batch_size or n_batches_in_buffer. "
+ warnings.warn(
+ "All samples in the training dataset have been exhausted, beginning new epoch."
  )
+ try:
+ yield self.get_filtered_buffer(self.half_buffer_size)
+ except StopIteration:
+ raise ValueError(
+ "Unable to fill buffer after starting new epoch. Dataset may be too small."
+ )

- # 1. # create new buffer by mixing stored and new buffer
- mixing_buffer = torch.cat(
- [new_samples, self.storage_buffer],
- dim=0,
- )
-
- mixing_buffer = mixing_buffer[torch.randperm(mixing_buffer.shape[0])]
-
- # 2. put 50 % in storage
- self._storage_buffer = mixing_buffer[: mixing_buffer.shape[0] // 2]
-
- # 3. put other 50 % in a dataloader
- return iter(
- DataLoader(
- # TODO: seems like a typing bug?
- cast(Any, mixing_buffer[mixing_buffer.shape[0] // 2 :]),
- batch_size=batch_size,
- shuffle=True,
- )
+ def get_data_loader(
+ self,
+ ) -> Iterator[Any]:
+ """
+ Return an auto-refilling stream of filtered and mixed activations.
+ """
+ return mixing_buffer(
+ buffer_size=self.n_batches_in_buffer * self.training_context_size,
+ batch_size=self.train_batch_size_tokens,
+ activations_loader=self._iterate_filtered_activations(),
  )

  def next_batch(self) -> torch.Tensor:
- """
- Get the next batch from the current DataLoader.
- If the DataLoader is exhausted, refill the buffer and create a new DataLoader.
- """
- try:
- # Try to get the next batch
- return next(self.dataloader)
- except StopIteration:
- # If the DataLoader is exhausted, create a new one
+ """Get next batch, updating buffer if needed."""
+ return self.__next__()
+
+ # ActivationsStore should be an iterator
+ def __next__(self) -> torch.Tensor:
+ if self._dataloader is None:
  self._dataloader = self.get_data_loader()
- return next(self.dataloader)
+ return next(self._dataloader)
+
+ def __iter__(self) -> Iterator[torch.Tensor]:
+ return self

  def state_dict(self) -> dict[str, torch.Tensor]:
- result = {
- "n_dataset_processed": torch.tensor(self.n_dataset_processed),
- }
- if self._storage_buffer is not None: # first time might be None
- result["storage_buffer_activations"] = self._storage_buffer[0]
- if self._storage_buffer[1] is not None:
- result["storage_buffer_tokens"] = self._storage_buffer[1]
- if self.estimated_norm_scaling_factor is not None:
- result["estimated_norm_scaling_factor"] = torch.tensor(
- self.estimated_norm_scaling_factor
- )
- return result
+ return {"n_dataset_processed": torch.tensor(self.n_dataset_processed)}

  def save(self, file_path: str):
  """save the state dict to a file in safetensors format"""
The hunk below adds a new module, the `mixing_buffer` generator imported above as `sae_lens.training.mixing_buffer`, which takes over the storage-buffer/DataLoader mixing logic removed from `get_data_loader`:

@@ -0,0 +1,56 @@
+ from collections.abc import Iterator
+
+ import torch
+
+
+ @torch.no_grad()
+ def mixing_buffer(
+ buffer_size: int,
+ batch_size: int,
+ activations_loader: Iterator[torch.Tensor],
+ ) -> Iterator[torch.Tensor]:
+ """
+ A generator that maintains a mix of old and new activations for better training.
+ It stores half of the activations and mixes them with new ones to create batches.
+
+ Args:
+ buffer_size: Total size of the buffer (will store buffer_size/2 activations)
+ batch_size: Size of batches to return
+ activations_loader: Iterator providing new activations
+
+ Yields:
+ Batches of activations of shape (batch_size, *activation_dims)
+ """
+
+ if buffer_size < batch_size:
+ raise ValueError("Buffer size must be greater than or equal to batch size")
+
+ storage_buffer: torch.Tensor | None = None
+
+ for new_activations in activations_loader:
+ storage_buffer = (
+ new_activations
+ if storage_buffer is None
+ else torch.cat([storage_buffer, new_activations], dim=0)
+ )
+
+ if storage_buffer.shape[0] >= buffer_size:
+ # Shuffle
+ storage_buffer = storage_buffer[torch.randperm(storage_buffer.shape[0])]
+
+ num_serving_batches = max(1, storage_buffer.shape[0] // (2 * batch_size))
+ serving_cutoff = num_serving_batches * batch_size
+ serving_buffer = storage_buffer[:serving_cutoff]
+ storage_buffer = storage_buffer[serving_cutoff:]
+
+ # Yield batches from the serving_buffer
+ for batch_idx in range(num_serving_batches):
+ yield serving_buffer[
+ batch_idx * batch_size : (batch_idx + 1) * batch_size
+ ]
+
+ # If there are any remaining activations, yield them
+ if storage_buffer is not None:
+ remaining_batches = storage_buffer.shape[0] // batch_size
+ for i in range(remaining_batches):
+ yield storage_buffer[i * batch_size : (i + 1) * batch_size]
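Since this file is new in rc4, a small standalone usage sketch may help; the numbers and the fake activation source are purely illustrative, but the call signature is exactly the one added in this diff:

```python
import torch

from sae_lens.training.mixing_buffer import mixing_buffer


def fake_activations(n_chunks: int = 8, chunk_size: int = 256, d_in: int = 64):
    # Stand-in for ActivationsStore._iterate_filtered_activations():
    # an iterator of (n_tokens, d_in) activation tensors.
    for _ in range(n_chunks):
        yield torch.randn(chunk_size, d_in)


batches = mixing_buffer(
    buffer_size=512,                   # retains ~256 activations between refills
    batch_size=32,
    activations_loader=fake_activations(),
)

for batch in batches:
    assert batch.shape == (32, 64)     # every yielded batch is (batch_size, d_in)
```

Each time the internal buffer reaches `buffer_size`, it is shuffled, roughly half is served out in `batch_size` chunks, and the other half is held back to be mixed with the next activations from the loader, which is the same 50/50 mixing the removed DataLoader-based `get_data_loader` performed in place.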