sae-lens 6.26.0__py3-none-any.whl → 6.28.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,230 @@
+ """
+ Plotting utilities for visualizing SAE training on synthetic data.
+
+ This module provides functions for:
+
+ - Plotting cosine similarities between SAE features and true features
+ - Automatically reordering features for better visualization
+ - Creating comparison plots between encoder and decoder
+ """
+
+ from collections.abc import Iterable
+ from pathlib import Path
+ from typing import Any
+
+ import plotly.graph_objects as go
+ import torch
+ from plotly.subplots import make_subplots
+
+ from sae_lens.saes import SAE
+ from sae_lens.synthetic.feature_dictionary import FeatureDictionary
+ from sae_lens.util import cosine_similarities
+
+
+ def find_best_feature_ordering(
+     sae_features: torch.Tensor,
+     true_features: torch.Tensor,
+ ) -> torch.Tensor:
+     """
+     Find the best ordering of SAE features to match true features.
+
+     Reorders SAE features so that each SAE latent aligns with its best-matching
+     true feature in order. This makes cosine similarity plots more interpretable.
+
+     Args:
+         sae_features: SAE decoder weights of shape [d_sae, hidden_dim]
+         true_features: True feature vectors of shape [num_features, hidden_dim]
+
+     Returns:
+         Tensor of indices that reorders sae_features for best alignment
+     """
+     cos_sims = cosine_similarities(sae_features, true_features)
+     best_matches = torch.argmax(torch.abs(cos_sims), dim=1)
+     return torch.argsort(best_matches)
+
+
+ def find_best_feature_ordering_from_sae(
+     sae: torch.nn.Module,
+     feature_dict: FeatureDictionary,
+ ) -> torch.Tensor:
+     """
+     Find the best feature ordering for an SAE given a feature dictionary.
+
+     Args:
+         sae: SAE with W_dec attribute of shape [d_sae, hidden_dim]
+         feature_dict: The feature dictionary containing true features
+
+     Returns:
+         Tensor of indices that reorders SAE latents for best alignment
+     """
+     sae_features = sae.W_dec.detach()  # type: ignore[attr-defined]
+     true_features = feature_dict.feature_vectors.detach()
+     return find_best_feature_ordering(sae_features, true_features)
+
+
+ def find_best_feature_ordering_across_saes(
+     saes: Iterable[torch.nn.Module],
+     feature_dict: FeatureDictionary,
+ ) -> torch.Tensor:
+     """
+     Find the best feature ordering that works across multiple SAEs.
+
+     Useful for creating consistent orderings across training snapshots.
+
+     Args:
+         saes: Iterable of SAEs to consider
+         feature_dict: The feature dictionary containing true features
+
+     Returns:
+         The best ordering tensor found across all SAEs
+     """
+     best_score = float("-inf")
+     best_ordering: torch.Tensor | None = None
+
+     true_features = feature_dict.feature_vectors.detach()
+
+     for sae in saes:
+         sae_features = sae.W_dec.detach()  # type: ignore[attr-defined]
+         cos_sims = cosine_similarities(sae_features, true_features)
+         cos_sims = torch.round(cos_sims * 100) / 100  # Reduce numerical noise
+
+         ordering = find_best_feature_ordering(sae_features, true_features)
+         score = cos_sims[ordering, torch.arange(cos_sims.shape[1])].mean().item()
+
+         if score > best_score:
+             best_score = score
+             best_ordering = ordering
+
+     if best_ordering is None:
+         raise ValueError("No SAEs provided")
+
+     return best_ordering
+
+
+ def plot_sae_feature_similarity(
+     sae: SAE[Any],
+     feature_dict: FeatureDictionary,
+     title: str | None = None,
+     reorder_features: bool | torch.Tensor = False,
+     decoder_only: bool = False,
+     show_values: bool = False,
+     height: int = 400,
+     width: int = 800,
+     save_path: str | Path | None = None,
+     show_plot: bool = True,
+     dtick: int | None = 1,
+     scale: float = 1.0,
+ ):
+     """
+     Plot cosine similarities between SAE features and true features.
+
+     Creates a heatmap showing how well each SAE latent aligns with each
+     true feature. Useful for understanding what the SAE has learned.
+
+     Args:
+         sae: The SAE to visualize. Must have W_enc and W_dec attributes.
+         feature_dict: The feature dictionary containing true features
+         title: Plot title. If None, a default title is used.
+         reorder_features: If True, automatically reorders features for best alignment.
+             If a tensor, uses that as the ordering.
+         decoder_only: If True, only plots the decoder (not encoder and decoder side-by-side)
+         show_values: If True, shows numeric values on the heatmap
+         height: Height of the figure in pixels
+         width: Width of the figure in pixels
+         save_path: If provided, saves the figure to this path
+         show_plot: If True, displays the plot
+         dtick: Tick spacing for axes
+         scale: Scale factor for image resolution when saving
+     """
+     # Get cosine similarities
+     true_features = feature_dict.feature_vectors.detach()
+     dec_cos_sims = cosine_similarities(sae.W_dec.detach(), true_features)  # type: ignore[attr-defined]
+     enc_cos_sims = cosine_similarities(sae.W_enc.T.detach(), true_features)  # type: ignore[attr-defined]
+
+     # Round to reduce numerical noise
+     dec_cos_sims = torch.round(dec_cos_sims * 100) / 100
+     enc_cos_sims = torch.round(enc_cos_sims * 100) / 100
+
+     # Apply feature reordering if requested
+     if reorder_features is not False:
+         if isinstance(reorder_features, bool):
+             sorted_indices = find_best_feature_ordering(
+                 sae.W_dec.detach(),
+                 true_features,  # type: ignore[attr-defined]
+             )
+         else:
+             sorted_indices = reorder_features
+         dec_cos_sims = dec_cos_sims[sorted_indices]
+         enc_cos_sims = enc_cos_sims[sorted_indices]
+
+     hovertemplate = "True feature: %{x}<br>SAE Latent: %{y}<br>Cosine Similarity: %{z:.3f}<extra></extra>"
+
+     if decoder_only:
+         fig = make_subplots(rows=1, cols=1)
+
+         decoder_args: dict[str, Any] = {
+             "z": dec_cos_sims.cpu().numpy(),
+             "zmin": -1,
+             "zmax": 1,
+             "colorscale": "RdBu",
+             "colorbar": dict(title="cos sim", x=1.0, dtick=1, tickvals=[-1, 0, 1]),
+             "hovertemplate": hovertemplate,
+         }
+         if show_values:
+             decoder_args["texttemplate"] = "%{z:.2f}"
+             decoder_args["textfont"] = {"size": 10}
+
+         fig.add_trace(go.Heatmap(**decoder_args), row=1, col=1)
+         fig.update_xaxes(title_text="True feature", row=1, col=1, dtick=dtick)
+         fig.update_yaxes(title_text="SAE Latent", row=1, col=1, dtick=dtick)
+     else:
+         fig = make_subplots(
+             rows=1, cols=2, subplot_titles=("SAE encoder", "SAE decoder")
+         )
+
+         # Encoder heatmap
+         encoder_args: dict[str, Any] = {
+             "z": enc_cos_sims.cpu().numpy(),
+             "zmin": -1,
+             "zmax": 1,
+             "colorscale": "RdBu",
+             "showscale": False,
+             "hovertemplate": hovertemplate,
+         }
+         if show_values:
+             encoder_args["texttemplate"] = "%{z:.2f}"
+             encoder_args["textfont"] = {"size": 10}
+
+         fig.add_trace(go.Heatmap(**encoder_args), row=1, col=1)
+
+         # Decoder heatmap
+         decoder_args = {
+             "z": dec_cos_sims.cpu().numpy(),
+             "zmin": -1,
+             "zmax": 1,
+             "colorscale": "RdBu",
+             "colorbar": dict(title="cos sim", x=1.0, dtick=1, tickvals=[-1, 0, 1]),
+             "hovertemplate": hovertemplate,
+         }
+         if show_values:
+             decoder_args["texttemplate"] = "%{z:.2f}"
+             decoder_args["textfont"] = {"size": 10}
+
+         fig.add_trace(go.Heatmap(**decoder_args), row=1, col=2)
+
+         fig.update_xaxes(title_text="True feature", row=1, col=1, dtick=dtick)
+         fig.update_xaxes(title_text="True feature", row=1, col=2, dtick=dtick)
+         fig.update_yaxes(title_text="SAE Latent", row=1, col=1, dtick=dtick)
+         fig.update_yaxes(title_text="SAE Latent", row=1, col=2, dtick=dtick)
+
+     # Set main title
+     if title is None:
+         title = "Cosine similarity with true features"
+     fig.update_layout(height=height, width=width, title_text=title)
+
+     if save_path:
+         Path(save_path).parent.mkdir(parents=True, exist_ok=True)
+         fig.write_image(save_path, scale=scale)
+
+     if show_plot:
+         fig.show()
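The new module gives a small, self-contained API for checking how well a trained SAE recovers a known synthetic feature basis. Below is a minimal usage sketch (not part of the diff); it assumes a trained `sae` and the `FeatureDictionary` used to generate its training data already exist, and the import path is a guess since file paths are not shown in this diff.

```python
# Sketch only: `sae` and `feature_dict` are assumed to be built elsewhere,
# and the module path below is hypothetical (the diff does not show file paths).
from sae_lens.synthetic.plotting import (
    find_best_feature_ordering_from_sae,
    plot_sae_feature_similarity,
)

# Compute one ordering from the decoder and reuse it, so the encoder/decoder
# panels (and plots of later training snapshots) stay aligned.
ordering = find_best_feature_ordering_from_sae(sae, feature_dict)

plot_sae_feature_similarity(
    sae,
    feature_dict,
    reorder_features=ordering,  # or True to recompute from the decoder
    show_values=True,
    save_path="plots/feature_similarity.png",
    show_plot=False,
)
```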
@@ -0,0 +1,145 @@
+ from collections.abc import Iterator
+ from pathlib import Path
+ from typing import Any, Callable
+
+ import torch
+
+ from sae_lens.config import LoggingConfig, SAETrainerConfig
+ from sae_lens.saes.sae import TrainingSAE
+ from sae_lens.synthetic.activation_generator import ActivationGenerator
+ from sae_lens.synthetic.feature_dictionary import FeatureDictionary
+ from sae_lens.training.sae_trainer import SAETrainer, SaveCheckpointFn
+
+
+ def train_toy_sae(
+     sae: TrainingSAE[Any],
+     feature_dict: FeatureDictionary,
+     activations_generator: ActivationGenerator,
+     training_samples: int = 10_000_000,
+     batch_size: int = 1024,
+     lr: float = 3e-4,
+     lr_warm_up_steps: int = 0,
+     lr_decay_steps: int = 0,
+     device: str | torch.device = "cpu",
+     n_snapshots: int = 0,
+     snapshot_fn: Callable[[SAETrainer[Any, Any]], None] | None = None,
+ ) -> None:
+     """
+     Train an SAE on synthetic activations from a feature dictionary.
+
+     This is a convenience function that sets up the training loop with
+     sensible defaults for small-scale synthetic data experiments.
+
+     Args:
+         sae: The TrainingSAE to train
+         feature_dict: The feature dictionary that maps feature activations to
+             hidden activations
+         activations_generator: Generator that produces feature activations
+         training_samples: Total number of training samples
+         batch_size: Batch size for training
+         lr: Learning rate
+         lr_warm_up_steps: Number of warmup steps for learning rate
+         lr_decay_steps: Number of steps over which to decay learning rate
+         device: Device to train on
+         n_snapshots: Number of snapshots to take during training. Snapshots are
+             evenly spaced throughout training.
+         snapshot_fn: Callback function called at each snapshot point. Receives
+             the SAETrainer instance, allowing access to the SAE, training step,
+             and other training state. Required if n_snapshots > 0.
+     """
+
+     device_str = str(device) if isinstance(device, torch.device) else device
+
+     # Create data iterator
+     data_iterator = SyntheticActivationIterator(
+         feature_dict=feature_dict,
+         activations_generator=activations_generator,
+         batch_size=batch_size,
+     )
+
+     # Create trainer config
+     trainer_cfg = SAETrainerConfig(
+         n_checkpoints=n_snapshots,
+         checkpoint_path=None,
+         save_final_checkpoint=False,
+         total_training_samples=training_samples,
+         device=device_str,
+         autocast=False,
+         lr=lr,
+         lr_end=lr,
+         lr_scheduler_name="constant",
+         lr_warm_up_steps=lr_warm_up_steps,
+         adam_beta1=0.9,
+         adam_beta2=0.999,
+         lr_decay_steps=lr_decay_steps,
+         n_restart_cycles=1,
+         train_batch_size_samples=batch_size,
+         dead_feature_window=1000,
+         feature_sampling_window=2000,
+         logger=LoggingConfig(
+             log_to_wandb=False,
+             # hacky way to disable evals, but works for now
+             eval_every_n_wandb_logs=2**31 - 1,
+         ),
+     )
+
+     def snapshot_wrapper(
+         snapshot_fn: Callable[[SAETrainer[Any, Any]], None] | None,
+     ) -> SaveCheckpointFn:
+         def save_checkpoint(checkpoint_path: Path | None) -> None:  # noqa: ARG001
+             if snapshot_fn is None:
+                 raise ValueError("snapshot_fn must be provided to take snapshots")
+             snapshot_fn(trainer)
+
+         return save_checkpoint
+
+     # Create trainer and train
+     feature_dict.eval()
+     trainer = SAETrainer(
+         cfg=trainer_cfg,
+         sae=sae,
+         data_provider=data_iterator,
+         save_checkpoint_fn=snapshot_wrapper(snapshot_fn),
+     )
+
+     trainer.fit()
+
+
+ class SyntheticActivationIterator(Iterator[torch.Tensor]):
+     """
+     An iterator that generates synthetic activations for SAE training.
+
+     This iterator wraps a FeatureDictionary and a function that generates
+     feature activations, producing hidden activations that can be used
+     to train an SAE.
+     """
+
+     def __init__(
+         self,
+         feature_dict: FeatureDictionary,
+         activations_generator: ActivationGenerator,
+         batch_size: int,
+     ):
+         """
+         Create a new SyntheticActivationIterator.
+
+         Args:
+             feature_dict: The feature dictionary to use for generating hidden activations
+             activations_generator: Generator that produces feature activations
+             batch_size: Number of samples per batch
+         """
+         self.feature_dict = feature_dict
+         self.activations_generator = activations_generator
+         self.batch_size = batch_size
+
+     @torch.no_grad()
+     def next_batch(self) -> torch.Tensor:
+         """Generate the next batch of hidden activations."""
+         features = self.activations_generator(self.batch_size)
+         return self.feature_dict(features)
+
+     def __iter__(self) -> "SyntheticActivationIterator":
+         return self
+
+     def __next__(self) -> torch.Tensor:
+         return self.next_batch()
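`train_toy_sae` is meant to be driven with a snapshot callback when intermediate training states are wanted. A minimal sketch follows, assuming a `TrainingSAE` (`sae`), a `FeatureDictionary` (`feature_dict`), and an `ActivationGenerator` (`activations_generator`) constructed elsewhere; the import path and the `trainer.sae` attribute are assumptions, since only the function signatures above come from the diff.

```python
# Sketch only: object construction, the import path, and `trainer.sae`
# are assumptions; train_toy_sae's own parameters come from the diff above.
from sae_lens.synthetic.training import train_toy_sae

decoder_snapshots = []

def take_snapshot(trainer):
    # Assumes the SAETrainer exposes the SAE being trained as `trainer.sae`.
    decoder_snapshots.append(trainer.sae.W_dec.detach().cpu().clone())

train_toy_sae(
    sae=sae,
    feature_dict=feature_dict,
    activations_generator=activations_generator,
    training_samples=1_000_000,
    batch_size=1024,
    lr=3e-4,
    n_snapshots=5,  # evenly spaced snapshots during training
    snapshot_fn=take_snapshot,
)
```

`SyntheticActivationIterator` can also be used on its own wherever an iterator of activation batches is expected, e.g. `next(SyntheticActivationIterator(feature_dict, activations_generator, batch_size=1024))`.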
@@ -85,8 +85,8 @@ def concat_and_batch_sequences(
      for sequence in tokens_iterator:
          if (
              begin_sequence_token_id is not None
-             and sequence[0] != begin_sequence_token_id
              and len(sequence) >= context_size - 1
+             and sequence[0] != begin_sequence_token_id
          ):
              begin_sequence_token_id_tensor = torch.tensor(
                  [begin_sequence_token_id],
@@ -3,7 +3,7 @@ from __future__ import annotations
  import json
  import os
  import warnings
- from collections.abc import Generator, Iterator, Sequence
+ from collections.abc import Generator, Iterator
  from pathlib import Path
  from typing import Any, Literal, cast

@@ -148,6 +148,7 @@ class ActivationsStore:
              exclude_special_tokens=exclude_special_tokens,
              disable_concat_sequences=cfg.disable_concat_sequences,
              sequence_separator_token=cfg.sequence_separator_token,
+             activations_mixing_fraction=cfg.activations_mixing_fraction,
          )

      @classmethod
@@ -222,6 +223,7 @@ class ActivationsStore:
          exclude_special_tokens: torch.Tensor | None = None,
          disable_concat_sequences: bool = False,
          sequence_separator_token: int | Literal["bos", "eos", "sep"] | None = "bos",
+         activations_mixing_fraction: float = 0.5,
      ):
          self.model = model
          if model_kwargs is None:
@@ -252,7 +254,6 @@ class ActivationsStore:
          self.context_size = context_size
          self.d_in = d_in
          self.n_batches_in_buffer = n_batches_in_buffer
-         self.half_buffer_size = n_batches_in_buffer // 2
          self.total_training_tokens = total_training_tokens
          self.store_batch_size_prompts = store_batch_size_prompts
          self.train_batch_size_tokens = train_batch_size_tokens
@@ -269,6 +270,7 @@ class ActivationsStore:
          self.sequence_separator_token: int | Literal["bos", "eos", "sep"] | None = (
              sequence_separator_token
          )
+         self.activations_mixing_fraction = activations_mixing_fraction

          self.n_dataset_processed = 0

@@ -535,18 +537,15 @@ class ActivationsStore:

          return stacked_activations

-     def _load_buffer_from_cached(
+     def _load_raw_llm_batch_from_cached(
          self,
-         total_size: int,
-         context_size: int,
-         d_in: int,
          raise_on_epoch_end: bool,
      ) -> tuple[
          torch.Tensor,
          torch.Tensor | None,
      ]:
          """
-         Loads `total_size` activations from `cached_activation_dataset`
+         Loads a batch of activations from `cached_activation_dataset`

          The dataset has columns for each hook_name,
          each containing activations of shape (context_size, d_in).
@@ -554,6 +553,10 @@ class ActivationsStore:
          raises StopIteration
          """
          assert self.cached_activation_dataset is not None
+         context_size = self.context_size
+         batch_size = self.store_batch_size_prompts
+         d_in = self.d_in
+
          # In future, could be a list of multiple hook names
          if self.hook_name not in self.cached_activation_dataset.column_names:
              raise ValueError(
@@ -561,138 +564,100 @@ class ActivationsStore:
              f"got {self.cached_activation_dataset.column_names}."
          )

-         if self.current_row_idx > len(self.cached_activation_dataset) - total_size:
+         if self.current_row_idx > len(self.cached_activation_dataset) - batch_size:
              self.current_row_idx = 0
              if raise_on_epoch_end:
                  raise StopIteration

-         new_buffer = []
          ds_slice = self.cached_activation_dataset[
-             self.current_row_idx : self.current_row_idx + total_size
+             self.current_row_idx : self.current_row_idx + batch_size
          ]
          # Load activations for each hook.
          # Usually faster to first slice dataset then pick column
-         new_buffer = ds_slice[self.hook_name]
-         if new_buffer.shape != (total_size, context_size, d_in):
+         acts_buffer = ds_slice[self.hook_name]
+         if acts_buffer.shape != (batch_size, context_size, d_in):
              raise ValueError(
-                 f"new_buffer has shape {new_buffer.shape}, "
-                 f"but expected ({total_size}, {context_size}, {d_in})."
+                 f"acts_buffer has shape {acts_buffer.shape}, "
+                 f"but expected ({batch_size}, {context_size}, {d_in})."
              )

-         self.current_row_idx += total_size
-         acts_buffer = new_buffer.reshape(total_size * context_size, d_in)
+         self.current_row_idx += batch_size
+         acts_buffer = acts_buffer.reshape(batch_size * context_size, d_in)

          if "token_ids" not in self.cached_activation_dataset.column_names:
              return acts_buffer, None

          token_ids_buffer = ds_slice["token_ids"]
-         if token_ids_buffer.shape != (total_size, context_size):
+         if token_ids_buffer.shape != (batch_size, context_size):
              raise ValueError(
                  f"token_ids_buffer has shape {token_ids_buffer.shape}, "
-                 f"but expected ({total_size}, {context_size})."
+                 f"but expected ({batch_size}, {context_size})."
              )
-         token_ids_buffer = token_ids_buffer.reshape(total_size * context_size)
+         token_ids_buffer = token_ids_buffer.reshape(batch_size * context_size)
          return acts_buffer, token_ids_buffer

      @torch.no_grad()
-     def get_raw_buffer(
+     def get_raw_llm_batch(
          self,
-         n_batches_in_buffer: int,
          raise_on_epoch_end: bool = False,
-         shuffle: bool = True,
      ) -> tuple[torch.Tensor, torch.Tensor | None]:
          """
-         Loads the next n_batches_in_buffer batches of activations into a tensor and returns it.
+         Loads the next batch of activations from the LLM and returns it.

-         The primary purpose here is maintaining a shuffling buffer.
+         If raise_on_epoch_end is True, when the dataset is exhausted it will
+         automatically refill the dataset and then raise a StopIteration so that
+         the caller has a chance to react.

-         If raise_on_epoch_end is True, when the dataset it exhausted it will automatically refill the dataset and then raise a StopIteration so that the caller has a chance to react.
+         Returns:
+             Tuple of (activations, token_ids) where activations has shape
+             (batch_size * context_size, d_in) and token_ids has shape
+             (batch_size * context_size,).
          """
-         context_size = self.context_size
-         batch_size = self.store_batch_size_prompts
          d_in = self.d_in
-         total_size = batch_size * n_batches_in_buffer

          if self.cached_activation_dataset is not None:
-             return self._load_buffer_from_cached(
-                 total_size, context_size, d_in, raise_on_epoch_end
-             )
+             return self._load_raw_llm_batch_from_cached(raise_on_epoch_end)

-         refill_iterator = range(0, total_size, batch_size)
-         # Initialize empty tensor buffer of the maximum required size with an additional dimension for layers
-         new_buffer_activations = torch.zeros(
-             (total_size, self.training_context_size, d_in),
-             dtype=self.dtype,  # type: ignore
-             device=self.device,
-         )
-         new_buffer_token_ids = torch.zeros(
-             (total_size, self.training_context_size),
-             dtype=torch.long,
-             device=self.device,
+         # move batch toks to gpu for model
+         batch_tokens = self.get_batch_tokens(raise_at_epoch_end=raise_on_epoch_end).to(
+             _get_model_device(self.model)
          )
+         activations = self.get_activations(batch_tokens).to(self.device)

-         for refill_batch_idx_start in tqdm(
-             refill_iterator, leave=False, desc="Refilling buffer"
-         ):
-             # move batch toks to gpu for model
-             refill_batch_tokens = self.get_batch_tokens(
-                 raise_at_epoch_end=raise_on_epoch_end
-             ).to(_get_model_device(self.model))
-             refill_activations = self.get_activations(refill_batch_tokens)
-             # move acts back to cpu
-             refill_activations.to(self.device)
-             new_buffer_activations[
-                 refill_batch_idx_start : refill_batch_idx_start + batch_size, ...
-             ] = refill_activations
-
-             # handle seqpos_slice, this is done for activations in get_activations
-             refill_batch_tokens = refill_batch_tokens[:, slice(*self.seqpos_slice)]
-             new_buffer_token_ids[
-                 refill_batch_idx_start : refill_batch_idx_start + batch_size, ...
-             ] = refill_batch_tokens
-
-         new_buffer_activations = new_buffer_activations.reshape(-1, d_in)
-         new_buffer_token_ids = new_buffer_token_ids.reshape(-1)
-         if shuffle:
-             new_buffer_activations, new_buffer_token_ids = permute_together(
-                 [new_buffer_activations, new_buffer_token_ids]
-             )
+         # handle seqpos_slice, this is done for activations in get_activations
+         batch_tokens = batch_tokens[:, slice(*self.seqpos_slice)]

-         return (
-             new_buffer_activations,
-             new_buffer_token_ids,
-         )
+         # reshape from (batch, context, d_in) to (batch * context, d_in)
+         activations = activations.reshape(-1, d_in)
+         token_ids = batch_tokens.reshape(-1)

-     def get_filtered_buffer(
+         return activations, token_ids
+
+     def get_filtered_llm_batch(
          self,
-         n_batches_in_buffer: int,
          raise_on_epoch_end: bool = False,
-         shuffle: bool = True,
      ) -> torch.Tensor:
+         """
+         Get a batch of LLM activations with special tokens filtered out.
+         """
          return _filter_buffer_acts(
-             self.get_raw_buffer(
-                 n_batches_in_buffer=n_batches_in_buffer,
-                 raise_on_epoch_end=raise_on_epoch_end,
-                 shuffle=shuffle,
-             ),
+             self.get_raw_llm_batch(raise_on_epoch_end=raise_on_epoch_end),
              self.exclude_special_tokens,
          )

      def _iterate_filtered_activations(self) -> Generator[torch.Tensor, None, None]:
          """
-         Iterate over the filtered tokens in the buffer.
+         Iterate over filtered LLM activation batches.
          """
          while True:
              try:
-                 yield self.get_filtered_buffer(
-                     self.half_buffer_size, raise_on_epoch_end=True
-                 )
+                 yield self.get_filtered_llm_batch(raise_on_epoch_end=True)
              except StopIteration:
                  warnings.warn(
                      "All samples in the training dataset have been exhausted, beginning new epoch."
                  )
                  try:
-                     yield self.get_filtered_buffer(self.half_buffer_size)
+                     yield self.get_filtered_llm_batch()
                  except StopIteration:
                      raise ValueError(
                          "Unable to fill buffer after starting new epoch. Dataset may be too small."
@@ -708,6 +673,7 @@ class ActivationsStore:
              buffer_size=self.n_batches_in_buffer * self.training_context_size,
              batch_size=self.train_batch_size_tokens,
              activations_loader=self._iterate_filtered_activations(),
+             mix_fraction=self.activations_mixing_fraction,
          )

      def next_batch(self) -> torch.Tensor:
@@ -823,9 +789,3 @@ def _filter_buffer_acts(

      mask = torch.isin(tokens, exclude_tokens)
      return activations[~mask]
-
-
- def permute_together(tensors: Sequence[torch.Tensor]) -> tuple[torch.Tensor, ...]:
-     """Permute tensors together."""
-     permutation = torch.randperm(tensors[0].shape[0])
-     return tuple(t[permutation] for t in tensors)
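The net effect of the `ActivationsStore` changes is that `get_raw_llm_batch` now returns a single model batch, flattened over the context dimension, instead of a pre-filled and pre-shuffled multi-batch buffer; the `permute_together` helper is removed along with that logic, and mixing appears to be delegated to the downstream buffer configured with `mix_fraction=self.activations_mixing_fraction`. A small, illustrative check of the flattening described in the new docstring, using made-up sizes:

```python
# Illustrative only: random stand-ins for one LLM batch, checking the
# (batch_size * context_size, ...) shapes documented for get_raw_llm_batch.
import torch

batch_size, context_size, d_in = 4, 128, 768  # hypothetical sizes

activations = torch.randn(batch_size, context_size, d_in)
token_ids = torch.randint(0, 50_000, (batch_size, context_size))

flat_activations = activations.reshape(-1, d_in)  # (512, 768)
flat_token_ids = token_ids.reshape(-1)            # (512,)

assert flat_activations.shape == (batch_size * context_size, d_in)
assert flat_token_ids.shape == (batch_size * context_size,)
```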