sae-lens 6.33.0__py3-none-any.whl → 6.34.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sae_lens/__init__.py +1 -1
- sae_lens/analysis/hooked_sae_transformer.py +176 -76
- sae_lens/analysis/sae_transformer_bridge.py +101 -70
- sae_lens/saes/sae.py +20 -2
- {sae_lens-6.33.0.dist-info → sae_lens-6.34.1.dist-info}/METADATA +1 -1
- {sae_lens-6.33.0.dist-info → sae_lens-6.34.1.dist-info}/RECORD +8 -8
- {sae_lens-6.33.0.dist-info → sae_lens-6.34.1.dist-info}/WHEEL +1 -1
- {sae_lens-6.33.0.dist-info → sae_lens-6.34.1.dist-info}/licenses/LICENSE +0 -0
sae_lens/__init__.py
CHANGED
sae_lens/analysis/hooked_sae_transformer.py
CHANGED

@@ -1,13 +1,14 @@
-import logging
 from contextlib import contextmanager
 from typing import Any, Callable

 import torch
+from torch import nn
 from transformer_lens.ActivationCache import ActivationCache
 from transformer_lens.components.mlps.can_be_used_as_mlp import CanBeUsedAsMLP
 from transformer_lens.hook_points import HookPoint  # Hooking utilities
 from transformer_lens.HookedTransformer import HookedTransformer

+from sae_lens import logger
 from sae_lens.saes.sae import SAE

 SingleLoss = torch.Tensor  # Type alias for a single element tensor
@@ -15,6 +16,72 @@ LossPerToken = torch.Tensor
 Loss = SingleLoss | LossPerToken


+class _SAEWrapper(nn.Module):
+    """Wrapper for SAE/Transcoder that handles error term and hook coordination.
+
+    For SAEs (input_hook == output_hook), _captured_input stays None and we use
+    the forward argument directly. For transcoders, _captured_input is set at
+    the input hook via capture_input().
+
+    Implementation Note:
+        The SAE is stored in __dict__ directly rather than as a registered submodule.
+        This is intentional: PyTorch's module registration would add a ".sae." prefix
+        to all hook names in the cache (e.g., "blocks.0.hook_mlp_out.sae.hook_sae_input"
+        instead of "blocks.0.hook_mlp_out.hook_sae_input"). By storing in __dict__ and
+        copying hooks directly to the wrapper, we preserve the expected cache paths
+        for backwards compatibility.
+    """
+
+    def __init__(self, sae: SAE[Any], use_error_term: bool = False):
+        super().__init__()
+        # Store SAE in __dict__ to avoid registering as submodule. This keeps cache
+        # paths clean by avoiding a ".sae." prefix on hook names. See class docstring.
+        self.__dict__["_sae"] = sae
+        # Copy SAE's hooks directly to wrapper so they appear at the right path
+        for name, hook in sae.hook_dict.items():
+            setattr(self, name, hook)
+        self.use_error_term = use_error_term
+        self._captured_input: torch.Tensor | None = None
+
+    @property
+    def sae(self) -> SAE[Any]:
+        return self.__dict__["_sae"]
+
+    def capture_input(self, x: torch.Tensor) -> None:
+        """Capture input at input hook (for transcoders).
+
+        Note: We don't clone the tensor here - the input should not be modified
+        in-place between capture and use, and avoiding clone preserves memory.
+        """
+        self._captured_input = x
+
+    def forward(self, original_output: torch.Tensor) -> torch.Tensor:
+        """Run SAE/transcoder at output hook location."""
+        # For SAE: use original_output as input (same hook for input/output)
+        # For transcoder: use captured input from earlier hook
+        sae_input = (
+            self._captured_input
+            if self._captured_input is not None
+            else original_output
+        )
+
+        # Temporarily disable SAE's internal use_error_term - we handle it here
+        # Use _use_error_term directly to avoid triggering deprecation warning
+        sae_use_error_term = self.sae._use_error_term
+        self.sae._use_error_term = False
+        try:
+            sae_out = self.sae(sae_input)
+
+            if self.use_error_term:
+                error = original_output - sae_out.detach()
+                sae_out = sae_out + error
+
+            return sae_out
+        finally:
+            self.sae._use_error_term = sae_use_error_term
+            self._captured_input = None
+
+
 def get_deep_attr(obj: Any, path: str):
     """Helper function to get a nested attribute from a object.
     In practice used to access HookedTransformer HookPoints (eg model.blocks[0].attn.hook_z)
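The error-term handling in _SAEWrapper.forward above is the usual splice-with-error-term trick: the value returned at the hook equals the original activation, while gradients (and any interventions on SAE activations) still flow through the SAE reconstruction. A minimal, self-contained PyTorch sketch of that identity (plain torch, not sae-lens code):

import torch

torch.manual_seed(0)
original_output = torch.randn(4)              # what the model would have produced
sae_out = torch.randn(4, requires_grad=True)  # stand-in for sae(sae_input)

error = original_output - sae_out.detach()    # constant w.r.t. the SAE path
patched = sae_out + error

assert torch.allclose(patched, original_output)     # forward value is unchanged
patched.sum().backward()
assert torch.allclose(sae_out.grad, torch.ones(4))  # gradient still reaches the SAE path

Because error is built from sae_out.detach(), an untouched forward pass reproduces the original model exactly, but ablating or steering the SAE activations still changes the spliced output.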
@@ -78,88 +145,130 @@ class HookedSAETransformer(HookedTransformer):
             add_hook_in_to_mlp(block.mlp)  # type: ignore
         self.setup()

-        self.
+        self._acts_to_saes: dict[str, _SAEWrapper] = {}
+        # Track output hooks used by transcoders for cleanup
+        self._transcoder_output_hooks: dict[str, str] = {}
+
+    @property
+    def acts_to_saes(self) -> dict[str, SAE[Any]]:
+        """Returns a dict mapping hook names to attached SAEs."""
+        return {name: wrapper.sae for name, wrapper in self._acts_to_saes.items()}

     def add_sae(self, sae: SAE[Any], use_error_term: bool | None = None):
-        """Attaches an SAE to the model
+        """Attaches an SAE or Transcoder to the model.

-        WARNING: This
+        WARNING: This SAE will be permanently attached until you remove it with
+        reset_saes. This function will also overwrite any existing SAE attached
+        to the same hook point.

         Args:
-            sae:
-            use_error_term:
+            sae: The SAE or Transcoder to attach to the model.
+            use_error_term: If True, computes error term so output matches what the
+                model would have produced without the SAE. This works for both SAEs
+                (where input==output hook) and transcoders (where they differ).
+                Defaults to None (uses SAE's existing setting).
         """
-
-
-
-
+        input_hook = sae.cfg.metadata.hook_name
+        output_hook = sae.cfg.metadata.hook_name_out or input_hook
+
+        if (input_hook not in self._acts_to_saes) and (
+            input_hook not in self.hook_dict
+        ):
+            logger.warning(
+                f"No hook found for {input_hook}. Skipping. Check model.hook_dict for available hooks."
             )
             return

-        if
-
-
-
-
-
+        # Check if output hook exists (either as hook_dict entry or already has SAE attached)
+        output_hook_exists = (
+            output_hook in self.hook_dict
+            or output_hook in self._acts_to_saes
+            or any(v == output_hook for v in self._transcoder_output_hooks.values())
+        )
+        if not output_hook_exists:
+            logger.warning(f"No hook found for output {output_hook}. Skipping.")
+            return
+
+        # Always use wrapper - it handles both SAEs and transcoders uniformly
+        # If use_error_term not specified, respect SAE's existing setting
+        effective_use_error_term = (
+            use_error_term if use_error_term is not None else sae.use_error_term
+        )
+        wrapper = _SAEWrapper(sae, use_error_term=effective_use_error_term)
+
+        # For transcoders (input != output), capture input at input hook
+        if input_hook != output_hook:
+            input_hook_point = get_deep_attr(self, input_hook)
+            if isinstance(input_hook_point, HookPoint):
+                input_hook_point.add_hook(
+                    lambda tensor, hook: (wrapper.capture_input(tensor), tensor)[1],  # noqa: ARG005
+                    dir="fwd",
+                    is_permanent=True,
+                )
+            self._transcoder_output_hooks[input_hook] = output_hook
+
+        # Store wrapper in _acts_to_saes and at output hook
+        self._acts_to_saes[input_hook] = wrapper
+        set_deep_attr(self, output_hook, wrapper)
         self.setup()

-    def _reset_sae(
-
+    def _reset_sae(
+        self, act_name: str, prev_wrapper: _SAEWrapper | None = None
+    ) -> None:
+        """Resets an SAE that was attached to the model.

         By default will remove the SAE from that hook_point.
-        If
-        This is mainly used to restore previously attached SAEs after temporarily running with different SAEs (eg with run_with_saes)
+        If prev_wrapper is provided, will restore that wrapper's SAE with its settings.

         Args:
-            act_name:
-
+            act_name: The hook_name of the SAE to reset.
+            prev_wrapper: The previous wrapper to restore. If None, will just
+                remove the SAE from this hook point. Defaults to None.
         """
-        if act_name not in self.
-
+        if act_name not in self._acts_to_saes:
+            logger.warning(
                 f"No SAE is attached to {act_name}. There's nothing to reset."
             )
             return

-
-
-
-
+        # Determine output hook location (different from input for transcoders)
+        output_hook = self._transcoder_output_hooks.pop(act_name, act_name)
+
+        # For transcoders, clear permanent hooks from input hook point
+        if output_hook != act_name:
+            input_hook_point = get_deep_attr(self, act_name)
+            if isinstance(input_hook_point, HookPoint):
+                input_hook_point.remove_hooks(dir="fwd", including_permanent=True)
+
+        # Reset output hook location
+        set_deep_attr(self, output_hook, HookPoint())
+        del self._acts_to_saes[act_name]

-        if
-
-            self.
-
-        set_deep_attr(self, act_name, HookPoint())
-        del self.acts_to_saes[act_name]
+        if prev_wrapper is not None:
+            # Rebuild hook_dict before adding new SAE
+            self.setup()
+            self.add_sae(prev_wrapper.sae, use_error_term=prev_wrapper.use_error_term)

     def reset_saes(
         self,
         act_names: str | list[str] | None = None,
-
-
-        """Reset the SAEs attached to the model
+    ) -> None:
+        """Reset the SAEs attached to the model.

-        If act_names are provided will just reset SAEs attached to those hooks.
-
+        If act_names are provided will just reset SAEs attached to those hooks.
+        Otherwise will reset all SAEs attached to the model.

         Args:
-            act_names
-
+            act_names: The act_names of the SAEs to reset. If None, will reset
+                all SAEs attached to the model. Defaults to None.
         """
         if isinstance(act_names, str):
             act_names = [act_names]
         elif act_names is None:
-            act_names = list(self.
-
-        if prev_saes:
-            if len(act_names) != len(prev_saes):
-                raise ValueError("act_names and prev_saes must have the same length")
-        else:
-            prev_saes = [None] * len(act_names)  # type: ignore
+            act_names = list(self._acts_to_saes.keys())

-        for act_name
-            self._reset_sae(act_name
+        for act_name in act_names:
+            self._reset_sae(act_name)

         self.setup()

@@ -269,41 +378,32 @@ class HookedSAETransformer(HookedTransformer):
         reset_saes_end: bool = True,
         use_error_term: bool | None = None,
     ):
-        """
-        A context manager for adding temporary SAEs to the model.
-        See HookedTransformer.hooks for a similar context manager for hooks.
-        By default will keep track of previously attached SAEs, and restore them when the context manager exits.
-
-        Example:
-
-        .. code-block:: python
-
-            from transformer_lens import HookedSAETransformer
-            from sae_lens.saes.sae import SAE
-
-            model = HookedSAETransformer.from_pretrained('gpt2-small')
-            sae_cfg = SAEConfig(...)
-            sae = SAE(sae_cfg)
-            with model.saes(saes=[sae]):
-                spliced_logits = model(text)
+        """A context manager for adding temporary SAEs to the model.

+        See HookedTransformer.hooks for a similar context manager for hooks.
+        By default will keep track of previously attached SAEs, and restore
+        them when the context manager exits.

         Args:
-            saes
-            reset_saes_end
-
+            saes: SAEs to be attached.
+            reset_saes_end: If True, removes all SAEs added by this context
+                manager when the context manager exits, returning previously
+                attached SAEs to their original state.
+            use_error_term: If provided, will set the use_error_term attribute
+                of all SAEs attached during this run to this value.
         """
-
-        prev_saes = []
+        saes_to_restore: list[tuple[str, _SAEWrapper | None]] = []
         if isinstance(saes, SAE):
             saes = [saes]
         try:
             for sae in saes:
-
-
-
+                act_name = sae.cfg.metadata.hook_name
+                prev_wrapper = self._acts_to_saes.get(act_name, None)
+                saes_to_restore.append((act_name, prev_wrapper))
                 self.add_sae(sae, use_error_term=use_error_term)
             yield self
         finally:
             if reset_saes_end:
-
+                for act_name, prev_wrapper in saes_to_restore:
+                    self._reset_sae(act_name, prev_wrapper)
+                self.setup()
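Taken together, the new add_sae, _reset_sae, and saes() plumbing is driven the same way as before. A usage sketch (not part of the diff), assuming the top-level HookedSAETransformer and SAE exports; the model name mirrors the example removed from the old docstring, the release/id strings are illustrative placeholders, and SAE.from_pretrained's exact signature may vary between sae-lens versions:

from sae_lens import SAE, HookedSAETransformer

model = HookedSAETransformer.from_pretrained("gpt2-small")
sae = SAE.from_pretrained("gpt2-small-res-jb", "blocks.8.hook_resid_pre")  # placeholder ids

# Permanent attach (until reset_saes); the error term keeps the forward pass
# numerically identical to the un-spliced model while still exposing SAE hooks.
model.add_sae(sae, use_error_term=True)
logits = model("Hello world")
model.reset_saes()

# Temporary attach; any previously attached SAEs are restored on exit.
with model.saes(saes=[sae], use_error_term=True):
    _, cache = model.run_with_cache("Hello world")
    # SAE hooks keep their old cache paths (no ".sae." prefix), e.g.
    # cache[f"{sae.cfg.metadata.hook_name}.hook_sae_input"]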
sae_lens/analysis/sae_transformer_bridge.py
CHANGED

@@ -8,7 +8,11 @@ from transformer_lens.hook_points import HookPoint
 from transformer_lens.model_bridge import TransformerBridge

 from sae_lens import logger
-from sae_lens.analysis.hooked_sae_transformer import
+from sae_lens.analysis.hooked_sae_transformer import (
+    _SAEWrapper,
+    get_deep_attr,
+    set_deep_attr,
+)
 from sae_lens.saes.sae import SAE

 SingleLoss = torch.Tensor  # Type alias for a single element tensor
@@ -30,11 +34,18 @@ class SAETransformerBridge(TransformerBridge):  # type: ignore[misc,no-untyped-c
     useful for models not natively supported by HookedTransformer, such as Gemma 3.
     """

-
+    _acts_to_saes: dict[str, _SAEWrapper]

     def __init__(self, *args: Any, **kwargs: Any):
         super().__init__(*args, **kwargs)
-        self.
+        self._acts_to_saes = {}
+        # Track output hooks used by transcoders for cleanup
+        self._transcoder_output_hooks: dict[str, str] = {}
+
+    @property
+    def acts_to_saes(self) -> dict[str, SAE[Any]]:
+        """Returns a dict mapping hook names to attached SAEs."""
+        return {name: wrapper.sae for name, wrapper in self._acts_to_saes.items()}

     @classmethod
     def boot_transformers(  # type: ignore[override]
@@ -56,7 +67,8 @@ class SAETransformerBridge(TransformerBridge):  # type: ignore[misc,no-untyped-c
         # Convert to our class
         # NOTE: this is super hacky and scary, but I don't know how else to achieve this given TLens' internal code
         bridge.__class__ = cls
-        bridge.
+        bridge._acts_to_saes = {}  # type: ignore[attr-defined]
+        bridge._transcoder_output_hooks = {}  # type: ignore[attr-defined]
         return bridge  # type: ignore[return-value]

     def _resolve_hook_name(self, hook_name: str) -> str:
@@ -75,110 +87,129 @@ class SAETransformerBridge(TransformerBridge):  # type: ignore[misc,no-untyped-c
         return resolved if isinstance(resolved, str) else hook_name

     def add_sae(self, sae: SAE[Any], use_error_term: bool | None = None) -> None:
-        """Attaches an SAE to the model.
+        """Attaches an SAE or Transcoder to the model.

         WARNING: This SAE will be permanently attached until you remove it with
         reset_saes. This function will also overwrite any existing SAE attached
         to the same hook point.

         Args:
-            sae: The SAE to attach to the model
-            use_error_term: If
-
-
+            sae: The SAE or Transcoder to attach to the model.
+            use_error_term: If True, computes error term so output matches what the
+                model would have produced without the SAE. This works for both SAEs
+                (where input==output hook) and transcoders (where they differ).
+                Defaults to None (uses SAE's existing setting).
         """
-
-
-
-
-
-
+        input_hook_alias = sae.cfg.metadata.hook_name
+        output_hook_alias = sae.cfg.metadata.hook_name_out or input_hook_alias
+        input_hook_actual = self._resolve_hook_name(input_hook_alias)
+        output_hook_actual = self._resolve_hook_name(output_hook_alias)
+
+        # Check if hooks exist
+        if (input_hook_alias not in self._acts_to_saes) and (
+            input_hook_actual not in self._hook_registry
         ):
             logger.warning(
-                f"No hook found for {
+                f"No hook found for {input_hook_alias}. Skipping. "
                 f"Check model._hook_registry for available hooks."
             )
             return

-        if
-
-
-
-
-
-
-
-
+        # Check if output hook exists (either as registry entry or already has SAE attached)
+        output_hook_exists = (
+            output_hook_actual in self._hook_registry
+            or input_hook_alias in self._acts_to_saes
+            or any(
+                v == output_hook_actual for v in self._transcoder_output_hooks.values()
+            )
+        )
+        if not output_hook_exists:
+            logger.warning(f"No hook found for output {output_hook_alias}. Skipping.")
+            return

-
+        # Always use wrapper - it handles both SAEs and transcoders uniformly
+        # If use_error_term not specified, respect SAE's existing setting
+        effective_use_error_term = (
+            use_error_term if use_error_term is not None else sae.use_error_term
+        )
+        wrapper = _SAEWrapper(sae, use_error_term=effective_use_error_term)
+
+        # For transcoders (input != output), capture input at input hook
+        if input_hook_alias != output_hook_alias:
+            input_hook_point = get_deep_attr(self, input_hook_actual)
+            if isinstance(input_hook_point, HookPoint):
+                input_hook_point.add_hook(
+                    lambda tensor, hook: (wrapper.capture_input(tensor), tensor)[1],  # noqa: ARG005
+                    dir="fwd",
+                    is_permanent=True,
+                )
+            self._transcoder_output_hooks[input_hook_alias] = output_hook_actual
+
+        # Store wrapper in _acts_to_saes and at output hook
+        set_deep_attr(self, output_hook_actual, wrapper)
+        self._hook_registry[output_hook_actual] = wrapper  # type: ignore[assignment]
+        self._acts_to_saes[input_hook_alias] = wrapper
+
+    def _reset_sae(
+        self, act_name: str, prev_wrapper: _SAEWrapper | None = None
+    ) -> None:
         """Resets an SAE that was attached to the model.

         By default will remove the SAE from that hook_point.
-        If
-        This is mainly used to restore previously attached SAEs after temporarily
-        running with different SAEs (e.g., with run_with_saes).
+        If prev_wrapper is provided, will restore that wrapper's SAE with its settings.

         Args:
             act_name: The hook_name of the SAE to reset
-
+            prev_wrapper: The previous wrapper to restore. If None, will just
                 remove the SAE from this hook point. Defaults to None.
         """
-        if act_name not in self.
+        if act_name not in self._acts_to_saes:
             logger.warning(
                 f"No SAE is attached to {act_name}. There's nothing to reset."
             )
             return

         actual_name = self._resolve_hook_name(act_name)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # Determine output hook location (different from input for transcoders)
+        output_hook = self._transcoder_output_hooks.pop(act_name, actual_name)
+
+        # For transcoders, clear permanent hooks from input hook point
+        if output_hook != actual_name:
+            input_hook_point = get_deep_attr(self, actual_name)
+            if isinstance(input_hook_point, HookPoint):
+                input_hook_point.remove_hooks(dir="fwd", including_permanent=True)
+
+        # Reset output hook location
+        new_hook = HookPoint()
+        new_hook.name = output_hook
+        set_deep_attr(self, output_hook, new_hook)
+        self._hook_registry[output_hook] = new_hook
+        del self._acts_to_saes[act_name]
+
+        if prev_wrapper is not None:
+            self.add_sae(prev_wrapper.sae, use_error_term=prev_wrapper.use_error_term)

     def reset_saes(
         self,
         act_names: str | list[str] | None = None,
-        prev_saes: list[SAE[Any] | None] | None = None,
     ) -> None:
         """Reset the SAEs attached to the model.

         If act_names are provided will just reset SAEs attached to those hooks.
         Otherwise will reset all SAEs attached to the model.
-        Optionally can provide a list of prev_saes to reset to. This is mainly
-        used to restore previously attached SAEs after temporarily running with
-        different SAEs (e.g., with run_with_saes).

         Args:
             act_names: The act_names of the SAEs to reset. If None, will reset all
                 SAEs attached to the model. Defaults to None.
-            prev_saes: List of SAEs to replace the current ones with. If None, will
-                just remove the SAEs. Defaults to None.
         """
         if isinstance(act_names, str):
             act_names = [act_names]
         elif act_names is None:
-            act_names = list(self.
-
-        if prev_saes:
-            if len(act_names) != len(prev_saes):
-                raise ValueError("act_names and prev_saes must have the same length")
-        else:
-            prev_saes = [None] * len(act_names)  # type: ignore[assignment]
+            act_names = list(self._acts_to_saes.keys())

-        for act_name
-            self._reset_sae(act_name
+        for act_name in act_names:
+            self._reset_sae(act_name)

     def run_with_saes(
         self,
@@ -310,20 +341,20 @@ class SAETransformerBridge(TransformerBridge):  # type: ignore[misc,no-untyped-c
             use_error_term: If provided, will set the use_error_term attribute of
                 all SAEs attached during this run to this value. Defaults to None.
         """
-
-        prev_saes: list[SAE[Any] | None] = []
+        saes_to_restore: list[tuple[str, _SAEWrapper | None]] = []
         if isinstance(saes, SAE):
             saes = [saes]
         try:
             for sae in saes:
-
-
-
+                act_name = sae.cfg.metadata.hook_name
+                prev_wrapper = self._acts_to_saes.get(act_name, None)
+                saes_to_restore.append((act_name, prev_wrapper))
                 self.add_sae(sae, use_error_term=use_error_term)
             yield self
         finally:
             if reset_saes_end:
-
+                for act_name, prev_wrapper in saes_to_restore:
+                    self._reset_sae(act_name, prev_wrapper)

     @property
     def hook_dict(self) -> dict[str, HookPoint]:
@@ -337,9 +368,9 @@ class SAETransformerBridge(TransformerBridge):  # type: ignore[misc,no-untyped-c
         hooks: dict[str, HookPoint] = {}

         for name, hook_or_sae in self._hook_registry.items():
-            if isinstance(hook_or_sae,
+            if isinstance(hook_or_sae, _SAEWrapper):
                 # Include SAE's internal hooks with full path names
-                for sae_hook_name, sae_hook in hook_or_sae.hook_dict.items():
+                for sae_hook_name, sae_hook in hook_or_sae.sae.hook_dict.items():
                     full_name = f"{name}.{sae_hook_name}"
                     hooks[full_name] = sae_hook
             else:
sae_lens/saes/sae.py
CHANGED

@@ -229,7 +229,7 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):
     cfg: T_SAE_CONFIG
     dtype: torch.dtype
     device: torch.device
-
+    _use_error_term: bool

     # For type checking only - don't provide default values
     # These will be initialized by subclasses
@@ -254,7 +254,9 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

         self.dtype = str_to_dtype(cfg.dtype)
         self.device = torch.device(cfg.device)
-        self.
+        self._use_error_term = False  # Set directly to avoid warning during init
+        if use_error_term:
+            self.use_error_term = True  # Use property setter to trigger warning

         # Set up activation function
         self.activation_fn = self.get_activation_fn()
@@ -281,6 +283,22 @@ class SAE(HookedRootModule, Generic[T_SAE_CONFIG], ABC):

         self.setup()  # Required for HookedRootModule

+    @property
+    def use_error_term(self) -> bool:
+        return self._use_error_term
+
+    @use_error_term.setter
+    def use_error_term(self, value: bool) -> None:
+        if value and not self._use_error_term:
+            warnings.warn(
+                "Setting use_error_term directly on SAE is deprecated. "
+                "Use HookedSAETransformer.add_sae(sae, use_error_term=True) instead. "
+                "This will be removed in a future version.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        self._use_error_term = value
+
     @torch.no_grad()
     def fold_activation_norm_scaling_factor(self, scaling_factor: float):
         self.W_enc.data *= scaling_factor  # type: ignore
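With the new property, assigning use_error_term directly on an SAE still works but now emits a DeprecationWarning; the recommended path is to pass use_error_term to add_sae. A hedged sketch of that behaviour, assuming `sae` is any loaded sae-lens SAE (constructed with the default use_error_term=False) and `model` is a HookedSAETransformer; both are hypothetical, pre-existing objects here:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    sae.use_error_term = True  # routed through the new property setter
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Preferred going forward: let the wrapper own the error term.
model.add_sae(sae, use_error_term=True)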
{sae_lens-6.33.0.dist-info → sae_lens-6.34.1.dist-info}/RECORD
CHANGED

@@ -1,9 +1,9 @@
-sae_lens/__init__.py,sha256=
+sae_lens/__init__.py,sha256=U5I6dWw2_x6ZXwL8BF72vB0HGPEUt-_BzHq-btLq2pw,5168
 sae_lens/analysis/__init__.py,sha256=FZExlMviNwWR7OGUSGRbd0l-yUDGSp80gglI_ivILrY,412
 sae_lens/analysis/compat.py,sha256=cgE3nhFcJTcuhppxbL71VanJS7YqVEOefuneB5eOaPw,538
-sae_lens/analysis/hooked_sae_transformer.py,sha256
+sae_lens/analysis/hooked_sae_transformer.py,sha256=-LY9CKYEziSTt-H7MeLdTx6ErfvMqgNEF709wV7tEs4,17826
 sae_lens/analysis/neuronpedia_integration.py,sha256=Gx1W7hUBEuMoasNcnOnZ1wmqbXDd1pSZ1nqKEya1HQc,4962
-sae_lens/analysis/sae_transformer_bridge.py,sha256=
+sae_lens/analysis/sae_transformer_bridge.py,sha256=y_ZdvaxuUM_-7ywSCeFl6f6cq1_FiqRstMAo8mxZtYE,16191
 sae_lens/cache_activations_runner.py,sha256=TjqNWIc46Nw09jHWFjzQzgzG5wdu_87Ahe-iFjI5_0Q,13117
 sae_lens/config.py,sha256=V0BXV8rvpbm5YuVukow9FURPpdyE4HSflbdymAo0Ycg,31205
 sae_lens/constants.py,sha256=CM-h9AjZNAl2aP7hVpKk7YsFHpu-_Lfhhmq2d5qPEVc,887
@@ -22,7 +22,7 @@ sae_lens/saes/gated_sae.py,sha256=V_2ZNlV4gRD-rX5JSx1xqY7idT8ChfdQ5yxWDdu_6hg,88
 sae_lens/saes/jumprelu_sae.py,sha256=miiF-xI_yXdV9EkKjwAbU9zSMsx9KtKCz5YdXEzkN8g,13313
 sae_lens/saes/matching_pursuit_sae.py,sha256=08_G9p1YMLnE5qZVCPp6gll-iG6nHRbMMASf4_bkFt8,13207
 sae_lens/saes/matryoshka_batchtopk_sae.py,sha256=Qr6htt1HHOuO9FXI9hyaPSnGFIiJG-v7y1t1CEmkFzM,5995
-sae_lens/saes/sae.py,sha256=
+sae_lens/saes/sae.py,sha256=2FHhLoTZYOJUGjkWH7eF2EAc1fPxQsF4KYr5TZ8IUIU,40155
 sae_lens/saes/standard_sae.py,sha256=_hldNZkFPAf9VGrxouR1-tN8T2OEk8IkWBcXoatrC1o,5749
 sae_lens/saes/temporal_sae.py,sha256=S44sPddVj2xujA02CC8gT1tG0in7c_CSAhspu9FHbaA,13273
 sae_lens/saes/topk_sae.py,sha256=vrMRPrCQR1o8G_kXqY_EAoGZARupkQNFB2dNZVLsusE,21073
@@ -49,7 +49,7 @@ sae_lens/training/types.py,sha256=1FpLx_Doda9vZpmfm-x1e8wGBYpyhe9Kpb_JuM5nIFM,90
 sae_lens/training/upload_saes_to_huggingface.py,sha256=r_WzI1zLtGZ5TzAxuG3xa_8T09j3zXJrWd_vzPsPGkQ,4469
 sae_lens/tutorial/tsea.py,sha256=fd1am_XXsf2KMbByDapJo-2qlxduKaa62Z2qcQZ3QKU,18145
 sae_lens/util.py,sha256=oIMoeyEP2IzcPFmRbKUzOAycgEyMcOasGeO_BGVZbc4,4846
-sae_lens-6.
-sae_lens-6.
-sae_lens-6.
-sae_lens-6.
+sae_lens-6.34.1.dist-info/METADATA,sha256=vAlbWT90NggKoxjII6dnEj7wlC-PmuBn8zzqL6r8dRg,6566
+sae_lens-6.34.1.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
+sae_lens-6.34.1.dist-info/licenses/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
+sae_lens-6.34.1.dist-info/RECORD,,

{sae_lens-6.33.0.dist-info → sae_lens-6.34.1.dist-info}/licenses/LICENSE

File without changes