PyPI - sae-lens - Versions diffs - 6.10.0__py3-none-any.whl → 6.11.1__py3-none-any.whl - Mend

sae-lens 6.10.0__py3-none-any.whl → 6.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

sae_lens/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.10.0"
+__version__ = "6.11.1"
 import logging

sae_lens/saes/batchtopk_sae.py CHANGED Viewed

@@ -15,7 +15,7 @@ class BatchTopK(nn.Module):
     def __init__(
         self,
-        k: int,
+        k: float,
     ):
         super().__init__()
         self.k = k
@@ -23,7 +23,7 @@ class BatchTopK(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         acts = x.relu()
         flat_acts = acts.flatten()
-        acts_topk_flat = torch.topk(flat_acts, self.k * acts.shape[0], dim=-1)
+        acts_topk_flat = torch.topk(flat_acts, int(self.k * acts.shape[0]), dim=-1)
         return (
             torch.zeros_like(flat_acts)
             .scatter(-1, acts_topk_flat.indices, acts_topk_flat.values)
@@ -37,6 +37,7 @@ class BatchTopKTrainingSAEConfig(TopKTrainingSAEConfig):
     Configuration class for training a BatchTopKTrainingSAE.
     """
+    k: float = 100  # type: ignore[assignment]
     topk_threshold_lr: float = 0.01
     @override

sae_lens/saes/jumprelu_sae.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Literal
 import numpy as np
 import torch
@@ -187,13 +187,29 @@ class JumpReLUSAE(SAE[JumpReLUSAEConfig]):
 class JumpReLUTrainingSAEConfig(TrainingSAEConfig):
     """
     Configuration class for training a JumpReLUTrainingSAE.
+    - jumprelu_init_threshold: initial threshold for the JumpReLU activation
+    - jumprelu_bandwidth: bandwidth for the JumpReLU activation
+    - jumprelu_sparsity_loss_mode: mode for the sparsity loss, either "step" or "tanh". "step" is Google Deepmind's L0 loss, "tanh" is Anthropic's sparsity loss.
+    - l0_coefficient: coefficient for the l0 sparsity loss
+    - l0_warm_up_steps: number of warm-up steps for the l0 sparsity loss
+    - pre_act_loss_coefficient: coefficient for the pre-activation loss. Set to None to disable. Set to 3e-6 to match Anthropic's setup. Default is None.
+    - jumprelu_tanh_scale: scale for the tanh sparsity loss. Only relevant for "tanh" sparsity loss mode. Default is 4.0.
     """
     jumprelu_init_threshold: float = 0.01
     jumprelu_bandwidth: float = 0.05
+    # step is Google Deepmind, tanh is Anthropic
+    jumprelu_sparsity_loss_mode: Literal["step", "tanh"] = "step"
     l0_coefficient: float = 1.0
     l0_warm_up_steps: int = 0
+    # anthropic's auxiliary loss to avoid dead features
+    pre_act_loss_coefficient: float | None = None
+    # only relevant for tanh sparsity loss mode
+    jumprelu_tanh_scale: float = 4.0
     @override
     @classmethod
     def architecture(cls) -> str:
@@ -267,9 +283,35 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
         sae_out: torch.Tensor,
     ) -> dict[str, torch.Tensor]:
         """Calculate architecture-specific auxiliary loss terms."""
-        l0 = torch.sum(Step.apply(hidden_pre, self.threshold, self.bandwidth), dim=-1)  # type: ignore
-        l0_loss = (step_input.coefficients["l0"] * l0).mean()
-        return {"l0_loss": l0_loss}
+        threshold = self.threshold
+        W_dec_norm = self.W_dec.norm(dim=1)
+        if self.cfg.jumprelu_sparsity_loss_mode == "step":
+            l0 = torch.sum(
+                Step.apply(hidden_pre, threshold, self.bandwidth),  # type: ignore
+                dim=-1,
+            )
+            l0_loss = (step_input.coefficients["l0"] * l0).mean()
+        elif self.cfg.jumprelu_sparsity_loss_mode == "tanh":
+            per_item_l0_loss = torch.tanh(
+                self.cfg.jumprelu_tanh_scale * feature_acts * W_dec_norm
+            ).sum(dim=-1)
+            l0_loss = (step_input.coefficients["l0"] * per_item_l0_loss).mean()
+        else:
+            raise ValueError(
+                f"Invalid sparsity loss mode: {self.cfg.jumprelu_sparsity_loss_mode}"
+            )
+        losses = {"l0_loss": l0_loss}
+        if self.cfg.pre_act_loss_coefficient is not None:
+            losses["pre_act_loss"] = calculate_pre_act_loss(
+                self.cfg.pre_act_loss_coefficient,
+                threshold,
+                hidden_pre,
+                step_input.dead_neuron_mask,
+                W_dec_norm,
+            )
+        return losses
     @override
     def get_coefficients(self) -> dict[str, float | TrainCoefficientConfig]:
@@ -310,3 +352,21 @@ class JumpReLUTrainingSAE(TrainingSAE[JumpReLUTrainingSAEConfig]):
             threshold = state_dict["threshold"]
             del state_dict["threshold"]
             state_dict["log_threshold"] = torch.log(threshold).detach().contiguous()
+def calculate_pre_act_loss(
+    pre_act_loss_coefficient: float,
+    threshold: torch.Tensor,
+    hidden_pre: torch.Tensor,
+    dead_neuron_mask: torch.Tensor | None,
+    W_dec_norm: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Calculate Anthropic's pre-activation loss, except we only calculate this for latents that are actually dead.
+    """
+    if dead_neuron_mask is None or not dead_neuron_mask.any():
+        return hidden_pre.new_tensor(0.0)
+    per_item_loss = (
+        (threshold - hidden_pre).relu() * dead_neuron_mask * W_dec_norm
+    ).sum(dim=-1)
+    return pre_act_loss_coefficient * per_item_loss.mean()

{sae_lens-6.10.0.dist-info → sae_lens-6.11.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: sae-lens
-Version: 6.10.0
+Version: 6.11.1
 Summary: Training and Analyzing Sparse Autoencoders (SAEs)
 License: MIT
 Keywords: deep-learning,sparse-autoencoders,mechanistic-interpretability,PyTorch

{sae_lens-6.10.0.dist-info → sae_lens-6.11.1.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-sae_lens/__init__.py,sha256=k8M2SyKNE3KpipPxODICdLG8KJNVvf1Zab4KNJuGWMQ,3589
+sae_lens/__init__.py,sha256=DLmCuiml_kjSeA2AlEbJwnCIwOorh5MLGRXt4uL7mqs,3589
 sae_lens/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sae_lens/analysis/hooked_sae_transformer.py,sha256=vRu6JseH1lZaEeILD5bEkQEQ1wYHHDcxD-f2olKmE9Y,14275
 sae_lens/analysis/neuronpedia_integration.py,sha256=Gx1W7hUBEuMoasNcnOnZ1wmqbXDd1pSZ1nqKEya1HQc,4962
@@ -15,9 +15,9 @@ sae_lens/pretokenize_runner.py,sha256=w0f6SfZLAxbp5eAAKnet8RqUB_DKofZ9RGsoJwFnYb
 sae_lens/pretrained_saes.yaml,sha256=d6FYfWTdVAPlOCM55C1ICS6lF9nWPPVNwjlXCa9p7NU,600468
 sae_lens/registry.py,sha256=nhy7BPSudSATqW4lo9H_k3Na7sfGHmAf9v-3wpnLL_o,1490
 sae_lens/saes/__init__.py,sha256=jVwazK8Q6dW5J6_zFXPoNAuBvSxgziQ8eMOjGM3t-X8,1475
-sae_lens/saes/batchtopk_sae.py,sha256=CyaFG2hMyyDaEaXXrAMJC8wQDW1JoddTKF5mvxxBQKY,3395
+sae_lens/saes/batchtopk_sae.py,sha256=GX_J0vH4vzeLqYxl0mkfsZQpFEoCEHMR4dIG8fz8N8w,3449
 sae_lens/saes/gated_sae.py,sha256=qcmM9JwBA8aZR8z_IRHV1_gQX-q_63tKewWXRnhdXuo,8986
-sae_lens/saes/jumprelu_sae.py,sha256=3xkhBcCol2mEpIBLceymCpudocm2ypOjTeTXbpiXoA4,10794
+sae_lens/saes/jumprelu_sae.py,sha256=HHBF1sJ95lZvxwP5vwLSQFKdnJN2KKYK0WAEaLTrta0,13399
 sae_lens/saes/sae.py,sha256=gdUZuLaOHQrPjbDj-nZI813B6-_mNAnV9i9z4qTnpHk,38255
 sae_lens/saes/standard_sae.py,sha256=9UqYyYtQuThYxXKNaDjYcyowpOx2-7cShG-TeUP6JCQ,5940
 sae_lens/saes/topk_sae.py,sha256=CXMBI6CFvI5829bOhoQ350VXR9d8uFHUDlULTIWHXoU,8686
@@ -33,7 +33,7 @@ sae_lens/training/types.py,sha256=qSjmGzXf3MLalygG0psnVjmhX_mpLmL47MQtZfe7qxg,81
 sae_lens/training/upload_saes_to_huggingface.py,sha256=r_WzI1zLtGZ5TzAxuG3xa_8T09j3zXJrWd_vzPsPGkQ,4469
 sae_lens/tutorial/tsea.py,sha256=fd1am_XXsf2KMbByDapJo-2qlxduKaa62Z2qcQZ3QKU,18145
 sae_lens/util.py,sha256=lW7fBn_b8quvRYlen9PUmB7km60YhKyjmuelB1f6KzQ,2253
-sae_lens-6.10.0.dist-info/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
-sae_lens-6.10.0.dist-info/METADATA,sha256=7Yq4_hrZVc2CBB4nMvgy_BGFjT5FrF3SfOo8LnJ18Rg,5245
-sae_lens-6.10.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-sae_lens-6.10.0.dist-info/RECORD,,
+sae_lens-6.11.1.dist-info/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
+sae_lens-6.11.1.dist-info/METADATA,sha256=qRU9qqA2fLgiyLct7lTpOOLjkkXAIzUEdpDrV1NwKX0,5245
+sae_lens-6.11.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+sae_lens-6.11.1.dist-info/RECORD,,

{sae_lens-6.10.0.dist-info → sae_lens-6.11.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{sae_lens-6.10.0.dist-info → sae_lens-6.11.1.dist-info}/WHEEL RENAMED Viewed

File without changes

sae-lens 6.10.0__py3-none-any.whl → 6.11.1__py3-none-any.whl

sae-lens 6.10.0__py3-none-any.whl → 6.11.1__py3-none-any.whl