sae-lens 6.6.3__py3-none-any.whl → 6.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sae_lens/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
1
  # ruff: noqa: E402
2
- __version__ = "6.6.3"
2
+ __version__ = "6.6.5"
3
3
 
4
4
  import logging
5
5
 
@@ -8,27 +8,6 @@ from typing import Any, TypeVar
8
8
 
9
9
  import requests
10
10
  from dotenv import load_dotenv
11
- from neuron_explainer.activations.activation_records import calculate_max_activation
12
- from neuron_explainer.activations.activations import ActivationRecord
13
- from neuron_explainer.explanations.calibrated_simulator import (
14
- UncalibratedNeuronSimulator,
15
- )
16
- from neuron_explainer.explanations.explainer import (
17
- HARMONY_V4_MODELS,
18
- ContextSize,
19
- TokenActivationPairExplainer,
20
- )
21
- from neuron_explainer.explanations.explanations import ScoredSimulation
22
- from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet
23
- from neuron_explainer.explanations.prompt_builder import PromptFormat
24
- from neuron_explainer.explanations.scoring import (
25
- _simulate_and_score_sequence,
26
- aggregate_scored_sequence_simulations,
27
- )
28
- from neuron_explainer.explanations.simulator import (
29
- LogprobFreeExplanationTokenSimulator,
30
- NeuronSimulator,
31
- )
32
11
  from tenacity import retry, stop_after_attempt, wait_random_exponential
33
12
 
34
13
  from sae_lens import SAE, logger
@@ -158,10 +137,22 @@ def sleep_identity(x: T) -> T:
158
137
 
159
138
  @retry(wait=wait_random_exponential(min=1, max=500), stop=stop_after_attempt(10))
160
139
  async def simulate_and_score( # type: ignore
161
- simulator: NeuronSimulator,
162
- activation_records: list[ActivationRecord], # type: ignore
163
- ) -> ScoredSimulation: # type: ignore
140
+ simulator: Any,
141
+ activation_records: list[Any],
142
+ ) -> Any:
164
143
  """Score an explanation of a neuron by how well it predicts activations on the given text sequences."""
144
+ try:
145
+ from neuron_explainer.explanations.scoring import (
146
+ _simulate_and_score_sequence,
147
+ aggregate_scored_sequence_simulations,
148
+ )
149
+ except ImportError as e:
150
+ raise ImportError(
151
+ "The neuron_explainer package is required to use this function. "
152
+ "Please install SAELens with the neuronpedia optional dependencies: "
153
+ "pip install sae-lens[neuronpedia]"
154
+ ) from e
155
+
165
156
  scored_sequence_simulations = await asyncio.gather(
166
157
  *[
167
158
  sleep_identity(
@@ -253,6 +244,31 @@ async def autointerp_neuronpedia_features( # noqa: C901
253
244
  Returns:
254
245
  None
255
246
  """
247
+ try:
248
+ from neuron_explainer.activations.activation_records import (
249
+ calculate_max_activation,
250
+ )
251
+ from neuron_explainer.activations.activations import ActivationRecord
252
+ from neuron_explainer.explanations.calibrated_simulator import (
253
+ UncalibratedNeuronSimulator,
254
+ )
255
+ from neuron_explainer.explanations.explainer import (
256
+ HARMONY_V4_MODELS,
257
+ ContextSize,
258
+ TokenActivationPairExplainer,
259
+ )
260
+ from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet
261
+ from neuron_explainer.explanations.prompt_builder import PromptFormat
262
+ from neuron_explainer.explanations.simulator import (
263
+ LogprobFreeExplanationTokenSimulator,
264
+ )
265
+ except ImportError as e:
266
+ raise ImportError(
267
+ "The automated-interpretability package is required to use autointerp functionality. "
268
+ "Please install SAELens with the neuronpedia optional dependencies: "
269
+ "pip install sae-lens[neuronpedia]"
270
+ ) from e
271
+
256
272
  logger.info("\n\n")
257
273
 
258
274
  if os.getenv("OPENAI_API_KEY") is None:
sae_lens/evals.py CHANGED
@@ -718,17 +718,9 @@ def get_recons_loss(
718
718
  **model_kwargs,
719
719
  )
720
720
 
721
- def kl(original_logits: torch.Tensor, new_logits: torch.Tensor):
722
- original_probs = torch.nn.functional.softmax(original_logits, dim=-1)
723
- log_original_probs = torch.log(original_probs)
724
- new_probs = torch.nn.functional.softmax(new_logits, dim=-1)
725
- log_new_probs = torch.log(new_probs)
726
- kl_div = original_probs * (log_original_probs - log_new_probs)
727
- return kl_div.sum(dim=-1)
728
-
729
721
  if compute_kl:
730
- recons_kl_div = kl(original_logits, recons_logits)
731
- zero_abl_kl_div = kl(original_logits, zero_abl_logits)
722
+ recons_kl_div = _kl(original_logits, recons_logits)
723
+ zero_abl_kl_div = _kl(original_logits, zero_abl_logits)
732
724
  metrics["kl_div_with_sae"] = recons_kl_div
733
725
  metrics["kl_div_with_ablation"] = zero_abl_kl_div
734
726
 
@@ -740,6 +732,18 @@ def get_recons_loss(
740
732
  return metrics
741
733
 
742
734
 
735
+ def _kl(original_logits: torch.Tensor, new_logits: torch.Tensor):
736
+ # Computes the log-probabilities of the new logits (approximation).
737
+ log_probs_new = torch.nn.functional.log_softmax(new_logits, dim=-1)
738
+ # Computes the probabilities of the original logits (true distribution).
739
+ probs_orig = torch.nn.functional.softmax(original_logits, dim=-1)
740
+ # Compute the KL divergence. torch.nn.functional.kl_div expects the first argument to be the log
741
+ # probabilities of the approximation (new), and the second argument to be the true distribution
742
+ # (original) as probabilities. This computes KL(original || new).
743
+ kl = torch.nn.functional.kl_div(log_probs_new, probs_orig, reduction="none")
744
+ return kl.sum(dim=-1)
745
+
746
+
743
747
  def all_loadable_saes() -> list[tuple[str, str, float, float]]:
744
748
  all_loadable_saes = []
745
749
  saes_directory = get_pretrained_saes_directory()
@@ -1001,10 +1001,14 @@ def get_sparsify_config_from_disk(
1001
1001
  layer = int(match.group(1))
1002
1002
  hook_name = f"blocks.{layer}.hook_resid_post"
1003
1003
 
1004
+ d_sae = old_cfg_dict.get("num_latents")
1005
+ if d_sae is None:
1006
+ d_sae = old_cfg_dict["d_in"] * old_cfg_dict["expansion_factor"]
1007
+
1004
1008
  cfg_dict: dict[str, Any] = {
1005
1009
  "architecture": "standard",
1006
1010
  "d_in": old_cfg_dict["d_in"],
1007
- "d_sae": old_cfg_dict["d_in"] * old_cfg_dict["expansion_factor"],
1011
+ "d_sae": d_sae,
1008
1012
  "dtype": "bfloat16",
1009
1013
  "device": device or "cpu",
1010
1014
  "model_name": config_dict.get("model", path.parts[-2]),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: sae-lens
3
- Version: 6.6.3
3
+ Version: 6.6.5
4
4
  Summary: Training and Analyzing Sparse Autoencoders (SAEs)
5
5
  License: MIT
6
6
  Keywords: deep-learning,sparse-autoencoders,mechanistic-interpretability,PyTorch
@@ -14,7 +14,8 @@ Classifier: Programming Language :: Python :: 3.12
14
14
  Classifier: Programming Language :: Python :: 3.13
15
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
16
  Provides-Extra: mamba
17
- Requires-Dist: automated-interpretability (>=0.0.5,<1.0.0)
17
+ Provides-Extra: neuronpedia
18
+ Requires-Dist: automated-interpretability (>=0.0.5,<1.0.0) ; extra == "neuronpedia"
18
19
  Requires-Dist: babe (>=0.0.7,<0.0.8)
19
20
  Requires-Dist: datasets (>=3.1.0)
20
21
  Requires-Dist: mamba-lens (>=0.0.4,<0.0.5) ; extra == "mamba"
@@ -1,15 +1,15 @@
1
- sae_lens/__init__.py,sha256=PhaMB_ijs_Y7kwPlEyHpENZ8mPdEYouVbG9pFc297DI,3588
1
+ sae_lens/__init__.py,sha256=gvg9photJRtatuXa9YF-uDv1tYiHwHTMh29X1GNQd6Y,3588
2
2
  sae_lens/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  sae_lens/analysis/hooked_sae_transformer.py,sha256=vRu6JseH1lZaEeILD5bEkQEQ1wYHHDcxD-f2olKmE9Y,14275
4
- sae_lens/analysis/neuronpedia_integration.py,sha256=MrENqc81Mc2SMbxGjbwHzpkGUCAFKSf0i4EdaUF2Oj4,18707
4
+ sae_lens/analysis/neuronpedia_integration.py,sha256=Fj4gVyaXMGBUxoK0vPeTwGVFr4n40fmfPrRENo4WzPs,19324
5
5
  sae_lens/cache_activations_runner.py,sha256=cNeAtp2JQ_vKbeddZVM-tcPLYyyfTWL8NDna5KQpkLI,12583
6
6
  sae_lens/config.py,sha256=IrjbsKBbaZoFXYrsPJ5xBwIqi9uZJIIFXjV_uoErJaE,28176
7
7
  sae_lens/constants.py,sha256=CSjmiZ-bhjQeVLyRvWxAjBokCgkfM8mnvd7-vxLIWTY,639
8
- sae_lens/evals.py,sha256=2YHR_IBhXdjktpmoVtvvNrqUZIx5ok7yERuiFY40HHY,39186
8
+ sae_lens/evals.py,sha256=4hanbyG8qZLItWqft94F4ZjUoytPVB7fw5s0P4Oi0VE,39504
9
9
  sae_lens/llm_sae_training_runner.py,sha256=exxNX_OEhdiUrlgmBP9bjX9DOf0HUcNQGO4unKeDjKM,13713
10
10
  sae_lens/load_model.py,sha256=C8AMykctj6H7tz_xRwB06-EXj6TfW64PtSJZR5Jxn1Y,8649
11
11
  sae_lens/loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- sae_lens/loading/pretrained_sae_loaders.py,sha256=iIcHM24qfb45JOGEmUn7jr5E9vl8L2FYSlArsobCwlI,44388
12
+ sae_lens/loading/pretrained_sae_loaders.py,sha256=tLeHArWFpu8CI6vXH1ZxFkhmsrhO2UsZyi7DzVzqAUs,44477
13
13
  sae_lens/loading/pretrained_saes_directory.py,sha256=4Vn-Jex6SveD7EbxcSOBv8cx1gkPfUMLU1QOP-ww1ZE,3752
14
14
  sae_lens/pretokenize_runner.py,sha256=w0f6SfZLAxbp5eAAKnet8RqUB_DKofZ9RGsoJwFnYbA,7058
15
15
  sae_lens/pretrained_saes.yaml,sha256=O_FwoOe7fU9_WLEOnMk1IWXRxD4nwzf1tCfbof1r0D0,598578
@@ -33,7 +33,7 @@ sae_lens/training/types.py,sha256=qSjmGzXf3MLalygG0psnVjmhX_mpLmL47MQtZfe7qxg,81
33
33
  sae_lens/training/upload_saes_to_huggingface.py,sha256=r_WzI1zLtGZ5TzAxuG3xa_8T09j3zXJrWd_vzPsPGkQ,4469
34
34
  sae_lens/tutorial/tsea.py,sha256=fd1am_XXsf2KMbByDapJo-2qlxduKaa62Z2qcQZ3QKU,18145
35
35
  sae_lens/util.py,sha256=mCwLAilGMVo8Scm7CIsCafU7GsfmBvCcjwmloI4Ly7Y,1718
36
- sae_lens-6.6.3.dist-info/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
37
- sae_lens-6.6.3.dist-info/METADATA,sha256=c0naO-z2PfIEeJY8f1YbC58Dscj2nl8KfyJVhA0sk0A,5303
38
- sae_lens-6.6.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
39
- sae_lens-6.6.3.dist-info/RECORD,,
36
+ sae_lens-6.6.5.dist-info/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
37
+ sae_lens-6.6.5.dist-info/METADATA,sha256=U5oP3RYgIE2EnHA2mwRImUcoyVBhYYwiRU199LM_R7c,5356
38
+ sae_lens-6.6.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
39
+ sae_lens-6.6.5.dist-info/RECORD,,