sae-lens 6.6.4__py3-none-any.whl → 6.6.5__py3-none-any.whl

This diff compares the contents of two package versions as released to their public registry. It is provided for informational purposes only.
sae_lens/__init__.py CHANGED
@@ -1,5 +1,5 @@
 # ruff: noqa: E402
-__version__ = "6.6.4"
+__version__ = "6.6.5"
 
 import logging
 
sae_lens/evals.py CHANGED
@@ -718,17 +718,9 @@ def get_recons_loss(
         **model_kwargs,
     )
 
-    def kl(original_logits: torch.Tensor, new_logits: torch.Tensor):
-        original_probs = torch.nn.functional.softmax(original_logits, dim=-1)
-        log_original_probs = torch.log(original_probs)
-        new_probs = torch.nn.functional.softmax(new_logits, dim=-1)
-        log_new_probs = torch.log(new_probs)
-        kl_div = original_probs * (log_original_probs - log_new_probs)
-        return kl_div.sum(dim=-1)
-
     if compute_kl:
-        recons_kl_div = kl(original_logits, recons_logits)
-        zero_abl_kl_div = kl(original_logits, zero_abl_logits)
+        recons_kl_div = _kl(original_logits, recons_logits)
+        zero_abl_kl_div = _kl(original_logits, zero_abl_logits)
         metrics["kl_div_with_sae"] = recons_kl_div
         metrics["kl_div_with_ablation"] = zero_abl_kl_div
 
@@ -740,6 +732,18 @@ def get_recons_loss(
     return metrics
 
 
+def _kl(original_logits: torch.Tensor, new_logits: torch.Tensor):
+    # Computes the log-probabilities of the new logits (approximation).
+    log_probs_new = torch.nn.functional.log_softmax(new_logits, dim=-1)
+    # Computes the probabilities of the original logits (true distribution).
+    probs_orig = torch.nn.functional.softmax(original_logits, dim=-1)
+    # Compute the KL divergence. torch.nn.functional.kl_div expects the first argument to be the log
+    # probabilities of the approximation (new), and the second argument to be the true distribution
+    # (original) as probabilities. This computes KL(original || new).
+    kl = torch.nn.functional.kl_div(log_probs_new, probs_orig, reduction="none")
+    return kl.sum(dim=-1)
+
+
 def all_loadable_saes() -> list[tuple[str, str, float, float]]:
     all_loadable_saes = []
     saes_directory = get_pretrained_saes_directory()
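The refactor above lifts the KL helper out of get_recons_loss and rewrites it in terms of torch.nn.functional.log_softmax and torch.nn.functional.kl_div, avoiding the numerically unstable torch.log(softmax(x)) of the removed inner kl function. As a minimal standalone sketch (not part of the package; the shapes and seed are illustrative), the old and new formulations agree:

import torch

torch.manual_seed(0)
original_logits = torch.randn(2, 5, 100)  # illustrative (batch, seq, vocab) logits
new_logits = torch.randn(2, 5, 100)

# Old formulation: p * (log p - log q), taking log after softmax.
p = torch.nn.functional.softmax(original_logits, dim=-1)
q = torch.nn.functional.softmax(new_logits, dim=-1)
manual_kl = (p * (torch.log(p) - torch.log(q))).sum(dim=-1)

# New formulation: kl_div(input, target) takes the approximation's
# log-probabilities as input and the true distribution's probabilities as
# target, computing target * (log(target) - input) elementwise; summing
# over the vocab dimension yields KL(original || new).
log_q = torch.nn.functional.log_softmax(new_logits, dim=-1)
kl = torch.nn.functional.kl_div(log_q, p, reduction="none").sum(dim=-1)

assert torch.allclose(manual_kl, kl, atol=1e-5)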
sae_lens/loading/pretrained_sae_loaders.py CHANGED
@@ -1001,10 +1001,14 @@ def get_sparsify_config_from_disk(
     layer = int(match.group(1))
     hook_name = f"blocks.{layer}.hook_resid_post"
 
+    d_sae = old_cfg_dict.get("num_latents")
+    if d_sae is None:
+        d_sae = old_cfg_dict["d_in"] * old_cfg_dict["expansion_factor"]
+
     cfg_dict: dict[str, Any] = {
         "architecture": "standard",
         "d_in": old_cfg_dict["d_in"],
-        "d_sae": old_cfg_dict["d_in"] * old_cfg_dict["expansion_factor"],
+        "d_sae": d_sae,
         "dtype": "bfloat16",
         "device": device or "cpu",
         "model_name": config_dict.get("model", path.parts[-2]),
sae_lens-6.6.4.dist-info/METADATA → sae_lens-6.6.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: sae-lens
-Version: 6.6.4
+Version: 6.6.5
 Summary: Training and Analyzing Sparse Autoencoders (SAEs)
 License: MIT
 Keywords: deep-learning,sparse-autoencoders,mechanistic-interpretability,PyTorch
sae_lens-6.6.4.dist-info/RECORD → sae_lens-6.6.5.dist-info/RECORD CHANGED
@@ -1,15 +1,15 @@
-sae_lens/__init__.py,sha256=yJm-dL16dtJ9Eo9p4cOcV3Y4Zrhim6FMuzysx0MnYas,3588
+sae_lens/__init__.py,sha256=gvg9photJRtatuXa9YF-uDv1tYiHwHTMh29X1GNQd6Y,3588
 sae_lens/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sae_lens/analysis/hooked_sae_transformer.py,sha256=vRu6JseH1lZaEeILD5bEkQEQ1wYHHDcxD-f2olKmE9Y,14275
 sae_lens/analysis/neuronpedia_integration.py,sha256=Fj4gVyaXMGBUxoK0vPeTwGVFr4n40fmfPrRENo4WzPs,19324
 sae_lens/cache_activations_runner.py,sha256=cNeAtp2JQ_vKbeddZVM-tcPLYyyfTWL8NDna5KQpkLI,12583
 sae_lens/config.py,sha256=IrjbsKBbaZoFXYrsPJ5xBwIqi9uZJIIFXjV_uoErJaE,28176
 sae_lens/constants.py,sha256=CSjmiZ-bhjQeVLyRvWxAjBokCgkfM8mnvd7-vxLIWTY,639
-sae_lens/evals.py,sha256=2YHR_IBhXdjktpmoVtvvNrqUZIx5ok7yERuiFY40HHY,39186
+sae_lens/evals.py,sha256=4hanbyG8qZLItWqft94F4ZjUoytPVB7fw5s0P4Oi0VE,39504
 sae_lens/llm_sae_training_runner.py,sha256=exxNX_OEhdiUrlgmBP9bjX9DOf0HUcNQGO4unKeDjKM,13713
 sae_lens/load_model.py,sha256=C8AMykctj6H7tz_xRwB06-EXj6TfW64PtSJZR5Jxn1Y,8649
 sae_lens/loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sae_lens/loading/pretrained_sae_loaders.py,sha256=iIcHM24qfb45JOGEmUn7jr5E9vl8L2FYSlArsobCwlI,44388
+sae_lens/loading/pretrained_sae_loaders.py,sha256=tLeHArWFpu8CI6vXH1ZxFkhmsrhO2UsZyi7DzVzqAUs,44477
 sae_lens/loading/pretrained_saes_directory.py,sha256=4Vn-Jex6SveD7EbxcSOBv8cx1gkPfUMLU1QOP-ww1ZE,3752
 sae_lens/pretokenize_runner.py,sha256=w0f6SfZLAxbp5eAAKnet8RqUB_DKofZ9RGsoJwFnYbA,7058
 sae_lens/pretrained_saes.yaml,sha256=O_FwoOe7fU9_WLEOnMk1IWXRxD4nwzf1tCfbof1r0D0,598578
@@ -33,7 +33,7 @@ sae_lens/training/types.py,sha256=qSjmGzXf3MLalygG0psnVjmhX_mpLmL47MQtZfe7qxg,81
 sae_lens/training/upload_saes_to_huggingface.py,sha256=r_WzI1zLtGZ5TzAxuG3xa_8T09j3zXJrWd_vzPsPGkQ,4469
 sae_lens/tutorial/tsea.py,sha256=fd1am_XXsf2KMbByDapJo-2qlxduKaa62Z2qcQZ3QKU,18145
 sae_lens/util.py,sha256=mCwLAilGMVo8Scm7CIsCafU7GsfmBvCcjwmloI4Ly7Y,1718
-sae_lens-6.6.4.dist-info/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
-sae_lens-6.6.4.dist-info/METADATA,sha256=2fVRS6CU6AgaiZZBDgwLcFPClSs_KAMUXWP_uJB27TE,5356
-sae_lens-6.6.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-sae_lens-6.6.4.dist-info/RECORD,,
+sae_lens-6.6.5.dist-info/LICENSE,sha256=DW6e-hDosiu4CfW0-imI57sV1I5f9UEslpviNQcOAKs,1069
+sae_lens-6.6.5.dist-info/METADATA,sha256=U5oP3RYgIE2EnHA2mwRImUcoyVBhYYwiRU199LM_R7c,5356
+sae_lens-6.6.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+sae_lens-6.6.5.dist-info/RECORD,,