archscope 0.2.6__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {archscope-0.2.6/src/archscope.egg-info → archscope-0.2.7}/PKG-INFO +1 -1
- {archscope-0.2.6 → archscope-0.2.7}/pyproject.toml +1 -1
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/__init__.py +1 -1
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/probes.py +34 -0
- {archscope-0.2.6 → archscope-0.2.7/src/archscope.egg-info}/PKG-INFO +1 -1
- {archscope-0.2.6 → archscope-0.2.7}/tests/test_unit.py +32 -1
- {archscope-0.2.6 → archscope-0.2.7}/LICENSE +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/README.md +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/setup.cfg +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/_utils.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/attribute.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/backends.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/bench.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/circuits.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/cli.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/diff.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/kazdov_backend.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/lens.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/loader.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/neurons.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/py.typed +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/sae.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope/transfer.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope.egg-info/SOURCES.txt +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope.egg-info/dependency_links.txt +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope.egg-info/entry_points.txt +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope.egg-info/requires.txt +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/src/archscope.egg-info/top_level.txt +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/tests/test_circuits_3arch.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/tests/test_diff.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/tests/test_kazdov_integration.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/tests/test_lens.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/tests/test_mamba_integration.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/tests/test_mamba_ssm_state.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/tests/test_probe_transfer.py +0 -0
- {archscope-0.2.6 → archscope-0.2.7}/tests/test_pythia_end_to_end.py +0 -0
|
@@ -108,6 +108,40 @@ class ProbeFit:
|
|
|
108
108
|
with torch.no_grad():
|
|
109
109
|
return torch.sigmoid(self.probe(activations.to(self.device)))
|
|
110
110
|
|
|
111
|
+
@property
|
|
112
|
+
def direction(self) -> torch.Tensor:
|
|
113
|
+
"""1D direction vector in activation space (linear probes only).
|
|
114
|
+
|
|
115
|
+
Shape: ``(hidden_dim,)``. This is the projection axis the probe found —
|
|
116
|
+
useful for: applying a probe to externally-transformed activations
|
|
117
|
+
(e.g., after ``archscope.transfer.learn_alignment``), inspecting feature
|
|
118
|
+
geometry, or projecting interventions along the learned direction.
|
|
119
|
+
|
|
120
|
+
Raises ``ValueError`` for MLP probes (no single linear direction).
|
|
121
|
+
"""
|
|
122
|
+
if self.config.probe_type != "linear":
|
|
123
|
+
raise ValueError(
|
|
124
|
+
f".direction is only defined for linear probes (got "
|
|
125
|
+
f"probe_type={self.config.probe_type!r}). MLP probes don't have a "
|
|
126
|
+
"single direction in activation space."
|
|
127
|
+
)
|
|
128
|
+
return self.probe.net.weight.detach().squeeze(0).clone()
|
|
129
|
+
|
|
130
|
+
@property
|
|
131
|
+
def bias(self) -> torch.Tensor:
|
|
132
|
+
"""Scalar bias term (linear probes only). Shape: ``()``.
|
|
133
|
+
|
|
134
|
+
Together with ``.direction``, lets you score arbitrary activations as
|
|
135
|
+
``logits = acts @ direction + bias`` without going through the probe
|
|
136
|
+
module — handy for cross-arch transfer experiments.
|
|
137
|
+
"""
|
|
138
|
+
if self.config.probe_type != "linear":
|
|
139
|
+
raise ValueError(
|
|
140
|
+
f".bias is only defined for linear probes (got "
|
|
141
|
+
f"probe_type={self.config.probe_type!r})."
|
|
142
|
+
)
|
|
143
|
+
return self.probe.net.bias.detach().squeeze().clone()
|
|
144
|
+
|
|
111
145
|
|
|
112
146
|
def _auroc(logits: torch.Tensor, labels: torch.Tensor) -> float:
|
|
113
147
|
"""AUROC from logits + binary labels.
|
|
@@ -22,7 +22,7 @@ def test_imports():
|
|
|
22
22
|
import archscope
|
|
23
23
|
from archscope import (probes, sae, neurons, attribute, backends, # noqa: F401
|
|
24
24
|
circuits, transfer, bench, lens, diff) # noqa: F401
|
|
25
|
-
assert archscope.__version__ == "0.2.6"
|
|
25
|
+
assert archscope.__version__ == "0.2.7"
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def test_loader_exports():
|
|
@@ -259,6 +259,37 @@ def test_induction_head_score_small_vocab_clear_error():
|
|
|
259
259
|
assert "vocab window" in str(ei.value).lower() or "n_pairs" in str(ei.value)
|
|
260
260
|
|
|
261
261
|
|
|
262
|
+
def test_probefit_direction_and_bias_accessors():
|
|
263
|
+
"""ProbeFit exposes .direction and .bias for linear probes."""
|
|
264
|
+
from archscope.probes import ProbeFit, ProbeConfig
|
|
265
|
+
torch.manual_seed(0)
|
|
266
|
+
pos = torch.randn(40, 8) + 1.5
|
|
267
|
+
neg = torch.randn(40, 8) - 1.5
|
|
268
|
+
cfg = ProbeConfig(layer_name="x", probe_type="linear")
|
|
269
|
+
pf = ProbeFit(cfg, input_dim=8)
|
|
270
|
+
pf.train(torch.cat([pos, neg]), torch.cat([torch.ones(40), torch.zeros(40)]),
|
|
271
|
+
epochs=30, batch_size=16)
|
|
272
|
+
d, b = pf.direction, pf.bias
|
|
273
|
+
assert d.shape == (8,), f"direction shape: {d.shape}"
|
|
274
|
+
assert b.dim() == 0, f"bias should be scalar: {b.shape}"
|
|
275
|
+
# Manual application matches what probe.score does (up to sigmoid).
|
|
276
|
+
test_act = torch.randn(3, 8)
|
|
277
|
+
manual_logits = test_act @ d + b
|
|
278
|
+
via_probe = pf.probe(test_act)
|
|
279
|
+
assert torch.allclose(manual_logits, via_probe, atol=1e-5), \
|
|
280
|
+
"direction @ acts + bias should equal probe(acts)"
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def test_probefit_direction_rejects_mlp():
|
|
284
|
+
""".direction raises on MLP probes."""
|
|
285
|
+
from archscope.probes import ProbeFit, ProbeConfig
|
|
286
|
+
cfg = ProbeConfig(layer_name="x", probe_type="mlp")
|
|
287
|
+
pf = ProbeFit(cfg, input_dim=8)
|
|
288
|
+
with pytest.raises(ValueError) as ei:
|
|
289
|
+
_ = pf.direction
|
|
290
|
+
assert "linear" in str(ei.value).lower()
|
|
291
|
+
|
|
292
|
+
|
|
262
293
|
def test_dim_decompose_rejects_mamba_style_model():
|
|
263
294
|
"""dim_decompose raises on models with no attention/MLP submodules."""
|
|
264
295
|
from archscope.attribute import dim_decompose
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|