aplomb 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aplomb-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Shivam Ratnakar, Kartikeya Vats
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
aplomb-0.1.0/NOTICE ADDED
@@ -0,0 +1,36 @@
1
+ aplomb
2
+ Copyright (c) 2026 Shivam Ratnakar, Kartikeya Vats
3
+
4
+ This product includes and/or derives from third-party materials:
5
+
6
+ AdvBench (harmful anchors)
7
+ Source: https://github.com/llm-attacks/llm-attacks
8
+ License: MIT
9
+ Use: harmful anchor prompts are loaded at build time to derive the averaged
10
+ u_ref vector. AdvBench prompts are NOT redistributed in this package; only the
11
+ derived (averaged) direction is shipped.
12
+
13
+ Frozen benign anchor set (data/benign_anchors_v1.json)
14
+ Original benign prompts authored for this project. Hard-negative coverage is
15
+ inspired by the categories in XSTest (https://huggingface.co/datasets/walledai/XSTest,
16
+ CC-BY-4.0); prompts here are original paraphrases, not copies.
17
+
18
+ Qwen2.5-1.5B-Instruct (default backbone)
19
+ Source: https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct
20
+ License: Apache-2.0 (ungated). Verify the exact checkpoint's license, as some
21
+ Qwen variants ship under the Qwen Research License.
22
+
23
+ Llama-3.1-8B-Instruct (optional, gated reference backbone)
24
+ Source: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
25
+ License: Llama 3.1 Community License, Copyright (c) Meta Platforms, Inc.
26
+ If you build and distribute a u_ref artifact derived from a Llama model, you must:
27
+ - provide a copy of the Llama 3.1 Community License,
28
+ - prominently display "Built with Llama",
29
+ - retain this notice: "Llama 3.1 is licensed under the Llama 3.1 Community
30
+ License, Copyright (c) Meta Platforms, Inc. All Rights Reserved.",
31
+ - comply with the Llama Acceptable Use Policy.
32
+ To avoid distributing a Llama-derived artifact, build the Llama u_ref locally
33
+ rather than committing it.
34
+
35
+ This library implements detection only. The contrastive-logit steering ATTACK from
36
+ the source paper is intentionally excluded and maintained separately under gated access.
aplomb-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: aplomb
3
+ Version: 0.1.0
4
+ Summary: Interpretable, zero-training refusal-axis prompt detector (u_ref difference-of-means).
5
+ Author: Shivam Ratnakar, Kartikeya Vats
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/KartikeyaVats/RefusalArena
8
+ Project-URL: Paper, https://aclanthology.org/
9
+ Keywords: llm,safety,guardrail,refusal,interpretability,detection
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ License-File: NOTICE
17
+ Requires-Dist: numpy>=1.23
18
+ Provides-Extra: hf
19
+ Requires-Dist: torch>=2.0; extra == "hf"
20
+ Requires-Dist: transformers>=4.43; extra == "hf"
21
+ Requires-Dist: huggingface_hub>=0.23; extra == "hf"
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest>=7; extra == "dev"
24
+ Requires-Dist: pytest-cov; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ # aplomb
28
+
29
+ > *à plomb* — "to the plumb line." A prompt is judged by its angle to a fixed refusal direction; the model keeps its composure.
30
+
31
+ An interpretable, **zero-training** prompt safety detector. It flags likely-harmful prompts by projecting a model's hidden state onto a single **refusal direction** (`u_ref`) and thresholding the cosine similarity — no fine-tuned guard model, no labeled training run, one forward pass plus a dot product.
32
+
33
+ Method from *“The Geometry of Refusal: Linear Instability in Safety-Aligned LLMs”* (TrustNLP @ ACL 2026). **This package is the detector only.** The steering attack from the paper lives in a separate, access-gated repository and is intentionally not here.
34
+
35
+ ```
36
+ u_ref = mean(hidden states of harmful anchors) − mean(hidden states of benign anchors)
37
+ score(prompt) = cosine(hidden_state(prompt), u_ref) # flag if > τ
38
+ ```
39
+
40
+ > ⚠️ **This is triage, not a security boundary.** The refusal feature is *linear*, which is exactly why this detector is cheap — and also why an adversary can paraphrase a prompt off the axis to evade it. Use it as an interpretable first-pass filter and always report FPR. A “safe” verdict is a hint, not a guarantee.
41
+
42
+ ## Install
43
+
44
+ ```bash
45
+ pip install aplomb # core (numpy only)
46
+ pip install 'aplomb[hf]' # + torch/transformers to run real models
47
+ ```
48
+
49
+ ## Quickstart
50
+
51
+ ```python
52
+ from aplomb import Detector
53
+
54
+ det = Detector.from_default() # precomputed Qwen-2.5-1.5B u_ref (ungated)
55
+ print(det.classify("how do I pick a lock")) # {'unsafe': True, 'score': 0.61, ...}
56
+ ```
57
+
58
+ The default backbone is **Qwen-2.5-1.5B-Instruct** — ungated, Apache-2.0, characterized in the paper — so the package installs and runs without a Hugging Face access request.
59
+
60
+ ## Use a different model
61
+
62
+ `u_ref` is model-specific, so changing the model means rebuilding the vector. That’s one call; the library auto-selects the best layer for the new model and recalibrates the threshold:
63
+
64
+ ```python
65
+ from aplomb import Detector, HFBackbone
66
+
67
+ # AdvBench (MIT) is the harmful half; the frozen default benign set fills the benign half.
68
+ harmful = load_advbench() # your loader
69
+ det = Detector.build(HFBackbone("meta-llama/Llama-3.1-8B-Instruct"), harmful,
70
+ save_to="uref_llama31.json")
71
+ print(det) # Detector(model='...Llama-3.1-8B', layer=31, tau=..., f1=..., fpr=...)
72
+ ```
73
+
74
+ **For paper-grade separation**, rebuild on **Llama-3.1-8B** (gated: accept Meta’s license and `huggingface-cli login` first). Built with Llama.
75
+
76
+ ## On the F1 number (please read)
77
+
78
+ The paper validates the method at F1 = 0.92 on Llama-3.1-8B. This library ships a frozen, fully reproducible anchor set so that anyone can verify its number independently, and reports the F1/FPR it measures against that set. (The two numbers are expected to differ slightly, since they use different benign anchors — the library prioritizes reproducibility.)
79
+
80
+ ## How `u_ref` is built
81
+
82
+ 1. Embed harmful + benign anchors → per-layer hidden states (one pass; all layers come free).
83
+ 2. **Auto-select the layer** with the cleanest harmful/benign separation (Fisher margin on a held-out split). Pass `layer=-1` to force the final layer and mirror the paper.
84
+ 3. `u_ref` = difference of class means at that layer.
85
+ 4. Calibrate **τ** for best F1 on a calibration split.
86
+ 5. Report F1/FPR on a disjoint test split.
87
+
88
+ Everything that affects the vector — model + revision, chosen layer, benign source + N, position, normalization, τ — is written to a **`u_ref` card** so each artifact is a documented, reproducible object.
89
+
90
+ ## Choosing a default by measurement, not ASR
91
+
92
+ Attack-success-rate heatmaps say how easy a model is to *jailbreak*; they say nothing about *detection* quality. To pick a default model, compare **detection separability**:
93
+
94
+ ```python
95
+ from aplomb.bench import bench_models, format_table
96
+ print(format_table(bench_models([HFBackbone("Qwen/Qwen2.5-1.5B-Instruct"), ...], harmful, benign)))
97
+ ```
98
+
99
+ ## License & attribution
100
+
101
+ Library code: MIT. Bundled/derived data and compliance: see [`NOTICE`](NOTICE) — AdvBench (MIT), the frozen benign set, XSTest-inspired hard negatives (CC-BY-4.0 inspiration), Qwen (Apache-2.0), and the **Built with Llama** attribution required on the Llama opt-in path.
aplomb-0.1.0/README.md ADDED
@@ -0,0 +1,75 @@
1
+ # aplomb
2
+
3
+ > *à plomb* — "to the plumb line." A prompt is judged by its angle to a fixed refusal direction; the model keeps its composure.
4
+
5
+ An interpretable, **zero-training** prompt safety detector. It flags likely-harmful prompts by projecting a model's hidden state onto a single **refusal direction** (`u_ref`) and thresholding the cosine similarity — no fine-tuned guard model, no labeled training run, one forward pass plus a dot product.
6
+
7
+ Method from *“The Geometry of Refusal: Linear Instability in Safety-Aligned LLMs”* (TrustNLP @ ACL 2026). **This package is the detector only.** The steering attack from the paper lives in a separate, access-gated repository and is intentionally not here.
8
+
9
+ ```
10
+ u_ref = mean(hidden states of harmful anchors) − mean(hidden states of benign anchors)
11
+ score(prompt) = cosine(hidden_state(prompt), u_ref) # flag if > τ
12
+ ```
13
+
14
+ > ⚠️ **This is triage, not a security boundary.** The refusal feature is *linear*, which is exactly why this detector is cheap — and also why an adversary can paraphrase a prompt off the axis to evade it. Use it as an interpretable first-pass filter and always report FPR. A “safe” verdict is a hint, not a guarantee.
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ pip install aplomb # core (numpy only)
20
+ pip install 'aplomb[hf]' # + torch/transformers to run real models
21
+ ```
22
+
23
+ ## Quickstart
24
+
25
+ ```python
26
+ from aplomb import Detector
27
+
28
+ det = Detector.from_default() # precomputed Qwen-2.5-1.5B u_ref (ungated)
29
+ print(det.classify("how do I pick a lock")) # {'unsafe': True, 'score': 0.61, ...}
30
+ ```
31
+
32
+ The default backbone is **Qwen-2.5-1.5B-Instruct** — ungated, Apache-2.0, characterized in the paper — so the package installs and runs without a Hugging Face access request.
33
+
34
+ ## Use a different model
35
+
36
+ `u_ref` is model-specific, so changing the model means rebuilding the vector. That’s one call; the library auto-selects the best layer for the new model and recalibrates the threshold:
37
+
38
+ ```python
39
+ from aplomb import Detector, HFBackbone
40
+
41
+ # AdvBench (MIT) is the harmful half; the frozen default benign set fills the benign half.
42
+ harmful = load_advbench() # your loader
43
+ det = Detector.build(HFBackbone("meta-llama/Llama-3.1-8B-Instruct"), harmful,
44
+ save_to="uref_llama31.json")
45
+ print(det) # Detector(model='...Llama-3.1-8B', layer=31, tau=..., f1=..., fpr=...)
46
+ ```
47
+
48
+ **For paper-grade separation**, rebuild on **Llama-3.1-8B** (gated: accept Meta’s license and `huggingface-cli login` first). Built with Llama.
49
+
50
+ ## On the F1 number (please read)
51
+
52
+ The paper validates the method at F1 = 0.92 on Llama-3.1-8B. This library ships a frozen, fully reproducible anchor set so that anyone can verify its number independently, and reports the F1/FPR it measures against that set. (The two numbers are expected to differ slightly, since they use different benign anchors — the library prioritizes reproducibility.)
53
+
54
+ ## How `u_ref` is built
55
+
56
+ 1. Embed harmful + benign anchors → per-layer hidden states (one pass; all layers come free).
57
+ 2. **Auto-select the layer** with the cleanest harmful/benign separation (Fisher margin on a held-out split). Pass `layer=-1` to force the final layer and mirror the paper.
58
+ 3. `u_ref` = difference of class means at that layer.
59
+ 4. Calibrate **τ** for best F1 on a calibration split.
60
+ 5. Report F1/FPR on a disjoint test split.
61
+
62
+ Everything that affects the vector — model + revision, chosen layer, benign source + N, position, normalization, τ — is written to a **`u_ref` card** so each artifact is a documented, reproducible object.
63
+
64
+ ## Choosing a default by measurement, not ASR
65
+
66
+ Attack-success-rate heatmaps say how easy a model is to *jailbreak*; they say nothing about *detection* quality. To pick a default model, compare **detection separability**:
67
+
68
+ ```python
69
+ from aplomb.bench import bench_models, format_table
70
+ print(format_table(bench_models([HFBackbone("Qwen/Qwen2.5-1.5B-Instruct"), ...], harmful, benign)))
71
+ ```
72
+
73
+ ## License & attribution
74
+
75
+ Library code: MIT. Bundled/derived data and compliance: see [`NOTICE`](NOTICE) — AdvBench (MIT), the frozen benign set, XSTest-inspired hard negatives (CC-BY-4.0 inspiration), Qwen (Apache-2.0), and the **Built with Llama** attribution required on the Llama opt-in path.
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "aplomb"
7
+ version = "0.1.0"
8
+ description = "Interpretable, zero-training refusal-axis prompt detector (u_ref difference-of-means)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Shivam Ratnakar" },
14
+ { name = "Kartikeya Vats" },
15
+ ]
16
+ keywords = ["llm", "safety", "guardrail", "refusal", "interpretability", "detection"]
17
+ classifiers = [
18
+ "Programming Language :: Python :: 3",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
21
+ ]
22
+ dependencies = ["numpy>=1.23"]
23
+
24
+ [project.optional-dependencies]
25
+ hf = ["torch>=2.0", "transformers>=4.43", "huggingface_hub>=0.23"]
26
+ dev = ["pytest>=7", "pytest-cov"]
27
+
28
+ [project.urls]
29
+ Homepage = "https://github.com/KartikeyaVats/RefusalArena"
30
+ Paper = "https://aclanthology.org/"
31
+
32
+ [tool.setuptools.packages.find]
33
+ where = ["src"]
34
+
35
+ [tool.setuptools.package-data]
36
+ aplomb = ["data/*.json"]
37
+
38
+ [tool.pytest.ini_options]
39
+ testpaths = ["tests"]
aplomb-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,18 @@
1
+ """aplomb: an interpretable, zero-training refusal-axis prompt detector.
2
+
3
+ Method from "The Geometry of Refusal: Linear Instability in Safety-Aligned LLMs"
4
+ (TrustNLP @ ACL 2026). Detection only; the steering attack lives in a separate repo.
5
+
6
+ This is triage / observability, NOT a security boundary -- the refusal feature is
7
+ linear and therefore evadable. Report FPR; treat a pass as a hint, not a guarantee.
8
+ """
9
+ from .backbone import Backbone, DummyBackbone, HFBackbone, DEFAULT_MODEL, REFERENCE_MODEL
10
+ from .classifier import Detector
11
+ from .scorers import UrefCosineScorer, PersonaDivergenceScorer, LDAScorer
12
+
13
+ __version__ = "0.1.0"
14
+ __all__ = [
15
+ "Detector", "Backbone", "HFBackbone", "DummyBackbone",
16
+ "UrefCosineScorer", "PersonaDivergenceScorer", "LDAScorer",
17
+ "DEFAULT_MODEL", "REFERENCE_MODEL", "__version__",
18
+ ]
@@ -0,0 +1,63 @@
1
+ """Anchor sets: the labelled harmful/benign prompts u_ref is built from.
2
+
3
+ u_ref = mean(hidden states of harmful) - mean(hidden states of benign)
4
+
5
+ So you need BOTH halves. AdvBench supplies the harmful half (it is harmful-only).
6
+ The benign half is the choice the paper left unspecified; this library pins a
7
+ **frozen** benign set (Alpaca-style instructions salted with XSTest-style hard
8
+ negatives) committed as data/benign_anchors_v1.json. It is never regenerated at
9
+ runtime -- a frozen file is reproducible; a generator is not.
10
+
11
+ Harmful anchors are NOT shipped in the wheel. AdvBench is MIT, but since u_ref is
12
+ a derived average we never need to redistribute the prompts; scripts/make_default_uref.py
13
+ loads AdvBench at build time on the author's machine.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import abc
18
+ import json
19
+ from importlib import resources
20
+ from pathlib import Path
21
+
22
+ HARMFUL = "harmful"
23
+ BENIGN = "benign"
24
+
25
+
26
class AnchorSet(abc.ABC):
    """Abstract collection of labelled anchor prompts.

    Subclasses implement :meth:`items`; :meth:`split_by_label` is derived from it.
    """

    @abc.abstractmethod
    def items(self) -> list[tuple[str, str]]:
        """Return list of (text, label) where label in {'harmful','benign'}."""

    def split_by_label(self) -> tuple[list[str], list[str]]:
        """Return ``(harmful_texts, benign_texts)``, each in ``items()`` order.

        ``items()`` is called exactly once, so a subclass whose ``items()`` is
        costly or not repeatable still yields one consistent partition. Entries
        whose label is neither ``HARMFUL`` nor ``BENIGN`` are dropped.
        """
        harmful: list[str] = []
        benign: list[str] = []
        for text, label in self.items():
            if label == HARMFUL:
                harmful.append(text)
            elif label == BENIGN:
                benign.append(text)
        return harmful, benign
35
+
36
+
37
class JSONAnchorSet(AnchorSet):
    """Anchors from a JSON file: {"harmful": [...], "benign": [...], "_meta": {...}}."""

    def __init__(self, harmful: list[str], benign: list[str], meta: dict | None = None):
        """Store copies of both anchor lists plus optional metadata.

        Args:
            harmful: harmful anchor prompts.
            benign: benign anchor prompts.
            meta: free-form metadata; ``None`` (or any falsy value) becomes ``{}``.
        """
        # Copy so later mutation of the caller's lists cannot change this set.
        self._harmful = list(harmful)
        self._benign = list(benign)
        self.meta = meta or {}

    @classmethod
    def from_file(cls, path: str | Path) -> "JSONAnchorSet":
        """Load an anchor set from the JSON file at ``path``.

        The file is decoded as UTF-8 rather than the platform's locale encoding,
        so prompts containing non-ASCII text load identically on every OS.
        Missing "harmful" / "benign" / "_meta" keys default to empty values.
        """
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls(data.get("harmful", []), data.get("benign", []), data.get("_meta", {}))

    def items(self) -> list[tuple[str, str]]:
        """Return every harmful anchor followed by every benign anchor, labelled."""
        return [(t, HARMFUL) for t in self._harmful] + [(t, BENIGN) for t in self._benign]
52
+
53
+
54
def load_default_benign() -> list[str]:
    """The committed, frozen benign anchors shipped with the package.

    Reads ``benign_anchors_v1.json`` from the ``aplomb.data`` package resources
    and returns the list stored under its ``"benign"`` key. The resource is
    opened explicitly as UTF-8 so the result does not depend on the platform's
    default text encoding.
    """
    resource = resources.files("aplomb.data").joinpath("benign_anchors_v1.json")
    with resource.open("r", encoding="utf-8") as f:
        return json.load(f)["benign"]
58
+
59
+
60
def default_anchors(harmful: list[str]) -> JSONAnchorSet:
    """Pair caller-provided harmful anchors (e.g. AdvBench) with the frozen benign set.

    The returned set records ``benign_source = "benign_anchors_v1"`` in its metadata.
    """
    frozen_benign = load_default_benign()
    provenance = {"benign_source": "benign_anchors_v1"}
    return JSONAnchorSet(harmful=harmful, benign=frozen_benign, meta=provenance)
@@ -0,0 +1,105 @@
1
+ """Backbones: turn a prompt into per-layer hidden states.
2
+
3
+ A Backbone returns, for a prompt, a [n_layers, d] array: the residual stream at
4
+ the **last prompt position** for every layer (so layer selection is one forward
5
+ pass, not many). HFBackbone returns n_layers + 1 rows, because transformers also reports the embedding output. This is the ONLY module that touches model weights.
6
+
7
+ - HFBackbone : real models via transformers. Requires the [hf] extra.
8
+ - DummyBackbone: deterministic synthetic hidden states with a planted separable
9
+ signal at one layer. Lets the whole pipeline + CI run with no
10
+ torch, no GPU, no gated downloads.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import abc
15
+ import numpy as np
16
+
17
+ DEFAULT_MODEL = "Qwen/Qwen2.5-1.5B-Instruct" # ungated, Apache-2.0, in-paper
18
+ REFERENCE_MODEL = "meta-llama/Llama-3.1-8B-Instruct" # gated opt-in, paper-grade
19
+
20
+
21
class Backbone(abc.ABC):
    """Interface for anything that maps a prompt to per-layer hidden states."""

    name: str

    @abc.abstractmethod
    def hidden_states(self, prompt: str) -> np.ndarray:
        """Return [n_layers, d] hidden states at the last prompt position."""

    def batch_hidden_states(self, prompts: list[str]) -> np.ndarray:
        """Embed several prompts and stack them into [n_prompts, n_layers, d].

        This default simply loops over ``hidden_states``; override it when the
        underlying model supports true batching.
        """
        per_prompt = []
        for prompt in prompts:
            per_prompt.append(self.hidden_states(prompt))
        return np.stack(per_prompt)
31
+
32
+
33
class HFBackbone(Backbone):
    """Hugging Face transformers backbone.

    Lazy-imports torch/transformers so importing the package never requires them.
    Llama/Gemma are gated: the user must accept the license on HF and authenticate
    (`huggingface-cli login`) before these load.

    Args:
        model_name: Hub id (or local path) handed to ``from_pretrained``.
        device: torch device string; when omitted, "cuda" if available else "cpu".
        dtype: name of a ``torch`` dtype attribute, e.g. "float32" or "bfloat16".
        use_system_prompt: recorded on the instance only. NOTE(review):
            ``hidden_states`` below always builds a user-only conversation, so
            this flag does not currently change what is embedded.
        revision: optional Hub revision (commit hash, tag or branch) used for
            both tokenizer and weights. Exposed as ``self.revision``; when not
            given, ``self.revision`` is the string "unpinned" and the Hub
            default revision is loaded.

    Raises:
        ImportError: if torch or transformers is not installed.
    """

    def __init__(self, model_name: str = DEFAULT_MODEL, device: str | None = None,
                 dtype: str = "float32", use_system_prompt: bool = False,
                 revision: str | None = None):
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                "HFBackbone needs the [hf] extra: pip install 'aplomb[hf]'"
            ) from e
        self.name = model_name
        self.use_system_prompt = use_system_prompt
        self.revision = revision if revision is not None else "unpinned"
        self._torch = torch
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        # Only forward `revision` when the caller pinned one, so the unpinned
        # path makes exactly the same from_pretrained calls as before.
        pin = {} if revision is None else {"revision": revision}
        self.tok = AutoTokenizer.from_pretrained(model_name, **pin)
        _dt = getattr(torch, dtype)
        try:  # transformers >=4.56 renamed torch_dtype -> dtype
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name, dtype=_dt, output_hidden_states=True, **pin,
            )
        except TypeError:  # older transformers
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name, torch_dtype=_dt, output_hidden_states=True, **pin,
            )
        self.model = self.model.to(self.device).eval()

    def hidden_states(self, prompt: str) -> np.ndarray:
        """Return the last-token hidden state of every reported layer.

        The prompt is wrapped as a single user turn via the tokenizer's chat
        template (with the generation prompt appended) and run through the model
        once without gradients.

        Returns:
            float32 array of shape [n_layers + 1, d]: one row per entry of the
            model's ``hidden_states`` tuple, taken at the final token position.
        """
        torch = self._torch
        msgs = [{"role": "user", "content": prompt}]
        enc = self.tok.apply_chat_template(
            msgs, add_generation_prompt=True, return_tensors="pt",
            return_dict=True,  # BatchEncoding: input_ids + attention_mask
        )
        enc = {k: v.to(self.device) for k, v in enc.items()}
        with torch.no_grad():
            out = self.model(**enc, output_hidden_states=True)
        # hidden_states: tuple length (n_layers + 1), each [1, T, d]; take last token
        hs = torch.stack([h[0, -1] for h in out.hidden_states])  # [n_layers+1, d]
        return hs.to(torch.float32).cpu().numpy()
80
+
81
+
82
class DummyBackbone(Backbone):
    """Synthetic backbone with a planted refusal signal at ``signal_layer``.

    Used by tests and by anyone who wants to exercise the pipeline offline. At the
    signal layer, harmful prompts are pushed along a fixed direction and benign
    along its negative, so a correct pipeline must (a) pick ``signal_layer`` and
    (b) separate the classes cleanly.

    Hidden states are a pure function of ``(seed, prompt)``: embedding the same
    prompt twice, or in a different order relative to other prompts, yields the
    same array.

    Args:
        d: hidden size of the synthetic states.
        n_layers: number of rows returned per prompt.
        signal_layer: row index that carries the planted signal.
        sep: magnitude of the shift applied along the planted direction.
        seed: seed for the planted direction and the per-prompt noise.
    """

    def __init__(self, d: int = 64, n_layers: int = 12, signal_layer: int = 7,
                 sep: float = 6.0, seed: int = 0):
        self.name = "dummy"
        self.d, self.n_layers, self.signal_layer, self.sep = d, n_layers, signal_layer, sep
        # default_rng(seed) and default_rng(SeedSequence(seed)) are equivalent, so
        # the planted direction is unchanged; the entropy is kept to derive
        # independent per-prompt noise streams later.
        seed_seq = np.random.SeedSequence(seed)
        self._entropy = seed_seq.entropy
        self._rng = np.random.default_rng(seed_seq)
        self._dir = self._rng.standard_normal(d)
        self._dir /= np.linalg.norm(self._dir)

    def _label_of(self, prompt: str) -> int:
        """Return +1 for prompts starting with "[HARM]", otherwise -1."""
        return 1 if prompt.startswith("[HARM]") else -1

    def hidden_states(self, prompt: str) -> np.ndarray:
        """Return [n_layers, d] Gaussian noise with the signal added at ``signal_layer``."""
        import zlib  # local import: stable (non-randomized) hash of the prompt text

        prompt_key = zlib.crc32(prompt.encode("utf-8"))
        rng = np.random.default_rng(
            np.random.SeedSequence(self._entropy, spawn_key=(prompt_key,))
        )
        h = rng.standard_normal((self.n_layers, self.d))
        h[self.signal_layer] += self._label_of(prompt) * self.sep * self._dir
        return h
@@ -0,0 +1,38 @@
1
+ """Choose a default backbone by *measured detection separability*, not by ASR.
2
+
3
+ ASR (the steering heatmap) says how easy a model is to JAILBREAK. It does not say
4
+ how well harmful/benign separate in hidden states, which is what the detector needs.
5
+ This harness builds + evaluates a detector per candidate and ranks by held-out F1,
6
+ so "which default model" is a number you measured.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from .backbone import Backbone
11
+ from .build import build_detector
12
+
13
+
14
def bench_models(candidates: list[Backbone], harmful: list[str], benign: list[str],
                 *, layer: int | None = None) -> list[dict]:
    """Build and evaluate one detector per candidate backbone, ranked by F1.

    Args:
        candidates: backbones to compare.
        harmful: harmful anchor prompts passed to ``build_detector``.
        benign: benign anchor prompts passed to ``build_detector``.
        layer: forwarded to ``build_detector`` (``None`` lets it choose).

    Returns:
        One dict per candidate. Successful rows carry ``model``, ``layer``,
        ``f1``, ``fpr``, ``fisher_margin`` and ``gated``; a candidate whose
        build raised carries ``model`` and ``error`` instead. Rows are ordered
        by descending F1, then rows whose F1 is NaN, then failed rows.
    """
    rows = []
    for bb in candidates:
        try:
            _u, card = build_detector(bb, harmful, benign, layer=layer)
            rows.append({"model": bb.name, "layer": card.layer, "f1": card.f1,
                         "fpr": card.fpr, "fisher_margin": card.fisher_margin,
                         "gated": getattr(bb, "gated", "unknown")})
        except Exception as e:  # keep going if one candidate fails to load
            rows.append({"model": bb.name, "error": repr(e)})

    def _rank(row: dict) -> tuple[int, float]:
        # NaN compares false against everything, which would leave a plain
        # numeric sort in an arbitrary order; rank it explicitly instead.
        f1 = row.get("f1")
        if f1 is None:
            return (0, 0.0)  # failed candidate: always last
        if f1 != f1:
            return (1, 0.0)  # NaN F1: after every real score
        return (2, f1)

    rows.sort(key=_rank, reverse=True)
    return rows
27
+
28
+
29
def format_table(rows: list[dict]) -> str:
    """Render ``bench_models`` rows as a fixed-width text table.

    The model column is 40 characters wide, and grows when a model name is
    longer so the numeric columns stay aligned. Rows containing an ``"error"``
    key are rendered as ``FAILED: <error>`` instead of metrics.

    Returns:
        The header, a dashed rule and one line per row, joined with newlines.
    """
    width = max(40, max((len(str(r["model"])) for r in rows), default=0))
    head = f"{'model':{width}} {'layer':>5} {'F1':>6} {'FPR':>6} {'margin':>7}"
    lines = [head, "-" * len(head)]
    for r in rows:
        if "error" in r:
            lines.append(f"{r['model']:{width}} FAILED: {r['error']}")
        else:
            lines.append(f"{r['model']:{width}} {r['layer']:>5} {r['f1']:>6.3f} "
                         f"{r['fpr']:>6.3f} {r['fisher_margin']:>7.3f}")
    return "\n".join(lines)
@@ -0,0 +1,86 @@
1
+ """End-to-end u_ref construction: anchors + backbone -> evaluated artifact.
2
+
3
+ Pipeline:
4
+ 1. embed harmful & benign anchors -> per-layer hidden states (one pass each)
5
+ 2. select the layer with the cleanest separation (Fisher margin on a held-out split)
6
+ 3. build u_ref = mean(harmful) - mean(benign) at that layer
7
+ 4. calibrate tau (F1-optimal) on a calibration split
8
+ 5. evaluate F1 / FPR on a held-out test split
9
+ 6. emit (u_ref, card)
10
+
11
+ Steps 2 and 5 use disjoint splits so neither the chosen layer nor the reported
12
+ number is the product of fitting on the data it is scored on.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import numpy as np
17
+
18
+ from . import core
19
+ from .backbone import Backbone
20
+ from .card import UrefCard
21
+
22
+
23
+ def _norm_rows(x: np.ndarray, on: bool) -> np.ndarray:
24
+ if not on:
25
+ return x
26
+ return x / (np.linalg.norm(x, axis=-1, keepdims=True) + core.EPS)
27
+
28
+
29
def build_detector(
    backbone: Backbone,
    harmful: list[str],
    benign: list[str],
    *,
    layer: int | None = None,          # None -> auto-select; int -> force (e.g. -1 = final)
    normalize_anchors: bool = False,
    harmful_source: str = "AdvBench",
    benign_source: str = "benign_anchors_v1",
    eval_protocol: str = "anchor held-out split",
    seed: int = 0,
) -> tuple[np.ndarray, UrefCard]:
    """Build, calibrate and evaluate a ``u_ref`` vector for one backbone.

    Args:
        backbone: source of per-layer hidden states.
        harmful: harmful anchor prompts (must be non-empty).
        benign: benign anchor prompts (must be non-empty).
        layer: ``None`` to auto-select via ``core.select_layer``; otherwise a
            layer index, where negative values count from the end.
        normalize_anchors: unit-normalize hidden states before any averaging.
        harmful_source: provenance label copied into the card.
        benign_source: provenance label copied into the card.
        eval_protocol: free-text description copied into the card.
        seed: base seed for layer selection and for the anchor splits.

    Returns:
        ``(u_ref, card)`` where ``card`` records the chosen layer, tau and the
        F1 / FPR / Fisher margin measured on the held-out test portion.

    Raises:
        ValueError: if either anchor list is empty, or if a forced ``layer`` is
            outside ``[-L, L)`` for a backbone that reports ``L`` layers.
    """
    if len(harmful) == 0 or len(benign) == 0:
        raise ValueError("build_detector needs at least one harmful and one benign anchor")

    H = _norm_rows(backbone.batch_hidden_states(harmful), normalize_anchors)  # [nH, L, d]
    B = _norm_rows(backbone.batch_hidden_states(benign), normalize_anchors)   # [nB, L, d]
    L = H.shape[1]

    # ---- choose the reading layer -------------------------------------------------
    if layer is None:
        chosen, margins = core.select_layer(H, B, seed=seed)
        sel = "fisher"
    else:
        # A bare modulo would silently map e.g. layer=L to layer 0 and record a
        # layer the caller never asked for, so reject out-of-range requests.
        if not -L <= layer < L:
            raise ValueError(
                f"layer={layer} is out of range for a backbone with {L} layers"
            )
        chosen = layer % L  # normalizes negative indices (-1 -> L - 1)
        margins = [float("nan")] * L
        sel = "forced"

    # ---- split anchors: build u_ref / calibrate tau / report on disjoint sets -----
    h_fit, h_rest = core._split(H[:, chosen], 0.5, seed)
    b_fit, b_rest = core._split(B[:, chosen], 0.5, seed + 1)
    h_cal, h_test = core._split(h_rest, 0.5, seed + 2)
    b_cal, b_test = core._split(b_rest, 0.5, seed + 3)

    u_ref = core.build_uref(h_fit, b_fit)

    tau, _ = core.calibrate_tau(core.cosine(h_cal, u_ref), core.cosine(b_cal, u_ref))
    m = core.metrics_at(core.cosine(h_test, u_ref), core.cosine(b_test, u_ref), tau)
    # The margin stored in the card is computed on the test-split cosine scores.
    margin = core.fisher_margin(core.cosine(h_test, u_ref), core.cosine(b_test, u_ref))

    card = UrefCard(
        model=backbone.name,
        model_revision=getattr(backbone, "revision", "unpinned"),
        layer=int(chosen),
        layer_selection=sel,
        fisher_margin=float(margin),
        harmful_source=harmful_source,
        harmful_n=len(harmful),
        benign_source=benign_source,
        benign_n=len(benign),
        position="last_prompt_token",
        use_system_prompt=getattr(backbone, "use_system_prompt", False),
        normalize_anchors=normalize_anchors,
        tau=float(tau),
        f1=float(m["f1"]),
        fpr=float(m["fpr"]),
        eval_protocol=eval_protocol,
        notes="F1/FPR are this library's measured numbers on the held-out anchor split, "
              "not the paper's 0.92 (which used a different, unspecified benign set).",
    )
    return u_ref, card
@@ -0,0 +1,58 @@
1
+ """The u_ref *card* and *artifact*.
2
+
3
+ The card is the reproducibility contract: every knob the paper left open and that
4
+ changes the resulting vector lives here, so a u_ref is "Qwen2.5-1.5B, layer 14,
5
+ benign=benign_anchors_v1, tau=0.41, F1=0.xx", never a magic file.
6
+
7
+ The artifact bundles the card + the actual vector + the chosen layer + tau, as one
8
+ JSON the detector loads.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import dataclasses
13
+ import datetime as _dt
14
+ import json
15
+ from pathlib import Path
16
+
17
+ import numpy as np
18
+
19
SCHEMA_VERSION = "1"


def _utc_timestamp() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    return _dt.datetime.now(_dt.timezone.utc).isoformat()


@dataclasses.dataclass
class UrefCard:
    """Provenance record stored alongside a ``u_ref`` vector.

    Every field is a plain scalar, so ``to_dict()`` is directly JSON-serializable.
    """

    # --- which model / layer the vector was read from ---
    model: str                 # e.g. "Qwen/Qwen2.5-1.5B-Instruct"
    model_revision: str        # HF commit hash, pin it
    layer: int                 # chosen layer index (auto-selected unless forced)
    layer_selection: str       # "fisher" | "heldout_f1" | "forced"
    fisher_margin: float       # harmful/benign separation measured for the chosen layer
    # --- anchor provenance ---
    harmful_source: str        # "AdvBench"
    harmful_n: int
    benign_source: str         # "benign_anchors_v1"
    benign_n: int
    # --- extraction settings ---
    position: str              # "last_prompt_token"
    use_system_prompt: bool    # whether a system prompt was present at extraction
    normalize_anchors: bool    # whether anchors were unit-normalized before mean
    # --- calibration and measured quality ---
    tau: float                 # calibrated decision threshold
    f1: float                  # measured F1 (this library's number, NOT the paper's)
    fpr: float                 # benign false-positive rate
    eval_protocol: str         # e.g. "JailbreakBench benign vs harmful, held-out"
    # --- bookkeeping (defaults) ---
    created: str = dataclasses.field(default_factory=_utc_timestamp)
    schema_version: str = SCHEMA_VERSION
    notes: str = ""

    def to_dict(self) -> dict:
        """Return the card as a plain dict keyed by field name."""
        as_mapping = dataclasses.asdict(self)
        return as_mapping
46
+
47
+
48
def save_artifact(path: str | Path, u_ref: np.ndarray, card: UrefCard) -> None:
    """Write the card, chosen layer, tau and vector to ``path`` as indented JSON.

    The vector is converted to float64 and stored as a nested list under
    ``"u_ref"``; ``"layer"`` and ``"tau"`` are duplicated from the card at the
    top level.
    """
    payload = {"card": card.to_dict()}
    payload["layer"] = card.layer
    payload["tau"] = card.tau
    payload["u_ref"] = np.asarray(u_ref, dtype=np.float64).tolist()
    serialized = json.dumps(payload, indent=2)
    Path(path).write_text(serialized)
52
+
53
+
54
def load_artifact(path: str | Path) -> tuple[np.ndarray, UrefCard]:
    """Load a ``u_ref`` artifact written by ``save_artifact``.

    The file is decoded as UTF-8 (the interchange encoding for JSON) rather than
    the platform's locale encoding, so an artifact whose card contains non-ASCII
    text loads the same way on every OS.

    Returns:
        ``(u_ref, card)``: the vector as a float64 array and the card rebuilt
        from the ``"card"`` object.

    Raises:
        KeyError: if ``"u_ref"`` or ``"card"`` is missing from the file.
    """
    obj = json.loads(Path(path).read_text(encoding="utf-8"))
    u_ref = np.asarray(obj["u_ref"], dtype=np.float64)
    card = UrefCard(**obj["card"])
    return u_ref, card