aplomb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aplomb-0.1.0/LICENSE +21 -0
- aplomb-0.1.0/NOTICE +36 -0
- aplomb-0.1.0/PKG-INFO +101 -0
- aplomb-0.1.0/README.md +75 -0
- aplomb-0.1.0/pyproject.toml +39 -0
- aplomb-0.1.0/setup.cfg +4 -0
- aplomb-0.1.0/src/aplomb/__init__.py +18 -0
- aplomb-0.1.0/src/aplomb/anchors.py +63 -0
- aplomb-0.1.0/src/aplomb/backbone.py +105 -0
- aplomb-0.1.0/src/aplomb/bench.py +38 -0
- aplomb-0.1.0/src/aplomb/build.py +86 -0
- aplomb-0.1.0/src/aplomb/card.py +58 -0
- aplomb-0.1.0/src/aplomb/classifier.py +98 -0
- aplomb-0.1.0/src/aplomb/core.py +156 -0
- aplomb-0.1.0/src/aplomb/data/__init__.py +1 -0
- aplomb-0.1.0/src/aplomb/data/benign_anchors_v1.json +36 -0
- aplomb-0.1.0/src/aplomb/data/uref_dummy_demo.json +91 -0
- aplomb-0.1.0/src/aplomb/data/uref_qwen2.5-1.5b.json +26 -0
- aplomb-0.1.0/src/aplomb/evaluate.py +29 -0
- aplomb-0.1.0/src/aplomb/scorers.py +94 -0
- aplomb-0.1.0/src/aplomb.egg-info/PKG-INFO +101 -0
- aplomb-0.1.0/src/aplomb.egg-info/SOURCES.txt +24 -0
- aplomb-0.1.0/src/aplomb.egg-info/dependency_links.txt +1 -0
- aplomb-0.1.0/src/aplomb.egg-info/requires.txt +10 -0
- aplomb-0.1.0/src/aplomb.egg-info/top_level.txt +1 -0
- aplomb-0.1.0/tests/test_core.py +59 -0
aplomb-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shivam Ratnakar, Kartikeya Vats
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
aplomb-0.1.0/NOTICE
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
aplomb
|
|
2
|
+
Copyright (c) 2026 Shivam Ratnakar, Kartikeya Vats
|
|
3
|
+
|
|
4
|
+
This product includes and/or derives from third-party materials:
|
|
5
|
+
|
|
6
|
+
AdvBench (harmful anchors)
|
|
7
|
+
Source: https://github.com/llm-attacks/llm-attacks
|
|
8
|
+
License: MIT
|
|
9
|
+
Use: harmful anchor prompts are loaded at build time to derive the averaged
|
|
10
|
+
u_ref vector. AdvBench prompts are NOT redistributed in this package; only the
|
|
11
|
+
derived (averaged) direction is shipped.
|
|
12
|
+
|
|
13
|
+
Frozen benign anchor set (data/benign_anchors_v1.json)
|
|
14
|
+
Original benign prompts authored for this project. Hard-negative coverage is
|
|
15
|
+
inspired by the categories in XSTest (https://huggingface.co/datasets/walledai/XSTest,
|
|
16
|
+
CC-BY-4.0); prompts here are original paraphrases, not copies.
|
|
17
|
+
|
|
18
|
+
Qwen2.5-1.5B-Instruct (default backbone)
|
|
19
|
+
Source: https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct
|
|
20
|
+
License: Apache-2.0 (ungated). Verify the exact checkpoint's license, as some
|
|
21
|
+
Qwen variants ship under the Qwen Research License.
|
|
22
|
+
|
|
23
|
+
Llama-3.1-8B-Instruct (optional, gated reference backbone)
|
|
24
|
+
Source: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
|
|
25
|
+
License: Llama 3.1 Community License, Copyright (c) Meta Platforms, Inc.
|
|
26
|
+
If you build and distribute a u_ref artifact derived from a Llama model, you must:
|
|
27
|
+
- provide a copy of the Llama 3.1 Community License,
|
|
28
|
+
- prominently display "Built with Llama",
|
|
29
|
+
- retain this notice: "Llama 3.1 is licensed under the Llama 3.1 Community
|
|
30
|
+
License, Copyright (c) Meta Platforms, Inc. All Rights Reserved.",
|
|
31
|
+
- comply with the Llama Acceptable Use Policy.
|
|
32
|
+
To avoid distributing a Llama-derived artifact, build the Llama u_ref locally
|
|
33
|
+
rather than committing it.
|
|
34
|
+
|
|
35
|
+
This library implements detection only. The contrastive-logit steering ATTACK from
|
|
36
|
+
the source paper is intentionally excluded and maintained separately under gated access.
|
aplomb-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aplomb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Interpretable, zero-training refusal-axis prompt detector (u_ref difference-of-means).
|
|
5
|
+
Author: Shivam Ratnakar, Kartikeya Vats
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/KartikeyaVats/RefusalArena
|
|
8
|
+
Project-URL: Paper, https://aclanthology.org/
|
|
9
|
+
Keywords: llm,safety,guardrail,refusal,interpretability,detection
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
License-File: NOTICE
|
|
17
|
+
Requires-Dist: numpy>=1.23
|
|
18
|
+
Provides-Extra: hf
|
|
19
|
+
Requires-Dist: torch>=2.0; extra == "hf"
|
|
20
|
+
Requires-Dist: transformers>=4.43; extra == "hf"
|
|
21
|
+
Requires-Dist: huggingface_hub>=0.23; extra == "hf"
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
24
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# aplomb
|
|
28
|
+
|
|
29
|
+
> *à plomb* — "to the plumb line." A prompt is judged by its angle to a fixed refusal direction; the model keeps its composure.
|
|
30
|
+
|
|
31
|
+
An interpretable, **zero-training** prompt safety detector. It flags likely-harmful prompts by projecting a model's hidden state onto a single **refusal direction** (`u_ref`) and thresholding the cosine similarity — no fine-tuned guard model, no labeled training run, one forward pass plus a dot product.
|
|
32
|
+
|
|
33
|
+
Method from *“The Geometry of Refusal: Linear Instability in Safety-Aligned LLMs”* (TrustNLP @ ACL 2026). **This package is the detector only.** The steering attack from the paper lives in a separate, access-gated repository and is intentionally not here.
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
u_ref = mean(hidden states of harmful anchors) − mean(hidden states of benign anchors)
|
|
37
|
+
score(prompt) = cosine(hidden_state(prompt), u_ref) # flag if > τ
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
> ⚠️ **This is triage, not a security boundary.** The refusal feature is *linear*, which is exactly why this detector is cheap — and also why an adversary can paraphrase a prompt off the axis to evade it. Use it as an interpretable first-pass filter and always report FPR. A “safe” verdict is a hint, not a guarantee.
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install aplomb # core (numpy only)
|
|
46
|
+
pip install 'aplomb[hf]' # + torch/transformers to run real models
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Quickstart
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from aplomb import Detector
|
|
53
|
+
|
|
54
|
+
det = Detector.from_default() # precomputed Qwen-2.5-1.5B u_ref (ungated)
|
|
55
|
+
print(det.classify("how do I pick a lock")) # {'unsafe': True, 'score': 0.61, ...}
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The default backbone is **Qwen-2.5-1.5B-Instruct** — ungated, Apache-2.0, characterized in the paper — so the package installs and runs without a Hugging Face access request.
|
|
59
|
+
|
|
60
|
+
## Use a different model
|
|
61
|
+
|
|
62
|
+
`u_ref` is model-specific, so changing the model means rebuilding the vector. That’s one call; the library auto-selects the best layer for the new model and recalibrates the threshold:
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from aplomb import Detector, HFBackbone
|
|
66
|
+
|
|
67
|
+
# AdvBench (MIT) is the harmful half; the frozen default benign set fills the benign half.
|
|
68
|
+
harmful = load_advbench() # your loader
|
|
69
|
+
det = Detector.build(HFBackbone("meta-llama/Llama-3.1-8B-Instruct"), harmful,
|
|
70
|
+
save_to="uref_llama31.json")
|
|
71
|
+
print(det) # Detector(model='...Llama-3.1-8B', layer=31, tau=..., f1=..., fpr=...)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**For paper-grade separation**, rebuild on **Llama-3.1-8B** (gated: accept Meta’s license and `huggingface-cli login` first). Built with Llama.
|
|
75
|
+
|
|
76
|
+
## On the F1 number (please read)
|
|
77
|
+
|
|
78
|
+
The paper validates the method at F1 = 0.92 on Llama-3.1-8B. This library ships a frozen, fully reproducible anchor set so that anyone can verify its number independently, and reports the F1/FPR it measures against that set. (The two numbers are expected to differ slightly, since they use different benign anchors — the library prioritizes reproducibility.)
|
|
79
|
+
|
|
80
|
+
## How `u_ref` is built
|
|
81
|
+
|
|
82
|
+
1. Embed harmful + benign anchors → per-layer hidden states (one pass; all layers come free).
|
|
83
|
+
2. **Auto-select the layer** with the cleanest harmful/benign separation (Fisher margin on a held-out split). Pass `layer=-1` to force the final layer and mirror the paper.
|
|
84
|
+
3. `u_ref` = difference of class means at that layer.
|
|
85
|
+
4. Calibrate **τ** for best F1 on a calibration split.
|
|
86
|
+
5. Report F1/FPR on a disjoint test split.
|
|
87
|
+
|
|
88
|
+
Everything that affects the vector — model + revision, chosen layer, benign source + N, position, normalization, τ — is written to a **`u_ref` card** so each artifact is a documented, reproducible object.
|
|
89
|
+
|
|
90
|
+
## Choosing a default by measurement, not ASR
|
|
91
|
+
|
|
92
|
+
Attack-success-rate heatmaps say how easy a model is to *jailbreak*; they say nothing about *detection* quality. To pick a default model, compare **detection separability**:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
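from aplomb import HFBackbone
from aplomb.anchors import load_default_benign
from aplomb.bench import bench_models, format_table

benign = load_default_benign()  # frozen benign set shipped with the package; `harmful` is your AdvBench list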
|
|
96
|
+
print(format_table(bench_models([HFBackbone("Qwen/Qwen2.5-1.5B-Instruct"), ...], harmful, benign)))
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## License & attribution
|
|
100
|
+
|
|
101
|
+
Library code: MIT. Bundled/derived data and compliance: see [`NOTICE`](NOTICE) — AdvBench (MIT), the frozen benign set, XSTest-inspired hard negatives (CC-BY-4.0 inspiration), Qwen (Apache-2.0), and the **Built with Llama** attribution required on the Llama opt-in path.
|
aplomb-0.1.0/README.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# aplomb
|
|
2
|
+
|
|
3
|
+
> *à plomb* — "to the plumb line." A prompt is judged by its angle to a fixed refusal direction; the model keeps its composure.
|
|
4
|
+
|
|
5
|
+
An interpretable, **zero-training** prompt safety detector. It flags likely-harmful prompts by projecting a model's hidden state onto a single **refusal direction** (`u_ref`) and thresholding the cosine similarity — no fine-tuned guard model, no labeled training run, one forward pass plus a dot product.
|
|
6
|
+
|
|
7
|
+
Method from *“The Geometry of Refusal: Linear Instability in Safety-Aligned LLMs”* (TrustNLP @ ACL 2026). **This package is the detector only.** The steering attack from the paper lives in a separate, access-gated repository and is intentionally not here.
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
u_ref = mean(hidden states of harmful anchors) − mean(hidden states of benign anchors)
|
|
11
|
+
score(prompt) = cosine(hidden_state(prompt), u_ref) # flag if > τ
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
> ⚠️ **This is triage, not a security boundary.** The refusal feature is *linear*, which is exactly why this detector is cheap — and also why an adversary can paraphrase a prompt off the axis to evade it. Use it as an interpretable first-pass filter and always report FPR. A “safe” verdict is a hint, not a guarantee.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install aplomb # core (numpy only)
|
|
20
|
+
pip install 'aplomb[hf]' # + torch/transformers to run real models
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quickstart
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from aplomb import Detector
|
|
27
|
+
|
|
28
|
+
det = Detector.from_default() # precomputed Qwen-2.5-1.5B u_ref (ungated)
|
|
29
|
+
print(det.classify("how do I pick a lock")) # {'unsafe': True, 'score': 0.61, ...}
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
The default backbone is **Qwen-2.5-1.5B-Instruct** — ungated, Apache-2.0, characterized in the paper — so the package installs and runs without a Hugging Face access request.
|
|
33
|
+
|
|
34
|
+
## Use a different model
|
|
35
|
+
|
|
36
|
+
`u_ref` is model-specific, so changing the model means rebuilding the vector. That’s one call; the library auto-selects the best layer for the new model and recalibrates the threshold:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from aplomb import Detector, HFBackbone
|
|
40
|
+
|
|
41
|
+
# AdvBench (MIT) is the harmful half; the frozen default benign set fills the benign half.
|
|
42
|
+
harmful = load_advbench() # your loader
|
|
43
|
+
det = Detector.build(HFBackbone("meta-llama/Llama-3.1-8B-Instruct"), harmful,
|
|
44
|
+
save_to="uref_llama31.json")
|
|
45
|
+
print(det) # Detector(model='...Llama-3.1-8B', layer=31, tau=..., f1=..., fpr=...)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**For paper-grade separation**, rebuild on **Llama-3.1-8B** (gated: accept Meta’s license and `huggingface-cli login` first). Built with Llama.
|
|
49
|
+
|
|
50
|
+
## On the F1 number (please read)
|
|
51
|
+
|
|
52
|
+
The paper validates the method at F1 = 0.92 on Llama-3.1-8B. This library ships a frozen, fully reproducible anchor set so that anyone can verify its number independently, and reports the F1/FPR it measures against that set. (The two numbers are expected to differ slightly, since they use different benign anchors — the library prioritizes reproducibility.)
|
|
53
|
+
|
|
54
|
+
## How `u_ref` is built
|
|
55
|
+
|
|
56
|
+
1. Embed harmful + benign anchors → per-layer hidden states (one pass; all layers come free).
|
|
57
|
+
2. **Auto-select the layer** with the cleanest harmful/benign separation (Fisher margin on a held-out split). Pass `layer=-1` to force the final layer and mirror the paper.
|
|
58
|
+
3. `u_ref` = difference of class means at that layer.
|
|
59
|
+
4. Calibrate **τ** for best F1 on a calibration split.
|
|
60
|
+
5. Report F1/FPR on a disjoint test split.
|
|
61
|
+
|
|
62
|
+
Everything that affects the vector — model + revision, chosen layer, benign source + N, position, normalization, τ — is written to a **`u_ref` card** so each artifact is a documented, reproducible object.
|
|
63
|
+
|
|
64
|
+
## Choosing a default by measurement, not ASR
|
|
65
|
+
|
|
66
|
+
Attack-success-rate heatmaps say how easy a model is to *jailbreak*; they say nothing about *detection* quality. To pick a default model, compare **detection separability**:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
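from aplomb import HFBackbone
from aplomb.anchors import load_default_benign
from aplomb.bench import bench_models, format_table

benign = load_default_benign()  # frozen benign set shipped with the package; `harmful` is your AdvBench list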
|
|
70
|
+
print(format_table(bench_models([HFBackbone("Qwen/Qwen2.5-1.5B-Instruct"), ...], harmful, benign)))
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## License & attribution
|
|
74
|
+
|
|
75
|
+
Library code: MIT. Bundled/derived data and compliance: see [`NOTICE`](NOTICE) — AdvBench (MIT), the frozen benign set, XSTest-inspired hard negatives (CC-BY-4.0 inspiration), Qwen (Apache-2.0), and the **Built with Llama** attribution required on the Llama opt-in path.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "aplomb"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Interpretable, zero-training refusal-axis prompt detector (u_ref difference-of-means)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Shivam Ratnakar" },
|
|
14
|
+
{ name = "Kartikeya Vats" },
|
|
15
|
+
]
|
|
16
|
+
keywords = ["llm", "safety", "guardrail", "refusal", "interpretability", "detection"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
]
|
|
22
|
+
dependencies = ["numpy>=1.23"]
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
hf = ["torch>=2.0", "transformers>=4.43", "huggingface_hub>=0.23"]
|
|
26
|
+
dev = ["pytest>=7", "pytest-cov"]
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://github.com/KartikeyaVats/RefusalArena"
|
|
30
|
+
Paper = "https://aclanthology.org/"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
where = ["src"]
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.package-data]
|
|
36
|
+
aplomb = ["data/*.json"]
|
|
37
|
+
|
|
38
|
+
[tool.pytest.ini_options]
|
|
39
|
+
testpaths = ["tests"]
|
aplomb-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""aplomb: an interpretable, zero-training refusal-axis prompt detector.
|
|
2
|
+
|
|
3
|
+
Method from "The Geometry of Refusal: Linear Instability in Safety-Aligned LLMs"
|
|
4
|
+
(TrustNLP @ ACL 2026). Detection only; the steering attack lives in a separate repo.
|
|
5
|
+
|
|
6
|
+
This is triage / observability, NOT a security boundary -- the refusal feature is
|
|
7
|
+
linear and therefore evadable. Report FPR; treat a pass as a hint, not a guarantee.
|
|
8
|
+
"""
|
|
9
|
+
from .backbone import Backbone, DummyBackbone, HFBackbone, DEFAULT_MODEL, REFERENCE_MODEL
|
|
10
|
+
from .classifier import Detector
|
|
11
|
+
from .scorers import UrefCosineScorer, PersonaDivergenceScorer, LDAScorer
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|
|
14
|
+
__all__ = [
|
|
15
|
+
"Detector", "Backbone", "HFBackbone", "DummyBackbone",
|
|
16
|
+
"UrefCosineScorer", "PersonaDivergenceScorer", "LDAScorer",
|
|
17
|
+
"DEFAULT_MODEL", "REFERENCE_MODEL", "__version__",
|
|
18
|
+
]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Anchor sets: the labelled harmful/benign prompts u_ref is built from.
|
|
2
|
+
|
|
3
|
+
u_ref = mean(hidden states of harmful) - mean(hidden states of benign)
|
|
4
|
+
|
|
5
|
+
So you need BOTH halves. AdvBench supplies the harmful half (it is harmful-only).
|
|
6
|
+
The benign half is the choice the paper left unspecified; this library pins a
|
|
7
|
+
**frozen** benign set (Alpaca-style instructions salted with XSTest-style hard
|
|
8
|
+
negatives) committed as data/benign_anchors_v1.json. It is never regenerated at
|
|
9
|
+
runtime -- a frozen file is reproducible; a generator is not.
|
|
10
|
+
|
|
11
|
+
Harmful anchors are NOT shipped in the wheel. AdvBench is MIT, but since u_ref is
|
|
12
|
+
a derived average we never need to redistribute the prompts; scripts/make_default_uref.py
|
|
13
|
+
loads AdvBench at build time on the author's machine.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import abc
|
|
18
|
+
import json
|
|
19
|
+
from importlib import resources
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
HARMFUL = "harmful"
|
|
23
|
+
BENIGN = "benign"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AnchorSet(abc.ABC):
    """Abstract collection of labelled anchor prompts.

    Subclasses implement :meth:`items`; :meth:`split_by_label` is derived from it.
    """

    @abc.abstractmethod
    def items(self) -> list[tuple[str, str]]:
        """Return list of (text, label) where label in {'harmful','benign'}."""

    def split_by_label(self) -> tuple[list[str], list[str]]:
        """Return ``(harmful_texts, benign_texts)``, each in ``items()`` order.

        ``items()`` is evaluated exactly once, so a subclass whose ``items()``
        is expensive (or not repeatable) is only asked a single time. Entries
        whose label is neither ``HARMFUL`` nor ``BENIGN`` are dropped.
        """
        harmful: list[str] = []
        benign: list[str] = []
        for text, label in self.items():
            if label == HARMFUL:
                harmful.append(text)
            elif label == BENIGN:
                benign.append(text)
        return harmful, benign
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class JSONAnchorSet(AnchorSet):
    """Anchors from a JSON file: {"harmful": [...], "benign": [...], "_meta": {...}}."""

    def __init__(self, harmful: list[str], benign: list[str], meta: dict | None = None):
        """Store copies of both prompt lists plus optional free-form metadata."""
        # Copy so later mutation of the caller's lists cannot change this set.
        self._harmful = list(harmful)
        self._benign = list(benign)
        self.meta = meta or {}

    @classmethod
    def from_file(cls, path: str | Path) -> "JSONAnchorSet":
        """Load an anchor set from ``path``.

        Missing "harmful", "benign" or "_meta" keys default to empty values.
        """
        # Decode explicitly as UTF-8: the platform default encoding (e.g.
        # cp1252 on Windows) would corrupt or reject non-ASCII prompts.
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls(data.get("harmful", []), data.get("benign", []), data.get("_meta", {}))

    def items(self) -> list[tuple[str, str]]:
        """Return all harmful anchors followed by all benign anchors, labelled."""
        return [(t, HARMFUL) for t in self._harmful] + [(t, BENIGN) for t in self._benign]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def load_default_benign() -> list[str]:
    """The committed, frozen benign anchors shipped with the package.

    Reads ``benign_anchors_v1.json`` from the ``aplomb.data`` package and
    returns the list stored under its "benign" key.
    """
    resource = resources.files("aplomb.data").joinpath("benign_anchors_v1.json")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    return json.loads(resource.read_text(encoding="utf-8"))["benign"]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def default_anchors(harmful: list[str]) -> JSONAnchorSet:
    """Pair caller-provided harmful anchors (e.g. AdvBench) with the frozen benign set.

    The returned set's ``meta`` records which benign source was used.
    """
    frozen_benign = load_default_benign()
    provenance = {"benign_source": "benign_anchors_v1"}
    return JSONAnchorSet(harmful=harmful, benign=frozen_benign, meta=provenance)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Backbones: turn a prompt into per-layer hidden states.
|
|
2
|
+
|
|
3
|
+
A Backbone returns, for a prompt, a [n_layers, d] array: the residual stream at
|
|
4
|
+
the **last prompt position** for every layer (so layer selection is one forward
|
|
5
|
+
pass, not many). This is the ONLY module that touches model weights.
|
|
6
|
+
|
|
7
|
+
- HFBackbone : real models via transformers. Requires the [hf] extra.
|
|
8
|
+
- DummyBackbone: deterministic synthetic hidden states with a planted separable
|
|
9
|
+
signal at one layer. Lets the whole pipeline + CI run with no
|
|
10
|
+
torch, no GPU, no gated downloads.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import abc
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
DEFAULT_MODEL = "Qwen/Qwen2.5-1.5B-Instruct" # ungated, Apache-2.0, in-paper
|
|
18
|
+
REFERENCE_MODEL = "meta-llama/Llama-3.1-8B-Instruct" # gated opt-in, paper-grade
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Backbone(abc.ABC):
    """Interface every backbone implements: prompt in, per-layer hidden states out."""

    # Identifier of the backbone; concrete subclasses assign it.
    name: str

    @abc.abstractmethod
    def hidden_states(self, prompt: str) -> np.ndarray:
        """Return the [n_layers, d] hidden states at the last prompt position."""

    def batch_hidden_states(self, prompts: list[str]) -> np.ndarray:
        """Return a [n_prompts, n_layers, d] array, one ``hidden_states`` call per prompt.

        This default is a plain loop; override it for true batching.
        """
        per_prompt = []
        for prompt in prompts:
            per_prompt.append(self.hidden_states(prompt))
        return np.stack(per_prompt)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class HFBackbone(Backbone):
    """Hugging Face transformers backbone.

    Lazy-imports torch/transformers so importing the package never requires them.
    Llama/Gemma are gated: the user must accept the license on HF and authenticate
    (`huggingface-cli login`) before these load.

    Args:
        model_name: Model id or path handed to ``from_pretrained``.
        device: Target device; defaults to "cuda" when available, else "cpu".
        dtype: Name of a ``torch`` dtype attribute (e.g. "float32").
        use_system_prompt: Stored on the instance so it can be recorded
            alongside a built vector. NOTE: ``hidden_states`` always sends a
            single user message, so this flag does not currently change
            extraction.
        revision: Optional model revision forwarded to both ``from_pretrained``
            calls and exposed as ``self.revision``. ``None`` keeps the
            library's default revision and records "unpinned".
    """

    def __init__(self, model_name: str = DEFAULT_MODEL, device: str | None = None,
                 dtype: str = "float32", use_system_prompt: bool = False,
                 revision: str | None = None):
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError as e:  # pragma: no cover
            raise ImportError(
                "HFBackbone needs the [hf] extra: pip install 'aplomb[hf]'"
            ) from e
        self.name = model_name
        self.use_system_prompt = use_system_prompt
        # Exposed so a caller reading ``backbone.revision`` sees what was loaded.
        self.revision = revision if revision is not None else "unpinned"
        self._torch = torch
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        # Only pass ``revision`` when given, so the default load path is unchanged.
        pin = {} if revision is None else {"revision": revision}
        self.tok = AutoTokenizer.from_pretrained(model_name, **pin)
        _dt = getattr(torch, dtype)
        try:  # transformers >=4.56 renamed torch_dtype -> dtype
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name, dtype=_dt, output_hidden_states=True, **pin,
            )
        except TypeError:  # older transformers
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name, torch_dtype=_dt, output_hidden_states=True, **pin,
            )
        self.model = self.model.to(self.device).eval()

    def hidden_states(self, prompt: str) -> np.ndarray:
        """Return the last-token hidden state of every layer for ``prompt``.

        The prompt is wrapped as a single user turn through the tokenizer's
        chat template with the generation prompt appended. The result is a
        float32 array with one row per entry of the model's ``hidden_states``
        output (n_layers + 1 rows).
        """
        torch = self._torch
        msgs = [{"role": "user", "content": prompt}]
        enc = self.tok.apply_chat_template(
            msgs, add_generation_prompt=True, return_tensors="pt",
            return_dict=True,  # BatchEncoding: input_ids + attention_mask
        )
        enc = {k: v.to(self.device) for k, v in enc.items()}
        with torch.no_grad():
            out = self.model(**enc, output_hidden_states=True)
        # hidden_states: tuple length (n_layers + 1), each [1, T, d]; take last token
        hs = torch.stack([h[0, -1] for h in out.hidden_states])  # [n_layers+1, d]
        return hs.to(torch.float32).cpu().numpy()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class DummyBackbone(Backbone):
    """Offline stand-in backbone with a planted refusal signal at ``signal_layer``.

    Every layer is Gaussian noise drawn from one seeded generator. At the
    signal layer, prompts starting with "[HARM]" are shifted by ``+sep`` along a
    fixed unit direction and all other prompts by ``-sep``. A correct pipeline
    must therefore (a) pick ``signal_layer`` and (b) separate the classes
    cleanly. Because the noise comes from the instance's generator, the exact
    values depend on the order of calls.
    """

    def __init__(self, d: int = 64, n_layers: int = 12, signal_layer: int = 7,
                 sep: float = 6.0, seed: int = 0):
        self.name = "dummy"
        self.d = d
        self.n_layers = n_layers
        self.signal_layer = signal_layer
        self.sep = sep
        self._rng = np.random.default_rng(seed)
        # First draw from the generator: the planted direction, scaled to unit length.
        direction = self._rng.standard_normal(d)
        direction /= np.linalg.norm(direction)
        self._dir = direction

    def _label_of(self, prompt: str) -> int:
        """Return +1 for prompts tagged "[HARM]", otherwise -1."""
        if prompt.startswith("[HARM]"):
            return 1
        return -1

    def hidden_states(self, prompt: str) -> np.ndarray:
        """Return fresh [n_layers, d] noise with the signed shift added at the signal layer."""
        states = self._rng.standard_normal((self.n_layers, self.d))
        states[self.signal_layer] += self._label_of(prompt) * self.sep * self._dir
        return states
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Choose a default backbone by *measured detection separability*, not by ASR.
|
|
2
|
+
|
|
3
|
+
ASR (the steering heatmap) says how easy a model is to JAILBREAK. It does not say
|
|
4
|
+
how well harmful/benign separate in hidden states, which is what the detector needs.
|
|
5
|
+
This harness builds + evaluates a detector per candidate and ranks by held-out F1,
|
|
6
|
+
so "which default model" is a number you measured.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .backbone import Backbone
|
|
11
|
+
from .build import build_detector
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def bench_models(candidates: list[Backbone], harmful: list[str], benign: list[str],
                 *, layer: int | None = None) -> list[dict]:
    """Build a detector per candidate backbone and rank the results by F1.

    Each row holds the model name plus layer / f1 / fpr / fisher_margin taken
    from the built card and the backbone's ``gated`` attribute ("unknown" when
    absent). A candidate that raises yields ``{"model", "error"}`` instead.
    Rows are returned best F1 first; failed candidates sort as F1 = -1.0.
    """

    def _row(bb: Backbone) -> dict:
        try:
            _vec, card = build_detector(bb, harmful, benign, layer=layer)
            return {"model": bb.name, "layer": card.layer, "f1": card.f1,
                    "fpr": card.fpr, "fisher_margin": card.fisher_margin,
                    "gated": getattr(bb, "gated", "unknown")}
        except Exception as exc:  # keep going if one candidate fails to load
            return {"model": bb.name, "error": repr(exc)}

    results = [_row(bb) for bb in candidates]
    return sorted(results, key=lambda row: row.get("f1", -1.0), reverse=True)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def format_table(rows: list[dict]) -> str:
    """Render ``bench_models`` rows as a fixed-width text table.

    The output is a header line, a dashed rule of the same width, then one
    line per row. Rows carrying an "error" key are printed as
    ``<model> FAILED: <error>``.
    """
    header = f"{'model':40} {'layer':>5} {'F1':>6} {'FPR':>6} {'margin':>7}"

    def _render(row: dict) -> str:
        if "error" not in row:
            return (f"{row['model']:40} {row['layer']:>5} {row['f1']:>6.3f} "
                    f"{row['fpr']:>6.3f} {row['fisher_margin']:>7.3f}")
        return f"{row['model']:40} FAILED: {row['error']}"

    return "\n".join([header, "-" * len(header), *map(_render, rows)])
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""End-to-end u_ref construction: anchors + backbone -> evaluated artifact.
|
|
2
|
+
|
|
3
|
+
Pipeline:
|
|
4
|
+
1. embed harmful & benign anchors -> per-layer hidden states (one pass each)
|
|
5
|
+
2. select the layer with the cleanest separation (Fisher margin on a held-out split)
|
|
6
|
+
3. build u_ref = mean(harmful) - mean(benign) at that layer
|
|
7
|
+
4. calibrate tau (F1-optimal) on a calibration split
|
|
8
|
+
5. evaluate F1 / FPR on a held-out test split
|
|
9
|
+
6. emit (u_ref, card)
|
|
10
|
+
|
|
11
|
+
Steps 2 and 5 use disjoint splits so neither the chosen layer nor the reported
|
|
12
|
+
number is the product of fitting on the data it is scored on.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
from . import core
|
|
19
|
+
from .backbone import Backbone
|
|
20
|
+
from .card import UrefCard
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _norm_rows(x: np.ndarray, on: bool) -> np.ndarray:
|
|
24
|
+
if not on:
|
|
25
|
+
return x
|
|
26
|
+
return x / (np.linalg.norm(x, axis=-1, keepdims=True) + core.EPS)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def build_detector(
    backbone: Backbone,
    harmful: list[str],
    benign: list[str],
    *,
    layer: int | None = None,        # None -> auto-select; int -> force (e.g. -1 = final)
    normalize_anchors: bool = False,
    harmful_source: str = "AdvBench",
    benign_source: str = "benign_anchors_v1",
    eval_protocol: str = "anchor held-out split",
    seed: int = 0,
) -> tuple[np.ndarray, UrefCard]:
    """Build, calibrate and evaluate a u_ref vector for ``backbone``.

    Args:
        backbone: Source of per-layer hidden states for each anchor prompt.
        harmful: Harmful anchor prompts (must be non-empty).
        benign: Benign anchor prompts (must be non-empty).
        layer: ``None`` lets ``core.select_layer`` choose; an int forces that
            layer. Negative values index from the end (``-1`` = final layer).
        normalize_anchors: Unit-normalize every hidden state before use.
        harmful_source: Provenance label copied into the card.
        benign_source: Provenance label copied into the card.
        eval_protocol: Provenance label copied into the card.
        seed: Base seed; the four anchor splits use ``seed`` .. ``seed + 3``.

    Returns:
        ``(u_ref, card)``: the direction built at the chosen layer and the
        card recording the layer, tau, F1, FPR and Fisher margin.

    Raises:
        ValueError: if either anchor list is empty, or if a forced ``layer``
            lies outside ``[-L, L)`` for a backbone exposing ``L`` layers.
    """
    if len(harmful) == 0 or len(benign) == 0:
        raise ValueError(
            "build_detector needs at least one harmful and one benign anchor"
        )

    H = _norm_rows(backbone.batch_hidden_states(harmful), normalize_anchors)  # [nH, L, d]
    B = _norm_rows(backbone.batch_hidden_states(benign), normalize_anchors)   # [nB, L, d]
    L = H.shape[1]

    # ---- choose the reading layer -------------------------------------------------
    if layer is None:
        chosen, margins = core.select_layer(H, B, seed=seed)
        sel = "fisher"
    else:
        # A bare ``layer % L`` would silently map e.g. layer=L+2 onto layer 2 and
        # record a layer the caller never asked for; reject it instead.
        if not -L <= layer < L:
            raise ValueError(
                f"layer={layer} is out of range for a backbone exposing {L} hidden-state layers"
            )
        chosen = layer % L  # maps negative indices (e.g. -1) onto [0, L)
        margins = [float("nan")] * L
        sel = "forced"

    # ---- split anchors: build u_ref / calibrate tau / report on disjoint sets -----
    # NOTE(review): assumes core._split(x, 0.5, seed) is a seeded two-way split;
    # confirm in core. Each class is split fit/rest, then rest into cal/test.
    h_fit, h_rest = core._split(H[:, chosen], 0.5, seed)
    b_fit, b_rest = core._split(B[:, chosen], 0.5, seed + 1)
    h_cal, h_test = core._split(h_rest, 0.5, seed + 2)
    b_cal, b_test = core._split(b_rest, 0.5, seed + 3)

    u_ref = core.build_uref(h_fit, b_fit)

    # tau comes from the calibration scores; metrics and margin from the test scores.
    tau, _ = core.calibrate_tau(core.cosine(h_cal, u_ref), core.cosine(b_cal, u_ref))
    m = core.metrics_at(core.cosine(h_test, u_ref), core.cosine(b_test, u_ref), tau)
    margin = core.fisher_margin(core.cosine(h_test, u_ref), core.cosine(b_test, u_ref))

    card = UrefCard(
        model=backbone.name,
        model_revision=getattr(backbone, "revision", "unpinned"),
        layer=int(chosen),
        layer_selection=sel,
        fisher_margin=float(margin),
        harmful_source=harmful_source,
        harmful_n=len(harmful),
        benign_source=benign_source,
        benign_n=len(benign),
        position="last_prompt_token",
        use_system_prompt=getattr(backbone, "use_system_prompt", False),
        normalize_anchors=normalize_anchors,
        tau=float(tau),
        f1=float(m["f1"]),
        fpr=float(m["fpr"]),
        eval_protocol=eval_protocol,
        notes="F1/FPR are this library's measured numbers on the held-out anchor split, "
              "not the paper's 0.92 (which used a different, unspecified benign set).",
    )
    return u_ref, card
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""The u_ref *card* and *artifact*.
|
|
2
|
+
|
|
3
|
+
The card is the reproducibility contract: every knob the paper left open and that
|
|
4
|
+
changes the resulting vector lives here, so a u_ref is "Qwen2.5-1.5B, layer 14,
|
|
5
|
+
benign=benign_anchors_v1, tau=0.41, F1=0.xx", never a magic file.
|
|
6
|
+
|
|
7
|
+
The artifact bundles the card + the actual vector + the chosen layer + tau, as one
|
|
8
|
+
JSON the detector loads.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import dataclasses
|
|
13
|
+
import datetime as _dt
|
|
14
|
+
import json
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
SCHEMA_VERSION = "1"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclasses.dataclass
class UrefCard:
    """Provenance and metrics record stored next to a u_ref vector."""

    # Model identifier, e.g. "Qwen/Qwen2.5-1.5B-Instruct".
    model: str
    # HF commit hash, pin it.
    model_revision: str
    # Chosen layer index (auto-selected unless forced).
    layer: int
    # "fisher" | "heldout_f1" | "forced"
    layer_selection: str
    # Fisher margin at the chosen layer (build_detector measures it on the
    # held-out test-split cosine scores).
    fisher_margin: float
    # Harmful anchor source label, e.g. "AdvBench", and how many were used.
    harmful_source: str
    harmful_n: int
    # Benign anchor source label, e.g. "benign_anchors_v1", and how many were used.
    benign_source: str
    benign_n: int
    # Token position the hidden state was read at, e.g. "last_prompt_token".
    position: str
    # Whether a system prompt was present at extraction.
    use_system_prompt: bool
    # Whether anchors were unit-normalized before taking the class means.
    normalize_anchors: bool
    # Calibrated decision threshold.
    tau: float
    # Measured F1 (this library's number, NOT the paper's).
    f1: float
    # Benign false-positive rate.
    fpr: float
    # Evaluation protocol label, e.g. "JailbreakBench benign vs harmful, held-out".
    eval_protocol: str
    # Creation time as a timezone-aware UTC ISO-8601 string.
    created: str = dataclasses.field(
        default_factory=lambda: _dt.datetime.now(tz=_dt.timezone.utc).isoformat(),
    )
    schema_version: str = SCHEMA_VERSION
    notes: str = ""

    def to_dict(self) -> dict:
        """Return every field as a plain dict, in declaration order."""
        as_mapping = dataclasses.asdict(self)
        return as_mapping
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def save_artifact(path: str | Path, u_ref: np.ndarray, card: UrefCard) -> None:
    """Write the card, its layer and tau, and the vector to ``path`` as indented JSON.

    The vector is stored as a list of float64 values under "u_ref"; "layer" and
    "tau" are duplicated at the top level next to the full "card" dict.
    """
    payload = {}
    payload["card"] = card.to_dict()
    payload["layer"] = card.layer
    payload["tau"] = card.tau
    payload["u_ref"] = np.asarray(u_ref, dtype=np.float64).tolist()
    serialized = json.dumps(payload, indent=2)
    Path(path).write_text(serialized)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def load_artifact(path: str | Path) -> tuple[np.ndarray, UrefCard]:
    """Read an artifact JSON and return ``(u_ref as float64 array, UrefCard)``.

    The card is rebuilt by passing the stored "card" dict as keyword arguments.
    """
    raw = Path(path).read_text()
    payload = json.loads(raw)
    vector = np.asarray(payload["u_ref"], dtype=np.float64)
    return vector, UrefCard(**payload["card"])
|