aperture-gate 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aperture_gate-0.1.5/LICENSE +21 -0
- aperture_gate-0.1.5/MANIFEST.in +4 -0
- aperture_gate-0.1.5/PKG-INFO +146 -0
- aperture_gate-0.1.5/README.md +118 -0
- aperture_gate-0.1.5/aperture_gate/__init__.py +24 -0
- aperture_gate-0.1.5/aperture_gate/calibrate.py +317 -0
- aperture_gate-0.1.5/aperture_gate/cli.py +93 -0
- aperture_gate-0.1.5/aperture_gate/gate.py +555 -0
- aperture_gate-0.1.5/aperture_gate/hedge.py +193 -0
- aperture_gate-0.1.5/aperture_gate/mcp.py +209 -0
- aperture_gate-0.1.5/aperture_gate/verify.py +106 -0
- aperture_gate-0.1.5/aperture_gate.egg-info/PKG-INFO +146 -0
- aperture_gate-0.1.5/aperture_gate.egg-info/SOURCES.txt +24 -0
- aperture_gate-0.1.5/aperture_gate.egg-info/dependency_links.txt +1 -0
- aperture_gate-0.1.5/aperture_gate.egg-info/entry_points.txt +2 -0
- aperture_gate-0.1.5/aperture_gate.egg-info/requires.txt +3 -0
- aperture_gate-0.1.5/aperture_gate.egg-info/top_level.txt +1 -0
- aperture_gate-0.1.5/pyproject.toml +44 -0
- aperture_gate-0.1.5/setup.cfg +4 -0
- aperture_gate-0.1.5/tests/._test_gate.py +0 -0
- aperture_gate-0.1.5/tests/fixtures/._calibrate_demo.json +0 -0
- aperture_gate-0.1.5/tests/fixtures/._cert_371e95d23ba7.json +0 -0
- aperture_gate-0.1.5/tests/fixtures/calibrate_demo.json +1 -0
- aperture_gate-0.1.5/tests/fixtures/cert_371e95d23ba7.json +1 -0
- aperture_gate-0.1.5/tests/fixtures/cert_signed.json +1 -0
- aperture_gate-0.1.5/tests/test_gate.py +447 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 the Aperture project · honesty.tools
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aperture-gate
|
|
3
|
+
Version: 0.1.5
|
|
4
|
+
Summary: Deploy honesty.tools calibration certificates: gate model answers as ON_MAP / UNCERTAIN / OFF_MAP via a words-first refusal reader and a per-model logprob fingerprint probe.
|
|
5
|
+
Author-email: the Aperture project <hello@honesty.tools>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://honesty.tools
|
|
8
|
+
Project-URL: Documentation, https://honesty.tools/docs
|
|
9
|
+
Project-URL: Calibrate, https://honesty.tools/calibrate
|
|
10
|
+
Keywords: llm,hallucination,confabulation,calibration,logprobs,honesty,mcp
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Provides-Extra: verify
|
|
26
|
+
Requires-Dist: cryptography>=41; extra == "verify"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# aperture-gate
|
|
30
|
+
|
|
31
|
+
Deploy a [honesty.tools](https://honesty.tools) calibration certificate.
|
|
32
|
+
|
|
33
|
+
You calibrated a model at [honesty.tools/calibrate](https://honesty.tools/calibrate)
|
|
34
|
+
(or with the public `aperture_calibrate.py`) and got a certificate JSON —
|
|
35
|
+
`aperture.cert.v1` — containing a logistic probe fitted to *your model's*
|
|
36
|
+
logprob confidence fingerprint over a Wikipedia-validated battery of real and
|
|
37
|
+
fabricated entities. This package is the deployment half: it consumes that
|
|
38
|
+
certificate and gates live answers.
|
|
39
|
+
|
|
40
|
+
**The method, honestly stated.** Two instruments, words first:
|
|
41
|
+
|
|
42
|
+
1. **words** — if the answer itself refuses or hedges ("no such company",
|
|
43
|
+
"I couldn't find any record…"), the verdict is `OFF_MAP`. The model said
|
|
44
|
+
so; believe it.
|
|
45
|
+
2. **fingerprint** — otherwise, the gate scores
|
|
46
|
+
`[mean logprob, min logprob, mean top-5 entropy, max top-5 entropy]`
|
|
47
|
+
through the certificate's probe. Higher score = more likely fabricated.
|
|
48
|
+
`score >= off_map_thr` → `OFF_MAP`; `>= uncertain_thr` → `UNCERTAIN`;
|
|
49
|
+
else `ON_MAP`.
|
|
50
|
+
3. With no logprobs and no refusal, the read degrades honestly to
|
|
51
|
+
`ON_MAP` (instrument `words-only`).
|
|
52
|
+
|
|
53
|
+
The probe is calibrated **per model** — deploy the certificate for the model
|
|
54
|
+
you actually serve. It separates grounded answers from confabulated answers
|
|
55
|
+
about entities; it is not a general lie detector.
|
|
56
|
+
|
|
57
|
+
Zero runtime dependencies (pure stdlib), Python ≥ 3.9, MIT.
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install https://honesty.tools/sdk/aperture_gate-0.1.5-py3-none-any.whl
|
|
63
|
+
|
|
64
|
+
# optional — verify certificate signatures against the registry key (adds `cryptography`):
|
|
65
|
+
pip install "aperture-gate[verify] @ https://honesty.tools/sdk/aperture_gate-0.1.5-py3-none-any.whl"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**Certificate trust.** Registry certificates are ed25519-signed. With the `[verify]` extra
|
|
69
|
+
installed, `Gate.from_cert(...)` verifies the signature against the registry's pinned public
|
|
70
|
+
key and **rejects a tampered or wrongly-signed certificate** (raises `ValueError`); every
|
|
71
|
+
verdict carries `signature_verified` (`true`/`false`/`null`). Without `[verify]` the gate still
|
|
72
|
+
works — it just notes that the signature was not verified. Expired certificates always warn.
|
|
73
|
+
|
|
74
|
+
## Quickstart
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from aperture_gate import Gate
|
|
78
|
+
|
|
79
|
+
# a registry id, a local cert path, or the cert dict itself
|
|
80
|
+
gate = Gate.from_cert("openai/gpt-4o-mini") # openai/gpt-4o-mini reference cert
|
|
81
|
+
|
|
82
|
+
# 1) words-only read of any text
|
|
83
|
+
gate.read_text("I couldn't find any record of that company.")
|
|
84
|
+
# {'verdict': 'OFF_MAP', 'band': 'off-map', 'instrument': 'words', ...}
|
|
85
|
+
|
|
86
|
+
# 2) read a raw chat.completions response (ask for logprobs yourself)
|
|
87
|
+
verdict = gate.read_response(resp_dict_or_openai_object)
|
|
88
|
+
|
|
89
|
+
# 3) let the gate call the endpoint (sends logprobs=true, top_logprobs=5, temperature=0)
|
|
90
|
+
out = gate.ask("Tell me about the novel Glass over Brackwald.",
|
|
91
|
+
base_url="https://openrouter.ai/api/v1", api_key="sk-or-...")
|
|
92
|
+
print(out["verdict"], out["score"], out["answer"][:80])
|
|
93
|
+
|
|
94
|
+
# 4) or wrap an openai-python client (openai>=1.x; needs `from openai import OpenAI`) transparently
|
|
95
|
+
client = gate.wrap(OpenAI())
|
|
96
|
+
r = client.chat.completions.create(model="gpt-4o-mini",
|
|
97
|
+
messages=[{"role": "user", "content": "..."}])
|
|
98
|
+
print(r.aperture) # the verdict dict rides along on the response
|
|
99
|
+
|
|
100
|
+
# 5) gate a STREAM in real time — the verdict lands before the answer finishes.
|
|
101
|
+
# Chunks pass through to your loop unchanged; the gate scores the calibrated
|
|
102
|
+
# window prefix as it arrives and (optionally) ABORTS off-map answers mid-stream.
|
|
103
|
+
client = gate.wrap(OpenAI(), on_verdict=lambda v: v["verdict"] == "OFF_MAP")
|
|
104
|
+
stream = client.chat.completions.create(model="gpt-4o-mini", stream=True,
|
|
105
|
+
messages=[{"role": "user", "content": "..."}])
|
|
106
|
+
for chunk in stream:
|
|
107
|
+
print(chunk.choices[0].delta.content or "", end="") # stops early if it reads off-map
|
|
108
|
+
print(stream.aperture) # the verdict, scored on the first `window` tokens
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## CLI
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
aperture-gate read --cert openai/gpt-4o-mini --text "No such film exists."
|
|
115
|
+
aperture-gate read --cert ./aperture-cert-my-model.json \
|
|
116
|
+
--query "Who founded Brindlewick & Thorne?" \
|
|
117
|
+
--base-url http://localhost:8000/v1 --key sk-...
|
|
118
|
+
aperture-gate calibrate --base-url http://localhost:8000/v1 --model my-llama-70b
|
|
119
|
+
aperture-gate verify # conformance self-test against frozen vectors
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## MCP server
|
|
123
|
+
|
|
124
|
+
A pure-stdlib stdio MCP server, for Claude Desktop / Claude Code / any MCP client:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
python -m aperture_gate.mcp
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Tools: `aperture_read {query, model?}` (ask the upstream model and gate the
|
|
131
|
+
answer) and `aperture_check_text {text}` (words-only read, no model call).
|
|
132
|
+
Configure with `APERTURE_BASE_URL` (default `https://openrouter.ai/api/v1`),
|
|
133
|
+
`APERTURE_UPSTREAM_KEY` or `OPENROUTER_API_KEY`, and `APERTURE_CERT`
|
|
134
|
+
(path or registry id; default `openai/gpt-4o-mini`).
|
|
135
|
+
|
|
136
|
+
## Conformance
|
|
137
|
+
|
|
138
|
+
`aperture-gate verify` (or `python -m aperture_gate.verify`) runs six frozen
|
|
139
|
+
vectors computed from the real gpt-4o-mini registry certificate and the
|
|
140
|
+
public demo battery — three fingerprint scores to six decimal places and
|
|
141
|
+
three refusal-reader reads pinning the normalization and guard logic. If any
|
|
142
|
+
vector fails, the install does not implement the certified method.
|
|
143
|
+
|
|
144
|
+
## Docs
|
|
145
|
+
|
|
146
|
+
Full method, evidence, and the certificate registry: [honesty.tools/docs](https://honesty.tools/docs)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# aperture-gate
|
|
2
|
+
|
|
3
|
+
Deploy a [honesty.tools](https://honesty.tools) calibration certificate.
|
|
4
|
+
|
|
5
|
+
You calibrated a model at [honesty.tools/calibrate](https://honesty.tools/calibrate)
|
|
6
|
+
(or with the public `aperture_calibrate.py`) and got a certificate JSON —
|
|
7
|
+
`aperture.cert.v1` — containing a logistic probe fitted to *your model's*
|
|
8
|
+
logprob confidence fingerprint over a Wikipedia-validated battery of real and
|
|
9
|
+
fabricated entities. This package is the deployment half: it consumes that
|
|
10
|
+
certificate and gates live answers.
|
|
11
|
+
|
|
12
|
+
**The method, honestly stated.** Two instruments, words first:
|
|
13
|
+
|
|
14
|
+
1. **words** — if the answer itself refuses or hedges ("no such company",
|
|
15
|
+
"I couldn't find any record…"), the verdict is `OFF_MAP`. The model said
|
|
16
|
+
so; believe it.
|
|
17
|
+
2. **fingerprint** — otherwise, the gate scores
|
|
18
|
+
`[mean logprob, min logprob, mean top-5 entropy, max top-5 entropy]`
|
|
19
|
+
through the certificate's probe. Higher score = more likely fabricated.
|
|
20
|
+
`score >= off_map_thr` → `OFF_MAP`; `>= uncertain_thr` → `UNCERTAIN`;
|
|
21
|
+
else `ON_MAP`.
|
|
22
|
+
3. With no logprobs and no refusal, the read degrades honestly to
|
|
23
|
+
`ON_MAP` (instrument `words-only`).
|
|
24
|
+
|
|
25
|
+
The probe is calibrated **per model** — deploy the certificate for the model
|
|
26
|
+
you actually serve. It separates grounded answers from confabulated answers
|
|
27
|
+
about entities; it is not a general lie detector.
|
|
28
|
+
|
|
29
|
+
Zero runtime dependencies (pure stdlib), Python ≥ 3.9, MIT.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install https://honesty.tools/sdk/aperture_gate-0.1.5-py3-none-any.whl
|
|
35
|
+
|
|
36
|
+
# optional — verify certificate signatures against the registry key (adds `cryptography`):
|
|
37
|
+
pip install "aperture-gate[verify] @ https://honesty.tools/sdk/aperture_gate-0.1.5-py3-none-any.whl"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**Certificate trust.** Registry certificates are ed25519-signed. With the `[verify]` extra
|
|
41
|
+
installed, `Gate.from_cert(...)` verifies the signature against the registry's pinned public
|
|
42
|
+
key and **rejects a tampered or wrongly-signed certificate** (raises `ValueError`); every
|
|
43
|
+
verdict carries `signature_verified` (`true`/`false`/`null`). Without `[verify]` the gate still
|
|
44
|
+
works — it just notes that the signature was not verified. Expired certificates always warn.
|
|
45
|
+
|
|
46
|
+
## Quickstart
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from aperture_gate import Gate
|
|
50
|
+
|
|
51
|
+
# a registry id, a local cert path, or the cert dict itself
|
|
52
|
+
gate = Gate.from_cert("openai/gpt-4o-mini") # openai/gpt-4o-mini reference cert
|
|
53
|
+
|
|
54
|
+
# 1) words-only read of any text
|
|
55
|
+
gate.read_text("I couldn't find any record of that company.")
|
|
56
|
+
# {'verdict': 'OFF_MAP', 'band': 'off-map', 'instrument': 'words', ...}
|
|
57
|
+
|
|
58
|
+
# 2) read a raw chat.completions response (ask for logprobs yourself)
|
|
59
|
+
verdict = gate.read_response(resp_dict_or_openai_object)
|
|
60
|
+
|
|
61
|
+
# 3) let the gate call the endpoint (sends logprobs=true, top_logprobs=5, temperature=0)
|
|
62
|
+
out = gate.ask("Tell me about the novel Glass over Brackwald.",
|
|
63
|
+
base_url="https://openrouter.ai/api/v1", api_key="sk-or-...")
|
|
64
|
+
print(out["verdict"], out["score"], out["answer"][:80])
|
|
65
|
+
|
|
66
|
+
# 4) or wrap an openai-python client (openai>=1.x; needs `from openai import OpenAI`) transparently
|
|
67
|
+
client = gate.wrap(OpenAI())
|
|
68
|
+
r = client.chat.completions.create(model="gpt-4o-mini",
|
|
69
|
+
messages=[{"role": "user", "content": "..."}])
|
|
70
|
+
print(r.aperture) # the verdict dict rides along on the response
|
|
71
|
+
|
|
72
|
+
# 5) gate a STREAM in real time — the verdict lands before the answer finishes.
|
|
73
|
+
# Chunks pass through to your loop unchanged; the gate scores the calibrated
|
|
74
|
+
# window prefix as it arrives and (optionally) ABORTS off-map answers mid-stream.
|
|
75
|
+
client = gate.wrap(OpenAI(), on_verdict=lambda v: v["verdict"] == "OFF_MAP")
|
|
76
|
+
stream = client.chat.completions.create(model="gpt-4o-mini", stream=True,
|
|
77
|
+
messages=[{"role": "user", "content": "..."}])
|
|
78
|
+
for chunk in stream:
|
|
79
|
+
print(chunk.choices[0].delta.content or "", end="") # stops early if it reads off-map
|
|
80
|
+
print(stream.aperture) # the verdict, scored on the first `window` tokens
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## CLI
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
aperture-gate read --cert openai/gpt-4o-mini --text "No such film exists."
|
|
87
|
+
aperture-gate read --cert ./aperture-cert-my-model.json \
|
|
88
|
+
--query "Who founded Brindlewick & Thorne?" \
|
|
89
|
+
--base-url http://localhost:8000/v1 --key sk-...
|
|
90
|
+
aperture-gate calibrate --base-url http://localhost:8000/v1 --model my-llama-70b
|
|
91
|
+
aperture-gate verify # conformance self-test against frozen vectors
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## MCP server
|
|
95
|
+
|
|
96
|
+
A pure-stdlib stdio MCP server, for Claude Desktop / Claude Code / any MCP client:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
python -m aperture_gate.mcp
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Tools: `aperture_read {query, model?}` (ask the upstream model and gate the
|
|
103
|
+
answer) and `aperture_check_text {text}` (words-only read, no model call).
|
|
104
|
+
Configure with `APERTURE_BASE_URL` (default `https://openrouter.ai/api/v1`),
|
|
105
|
+
`APERTURE_UPSTREAM_KEY` or `OPENROUTER_API_KEY`, and `APERTURE_CERT`
|
|
106
|
+
(path or registry id; default `openai/gpt-4o-mini`).
|
|
107
|
+
|
|
108
|
+
## Conformance
|
|
109
|
+
|
|
110
|
+
`aperture-gate verify` (or `python -m aperture_gate.verify`) runs six frozen
|
|
111
|
+
vectors computed from the real gpt-4o-mini registry certificate and the
|
|
112
|
+
public demo battery — three fingerprint scores to six decimal places and
|
|
113
|
+
three refusal-reader reads pinning the normalization and guard logic. If any
|
|
114
|
+
vector fails, the install does not implement the certified method.
|
|
115
|
+
|
|
116
|
+
## Docs
|
|
117
|
+
|
|
118
|
+
Full method, evidence, and the certificate registry: [honesty.tools/docs](https://honesty.tools/docs)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""aperture-gate: deploy honesty.tools calibration certificates.
|
|
4
|
+
|
|
5
|
+
A team calibrates a model at https://honesty.tools/calibrate and downloads a
|
|
6
|
+
certificate (``aperture.cert.v1``) containing a logistic probe over the
|
|
7
|
+
model's logprob confidence fingerprint. This package consumes that
|
|
8
|
+
certificate and gates live answers: words-first refusal reading, then the
|
|
9
|
+
fingerprint probe, yielding ON_MAP / UNCERTAIN / OFF_MAP verdicts.
|
|
10
|
+
|
|
11
|
+
Quickstart::
|
|
12
|
+
|
|
13
|
+
from aperture_gate import Gate
|
|
14
|
+
gate = Gate.from_cert("371e95d23ba7") # registry id, path, or dict
|
|
15
|
+
gate.read_text("I couldn't find any record of that company.")
|
|
16
|
+
# -> {'verdict': 'OFF_MAP', 'instrument': 'words', ...}
|
|
17
|
+
|
|
18
|
+
Zero runtime dependencies (stdlib only), Python >= 3.9.
|
|
19
|
+
"""
|
|
20
|
+
from .gate import Gate
|
|
21
|
+
from .hedge import read_hedge
|
|
22
|
+
|
|
23
|
+
__version__ = "0.1.5"
|
|
24
|
+
__all__ = ["Gate", "read_hedge", "__version__"]
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""Calibrate the Aperture honesty layer to YOUR model — vendored CLI.
|
|
4
|
+
|
|
5
|
+
This is the public ``https://honesty.tools/aperture_calibrate.py`` adapted as
|
|
6
|
+
a package module so ``aperture-gate calibrate --base-url ... --model ...``
|
|
7
|
+
works; the behavior is identical. The shared math (the refusal reader and the
|
|
8
|
+
feature/scoring functions) lives in :mod:`aperture_gate.hedge` and
|
|
9
|
+
:mod:`aperture_gate.gate` — byte-for-byte the same code — and is imported
|
|
10
|
+
here rather than duplicated.
|
|
11
|
+
|
|
12
|
+
What it does (the exact deployed method, in the open):
|
|
13
|
+
1. pulls the current public battery from honesty.tools (84 real / 84
|
|
14
|
+
fabricated entities, every one validated against Wikipedia; the battery
|
|
15
|
+
hash binds the certificate)
|
|
16
|
+
2. asks your model all 168 questions (max_tokens=24, temperature=0,
|
|
17
|
+
logprobs+top-5 if exposed)
|
|
18
|
+
3. the refusal reader catches fakes your model declines IN WORDS
|
|
19
|
+
4. a logistic probe on [mean logprob, min logprob, mean top-5 entropy,
|
|
20
|
+
max top-5 entropy] separates grounded answers from confabulations —
|
|
21
|
+
5-fold CV AUROC, P95-anchored thresholds
|
|
22
|
+
5. a shuffled-label permutation null tries to kill the signal; the verdict
|
|
23
|
+
lands honestly: CALIBRATED / WORDS-DOMINANT / REFUSAL-READER (no
|
|
24
|
+
logprobs) / PROBE-WEAK
|
|
25
|
+
6. writes aperture-cert-<model>.json — the certificate plus the deployable
|
|
26
|
+
probe
|
|
27
|
+
|
|
28
|
+
Nothing leaves your machine except calls to YOUR endpoint — unless you pass
|
|
29
|
+
``--register``, which uploads only the finished certificate (never the
|
|
30
|
+
prompts, answers, or key) to the public registry.
|
|
31
|
+
"""
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import argparse
|
|
35
|
+
import concurrent.futures as cf
|
|
36
|
+
import json
|
|
37
|
+
import math
|
|
38
|
+
import random
|
|
39
|
+
import re
|
|
40
|
+
import sys
|
|
41
|
+
import urllib.error
|
|
42
|
+
import urllib.request
|
|
43
|
+
from typing import List, Optional
|
|
44
|
+
|
|
45
|
+
from .gate import feats_from, score_one, sigmoid
|
|
46
|
+
from .hedge import refused
|
|
47
|
+
|
|
48
|
+
SITE = "https://honesty.tools"
|
|
49
|
+
SEED = 20260610
|
|
50
|
+
MIN_CONFAB, AUROC_PASS, NPERM = 8, 0.70, 120
|
|
51
|
+
FIT = {"alpha": 0.008, "iters": 1000, "lr": 0.5} # parity-locked vs the deployed sklearn re-fit
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ── the fit (pure-python port, parity-locked vs sklearn: AUROC 0.968 == 0.968 on the reference) ──
|
|
55
|
+
def standardize_fit(X):
    """Compute per-column standardization parameters for a row-major matrix.

    Returns ``(mean, scale)``: two lists with one entry per column of ``X``.
    ``scale`` is the population standard deviation (divides by the row count);
    a column whose variance is at most 1e-24 gets a scale of 1.0 so that later
    division by the scale is safe.
    """
    rows = len(X)
    width = len(X[0])
    mean = []
    scale = []
    for col in range(width):
        mu = sum(row[col] for row in X) / rows
        variance = sum((row[col] - mu) ** 2 for row in X) / rows
        mean.append(mu)
        if variance > 1e-24:
            scale.append(math.sqrt(variance))
        else:
            # (near-)constant column: leave it unscaled
            scale.append(1.0)
    return mean, scale
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def fit_logreg(Z, y):
    """Fit a logistic regression by full-batch gradient descent.

    ``Z`` is a row-major feature matrix and ``y`` the matching 0/1 labels.
    The step count, learning rate and L2 penalty come from the module-level
    ``FIT`` settings; the penalty is applied to the weights only, not to the
    intercept. Returns ``(weights, intercept)``.
    """
    n_rows, n_feats = len(Z), len(Z[0])
    cols = range(n_feats)
    weights = [0.0] * n_feats
    bias = 0.0
    for _step in range(FIT["iters"]):
        grad_w = [0.0] * n_feats
        grad_b = 0.0
        for i, row in enumerate(Z):
            # residual of the current prediction for this sample
            err = sigmoid(bias + sum(weights[j] * row[j] for j in cols)) - y[i]
            grad_b += err
            for j in cols:
                grad_w[j] += err * row[j]
        bias -= FIT["lr"] * (grad_b / n_rows)
        weights = [
            wj - FIT["lr"] * (gj / n_rows + FIT["alpha"] * wj)
            for wj, gj in zip(weights, grad_w)
        ]
    return weights, bias
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def auroc(scores, y):
    """Area under the ROC curve via the rank-sum (Mann-Whitney) formula.

    Tied scores share the average of the 1-based ranks they occupy. Labels
    equal to 1 are positives and labels equal to 0 are negatives. Returns NaN
    when either class is absent.
    """
    order = sorted(range(len(scores)), key=lambda idx: scores[idx])
    total = len(order)
    rank = [0.0] * total
    start = 0
    while start < total:
        tied_value = scores[order[start]]
        stop = start
        # extend the run over every following entry with the same score
        while stop + 1 < total and scores[order[stop + 1]] == tied_value:
            stop += 1
        shared_rank = (start + stop) / 2 + 1
        for member in order[start:stop + 1]:
            rank[member] = shared_rank
        start = stop + 1

    pos = [idx for idx, label in enumerate(y) if label == 1]
    neg = [idx for idx, label in enumerate(y) if label == 0]
    if not (pos and neg):
        return float("nan")
    n_pos = len(pos)
    rank_sum = sum(rank[idx] for idx in pos)
    return (rank_sum - n_pos * (n_pos + 1) / 2) / (n_pos * len(neg))
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def percentile(arr, p):
    """Return the ``p``-th percentile (0-100) of ``arr`` by linear interpolation.

    The values are sorted and the percentile position ``p/100 * (len - 1)`` is
    interpolated between its two neighbouring entries. A single-element input
    returns that element unchanged.
    """
    ordered = sorted(arr)
    if len(ordered) == 1:
        return ordered[0]
    position = (p / 100) * (len(ordered) - 1)
    below = int(math.floor(position))
    above = int(math.ceil(position))
    low_value = ordered[below]
    return low_value + (ordered[above] - low_value) * (position - below)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def cv_scores(X, y, k, rng):
    """Return out-of-fold probe scores from a stratified ``k``-fold split.

    Positive (label 1) and negative (label 0) indices are shuffled separately
    with ``rng`` and dealt round-robin into ``k`` folds. For each fold a probe
    is standardized and fitted on the remaining samples, then used to score the
    held-out ones. Samples that are never scored keep the neutral value 0.5.
    """
    total = len(y)
    positives = [idx for idx in range(total) if y[idx] == 1]
    negatives = [idx for idx in range(total) if y[idx] == 0]
    # positives are shuffled before negatives so the rng stream is reproducible
    rng.shuffle(positives)
    rng.shuffle(negatives)

    folds = [[] for _ in range(k)]
    for members in (positives, negatives):
        for slot, idx in enumerate(members):
            folds[slot % k].append(idx)

    oof = [0.5] * total
    for fold in folds:
        held_out = set(fold)
        kept = [idx for idx in range(total) if idx not in held_out]
        if not kept:
            continue
        train_X = [X[idx] for idx in kept]
        train_y = [y[idx] for idx in kept]
        mean, scale = standardize_fit(train_X)
        Z = [[(row[j] - mean[j]) / scale[j] for j in range(4)] for row in train_X]
        coef, intercept = fit_logreg(Z, train_y)
        probe = {"mean": mean, "scale": scale, "coef": coef, "intercept": intercept}
        for idx in held_out:
            oof[idx] = score_one(X[idx], probe)
    return oof
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# ── plumbing ─────────────────────────────────────────────────────────────────────────────────────
|
|
134
|
+
def http_json(url, payload=None, headers=None, timeout=60):
    """Send a JSON request to ``url`` and return the decoded JSON reply.

    When ``payload`` is given it is serialized and sent as the request body;
    otherwise the request carries no body. ``headers`` are merged over the
    default ``Content-Type: application/json``. Errors raised by ``urllib`` or
    by JSON decoding propagate to the caller.
    """
    body = None if payload is None else json.dumps(payload).encode()
    merged = {"Content-Type": "application/json"}
    merged.update(headers or {})
    request = urllib.request.Request(url, data=body, headers=merged)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        raw = resp.read()
    return json.loads(raw.decode())
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def get_battery(path=None):
    """Load the calibration battery and return it as a dict.

    Args:
        path: optional path to a local ``battery.js``. When falsy, the current
            battery is fetched from ``SITE + "/battery.js"``.

    Returns:
        The JSON object assigned to ``window.APERTURE_BATTERY`` in the script.

    Raises:
        ValueError: if the script contains no ``window.APERTURE_BATTERY = {...};``
            assignment, or (from ``json.loads``) if the object is not valid JSON.
        OSError / urllib.error.URLError: if the file or URL cannot be read.
    """
    if path:
        # context manager so the handle is closed deterministically
        with open(path, encoding="utf-8") as fh:
            raw = fh.read()
    else:
        with urllib.request.urlopen(SITE + "/battery.js", timeout=30) as r:
            raw = r.read().decode()
    m = re.search(r"window\.APERTURE_BATTERY\s*=\s*(\{.*\});", raw, re.S)
    if m is None:
        # fail with a clear message instead of AttributeError on None.group
        source = path or (SITE + "/battery.js")
        raise ValueError(f"no window.APERTURE_BATTERY assignment found in {source}")
    return json.loads(m.group(1))
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def call_model(base, key, model, q):
    """Ask ``model`` the question ``q`` on an OpenAI-compatible endpoint.

    ``base`` may be the API base URL or the full ``/chat/completions`` URL.
    The request asks for logprobs with the top-5 alternatives. On an HTTP 400
    the same question is retried once without logprobs (words-only). Up to
    three attempts are made.

    Returns ``{"text": ..., "feats": ...}`` on success (``feats`` is ``None``
    for a words-only reply), or ``{"error": <short message>}`` when every
    attempt failed.
    """
    endpoint = base.rstrip("/")
    if not endpoint.endswith("/chat/completions"):
        endpoint = endpoint + "/chat/completions"
    messages = [{"role": "user", "content": q}]
    probe_request = {"model": model, "messages": messages,
                     "max_tokens": 24, "temperature": 0, "logprobs": True, "top_logprobs": 5}
    auth = {}
    if key:
        auth["Authorization"] = "Bearer " + key

    failure = "?"
    for _try in range(3):
        try:
            reply = http_json(endpoint, probe_request, auth)
            choice = reply["choices"][0]
            text = (choice.get("message") or {}).get("content") or ""
            token_logprobs = (choice.get("logprobs") or {}).get("content")
            return {"text": text, "feats": feats_from(token_logprobs)}
        except urllib.error.HTTPError as http_err:
            if http_err.code != 400:
                failure = f"HTTP {http_err.code}"
                continue
            # logprobs unsupported on some servers -> words-only retry
            words_only = {"model": model, "messages": messages,
                          "max_tokens": 40, "temperature": 0}
            try:
                reply = http_json(endpoint, words_only, auth)
                return {"text": reply["choices"][0]["message"]["content"] or "", "feats": None}
            except Exception as retry_err:  # noqa: BLE001
                failure = str(retry_err)[:80]
        except Exception as err:  # noqa: BLE001
            failure = str(err)[:80]
    return {"error": failure}
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def main(argv: Optional[List[str]] = None) -> None:
    """Run the calibration CLI end to end and write the certificate JSON.

    Steps: parse arguments, load the battery, smoke-test the endpoint, ask the
    model every battery question, run the refusal reader, and (when logprob
    features are available for enough confabulations) fit and validate the
    fingerprint probe. The certificate is written to ``--out`` and, with
    ``--register``, uploaded to the registry.

    Args:
        argv: argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Raises:
        SystemExit: if the endpoint cannot be reached or more than 20% of the
            battery questions fail to collect.
    """
    ap = argparse.ArgumentParser(prog="aperture-gate calibrate",
                                 description="Calibrate the Aperture honesty layer to your model.")
    ap.add_argument("--base-url", required=True, help="your OpenAI-compatible /v1 base URL")
    ap.add_argument("--key", default="", help="API key if your endpoint needs one (sent ONLY to your endpoint)")
    ap.add_argument("--model", required=True, help="the model id your server hosts")
    ap.add_argument("--workers", type=int, default=4)
    ap.add_argument("--battery", default=None, help="local battery.js (defaults to fetching the current one)")
    ap.add_argument("--out", default=None, help="output cert path (default aperture-cert-<model>.json)")
    ap.add_argument("--register", action="store_true",
                    help="publish the finished certificate to the honesty.tools registry (self-attested)")
    a = ap.parse_args(argv)

    bat = get_battery(a.battery)
    items = bat["items"]
    print(f"battery {bat['hash']} — {len(items)} items ({bat['n_real']} real / {bat['n_fake']} fabricated)")
    print(f"model {a.model} @ {a.base_url}")

    # one cheap question first: checks reachability and whether logprobs come back
    cap = call_model(a.base_url, a.key, a.model, "What is the capital of France?")
    if "error" in cap:
        sys.exit(f"cannot reach the endpoint: {cap['error']}")
    print(f"logprobs exposed: {'yes' if cap.get('feats') else 'NO — words-only calibration'}\n")

    # ── collection: ex.map preserves battery order, so zip pairs each item with its reply ──
    rows, done = [], 0
    with cf.ThreadPoolExecutor(max_workers=a.workers) as ex:
        for it, r in zip(items, ex.map(lambda it: call_model(a.base_url, a.key, a.model, it["q"]), items)):
            done += 1
            if "error" in r:
                rows.append({**it, "text": "", "feats": None, "refused": False, "error": r["error"]})
            else:
                rows.append({**it, "text": r["text"], "feats": r["feats"], "refused": refused(r["text"])})
            if done % 24 == 0:
                print(f" {done}/{len(items)}")
    errs = sum(1 for r in rows if r.get("error"))
    if errs > len(items) * 0.2:
        sys.exit(f"too many collection errors ({errs}/{len(items)}) — check the endpoint and retry")

    # ── refusal reader (words) statistics ──
    ok_rows = [r for r in rows if not r.get("error")]
    reals_all = [r for r in ok_rows if r["label"] == 0]
    fakes_all = [r for r in ok_rows if r["label"] == 1]
    refused_fakes = [r for r in fakes_all if r["refused"]]
    false_ref = [r for r in reals_all if r["refused"]]
    coverage = len(refused_fakes) / max(1, len(fakes_all))
    cert = {"schema": "aperture.cert.v1", "grade": "self-attested", "model": a.model,
            "method": "logprob confidence-fingerprint probe (reals vs confabulated fakes) + words refusal reader",
            "battery": {"hash": bat["hash"], "version": bat["version"],
                        "n_real": len(reals_all), "n_fake": len(fakes_all)},
            "refusal_reader": {"fakes_refused": len(refused_fakes), "coverage": round(coverage, 3),
                               "false_refusals_on_reals": len(false_ref)},
            "fingerprint_probe": None, "combined": None, "samples": []}

    # probe training set: reals with features vs. fakes the model answered (did not refuse)
    reals = [r for r in reals_all if r["feats"]]
    confab = [r for r in fakes_all if r["feats"] and not r["refused"]]
    if not [r for r in ok_rows if r["feats"]]:
        # no logprob features at all: only the words instrument is available
        cert["verdict"] = "REFUSAL-READER"
        cert["combined"] = {"detect_rate": round(coverage, 3),
                            "fp_rate": round(len(false_ref) / max(1, len(reals_all)), 3)}
    elif len(confab) < MIN_CONFAB:
        # too few confabulations to fit a probe on
        cert["verdict"] = "WORDS-DOMINANT"
        cert["combined"] = {"detect_rate": round(coverage, 3),
                            "fp_rate": round(len(false_ref) / max(1, len(reals_all)), 3)}
    else:
        X = [r["feats"] for r in reals + confab]
        y = [0] * len(reals) + [1] * len(confab)
        oof = cv_scores(X, y, 5, random.Random(SEED))
        au = auroc(oof, y)
        # thresholds are anchored to the 95th percentile of the reals' out-of-fold scores
        p95 = percentile([oof[i] for i in range(len(y)) if y[i] == 0], 95)
        un, off = round(p95 + 0.10, 3), round(p95 + 0.22, 3)
        rng = random.Random(SEED)
        # bootstrap CI of the AUROC over resampled out-of-fold scores
        boots = []
        for _ in range(400):
            ix = [rng.randrange(len(y)) for _ in range(len(y))]
            aa = auroc([oof[i] for i in ix], [y[i] for i in ix])
            if aa == aa:  # skip NaN (a resample containing a single class)
                boots.append(aa)
        ci = [round(percentile(boots, 2.5), 3), round(percentile(boots, 97.5), 3)]
        # permutation null: re-run the CV with shuffled labels
        null = []
        print(f"\nnull test ({NPERM} label permutations)…")
        for pi in range(NPERM):
            ys = y[:]
            rng.shuffle(ys)
            o2 = cv_scores(X, ys, 5, random.Random(SEED + pi))
            aa = auroc(o2, ys)
            if aa == aa:
                null.append(aa)
        nmean = sum(null) / len(null)
        nsd = math.sqrt(sum((v - nmean) ** 2 for v in null) / len(null)) or 1e-9
        z = (au - nmean) / nsd
        pval = (sum(1 for v in null if v >= au) + 1) / (len(null) + 1)
        # the deployable probe is fitted on all samples
        mean, scale = standardize_fit(X)
        Z = [[(r[j] - mean[j]) / scale[j] for j in range(4)] for r in X]
        w, b = fit_logreg(Z, y)
        probe = {"mean": [round(v, 6) for v in mean], "scale": [round(v, 6) for v in scale],
                 "coef": [round(v, 6) for v in w], "intercept": round(b, 6),
                 "uncertain_thr": un, "off_map_thr": off, "cv_auroc": round(au, 4)}
        caught = len(refused_fakes) + sum(1 for r in confab if score_one(r["feats"], probe) >= off)
        fp = sum(1 for r in reals if r["refused"] or score_one(r["feats"], probe) >= off)
        cert["verdict"] = "CALIBRATED" if (au >= AUROC_PASS and pval < 0.05) else "PROBE-WEAK"
        cert["fingerprint_probe"] = {"cv_auroc": round(au, 4), "ci95": ci, "n_confabulations": len(confab),
                                     "null": {"mean": round(nmean, 3), "z": round(z, 2), "p": round(pval, 5)},
                                     "probe": probe}
        cert["combined"] = {"detect_rate": round(caught / max(1, len(fakes_all)), 3),
                            "fp_rate": round(fp / max(1, len(reals)), 3)}
        cc = [r for r in confab if score_one(r["feats"], probe) >= off]
        cert["samples"] = ([{"q": r["q"], "answer": (r["text"] or "")[:120], "caught_by": "words", "score": None}
                            for r in refused_fakes[:1]] +
                           [{"q": r["q"], "answer": (r["text"] or "")[:120], "caught_by": "fingerprint",
                             "score": round(score_one(r["feats"], probe), 3)} for r in cc[:2]])

    out = a.out or ("aperture-cert-" + re.sub(r"[^A-Za-z0-9]+", "-", a.model) + ".json")
    # write through a context manager so the certificate is flushed and closed
    # before its path is reported (and before any registry upload)
    with open(out, "w", encoding="utf-8") as fh:
        json.dump(cert, fh, indent=2, ensure_ascii=False)
    print("\n" + "=" * 64)
    print(f" VERDICT {cert['verdict']}")
    print(f" refusal reader {len(refused_fakes)}/{len(fakes_all)} fakes caught in words "
          f"({round(100 * coverage)}%), {len(false_ref)} false on reals")
    if cert["fingerprint_probe"]:
        fpd = cert["fingerprint_probe"]
        print(f" fingerprint AUROC {fpd['cv_auroc']} CI {fpd['ci95']} "
              f"null z={fpd['null']['z']} p={fpd['null']['p']}")
    if cert["combined"]:
        print(f" combined {round(100 * cert['combined']['detect_rate'])}% of fabrications caught "
              f"at {round(100 * cert['combined']['fp_rate'])}% real-FP")
    print(f" certificate {out}")
    print("=" * 64)
    if a.register:
        # best effort: a registry failure is reported, never fatal
        try:
            res = http_json(SITE + "/api/cert/register", cert)
            if res.get("id"):
                print(f"\nregistered: {SITE}/cert/{res['id']}")
            else:
                print(f"\nregistry said: {res.get('error', res)}")
        except Exception as e:  # noqa: BLE001
            print(f"\ncould not register: {e}")
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
if __name__ == "__main__":
|
|
317
|
+
main()
|