proofbundle 0.7.0__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {proofbundle-0.7.0/src/proofbundle.egg-info → proofbundle-0.8.0}/PKG-INFO +47 -14
- {proofbundle-0.7.0 → proofbundle-0.8.0}/README.md +44 -11
- {proofbundle-0.7.0 → proofbundle-0.8.0}/pyproject.toml +5 -4
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/__init__.py +1 -1
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/adapters/inspect_ai.py +15 -1
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/bundle.py +46 -6
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/evalclaim.py +17 -6
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/intoto.py +17 -16
- {proofbundle-0.7.0 → proofbundle-0.8.0/src/proofbundle.egg-info}/PKG-INFO +47 -14
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/SOURCES.txt +2 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/requires.txt +4 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_adapters.py +2 -0
- proofbundle-0.8.0/tests/test_bundle_robustness.py +74 -0
- proofbundle-0.8.0/tests/test_examples.py +28 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/LICENSE +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/setup.cfg +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/adapters/__init__.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/adapters/lm_eval.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/cli.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/emit.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/errors.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/merkle.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/py.typed +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/sdjwt.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/sdjwt_issue.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/signature.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/top_level.txt +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_bundle.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_cli.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_cli_eval.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_emit.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_eval_claim_schema.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_evalclaim.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_intoto.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_merkle.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_merkle_property.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_rekor_interop.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_rfc6962_external_vectors.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_schema.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_sdjwt_issue.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_sdjwt_reference.py +0 -0
- {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_signature.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -28,7 +28,7 @@ Provides-Extra: eval
|
|
|
28
28
|
Requires-Dist: rfc8785>=0.1.4; extra == "eval"
|
|
29
29
|
Provides-Extra: adapters
|
|
30
30
|
Provides-Extra: inspect
|
|
31
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
|
|
31
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "inspect"
|
|
32
32
|
Provides-Extra: dev
|
|
33
33
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
34
34
|
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
@@ -38,7 +38,7 @@ Requires-Dist: build>=1; extra == "dev"
|
|
|
38
38
|
Requires-Dist: hypothesis>=6; extra == "dev"
|
|
39
39
|
Requires-Dist: rfc8785>=0.1.4; extra == "dev"
|
|
40
40
|
Requires-Dist: sd-jwt>=0.10; extra == "dev"
|
|
41
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
|
|
41
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "dev"
|
|
42
42
|
Dynamic: license-file
|
|
43
43
|
|
|
44
44
|
<div align="center">
|
|
@@ -62,14 +62,15 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
|
|
|
62
62
|
[](https://github.com/astral-sh/ruff)
|
|
63
63
|
[](https://slsa.dev)
|
|
64
64
|
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
65
|
-
<!-- DOI badge placeholder: Zenodo
|
|
66
|
-
here (and the DOI to CITATION.cff) once Zenodo assigns
|
|
65
|
+
<!-- DOI badge placeholder: enable Zenodo archiving for this repo, then add the Zenodo concept-DOI
|
|
66
|
+
badge here (and the DOI to CITATION.cff) once Zenodo assigns one on the next release. No DOI has
|
|
67
|
+
been assigned yet (no archived record exists at build time) — tracked in the human checklist. -->
|
|
67
68
|
|
|
68
69
|
</div>
|
|
69
70
|
|
|
70
71
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
71
72
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
72
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
73
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 74 tests.
|
|
73
74
|
|
|
74
75
|
## Contents
|
|
75
76
|
|
|
@@ -78,6 +79,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
|
78
79
|
- [How it fits together](#how-it-fits-together)
|
|
79
80
|
- [Install](#install)
|
|
80
81
|
- [Quickstart](#quickstart)
|
|
82
|
+
- [Demo](#demo--a-real-eval-log-to-a-verified-receipt-offline)
|
|
81
83
|
- [Interoperability](#interoperability)
|
|
82
84
|
- [Bundle format](#bundle-format-proofbundlev01)
|
|
83
85
|
- [Eval receipts](#eval-receipts)
|
|
@@ -208,6 +210,21 @@ from proofbundle import verify_consistency
|
|
|
208
210
|
verify_consistency(first_size, second_size, proof, first_root, second_root) # -> bool
|
|
209
211
|
```
|
|
210
212
|
|
|
213
|
+
## Demo — a real eval log to a verified receipt, offline
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
pip install "proofbundle[eval,inspect]"
|
|
217
|
+
make demo # or: bash scripts/demo.sh
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
`make demo` runs end-to-end with **no network, no API key, no GPU**: it takes genuine eval logs — an
|
|
221
|
+
inspect_ai `mockllm/model` `.eval` log and an lm-evaluation-harness `--model dummy` `results.json`
|
|
222
|
+
(committed under `tests/fixtures/`, generated offline) — turns each into a signed, Merkle-anchored
|
|
223
|
+
proofbundle receipt, and verifies it to `=> OK`. The scores are random (a dummy model); the point is
|
|
224
|
+
that the *artifact* is signed and offline-verifiable, with model and dataset kept as salted commitments.
|
|
225
|
+
See [`examples/inspect_receipt.py`](examples/inspect_receipt.py) and
|
|
226
|
+
[`examples/lm_eval_receipt.py`](examples/lm_eval_receipt.py).
|
|
227
|
+
|
|
211
228
|
## Interoperability
|
|
212
229
|
|
|
213
230
|
proofbundle uses the same RFC 6962 / RFC 9162 Merkle primitive as
|
|
@@ -284,15 +301,29 @@ proofbundle show-eval receipt.json # verify + print the claim (issuer-boun
|
|
|
284
301
|
```
|
|
285
302
|
|
|
286
303
|
The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
|
|
287
|
-
RFC 8785 JCS canonicalization, the verify path stays dependency-free.
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
304
|
+
RFC 8785 JCS canonicalization, the verify path stays dependency-free.
|
|
305
|
+
|
|
306
|
+
**Honesty guardrail (the exact scope).** A receipt attests the **authenticity and integrity** of a
|
|
307
|
+
*claimed* result and its context — these exact bytes, signed by this key, anchored under this root, with
|
|
308
|
+
model/dataset kept as salted commitments. It does **not** attest the **correctness of the computation**,
|
|
309
|
+
and it cannot detect **cherry-picking** of the eval. Whether the eval was well designed, whether the
|
|
310
|
+
suite measures what it claims, and whether the number was computed honestly are separate questions.
|
|
311
|
+
Trusted-execution approaches such as [Attestable Audits](https://arxiv.org/abs/2506.23706) target
|
|
312
|
+
computation-correctness with a different (hardware) trust model; proofbundle is the lightweight,
|
|
313
|
+
hardware-free path to a portable, tamper-evident, selectively disclosable *result artifact*.
|
|
314
|
+
|
|
315
|
+
**How this differs from a bare hash or a TEE.** A plain SHA-256 of a log commits to bytes but carries no
|
|
316
|
+
signature, no tamper-evident anchor, and no selective disclosure (an attestation-exporter idea along
|
|
317
|
+
those lines,
|
|
318
|
+
[inspect_evals PR #1610](https://github.com/UKGovernmentBEIS/inspect_evals/pull/1610), was closed as
|
|
319
|
+
belonging *a layer above* the framework — which is exactly where proofbundle sits). A TEE proves the
|
|
320
|
+
computation ran untampered but needs specific hardware. proofbundle adds Ed25519 + RFC 6962 Merkle +
|
|
321
|
+
SD-JWT selective disclosure over one portable file, offline.
|
|
292
322
|
|
|
293
323
|
### A verification layer for trustworthy eval logs
|
|
294
324
|
|
|
295
|
-
The UK
|
|
325
|
+
The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
|
|
326
|
+
gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
296
327
|
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
297
328
|
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
298
329
|
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
@@ -326,8 +357,10 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
326
357
|
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
327
358
|
- **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
|
|
328
359
|
CITATION.cff, PEP 740 attestations documented.
|
|
329
|
-
- **v0.7
|
|
330
|
-
|
|
360
|
+
- **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
|
|
361
|
+
verifier robustness + CI on Python 3.9 after a holistic review.
|
|
362
|
+
- **v0.8 (current release)** — an offline `make demo` (real eval log -> signed receipt -> verified),
|
|
363
|
+
a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
|
|
331
364
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
332
365
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
333
366
|
|
|
@@ -19,14 +19,15 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
|
|
|
19
19
|
[](https://github.com/astral-sh/ruff)
|
|
20
20
|
[](https://slsa.dev)
|
|
21
21
|
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
22
|
-
<!-- DOI badge placeholder: Zenodo
|
|
23
|
-
here (and the DOI to CITATION.cff) once Zenodo assigns
|
|
22
|
+
<!-- DOI badge placeholder: enable Zenodo archiving for this repo, then add the Zenodo concept-DOI
|
|
23
|
+
badge here (and the DOI to CITATION.cff) once Zenodo assigns one on the next release. No DOI has
|
|
24
|
+
been assigned yet (no archived record exists at build time) — tracked in the human checklist. -->
|
|
24
25
|
|
|
25
26
|
</div>
|
|
26
27
|
|
|
27
28
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
28
29
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
29
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
30
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 74 tests.
|
|
30
31
|
|
|
31
32
|
## Contents
|
|
32
33
|
|
|
@@ -35,6 +36,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
|
35
36
|
- [How it fits together](#how-it-fits-together)
|
|
36
37
|
- [Install](#install)
|
|
37
38
|
- [Quickstart](#quickstart)
|
|
39
|
+
- [Demo](#demo--a-real-eval-log-to-a-verified-receipt-offline)
|
|
38
40
|
- [Interoperability](#interoperability)
|
|
39
41
|
- [Bundle format](#bundle-format-proofbundlev01)
|
|
40
42
|
- [Eval receipts](#eval-receipts)
|
|
@@ -165,6 +167,21 @@ from proofbundle import verify_consistency
|
|
|
165
167
|
verify_consistency(first_size, second_size, proof, first_root, second_root) # -> bool
|
|
166
168
|
```
|
|
167
169
|
|
|
170
|
+
## Demo — a real eval log to a verified receipt, offline
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
pip install "proofbundle[eval,inspect]"
|
|
174
|
+
make demo # or: bash scripts/demo.sh
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
`make demo` runs end-to-end with **no network, no API key, no GPU**: it takes genuine eval logs — an
|
|
178
|
+
inspect_ai `mockllm/model` `.eval` log and an lm-evaluation-harness `--model dummy` `results.json`
|
|
179
|
+
(committed under `tests/fixtures/`, generated offline) — turns each into a signed, Merkle-anchored
|
|
180
|
+
proofbundle receipt, and verifies it to `=> OK`. The scores are random (a dummy model); the point is
|
|
181
|
+
that the *artifact* is signed and offline-verifiable, with model and dataset kept as salted commitments.
|
|
182
|
+
See [`examples/inspect_receipt.py`](examples/inspect_receipt.py) and
|
|
183
|
+
[`examples/lm_eval_receipt.py`](examples/lm_eval_receipt.py).
|
|
184
|
+
|
|
168
185
|
## Interoperability
|
|
169
186
|
|
|
170
187
|
proofbundle uses the same RFC 6962 / RFC 9162 Merkle primitive as
|
|
@@ -241,15 +258,29 @@ proofbundle show-eval receipt.json # verify + print the claim (issuer-boun
|
|
|
241
258
|
```
|
|
242
259
|
|
|
243
260
|
The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
|
|
244
|
-
RFC 8785 JCS canonicalization, the verify path stays dependency-free.
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
261
|
+
RFC 8785 JCS canonicalization, the verify path stays dependency-free.
|
|
262
|
+
|
|
263
|
+
**Honesty guardrail (the exact scope).** A receipt attests the **authenticity and integrity** of a
|
|
264
|
+
*claimed* result and its context — these exact bytes, signed by this key, anchored under this root, with
|
|
265
|
+
model/dataset kept as salted commitments. It does **not** attest the **correctness of the computation**,
|
|
266
|
+
and it cannot detect **cherry-picking** of the eval. Whether the eval was well designed, whether the
|
|
267
|
+
suite measures what it claims, and whether the number was computed honestly are separate questions.
|
|
268
|
+
Trusted-execution approaches such as [Attestable Audits](https://arxiv.org/abs/2506.23706) target
|
|
269
|
+
computation-correctness with a different (hardware) trust model; proofbundle is the lightweight,
|
|
270
|
+
hardware-free path to a portable, tamper-evident, selectively disclosable *result artifact*.
|
|
271
|
+
|
|
272
|
+
**How this differs from a bare hash or a TEE.** A plain SHA-256 of a log commits to bytes but carries no
|
|
273
|
+
signature, no tamper-evident anchor, and no selective disclosure (an attestation-exporter idea along
|
|
274
|
+
those lines,
|
|
275
|
+
[inspect_evals PR #1610](https://github.com/UKGovernmentBEIS/inspect_evals/pull/1610), was closed as
|
|
276
|
+
belonging *a layer above* the framework — which is exactly where proofbundle sits). A TEE proves the
|
|
277
|
+
computation ran untampered but needs specific hardware. proofbundle adds Ed25519 + RFC 6962 Merkle +
|
|
278
|
+
SD-JWT selective disclosure over one portable file, offline.
|
|
249
279
|
|
|
250
280
|
### A verification layer for trustworthy eval logs
|
|
251
281
|
|
|
252
|
-
The UK
|
|
282
|
+
The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
|
|
283
|
+
gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
253
284
|
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
254
285
|
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
255
286
|
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
@@ -283,8 +314,10 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
283
314
|
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
284
315
|
- **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
|
|
285
316
|
CITATION.cff, PEP 740 attestations documented.
|
|
286
|
-
- **v0.7
|
|
287
|
-
|
|
317
|
+
- **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
|
|
318
|
+
verifier robustness + CI on Python 3.9 after a holistic review.
|
|
319
|
+
- **v0.8 (current release)** — an offline `make demo` (real eval log -> signed receipt -> verified),
|
|
320
|
+
a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
|
|
288
321
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
289
322
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
290
323
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "proofbundle"
|
|
7
|
-
version = "0.7.0"
|
|
7
|
+
version = "0.8.0"
|
|
8
8
|
description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -47,10 +47,11 @@ eval = ["rfc8785>=0.1.4"]
|
|
|
47
47
|
adapters = []
|
|
48
48
|
# The inspect_ai adapter uses the STABLE read_eval_log API (lazy import). Pinned with an UPPER bound:
|
|
49
49
|
# the .eval format + pydantic schema change between versions (inspect_ai issue 834), and the fixture
|
|
50
|
-
# test is bound to this range.
|
|
51
|
-
|
|
50
|
+
# test is bound to this range. inspect_ai requires Python >= 3.10, so the marker gates it out on 3.9
|
|
51
|
+
# (base + [eval]/[sdjwt] still work on 3.9; the inspect adapter test skips there). Fixes the red 3.9 CI.
|
|
52
|
+
inspect = ['inspect_ai>=0.3.100,<0.4; python_version >= "3.10"']
|
|
52
53
|
dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6",
|
|
53
|
-
"rfc8785>=0.1.4", "sd-jwt>=0.10",
|
|
54
|
+
"rfc8785>=0.1.4", "sd-jwt>=0.10", 'inspect_ai>=0.3.100,<0.4; python_version >= "3.10"']
|
|
54
55
|
|
|
55
56
|
[project.urls]
|
|
56
57
|
Homepage = "https://b7n0de.com"
|
|
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
|
|
|
13
13
|
from .errors import Check, ProofBundleError, VerificationResult
|
|
14
14
|
from .merkle import verify_consistency, verify_inclusion
|
|
15
15
|
|
|
16
|
-
__version__ = "0.7.0"
|
|
16
|
+
__version__ = "0.8.0"
|
|
17
17
|
|
|
18
18
|
__all__ = [
|
|
19
19
|
"__version__",
|
|
@@ -57,9 +57,23 @@ def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, t
|
|
|
57
57
|
model_id = str(getattr(ev, "model", "unknown"))
|
|
58
58
|
dataset = getattr(ev, "dataset", None)
|
|
59
59
|
dataset_id = str(getattr(dataset, "name", None) or suite)
|
|
60
|
+
|
|
61
|
+
# Provenance parity with the lm-eval adapter: inspect_ai exposes the same run provenance for free.
|
|
62
|
+
provenance = {"harness": "inspect_ai"}
|
|
63
|
+
revision = getattr(ev, "revision", None)
|
|
64
|
+
commit = getattr(revision, "commit", None)
|
|
65
|
+
if commit:
|
|
66
|
+
provenance["git_hash"] = str(commit)
|
|
67
|
+
packages = getattr(ev, "packages", None) or {}
|
|
68
|
+
if isinstance(packages, dict) and packages.get("inspect_ai"):
|
|
69
|
+
provenance["harness_version"] = str(packages["inspect_ai"])
|
|
70
|
+
tv = getattr(ev, "task_version", None)
|
|
71
|
+
if tv is not None:
|
|
72
|
+
provenance["task_version"] = str(tv)
|
|
73
|
+
|
|
60
74
|
return build_eval_claim(
|
|
61
75
|
suite=suite, suite_version=str(getattr(ev, "task_version", "1")),
|
|
62
76
|
metric=metric, comparator=comparator, threshold=threshold, score=repr(value),
|
|
63
77
|
n=int(getattr(results, "total_samples", 0) or 0),
|
|
64
78
|
model_id=model_id, dataset_id=dataset_id, issuer="", timestamp=timestamp,
|
|
65
|
-
model_salt=model_salt, dataset_salt=dataset_salt)
|
|
79
|
+
provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
|
|
@@ -12,7 +12,11 @@ checks, fully offline and without any running log server:
|
|
|
12
12
|
The verifier treats ``payload`` as opaque bytes: it proves *that these exact
|
|
13
13
|
bytes were signed and anchored*, not what they mean. That keeps v0.1 small and
|
|
14
14
|
correct. Turning a reproducible eval run into such a payload is the job of the
|
|
15
|
-
emitter (see
|
|
15
|
+
eval-receipt emitter (see :mod:`proofbundle.evalclaim`, since v0.4).
|
|
16
|
+
|
|
17
|
+
Malformed input (wrong types, missing or unknown fields) is rejected with a
|
|
18
|
+
``BundleFormatError`` — never a raw traceback — so a caller gets the documented
|
|
19
|
+
malformed exit code, not a crash.
|
|
16
20
|
"""
|
|
17
21
|
|
|
18
22
|
from __future__ import annotations
|
|
@@ -30,6 +34,13 @@ __all__ = ["SCHEMA", "verify_bundle", "load_bundle"]
|
|
|
30
34
|
|
|
31
35
|
SCHEMA = "proofbundle/v0.1"
|
|
32
36
|
|
|
37
|
+
# Allowed keys per object — SPEC.md §3: a verifier MUST reject unknown fields (schema is
|
|
38
|
+
# additionalProperties: false). Enforced here so the code matches its own normative spec.
|
|
39
|
+
_TOP_KEYS = {"schema", "payload_b64", "signature", "merkle", "sd_jwt_vc"}
|
|
40
|
+
_SIG_KEYS = {"alg", "public_key_b64", "sig_b64"}
|
|
41
|
+
_MERKLE_KEYS = {"hash_alg", "leaf_index", "tree_size", "inclusion_proof_b64", "root_b64"}
|
|
42
|
+
_SD_KEYS = {"compact", "issuer_public_key_b64"}
|
|
43
|
+
|
|
33
44
|
|
|
34
45
|
def _b64d(value: str, field: str) -> bytes:
|
|
35
46
|
try:
|
|
@@ -44,6 +55,27 @@ def _require(obj: dict, key: str, field: str):
|
|
|
44
55
|
return obj[key]
|
|
45
56
|
|
|
46
57
|
|
|
58
|
+
def _require_dict(obj, field: str) -> dict:
|
|
59
|
+
"""The value must be a JSON object — a string/list/number is malformed, not a crash."""
|
|
60
|
+
if not isinstance(obj, dict):
|
|
61
|
+
raise BundleFormatError(f"field {field} must be a JSON object")
|
|
62
|
+
return obj
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _require_int(obj: dict, key: str, field: str) -> int:
|
|
66
|
+
"""The value must be a JSON integer — reject floats (SPEC §2) and non-numeric strings/None."""
|
|
67
|
+
val = _require(obj, key, field)
|
|
68
|
+
if isinstance(val, bool) or not isinstance(val, int): # bool is an int subclass; a float/str/None is not
|
|
69
|
+
raise BundleFormatError(f"field {field} must be an integer, got {type(val).__name__}")
|
|
70
|
+
return val
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _reject_unknown(obj: dict, allowed: set, field: str) -> None:
|
|
74
|
+
extra = set(obj) - allowed
|
|
75
|
+
if extra:
|
|
76
|
+
raise BundleFormatError(f"unknown field(s) in {field}: {sorted(extra)}")
|
|
77
|
+
|
|
78
|
+
|
|
47
79
|
def load_bundle(path: str) -> dict:
|
|
48
80
|
"""Read and JSON-parse a bundle file."""
|
|
49
81
|
with open(path, "r", encoding="utf-8") as handle:
|
|
@@ -60,12 +92,14 @@ def verify_bundle(bundle: Union[dict, str]) -> VerificationResult:
|
|
|
60
92
|
schema = bundle.get("schema")
|
|
61
93
|
if schema != SCHEMA:
|
|
62
94
|
raise UnsupportedError(f"unsupported schema {schema!r}, expected {SCHEMA!r}")
|
|
95
|
+
_reject_unknown(bundle, _TOP_KEYS, "bundle")
|
|
63
96
|
|
|
64
97
|
result = VerificationResult()
|
|
65
98
|
payload = _b64d(_require(bundle, "payload_b64", "payload_b64"), "payload_b64")
|
|
66
99
|
|
|
67
100
|
# 1. signature over the payload
|
|
68
|
-
sig = _require(bundle, "signature", "signature")
|
|
101
|
+
sig = _require_dict(_require(bundle, "signature", "signature"), "signature")
|
|
102
|
+
_reject_unknown(sig, _SIG_KEYS, "signature")
|
|
69
103
|
alg = sig.get("alg")
|
|
70
104
|
if alg != "ed25519":
|
|
71
105
|
raise UnsupportedError(f"signature alg {alg!r} not supported in v0.1")
|
|
@@ -75,13 +109,17 @@ def verify_bundle(bundle: Union[dict, str]) -> VerificationResult:
|
|
|
75
109
|
result.add("ed25519-signature", sig_ok, "payload signed by stated key" if sig_ok else "invalid signature")
|
|
76
110
|
|
|
77
111
|
# 2. merkle inclusion of the payload
|
|
78
|
-
mk = _require(bundle, "merkle", "merkle")
|
|
112
|
+
mk = _require_dict(_require(bundle, "merkle", "merkle"), "merkle")
|
|
113
|
+
_reject_unknown(mk, _MERKLE_KEYS, "merkle")
|
|
79
114
|
hash_alg = mk.get("hash_alg", "sha256-rfc6962")
|
|
80
115
|
if hash_alg != "sha256-rfc6962":
|
|
81
116
|
raise UnsupportedError(f"merkle hash_alg {hash_alg!r} not supported in v0.1")
|
|
82
|
-
leaf_index =
|
|
83
|
-
tree_size =
|
|
84
|
-
|
|
117
|
+
leaf_index = _require_int(mk, "leaf_index", "merkle.leaf_index")
|
|
118
|
+
tree_size = _require_int(mk, "tree_size", "merkle.tree_size")
|
|
119
|
+
proof_list = _require(mk, "inclusion_proof_b64", "merkle.inclusion_proof_b64") # required per SPEC §5
|
|
120
|
+
if not isinstance(proof_list, list):
|
|
121
|
+
raise BundleFormatError("field merkle.inclusion_proof_b64 must be a list")
|
|
122
|
+
proof = [_b64d(p, "merkle.inclusion_proof_b64[]") for p in proof_list]
|
|
85
123
|
root = _b64d(_require(mk, "root_b64", "merkle.root_b64"), "merkle.root_b64")
|
|
86
124
|
incl_ok = merkle.verify_inclusion(payload, leaf_index, tree_size, proof, root)
|
|
87
125
|
result.add(
|
|
@@ -93,6 +131,8 @@ def verify_bundle(bundle: Union[dict, str]) -> VerificationResult:
|
|
|
93
131
|
# 3. optional SD-JWT selective disclosure credential
|
|
94
132
|
sd = bundle.get("sd_jwt_vc")
|
|
95
133
|
if sd is not None:
|
|
134
|
+
sd = _require_dict(sd, "sd_jwt_vc")
|
|
135
|
+
_reject_unknown(sd, _SD_KEYS, "sd_jwt_vc")
|
|
96
136
|
compact = _require(sd, "compact", "sd_jwt_vc.compact")
|
|
97
137
|
issuer_pub = None
|
|
98
138
|
if sd.get("issuer_public_key_b64"):
|
|
@@ -21,6 +21,7 @@ import base64
|
|
|
21
21
|
import hashlib
|
|
22
22
|
import json
|
|
23
23
|
import os
|
|
24
|
+
import re
|
|
24
25
|
import unicodedata
|
|
25
26
|
from typing import Optional, Sequence
|
|
26
27
|
|
|
@@ -34,6 +35,8 @@ EVAL_CLAIM_SCHEMA = "proofbundle/eval-claim/v0.1"
|
|
|
34
35
|
COMMIT_ALG = "sha256-salted-v1"
|
|
35
36
|
_COMPARATORS = {">=", ">", "<=", "<"}
|
|
36
37
|
_MAX_SAFE_INT = 2 ** 53 - 1
|
|
38
|
+
# The published eval-claim schema's decimal pattern for threshold/score (no exponent, no sign+, no spaces).
|
|
39
|
+
_DECIMAL_RE = re.compile(r"^-?[0-9]+(\.[0-9]+)?$")
|
|
37
40
|
# The exact key set of an eval claim; decode/validate reject anything else.
|
|
38
41
|
_REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
|
|
39
42
|
"passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
|
|
@@ -103,7 +106,12 @@ def canonicalize(claim: dict) -> bytes:
|
|
|
103
106
|
for the UTF-16 code-unit key sort + compact UTF-8 serialization.
|
|
104
107
|
"""
|
|
105
108
|
_reject_non_jcs(claim)
|
|
106
|
-
|
|
109
|
+
try:
|
|
110
|
+
import rfc8785 # noqa: PLC0415 — lazy: only the emit path pulls the JCS dependency
|
|
111
|
+
except ImportError as e:
|
|
112
|
+
raise EvalClaimError(
|
|
113
|
+
"emitting eval receipts needs an RFC 8785 canonicalizer — install with: "
|
|
114
|
+
"pip install \"proofbundle[eval]\"") from e
|
|
107
115
|
try:
|
|
108
116
|
return rfc8785.dumps(claim)
|
|
109
117
|
except (rfc8785.FloatDomainError, rfc8785.IntegerDomainError, rfc8785.CanonicalizationError) as e:
|
|
@@ -137,14 +145,17 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
|
|
|
137
145
|
"""
|
|
138
146
|
if comparator not in _COMPARATORS:
|
|
139
147
|
raise EvalClaimError(f"comparator must be one of {sorted(_COMPARATORS)}")
|
|
148
|
+
# threshold/score must match the PUBLISHED schema's decimal pattern exactly — reject "1e2",
|
|
149
|
+
# "Infinity", "+5", " 5 " etc. that Decimal() would accept but jsonschema rejects (schema-conformance).
|
|
140
150
|
for name, val in (("threshold", threshold), ("score", score)):
|
|
141
151
|
if not isinstance(val, str):
|
|
142
152
|
raise EvalClaimError(f"{name} must be a decimal STRING, not {type(val).__name__}")
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
153
|
+
if not _DECIMAL_RE.match(val):
|
|
154
|
+
raise EvalClaimError(f"{name} must be a plain decimal string (^-?[0-9]+(\\.[0-9]+)?$), got {val!r}")
|
|
155
|
+
if not isinstance(n, int) or isinstance(n, bool) or n < 0 or n > _MAX_SAFE_INT:
|
|
156
|
+
raise EvalClaimError(f"n must be a non-negative integer <= 2**53-1, got {n!r}")
|
|
157
|
+
from decimal import Decimal # noqa: PLC0415
|
|
158
|
+
s, t = Decimal(score), Decimal(threshold)
|
|
148
159
|
passed = {">=": s >= t, ">": s > t, "<=": s <= t, "<": s < t}[comparator]
|
|
149
160
|
m_salt = model_salt if model_salt is not None else os.urandom(16)
|
|
150
161
|
d_salt = dataset_salt if dataset_salt is not None else os.urandom(16)
|
|
@@ -12,7 +12,7 @@ exists (deferred, see the roadmap).
|
|
|
12
12
|
"""
|
|
13
13
|
from __future__ import annotations
|
|
14
14
|
|
|
15
|
-
from typing import Optional
|
|
15
|
+
from typing import Any, Optional
|
|
16
16
|
|
|
17
17
|
STATEMENT_TYPE = "https://in-toto.io/Statement/v1"
|
|
18
18
|
PREDICATE_TYPE = "https://b7n0de.com/proofbundle/eval-receipt/v0.1"
|
|
@@ -37,6 +37,21 @@ def to_intoto_statement(claim: dict, *, root_b64: Optional[str] = None,
|
|
|
37
37
|
(e.g. {"name": "inspect_ai", "version": "0.3.217"}) is optional. The subject digest is the model
|
|
38
38
|
commitment under a custom key (never `sha256`).
|
|
39
39
|
"""
|
|
40
|
+
predicate: dict[str, Any] = {
|
|
41
|
+
"verifier": {"id": VERIFIER_ID},
|
|
42
|
+
"evaluatedAt": claim["timestamp"],
|
|
43
|
+
"suite": claim["suite"],
|
|
44
|
+
"claims": [{
|
|
45
|
+
"metric": claim["metric"], "comparator": claim["comparator"],
|
|
46
|
+
"threshold": claim["threshold"], "passed": claim["passed"],
|
|
47
|
+
}],
|
|
48
|
+
"datasetCommit": claim.get("dataset_id_commit"),
|
|
49
|
+
"subject_digest_note": _SUBJECT_DIGEST_NOTE,
|
|
50
|
+
}
|
|
51
|
+
if harness:
|
|
52
|
+
predicate["harness"] = harness
|
|
53
|
+
if root_b64:
|
|
54
|
+
predicate["receipt"] = {"schema": "proofbundle/v0.1", "root_b64": root_b64}
|
|
40
55
|
statement = {
|
|
41
56
|
"_type": STATEMENT_TYPE,
|
|
42
57
|
"subject": [{
|
|
@@ -44,20 +59,6 @@ def to_intoto_statement(claim: dict, *, root_b64: Optional[str] = None,
|
|
|
44
59
|
"digest": {MODEL_COMMIT_DIGEST_KEY: _commit_hex(claim["model_id_commit"])},
|
|
45
60
|
}],
|
|
46
61
|
"predicateType": PREDICATE_TYPE,
|
|
47
|
-
"predicate": {
|
|
48
|
-
"verifier": {"id": VERIFIER_ID},
|
|
49
|
-
"evaluatedAt": claim["timestamp"],
|
|
50
|
-
"suite": claim["suite"],
|
|
51
|
-
"claims": [{
|
|
52
|
-
"metric": claim["metric"], "comparator": claim["comparator"],
|
|
53
|
-
"threshold": claim["threshold"], "passed": claim["passed"],
|
|
54
|
-
}],
|
|
55
|
-
"datasetCommit": claim.get("dataset_id_commit"),
|
|
56
|
-
"subject_digest_note": _SUBJECT_DIGEST_NOTE,
|
|
57
|
-
},
|
|
62
|
+
"predicate": predicate,
|
|
58
63
|
}
|
|
59
|
-
if harness:
|
|
60
|
-
statement["predicate"]["harness"] = harness
|
|
61
|
-
if root_b64:
|
|
62
|
-
statement["predicate"]["receipt"] = {"schema": "proofbundle/v0.1", "root_b64": root_b64}
|
|
63
64
|
return statement
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -28,7 +28,7 @@ Provides-Extra: eval
|
|
|
28
28
|
Requires-Dist: rfc8785>=0.1.4; extra == "eval"
|
|
29
29
|
Provides-Extra: adapters
|
|
30
30
|
Provides-Extra: inspect
|
|
31
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
|
|
31
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "inspect"
|
|
32
32
|
Provides-Extra: dev
|
|
33
33
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
34
34
|
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
@@ -38,7 +38,7 @@ Requires-Dist: build>=1; extra == "dev"
|
|
|
38
38
|
Requires-Dist: hypothesis>=6; extra == "dev"
|
|
39
39
|
Requires-Dist: rfc8785>=0.1.4; extra == "dev"
|
|
40
40
|
Requires-Dist: sd-jwt>=0.10; extra == "dev"
|
|
41
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
|
|
41
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "dev"
|
|
42
42
|
Dynamic: license-file
|
|
43
43
|
|
|
44
44
|
<div align="center">
|
|
@@ -62,14 +62,15 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
|
|
|
62
62
|
[](https://github.com/astral-sh/ruff)
|
|
63
63
|
[](https://slsa.dev)
|
|
64
64
|
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
65
|
-
<!-- DOI badge placeholder: Zenodo
|
|
66
|
-
here (and the DOI to CITATION.cff) once Zenodo assigns
|
|
65
|
+
<!-- DOI badge placeholder: enable Zenodo archiving for this repo, then add the Zenodo concept-DOI
|
|
66
|
+
badge here (and the DOI to CITATION.cff) once Zenodo assigns one on the next release. No DOI has
|
|
67
|
+
been assigned yet (no archived record exists at build time) — tracked in the human checklist. -->
|
|
67
68
|
|
|
68
69
|
</div>
|
|
69
70
|
|
|
70
71
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
71
72
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
72
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
73
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 74 tests.
|
|
73
74
|
|
|
74
75
|
## Contents
|
|
75
76
|
|
|
@@ -78,6 +79,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
|
78
79
|
- [How it fits together](#how-it-fits-together)
|
|
79
80
|
- [Install](#install)
|
|
80
81
|
- [Quickstart](#quickstart)
|
|
82
|
+
- [Demo](#demo--a-real-eval-log-to-a-verified-receipt-offline)
|
|
81
83
|
- [Interoperability](#interoperability)
|
|
82
84
|
- [Bundle format](#bundle-format-proofbundlev01)
|
|
83
85
|
- [Eval receipts](#eval-receipts)
|
|
@@ -208,6 +210,21 @@ from proofbundle import verify_consistency
|
|
|
208
210
|
verify_consistency(first_size, second_size, proof, first_root, second_root) # -> bool
|
|
209
211
|
```
|
|
210
212
|
|
|
213
|
+
## Demo — a real eval log to a verified receipt, offline
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
pip install "proofbundle[eval,inspect]"
|
|
217
|
+
make demo # or: bash scripts/demo.sh
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
`make demo` runs end-to-end with **no network, no API key, no GPU**: it takes genuine eval logs — an
|
|
221
|
+
inspect_ai `mockllm/model` `.eval` log and an lm-evaluation-harness `--model dummy` `results.json`
|
|
222
|
+
(committed under `tests/fixtures/`, generated offline) — turns each into a signed, Merkle-anchored
|
|
223
|
+
proofbundle receipt, and verifies it to `=> OK`. The scores are random (a dummy model); the point is
|
|
224
|
+
that the *artifact* is signed and offline-verifiable, with model and dataset kept as salted commitments.
|
|
225
|
+
See [`examples/inspect_receipt.py`](examples/inspect_receipt.py) and
|
|
226
|
+
[`examples/lm_eval_receipt.py`](examples/lm_eval_receipt.py).
|
|
227
|
+
|
|
211
228
|
## Interoperability
|
|
212
229
|
|
|
213
230
|
proofbundle uses the same RFC 6962 / RFC 9162 Merkle primitive as
|
|
@@ -284,15 +301,29 @@ proofbundle show-eval receipt.json # verify + print the claim (issuer-boun
|
|
|
284
301
|
```
|
|
285
302
|
|
|
286
303
|
The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
|
|
287
|
-
RFC 8785 JCS canonicalization, the verify path stays dependency-free.
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
304
|
+
RFC 8785 JCS canonicalization, the verify path stays dependency-free.
|
|
305
|
+
|
|
306
|
+
**Honesty guardrail (the exact scope).** A receipt attests the **authenticity and integrity** of a
|
|
307
|
+
*claimed* result and its context — these exact bytes, signed by this key, anchored under this root, with
|
|
308
|
+
model/dataset kept as salted commitments. It does **not** attest the **correctness of the computation**,
|
|
309
|
+
and it cannot detect **cherry-picking** of the eval. Whether the eval was well designed, whether the
|
|
310
|
+
suite measures what it claims, and whether the number was computed honestly are separate questions.
|
|
311
|
+
Trusted-execution approaches such as [Attestable Audits](https://arxiv.org/abs/2506.23706) target
|
|
312
|
+
computation-correctness with a different (hardware) trust model; proofbundle is the lightweight,
|
|
313
|
+
hardware-free path to a portable, tamper-evident, selectively disclosable *result artifact*.
|
|
314
|
+
|
|
315
|
+
**How this differs from a bare hash or a TEE.** A plain SHA-256 of a log commits to bytes but carries no
|
|
316
|
+
signature, no tamper-evident anchor, and no selective disclosure (an attestation-exporter idea along
|
|
317
|
+
those lines,
|
|
318
|
+
[inspect_evals PR #1610](https://github.com/UKGovernmentBEIS/inspect_evals/pull/1610), was closed as
|
|
319
|
+
belonging *a layer above* the framework — which is exactly where proofbundle sits). A TEE proves the
|
|
320
|
+
computation ran untampered but needs specific hardware. proofbundle adds Ed25519 + RFC 6962 Merkle +
|
|
321
|
+
SD-JWT selective disclosure over one portable file, offline.
|
|
292
322
|
|
|
293
323
|
### A verification layer for trustworthy eval logs
|
|
294
324
|
|
|
295
|
-
The UK
|
|
325
|
+
The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
|
|
326
|
+
gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
296
327
|
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
297
328
|
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
298
329
|
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
@@ -326,8 +357,10 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
326
357
|
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
327
358
|
- **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
|
|
328
359
|
CITATION.cff, PEP 740 attestations documented.
|
|
329
|
-
- **v0.7
|
|
330
|
-
|
|
360
|
+
- **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
|
|
361
|
+
verifier robustness + CI on Python 3.9 after a holistic review.
|
|
362
|
+
- **v0.8 (current release)** — an offline `make demo` (real eval log -> signed receipt -> verified),
|
|
363
|
+
a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
|
|
331
364
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
332
365
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
333
366
|
|
|
@@ -24,11 +24,13 @@ src/proofbundle/adapters/inspect_ai.py
|
|
|
24
24
|
src/proofbundle/adapters/lm_eval.py
|
|
25
25
|
tests/test_adapters.py
|
|
26
26
|
tests/test_bundle.py
|
|
27
|
+
tests/test_bundle_robustness.py
|
|
27
28
|
tests/test_cli.py
|
|
28
29
|
tests/test_cli_eval.py
|
|
29
30
|
tests/test_emit.py
|
|
30
31
|
tests/test_eval_claim_schema.py
|
|
31
32
|
tests/test_evalclaim.py
|
|
33
|
+
tests/test_examples.py
|
|
32
34
|
tests/test_intoto.py
|
|
33
35
|
tests/test_merkle.py
|
|
34
36
|
tests/test_merkle_property.py
|
|
@@ -11,12 +11,16 @@ build>=1
|
|
|
11
11
|
hypothesis>=6
|
|
12
12
|
rfc8785>=0.1.4
|
|
13
13
|
sd-jwt>=0.10
|
|
14
|
+
|
|
15
|
+
[dev:python_version >= "3.10"]
|
|
14
16
|
inspect_ai<0.4,>=0.3.100
|
|
15
17
|
|
|
16
18
|
[eval]
|
|
17
19
|
rfc8785>=0.1.4
|
|
18
20
|
|
|
19
21
|
[inspect]
|
|
22
|
+
|
|
23
|
+
[inspect:python_version >= "3.10"]
|
|
20
24
|
inspect_ai<0.4,>=0.3.100
|
|
21
25
|
|
|
22
26
|
[sdjwt]
|
|
@@ -39,6 +39,8 @@ class TestAdapters(unittest.TestCase):
|
|
|
39
39
|
self.assertEqual(claim["suite"], "safety_refusal_demo")
|
|
40
40
|
self.assertTrue(claim["passed"]) # accuracy 0.0 >= 0.00
|
|
41
41
|
self.assertNotIn("mockllm/model", str(claim)) # model id only as salted commitment
|
|
42
|
+
self.assertEqual(claim["provenance"]["harness"], "inspect_ai") # provenance parity with lm-eval
|
|
43
|
+
self.assertIn("harness_version", claim["provenance"])
|
|
42
44
|
|
|
43
45
|
def test_inspect_ai_missing_metric_clear_error(self):
|
|
44
46
|
from proofbundle.adapters.inspect_ai import InspectAdapterError
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Malformed-input robustness of verify_bundle + build_eval_claim (holistic-review findings, 0.7.1).
|
|
2
|
+
|
|
3
|
+
The verifier's contract is OK/FAILED/malformed — never a raw traceback. build_eval_claim must not emit a
|
|
4
|
+
receipt that fails its own published schema. One red-test per finding."""
|
|
5
|
+
import copy
|
|
6
|
+
import unittest
|
|
7
|
+
|
|
8
|
+
from proofbundle import verify_bundle
|
|
9
|
+
from proofbundle.emit import emit_bundle, generate_signer
|
|
10
|
+
from proofbundle.errors import BundleFormatError
|
|
11
|
+
from proofbundle.evalclaim import EvalClaimError, build_eval_claim
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _bundle():
|
|
15
|
+
return emit_bundle(b"payload", generate_signer())
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _mut(mut):
|
|
19
|
+
b = copy.deepcopy(_bundle())
|
|
20
|
+
mut(b)
|
|
21
|
+
return b
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TestBundleRobustness(unittest.TestCase):
|
|
25
|
+
def test_leaf_index_non_numeric_raises_format_error(self): # D1
|
|
26
|
+
with self.assertRaises(BundleFormatError):
|
|
27
|
+
verify_bundle(_mut(lambda b: b["merkle"].__setitem__("leaf_index", "abc")))
|
|
28
|
+
|
|
29
|
+
def test_signature_non_object_raises_format_error(self): # D2
|
|
30
|
+
with self.assertRaises(BundleFormatError):
|
|
31
|
+
verify_bundle(_mut(lambda b: b.__setitem__("signature", "notadict")))
|
|
32
|
+
with self.assertRaises(BundleFormatError):
|
|
33
|
+
verify_bundle(_mut(lambda b: b.__setitem__("merkle", ["x"])))
|
|
34
|
+
|
|
35
|
+
def test_tree_size_float_rejected(self): # D3 (SPEC §2: integers only)
|
|
36
|
+
with self.assertRaises(BundleFormatError):
|
|
37
|
+
verify_bundle(_mut(lambda b: b["merkle"].__setitem__("tree_size", 1.5)))
|
|
38
|
+
|
|
39
|
+
def test_missing_inclusion_proof_rejected(self): # D4 (SPEC §5: required)
|
|
40
|
+
with self.assertRaises(BundleFormatError):
|
|
41
|
+
verify_bundle(_mut(lambda b: b["merkle"].pop("inclusion_proof_b64")))
|
|
42
|
+
|
|
43
|
+
def test_unknown_fields_rejected(self): # SPEC §3: additionalProperties false
|
|
44
|
+
with self.assertRaises(BundleFormatError):
|
|
45
|
+
verify_bundle(_mut(lambda b: b.__setitem__("evil", "x")))
|
|
46
|
+
with self.assertRaises(BundleFormatError):
|
|
47
|
+
verify_bundle(_mut(lambda b: b["signature"].__setitem__("evil", "x")))
|
|
48
|
+
with self.assertRaises(BundleFormatError):
|
|
49
|
+
verify_bundle(_mut(lambda b: b["merkle"].__setitem__("evil", "x")))
|
|
50
|
+
|
|
51
|
+
def test_well_formed_still_ok(self): # no false positive
|
|
52
|
+
self.assertTrue(verify_bundle(_bundle()).ok)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class TestEvalClaimSchemaConformance(unittest.TestCase):
|
|
56
|
+
def _build(self, **kw):
|
|
57
|
+
base = dict(suite="s", suite_version="v1", metric="acc", comparator=">=", threshold="0.8",
|
|
58
|
+
score="0.9", n=1, model_id="m", dataset_id="d", issuer="",
|
|
59
|
+
timestamp="2026-07-01T12:00:00Z", model_salt=b"0" * 16, dataset_salt=b"1" * 16)
|
|
60
|
+
base.update(kw)
|
|
61
|
+
return build_eval_claim(**base)
|
|
62
|
+
|
|
63
|
+
def test_negative_n_rejected(self): # schema minimum 0
|
|
64
|
+
with self.assertRaises(EvalClaimError):
|
|
65
|
+
self._build(n=-5)
|
|
66
|
+
|
|
67
|
+
def test_exponent_and_sign_threshold_rejected(self): # schema decimal pattern
|
|
68
|
+
for bad in ("1e2", "Infinity", "+5", " 0.9 "):
|
|
69
|
+
with self.assertRaises(EvalClaimError):
|
|
70
|
+
self._build(threshold=bad)
|
|
71
|
+
|
|
72
|
+
def test_plain_decimal_accepted(self):
|
|
73
|
+
claim, _ = self._build(threshold="0.80", score="0.92")
|
|
74
|
+
self.assertTrue(claim["passed"])
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""The demo examples run end-to-end (real fixtures -> receipt -> verify). Covers `make demo` (Phase B)."""
|
|
2
|
+
import importlib.util
|
|
3
|
+
import sys
|
|
4
|
+
import unittest
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
REPO = Path(__file__).resolve().parents[1]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _run_example(name):
|
|
11
|
+
try:
|
|
12
|
+
import inspect_ai.log # noqa: F401 (inspect example needs it)
|
|
13
|
+
except ImportError:
|
|
14
|
+
if name == "inspect_receipt":
|
|
15
|
+
raise unittest.SkipTest("inspect_ai not installed")
|
|
16
|
+
spec = importlib.util.spec_from_file_location(name, REPO / "examples" / f"{name}.py")
|
|
17
|
+
m = importlib.util.module_from_spec(spec)
|
|
18
|
+
sys.modules[name] = m
|
|
19
|
+
spec.loader.exec_module(m)
|
|
20
|
+
return m.main()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TestExamples(unittest.TestCase):
|
|
24
|
+
def test_lm_eval_receipt_example(self):
|
|
25
|
+
self.assertEqual(_run_example("lm_eval_receipt"), 0)
|
|
26
|
+
|
|
27
|
+
def test_inspect_receipt_example(self):
|
|
28
|
+
self.assertEqual(_run_example("inspect_receipt"), 0)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|