proofbundle 0.8.1__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {proofbundle-0.8.1/src/proofbundle.egg-info → proofbundle-0.9.0}/PKG-INFO +45 -20
- {proofbundle-0.8.1 → proofbundle-0.9.0}/README.md +44 -19
- {proofbundle-0.8.1 → proofbundle-0.9.0}/pyproject.toml +2 -2
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/__init__.py +1 -1
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/adapters/__init__.py +2 -1
- proofbundle-0.9.0/src/proofbundle/adapters/eee.py +175 -0
- proofbundle-0.9.0/src/proofbundle/checkpoint.py +157 -0
- proofbundle-0.9.0/src/proofbundle/dsse.py +110 -0
- proofbundle-0.9.0/src/proofbundle/eee_eval_schema.json +769 -0
- proofbundle-0.9.0/src/proofbundle/intoto.py +182 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0/src/proofbundle.egg-info}/PKG-INFO +45 -20
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle.egg-info/SOURCES.txt +7 -0
- proofbundle-0.9.0/tests/test_checkpoint.py +69 -0
- proofbundle-0.9.0/tests/test_eee.py +67 -0
- proofbundle-0.9.0/tests/test_intoto_dsse.py +83 -0
- proofbundle-0.8.1/src/proofbundle/intoto.py +0 -64
- {proofbundle-0.8.1 → proofbundle-0.9.0}/LICENSE +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/setup.cfg +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/adapters/inspect_ai.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/adapters/lm_eval.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/bundle.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/cli.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/emit.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/errors.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/evalclaim.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/merkle.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/py.typed +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/sdjwt.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/sdjwt_issue.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle/signature.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle.egg-info/requires.txt +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/src/proofbundle.egg-info/top_level.txt +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_adapters.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_bundle.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_bundle_robustness.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_cli.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_cli_eval.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_emit.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_eval_claim_schema.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_evalclaim.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_examples.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_intoto.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_merkle.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_merkle_property.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_rekor_interop.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_rfc6962_external_vectors.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_schema.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_sdjwt_issue.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_sdjwt_reference.py +0 -0
- {proofbundle-0.8.1 → proofbundle-0.9.0}/tests/test_signature.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.9.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -50,9 +50,10 @@ Dynamic: license-file
|
|
|
50
50
|
|
|
51
51
|
<h1>proofbundle</h1>
|
|
52
52
|
|
|
53
|
-
**
|
|
54
|
-
|
|
55
|
-
|
|
53
|
+
**An offline verifier for AI eval receipts. Standards-native: Ed25519 signature,
|
|
54
|
+
RFC 6962 transparency-log Merkle anchoring, optional SD-JWT (RFC 9901) selective
|
|
55
|
+
disclosure, aligned to the in-toto test-result predicate. One portable JSON file,
|
|
56
|
+
no server, no network.**
|
|
56
57
|
|
|
57
58
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
58
59
|
[](https://pypi.org/project/proofbundle/)
|
|
@@ -70,7 +71,7 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
|
|
|
70
71
|
|
|
71
72
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
72
73
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
73
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
74
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 96 tests.
|
|
74
75
|
|
|
75
76
|
## Contents
|
|
76
77
|
|
|
@@ -325,24 +326,46 @@ SD-JWT selective disclosure over one portable file, offline.
|
|
|
325
326
|
The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
|
|
326
327
|
gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
327
328
|
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
328
|
-
missing **signature + selective-disclosure layer** for exactly that
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
329
|
+
missing **signature + selective-disclosure layer** for exactly that.
|
|
330
|
+
|
|
331
|
+
**How it fits — standards-native, and honest about the neighbours.** proofbundle attests that a *claimed*
|
|
332
|
+
evaluation result is authentic, tamper-evident, and selectively disclosable. It does **not** attest that
|
|
333
|
+
the evaluation was computed correctly or that results were not cherry-picked — proving faithful
|
|
334
|
+
computation is the domain of TEE approaches such as
|
|
335
|
+
[Attestable Audits](https://arxiv.org/abs/2506.23706). It is complementary to its neighbours, named
|
|
336
|
+
fairly: [Every Eval Ever](https://github.com/evaleval/every_eval_ever) standardizes eval *metadata* but
|
|
337
|
+
adds no cryptography (proofbundle ships an EEE→receipt converter);
|
|
338
|
+
[OpenSSF Model Signing](https://github.com/ossf/model-signing-spec) signs *model weights*, not eval
|
|
339
|
+
results; [ValiChord](https://github.com/topeuph-ai/ValiChord) provides blind peer consensus and an
|
|
340
|
+
attested log on a Holochain network (its v1 attestation library uses a simple SHA-256 Merkle tree, no
|
|
341
|
+
signature, no SD-JWT, no in-toto). proofbundle is the lightweight, **standards-native** piece between them:
|
|
342
|
+
a portable receipt a third party verifies offline, with selective disclosure so an auditor can prove a
|
|
343
|
+
threshold was met without revealing the model or the data. See [INTEROP.md](INTEROP.md).
|
|
344
|
+
|
|
345
|
+
- **Three framework bridges** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
333
346
|
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
334
347
|
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
335
348
|
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
336
|
-
genuine `acc,none` filter-suffix format)
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
- **
|
|
349
|
+
genuine `acc,none` filter-suffix format). **`proofbundle.adapters.from_eee_dataset`** (v0.9) reads an
|
|
350
|
+
Every Eval Ever v0.2.2 aggregate JSON and builds a signed receipt — validated against the vendored EEE
|
|
351
|
+
schema, with **no runtime import** of `every_eval_ever` (it needs Python 3.12; proofbundle stays 3.9+).
|
|
352
|
+
- **in-toto test-result export, DSSE-signed** (v0.9) — `proofbundle.intoto.export_intoto_dsse(claim,
|
|
353
|
+
signer)` emits the receipt as a DSSE-signed in-toto Statement v1 with the **generic
|
|
354
|
+
`test-result/v0.1` predicate** (result PASSED/FAILED, `configuration` ResourceDescriptors), so a generic
|
|
355
|
+
in-toto verifier understands it. Alongside the self-hosted-predicate `to_intoto_statement` (see
|
|
356
|
+
[PREDICATE.md](PREDICATE.md)). Metric details live in `annotations` (test-result has no native metric
|
|
357
|
+
field); the model/dataset stay salted commitments, never `sha256`.
|
|
358
|
+
- **C2SP tlog-checkpoint** (v0.9) — `proofbundle.checkpoint.sign_checkpoint(origin, tree_size, root, …)`
|
|
359
|
+
emits a valid [C2SP](https://github.com/C2SP/C2SP/blob/main/tlog-checkpoint.md) signed note over the
|
|
360
|
+
RFC 6962 Merkle root, making a receipt witness-network / transparency-log compatible. Pure serialization
|
|
361
|
+
over the Ed25519 key already in use — no new crypto.
|
|
362
|
+
- **SD-JWT issuance** (RFC 9901, verified Nov 2025) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
|
|
342
363
|
root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
|
|
343
|
-
`threshold` while **withholding the exact score** and the identifier openings. The
|
|
344
|
-
|
|
345
|
-
|
|
364
|
+
`threshold` while **withholding the exact score** and the identifier openings. The digest mechanic is
|
|
365
|
+
RFC 9901 §4.2.3 (base64url of SHA-256 over the base64url-encoded Disclosure), cross-checked against the
|
|
366
|
+
`sd-jwt-python` reference.
|
|
367
|
+
The signed bundle payload is always the source of truth; the SD-JWT and the in-toto export are derived,
|
|
368
|
+
bundle-bound views.
|
|
346
369
|
|
|
347
370
|
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
348
371
|
attestation — see [SECURITY.md](SECURITY.md).
|
|
@@ -359,8 +382,10 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
359
382
|
CITATION.cff, PEP 740 attestations documented.
|
|
360
383
|
- **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
|
|
361
384
|
verifier robustness + CI on Python 3.9 after a holistic review.
|
|
362
|
-
- **v0.8
|
|
385
|
+
- **v0.8** — an offline `make demo` (real eval log -> signed receipt -> verified),
|
|
363
386
|
a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
|
|
387
|
+
- **v0.9 (current release)** — the standards moat: a DSSE-signed in-toto `test-result` export, a C2SP
|
|
388
|
+
tlog-checkpoint over the RFC 6962 root, an Every Eval Ever converter, and standards-native repositioning.
|
|
364
389
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
365
390
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, a full in-toto client (DSSE signing itself shipped in v0.9).
|
|
366
391
|
|
|
@@ -7,9 +7,10 @@
|
|
|
7
7
|
|
|
8
8
|
<h1>proofbundle</h1>
|
|
9
9
|
|
|
10
|
-
**
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
**An offline verifier for AI eval receipts. Standards-native: Ed25519 signature,
|
|
11
|
+
RFC 6962 transparency-log Merkle anchoring, optional SD-JWT (RFC 9901) selective
|
|
12
|
+
disclosure, aligned to the in-toto test-result predicate. One portable JSON file,
|
|
13
|
+
no server, no network.**
|
|
13
14
|
|
|
14
15
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
15
16
|
[](https://pypi.org/project/proofbundle/)
|
|
@@ -27,7 +28,7 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
|
|
|
27
28
|
|
|
28
29
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
29
30
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
30
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
31
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 96 tests.
|
|
31
32
|
|
|
32
33
|
## Contents
|
|
33
34
|
|
|
@@ -282,24 +283,46 @@ SD-JWT selective disclosure over one portable file, offline.
|
|
|
282
283
|
The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
|
|
283
284
|
gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
284
285
|
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
285
|
-
missing **signature + selective-disclosure layer** for exactly that
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
286
|
+
missing **signature + selective-disclosure layer** for exactly that.
|
|
287
|
+
|
|
288
|
+
**How it fits — standards-native, and honest about the neighbours.** proofbundle attests that a *claimed*
|
|
289
|
+
evaluation result is authentic, tamper-evident, and selectively disclosable. It does **not** attest that
|
|
290
|
+
the evaluation was computed correctly or that results were not cherry-picked — proving faithful
|
|
291
|
+
computation is the domain of TEE approaches such as
|
|
292
|
+
[Attestable Audits](https://arxiv.org/abs/2506.23706). It is complementary to its neighbours, named
|
|
293
|
+
fairly: [Every Eval Ever](https://github.com/evaleval/every_eval_ever) standardizes eval *metadata* but
|
|
294
|
+
adds no cryptography (proofbundle ships an EEE→receipt converter);
|
|
295
|
+
[OpenSSF Model Signing](https://github.com/ossf/model-signing-spec) signs *model weights*, not eval
|
|
296
|
+
results; [ValiChord](https://github.com/topeuph-ai/ValiChord) provides blind peer consensus and an
|
|
297
|
+
attested log on a Holochain network (its v1 attestation library uses a simple SHA-256 Merkle tree, no
|
|
298
|
+
signature, no SD-JWT, no in-toto). proofbundle is the lightweight, **standards-native** piece between them:
|
|
299
|
+
a portable receipt a third party verifies offline, with selective disclosure so an auditor can prove a
|
|
300
|
+
threshold was met without revealing the model or the data. See [INTEROP.md](INTEROP.md).
|
|
301
|
+
|
|
302
|
+
- **Three framework bridges** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
290
303
|
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
291
304
|
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
292
305
|
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
293
|
-
genuine `acc,none` filter-suffix format)
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
- **
|
|
306
|
+
genuine `acc,none` filter-suffix format). **`proofbundle.adapters.from_eee_dataset`** (v0.9) reads an
|
|
307
|
+
Every Eval Ever v0.2.2 aggregate JSON and builds a signed receipt — validated against the vendored EEE
|
|
308
|
+
schema, with **no runtime import** of `every_eval_ever` (it needs Python 3.12; proofbundle stays 3.9+).
|
|
309
|
+
- **in-toto test-result export, DSSE-signed** (v0.9) — `proofbundle.intoto.export_intoto_dsse(claim,
|
|
310
|
+
signer)` emits the receipt as a DSSE-signed in-toto Statement v1 with the **generic
|
|
311
|
+
`test-result/v0.1` predicate** (result PASSED/FAILED, `configuration` ResourceDescriptors), so a generic
|
|
312
|
+
in-toto verifier understands it. Alongside the self-hosted-predicate `to_intoto_statement` (see
|
|
313
|
+
[PREDICATE.md](PREDICATE.md)). Metric details live in `annotations` (test-result has no native metric
|
|
314
|
+
field); the model/dataset stay salted commitments, never `sha256`.
|
|
315
|
+
- **C2SP tlog-checkpoint** (v0.9) — `proofbundle.checkpoint.sign_checkpoint(origin, tree_size, root, …)`
|
|
316
|
+
emits a valid [C2SP](https://github.com/C2SP/C2SP/blob/main/tlog-checkpoint.md) signed note over the
|
|
317
|
+
RFC 6962 Merkle root, making a receipt witness-network / transparency-log compatible. Pure serialization
|
|
318
|
+
over the Ed25519 key already in use — no new crypto.
|
|
319
|
+
- **SD-JWT issuance** (RFC 9901, verified Nov 2025) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
|
|
299
320
|
root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
|
|
300
|
-
`threshold` while **withholding the exact score** and the identifier openings. The
|
|
301
|
-
|
|
302
|
-
|
|
321
|
+
`threshold` while **withholding the exact score** and the identifier openings. The digest mechanic is
|
|
322
|
+
RFC 9901 §4.2.3 (base64url of SHA-256 over the base64url-encoded Disclosure), cross-checked against the
|
|
323
|
+
`sd-jwt-python` reference.
|
|
324
|
+
The signed bundle payload is always the source of truth; the SD-JWT and the in-toto export are derived,
|
|
325
|
+
bundle-bound views.
|
|
303
326
|
|
|
304
327
|
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
305
328
|
attestation — see [SECURITY.md](SECURITY.md).
|
|
@@ -316,8 +339,10 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
316
339
|
CITATION.cff, PEP 740 attestations documented.
|
|
317
340
|
- **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
|
|
318
341
|
verifier robustness + CI on Python 3.9 after a holistic review.
|
|
319
|
-
- **v0.8
|
|
342
|
+
- **v0.8** — an offline `make demo` (real eval log -> signed receipt -> verified),
|
|
320
343
|
a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
|
|
344
|
+
- **v0.9 (current release)** — the standards moat: a DSSE-signed in-toto `test-result` export, a C2SP
|
|
345
|
+
tlog-checkpoint over the RFC 6962 root, an Every Eval Ever converter, and standards-native repositioning.
|
|
321
346
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
322
347
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, a full in-toto client (DSSE signing itself shipped in v0.9).
|
|
323
348
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "proofbundle"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.9.0"
|
|
8
8
|
description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -67,7 +67,7 @@ proofbundle = "proofbundle.cli:main"
|
|
|
67
67
|
where = ["src"]
|
|
68
68
|
|
|
69
69
|
[tool.setuptools.package-data]
|
|
70
|
-
proofbundle = ["py.typed"]
|
|
70
|
+
proofbundle = ["py.typed", "eee_eval_schema.json"]
|
|
71
71
|
|
|
72
72
|
[tool.ruff]
|
|
73
73
|
line-length = 100
|
|
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
|
|
|
13
13
|
from .errors import Check, ProofBundleError, VerificationResult
|
|
14
14
|
from .merkle import verify_consistency, verify_inclusion
|
|
15
15
|
|
|
16
|
-
__version__ = "0.
|
|
16
|
+
__version__ = "0.9.0"
|
|
17
17
|
|
|
18
18
|
__all__ = [
|
|
19
19
|
"__version__",
|
|
@@ -5,6 +5,7 @@ add no runtime dependency. The output-format mapping is bound to a framework ver
|
|
|
5
5
|
each fixture in tests/fixtures documents its source + version.
|
|
6
6
|
"""
|
|
7
7
|
from .inspect_ai import from_inspect_ai_log
|
|
8
|
+
from .eee import from_eee_dataset
|
|
8
9
|
from .lm_eval import from_lm_eval_results
|
|
9
10
|
|
|
10
|
-
__all__ = ["from_lm_eval_results", "from_inspect_ai_log"]
|
|
11
|
+
__all__ = ["from_lm_eval_results", "from_inspect_ai_log", "from_eee_dataset"]
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Adapter: an Every Eval Ever (EEE) dataset record → a signed proofbundle eval receipt (v0.9).
|
|
2
|
+
|
|
3
|
+
Every Eval Ever (evaleval/every_eval_ever, MIT) is the community aggregation schema for eval metadata —
|
|
4
|
+
it has no cryptography. This converter is strictly additive: it reads an EEE aggregate JSON and builds a
|
|
5
|
+
signed, selectively-disclosable proofbundle receipt from it.
|
|
6
|
+
|
|
7
|
+
IMPORTANT: `every_eval_ever` is NOT imported at runtime — it requires Python 3.12+ (pydantic/numpy/pandas/
|
|
8
|
+
duckdb), while proofbundle stays 3.9+. We parse the EEE JSON directly and OPTIONALLY validate it against the
|
|
9
|
+
vendored `eee_eval_schema.json` (schema version 0.2.2, MIT) using `jsonschema` if available.
|
|
10
|
+
|
|
11
|
+
Field mapping (verified 2026-07 against schemas/eval.schema.json v0.2.2):
|
|
12
|
+
- model_info.id → model_id
|
|
13
|
+
- evaluation_results[i].evaluation_name → suite / task
|
|
14
|
+
- evaluation_results[i].source_data.dataset_name → dataset_id (required in every source variant)
|
|
15
|
+
- metric_config.metric_name | metric_id | metric_kind → metric (all optional; fallback chain)
|
|
16
|
+
- score_details.score → score
|
|
17
|
+
- score_details.uncertainty.standard_error.value → provenance.stderr
|
|
18
|
+
- eval_library.{name,version} → provenance.harness / harness_version
|
|
19
|
+
Gotcha handled: when metric_config.score_type == "levels" the score is an integer level index; -1 with
|
|
20
|
+
has_unknown_level == true means Unknown and is rejected (not silently mapped to 0).
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import json
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Optional, Union
|
|
27
|
+
|
|
28
|
+
from ..evalclaim import build_eval_claim
|
|
29
|
+
|
|
30
|
+
_SCHEMA_PATH = Path(__file__).resolve().parent.parent / "eee_eval_schema.json"
|
|
31
|
+
_SCHEMA_VERSION = "0.2.2"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class EEEAdapterError(ValueError):
    """Signal that an EEE record lacks the structure this adapter expects.

    Subclasses ``ValueError`` so callers get a descriptive, catchable error instead of a bare
    ``KeyError`` escaping from the parsing code.
    """
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _load(source: Union[str, Path, dict]) -> dict:
    """Return the EEE record behind ``source``.

    A ``dict`` is handed back untouched (the very same object). Anything else is treated as a
    filesystem path to a UTF-8 JSON document, which is read and parsed.

    Raises:
        EEEAdapterError: the file could not be read or did not contain valid JSON.
    """
    if not isinstance(source, dict):
        try:
            text = Path(source).read_text(encoding="utf-8")
            return json.loads(text)
        except (OSError, ValueError) as e:  # ValueError covers JSON and UTF-8 decode errors
            raise EEEAdapterError(f"could not read EEE dataset {source!r}: {e}") from e
    return source
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _validate(record: dict) -> None:
    """Check ``record`` against the vendored EEE JSON schema when that is possible.

    Validation is best-effort: it is skipped without error when ``jsonschema`` cannot be imported
    or when the vendored schema file is not present on disk.

    Raises:
        EEEAdapterError: ``record`` does not conform to the vendored schema.
    """
    try:
        import jsonschema  # noqa: PLC0415
    except ImportError:
        return  # optional dependency absent — nothing to validate with
    if _SCHEMA_PATH.is_file():
        schema_text = _SCHEMA_PATH.read_text(encoding="utf-8")
        schema = json.loads(schema_text)
        try:
            jsonschema.validate(record, schema)
        except jsonschema.ValidationError as e:
            raise EEEAdapterError(f"EEE record does not validate against schema {_SCHEMA_VERSION}: {e.message}") from e
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _num_to_decimal_str(x) -> str:
    """Format a JSON number as a plain decimal string (no exponent) for build_eval_claim's pattern.

    Integers are rendered with ``str``. Floats use their shortest round-trip ``repr``; when that
    repr is in exponent form it is expanded *exactly* into positional notation, so very small or
    very large scores keep every significant digit (``1e-13`` → ``"0.0000000000001"``).

    Raises:
        EEEAdapterError: ``x`` is not an int/float (bools are rejected too) or is NaN/±Inf.
    """
    from decimal import Decimal  # noqa: PLC0415

    if isinstance(x, bool) or not isinstance(x, (int, float)):
        raise EEEAdapterError(f"score must be a number, got {type(x).__name__}")
    if isinstance(x, int):
        return str(x)
    if x != x or x in (float("inf"), float("-inf")):  # NaN/Inf
        raise EEEAdapterError("score must be finite")
    s = repr(x)
    if "e" in s or "E" in s:  # avoid exponent form (build_eval_claim rejects it)
        # Decimal(repr) is exact, unlike a fixed "%.12f", which truncated tiny scores to "0" and
        # could therefore change the outcome of the threshold comparison.
        s = format(Decimal(s), "f")
    return s
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _pick_metric(metric_config: dict) -> str:
    """Return the metric label for a result.

    The first non-empty string among ``metric_name``, ``metric_id`` and ``metric_kind`` (in that
    order) wins; when none qualifies the generic label ``"score"`` is returned.
    """
    candidates = (metric_config.get(field) for field in ("metric_name", "metric_id", "metric_kind"))
    return next((c for c in candidates if isinstance(c, str) and c), "score")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _extract_score(score_details: dict, metric_config: dict) -> str:
    """Return the result's score as a plain decimal string.

    For ``score_type == "levels"`` the score is an integer level index: it must be integral
    (``2`` or ``2.0``, never ``2.7``), and ``-1`` combined with ``has_unknown_level`` means
    "Unknown" and is rejected. Every other score type is formatted by ``_num_to_decimal_str``.

    Raises:
        EEEAdapterError: the score is missing, is not a valid level index, or is the Unknown level.
    """
    if "score" not in score_details:
        raise EEEAdapterError("evaluation_results[].score_details.score is required")
    raw = score_details["score"]
    if metric_config.get("score_type") != "levels":
        return _num_to_decimal_str(raw)

    if isinstance(raw, bool) or not isinstance(raw, (int, float)):
        raise EEEAdapterError("levels score must be an integer level index")
    # float.is_integer() is False for fractional values and for NaN/±Inf, so int() below can
    # neither truncate silently (2.7 → 2, -0.5 → 0) nor raise a bare ValueError/OverflowError.
    if isinstance(raw, float) and not raw.is_integer():
        raise EEEAdapterError("levels score must be an integer level index")
    idx = int(raw)
    if idx == -1 and metric_config.get("has_unknown_level"):
        raise EEEAdapterError("levels score is -1 (Unknown) — cannot build a threshold claim")
    return str(idx)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def from_eee_dataset(source: Union[str, Path, dict], *, comparator: str, threshold: str,
                     timestamp: Optional[str] = None, eval_index: int = 0,
                     metric_name: Optional[str] = None,
                     model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None,
                     validate: bool = True):
    """Read an EEE dataset record and build a proofbundle eval claim for one evaluation result.

    `comparator`/`threshold` set the pass/fail assertion (EEE stores the raw score, not a threshold
    verdict). `eval_index` selects which of `evaluation_results` to use; `metric_name` instead selects
    the first result whose metric matches. `timestamp` overrides the record's own
    `evaluation_timestamp` / `retrieved_timestamp`. With `validate=True` the record is checked
    against the vendored schema on a best-effort basis (see `_validate`).

    Returns whatever `build_eval_claim` returns — (claim, salts).

    Raises:
        EEEAdapterError: the record is malformed — a required field is missing, a section that
            must be a JSON object is something else, the selection matches nothing, or the score
            is unusable.
    """

    def _section(value, where: str) -> dict:
        """Return an optional JSON-object field as a dict: missing/empty → {}, wrong type → error."""
        if not value:
            return {}
        if not isinstance(value, dict):
            # Without this a truthy non-object (e.g. a string) would surface as a bare
            # AttributeError from `.get` whenever schema validation is skipped.
            raise EEEAdapterError(f"{where} must be a JSON object, got {type(value).__name__}")
        return value

    record = _load(source)
    if not isinstance(record, dict):
        raise EEEAdapterError("EEE dataset must be a JSON object")
    if validate:
        _validate(record)

    model_id = _section(record.get("model_info"), "model_info").get("id")
    if not model_id:
        raise EEEAdapterError("EEE record missing model_info.id")

    results = record.get("evaluation_results")
    if not isinstance(results, list) or not results:
        raise EEEAdapterError("EEE record has no evaluation_results")

    if metric_name is not None:
        chosen = next(
            (r for r in results
             if isinstance(r, dict)
             and _pick_metric(_section(r.get("metric_config"), "evaluation_results[].metric_config"))
             == metric_name),
            None)
        if chosen is None:
            raise EEEAdapterError(f"no evaluation_result with metric {metric_name!r}")
    else:
        if eval_index < 0 or eval_index >= len(results):
            raise EEEAdapterError(f"eval_index {eval_index} out of range (0..{len(results) - 1})")
        chosen = results[eval_index]
        if not isinstance(chosen, dict):
            raise EEEAdapterError("evaluation_results item is not an object")

    metric_config = _section(chosen.get("metric_config"), "evaluation_results[].metric_config")
    score_details = _section(chosen.get("score_details"), "evaluation_results[].score_details")
    source_data = _section(chosen.get("source_data"), "evaluation_results[].source_data")
    uncertainty = _section(score_details.get("uncertainty"),
                           "evaluation_results[].score_details.uncertainty")

    suite = chosen.get("evaluation_name")
    if not suite:
        raise EEEAdapterError("evaluation_results[].evaluation_name is required")
    # dataset_name is required in EEE; falling back to the suite name is purely defensive.
    dataset_id = source_data.get("dataset_name") or str(suite)
    metric = _pick_metric(metric_config)
    score = _extract_score(score_details, metric_config)

    eval_library = _section(record.get("eval_library"), "eval_library")
    ts = timestamp or chosen.get("evaluation_timestamp") or record.get("retrieved_timestamp")
    if not ts:
        raise EEEAdapterError("no timestamp: pass timestamp= or set retrieved_timestamp/evaluation_timestamp")

    provenance = {"source": "every_eval_ever",
                  "eee_schema_version": record.get("schema_version") or _SCHEMA_VERSION}
    if eval_library.get("name"):
        provenance["harness"] = str(eval_library["name"])
    if eval_library.get("version"):
        provenance["harness_version"] = str(eval_library["version"])
    # NOTE: the EEE `evaluation_id` (format eval_name/model_id/timestamp) embeds the model id in cleartext,
    # which would defeat proofbundle's salted model commitment (a receipt is meant to hide the model). So it
    # is deliberately NOT copied into provenance — the receipt keeps the model private by design.
    if metric_config.get("metric_id"):
        provenance["metric_id"] = str(metric_config["metric_id"])
    if metric_config.get("score_type"):
        provenance["score_type"] = str(metric_config["score_type"])
    standard_error = _section(uncertainty.get("standard_error"),
                              "evaluation_results[].score_details.uncertainty.standard_error")
    se = standard_error.get("value")
    if isinstance(se, (int, float)) and not isinstance(se, bool):
        provenance["stderr"] = str(se)
    rel = _section(record.get("source_metadata"), "source_metadata").get("evaluator_relationship")
    if rel:
        provenance["evaluator_relationship"] = str(rel)

    return build_eval_claim(
        suite=str(suite), suite_version=str(eval_library.get("version") or "unknown"),
        metric=metric, comparator=comparator, threshold=threshold, score=score,
        n=int(uncertainty.get("num_samples") or 0),
        model_id=str(model_id), dataset_id=str(dataset_id), issuer="", timestamp=str(ts),
        provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""C2SP tlog-checkpoint output — a signed note over the RFC 6962 Merkle root (v0.9).
|
|
2
|
+
|
|
3
|
+
proofbundle already has an RFC 6962 Merkle root and Ed25519, so it can emit a valid C2SP tlog-checkpoint:
|
|
4
|
+
a signed note that makes a receipt witness-network / transparency-log compatible. Pure serialization and
|
|
5
|
+
framing, no new crypto. Spec verified 2026-07 against C2SP/C2SP tlog-checkpoint.md + signed-note.md.
|
|
6
|
+
|
|
7
|
+
Byte-exact rules (the ones that bite):
|
|
8
|
+
- Note text = at least three non-empty lines separated by U+000A: line 1 `origin` (a schemeless log
|
|
9
|
+
identity, no unicode spaces, no '+'), line 2 the tree size as ASCII decimal with no leading zeros
|
|
10
|
+
(empty tree = "0"), line 3 the Merkle root in STANDARD RFC 4648 §4 base64 (with padding) — NOT
|
|
11
|
+
base64url. The note text ends with a final U+000A.
|
|
12
|
+
- The signed note = note text (ending in U+000A) + one empty line + one-or-more signature lines.
|
|
13
|
+
- A signature line is: U+2014 (EM DASH, not a hyphen) SP keyname SP base64(keyID ‖ signature) U+000A
|
|
14
|
+
where keyID is 4 bytes big-endian and, for Ed25519, signature is 64 raw bytes → 68 bytes total.
|
|
15
|
+
- What is signed: the note text bytes INCLUDING the final U+000A, EXCLUDING the separating empty line.
|
|
16
|
+
Raw bytes — NO DSSE/PAE wrapping.
|
|
17
|
+
- keyID = SHA-256(keyname_bytes ‖ 0x0A ‖ 0x01 ‖ pubkey[32])[:4] (0x01 = Ed25519 signature type).
|
|
18
|
+
- vkey (to distribute the key) = keyname + "+" + hex8(keyID) + "+" + base64(0x01 ‖ pubkey[32]).
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import base64
|
|
23
|
+
import hashlib
|
|
24
|
+
from typing import Optional
|
|
25
|
+
|
|
26
|
+
from cryptography.hazmat.primitives.serialization import Encoding, PublicFormat
|
|
27
|
+
|
|
28
|
+
from .errors import BundleFormatError
|
|
29
|
+
from .signature import verify_ed25519
|
|
30
|
+
|
|
31
|
+
__all__ = ["checkpoint_note", "key_id", "vkey", "sign_checkpoint", "verify_checkpoint", "root_bytes_from_b64"]
|
|
32
|
+
|
|
33
|
+
EM_DASH = "—"
|
|
34
|
+
_ED25519_SIG_TYPE = 0x01
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _root_std_b64(root: bytes) -> str:
    """Encode the raw Merkle root as padded RFC 4648 §4 base64 ('+' and '/' alphabet).

    The checkpoint format requires the standard alphabet here; base64url must not be used.
    """
    encoded = base64.b64encode(root)
    return str(encoded, "ascii")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def checkpoint_note(origin: str, tree_size: int, root: bytes) -> str:
    """Build the C2SP checkpoint note text: three lines, each terminated by U+000A.

    Args:
        origin: Log identity written on line 1. Must be non-empty and free of '+', of any Unicode
            whitespace and of ASCII control characters (a schemeless URL).
        tree_size: Number of leaves; written on line 2 as decimal without leading zeros.
        root: Raw RFC 6962 Merkle root bytes at ``tree_size``; written on line 3 as standard base64.

    Returns:
        The note text, including its final newline.

    Raises:
        BundleFormatError: If ``origin`` or ``tree_size`` is invalid.
    """
    # str.isspace() covers every Unicode space (tab, CR, NBSP, U+2028, ...), not only U+0020 and
    # U+000A; any of those — or a raw control character — would corrupt the note's line framing.
    if not origin or "+" in origin or any(ch.isspace() or ch < " " for ch in origin):
        raise BundleFormatError("checkpoint origin must be a non-empty schemeless id without spaces or '+'")
    # bool is a subclass of int, so True/False have to be rejected explicitly.
    if isinstance(tree_size, bool) or not isinstance(tree_size, int) or tree_size < 0:
        raise BundleFormatError("checkpoint tree_size must be a non-negative integer")
    return f"{origin}\n{tree_size}\n{_root_std_b64(root)}\n"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def key_id(keyname: str, pubkey: bytes) -> bytes:
    """Return the 4-byte C2SP note key ID of an Ed25519 key.

    keyID = SHA-256(keyname ‖ 0x0A ‖ 0x01 ‖ pubkey)[:4], where 0x01 is the Ed25519 signature type.

    Args:
        keyname: Key name as it appears on the signature line and in the vkey.
        pubkey: Raw 32-byte Ed25519 public key.

    Raises:
        BundleFormatError: If ``keyname`` is empty or contains whitespace or '+', or if ``pubkey``
            is not exactly 32 bytes.
    """
    # The name is space-delimited on a signature line and '+'-delimited in a vkey, so a name that
    # contains either (or a newline) would produce output that cannot be parsed back.
    if not keyname or "+" in keyname or any(ch.isspace() for ch in keyname):
        raise BundleFormatError("checkpoint key name must be non-empty and contain no whitespace or '+'")
    if len(pubkey) != 32:
        raise BundleFormatError("Ed25519 public key must be 32 raw bytes")
    h = hashlib.sha256(keyname.encode("utf-8") + b"\n" + bytes([_ED25519_SIG_TYPE]) + pubkey).digest()
    return h[:4]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def vkey(keyname: str, pubkey: bytes) -> str:
    """Encode a verifier key for distribution as ``name+hex8(keyID)+base64(0x01 ‖ pubkey)``."""
    # The key ID is 4 bytes, so .hex() is exactly eight zero-padded lowercase hex digits.
    hex_id = key_id(keyname, pubkey).hex()
    tagged_key = bytes([_ED25519_SIG_TYPE]) + pubkey
    material = base64.b64encode(tagged_key).decode("ascii")
    return f"{keyname}+{hex_id}+{material}"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def sign_checkpoint(origin: str, tree_size: int, root: bytes, signer, keyname: str) -> str:
    """Build a checkpoint note and append one Ed25519 signature line to it.

    ``signer`` is an Ed25519 private key; its public key is what ``keyname`` must stand for. The
    signature covers the raw UTF-8 bytes of the note text, trailing newline included — it is never
    computed over base64 and never PAE-wrapped.

    Returns the signed note: note text, one empty line, then
    ``— <keyname> <base64(keyID ‖ signature)>`` terminated by a newline.
    """
    note_text = checkpoint_note(origin, tree_size, root)
    raw_pub = signer.public_key().public_bytes(Encoding.Raw, PublicFormat.Raw)
    signature = signer.sign(note_text.encode("utf-8"))
    # The 4-byte key ID is prepended to the raw signature before encoding.
    blob = key_id(keyname, raw_pub) + signature
    encoded = base64.b64encode(blob).decode("ascii")
    return f"{note_text}\n{EM_DASH} {keyname} {encoded}\n"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _parse_vkey(vkey_str: str) -> tuple[str, bytes, bytes]:
    """Split a C2SP verifier key into ``(name, key_id, pubkey)``.

    Expected form: ``name+hexKeyID+base64(0x01 ‖ pubkey)``.

    Returns:
        The key name, the 4-byte key ID declared in the vkey, and the raw 32-byte Ed25519 public key.

    Raises:
        BundleFormatError: If the string does not have three parts, the name is empty, the key
            material is not valid base64 or not ``0x01`` + 32 bytes, or the key ID is not exactly
            8 hex digits.
    """
    # The key material is standard base64, which can itself contain '+'. Since the name has no '+' (a
    # schemeless origin) and the hex keyID has none, the FIRST TWO '+' are the separators and everything
    # after is the base64 — so split with maxsplit=2, never a plain split (that would over-split the b64).
    parts = vkey_str.split("+", 2)
    if len(parts) != 3:
        raise BundleFormatError("vkey must have 3 '+'-separated parts (name+hexKeyID+base64KeyMaterial)")
    name, kid_hex, keymat_b64 = parts
    if not name:
        raise BundleFormatError("vkey key name must be non-empty")
    try:
        keymat = base64.b64decode(keymat_b64, validate=True)
    except (ValueError, TypeError) as exc:
        raise BundleFormatError("vkey key material is not valid base64") from exc
    if len(keymat) != 33 or keymat[0] != _ED25519_SIG_TYPE:
        raise BundleFormatError("vkey key material must be 0x01 followed by a 32-byte Ed25519 key")
    pubkey = keymat[1:]
    try:
        kid = bytes.fromhex(kid_hex)
    except ValueError as exc:
        raise BundleFormatError("vkey keyID is not valid hex") from exc
    # bytes.fromhex() skips ASCII whitespace and accepts any even number of digits, so both lengths
    # are checked: 8 input characters that decode to 4 bytes can only be 8 hex digits.
    if len(kid_hex) != 8 or len(kid) != 4:
        raise BundleFormatError("vkey keyID must be exactly 8 hex digits")
    return name, kid, pubkey
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def verify_checkpoint(signed_note: str, vkey_str: str) -> dict:
    """Verify a signed C2SP checkpoint against a vkey.

    The signed bytes are taken verbatim from ``signed_note`` (the text before the separating empty
    line plus its final newline); the note text is never rebuilt from the parsed fields.

    Args:
        signed_note: Note text, one empty line, then one or more signature lines.
        vkey_str: Verifier key of the form ``name+hexKeyID+base64(0x01 ‖ pubkey)``.

    Returns:
        ``{"ok", "origin", "tree_size", "root"}``. ``ok`` is True iff some signature line carries the
        vkey's key name, a keyID equal to both the vkey's keyID and the one recomputed from the name
        and public key, and an Ed25519 signature that verifies over the note-text bytes.
        ``tree_size`` is an int and ``root`` the decoded raw root bytes.

    Raises:
        BundleFormatError: If the vkey is malformed, the empty-line separator is missing, the note
            does not start with three non-empty lines, the tree size is not ASCII decimal without
            leading zeros, or the root is not valid standard base64.
    """
    name, kid_v, pubkey = _parse_vkey(vkey_str)
    # note text = everything up to (and including the \n before) the separating empty line
    if "\n\n" not in signed_note:
        raise BundleFormatError("signed note has no empty-line separator between text and signatures")
    note_text, sig_block = signed_note.split("\n\n", 1)
    note_text += "\n"  # restore the trailing newline that belongs to the note text
    note_bytes = note_text.encode("utf-8")
    lines = note_text.split("\n")
    if len(lines) < 4 or not lines[0] or not lines[1] or not lines[2]:
        raise BundleFormatError("checkpoint note must have at least 3 non-empty lines")
    origin, size_s, root_b64 = lines[0], lines[1], lines[2]
    # str.isdigit() alone also accepts non-ASCII digits: "²" would pass and then crash int(), and
    # "١٢" would be read as 12. Requiring ASCII first keeps the check and int() in agreement.
    is_ascii_decimal = size_s.isascii() and size_s.isdigit()
    if not is_ascii_decimal or (size_s != "0" and size_s.startswith("0")):
        raise BundleFormatError("checkpoint tree size must be ASCII decimal with no leading zeros")
    try:
        root = base64.b64decode(root_b64, validate=True)
    except (ValueError, TypeError) as exc:
        raise BundleFormatError("checkpoint root is not valid standard base64") from exc

    ok = False
    kid_expected = key_id(name, pubkey)
    for line in sig_block.split("\n"):
        # Lines that are not well-formed signature lines for this key are skipped, not fatal: a note
        # may carry signatures from other keys.
        if not line.startswith(EM_DASH + " "):
            continue
        lname, sep, payload_b64 = line[len(EM_DASH) + 1:].partition(" ")
        if not sep or lname != name:
            continue
        try:
            payload = base64.b64decode(payload_b64, validate=True)
        except (ValueError, TypeError):
            continue
        # 4-byte keyID + 64-byte Ed25519 signature; any other length cannot be a valid signature.
        if len(payload) != 68:
            continue
        kid, sig = payload[:4], payload[4:]
        if kid != kid_v or kid != kid_expected:  # keyID must match both the vkey and the recomputed id
            continue
        if verify_ed25519(pubkey, sig, note_bytes):
            ok = True
            break
    return {"ok": ok, "origin": origin, "tree_size": int(size_s), "root": root}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def root_bytes_from_b64(root_b64: str) -> Optional[bytes]:
    """Turn a bundle's standard-base64 Merkle root into raw bytes (the form checkpoint_note takes).

    Decoding is strict: input that is not valid standard base64 yields ``None`` rather than raising.
    """
    try:
        raw = base64.b64decode(root_b64, validate=True)
    except (TypeError, ValueError):
        return None
    else:
        return raw
|