proofbundle 0.8.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. {proofbundle-0.8.0/src/proofbundle.egg-info → proofbundle-0.9.0}/PKG-INFO +45 -20
  2. {proofbundle-0.8.0 → proofbundle-0.9.0}/README.md +44 -19
  3. {proofbundle-0.8.0 → proofbundle-0.9.0}/pyproject.toml +2 -2
  4. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/__init__.py +1 -1
  5. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/adapters/__init__.py +2 -1
  6. proofbundle-0.9.0/src/proofbundle/adapters/eee.py +175 -0
  7. proofbundle-0.9.0/src/proofbundle/checkpoint.py +157 -0
  8. proofbundle-0.9.0/src/proofbundle/dsse.py +110 -0
  9. proofbundle-0.9.0/src/proofbundle/eee_eval_schema.json +769 -0
  10. proofbundle-0.9.0/src/proofbundle/intoto.py +182 -0
  11. {proofbundle-0.8.0 → proofbundle-0.9.0/src/proofbundle.egg-info}/PKG-INFO +45 -20
  12. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle.egg-info/SOURCES.txt +7 -0
  13. proofbundle-0.9.0/tests/test_checkpoint.py +69 -0
  14. proofbundle-0.9.0/tests/test_eee.py +67 -0
  15. proofbundle-0.9.0/tests/test_intoto_dsse.py +83 -0
  16. proofbundle-0.8.0/src/proofbundle/intoto.py +0 -64
  17. {proofbundle-0.8.0 → proofbundle-0.9.0}/LICENSE +0 -0
  18. {proofbundle-0.8.0 → proofbundle-0.9.0}/setup.cfg +0 -0
  19. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/adapters/inspect_ai.py +0 -0
  20. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/adapters/lm_eval.py +0 -0
  21. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/bundle.py +0 -0
  22. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/cli.py +0 -0
  23. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/emit.py +0 -0
  24. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/errors.py +0 -0
  25. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/evalclaim.py +0 -0
  26. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/merkle.py +0 -0
  27. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/py.typed +0 -0
  28. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/sdjwt.py +0 -0
  29. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/sdjwt_issue.py +0 -0
  30. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle/signature.py +0 -0
  31. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
  32. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
  33. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle.egg-info/requires.txt +0 -0
  34. {proofbundle-0.8.0 → proofbundle-0.9.0}/src/proofbundle.egg-info/top_level.txt +0 -0
  35. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_adapters.py +0 -0
  36. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_bundle.py +0 -0
  37. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_bundle_robustness.py +0 -0
  38. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_cli.py +0 -0
  39. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_cli_eval.py +0 -0
  40. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_emit.py +0 -0
  41. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_eval_claim_schema.py +0 -0
  42. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_evalclaim.py +0 -0
  43. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_examples.py +0 -0
  44. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_intoto.py +0 -0
  45. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_merkle.py +0 -0
  46. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_merkle_property.py +0 -0
  47. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_rekor_interop.py +0 -0
  48. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_rfc6962_external_vectors.py +0 -0
  49. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_schema.py +0 -0
  50. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_sdjwt_issue.py +0 -0
  51. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_sdjwt_reference.py +0 -0
  52. {proofbundle-0.8.0 → proofbundle-0.9.0}/tests/test_signature.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.8.0
3
+ Version: 0.9.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -50,9 +50,10 @@ Dynamic: license-file
50
50
 
51
51
  <h1>proofbundle</h1>
52
52
 
53
- **Emit and verify, fully offline, portable evidence that a piece of data was
54
- signed and anchored in a tamper-evident log and optionally carries a
55
- selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
53
+ **An offline verifier for AI eval receipts. Standards-native: Ed25519 signature,
54
+ RFC 6962 transparency-log Merkle anchoring, optional SD-JWT (RFC 9901) selective
55
+ disclosure, aligned to the in-toto test-result predicate. One portable JSON file,
56
+ no server, no network.**
56
57
 
57
58
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
58
59
  [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
@@ -70,7 +71,7 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
70
71
 
71
72
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
72
73
  verify` checks one self-contained `bundle.json` with three offline cryptographic
73
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 74 tests.
74
+ checks → `OK` or `FAILED`. No network, no daemon, no home-grown crypto. 96 tests.
74
75
 
75
76
  ## Contents
76
77
 
@@ -325,24 +326,46 @@ SD-JWT selective disclosure over one portable file, offline.
325
326
  The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
326
327
  gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
327
328
  a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
328
- missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
329
- aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
330
- See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
331
-
332
- - **Two framework adapters** `pip install "proofbundle[inspect]"` reads a UK AISI
329
+ missing **signature + selective-disclosure layer** for exactly that.
330
+
331
+ **How it fits — standards-native, and honest about the neighbours.** proofbundle attests that a *claimed*
332
+ evaluation result is authentic, tamper-evident, and selectively disclosable. It does **not** attest that
333
+ the evaluation was computed correctly or that results were not cherry-picked — proving faithful
334
+ computation is the domain of TEE approaches such as
335
+ [Attestable Audits](https://arxiv.org/abs/2506.23706). It is complementary to its neighbours, named
336
+ fairly: [Every Eval Ever](https://github.com/evaleval/every_eval_ever) standardizes eval *metadata* but
337
+ adds no cryptography (proofbundle ships an EEE→receipt converter);
338
+ [OpenSSF Model Signing](https://github.com/ossf/model-signing-spec) signs *model weights*, not eval
339
+ results; [ValiChord](https://github.com/topeuph-ai/ValiChord) provides blind peer consensus and an
340
+ attested log on a Holochain network (its v1 attestation library uses a simple SHA-256 Merkle tree, no
341
+ signature, no SD-JWT, no in-toto). proofbundle is the lightweight, **standards-native** piece between them:
342
+ a portable receipt a third party verifies offline, with selective disclosure so an auditor can prove a
343
+ threshold was met without revealing the model or the data. See [INTEROP.md](INTEROP.md).
344
+
345
+ - **Three framework bridges** — `pip install "proofbundle[inspect]"` reads a UK AISI
333
346
  [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
334
347
  API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
335
348
  [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
336
- genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
337
- - **in-toto Statement v1**`proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
338
- emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
339
- digest is an *honest salted commitment* under a custom key, never `sha256` (see
340
- [PREDICATE.md](PREDICATE.md)).
341
- - **SD-JWT issuance** (RFC 9901) `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
349
+ genuine `acc,none` filter-suffix format). **`proofbundle.adapters.from_eee_dataset`** (v0.9) reads an
350
+ Every Eval Ever v0.2.2 aggregate JSON and builds a signed receipt validated against the vendored EEE
351
+ schema, with **no runtime import** of `every_eval_ever` (it needs Python 3.12; proofbundle stays 3.9+).
352
+ - **in-toto test-result export, DSSE-signed** (v0.9) — `proofbundle.intoto.export_intoto_dsse(claim,
353
+ signer)` emits the receipt as a DSSE-signed in-toto Statement v1 with the **generic
354
+ `test-result/v0.1` predicate** (result PASSED/FAILED, `configuration` ResourceDescriptors), so a generic
355
+ in-toto verifier understands it. It is provided alongside the existing self-hosted-predicate `to_intoto_statement` (see
356
+ [PREDICATE.md](PREDICATE.md)). Metric details live in `annotations` (test-result has no native metric
357
+ field); the model/dataset stay salted commitments, never `sha256`.
358
+ - **C2SP tlog-checkpoint** (v0.9) — `proofbundle.checkpoint.sign_checkpoint(origin, tree_size, root, …)`
359
+ emits a valid [C2SP](https://github.com/C2SP/C2SP/blob/main/tlog-checkpoint.md) signed note over the
360
+ RFC 6962 Merkle root, making a receipt witness-network / transparency-log compatible. Pure serialization
361
+ over the Ed25519 key already in use — no new crypto.
362
+ - **SD-JWT issuance** (RFC 9901, verified Nov 2025) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
342
363
  root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
343
- `threshold` while **withholding the exact score** and the identifier openings. The signed
344
- bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
345
- by proofbundle's own verifier **and** the `sd-jwt-python` reference.
364
+ `threshold` while **withholding the exact score** and the identifier openings. The digest mechanic is
365
+ RFC 9901 §4.2.3 (base64url of SHA-256 over the base64url-encoded Disclosure), cross-checked against the
366
+ `sd-jwt-python` reference.
367
+ The signed bundle payload is always the source of truth; the SD-JWT and the in-toto export are derived,
368
+ bundle-bound views.
346
369
 
347
370
  Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
348
371
  attestation — see [SECURITY.md](SECURITY.md).
@@ -359,8 +382,10 @@ attestation — see [SECURITY.md](SECURITY.md).
359
382
  CITATION.cff, PEP 740 attestations documented.
360
383
  - **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
361
384
  verifier robustness + CI on Python 3.9 after a holistic review.
362
- - **v0.8 (current release)** — an offline `make demo` (real eval log -> signed receipt -> verified),
385
+ - **v0.8** — an offline `make demo` (real eval log -> signed receipt -> verified),
363
386
  a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
387
+ - **v0.9 (current release)** — the standards moat: a DSSE-signed in-toto `test-result` export, a C2SP
388
+ tlog-checkpoint over the RFC 6962 root, an Every Eval Ever converter, and standards-native repositioning.
364
389
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
365
390
  Key-Binding JWT, status lists / revocation, an official in-toto PR, a full in-toto client (DSSE signing of the in-toto export shipped in v0.9).
366
391
 
@@ -7,9 +7,10 @@
7
7
 
8
8
  <h1>proofbundle</h1>
9
9
 
10
- **Emit and verify, fully offline, portable evidence that a piece of data was
11
- signed and anchored in a tamper-evident log and optionally carries a
12
- selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
10
+ **An offline verifier for AI eval receipts. Standards-native: Ed25519 signature,
11
+ RFC 6962 transparency-log Merkle anchoring, optional SD-JWT (RFC 9901) selective
12
+ disclosure, aligned to the in-toto test-result predicate. One portable JSON file,
13
+ no server, no network.**
13
14
 
14
15
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
15
16
  [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
@@ -27,7 +28,7 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
27
28
 
28
29
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
29
30
  verify` checks one self-contained `bundle.json` with three offline cryptographic
30
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 74 tests.
31
+ checks → `OK` or `FAILED`. No network, no daemon, no home-grown crypto. 96 tests.
31
32
 
32
33
  ## Contents
33
34
 
@@ -282,24 +283,46 @@ SD-JWT selective disclosure over one portable file, offline.
282
283
  The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
283
284
  gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
284
285
  a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
285
- missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
286
- aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
287
- See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
288
-
289
- - **Two framework adapters** `pip install "proofbundle[inspect]"` reads a UK AISI
286
+ missing **signature + selective-disclosure layer** for exactly that.
287
+
288
+ **How it fits — standards-native, and honest about the neighbours.** proofbundle attests that a *claimed*
289
+ evaluation result is authentic, tamper-evident, and selectively disclosable. It does **not** attest that
290
+ the evaluation was computed correctly or that results were not cherry-picked — proving faithful
291
+ computation is the domain of TEE approaches such as
292
+ [Attestable Audits](https://arxiv.org/abs/2506.23706). It is complementary to its neighbours, named
293
+ fairly: [Every Eval Ever](https://github.com/evaleval/every_eval_ever) standardizes eval *metadata* but
294
+ adds no cryptography (proofbundle ships an EEE→receipt converter);
295
+ [OpenSSF Model Signing](https://github.com/ossf/model-signing-spec) signs *model weights*, not eval
296
+ results; [ValiChord](https://github.com/topeuph-ai/ValiChord) provides blind peer consensus and an
297
+ attested log on a Holochain network (its v1 attestation library uses a simple SHA-256 Merkle tree, no
298
+ signature, no SD-JWT, no in-toto). proofbundle is the lightweight, **standards-native** piece between them:
299
+ a portable receipt a third party verifies offline, with selective disclosure so an auditor can prove a
300
+ threshold was met without revealing the model or the data. See [INTEROP.md](INTEROP.md).
301
+
302
+ - **Three framework bridges** — `pip install "proofbundle[inspect]"` reads a UK AISI
290
303
  [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
291
304
  API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
292
305
  [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
293
- genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
294
- - **in-toto Statement v1**`proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
295
- emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
296
- digest is an *honest salted commitment* under a custom key, never `sha256` (see
297
- [PREDICATE.md](PREDICATE.md)).
298
- - **SD-JWT issuance** (RFC 9901) `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
306
+ genuine `acc,none` filter-suffix format). **`proofbundle.adapters.from_eee_dataset`** (v0.9) reads an
307
+ Every Eval Ever v0.2.2 aggregate JSON and builds a signed receipt validated against the vendored EEE
308
+ schema, with **no runtime import** of `every_eval_ever` (it needs Python 3.12; proofbundle stays 3.9+).
309
+ - **in-toto test-result export, DSSE-signed** (v0.9) — `proofbundle.intoto.export_intoto_dsse(claim,
310
+ signer)` emits the receipt as a DSSE-signed in-toto Statement v1 with the **generic
311
+ `test-result/v0.1` predicate** (result PASSED/FAILED, `configuration` ResourceDescriptors), so a generic
312
+ in-toto verifier understands it. It is provided alongside the existing self-hosted-predicate `to_intoto_statement` (see
313
+ [PREDICATE.md](PREDICATE.md)). Metric details live in `annotations` (test-result has no native metric
314
+ field); the model/dataset stay salted commitments, never `sha256`.
315
+ - **C2SP tlog-checkpoint** (v0.9) — `proofbundle.checkpoint.sign_checkpoint(origin, tree_size, root, …)`
316
+ emits a valid [C2SP](https://github.com/C2SP/C2SP/blob/main/tlog-checkpoint.md) signed note over the
317
+ RFC 6962 Merkle root, making a receipt witness-network / transparency-log compatible. Pure serialization
318
+ over the Ed25519 key already in use — no new crypto.
319
+ - **SD-JWT issuance** (RFC 9901, verified Nov 2025) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
299
320
  root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
300
- `threshold` while **withholding the exact score** and the identifier openings. The signed
301
- bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
302
- by proofbundle's own verifier **and** the `sd-jwt-python` reference.
321
+ `threshold` while **withholding the exact score** and the identifier openings. The digest mechanic is
322
+ RFC 9901 §4.2.3 (base64url of SHA-256 over the base64url-encoded Disclosure), cross-checked against the
323
+ `sd-jwt-python` reference.
324
+ The signed bundle payload is always the source of truth; the SD-JWT and the in-toto export are derived,
325
+ bundle-bound views.
303
326
 
304
327
  Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
305
328
  attestation — see [SECURITY.md](SECURITY.md).
@@ -316,8 +339,10 @@ attestation — see [SECURITY.md](SECURITY.md).
316
339
  CITATION.cff, PEP 740 attestations documented.
317
340
  - **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
318
341
  verifier robustness + CI on Python 3.9 after a holistic review.
319
- - **v0.8 (current release)** — an offline `make demo` (real eval log -> signed receipt -> verified),
342
+ - **v0.8** — an offline `make demo` (real eval log -> signed receipt -> verified),
320
343
  a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
344
+ - **v0.9 (current release)** — the standards moat: a DSSE-signed in-toto `test-result` export, a C2SP
345
+ tlog-checkpoint over the RFC 6962 root, an Every Eval Ever converter, and standards-native repositioning.
321
346
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
322
347
  Key-Binding JWT, status lists / revocation, an official in-toto PR, a full in-toto client (DSSE signing of the in-toto export shipped in v0.9).
323
348
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "proofbundle"
7
- version = "0.8.0"
7
+ version = "0.9.0"
8
8
  description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -67,7 +67,7 @@ proofbundle = "proofbundle.cli:main"
67
67
  where = ["src"]
68
68
 
69
69
  [tool.setuptools.package-data]
70
- proofbundle = ["py.typed"]
70
+ proofbundle = ["py.typed", "eee_eval_schema.json"]
71
71
 
72
72
  [tool.ruff]
73
73
  line-length = 100
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
13
13
  from .errors import Check, ProofBundleError, VerificationResult
14
14
  from .merkle import verify_consistency, verify_inclusion
15
15
 
16
- __version__ = "0.8.0"
16
+ __version__ = "0.9.0"
17
17
 
18
18
  __all__ = [
19
19
  "__version__",
@@ -5,6 +5,7 @@ add no runtime dependency. The output-format mapping is bound to a framework ver
5
5
  each fixture in tests/fixtures documents its source + version.
6
6
  """
7
7
  from .inspect_ai import from_inspect_ai_log
8
+ from .eee import from_eee_dataset
8
9
  from .lm_eval import from_lm_eval_results
9
10
 
10
- __all__ = ["from_lm_eval_results", "from_inspect_ai_log"]
11
+ __all__ = ["from_lm_eval_results", "from_inspect_ai_log", "from_eee_dataset"]
@@ -0,0 +1,175 @@
1
+ """Adapter: an Every Eval Ever (EEE) dataset record → a signed proofbundle eval receipt (v0.9).
2
+
3
+ Every Eval Ever (evaleval/every_eval_ever, MIT) is the community aggregation schema for eval metadata —
4
+ it has no cryptography. This converter is strictly additive: it reads an EEE aggregate JSON and builds a
5
+ signed, selectively-disclosable proofbundle receipt from it.
6
+
7
+ IMPORTANT: `every_eval_ever` is NOT imported at runtime — it requires Python 3.12+ (pydantic/numpy/pandas/
8
+ duckdb), while proofbundle stays 3.9+. We parse the EEE JSON directly and OPTIONALLY validate it against the
9
+ vendored `eee_eval_schema.json` (schema version 0.2.2, MIT) using `jsonschema` if available.
10
+
11
+ Field mapping (verified 2026-07 against schemas/eval.schema.json v0.2.2):
12
+ - model_info.id → model_id
13
+ - evaluation_results[i].evaluation_name → suite / task
14
+ - evaluation_results[i].source_data.dataset_name → dataset_id (required in every source variant)
15
+ - metric_config.metric_name | metric_id | metric_kind → metric (all optional; fallback chain)
16
+ - score_details.score → score
17
+ - score_details.uncertainty.standard_error.value → provenance.stderr
18
+ - eval_library.{name,version} → provenance.harness / harness_version
19
+ Gotcha handled: metric_config with score_type == "levels" is an integer level index; -1 with
20
+ has_unknown_level == true means Unknown and is rejected (not silently mapped to 0).
21
+ """
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ from pathlib import Path
26
+ from typing import Optional, Union
27
+
28
+ from ..evalclaim import build_eval_claim
29
+
30
+ _SCHEMA_PATH = Path(__file__).resolve().parent.parent / "eee_eval_schema.json"
31
+ _SCHEMA_VERSION = "0.2.2"
32
+
33
+
34
+ class EEEAdapterError(ValueError):
35
+ """Raised when the EEE record is missing the expected structure — a clear error, not a bare KeyError."""
36
+
37
+
38
+ def _load(source: Union[str, Path, dict]) -> dict:
39
+ if isinstance(source, dict):
40
+ return source
41
+ try:
42
+ return json.loads(Path(source).read_text(encoding="utf-8"))
43
+ except (OSError, ValueError) as e:
44
+ raise EEEAdapterError(f"could not read EEE dataset {source!r}: {e}") from e
45
+
46
+
47
+ def _validate(record: dict) -> None:
48
+ """Best-effort schema validation against the vendored EEE schema (skipped if jsonschema/schema absent)."""
49
+ try:
50
+ import jsonschema # noqa: PLC0415
51
+ except ImportError:
52
+ return
53
+ if not _SCHEMA_PATH.is_file():
54
+ return
55
+ schema = json.loads(_SCHEMA_PATH.read_text(encoding="utf-8"))
56
+ try:
57
+ jsonschema.validate(record, schema)
58
+ except jsonschema.ValidationError as e:
59
+ raise EEEAdapterError(f"EEE record does not validate against schema {_SCHEMA_VERSION}: {e.message}") from e
60
+
61
+
62
+ def _num_to_decimal_str(x) -> str:
63
+ """Format a JSON number as a plain decimal string (no exponent) for build_eval_claim's pattern."""
64
+ if isinstance(x, bool) or not isinstance(x, (int, float)):
65
+ raise EEEAdapterError(f"score must be a number, got {type(x).__name__}")
66
+ if isinstance(x, int):
67
+ return str(x)
68
+ if x != x or x in (float("inf"), float("-inf")): # NaN/Inf
69
+ raise EEEAdapterError("score must be finite")
70
+ s = repr(x)
71
+ if "e" in s or "E" in s: # avoid exponent form (build_eval_claim rejects it)
72
+ s = f"{x:.12f}".rstrip("0").rstrip(".")
73
+ return s
74
+
75
+
76
+ def _pick_metric(metric_config: dict) -> str:
77
+ for key in ("metric_name", "metric_id", "metric_kind"):
78
+ v = metric_config.get(key)
79
+ if isinstance(v, str) and v:
80
+ return v
81
+ return "score"
82
+
83
+
84
+ def _extract_score(score_details: dict, metric_config: dict) -> str:
85
+ if "score" not in score_details:
86
+ raise EEEAdapterError("evaluation_results[].score_details.score is required")
87
+ raw = score_details["score"]
88
+ if metric_config.get("score_type") == "levels":
89
+ if not isinstance(raw, (int, float)) or isinstance(raw, bool):
90
+ raise EEEAdapterError("levels score must be an integer level index")
91
+ idx = int(raw)
92
+ if idx == -1 and metric_config.get("has_unknown_level"):
93
+ raise EEEAdapterError("levels score is -1 (Unknown) — cannot build a threshold claim")
94
+ return str(idx)
95
+ return _num_to_decimal_str(raw)
96
+
97
+
98
+ def from_eee_dataset(source: Union[str, Path, dict], *, comparator: str, threshold: str,
99
+ timestamp: Optional[str] = None, eval_index: int = 0, metric_name: Optional[str] = None,
100
+ model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None,
101
+ validate: bool = True):
102
+ """Read an EEE dataset record and build a proofbundle eval claim for one evaluation result.
103
+
104
+ `comparator`/`threshold` set the pass/fail assertion (EEE stores the raw score, not a threshold verdict).
105
+ `eval_index` selects which of `evaluation_results` to use; `metric_name` instead selects the first result
106
+ whose metric matches. Returns (claim, salts). Raises EEEAdapterError on a malformed record.
107
+ """
108
+ record = _load(source)
109
+ if not isinstance(record, dict):
110
+ raise EEEAdapterError("EEE dataset must be a JSON object")
111
+ if validate:
112
+ _validate(record)
113
+
114
+ model_info = record.get("model_info") or {}
115
+ model_id = model_info.get("id")
116
+ if not model_id:
117
+ raise EEEAdapterError("EEE record missing model_info.id")
118
+
119
+ results = record.get("evaluation_results")
120
+ if not isinstance(results, list) or not results:
121
+ raise EEEAdapterError("EEE record has no evaluation_results")
122
+
123
+ if metric_name is not None:
124
+ chosen = next((r for r in results if isinstance(r, dict)
125
+ and _pick_metric(r.get("metric_config") or {}) == metric_name), None)
126
+ if chosen is None:
127
+ raise EEEAdapterError(f"no evaluation_result with metric {metric_name!r}")
128
+ else:
129
+ if eval_index < 0 or eval_index >= len(results):
130
+ raise EEEAdapterError(f"eval_index {eval_index} out of range (0..{len(results) - 1})")
131
+ chosen = results[eval_index]
132
+ if not isinstance(chosen, dict):
133
+ raise EEEAdapterError("evaluation_results item is not an object")
134
+
135
+ metric_config = chosen.get("metric_config") or {}
136
+ score_details = chosen.get("score_details") or {}
137
+ source_data = chosen.get("source_data") or {}
138
+
139
+ suite = chosen.get("evaluation_name")
140
+ if not suite:
141
+ raise EEEAdapterError("evaluation_results[].evaluation_name is required")
142
+ dataset_id = source_data.get("dataset_name") or str(suite) # dataset_name is required in EEE; defensive fallback
143
+ metric = _pick_metric(metric_config)
144
+ score = _extract_score(score_details, metric_config)
145
+
146
+ eval_library = record.get("eval_library") or {}
147
+ ts = timestamp or chosen.get("evaluation_timestamp") or record.get("retrieved_timestamp")
148
+ if not ts:
149
+ raise EEEAdapterError("no timestamp: pass timestamp= or set retrieved_timestamp/evaluation_timestamp")
150
+
151
+ provenance = {"source": "every_eval_ever", "eee_schema_version": record.get("schema_version") or _SCHEMA_VERSION}
152
+ if eval_library.get("name"):
153
+ provenance["harness"] = str(eval_library["name"])
154
+ if eval_library.get("version"):
155
+ provenance["harness_version"] = str(eval_library["version"])
156
+ # NOTE: the EEE `evaluation_id` (format eval_name/model_id/timestamp) embeds the model id in cleartext,
157
+ # which would defeat proofbundle's salted model commitment (a receipt is meant to hide the model). So it
158
+ # is deliberately NOT copied into provenance — the receipt keeps the model private by design.
159
+ if metric_config.get("metric_id"):
160
+ provenance["metric_id"] = str(metric_config["metric_id"])
161
+ if metric_config.get("score_type"):
162
+ provenance["score_type"] = str(metric_config["score_type"])
163
+ se = ((score_details.get("uncertainty") or {}).get("standard_error") or {}).get("value")
164
+ if isinstance(se, (int, float)) and not isinstance(se, bool):
165
+ provenance["stderr"] = str(se)
166
+ rel = (record.get("source_metadata") or {}).get("evaluator_relationship")
167
+ if rel:
168
+ provenance["evaluator_relationship"] = str(rel)
169
+
170
+ return build_eval_claim(
171
+ suite=str(suite), suite_version=str(eval_library.get("version") or "unknown"),
172
+ metric=metric, comparator=comparator, threshold=threshold, score=score,
173
+ n=int((score_details.get("uncertainty") or {}).get("num_samples") or 0),
174
+ model_id=str(model_id), dataset_id=str(dataset_id), issuer="", timestamp=str(ts),
175
+ provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
@@ -0,0 +1,157 @@
1
+ """C2SP tlog-checkpoint output — a signed note over the RFC 6962 Merkle root (v0.9).
2
+
3
+ proofbundle already has an RFC 6962 Merkle root and Ed25519, so it can emit a valid C2SP tlog-checkpoint:
4
+ a signed note that makes a receipt witness-network / transparency-log compatible. Pure serialization and
5
+ framing, no new crypto. Spec verified 2026-07 against C2SP/C2SP tlog-checkpoint.md + signed-note.md.
6
+
7
+ Byte-exact rules (the ones that bite):
8
+ - Note text = at least three non-empty lines separated by U+000A: line 1 `origin` (a schemeless log
9
+ identity, no unicode spaces, no '+'), line 2 the tree size as ASCII decimal with no leading zeros
10
+ (empty tree = "0"), line 3 the Merkle root in STANDARD RFC 4648 §4 base64 (with padding) — NOT
11
+ base64url. The note text ends with a final U+000A.
12
+ - The signed note = note text (ending in U+000A) + one empty line + one-or-more signature lines.
13
+ - A signature line is: U+2014 (EM DASH, not a hyphen) SP keyname SP base64(keyID ‖ signature) U+000A
14
+ where keyID is 4 bytes big-endian and, for Ed25519, signature is 64 raw bytes → 68 bytes total.
15
+ - What is signed: the note text bytes INCLUDING the final U+000A, EXCLUDING the separating empty line.
16
+ Raw bytes — NO DSSE/PAE wrapping.
17
+ - keyID = SHA-256(keyname_bytes ‖ 0x0A ‖ 0x01 ‖ pubkey[32])[:4] (0x01 = Ed25519 signature type).
18
+ - vkey (to distribute the key) = keyname + "+" + hex8(keyID) + "+" + base64(0x01 ‖ pubkey[32]).
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import base64
23
+ import hashlib
24
+ from typing import Optional
25
+
26
+ from cryptography.hazmat.primitives.serialization import Encoding, PublicFormat
27
+
28
+ from .errors import BundleFormatError
29
+ from .signature import verify_ed25519
30
+
31
+ __all__ = ["checkpoint_note", "key_id", "vkey", "sign_checkpoint", "verify_checkpoint", "root_bytes_from_b64"]
32
+
33
# U+2014 EM DASH: the first character of every signed-note signature line. Spelled as an escape so it
# cannot be confused with (or silently replaced by) a hyphen or en dash in an editor.
EM_DASH = "\u2014"
# Signature-type byte that identifies Ed25519 inside key IDs and verifier keys.
_ED25519_SIG_TYPE = 1
35
+
36
+
37
def _root_std_b64(root: bytes) -> str:
    """Encode the raw Merkle root as standard (RFC 4648 section 4) padded base64 text.

    The standard alphabet ('+' and '/') is used on purpose; this is NOT base64url.
    """
    encoded = base64.standard_b64encode(root)
    return encoded.decode("ascii")
40
+
41
+
42
def checkpoint_note(origin: str, tree_size: int, root: bytes) -> str:
    """Build the C2SP checkpoint note text: three lines followed by a trailing newline.

    Args:
        origin: Log identity written on line 1. Must be a non-empty string with no whitespace of any
            kind (ASCII or Unicode spaces, tabs, newlines) and no '+'.
        tree_size: Tree size written on line 2 as ASCII decimal. Must be a non-negative int
            (``bool`` is rejected even though it subclasses ``int``).
        root: Raw Merkle root bytes at ``tree_size``; written on line 3 as standard padded base64.
            Must be non-empty, otherwise line 3 would be empty and the note would not verify.

    Returns:
        The note text: origin, tree size and base64 root, each terminated by a newline.

    Raises:
        BundleFormatError: if ``origin``, ``tree_size`` or ``root`` is invalid.
    """
    # str.isspace() covers U+0020 and "\n" (the original checks) plus tabs, "\r" and Unicode spaces.
    origin_ok = (
        isinstance(origin, str)
        and bool(origin)
        and "+" not in origin
        and not any(ch.isspace() for ch in origin)
    )
    if not origin_ok:
        raise BundleFormatError("checkpoint origin must be a non-empty schemeless id without spaces or '+'")
    if isinstance(tree_size, bool) or not isinstance(tree_size, int) or tree_size < 0:
        raise BundleFormatError("checkpoint tree_size must be a non-negative integer")
    if not isinstance(root, (bytes, bytearray, memoryview)) or len(root) == 0:
        raise BundleFormatError("checkpoint root must be non-empty raw bytes")
    return f"{origin}\n{tree_size}\n{_root_std_b64(bytes(root))}\n"
50
+
51
+
52
def key_id(keyname: str, pubkey: bytes) -> bytes:
    """Return the 4-byte C2SP note key ID for an Ed25519 key.

    The ID is the first four bytes of SHA-256 over: the UTF-8 key name, a newline byte, the Ed25519
    signature-type byte, and the 32-byte raw public key, in that order.

    Raises:
        BundleFormatError: if ``pubkey`` is not exactly 32 bytes long.
    """
    if len(pubkey) != 32:
        raise BundleFormatError("Ed25519 public key must be 32 raw bytes")
    hasher = hashlib.sha256()
    hasher.update(keyname.encode("utf-8"))
    hasher.update(b"\n")
    hasher.update(bytes([_ED25519_SIG_TYPE]))
    hasher.update(pubkey)
    return hasher.digest()[:4]
58
+
59
+
60
def vkey(keyname: str, pubkey: bytes) -> str:
    """Encode a C2SP verifier key: ``<name>+<8 hex digits of key ID>+<base64(type byte + pubkey)>``.

    The key ID comes from ``key_id``, which also enforces the 32-byte public-key length.
    """
    # key_id always returns 4 bytes, so .hex() yields the zero-padded, lowercase 8-digit form.
    id_hex = key_id(keyname, pubkey).hex()
    typed_key = bytes([_ED25519_SIG_TYPE]) + pubkey
    material_b64 = base64.b64encode(typed_key).decode("ascii")
    return f"{keyname}+{id_hex}+{material_b64}"
66
+
67
+
68
def sign_checkpoint(origin: str, tree_size: int, root: bytes, signer, keyname: str) -> str:
    """Produce a signed C2SP checkpoint note (note text, blank line, one signature line).

    Args:
        origin, tree_size, root: Passed to ``checkpoint_note`` to build the note text.
        signer: Private-key object exposing ``sign(data)`` and ``public_key().public_bytes(...)``;
            expected to be an Ed25519 private key.
        keyname: Name written on the signature line and mixed into the key ID. Must be a non-empty
            string with no whitespace and no '+': the signature line is space-delimited and the
            vkey encoding is '+'-delimited, so such a name would yield output that cannot be parsed.

    Returns:
        The signed note as text, ending with the signature line's newline.

    Raises:
        BundleFormatError: if ``keyname`` is invalid, or if ``checkpoint_note`` / ``key_id`` reject
            their inputs.
    """
    keyname_ok = (
        isinstance(keyname, str)
        and bool(keyname)
        and "+" not in keyname
        and not any(ch.isspace() for ch in keyname)
    )
    if not keyname_ok:
        raise BundleFormatError("checkpoint key name must be non-empty without spaces or '+'")

    note = checkpoint_note(origin, tree_size, root)
    pubkey = signer.public_key().public_bytes(Encoding.Raw, PublicFormat.Raw)
    # The signature covers the RAW note-text bytes including the trailing newline — never base64,
    # never PAE-wrapped, and not the separating blank line.
    signature = signer.sign(note.encode("utf-8"))
    payload_b64 = base64.b64encode(key_id(keyname, pubkey) + signature).decode("ascii")
    sig_line = f"{EM_DASH} {keyname} {payload_b64}\n"
    # The extra "\n" is the empty line separating note text from signatures.
    return note + "\n" + sig_line
79
+
80
+
81
def _parse_vkey(vkey_str: str) -> tuple[str, bytes, bytes]:
    """Split a C2SP verifier key into ``(name, key_id_bytes, raw_public_key)``.

    Expected form: ``name+hexKeyID+base64(type byte + 32-byte Ed25519 key)``. The key ID is returned
    as parsed; this function does not check that it matches the name and public key.

    Raises:
        BundleFormatError: if the string does not have three parts, the name is empty, the key
            material is not valid base64 or not an Ed25519 key, or the key ID is not 8 hex digits.
    """
    # The key material is standard base64, which can itself contain '+'. The name and the hex key ID
    # contain none, so the FIRST TWO '+' are the separators and everything after is base64: split
    # with maxsplit=2, never a plain split (that would over-split the base64).
    parts = vkey_str.split("+", 2)
    if len(parts) != 3:
        raise BundleFormatError("vkey must have 3 '+'-separated parts (name+hexKeyID+base64KeyMaterial)")
    name, kid_hex, keymat_b64 = parts
    if not name:
        raise BundleFormatError("vkey key name must be non-empty")
    try:
        keymat = base64.b64decode(keymat_b64, validate=True)
    except (ValueError, TypeError) as exc:
        raise BundleFormatError("vkey key material is not valid base64") from exc
    if len(keymat) != 33 or keymat[0] != _ED25519_SIG_TYPE:
        raise BundleFormatError("vkey key material must be 0x01 followed by a 32-byte Ed25519 key")
    pubkey = keymat[1:]
    try:
        kid = bytes.fromhex(kid_hex)
    except ValueError as exc:
        raise BundleFormatError("vkey keyID is not valid hex") from exc
    # bytes.fromhex tolerates embedded whitespace and any even length, so enforce the exact
    # 8-hex-digit / 4-byte form separately.
    if len(kid_hex) != 8 or len(kid) != 4:
        raise BundleFormatError("vkey keyID must be exactly 8 hex digits (4 bytes)")
    return name, kid, pubkey
101
+
102
+
103
def verify_checkpoint(signed_note: str, vkey_str: str) -> dict:
    """Verify a signed C2SP checkpoint against a verifier key.

    The signature is checked over the note text exactly as received (everything before the first
    empty line, plus its trailing newline); the text is never rebuilt from the parsed fields.

    Args:
        signed_note: Note text, an empty line, then one or more signature lines.
        vkey_str: Verifier key in ``name+hexKeyID+base64KeyMaterial`` form.

    Returns:
        ``{"ok", "origin", "tree_size", "root"}``. ``ok`` is True iff some signature line carries the
        vkey's name and a key ID equal to both the vkey's ID and the ID recomputed from the name and
        public key, and its Ed25519 signature verifies over the note-text bytes. ``root`` is the
        decoded raw root bytes.

    Raises:
        BundleFormatError: if the vkey is malformed, the separator line is missing, the note has
            fewer than three non-empty lines, the tree size is not canonical ASCII decimal, or the
            root is not valid standard base64.
    """
    name, kid_v, pubkey = _parse_vkey(vkey_str)
    if "\n\n" not in signed_note:
        raise BundleFormatError("signed note has no empty-line separator between text and signatures")
    note_text, sig_block = signed_note.split("\n\n", 1)
    note_text += "\n"  # restore the trailing newline that belongs to the note text
    note_bytes = note_text.encode("utf-8")

    lines = note_text.split("\n")
    if len(lines) < 4 or not lines[0] or not lines[1] or not lines[2]:
        raise BundleFormatError("checkpoint note must have at least 3 non-empty lines")
    origin, size_s, root_b64 = lines[:3]
    # str.isdigit() alone also accepts non-ASCII digits (e.g. superscripts, Arabic-Indic digits),
    # which would either crash int() below or be accepted as a non-canonical size — require ASCII.
    is_ascii_decimal = size_s.isascii() and size_s.isdigit()
    has_leading_zero = len(size_s) > 1 and size_s[0] == "0"
    if not is_ascii_decimal or has_leading_zero:
        raise BundleFormatError("checkpoint tree size must be ASCII decimal with no leading zeros")
    try:
        root = base64.b64decode(root_b64, validate=True)
    except (ValueError, TypeError) as exc:
        raise BundleFormatError("checkpoint root is not valid standard base64") from exc

    ok = False
    kid_expected = key_id(name, pubkey)
    sig_prefix = EM_DASH + " "
    payload_len = 4 + 64  # 4-byte key ID followed by a 64-byte raw Ed25519 signature
    for line in sig_block.split("\n"):
        if not line.startswith(sig_prefix):
            continue
        lname, sep, payload_b64 = line[len(sig_prefix):].partition(" ")
        if not sep or lname != name:
            continue
        try:
            payload = base64.b64decode(payload_b64, validate=True)
        except (ValueError, TypeError):
            continue
        # Anything that is not exactly keyID + Ed25519 signature cannot be ours; skip it rather
        # than hand a wrong-length signature to the verifier.
        if len(payload) != payload_len:
            continue
        kid, sig = payload[:4], payload[4:]
        if kid != kid_v or kid != kid_expected:  # keyID must match both the vkey and the recomputed id
            continue
        if verify_ed25519(pubkey, sig, note_bytes):
            ok = True
            break
    return {"ok": ok, "origin": origin, "tree_size": int(size_s), "root": root}
150
+
151
+
152
def root_bytes_from_b64(root_b64: str) -> Optional[bytes]:
    """Decode a standard-base64 Merkle root to raw bytes, e.g. for feeding into ``checkpoint_note``.

    Decoding is strict (``validate=True``). Returns ``None`` instead of raising when the input is
    not valid standard base64 or is of an unsupported type.
    """
    try:
        decoded = base64.b64decode(root_b64, validate=True)
    except (ValueError, TypeError):
        decoded = None
    return decoded