proofbundle 0.3.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {proofbundle-0.3.0/src/proofbundle.egg-info → proofbundle-0.4.1}/PKG-INFO +37 -20
  2. {proofbundle-0.3.0 → proofbundle-0.4.1}/README.md +32 -19
  3. {proofbundle-0.3.0 → proofbundle-0.4.1}/pyproject.toml +8 -2
  4. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle/__init__.py +1 -1
  5. proofbundle-0.4.1/src/proofbundle/adapters/__init__.py +10 -0
  6. proofbundle-0.4.1/src/proofbundle/adapters/inspect_ai.py +36 -0
  7. proofbundle-0.4.1/src/proofbundle/adapters/lm_eval.py +32 -0
  8. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle/cli.py +66 -12
  9. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle/emit.py +2 -29
  10. proofbundle-0.4.1/src/proofbundle/evalclaim.py +212 -0
  11. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle/sdjwt.py +1 -1
  12. {proofbundle-0.3.0 → proofbundle-0.4.1/src/proofbundle.egg-info}/PKG-INFO +37 -20
  13. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle.egg-info/SOURCES.txt +9 -0
  14. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle.egg-info/requires.txt +6 -0
  15. proofbundle-0.4.1/tests/test_adapters.py +32 -0
  16. proofbundle-0.4.1/tests/test_cli_eval.py +39 -0
  17. proofbundle-0.4.1/tests/test_eval_claim_schema.py +36 -0
  18. proofbundle-0.4.1/tests/test_evalclaim.py +107 -0
  19. proofbundle-0.4.1/tests/test_sdjwt_reference.py +39 -0
  20. {proofbundle-0.3.0 → proofbundle-0.4.1}/LICENSE +0 -0
  21. {proofbundle-0.3.0 → proofbundle-0.4.1}/setup.cfg +0 -0
  22. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle/bundle.py +0 -0
  23. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle/errors.py +0 -0
  24. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle/merkle.py +0 -0
  25. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle/py.typed +0 -0
  26. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle/signature.py +0 -0
  27. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle.egg-info/dependency_links.txt +0 -0
  28. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle.egg-info/entry_points.txt +0 -0
  29. {proofbundle-0.3.0 → proofbundle-0.4.1}/src/proofbundle.egg-info/top_level.txt +0 -0
  30. {proofbundle-0.3.0 → proofbundle-0.4.1}/tests/test_bundle.py +0 -0
  31. {proofbundle-0.3.0 → proofbundle-0.4.1}/tests/test_cli.py +0 -0
  32. {proofbundle-0.3.0 → proofbundle-0.4.1}/tests/test_emit.py +0 -0
  33. {proofbundle-0.3.0 → proofbundle-0.4.1}/tests/test_merkle.py +0 -0
  34. {proofbundle-0.3.0 → proofbundle-0.4.1}/tests/test_merkle_property.py +0 -0
  35. {proofbundle-0.3.0 → proofbundle-0.4.1}/tests/test_rekor_interop.py +0 -0
  36. {proofbundle-0.3.0 → proofbundle-0.4.1}/tests/test_rfc6962_external_vectors.py +0 -0
  37. {proofbundle-0.3.0 → proofbundle-0.4.1}/tests/test_schema.py +0 -0
  38. {proofbundle-0.3.0 → proofbundle-0.4.1}/tests/test_signature.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -24,6 +24,9 @@ Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: cryptography>=42
26
26
  Provides-Extra: sdjwt
27
+ Provides-Extra: eval
28
+ Requires-Dist: rfc8785>=0.1.4; extra == "eval"
29
+ Provides-Extra: adapters
27
30
  Provides-Extra: dev
28
31
  Requires-Dist: pytest>=7; extra == "dev"
29
32
  Requires-Dist: ruff>=0.5; extra == "dev"
@@ -31,6 +34,7 @@ Requires-Dist: jsonschema>=4; extra == "dev"
31
34
  Requires-Dist: mypy>=1.8; extra == "dev"
32
35
  Requires-Dist: build>=1; extra == "dev"
33
36
  Requires-Dist: hypothesis>=6; extra == "dev"
37
+ Requires-Dist: rfc8785>=0.1.4; extra == "dev"
34
38
  Dynamic: license-file
35
39
 
36
40
  <div align="center">
@@ -57,7 +61,7 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
57
61
 
58
62
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
59
63
  verify` checks one self-contained `bundle.json` with three offline cryptographic
60
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 25 tests.
64
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 50 tests.
61
65
 
62
66
  ## Contents
63
67
 
@@ -68,6 +72,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 25 tests.
68
72
  - [Quickstart](#quickstart)
69
73
  - [Interoperability](#interoperability)
70
74
  - [Bundle format](#bundle-format-proofbundlev01)
75
+ - [Eval receipts](#eval-receipts)
71
76
  - [Security notes and scope](#security-notes-and-scope-stated-honestly)
72
77
  - [Roadmap](#roadmap)
73
78
  - [Contributing](#contributing)
@@ -236,12 +241,12 @@ string uses base64url as per the spec.
236
241
 
237
242
  ## Security notes and scope, stated honestly
238
243
 
239
- This is v0.1. It does exactly what it says and no more:
244
+ The scope is deliberately narrow. It does exactly what it says and no more:
240
245
 
241
246
  - Ed25519 signatures only, for both the payload and the optional SD-JWT issuer
242
247
  signature.
243
248
  - SD-JWT: the SD-JWT core is now [RFC 9901](https://datatracker.ietf.org/doc/rfc9901/)
244
- (Dec 2025); this verifies that every presented disclosure is committed in the
249
+ (November 2025); this verifies that every presented disclosure is committed in the
245
250
  issuer-signed payload, and the issuer signature (EdDSA) if a key is supplied. It
246
251
  does **not** verify a Key Binding JWT, an X.509 or trust-list chain, status
247
252
  lists, or `vct` type metadata. **SD-JWT VC** (the credential-type profile) is
@@ -255,25 +260,37 @@ This is v0.1. It does exactly what it says and no more:
255
260
  If you find a correctness or security issue, please open an issue or see
256
261
  [SECURITY.md](SECURITY.md).
257
262
 
263
+ ## Eval receipts
264
+
265
+ Since v0.4, proofbundle turns a reproducible eval run into a signed, Merkle-anchored
266
+ **receipt** that proves *suite S `comparator` threshold T, passed* while carrying only
267
+ **salted commitments** to the model and dataset identifiers — never the weights, the
268
+ data, or the plaintext names. A third party verifies the threshold was met, offline,
269
+ from one file, without ever seeing the model or the test set.
270
+
271
+ ```bash
272
+ pip install "proofbundle[eval]" # emit side needs an RFC 8785 canonicalizer
273
+ proofbundle emit-eval --claim claim.json --out receipt.json --new-key signer.key
274
+ proofbundle verify receipt.json # a receipt is a normal bundle
275
+ proofbundle show-eval receipt.json # verify + print the claim (issuer-bound)
276
+ ```
277
+
278
+ The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
279
+ RFC 8785 JCS canonicalization, the verify path stays dependency-free. **Honest scope:**
280
+ a receipt proves `passed` against `threshold` and hides the model/dataset via salted
281
+ commitments — it does **not** prove the evaluation was well designed or that the score
282
+ itself is correct. Those are human judgements; what it removes is the need to simply
283
+ trust the number.
284
+
258
285
  ## Roadmap
259
286
 
260
287
  - **v0.1** — the offline verifier plus a real example bundle.
261
- - **v0.2 (current release)** — the emitter: `emit_bundle` signs a payload with
262
- Ed25519 and anchors it as the last leaf of an RFC 6962 Merkle tree, producing
263
- a bundle that `verify_bundle` accepts. Available as `proofbundle emit`.
264
- - **v0.3** — an eval-receipt emitter: wrap one evaluation framework run
265
- ([Inspect AI](https://github.com/UKGovernmentBEIS/inspect_ai),
266
- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness))
267
- into a signed receipt whose payload is a minimal canonical claim, for example
268
- `{"suite": "...", "threshold": 0.8, "passed": true}`, optionally wrapped as an
269
- SD-JWT VC so a holder can disclose *passed above threshold* without revealing
270
- the model, weights or dataset, and carrying a cluster-bootstrap confidence
271
- interval, a multiple-testing correction and a preregistration hash.
272
-
273
- That last step is the point: today no widely used AI project turns a
274
- reproducible evaluation result into a signed, third-party-verifiable,
275
- selectively disclosable receipt. This repository is the trustworthy verification
276
- core that makes it possible.
288
+ - **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
289
+ - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
290
+ - **v0.4 (current release)** — the eval-receipt emitter (`emit_eval_receipt` /
291
+ `proofbundle emit-eval`), salted commitments, issuer binding, file-based adapters.
292
+ - **v0.5** — selective disclosure of the exact score via SD-JWT **issuance** (the issuer
293
+ reveals identifier + salt on demand) and full SD-JWT VC conformance.
277
294
 
278
295
  ## Contributing
279
296
 
@@ -22,7 +22,7 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
22
22
 
23
23
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
24
24
  verify` checks one self-contained `bundle.json` with three offline cryptographic
25
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 25 tests.
25
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 50 tests.
26
26
 
27
27
  ## Contents
28
28
 
@@ -33,6 +33,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 25 tests.
33
33
  - [Quickstart](#quickstart)
34
34
  - [Interoperability](#interoperability)
35
35
  - [Bundle format](#bundle-format-proofbundlev01)
36
+ - [Eval receipts](#eval-receipts)
36
37
  - [Security notes and scope](#security-notes-and-scope-stated-honestly)
37
38
  - [Roadmap](#roadmap)
38
39
  - [Contributing](#contributing)
@@ -201,12 +202,12 @@ string uses base64url as per the spec.
201
202
 
202
203
  ## Security notes and scope, stated honestly
203
204
 
204
- This is v0.1. It does exactly what it says and no more:
205
+ The scope is deliberately narrow. It does exactly what it says and no more:
205
206
 
206
207
  - Ed25519 signatures only, for both the payload and the optional SD-JWT issuer
207
208
  signature.
208
209
  - SD-JWT: the SD-JWT core is now [RFC 9901](https://datatracker.ietf.org/doc/rfc9901/)
209
- (Dec 2025); this verifies that every presented disclosure is committed in the
210
+ (November 2025); this verifies that every presented disclosure is committed in the
210
211
  issuer-signed payload, and the issuer signature (EdDSA) if a key is supplied. It
211
212
  does **not** verify a Key Binding JWT, an X.509 or trust-list chain, status
212
213
  lists, or `vct` type metadata. **SD-JWT VC** (the credential-type profile) is
@@ -220,25 +221,37 @@ This is v0.1. It does exactly what it says and no more:
220
221
  If you find a correctness or security issue, please open an issue or see
221
222
  [SECURITY.md](SECURITY.md).
222
223
 
224
+ ## Eval receipts
225
+
226
+ Since v0.4, proofbundle turns a reproducible eval run into a signed, Merkle-anchored
227
+ **receipt** that proves *suite S `comparator` threshold T, passed* while carrying only
228
+ **salted commitments** to the model and dataset identifiers — never the weights, the
229
+ data, or the plaintext names. A third party verifies the threshold was met, offline,
230
+ from one file, without ever seeing the model or the test set.
231
+
232
+ ```bash
233
+ pip install "proofbundle[eval]" # emit side needs an RFC 8785 canonicalizer
234
+ proofbundle emit-eval --claim claim.json --out receipt.json --new-key signer.key
235
+ proofbundle verify receipt.json # a receipt is a normal bundle
236
+ proofbundle show-eval receipt.json # verify + print the claim (issuer-bound)
237
+ ```
238
+
239
+ The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
240
+ RFC 8785 JCS canonicalization, the verify path stays dependency-free. **Honest scope:**
241
+ a receipt proves `passed` against `threshold` and hides the model/dataset via salted
242
+ commitments — it does **not** prove the evaluation was well designed or that the score
243
+ itself is correct. Those are human judgements; what it removes is the need to simply
244
+ trust the number.
245
+
223
246
  ## Roadmap
224
247
 
225
248
  - **v0.1** — the offline verifier plus a real example bundle.
226
- - **v0.2 (current release)** — the emitter: `emit_bundle` signs a payload with
227
- Ed25519 and anchors it as the last leaf of an RFC 6962 Merkle tree, producing
228
- a bundle that `verify_bundle` accepts. Available as `proofbundle emit`.
229
- - **v0.3** — an eval-receipt emitter: wrap one evaluation framework run
230
- ([Inspect AI](https://github.com/UKGovernmentBEIS/inspect_ai),
231
- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness))
232
- into a signed receipt whose payload is a minimal canonical claim, for example
233
- `{"suite": "...", "threshold": 0.8, "passed": true}`, optionally wrapped as an
234
- SD-JWT VC so a holder can disclose *passed above threshold* without revealing
235
- the model, weights or dataset, and carrying a cluster-bootstrap confidence
236
- interval, a multiple-testing correction and a preregistration hash.
237
-
238
- That last step is the point: today no widely used AI project turns a
239
- reproducible evaluation result into a signed, third-party-verifiable,
240
- selectively disclosable receipt. This repository is the trustworthy verification
241
- core that makes it possible.
249
+ - **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
250
+ - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
251
+ - **v0.4 (current release)** — the eval-receipt emitter (`emit_eval_receipt` /
252
+ `proofbundle emit-eval`), salted commitments, issuer binding, file-based adapters.
253
+ - **v0.5** — selective disclosure of the exact score via SD-JWT **issuance** (the issuer
254
+ reveals identifier + salt on demand) and full SD-JWT VC conformance.
242
255
 
243
256
  ## Contributing
244
257
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "proofbundle"
7
- version = "0.3.0"
7
+ version = "0.4.1"
8
8
  description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -39,7 +39,13 @@ dependencies = ["cryptography>=42"]
39
39
  # `cryptography` (EdDSA + stdlib), so `pip install proofbundle[sdjwt]` keeps the trusted core
40
40
  # lean; the extra documents intent and is forward-compatible if SD-JWT ever needs a heavier lib.
41
41
  sdjwt = []
42
- dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6"]
42
+ # EMITTING eval receipts needs a real RFC 8785 JCS canonicalizer (emit path only). The VERIFY
43
+ # path (verify_bundle / decode_eval_claim) never canonicalizes — it checks stored bytes — so the
44
+ # verifier stays dependency-free. `pip install proofbundle[eval]` adds emit-side canonicalization.
45
+ eval = ["rfc8785>=0.1.4"]
46
+ # Framework adapters read exported result JSON only (no framework import) → pure stdlib today.
47
+ adapters = []
48
+ dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6", "rfc8785>=0.1.4"]
43
49
 
44
50
  [project.urls]
45
51
  Homepage = "https://b7n0de.com"
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
13
13
  from .errors import Check, ProofBundleError, VerificationResult
14
14
  from .merkle import verify_consistency, verify_inclusion
15
15
 
16
- __version__ = "0.3.0"
16
+ __version__ = "0.4.1"
17
17
 
18
18
  __all__ = [
19
19
  "__version__",
@@ -0,0 +1,10 @@
1
+ """Adapters that map an eval framework's EXPORTED result JSON to an eval claim.
2
+
3
+ Each adapter reads a result file from disk and never imports the framework, so they
4
+ add no runtime dependency. The output-format mapping is bound to a framework version;
5
+ each fixture in tests/fixtures documents its source + version.
6
+ """
7
+ from .inspect_ai import from_inspect_ai_log
8
+ from .lm_eval import from_lm_eval_results
9
+
10
+ __all__ = ["from_lm_eval_results", "from_inspect_ai_log"]
@@ -0,0 +1,36 @@
1
+ """Adapter for UK AISI inspect_ai eval-log JSON (file-based, no framework import)."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from ..evalclaim import build_eval_claim
9
+
10
+
11
def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, timestamp: str,
                        model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
    """Read an inspect_ai eval-log JSON file and build an eval claim for `metric`.

    Expected shape:
        {"eval": {"task": ..., "model": ..., "dataset": {"name": ...}},
         "results": {"total_samples": n,
                     "scores": [{"metrics": {metric: {"value": <number>}}}]}}

    The first entry of ``results.scores`` whose ``metrics`` contains `metric` supplies the
    score. A string value is passed through unchanged; any other value is converted with
    ``repr`` so that a decimal STRING reaches `build_eval_claim`.

    Args:
        path: Location of the exported eval-log JSON (read as UTF-8).
        metric: Name of the metric to look up in each score's ``metrics`` mapping.
        comparator, threshold, timestamp: Forwarded unchanged to `build_eval_claim`.
        model_salt, dataset_salt: Optional commitment salts, forwarded unchanged.

    Returns:
        Whatever `build_eval_claim` returns, i.e. ``(claim, salts)``.

    Raises:
        ValueError: If no score entry carries a non-null value for `metric`.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    ev = data.get("eval", {})
    results = data.get("results", {})

    value = None
    for entry in results.get("scores", []):
        metrics = entry.get("metrics", {})
        if metric in metrics:
            value = metrics[metric].get("value")
            break
    if value is None:
        raise ValueError(f"metric {metric!r} not found in inspect_ai scores")

    # repr() of a str would add quote characters ("'0.85'") and break the decimal parse
    # downstream, so strings pass through as-is (same rule as the lm_eval adapter).
    score = value if isinstance(value, str) else repr(value)
    n = int(results.get("total_samples") or 0)

    # issuer is passed as an empty string here — presumably filled in when the receipt is
    # emitted; confirm against emit_eval_receipt.
    return build_eval_claim(
        suite=str(ev.get("task", "inspect_ai")),
        suite_version=str(ev.get("task_version", "1")),
        metric=metric, comparator=comparator, threshold=threshold, score=score, n=n,
        model_id=str(ev.get("model", "unknown")),
        dataset_id=str(ev.get("dataset", {}).get("name", ev.get("task", "unknown"))),
        issuer="", timestamp=timestamp, model_salt=model_salt, dataset_salt=dataset_salt)
@@ -0,0 +1,32 @@
1
+ """Adapter for EleutherAI lm-evaluation-harness results.json (file-based, no framework import)."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from ..evalclaim import build_eval_claim
9
+
10
+
11
def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
                         timestamp: str, model_salt: Optional[bytes] = None,
                         dataset_salt: Optional[bytes] = None):
    """Build an eval claim for `task`/`metric` from an lm-evaluation-harness results.json.

    Expected shape:
        {"results": {task: {metric: <number>, ...}, ...},
         "n-samples": {task: {"effective": n}},
         "config" / "model_name": ...}

    The score is handed to `build_eval_claim` as a string: a string value is used as-is,
    anything else is converted with ``repr``. The sample count is ``effective`` if truthy,
    else ``original``, else 0. The model identifier is ``model_name``, else
    ``config.model``, else ``"unknown"``. The task name doubles as the dataset identifier.

    NOTE(review): ``suite_version`` is taken from ``config.model_source`` (default
    ``"lm-eval"``) — confirm this is the intended source for a suite version.

    Returns:
        Whatever `build_eval_claim` returns, i.e. ``(claim, salts)``.

    Raises:
        ValueError: If `task` is absent from ``results`` or `metric` is absent from the task.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))

    task_results = data.get("results", {}).get(task)
    if task_results is None or metric not in task_results:
        raise ValueError(f"task/metric not found in results: {task}/{metric}")

    raw = task_results[metric]
    score = raw if isinstance(raw, str) else repr(raw)

    samples = data.get("n-samples", {}).get(task, {})
    n = int(samples.get("effective") or samples.get("original") or 0)

    config = data.get("config", {})
    model_id = str(data.get("model_name") or config.get("model") or "unknown")

    return build_eval_claim(
        suite=task,
        suite_version=str(config.get("model_source", "lm-eval")),
        metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
        model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
        model_salt=model_salt, dataset_salt=dataset_salt)
@@ -1,4 +1,4 @@
1
- """Command line interface: ``proofbundle verify`` and ``proofbundle emit``."""
1
+ """Command line interface: ``proofbundle`` verify / emit / emit-eval / show-eval."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
@@ -12,6 +12,58 @@ from .emit import emit_bundle, generate_signer, load_signer, save_signer
12
12
  from .errors import ProofBundleError
13
13
 
14
14
 
15
def _resolve_signer(args):
    """Pick the signing key for emit / emit-eval from ``--key`` / ``--new-key``.

    Returns the signer, or None after printing an ``ERROR:`` line to stderr when both
    options or neither option were given. With ``--new-key`` a fresh signer is generated
    and saved to that path before it is returned.
    """
    new_key_path = getattr(args, "new_key", None)
    key_path = getattr(args, "key", None)

    if new_key_path and key_path:
        print("ERROR: use either --key or --new-key, not both", file=sys.stderr)
        return None

    if new_key_path:
        fresh = generate_signer()
        save_signer(fresh, new_key_path)
        print(f"wrote new signing key to {new_key_path} (keep this secret)", file=sys.stderr)
        return fresh

    if key_path:
        return load_signer(key_path)

    print("ERROR: provide --key <file> or --new-key <file>", file=sys.stderr)
    return None
29
+
30
+
31
def _cmd_emit_eval(args: argparse.Namespace) -> int:
    """Handle ``proofbundle emit-eval``: read a claim JSON, sign it, write the receipt.

    Returns 0 on success and 2 on any usage, claim, or I/O error. Every failure is reported
    as a single ``ERROR:`` line on stderr.
    """
    from .evalclaim import EvalClaimError, emit_eval_receipt, load_claim_text  # noqa: PLC0415

    signer = _resolve_signer(args)
    if signer is None:
        return 2

    try:
        with open(args.claim, encoding="utf-8") as handle:
            claim = load_claim_text(handle.read())
        bundle = emit_eval_receipt(claim, signer)
    except (EvalClaimError, OSError, ValueError) as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2

    # An unwritable --out path is reported the same way as an unreadable --claim path
    # instead of escaping as an uncaught traceback.
    try:
        with open(args.out, "w", encoding="utf-8") as handle:
            json.dump(bundle, handle, indent=2)
            handle.write("\n")
    except OSError as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2

    print(f"wrote eval receipt {args.out}")
    return 0
48
+
49
+
50
def _cmd_show_eval(args: argparse.Namespace) -> int:
    """Handle ``proofbundle show-eval``: decode a receipt and print its claim fields.

    Returns 0 after printing the claim followed by ``=> OK``, or 1 with a ``=> FAILED``
    line on stderr when `decode_eval_claim` returns None.
    """
    from .evalclaim import decode_eval_claim  # noqa: PLC0415

    decoded = decode_eval_claim(args.receipt)
    if decoded is not None:
        # Fields are printed one by one, in this fixed order.
        print(f"suite {decoded['suite']} ({decoded['suite_version']})")
        print(f"metric {decoded['metric']} {decoded['comparator']} {decoded['threshold']}")
        print(f"passed {decoded['passed']} (n={decoded['n']})")
        print(f"model commit {decoded['model_id_commit']}")
        print(f"dataset commit {decoded['dataset_id_commit']}")
        print(f"issuer {decoded['issuer']}")
        print(f"timestamp {decoded['timestamp']}")
        print("=> OK")
        return 0

    print("=> FAILED: not a valid, issuer-bound eval receipt", file=sys.stderr)
    return 1
65
+
66
+
15
67
  def _cmd_verify(args: argparse.Namespace) -> int:
16
68
  try:
17
69
  result = verify_bundle(args.bundle)
@@ -32,17 +84,8 @@ def _cmd_verify(args: argparse.Namespace) -> int:
32
84
 
33
85
 
34
86
  def _cmd_emit(args: argparse.Namespace) -> int:
35
- if args.new_key and args.key:
36
- print("ERROR: use either --key or --new-key, not both", file=sys.stderr)
37
- return 2
38
- if args.new_key:
39
- signer = generate_signer()
40
- save_signer(signer, args.new_key)
41
- print(f"wrote new signing key to {args.new_key} (keep this secret)", file=sys.stderr)
42
- elif args.key:
43
- signer = load_signer(args.key)
44
- else:
45
- print("ERROR: provide --key <file> or --new-key <file>", file=sys.stderr)
87
+ signer = _resolve_signer(args)
88
+ if signer is None:
46
89
  return 2
47
90
 
48
91
  with open(args.payload_file, "rb") as handle:
@@ -76,6 +119,17 @@ def build_parser() -> argparse.ArgumentParser:
76
119
  emit.add_argument("--new-key", help="generate a signing key and save it to this file")
77
120
  emit.set_defaults(func=_cmd_emit)
78
121
 
122
+ emit_eval = sub.add_parser("emit-eval", help="emit a signed eval receipt from a claim JSON")
123
+ emit_eval.add_argument("--claim", required=True, help="path to the eval-claim JSON")
124
+ emit_eval.add_argument("--out", required=True, help="path to write the receipt bundle JSON")
125
+ emit_eval.add_argument("--key", help="use an existing 32 byte raw Ed25519 seed file")
126
+ emit_eval.add_argument("--new-key", help="generate a signing key and save it to this file")
127
+ emit_eval.set_defaults(func=_cmd_emit_eval)
128
+
129
+ show_eval = sub.add_parser("show-eval", help="verify an eval receipt and print the claim")
130
+ show_eval.add_argument("receipt", help="path to the eval receipt bundle JSON")
131
+ show_eval.set_defaults(func=_cmd_show_eval)
132
+
79
133
  return parser
80
134
 
81
135
 
@@ -5,9 +5,8 @@ Merkle tree, producing a bundle that ``verify_bundle`` accepts. This is the
5
5
  counterpart to the verifier: create the evidence here, check it anywhere with
6
6
  ``proofbundle verify``, fully offline.
7
7
 
8
- The v0.3 eval-receipt emitter (wrap one evaluation run into a signed,
9
- selectively disclosable receipt) is still a roadmap stub at the bottom of this
10
- module.
8
+ The eval-receipt emitter that builds on this (``emit_eval_receipt``) lives in
9
+ :mod:`proofbundle.evalclaim` since v0.4.
11
10
  """
12
11
 
13
12
  from __future__ import annotations
@@ -110,29 +109,3 @@ def emit_bundle(
110
109
  if sd_jwt_vc is not None:
111
110
  bundle["sd_jwt_vc"] = sd_jwt_vc
112
111
  return bundle
113
-
114
-
115
- # --------------------------------------------------------------------------
116
- # Roadmap stub, v0.3
117
- # --------------------------------------------------------------------------
118
-
119
-
120
- class NotYetImplemented(NotImplementedError):
121
- """Raised by roadmap functions that are planned but not implemented yet."""
122
-
123
-
124
- def emit_eval_receipt(*args, **kwargs): # pragma: no cover - roadmap stub
125
- """v0.3, the core differentiator.
126
-
127
- Wrap one evaluation framework run (Inspect AI, lm-evaluation-harness) into a
128
- signed receipt whose payload is a minimal, RFC 8785 canonicalized claim such
129
- as ``{"suite": "...", "threshold": 0.8, "passed": true}``, optionally wrapped
130
- as an SD-JWT VC so a holder can disclose "passed above threshold" without
131
- revealing the model, weights or dataset, carrying a cluster-bootstrap
132
- confidence interval, a multiple-testing correction and a preregistration
133
- hash. Built on top of :func:`emit_bundle`.
134
- """
135
- raise NotYetImplemented(
136
- "emit_eval_receipt lands in v0.3. Use emit_bundle for a generic signed, "
137
- "anchored bundle today."
138
- )
@@ -0,0 +1,212 @@
1
+ """Eval receipts (v0.4): sign + Merkle-anchor a canonical eval CLAIM.
2
+
3
+ A receipt proves exactly one thing — *suite S scored `comparator` threshold T,
4
+ passed=…* — carrying only SALTED commitments to the model and dataset identifiers,
5
+ never the weights, the data, or the plaintext names. A third party verifies the
6
+ threshold was met, offline, from one file, without ever seeing the model or dataset.
7
+
8
+ Honest scope (see EVAL_CLAIM.md): the receipt proves `passed` against `threshold`
9
+ and hides the model/dataset via salted commitments. It does NOT prove the evaluation
10
+ itself was well designed or that the suite measures what it claims — those are human
11
+ judgements. What it removes is the need to simply *trust the number*.
12
+
13
+ Layering: the claim payload is canonicalized with RFC 8785 JCS **only on the emit
14
+ path** (a lazy dependency). The verify path (`decode_eval_claim`) never canonicalizes —
15
+ it checks the exact stored bytes that `verify_bundle` already authenticated — so the
16
+ verifier stays dependency-free (cryptography + stdlib only).
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import base64
21
+ import hashlib
22
+ import json
23
+ import os
24
+ import unicodedata
25
+ from typing import Optional, Sequence
26
+
27
+ from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey
28
+ from cryptography.hazmat.primitives.serialization import Encoding, PublicFormat
29
+
30
+ from .bundle import load_bundle, verify_bundle
31
+ from .emit import emit_bundle
32
+
33
# Value of the claim's "schema" field.
EVAL_CLAIM_SCHEMA = "proofbundle/eval-claim/v0.1"
# Value of the claim's "commit_alg" field (see salted_commit).
COMMIT_ALG = "sha256-salted-v1"
# Comparators accepted between score and threshold.
_COMPARATORS = {">=", ">", "<=", "<"}
# Largest integer magnitude allowed in a claim: the IEEE-754 safe-integer bound.
_MAX_SAFE_INT = 2 ** 53 - 1
# The exact key set of an eval claim; decode/validate reject anything else.
_REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
             "passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer",
             "timestamp"}
_OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256"}

# EvalClaimError and load_claim_text are part of the public surface too: the CLI imports
# both, and callers need the exception type to catch malformed-claim failures.
__all__ = [
    "EVAL_CLAIM_SCHEMA", "COMMIT_ALG", "EvalClaimError", "canonicalize", "load_claim_text",
    "build_eval_claim", "emit_eval_receipt", "decode_eval_claim", "salted_commit",
    "issuer_fingerprint",
]
46
+
47
+
48
class EvalClaimError(ValueError):
    """Signal that an eval claim is malformed.

    Raised, for example, for a float in the payload, a string that is not NFC-normalized,
    or an integer outside the safe range. Subclasses ValueError, so existing
    ``except ValueError`` handlers also catch it.
    """
50
+
51
+
52
def issuer_fingerprint(signer: Ed25519PrivateKey) -> str:
    """Return the `issuer` field value for `signer`.

    The format is ``ed25519:<b64>``, where ``<b64>`` is the standard (not URL-safe) base64
    encoding of the signer's raw public-key bytes.
    """
    public_key = signer.public_key()
    raw_bytes = public_key.public_bytes(Encoding.Raw, PublicFormat.Raw)
    encoded = base64.b64encode(raw_bytes).decode("ascii")
    return "ed25519:" + encoded
56
+
57
+
58
def salted_commit(identifier: str, salt: bytes) -> str:
    """Commit to `identifier` under `salt`.

    Returns ``sha256:<hex>``, the SHA-256 digest of ``salt || utf8(identifier)``. The salt
    is meant to be high-entropy and kept by the issuer outside the payload, so that the
    identifier cannot be recovered from the commitment by guessing well-known names
    (e.g. a table of model names like gpt-4o).

    NOTE(review): salt and identifier are concatenated without a length prefix or
    separator and the salt length is only lower-bounded, so the same digest can be opened
    with a different salt/identifier split — confirm whether that matters to verifiers.

    Raises:
        EvalClaimError: If `salt` is shorter than 16 bytes.
    """
    if len(salt) < 16:
        raise EvalClaimError("commitment salt must be at least 16 bytes")
    preimage = salt + identifier.encode("utf-8")
    hex_digest = hashlib.sha256(preimage).hexdigest()
    return f"sha256:{hex_digest}"
68
+
69
+
70
def _reject_non_jcs(value) -> None:
    """Recursively reject values that RFC 8785 / this profile forbids in a claim.

    Accepted: None, bool, int within +/-(2**53 - 1), NFC-normalized str, and dict / list /
    tuple containers whose contents are themselves accepted. Object keys must be
    NFC-normalized strings as well.

    Raises:
        EvalClaimError: On a float, an out-of-range int, a non-NFC string (value or key),
            a non-string object key, or any other unsupported type.
    """
    # bool is tested before int because bool is an int subclass in Python.
    if value is None or isinstance(value, bool):
        return
    if isinstance(value, float):
        raise EvalClaimError("float values are forbidden; use a decimal STRING (e.g. \"0.80\")")
    if isinstance(value, int):
        if abs(value) > _MAX_SAFE_INT:
            raise EvalClaimError(f"integer {value} exceeds the IEEE-754 safe range (2**53-1)")
        return
    if isinstance(value, str):
        if unicodedata.normalize("NFC", value) != value:
            raise EvalClaimError("string is not NFC-normalized")
        return
    if isinstance(value, dict):
        for key, item in value.items():
            # Keys are serialized as JSON strings too, so they get the same string checks.
            if not isinstance(key, str):
                raise EvalClaimError(
                    f"object key must be a string, not {type(key).__name__}")
            _reject_non_jcs(key)
            _reject_non_jcs(item)
        return
    if isinstance(value, (list, tuple)):
        for item in value:
            _reject_non_jcs(item)
        return
    raise EvalClaimError(f"unsupported value type {type(value).__name__}")
95
+
96
+
97
def canonicalize(claim: dict) -> bytes:
    """Return the RFC 8785 (JCS) canonical bytes of `claim` — EMIT PATH ONLY.

    The claim is first checked against the profile via `_reject_non_jcs`; serialization is
    then delegated to the ``rfc8785`` library, which is imported lazily so that only the
    emit path needs it. Duplicate keys cannot occur in a Python dict; when the claim comes
    from JSON text, parse it with `load_claim_text`, which rejects them.

    Raises:
        EvalClaimError: If the profile check fails or ``rfc8785`` refuses the value.
    """
    _reject_non_jcs(claim)

    import rfc8785  # noqa: PLC0415 — lazy: only the emit path pulls the JCS dependency

    try:
        encoded = rfc8785.dumps(claim)
    except (rfc8785.FloatDomainError, rfc8785.IntegerDomainError,
            rfc8785.CanonicalizationError) as exc:
        raise EvalClaimError(f"canonicalization failed: {exc}") from exc
    else:
        return encoded
111
+
112
+
113
def load_claim_text(text: str) -> dict:
    """Parse claim JSON text into a dict, rejecting duplicate keys (JCS forbids them).

    Duplicate detection applies to every object in the document, nested ones included.

    Raises:
        EvalClaimError: If any object repeats a key, or the top-level JSON value is not an
            object.
        json.JSONDecodeError: If `text` is not valid JSON (a ValueError subclass).
    """
    def _no_dupes(pairs):
        seen = {}
        for k, v in pairs:
            if k in seen:
                raise EvalClaimError(f"duplicate key {k!r} in claim JSON")
            seen[k] = v
        return seen

    claim = json.loads(text, object_pairs_hook=_no_dupes)
    # Valid JSON can also be an array or a scalar; a claim must be an object.
    if not isinstance(claim, dict):
        raise EvalClaimError(f"claim JSON must be an object, not {type(claim).__name__}")
    return claim
123
+
124
+
125
def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator: str,
                     threshold: str, score: str, n: int, model_id: str, dataset_id: str,
                     issuer: str, timestamp: str, context_binding: Optional[str] = None,
                     ci95: Optional[Sequence[str]] = None, multiple_testing: Optional[str] = None,
                     prereg_sha256: Optional[str] = None,
                     model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
    """Build a valid eval claim from raw values.

    Computes `passed` ITSELF from the comparator (never trusts the caller),
    creates salted commitments for the model and dataset identifiers, and
    returns the salts SEPARATELY so they never enter the payload. The raw
    `score` is used only to derive `passed`; it is not stored in the claim.

    threshold/score are decimal STRINGS (never floats) and must be finite:
    NaN and Infinity are rejected, because an ordering comparison against NaN
    would otherwise escape as a raw ``decimal.InvalidOperation``.

    Returns:
        (claim: dict, salts: {"model_salt": bytes, "dataset_salt": bytes})

    Raises:
        EvalClaimError: unknown comparator, non-string or non-finite
            threshold/score, or a claim value outside the JCS profile.
    """
    if comparator not in _COMPARATORS:
        raise EvalClaimError(f"comparator must be one of {sorted(_COMPARATORS)}")
    for name, val in (("threshold", threshold), ("score", score)):
        if not isinstance(val, str):
            raise EvalClaimError(f"{name} must be a decimal STRING, not {type(val).__name__}")
    from decimal import Decimal, InvalidOperation  # noqa: PLC0415
    try:
        s, t = Decimal(score), Decimal(threshold)
    except InvalidOperation as e:
        raise EvalClaimError(f"threshold/score are not valid decimals: {e}") from e
    # Decimal() accepts "NaN", "sNaN" and "Infinity"; none of them is a usable
    # score or threshold, and NaN makes the comparisons below raise.
    for name, dec in (("threshold", t), ("score", s)):
        if not dec.is_finite():
            raise EvalClaimError(f"{name} must be a finite decimal, not {dec}")
    passed = {">=": s >= t, ">": s > t, "<=": s <= t, "<": s < t}[comparator]
    # Fresh random salts unless the caller pins them (e.g. for reproducible tests).
    m_salt = model_salt if model_salt is not None else os.urandom(16)
    d_salt = dataset_salt if dataset_salt is not None else os.urandom(16)
    claim = {
        "schema": EVAL_CLAIM_SCHEMA, "suite": suite, "suite_version": suite_version,
        "metric": metric, "comparator": comparator, "threshold": threshold, "passed": passed,
        "n": n, "model_id_commit": salted_commit(model_id, m_salt),
        "dataset_id_commit": salted_commit(dataset_id, d_salt), "commit_alg": COMMIT_ALG,
        "issuer": issuer, "timestamp": timestamp,
    }
    # Optional fields are only present when supplied, so absent != null in the payload.
    if context_binding is not None:
        claim["context_binding"] = context_binding
    if ci95 is not None:
        claim["ci95"] = [str(x) for x in ci95]
    if multiple_testing is not None:
        claim["multiple_testing"] = multiple_testing
    if prereg_sha256 is not None:
        claim["prereg_sha256"] = prereg_sha256
    _reject_non_jcs(claim)
    return claim, {"model_salt": m_salt, "dataset_salt": d_salt}
168
+
169
+
170
def emit_eval_receipt(claim: dict, signer: Ed25519PrivateKey, *, prior_leaves: Sequence[bytes] = (),
                      sd_jwt: Optional[dict] = None) -> dict:
    """Emit a proofbundle/v0.1 bundle whose payload is the canonical eval claim.

    Works on a copy of `claim`: the copy's `issuer` is overwritten with the
    signer's fingerprint (binding the receipt to the key), the field set is
    checked against the required/optional lists, the claim is canonicalized,
    and the bytes are handed to emit_bundle. The returned bundle is verified
    unchanged by verify_bundle.

    Raises:
        EvalClaimError: required fields are missing, unknown fields are
            present, or canonicalization fails.
    """
    bound = dict(claim)
    bound.update(issuer=issuer_fingerprint(signer))

    present = set(bound)
    absent = _REQUIRED - present
    if absent:
        raise EvalClaimError(f"claim missing required fields: {sorted(absent)}")
    unknown = present - _REQUIRED - _OPTIONAL
    if unknown:
        raise EvalClaimError(f"claim has unknown fields: {sorted(unknown)}")

    return emit_bundle(canonicalize(bound), signer, prior_leaves=prior_leaves, sd_jwt_vc=sd_jwt)
187
+
188
+
189
def decode_eval_claim(bundle) -> Optional[dict]:
    """Verify the bundle, then check the signing key matches the claim's `issuer` field.

    Returns the parsed claim on success, None on any failure. Dependency-free (no JCS import):
    it re-reads the exact stored payload bytes that verify_bundle already authenticated.

    A bundle can verify and still not be an eval receipt (any payload can be
    signed), so a payload that is not a JSON object, has the wrong schema, or
    names a different issuer yields None rather than an exception.
    """
    result = verify_bundle(bundle)
    if not result.ok:
        return None
    if isinstance(bundle, str):
        bundle = load_bundle(bundle)  # a str is a PATH (consistent with verify_bundle)
    try:
        payload = base64.b64decode(bundle["payload_b64"])
        claim = load_claim_text(payload.decode("utf-8"))
        # Valid JSON that is not an object (e.g. a list) has no .get(); it is
        # simply not an eval claim.
        if not isinstance(claim, dict) or claim.get("schema") != EVAL_CLAIM_SCHEMA:
            return None
        # Issuer binding: the claim's issuer must be the key that signed the bundle.
        sig_pub_b64 = bundle["signature"]["public_key_b64"]
        # Decode + re-encode normalizes the base64 text before comparing.
        want = "ed25519:" + base64.b64encode(base64.b64decode(sig_pub_b64)).decode("ascii")
        if claim.get("issuer") != want:
            return None
        return claim
    except (KeyError, TypeError, ValueError, EvalClaimError):
        # TypeError covers unexpectedly shaped bundles (e.g. "signature" not a mapping).
        return None
@@ -1,6 +1,6 @@
1
1
  """Minimal SD-JWT selective disclosure verification.
2
2
 
3
- The SD-JWT *core* is now a published standard, RFC 9901 (December 2025). This
3
+ The SD-JWT *core* is now a published standard, RFC 9901 (November 2025). This
4
4
  module verifies the heart of it: that every presented Disclosure hashes to a
5
5
  digest that is actually committed in the issuer-signed JWT payload, and, if an
6
6
  issuer public key is supplied and the algorithm is EdDSA, that the issuer
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -24,6 +24,9 @@ Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: cryptography>=42
26
26
  Provides-Extra: sdjwt
27
+ Provides-Extra: eval
28
+ Requires-Dist: rfc8785>=0.1.4; extra == "eval"
29
+ Provides-Extra: adapters
27
30
  Provides-Extra: dev
28
31
  Requires-Dist: pytest>=7; extra == "dev"
29
32
  Requires-Dist: ruff>=0.5; extra == "dev"
@@ -31,6 +34,7 @@ Requires-Dist: jsonschema>=4; extra == "dev"
31
34
  Requires-Dist: mypy>=1.8; extra == "dev"
32
35
  Requires-Dist: build>=1; extra == "dev"
33
36
  Requires-Dist: hypothesis>=6; extra == "dev"
37
+ Requires-Dist: rfc8785>=0.1.4; extra == "dev"
34
38
  Dynamic: license-file
35
39
 
36
40
  <div align="center">
@@ -57,7 +61,7 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
57
61
 
58
62
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
59
63
  verify` checks one self-contained `bundle.json` with three offline cryptographic
60
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 25 tests.
64
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 50 tests.
61
65
 
62
66
  ## Contents
63
67
 
@@ -68,6 +72,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 25 tests.
68
72
  - [Quickstart](#quickstart)
69
73
  - [Interoperability](#interoperability)
70
74
  - [Bundle format](#bundle-format-proofbundlev01)
75
+ - [Eval receipts](#eval-receipts)
71
76
  - [Security notes and scope](#security-notes-and-scope-stated-honestly)
72
77
  - [Roadmap](#roadmap)
73
78
  - [Contributing](#contributing)
@@ -236,12 +241,12 @@ string uses base64url as per the spec.
236
241
 
237
242
  ## Security notes and scope, stated honestly
238
243
 
239
- This is v0.1. It does exactly what it says and no more:
244
+ The scope is deliberately narrow. It does exactly what it says and no more:
240
245
 
241
246
  - Ed25519 signatures only, for both the payload and the optional SD-JWT issuer
242
247
  signature.
243
248
  - SD-JWT: the SD-JWT core is now [RFC 9901](https://datatracker.ietf.org/doc/rfc9901/)
244
- (Dec 2025); this verifies that every presented disclosure is committed in the
249
+ (November 2025); this verifies that every presented disclosure is committed in the
245
250
  issuer-signed payload, and the issuer signature (EdDSA) if a key is supplied. It
246
251
  does **not** verify a Key Binding JWT, an X.509 or trust-list chain, status
247
252
  lists, or `vct` type metadata. **SD-JWT VC** (the credential-type profile) is
@@ -255,25 +260,37 @@ This is v0.1. It does exactly what it says and no more:
255
260
  If you find a correctness or security issue, please open an issue or see
256
261
  [SECURITY.md](SECURITY.md).
257
262
 
263
+ ## Eval receipts
264
+
265
+ Since v0.4, proofbundle turns a reproducible eval run into a signed, Merkle-anchored
266
+ **receipt** that proves *suite S `comparator` threshold T, passed* while carrying only
267
+ **salted commitments** to the model and dataset identifiers — never the weights, the
268
+ data, or the plaintext names. A third party verifies the threshold was met, offline,
269
+ from one file, without ever seeing the model or the test set.
270
+
271
+ ```bash
272
+ pip install "proofbundle[eval]" # emit side needs an RFC 8785 canonicalizer
273
+ proofbundle emit-eval --claim claim.json --out receipt.json --new-key signer.key
274
+ proofbundle verify receipt.json # a receipt is a normal bundle
275
+ proofbundle show-eval receipt.json # verify + print the claim (issuer-bound)
276
+ ```
277
+
278
+ The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
279
+ RFC 8785 JCS canonicalization, the verify path stays dependency-free. **Honest scope:**
280
+ a receipt proves `passed` against `threshold` and hides the model/dataset via salted
281
+ commitments — it does **not** prove the evaluation was well designed or that the score
282
+ itself is correct. Those are human judgements; what it removes is the need to simply
283
+ trust the number.
284
+
258
285
  ## Roadmap
259
286
 
260
287
  - **v0.1** — the offline verifier plus a real example bundle.
261
- - **v0.2 (current release)** — the emitter: `emit_bundle` signs a payload with
262
- Ed25519 and anchors it as the last leaf of an RFC 6962 Merkle tree, producing
263
- a bundle that `verify_bundle` accepts. Available as `proofbundle emit`.
264
- - **v0.3** — an eval-receipt emitter: wrap one evaluation framework run
265
- ([Inspect AI](https://github.com/UKGovernmentBEIS/inspect_ai),
266
- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness))
267
- into a signed receipt whose payload is a minimal canonical claim, for example
268
- `{"suite": "...", "threshold": 0.8, "passed": true}`, optionally wrapped as an
269
- SD-JWT VC so a holder can disclose *passed above threshold* without revealing
270
- the model, weights or dataset, and carrying a cluster-bootstrap confidence
271
- interval, a multiple-testing correction and a preregistration hash.
272
-
273
- That last step is the point: today no widely used AI project turns a
274
- reproducible evaluation result into a signed, third-party-verifiable,
275
- selectively disclosable receipt. This repository is the trustworthy verification
276
- core that makes it possible.
288
+ - **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
289
+ - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
290
+ - **v0.4 (current release)** — the eval-receipt emitter (`emit_eval_receipt` /
291
+ `proofbundle emit-eval`), salted commitments, issuer binding, file-based adapters.
292
+ - **v0.5** — selective disclosure of the exact score via SD-JWT **issuance** (the issuer
293
+ reveals identifier + salt on demand) and full SD-JWT VC conformance.
277
294
 
278
295
  ## Contributing
279
296
 
@@ -6,6 +6,7 @@ src/proofbundle/bundle.py
6
6
  src/proofbundle/cli.py
7
7
  src/proofbundle/emit.py
8
8
  src/proofbundle/errors.py
9
+ src/proofbundle/evalclaim.py
9
10
  src/proofbundle/merkle.py
10
11
  src/proofbundle/py.typed
11
12
  src/proofbundle/sdjwt.py
@@ -16,12 +17,20 @@ src/proofbundle.egg-info/dependency_links.txt
16
17
  src/proofbundle.egg-info/entry_points.txt
17
18
  src/proofbundle.egg-info/requires.txt
18
19
  src/proofbundle.egg-info/top_level.txt
20
+ src/proofbundle/adapters/__init__.py
21
+ src/proofbundle/adapters/inspect_ai.py
22
+ src/proofbundle/adapters/lm_eval.py
23
+ tests/test_adapters.py
19
24
  tests/test_bundle.py
20
25
  tests/test_cli.py
26
+ tests/test_cli_eval.py
21
27
  tests/test_emit.py
28
+ tests/test_eval_claim_schema.py
29
+ tests/test_evalclaim.py
22
30
  tests/test_merkle.py
23
31
  tests/test_merkle_property.py
24
32
  tests/test_rekor_interop.py
25
33
  tests/test_rfc6962_external_vectors.py
26
34
  tests/test_schema.py
35
+ tests/test_sdjwt_reference.py
27
36
  tests/test_signature.py
@@ -1,5 +1,7 @@
1
1
  cryptography>=42
2
2
 
3
+ [adapters]
4
+
3
5
  [dev]
4
6
  pytest>=7
5
7
  ruff>=0.5
@@ -7,5 +9,9 @@ jsonschema>=4
7
9
  mypy>=1.8
8
10
  build>=1
9
11
  hypothesis>=6
12
+ rfc8785>=0.1.4
13
+
14
+ [eval]
15
+ rfc8785>=0.1.4
10
16
 
11
17
  [sdjwt]
@@ -0,0 +1,32 @@
1
+ """Adapters map real exported eval JSON to a valid claim (file-based, no framework import)."""
2
+ import unittest
3
+ from pathlib import Path
4
+
5
+ from proofbundle.adapters import from_inspect_ai_log, from_lm_eval_results
6
+
7
+ FX = Path(__file__).resolve().parent / "fixtures"
8
+ TS = "2026-07-01T12:00:00Z"
9
+
10
+
11
class TestAdapters(unittest.TestCase):
    """File-based adapters turn exported eval JSON fixtures into eval claims."""

    # Pinned salts so the commitments are reproducible from run to run.
    _SALTS = {"model_salt": b"0" * 16, "dataset_salt": b"1" * 16}

    def test_lm_eval(self):
        claim, _salts = from_lm_eval_results(
            FX / "lm_eval_results.json", "hellaswag", "acc",
            comparator=">=", threshold="0.70", timestamp=TS, **self._SALTS)
        self.assertEqual(claim["suite"], "hellaswag")
        self.assertEqual(claim["threshold"], "0.70")
        self.assertTrue(claim["passed"])  # 0.7534 >= 0.70
        self.assertNotIn("acme/model-x", str(claim))  # id only as salted commitment
        self.assertEqual(claim["n"], 10042)

    def test_inspect_ai(self):
        claim, _salts = from_inspect_ai_log(
            FX / "inspect_ai_log.json", "accuracy",
            comparator=">=", threshold="0.80", timestamp=TS, **self._SALTS)
        self.assertEqual(claim["suite"], "safety_refusal")
        self.assertTrue(claim["passed"])  # 0.92 >= 0.80
        self.assertEqual(claim["n"], 500)
29
+
30
+
31
+ if __name__ == "__main__":
32
+ unittest.main()
@@ -0,0 +1,39 @@
1
+ """CLI emit-eval + show-eval end-to-end (round-trip through the process boundary)."""
2
+ import json
3
+ import subprocess
4
+ import sys
5
+ import unittest
6
+ from pathlib import Path
7
+
8
+ REPO = Path(__file__).resolve().parents[1]
9
+
10
+
11
def _run(*args, **kw):
    """Run ``python -m proofbundle.cli *args`` from the repo root.

    The child inherits the parent's environment, with PYTHONPATH pointed at the
    repo's ``src`` directory; entries in ``kw["env"]`` are overlaid last.
    Passing a bare ``{"PYTHONPATH": ...}`` would drop variables the interpreter
    may need to start (e.g. SYSTEMROOT on Windows, PATH elsewhere).

    Returns:
        The ``subprocess.CompletedProcess`` with captured text stdout/stderr.
    """
    import os  # noqa: PLC0415 — local: this module does not import os at top level

    child_env = {**os.environ, "PYTHONPATH": str(REPO / "src"), **kw.get("env", {})}
    return subprocess.run([sys.executable, "-m", "proofbundle.cli", *args],
                          capture_output=True, text=True, cwd=REPO, env=child_env)
15
+
16
+
17
class TestCliEval(unittest.TestCase):
    """End-to-end check of `emit-eval`, `verify` and `show-eval` via the CLI."""

    def test_emit_eval_then_verify_and_show(self):
        import tempfile

        claim_doc = {
            "schema": "proofbundle/eval-claim/v0.1", "suite": "s", "suite_version": "v1",
            "metric": "acc", "comparator": ">=", "threshold": "0.80", "passed": True, "n": 100,
            "model_id_commit": "sha256:x", "dataset_id_commit": "sha256:y",
            "commit_alg": "sha256-salted-v1", "issuer": "ed25519:z",
            "timestamp": "2026-07-01T12:00:00Z",
        }
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            claim_path = workdir / "claim.json"
            claim_path.write_text(json.dumps(claim_doc), encoding="utf-8")
            receipt = str(workdir / "receipt.json")
            key_path = str(workdir / "k.key")

            emitted = _run("emit-eval", "--claim", str(claim_path), "--out", receipt,
                           "--new-key", key_path)
            self.assertEqual(emitted.returncode, 0)

            verified = _run("verify", receipt)
            self.assertEqual(verified.returncode, 0)

            shown = _run("show-eval", receipt)
            self.assertEqual(shown.returncode, 0)
            self.assertIn("passed", shown.stdout)
36
+
37
+
38
+ if __name__ == "__main__":
39
+ unittest.main()
@@ -0,0 +1,36 @@
1
+ """An emitted eval claim validates against schemas/eval_claim_v0_1.schema.json."""
2
+ import json
3
+ import unittest
4
+ from pathlib import Path
5
+
6
+ try:
7
+ import jsonschema
8
+ except ImportError: # pragma: no cover
9
+ jsonschema = None
10
+
11
+ from proofbundle.emit import generate_signer
12
+ from proofbundle.evalclaim import build_eval_claim, issuer_fingerprint
13
+
14
+ ROOT = Path(__file__).resolve().parents[1]
15
+ SCHEMA = ROOT / "schemas" / "eval_claim_v0_1.schema.json"
16
+
17
+
18
@unittest.skipIf(jsonschema is None, "jsonschema not installed (pip install -e .[dev])")
class TestEvalClaimSchema(unittest.TestCase):
    """Claims built by build_eval_claim are checked against the JSON Schema file."""

    def _schema(self):
        """Load the eval-claim schema from disk."""
        return json.loads(SCHEMA.read_text(encoding="utf-8"))

    def _valid_claim(self):
        """Build a claim with fixed salts; test_built_claim_matches_schema checks it validates."""
        signer = generate_signer()
        claim, _ = build_eval_claim(
            suite="s", suite_version="v1", metric="acc", comparator=">=", threshold="0.80",
            score="0.92", n=500, model_id="m", dataset_id="d",
            issuer=issuer_fingerprint(signer), timestamp="2026-07-01T12:00:00Z",
            model_salt=b"0" * 16, dataset_salt=b"1" * 16)
        return claim

    def test_schema_valid(self):
        jsonschema.Draft202012Validator.check_schema(self._schema())

    def test_built_claim_matches_schema(self):
        jsonschema.validate(instance=self._valid_claim(), schema=self._schema())

    def test_schema_rejects_float_threshold(self):
        # Start from a claim that validates and change ONLY the threshold type,
        # so the failure can be attributed to the float and not to unrelated
        # missing fields.
        bad = self._valid_claim()
        bad["threshold"] = 0.80
        with self.assertRaises(jsonschema.ValidationError):
            jsonschema.validate(instance=bad, schema=self._schema())
@@ -0,0 +1,107 @@
1
+ """Eval-receipt (v0.4) tests — No-Fake, one red-test per new invariant."""
2
+ import base64
3
+ import json
4
+ import unittest
5
+
6
+ from proofbundle import verify_bundle
7
+ from proofbundle.emit import generate_signer
8
+ from proofbundle.evalclaim import (
9
+ EvalClaimError,
10
+ build_eval_claim,
11
+ canonicalize,
12
+ decode_eval_claim,
13
+ emit_eval_receipt,
14
+ issuer_fingerprint,
15
+ salted_commit,
16
+ )
17
+
18
+ TS = "2026-07-01T12:00:00Z"
19
+
20
+
21
def _claim(signer, score="0.92", threshold="0.80", comparator=">="):
    """Build a fixture eval claim bound to `signer`, with fixed salts.

    Returns the (claim, salts) pair produced by build_eval_claim.
    """
    fields = {
        "suite": "safety-refusal",
        "suite_version": "v1",
        "metric": "refusal_rate",
        "comparator": comparator,
        "threshold": threshold,
        "score": score,
        "n": 500,
        "model_id": "acme/model-x",
        "dataset_id": "acme/dataset-y",
        "issuer": issuer_fingerprint(signer),
        "timestamp": TS,
        "model_salt": b"0" * 16,
        "dataset_salt": b"1" * 16,
    }
    claim, salts = build_eval_claim(**fields)
    return claim, salts
29
+
30
+
31
class TestEvalClaim(unittest.TestCase):
    """One red test per eval-receipt invariant: round trip, JCS profile, binding, tamper."""

    def test_round_trip(self):
        signer = generate_signer()
        claim, _ = _claim(signer)
        bundle = emit_eval_receipt(claim, signer)
        self.assertTrue(verify_bundle(bundle).ok)
        decoded = decode_eval_claim(bundle)
        self.assertIsNotNone(decoded)
        self.assertEqual(decoded["suite"], "safety-refusal")
        self.assertTrue(decoded["passed"])

    def test_determinism_emoji_and_nfc(self):
        # A key beyond the BMP + NFC content must canonicalize identically twice.
        # The NFD form is spelled with an explicit combining accent so an editor
        # or tool that normalizes source text cannot silently turn it into NFC.
        c = {"schema": "x", "\U0001F600z": "cafe\u0301"}  # NFD 'é'
        with self.assertRaises(EvalClaimError):
            canonicalize(c)  # non-NFC string rejected
        c2 = {"b": "1", "\U0001F600": "ok", "a": "2"}
        self.assertEqual(canonicalize(c2), canonicalize(dict(reversed(list(c2.items())))))

    def test_duplicate_keys_rejected(self):
        from proofbundle.evalclaim import load_claim_text
        with self.assertRaises(EvalClaimError):
            load_claim_text('{"a": 1, "a": 2}')

    def test_float_guard_red(self):
        with self.assertRaises(EvalClaimError):
            canonicalize({"schema": "x", "threshold": 0.80})  # a Python float is forbidden

    def test_passed_integrity_at_boundary(self):
        signer = generate_signer()
        eq, _ = _claim(signer, score="0.80", threshold="0.80", comparator=">=")
        self.assertTrue(eq["passed"])
        gt, _ = _claim(signer, score="0.80", threshold="0.80", comparator=">")
        self.assertFalse(gt["passed"])
        lt, _ = _claim(signer, score="0.79", threshold="0.80", comparator="<")
        self.assertTrue(lt["passed"])

    def test_issuer_binding_red(self):
        import copy

        from proofbundle.emit import emit_bundle  # noqa: PLC0415

        signer = generate_signer()
        claim, _ = _claim(signer)
        bundle = emit_eval_receipt(claim, signer)
        other = issuer_fingerprint(generate_signer())

        # (1) Swap the issuer inside the stored payload WITHOUT re-signing:
        # the signature no longer matches -> verify_bundle fails -> decode None.
        b2 = copy.deepcopy(bundle)
        payload = json.loads(base64.b64decode(b2["payload_b64"]).decode("utf-8"))
        payload["issuer"] = other
        b2["payload_b64"] = base64.b64encode(canonicalize(payload)).decode("ascii")
        self.assertIsNone(decode_eval_claim(b2))

        # (2) The issuer check itself: sign that same payload with emit_bundle
        # (emit_eval_receipt would overwrite the issuer). The bundle verifies,
        # but the claim names a key that did not sign it -> decode must refuse.
        forged = emit_bundle(canonicalize(payload), signer, prior_leaves=(), sd_jwt_vc=None)
        self.assertTrue(verify_bundle(forged).ok)
        self.assertIsNone(decode_eval_claim(forged))

    def test_commitment_hides_identifier(self):
        c1 = salted_commit("gpt-4o", b"A" * 16)
        c1b = salted_commit("gpt-4o", b"A" * 16)
        c2 = salted_commit("gpt-4o", b"B" * 16)
        self.assertEqual(c1, c1b)  # same id + salt -> same commit
        self.assertNotEqual(c1, c2)  # different salt -> different commit
        signer = generate_signer()
        claim, _ = _claim(signer)
        payload = json.dumps(claim)
        self.assertNotIn("acme/model-x", payload)  # plaintext id never in the payload
        with self.assertRaises(EvalClaimError):
            salted_commit("x", b"short")  # salt must be >= 16 bytes

    def test_tamper_red(self):
        signer = generate_signer()
        claim, _ = _claim(signer)
        bundle = emit_eval_receipt(claim, signer)
        bundle["payload_b64"] = base64.b64encode(b'{"tampered":true}').decode("ascii")
        self.assertFalse(verify_bundle(bundle).ok)
        self.assertIsNone(decode_eval_claim(bundle))
104
+
105
+
106
+ if __name__ == "__main__":
107
+ unittest.main()
@@ -0,0 +1,39 @@
1
+ """proofbundle verifies an SD-JWT produced by the EXTERNAL reference library.
2
+
3
+ The fixture tests/fixtures/sdjwt_reference_eddsa.json was generated by
4
+ openwallet-foundation-labs/sd-jwt-python (the reference implementation that
5
+ produces the IETF/RFC 9901 examples) with an Ed25519 issuer key and two
6
+ selectively-disclosable claims. proofbundle must verify both the disclosure-digest
7
+ commitments and the EdDSA issuer signature — i.e. it interops with the reference
8
+ tool, not just with its own emitter. No network / no sd-jwt dependency at test
9
+ time; the SD-JWT is committed.
10
+ """
11
+ import json
12
+ import unittest
13
+ from base64 import b64decode
14
+ from pathlib import Path
15
+
16
+ from proofbundle.sdjwt import verify_sd_jwt
17
+
18
+ FIXTURE = Path(__file__).resolve().parent / "fixtures" / "sdjwt_reference_eddsa.json"
19
+
20
+
21
@unittest.skipIf(not FIXTURE.exists(), "sd-jwt reference fixture not present")
class TestSdJwtReference(unittest.TestCase):
    """verify_sd_jwt is run against the committed reference SD-JWT fixture."""

    def setUp(self):
        raw = FIXTURE.read_text(encoding="utf-8")
        self.f = json.loads(raw)

    def test_source_documented(self):
        source = self.f["source"]
        self.assertIn("sd-jwt-python", source)

    def test_proofbundle_verifies_reference_sd_jwt(self):
        issuer_key = b64decode(self.f["issuer_public_key_b64"])
        outcome = verify_sd_jwt(self.f["compact"], issuer_key)
        self.assertTrue(outcome["structure_ok"], outcome)
        self.assertTrue(outcome["sig_ok"], outcome)
        self.assertIn("2 disclosure", outcome["detail"])

    def test_wrong_issuer_key_is_rejected(self):
        # a different key must fail the issuer-signature check (no false accept).
        import os

        random_key = os.urandom(32)
        outcome = verify_sd_jwt(self.f["compact"], random_key)
        self.assertFalse(outcome.get("sig_ok"), outcome)
File without changes
File without changes