proofbundle 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {proofbundle-0.3.0/src/proofbundle.egg-info → proofbundle-0.4.0}/PKG-INFO +34 -17
  2. {proofbundle-0.3.0 → proofbundle-0.4.0}/README.md +29 -16
  3. {proofbundle-0.3.0 → proofbundle-0.4.0}/pyproject.toml +8 -2
  4. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle/__init__.py +1 -1
  5. proofbundle-0.4.0/src/proofbundle/adapters/__init__.py +10 -0
  6. proofbundle-0.4.0/src/proofbundle/adapters/inspect_ai.py +36 -0
  7. proofbundle-0.4.0/src/proofbundle/adapters/lm_eval.py +32 -0
  8. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle/cli.py +66 -12
  9. proofbundle-0.4.0/src/proofbundle/evalclaim.py +212 -0
  10. {proofbundle-0.3.0 → proofbundle-0.4.0/src/proofbundle.egg-info}/PKG-INFO +34 -17
  11. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle.egg-info/SOURCES.txt +9 -0
  12. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle.egg-info/requires.txt +6 -0
  13. proofbundle-0.4.0/tests/test_adapters.py +32 -0
  14. proofbundle-0.4.0/tests/test_cli_eval.py +39 -0
  15. proofbundle-0.4.0/tests/test_eval_claim_schema.py +36 -0
  16. proofbundle-0.4.0/tests/test_evalclaim.py +107 -0
  17. proofbundle-0.4.0/tests/test_sdjwt_reference.py +39 -0
  18. {proofbundle-0.3.0 → proofbundle-0.4.0}/LICENSE +0 -0
  19. {proofbundle-0.3.0 → proofbundle-0.4.0}/setup.cfg +0 -0
  20. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle/bundle.py +0 -0
  21. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle/emit.py +0 -0
  22. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle/errors.py +0 -0
  23. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle/merkle.py +0 -0
  24. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle/py.typed +0 -0
  25. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle/sdjwt.py +0 -0
  26. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle/signature.py +0 -0
  27. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
  28. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
  29. {proofbundle-0.3.0 → proofbundle-0.4.0}/src/proofbundle.egg-info/top_level.txt +0 -0
  30. {proofbundle-0.3.0 → proofbundle-0.4.0}/tests/test_bundle.py +0 -0
  31. {proofbundle-0.3.0 → proofbundle-0.4.0}/tests/test_cli.py +0 -0
  32. {proofbundle-0.3.0 → proofbundle-0.4.0}/tests/test_emit.py +0 -0
  33. {proofbundle-0.3.0 → proofbundle-0.4.0}/tests/test_merkle.py +0 -0
  34. {proofbundle-0.3.0 → proofbundle-0.4.0}/tests/test_merkle_property.py +0 -0
  35. {proofbundle-0.3.0 → proofbundle-0.4.0}/tests/test_rekor_interop.py +0 -0
  36. {proofbundle-0.3.0 → proofbundle-0.4.0}/tests/test_rfc6962_external_vectors.py +0 -0
  37. {proofbundle-0.3.0 → proofbundle-0.4.0}/tests/test_schema.py +0 -0
  38. {proofbundle-0.3.0 → proofbundle-0.4.0}/tests/test_signature.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -24,6 +24,9 @@ Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: cryptography>=42
26
26
  Provides-Extra: sdjwt
27
+ Provides-Extra: eval
28
+ Requires-Dist: rfc8785>=0.1.4; extra == "eval"
29
+ Provides-Extra: adapters
27
30
  Provides-Extra: dev
28
31
  Requires-Dist: pytest>=7; extra == "dev"
29
32
  Requires-Dist: ruff>=0.5; extra == "dev"
@@ -31,6 +34,7 @@ Requires-Dist: jsonschema>=4; extra == "dev"
31
34
  Requires-Dist: mypy>=1.8; extra == "dev"
32
35
  Requires-Dist: build>=1; extra == "dev"
33
36
  Requires-Dist: hypothesis>=6; extra == "dev"
37
+ Requires-Dist: rfc8785>=0.1.4; extra == "dev"
34
38
  Dynamic: license-file
35
39
 
36
40
  <div align="center">
@@ -68,6 +72,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 25 tests.
68
72
  - [Quickstart](#quickstart)
69
73
  - [Interoperability](#interoperability)
70
74
  - [Bundle format](#bundle-format-proofbundlev01)
75
+ - [Eval receipts](#eval-receipts)
71
76
  - [Security notes and scope](#security-notes-and-scope-stated-honestly)
72
77
  - [Roadmap](#roadmap)
73
78
  - [Contributing](#contributing)
@@ -255,25 +260,37 @@ This is v0.1. It does exactly what it says and no more:
255
260
  If you find a correctness or security issue, please open an issue or see
256
261
  [SECURITY.md](SECURITY.md).
257
262
 
263
+ ## Eval receipts
264
+
265
+ Since v0.4, proofbundle turns a reproducible eval run into a signed, Merkle-anchored
266
+ **receipt** that proves *suite S `comparator` threshold T, passed* while carrying only
267
+ **salted commitments** to the model and dataset identifiers — never the weights, the
268
+ data, or the plaintext names. A third party verifies the threshold was met, offline,
269
+ from one file, without ever seeing the model or the test set.
270
+
271
+ ```bash
272
+ pip install "proofbundle[eval]" # emit side needs an RFC 8785 canonicalizer
273
+ proofbundle emit-eval --claim claim.json --out receipt.json --new-key signer.key
274
+ proofbundle verify receipt.json # a receipt is a normal bundle
275
+ proofbundle show-eval receipt.json # verify + print the claim (issuer-bound)
276
+ ```
277
+
278
+ The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
279
+ RFC 8785 JCS canonicalization, the verify path stays dependency-free. **Honest scope:**
280
+ a receipt proves `passed` against `threshold` and hides the model/dataset via salted
281
+ commitments — it does **not** prove the evaluation was well designed or that the score
282
+ itself is correct. Those are human judgements; what it removes is the need to simply
283
+ trust the number.
284
+
258
285
  ## Roadmap
259
286
 
260
287
  - **v0.1** — the offline verifier plus a real example bundle.
261
- - **v0.2 (current release)** — the emitter: `emit_bundle` signs a payload with
262
- Ed25519 and anchors it as the last leaf of an RFC 6962 Merkle tree, producing
263
- a bundle that `verify_bundle` accepts. Available as `proofbundle emit`.
264
- - **v0.3** — an eval-receipt emitter: wrap one evaluation framework run
265
- ([Inspect AI](https://github.com/UKGovernmentBEIS/inspect_ai),
266
- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness))
267
- into a signed receipt whose payload is a minimal canonical claim, for example
268
- `{"suite": "...", "threshold": 0.8, "passed": true}`, optionally wrapped as an
269
- SD-JWT VC so a holder can disclose *passed above threshold* without revealing
270
- the model, weights or dataset, and carrying a cluster-bootstrap confidence
271
- interval, a multiple-testing correction and a preregistration hash.
272
-
273
- That last step is the point: today no widely used AI project turns a
274
- reproducible evaluation result into a signed, third-party-verifiable,
275
- selectively disclosable receipt. This repository is the trustworthy verification
276
- core that makes it possible.
288
+ - **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
289
+ - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
290
+ - **v0.4 (current release)** — the eval-receipt emitter (`emit_eval_receipt` /
291
+ `proofbundle emit-eval`), salted commitments, issuer binding, file-based adapters.
292
+ - **v0.5** — selective disclosure of the exact score via SD-JWT **issuance** (the issuer
293
+ reveals identifier + salt on demand) and full SD-JWT VC conformance.
277
294
 
278
295
  ## Contributing
279
296
 
@@ -33,6 +33,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 25 tests.
33
33
  - [Quickstart](#quickstart)
34
34
  - [Interoperability](#interoperability)
35
35
  - [Bundle format](#bundle-format-proofbundlev01)
36
+ - [Eval receipts](#eval-receipts)
36
37
  - [Security notes and scope](#security-notes-and-scope-stated-honestly)
37
38
  - [Roadmap](#roadmap)
38
39
  - [Contributing](#contributing)
@@ -220,25 +221,37 @@ This is v0.1. It does exactly what it says and no more:
220
221
  If you find a correctness or security issue, please open an issue or see
221
222
  [SECURITY.md](SECURITY.md).
222
223
 
224
+ ## Eval receipts
225
+
226
+ Since v0.4, proofbundle turns a reproducible eval run into a signed, Merkle-anchored
227
+ **receipt** that proves *suite S `comparator` threshold T, passed* while carrying only
228
+ **salted commitments** to the model and dataset identifiers — never the weights, the
229
+ data, or the plaintext names. A third party verifies the threshold was met, offline,
230
+ from one file, without ever seeing the model or the test set.
231
+
232
+ ```bash
233
+ pip install "proofbundle[eval]" # emit side needs an RFC 8785 canonicalizer
234
+ proofbundle emit-eval --claim claim.json --out receipt.json --new-key signer.key
235
+ proofbundle verify receipt.json # a receipt is a normal bundle
236
+ proofbundle show-eval receipt.json # verify + print the claim (issuer-bound)
237
+ ```
238
+
239
+ The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
240
+ RFC 8785 JCS canonicalization, the verify path stays dependency-free. **Honest scope:**
241
+ a receipt proves `passed` against `threshold` and hides the model/dataset via salted
242
+ commitments — it does **not** prove the evaluation was well designed or that the score
243
+ itself is correct. Those are human judgements; what it removes is the need to simply
244
+ trust the number.
245
+
223
246
  ## Roadmap
224
247
 
225
248
  - **v0.1** — the offline verifier plus a real example bundle.
226
- - **v0.2 (current release)** — the emitter: `emit_bundle` signs a payload with
227
- Ed25519 and anchors it as the last leaf of an RFC 6962 Merkle tree, producing
228
- a bundle that `verify_bundle` accepts. Available as `proofbundle emit`.
229
- - **v0.3** — an eval-receipt emitter: wrap one evaluation framework run
230
- ([Inspect AI](https://github.com/UKGovernmentBEIS/inspect_ai),
231
- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness))
232
- into a signed receipt whose payload is a minimal canonical claim, for example
233
- `{"suite": "...", "threshold": 0.8, "passed": true}`, optionally wrapped as an
234
- SD-JWT VC so a holder can disclose *passed above threshold* without revealing
235
- the model, weights or dataset, and carrying a cluster-bootstrap confidence
236
- interval, a multiple-testing correction and a preregistration hash.
237
-
238
- That last step is the point: today no widely used AI project turns a
239
- reproducible evaluation result into a signed, third-party-verifiable,
240
- selectively disclosable receipt. This repository is the trustworthy verification
241
- core that makes it possible.
249
+ - **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
250
+ - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
251
+ - **v0.4 (current release)** — the eval-receipt emitter (`emit_eval_receipt` /
252
+ `proofbundle emit-eval`), salted commitments, issuer binding, file-based adapters.
253
+ - **v0.5** — selective disclosure of the exact score via SD-JWT **issuance** (the issuer
254
+ reveals identifier + salt on demand) and full SD-JWT VC conformance.
242
255
 
243
256
  ## Contributing
244
257
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "proofbundle"
7
- version = "0.3.0"
7
+ version = "0.4.0"
8
8
  description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -39,7 +39,13 @@ dependencies = ["cryptography>=42"]
39
39
  # `cryptography` (EdDSA + stdlib), so `pip install proofbundle[sdjwt]` keeps the trusted core
40
40
  # lean; the extra documents intent and is forward-compatible if SD-JWT ever needs a heavier lib.
41
41
  sdjwt = []
42
- dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6"]
42
+ # EMITTING eval receipts needs a real RFC 8785 JCS canonicalizer (emit path only). The VERIFY
43
+ # path (verify_bundle / decode_eval_claim) never canonicalizes — it checks stored bytes — so the
44
+ # verifier stays dependency-free. `pip install proofbundle[eval]` adds emit-side canonicalization.
45
+ eval = ["rfc8785>=0.1.4"]
46
+ # Framework adapters read exported result JSON only (no framework import) → pure stdlib today.
47
+ adapters = []
48
+ dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6", "rfc8785>=0.1.4"]
43
49
 
44
50
  [project.urls]
45
51
  Homepage = "https://b7n0de.com"
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
13
13
  from .errors import Check, ProofBundleError, VerificationResult
14
14
  from .merkle import verify_consistency, verify_inclusion
15
15
 
16
- __version__ = "0.3.0"
16
+ __version__ = "0.4.0"
17
17
 
18
18
  __all__ = [
19
19
  "__version__",
@@ -0,0 +1,10 @@
1
+ """Adapters that map an eval framework's EXPORTED result JSON to an eval claim.
2
+
3
+ Each adapter reads a result file from disk and never imports the framework, so they
4
+ add no runtime dependency. The output-format mapping is bound to a framework version;
5
+ each fixture in tests/fixtures documents its source + version.
6
+ """
7
+ from .inspect_ai import from_inspect_ai_log
8
+ from .lm_eval import from_lm_eval_results
9
+
10
+ __all__ = ["from_lm_eval_results", "from_inspect_ai_log"]
@@ -0,0 +1,36 @@
1
+ """Adapter for UK AISI inspect_ai eval-log JSON (file-based, no framework import)."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from ..evalclaim import build_eval_claim
9
+
10
+
11
def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, timestamp: str,
                        model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
    """Read an inspect_ai eval-log JSON file and build an eval claim for ``metric``.

    Expects: {"eval": {"task": ..., "model": ..., "dataset": {"name": ...}},
    "results": {"total_samples": n, "scores": [{"metrics": {metric: {"value": <number>}}}]}}.

    The first entry in ``results.scores`` whose ``metrics`` contains ``metric`` supplies the
    score. A score that is already a string is passed through unchanged; any other value is
    rendered with ``repr`` so no float ever reaches the claim. Sections that are absent or
    JSON ``null`` are treated as empty.

    Returns (claim, salts) exactly as produced by ``build_eval_claim``. The ``issuer`` is left
    empty here.

    Raises ValueError when the metric is not present (or its value is null).
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    ev = data.get("eval") or {}
    results = data.get("results") or {}
    value = None
    for entry in results.get("scores") or []:
        metrics = entry.get("metrics") or {}
        if metric in metrics:
            value = metrics[metric].get("value")
            break
    if value is None:
        raise ValueError(f"metric {metric!r} not found in inspect_ai scores")
    # repr() of a str would add quotes and yield an invalid decimal, so pass strings through
    # (same rule the lm_eval adapter applies).
    score = value if isinstance(value, str) else repr(value)
    n = int(results.get("total_samples") or 0)
    dataset = ev.get("dataset") or {}
    return build_eval_claim(
        suite=str(ev.get("task", "inspect_ai")), suite_version=str(ev.get("task_version", "1")),
        metric=metric, comparator=comparator, threshold=threshold, score=score, n=n,
        model_id=str(ev.get("model", "unknown")),
        dataset_id=str(dataset.get("name", ev.get("task", "unknown"))),
        issuer="", timestamp=timestamp, model_salt=model_salt, dataset_salt=dataset_salt)
@@ -0,0 +1,32 @@
1
+ """Adapter for EleutherAI lm-evaluation-harness results.json (file-based, no framework import)."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from ..evalclaim import build_eval_claim
9
+
10
+
11
def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
                         timestamp: str, model_salt: Optional[bytes] = None,
                         dataset_salt: Optional[bytes] = None):
    """Turn an lm-evaluation-harness results.json into an eval claim for `task`/`metric`.

    Expected shape: {"results": {task: {metric: <number>, ...}, ...},
    "n-samples": {task: {"effective": n}}, "config"/"model_name": ...}. A non-string score is
    rendered with ``repr`` so it travels as a STRING, never as a float. The sample count is the
    task's "effective" value, else its "original" value, else 0.

    Returns (claim, salts) as produced by ``build_eval_claim``; ``issuer`` is left empty.
    Raises ValueError when the task or the metric is absent from "results".
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    task_results = data.get("results", {}).get(task)
    if task_results is None or metric not in task_results:
        raise ValueError(f"task/metric not found in results: {task}/{metric}")
    raw_score = task_results[metric]
    score = raw_score if isinstance(raw_score, str) else repr(raw_score)
    sample_counts = data.get("n-samples", {}).get(task, {})
    n = int(sample_counts.get("effective") or sample_counts.get("original") or 0)
    model_id = str(data.get("model_name") or data.get("config", {}).get("model") or "unknown")
    suite_version = str(data.get("config", {}).get("model_source", "lm-eval"))
    return build_eval_claim(
        suite=task, suite_version=suite_version,
        metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
        model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
        model_salt=model_salt, dataset_salt=dataset_salt)
@@ -1,4 +1,4 @@
1
- """Command line interface: ``proofbundle verify`` and ``proofbundle emit``."""
1
+ """Command line interface: ``proofbundle`` verify / emit / emit-eval / show-eval."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
@@ -12,6 +12,58 @@ from .emit import emit_bundle, generate_signer, load_signer, save_signer
12
12
  from .errors import ProofBundleError
13
13
 
14
14
 
15
+ def _resolve_signer(args):
16
+ """Shared signer resolution for emit / emit-eval. Returns a signer or None (with an error)."""
17
+ if getattr(args, "new_key", None) and getattr(args, "key", None):
18
+ print("ERROR: use either --key or --new-key, not both", file=sys.stderr)
19
+ return None
20
+ if getattr(args, "new_key", None):
21
+ signer = generate_signer()
22
+ save_signer(signer, args.new_key)
23
+ print(f"wrote new signing key to {args.new_key} (keep this secret)", file=sys.stderr)
24
+ return signer
25
+ if getattr(args, "key", None):
26
+ return load_signer(args.key)
27
+ print("ERROR: provide --key <file> or --new-key <file>", file=sys.stderr)
28
+ return None
29
+
30
+
31
def _cmd_emit_eval(args: argparse.Namespace) -> int:
    """Handle ``proofbundle emit-eval``: read a claim JSON, sign it, write the receipt.

    The claim file is read and parsed BEFORE the signer is resolved, so an unreadable or
    malformed claim never leaves a freshly generated ``--new-key`` file behind.

    Returns 0 on success and 2 on any usage, claim, dependency, or I/O error (the error is
    printed to stderr).
    """
    from .evalclaim import EvalClaimError, emit_eval_receipt, load_claim_text  # noqa: PLC0415
    try:
        with open(args.claim, encoding="utf-8") as handle:
            claim = load_claim_text(handle.read())
    except (EvalClaimError, OSError, ValueError) as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2
    if not isinstance(claim, dict):
        # Valid JSON that is not an object (e.g. a number or a list) cannot be a claim.
        print("ERROR: claim JSON must be an object", file=sys.stderr)
        return 2

    signer = _resolve_signer(args)
    if signer is None:
        return 2

    try:
        bundle = emit_eval_receipt(claim, signer)
    except ImportError as exc:
        # The emit path lazily imports the RFC 8785 canonicalizer shipped by the "eval" extra.
        print(f'ERROR: {exc}; install the emit-side dependency with '
              f'pip install "proofbundle[eval]"', file=sys.stderr)
        return 2
    except (EvalClaimError, OSError, ValueError) as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2

    try:
        with open(args.out, "w", encoding="utf-8") as handle:
            json.dump(bundle, handle, indent=2)
            handle.write("\n")
    except OSError as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2
    print(f"wrote eval receipt {args.out}")
    return 0
48
+
49
+
50
def _cmd_show_eval(args: argparse.Namespace) -> int:
    """Handle ``proofbundle show-eval``: verify a receipt and print its claim.

    Returns 0 when the receipt decodes to an issuer-bound claim, 1 when it does not (or when
    the decoded claim lacks a field this command prints), and 2 when the receipt cannot be
    read or parsed at all.
    """
    from .evalclaim import decode_eval_claim  # noqa: PLC0415
    try:
        claim = decode_eval_claim(args.receipt)
    except (ProofBundleError, OSError, ValueError) as exc:
        # An unreadable or unparsable receipt file should be a clean error, not a traceback.
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2
    if claim is None:
        print("=> FAILED: not a valid, issuer-bound eval receipt", file=sys.stderr)
        return 1
    printed_fields = ("suite", "suite_version", "metric", "comparator", "threshold", "passed",
                      "n", "model_id_commit", "dataset_id_commit", "issuer", "timestamp")
    if not isinstance(claim, dict):
        print("=> FAILED: not a valid, issuer-bound eval receipt", file=sys.stderr)
        return 1
    missing = [name for name in printed_fields if name not in claim]
    if missing:
        # Guard the lookups below so a signed but incomplete claim cannot raise KeyError.
        print(f"=> FAILED: eval claim is missing fields: {missing}", file=sys.stderr)
        return 1
    print(f"suite {claim['suite']} ({claim['suite_version']})")
    print(f"metric {claim['metric']} {claim['comparator']} {claim['threshold']}")
    print(f"passed {claim['passed']} (n={claim['n']})")
    print(f"model commit {claim['model_id_commit']}")
    print(f"dataset commit {claim['dataset_id_commit']}")
    print(f"issuer {claim['issuer']}")
    print(f"timestamp {claim['timestamp']}")
    print("=> OK")
    return 0
65
+
66
+
15
67
  def _cmd_verify(args: argparse.Namespace) -> int:
16
68
  try:
17
69
  result = verify_bundle(args.bundle)
@@ -32,17 +84,8 @@ def _cmd_verify(args: argparse.Namespace) -> int:
32
84
 
33
85
 
34
86
  def _cmd_emit(args: argparse.Namespace) -> int:
35
- if args.new_key and args.key:
36
- print("ERROR: use either --key or --new-key, not both", file=sys.stderr)
37
- return 2
38
- if args.new_key:
39
- signer = generate_signer()
40
- save_signer(signer, args.new_key)
41
- print(f"wrote new signing key to {args.new_key} (keep this secret)", file=sys.stderr)
42
- elif args.key:
43
- signer = load_signer(args.key)
44
- else:
45
- print("ERROR: provide --key <file> or --new-key <file>", file=sys.stderr)
87
+ signer = _resolve_signer(args)
88
+ if signer is None:
46
89
  return 2
47
90
 
48
91
  with open(args.payload_file, "rb") as handle:
@@ -76,6 +119,17 @@ def build_parser() -> argparse.ArgumentParser:
76
119
  emit.add_argument("--new-key", help="generate a signing key and save it to this file")
77
120
  emit.set_defaults(func=_cmd_emit)
78
121
 
122
+ emit_eval = sub.add_parser("emit-eval", help="emit a signed eval receipt from a claim JSON")
123
+ emit_eval.add_argument("--claim", required=True, help="path to the eval-claim JSON")
124
+ emit_eval.add_argument("--out", required=True, help="path to write the receipt bundle JSON")
125
+ emit_eval.add_argument("--key", help="use an existing 32 byte raw Ed25519 seed file")
126
+ emit_eval.add_argument("--new-key", help="generate a signing key and save it to this file")
127
+ emit_eval.set_defaults(func=_cmd_emit_eval)
128
+
129
+ show_eval = sub.add_parser("show-eval", help="verify an eval receipt and print the claim")
130
+ show_eval.add_argument("receipt", help="path to the eval receipt bundle JSON")
131
+ show_eval.set_defaults(func=_cmd_show_eval)
132
+
79
133
  return parser
80
134
 
81
135
 
@@ -0,0 +1,212 @@
1
+ """Eval receipts (v0.4): sign + Merkle-anchor a canonical eval CLAIM.
2
+
3
+ A receipt proves exactly one thing — *suite S scored `comparator` threshold T,
4
+ passed=…* — carrying only SALTED commitments to the model and dataset identifiers,
5
+ never the weights, the data, or the plaintext names. A third party verifies the
6
+ threshold was met, offline, from one file, without ever seeing the model or dataset.
7
+
8
+ Honest scope (see EVAL_CLAIM.md): the receipt proves `passed` against `threshold`
9
+ and hides the model/dataset via salted commitments. It does NOT prove the evaluation
10
+ itself was well designed or that the suite measures what it claims — those are human
11
+ judgements. What it removes is the need to simply *trust the number*.
12
+
13
+ Layering: the claim payload is canonicalized with RFC 8785 JCS **only on the emit
14
+ path** (a lazy dependency). The verify path (`decode_eval_claim`) never canonicalizes —
15
+ it checks the exact stored bytes that `verify_bundle` already authenticated — so the
16
+ verifier stays dependency-free (cryptography + stdlib only).
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import base64
21
+ import hashlib
22
+ import json
23
+ import os
24
+ import unicodedata
25
+ from typing import Optional, Sequence
26
+
27
+ from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey
28
+ from cryptography.hazmat.primitives.serialization import Encoding, PublicFormat
29
+
30
+ from .bundle import load_bundle, verify_bundle
31
+ from .emit import emit_bundle
32
+
33
+ EVAL_CLAIM_SCHEMA = "proofbundle/eval-claim/v0.1"
34
+ COMMIT_ALG = "sha256-salted-v1"
35
+ _COMPARATORS = {">=", ">", "<=", "<"}
36
+ _MAX_SAFE_INT = 2 ** 53 - 1
37
+ # The exact key set of an eval claim; decode/validate reject anything else.
38
+ _REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
39
+ "passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
40
+ _OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256"}
41
+
42
+ __all__ = [
43
+ "EVAL_CLAIM_SCHEMA", "COMMIT_ALG", "canonicalize", "build_eval_claim",
44
+ "emit_eval_receipt", "decode_eval_claim", "salted_commit", "issuer_fingerprint",
45
+ ]
46
+
47
+
48
class EvalClaimError(ValueError):
    """Signals an eval claim that breaks the profile.

    Examples: a float in the payload, a string that is not NFC-normalized, an integer
    outside the safe range. Subclasses ValueError, so ``except ValueError`` also catches it.
    """
50
+
51
+
52
def issuer_fingerprint(signer: Ed25519PrivateKey) -> str:
    """Return the `issuer` field value for `signer`: ``ed25519:`` + base64 of its raw public key."""
    public_raw = signer.public_key().public_bytes(Encoding.Raw, PublicFormat.Raw)
    encoded = base64.b64encode(public_raw).decode("ascii")
    return "ed25519:" + encoded
56
+
57
+
58
def salted_commit(identifier: str, salt: bytes) -> str:
    """Commit to `identifier` under `salt`: ``sha256:<hex>`` of salt || utf8(identifier).

    The salt is kept by the issuer and is not part of the returned value, so the commitment
    alone does not reveal the identifier — a precomputed table over well-known names
    (e.g. gpt-4o) is useless without the salt.

    Raises EvalClaimError when the salt is shorter than 16 bytes.
    """
    if len(salt) < 16:
        raise EvalClaimError("commitment salt must be at least 16 bytes")
    hasher = hashlib.sha256()
    hasher.update(salt)
    hasher.update(identifier.encode("utf-8"))
    return "sha256:" + hasher.hexdigest()
68
+
69
+
70
+ def _reject_non_jcs(value) -> None:
71
+ """Recursively reject values that RFC 8785 / this profile forbids in a claim."""
72
+ if isinstance(value, bool):
73
+ return
74
+ if isinstance(value, float):
75
+ raise EvalClaimError("float values are forbidden; use a decimal STRING (e.g. \"0.80\")")
76
+ if isinstance(value, int):
77
+ if abs(value) > _MAX_SAFE_INT:
78
+ raise EvalClaimError(f"integer {value} exceeds the IEEE-754 safe range (2**53-1)")
79
+ return
80
+ if isinstance(value, str):
81
+ if unicodedata.normalize("NFC", value) != value:
82
+ raise EvalClaimError("string is not NFC-normalized")
83
+ return
84
+ if value is None:
85
+ return
86
+ if isinstance(value, dict):
87
+ for v in value.values():
88
+ _reject_non_jcs(v)
89
+ return
90
+ if isinstance(value, (list, tuple)):
91
+ for v in value:
92
+ _reject_non_jcs(v)
93
+ return
94
+ raise EvalClaimError(f"unsupported value type {type(value).__name__}")
95
+
96
+
97
def canonicalize(claim: dict) -> bytes:
    """Return the RFC 8785 JCS canonical bytes of `claim` — EMIT PATH ONLY.

    The profile check (`_reject_non_jcs`) runs first; serialization is then delegated to the
    rfc8785 library, imported lazily so only the emit path needs that dependency. Duplicate
    keys cannot occur in a Python dict; when the claim comes from JSON text, parse it with
    `load_claim_text`, which rejects them.

    Raises EvalClaimError when the profile check or the canonicalizer rejects the claim.
    """
    _reject_non_jcs(claim)
    import rfc8785  # noqa: PLC0415 — lazy: only the emit path pulls the JCS dependency
    jcs_failures = (
        rfc8785.FloatDomainError,
        rfc8785.IntegerDomainError,
        rfc8785.CanonicalizationError,
    )
    try:
        canonical = rfc8785.dumps(claim)
    except jcs_failures as err:
        raise EvalClaimError(f"canonicalization failed: {err}") from err
    return canonical
111
+
112
+
113
def load_claim_text(text: str) -> dict:
    """Parse claim JSON text into a dict, rejecting what JCS / this profile cannot represent.

    Rejected with EvalClaimError: duplicate keys in any object, the non-standard constants
    NaN / Infinity / -Infinity, and a top-level value that is not a JSON object. Text that
    is not JSON at all raises the ValueError produced by ``json.loads``.
    """
    def _no_dupes(pairs):
        seen = {}
        for k, v in pairs:
            if k in seen:
                raise EvalClaimError(f"duplicate key {k!r} in claim JSON")
            seen[k] = v
        return seen

    def _no_constants(name):
        # json.loads accepts these by default, but they are not valid JSON numbers.
        raise EvalClaimError(f"non-finite number {name!r} is not allowed in claim JSON")

    claim = json.loads(text, object_pairs_hook=_no_dupes, parse_constant=_no_constants)
    if not isinstance(claim, dict):
        # Callers use dict methods on the result, so a list or scalar must fail here.
        raise EvalClaimError(f"claim JSON must be an object, not {type(claim).__name__}")
    return claim
123
+
124
+
125
def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator: str,
                     threshold: str, score: str, n: int, model_id: str, dataset_id: str,
                     issuer: str, timestamp: str, context_binding: Optional[str] = None,
                     ci95: Optional[Sequence[str]] = None, multiple_testing: Optional[str] = None,
                     prereg_sha256: Optional[str] = None,
                     model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
    """Build a valid eval claim from raw values.

    `passed` is computed HERE from score / comparator / threshold (never supplied by the
    caller). The model and dataset identifiers enter the claim only as salted commitments;
    the salts are returned SEPARATELY and are never part of the payload. A salt that is not
    supplied is drawn from ``os.urandom(16)``. The score itself is not stored in the claim.

    threshold/score are decimal STRINGS (never floats) and must be finite.

    Returns:
        (claim: dict, salts: {"model_salt": bytes, "dataset_salt": bytes})

    Raises:
        EvalClaimError: unknown comparator, non-string or non-decimal or non-finite
            threshold/score, a salt shorter than 16 bytes, or a value the claim profile
            forbids.
    """
    if comparator not in _COMPARATORS:
        raise EvalClaimError(f"comparator must be one of {sorted(_COMPARATORS)}")
    for name, val in (("threshold", threshold), ("score", score)):
        if not isinstance(val, str):
            raise EvalClaimError(f"{name} must be a decimal STRING, not {type(val).__name__}")
    from decimal import Decimal, InvalidOperation  # noqa: PLC0415
    try:
        s, t = Decimal(score), Decimal(threshold)
    except InvalidOperation as e:
        raise EvalClaimError(f"threshold/score are not valid decimals: {e}") from e
    # Decimal parses "NaN" and "Infinity"; ordering a NaN raises InvalidOperation, and an
    # infinite score or threshold is not a meaningful eval result, so reject both up front.
    if not (s.is_finite() and t.is_finite()):
        raise EvalClaimError("threshold/score must be finite decimals (NaN/Infinity not allowed)")
    passed = {">=": s >= t, ">": s > t, "<=": s <= t, "<": s < t}[comparator]
    m_salt = model_salt if model_salt is not None else os.urandom(16)
    d_salt = dataset_salt if dataset_salt is not None else os.urandom(16)
    claim = {
        "schema": EVAL_CLAIM_SCHEMA, "suite": suite, "suite_version": suite_version,
        "metric": metric, "comparator": comparator, "threshold": threshold, "passed": passed,
        "n": n, "model_id_commit": salted_commit(model_id, m_salt),
        "dataset_id_commit": salted_commit(dataset_id, d_salt), "commit_alg": COMMIT_ALG,
        "issuer": issuer, "timestamp": timestamp,
    }
    if context_binding is not None:
        claim["context_binding"] = context_binding
    if ci95 is not None:
        claim["ci95"] = [str(x) for x in ci95]
    if multiple_testing is not None:
        claim["multiple_testing"] = multiple_testing
    if prereg_sha256 is not None:
        claim["prereg_sha256"] = prereg_sha256
    _reject_non_jcs(claim)
    return claim, {"model_salt": m_salt, "dataset_salt": d_salt}
168
+
169
+
170
def emit_eval_receipt(claim: dict, signer: Ed25519PrivateKey, *, prior_leaves: Sequence[bytes] = (),
                      sd_jwt: Optional[dict] = None) -> dict:
    """Produce a bundle whose payload is the canonical form of `claim`, signed by `signer`.

    Works on a copy of `claim`: `issuer` is overwritten with the signer's fingerprint (binding
    the receipt to the key), the key set is checked against the required and optional fields,
    and the canonical bytes are handed to `emit_bundle` together with `prior_leaves` and
    `sd_jwt`.

    Raises EvalClaimError when required fields are missing, unknown fields are present, or
    canonicalization rejects the claim.
    """
    bound_claim = dict(claim)
    bound_claim["issuer"] = issuer_fingerprint(signer)

    present = set(bound_claim)
    absent = _REQUIRED - present
    if absent:
        raise EvalClaimError(f"claim missing required fields: {sorted(absent)}")
    unknown = present - _REQUIRED - _OPTIONAL
    if unknown:
        raise EvalClaimError(f"claim has unknown fields: {sorted(unknown)}")

    canonical_payload = canonicalize(bound_claim)
    return emit_bundle(canonical_payload, signer, prior_leaves=prior_leaves, sd_jwt_vc=sd_jwt)
187
+
188
+
189
def decode_eval_claim(bundle) -> Optional[dict]:
    """Verify the bundle, then check the signing key matches the claim's `issuer` field.

    `bundle` is either an already-loaded bundle or a str, which is treated as a PATH. A path
    is loaded exactly ONCE and that same in-memory object is both verified and decoded, so
    the file cannot change between verification and decoding.

    Returns the parsed claim on success, None on any failure: the bundle cannot be loaded or
    does not verify, the payload is not a JSON object with the eval-claim schema, its key set
    is not exactly the required fields plus optional ones, or `issuer` is not the signing key.
    Dependency-free (no JCS import): it parses the stored payload bytes as-is.
    """
    if isinstance(bundle, str):
        from .errors import ProofBundleError  # noqa: PLC0415
        try:
            bundle = load_bundle(bundle)
        except (ProofBundleError, OSError, ValueError):
            return None
    result = verify_bundle(bundle)
    if not result.ok:
        return None
    try:
        payload = base64.b64decode(bundle["payload_b64"])
        claim = load_claim_text(payload.decode("utf-8"))
        if not isinstance(claim, dict):
            return None
        if claim.get("schema") != EVAL_CLAIM_SCHEMA:
            return None
        # Enforce the exact key set so callers can index the claim without KeyError.
        keys = set(claim)
        if (_REQUIRED - keys) or (keys - _REQUIRED - _OPTIONAL):
            return None
        # Issuer binding: the claim's issuer must be the key that signed the bundle.
        sig_pub_b64 = bundle["signature"]["public_key_b64"]
        want = "ed25519:" + base64.b64encode(base64.b64decode(sig_pub_b64)).decode("ascii")
        if claim.get("issuer") != want:
            return None
        return claim
    except (KeyError, TypeError, ValueError, EvalClaimError):
        return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -24,6 +24,9 @@ Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: cryptography>=42
26
26
  Provides-Extra: sdjwt
27
+ Provides-Extra: eval
28
+ Requires-Dist: rfc8785>=0.1.4; extra == "eval"
29
+ Provides-Extra: adapters
27
30
  Provides-Extra: dev
28
31
  Requires-Dist: pytest>=7; extra == "dev"
29
32
  Requires-Dist: ruff>=0.5; extra == "dev"
@@ -31,6 +34,7 @@ Requires-Dist: jsonschema>=4; extra == "dev"
31
34
  Requires-Dist: mypy>=1.8; extra == "dev"
32
35
  Requires-Dist: build>=1; extra == "dev"
33
36
  Requires-Dist: hypothesis>=6; extra == "dev"
37
+ Requires-Dist: rfc8785>=0.1.4; extra == "dev"
34
38
  Dynamic: license-file
35
39
 
36
40
  <div align="center">
@@ -68,6 +72,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 25 tests.
68
72
  - [Quickstart](#quickstart)
69
73
  - [Interoperability](#interoperability)
70
74
  - [Bundle format](#bundle-format-proofbundlev01)
75
+ - [Eval receipts](#eval-receipts)
71
76
  - [Security notes and scope](#security-notes-and-scope-stated-honestly)
72
77
  - [Roadmap](#roadmap)
73
78
  - [Contributing](#contributing)
@@ -255,25 +260,37 @@ This is v0.1. It does exactly what it says and no more:
255
260
  If you find a correctness or security issue, please open an issue or see
256
261
  [SECURITY.md](SECURITY.md).
257
262
 
263
+ ## Eval receipts
264
+
265
+ Since v0.4, proofbundle turns a reproducible eval run into a signed, Merkle-anchored
266
+ **receipt** that proves *suite S `comparator` threshold T, passed* while carrying only
267
+ **salted commitments** to the model and dataset identifiers — never the weights, the
268
+ data, or the plaintext names. A third party verifies the threshold was met, offline,
269
+ from one file, without ever seeing the model or the test set.
270
+
271
+ ```bash
272
+ pip install "proofbundle[eval]" # emit side needs an RFC 8785 canonicalizer
273
+ proofbundle emit-eval --claim claim.json --out receipt.json --new-key signer.key
274
+ proofbundle verify receipt.json # a receipt is a normal bundle
275
+ proofbundle show-eval receipt.json # verify + print the claim (issuer-bound)
276
+ ```
277
+
278
+ The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
279
+ RFC 8785 JCS canonicalization, the verify path stays dependency-free. **Honest scope:**
280
+ a receipt proves `passed` against `threshold` and hides the model/dataset via salted
281
+ commitments — it does **not** prove the evaluation was well designed or that the score
282
+ itself is correct. Those are human judgements; what it removes is the need to simply
283
+ trust the number.
284
+
258
285
  ## Roadmap
259
286
 
260
287
  - **v0.1** — the offline verifier plus a real example bundle.
261
- - **v0.2 (current release)** — the emitter: `emit_bundle` signs a payload with
262
- Ed25519 and anchors it as the last leaf of an RFC 6962 Merkle tree, producing
263
- a bundle that `verify_bundle` accepts. Available as `proofbundle emit`.
264
- - **v0.3** — an eval-receipt emitter: wrap one evaluation framework run
265
- ([Inspect AI](https://github.com/UKGovernmentBEIS/inspect_ai),
266
- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness))
267
- into a signed receipt whose payload is a minimal canonical claim, for example
268
- `{"suite": "...", "threshold": 0.8, "passed": true}`, optionally wrapped as an
269
- SD-JWT VC so a holder can disclose *passed above threshold* without revealing
270
- the model, weights or dataset, and carrying a cluster-bootstrap confidence
271
- interval, a multiple-testing correction and a preregistration hash.
272
-
273
- That last step is the point: today no widely used AI project turns a
274
- reproducible evaluation result into a signed, third-party-verifiable,
275
- selectively disclosable receipt. This repository is the trustworthy verification
276
- core that makes it possible.
288
+ - **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
289
+ - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
290
+ - **v0.4 (current release)** — the eval-receipt emitter (`emit_eval_receipt` /
291
+ `proofbundle emit-eval`), salted commitments, issuer binding, file-based adapters.
292
+ - **v0.5** — selective disclosure of the exact score via SD-JWT **issuance** (the issuer
293
+ reveals identifier + salt on demand) and full SD-JWT VC conformance.
277
294
 
278
295
  ## Contributing
279
296
 
@@ -6,6 +6,7 @@ src/proofbundle/bundle.py
6
6
  src/proofbundle/cli.py
7
7
  src/proofbundle/emit.py
8
8
  src/proofbundle/errors.py
9
+ src/proofbundle/evalclaim.py
9
10
  src/proofbundle/merkle.py
10
11
  src/proofbundle/py.typed
11
12
  src/proofbundle/sdjwt.py
@@ -16,12 +17,20 @@ src/proofbundle.egg-info/dependency_links.txt
16
17
  src/proofbundle.egg-info/entry_points.txt
17
18
  src/proofbundle.egg-info/requires.txt
18
19
  src/proofbundle.egg-info/top_level.txt
20
+ src/proofbundle/adapters/__init__.py
21
+ src/proofbundle/adapters/inspect_ai.py
22
+ src/proofbundle/adapters/lm_eval.py
23
+ tests/test_adapters.py
19
24
  tests/test_bundle.py
20
25
  tests/test_cli.py
26
+ tests/test_cli_eval.py
21
27
  tests/test_emit.py
28
+ tests/test_eval_claim_schema.py
29
+ tests/test_evalclaim.py
22
30
  tests/test_merkle.py
23
31
  tests/test_merkle_property.py
24
32
  tests/test_rekor_interop.py
25
33
  tests/test_rfc6962_external_vectors.py
26
34
  tests/test_schema.py
35
+ tests/test_sdjwt_reference.py
27
36
  tests/test_signature.py
@@ -1,5 +1,7 @@
1
1
  cryptography>=42
2
2
 
3
+ [adapters]
4
+
3
5
  [dev]
4
6
  pytest>=7
5
7
  ruff>=0.5
@@ -7,5 +9,9 @@ jsonschema>=4
7
9
  mypy>=1.8
8
10
  build>=1
9
11
  hypothesis>=6
12
+ rfc8785>=0.1.4
13
+
14
+ [eval]
15
+ rfc8785>=0.1.4
10
16
 
11
17
  [sdjwt]
@@ -0,0 +1,32 @@
1
+ """Adapters map real exported eval JSON to a valid claim (file-based, no framework import)."""
2
+ import unittest
3
+ from pathlib import Path
4
+
5
+ from proofbundle.adapters import from_inspect_ai_log, from_lm_eval_results
6
+
7
+ FX = Path(__file__).resolve().parent / "fixtures"
8
+ TS = "2026-07-01T12:00:00Z"
9
+
10
+
11
+ class TestAdapters(unittest.TestCase):
12
+ def test_lm_eval(self):
13
+ claim, salts = from_lm_eval_results(FX / "lm_eval_results.json", "hellaswag", "acc",
14
+ comparator=">=", threshold="0.70", timestamp=TS,
15
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
16
+ self.assertEqual(claim["suite"], "hellaswag")
17
+ self.assertEqual(claim["threshold"], "0.70")
18
+ self.assertTrue(claim["passed"]) # 0.7534 >= 0.70
19
+ self.assertNotIn("acme/model-x", str(claim)) # id only as salted commitment
20
+ self.assertEqual(claim["n"], 10042)
21
+
22
+ def test_inspect_ai(self):
23
+ claim, salts = from_inspect_ai_log(FX / "inspect_ai_log.json", "accuracy",
24
+ comparator=">=", threshold="0.80", timestamp=TS,
25
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
26
+ self.assertEqual(claim["suite"], "safety_refusal")
27
+ self.assertTrue(claim["passed"]) # 0.92 >= 0.80
28
+ self.assertEqual(claim["n"], 500)
29
+
30
+
31
+ if __name__ == "__main__":
32
+ unittest.main()
@@ -0,0 +1,39 @@
1
+ """CLI emit-eval + show-eval end-to-end (round-trip through the process boundary)."""
2
+ import json
3
+ import subprocess
4
+ import sys
5
+ import unittest
6
+ from pathlib import Path
7
+
8
+ REPO = Path(__file__).resolve().parents[1]
9
+
10
+
11
+ def _run(*args, **kw):
12
+ return subprocess.run([sys.executable, "-m", "proofbundle.cli", *args],
13
+ capture_output=True, text=True, cwd=REPO,
14
+ env={"PYTHONPATH": str(REPO / "src"), **kw.get("env", {})})
15
+
16
+
17
+ class TestCliEval(unittest.TestCase):
18
+ def test_emit_eval_then_verify_and_show(self):
19
+ import tempfile
20
+ import os
21
+ with tempfile.TemporaryDirectory() as d:
22
+ claim = os.path.join(d, "claim.json")
23
+ Path(claim).write_text(json.dumps({
24
+ "schema": "proofbundle/eval-claim/v0.1", "suite": "s", "suite_version": "v1",
25
+ "metric": "acc", "comparator": ">=", "threshold": "0.80", "passed": True, "n": 100,
26
+ "model_id_commit": "sha256:x", "dataset_id_commit": "sha256:y",
27
+ "commit_alg": "sha256-salted-v1", "issuer": "ed25519:z",
28
+ "timestamp": "2026-07-01T12:00:00Z"}), encoding="utf-8")
29
+ out = os.path.join(d, "receipt.json")
30
+ key = os.path.join(d, "k.key")
31
+ self.assertEqual(_run("emit-eval", "--claim", claim, "--out", out, "--new-key", key).returncode, 0)
32
+ self.assertEqual(_run("verify", out).returncode, 0)
33
+ show = _run("show-eval", out)
34
+ self.assertEqual(show.returncode, 0)
35
+ self.assertIn("passed", show.stdout)
36
+
37
+
38
+ if __name__ == "__main__":
39
+ unittest.main()
@@ -0,0 +1,36 @@
1
+ """An emitted eval claim validates against schemas/eval_claim_v0_1.schema.json."""
2
+ import json
3
+ import unittest
4
+ from pathlib import Path
5
+
6
+ try:
7
+ import jsonschema
8
+ except ImportError: # pragma: no cover
9
+ jsonschema = None
10
+
11
+ from proofbundle.emit import generate_signer
12
+ from proofbundle.evalclaim import build_eval_claim, issuer_fingerprint
13
+
14
+ ROOT = Path(__file__).resolve().parents[1]
15
+ SCHEMA = ROOT / "schemas" / "eval_claim_v0_1.schema.json"
16
+
17
+
18
+ @unittest.skipIf(jsonschema is None, "jsonschema not installed (pip install -e .[dev])")
19
+ class TestEvalClaimSchema(unittest.TestCase):
20
+ def test_schema_valid(self):
21
+ jsonschema.Draft202012Validator.check_schema(json.loads(SCHEMA.read_text(encoding="utf-8")))
22
+
23
+ def test_built_claim_matches_schema(self):
24
+ signer = generate_signer()
25
+ claim, _ = build_eval_claim(
26
+ suite="s", suite_version="v1", metric="acc", comparator=">=", threshold="0.80",
27
+ score="0.92", n=500, model_id="m", dataset_id="d",
28
+ issuer=issuer_fingerprint(signer), timestamp="2026-07-01T12:00:00Z",
29
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
30
+ jsonschema.validate(instance=claim, schema=json.loads(SCHEMA.read_text(encoding="utf-8")))
31
+
32
+ def test_schema_rejects_float_threshold(self):
33
+ schema = json.loads(SCHEMA.read_text(encoding="utf-8"))
34
+ bad = {"schema": "proofbundle/eval-claim/v0.1", "threshold": 0.80}
35
+ with self.assertRaises(jsonschema.ValidationError):
36
+ jsonschema.validate(instance=bad, schema=schema)
@@ -0,0 +1,107 @@
1
+ """Eval-receipt (v0.4) tests — No-Fake, one red-test per new invariant."""
2
+ import base64
3
+ import json
4
+ import unittest
5
+
6
+ from proofbundle import verify_bundle
7
+ from proofbundle.emit import generate_signer
8
+ from proofbundle.evalclaim import (
9
+ EvalClaimError,
10
+ build_eval_claim,
11
+ canonicalize,
12
+ decode_eval_claim,
13
+ emit_eval_receipt,
14
+ issuer_fingerprint,
15
+ salted_commit,
16
+ )
17
+
18
+ TS = "2026-07-01T12:00:00Z"
19
+
20
+
21
+ def _claim(signer, score="0.92", threshold="0.80", comparator=">="):
22
+ claim, salts = build_eval_claim(
23
+ suite="safety-refusal", suite_version="v1", metric="refusal_rate",
24
+ comparator=comparator, threshold=threshold, score=score, n=500,
25
+ model_id="acme/model-x", dataset_id="acme/dataset-y",
26
+ issuer=issuer_fingerprint(signer), timestamp=TS,
27
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
28
+ return claim, salts
29
+
30
+
31
+ class TestEvalClaim(unittest.TestCase):
32
+ def test_round_trip(self):
33
+ signer = generate_signer()
34
+ claim, _ = _claim(signer)
35
+ bundle = emit_eval_receipt(claim, signer)
36
+ self.assertTrue(verify_bundle(bundle).ok)
37
+ decoded = decode_eval_claim(bundle)
38
+ self.assertIsNotNone(decoded)
39
+ self.assertEqual(decoded["suite"], "safety-refusal")
40
+ self.assertTrue(decoded["passed"])
41
+
42
+ def test_determinism_emoji_and_nfc(self):
43
+ # A key beyond the BMP + NFC content must canonicalize identically twice.
44
+ c = {"schema": "x", "\U0001F600z": "café"} # NFD 'é'
45
+ with self.assertRaises(EvalClaimError):
46
+ canonicalize(c) # non-NFC string rejected
47
+ c2 = {"b": "1", "\U0001F600": "ok", "a": "2"}
48
+ self.assertEqual(canonicalize(c2), canonicalize(dict(reversed(list(c2.items())))))
49
+
50
+ def test_duplicate_keys_rejected(self):
51
+ from proofbundle.evalclaim import load_claim_text
52
+ with self.assertRaises(EvalClaimError):
53
+ load_claim_text('{"a": 1, "a": 2}')
54
+
55
+ def test_float_guard_red(self):
56
+ with self.assertRaises(EvalClaimError):
57
+ canonicalize({"schema": "x", "threshold": 0.80}) # a Python float is forbidden
58
+
59
+ def test_passed_integrity_at_boundary(self):
60
+ signer = generate_signer()
61
+ eq, _ = _claim(signer, score="0.80", threshold="0.80", comparator=">=")
62
+ self.assertTrue(eq["passed"])
63
+ gt, _ = _claim(signer, score="0.80", threshold="0.80", comparator=">")
64
+ self.assertFalse(gt["passed"])
65
+ lt, _ = _claim(signer, score="0.79", threshold="0.80", comparator="<")
66
+ self.assertTrue(lt["passed"])
67
+
68
+ def test_issuer_binding_red(self):
69
+ signer = generate_signer()
70
+ claim, _ = _claim(signer)
71
+ bundle = emit_eval_receipt(claim, signer)
72
+ # Tamper the issuer field to a different key's fingerprint WITHOUT re-signing the bundle.
73
+ # decode must return None for the tampered bundle (it no longer verifies).
74
+ import copy
75
+ b2 = copy.deepcopy(bundle)
76
+ other = issuer_fingerprint(generate_signer())
77
+ payload = json.loads(base64.b64decode(b2["payload_b64"]).decode("utf-8"))
78
+ payload["issuer"] = other
79
+ # Re-encode the tampered claim as the payload; the original signature is left in place:
80
+ b2["payload_b64"] = base64.b64encode(canonicalize(payload)).decode("ascii")
81
+ # signature no longer matches the new payload -> verify_bundle fails -> decode None.
82
+ self.assertIsNone(decode_eval_claim(b2))
83
+
84
+ def test_commitment_hides_identifier(self):
85
+ c1 = salted_commit("gpt-4o", b"A" * 16)
86
+ c1b = salted_commit("gpt-4o", b"A" * 16)
87
+ c2 = salted_commit("gpt-4o", b"B" * 16)
88
+ self.assertEqual(c1, c1b) # same id + salt -> same commit
89
+ self.assertNotEqual(c1, c2) # different salt -> different commit
90
+ signer = generate_signer()
91
+ claim, _ = _claim(signer)
92
+ payload = json.dumps(claim)
93
+ self.assertNotIn("acme/model-x", payload) # plaintext id never in the payload
94
+ with self.assertRaises(EvalClaimError):
95
+ salted_commit("x", b"short") # salt must be >= 16 bytes
96
+
97
+ def test_tamper_red(self):
98
+ signer = generate_signer()
99
+ claim, _ = _claim(signer)
100
+ bundle = emit_eval_receipt(claim, signer)
101
+ bundle["payload_b64"] = base64.b64encode(b'{"tampered":true}').decode("ascii")
102
+ self.assertFalse(verify_bundle(bundle).ok)
103
+ self.assertIsNone(decode_eval_claim(bundle))
104
+
105
+
106
+ if __name__ == "__main__":
107
+ unittest.main()
@@ -0,0 +1,39 @@
1
+ """proofbundle verifies an SD-JWT produced by the EXTERNAL reference library.
2
+
3
+ The fixture tests/fixtures/sdjwt_reference_eddsa.json was generated by
4
+ openwallet-foundation-labs/sd-jwt-python (the reference implementation that
5
+ produces the IETF/RFC 9901 examples) with an Ed25519 issuer key and two
6
+ selectively-disclosable claims. proofbundle must verify both the disclosure-digest
7
+ commitments and the EdDSA issuer signature — i.e. it interops with the reference
8
+ tool, not just with its own emitter. No network / no sd-jwt dependency at test
9
+ time; the SD-JWT is committed.
10
+ """
11
+ import json
12
+ import unittest
13
+ from base64 import b64decode
14
+ from pathlib import Path
15
+
16
+ from proofbundle.sdjwt import verify_sd_jwt
17
+
18
+ FIXTURE = Path(__file__).resolve().parent / "fixtures" / "sdjwt_reference_eddsa.json"
19
+
20
+
21
+ @unittest.skipIf(not FIXTURE.exists(), "sd-jwt reference fixture not present")
22
+ class TestSdJwtReference(unittest.TestCase):
23
+ def setUp(self):
24
+ self.f = json.loads(FIXTURE.read_text(encoding="utf-8"))
25
+
26
+ def test_source_documented(self):
27
+ self.assertIn("sd-jwt-python", self.f["source"])
28
+
29
+ def test_proofbundle_verifies_reference_sd_jwt(self):
30
+ res = verify_sd_jwt(self.f["compact"], b64decode(self.f["issuer_public_key_b64"]))
31
+ self.assertTrue(res["structure_ok"], res)
32
+ self.assertTrue(res["sig_ok"], res)
33
+ self.assertIn("2 disclosure", res["detail"])
34
+
35
+ def test_wrong_issuer_key_is_rejected(self):
36
+ # a different key must fail the issuer-signature check (no false accept).
37
+ import os
38
+ res = verify_sd_jwt(self.f["compact"], os.urandom(32))
39
+ self.assertFalse(res.get("sig_ok"), res)
File without changes
File without changes