proofbundle 0.4.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {proofbundle-0.4.1/src/proofbundle.egg-info → proofbundle-0.6.0}/PKG-INFO +43 -8
  2. {proofbundle-0.4.1 → proofbundle-0.6.0}/README.md +38 -7
  3. {proofbundle-0.4.1 → proofbundle-0.6.0}/pyproject.toml +8 -3
  4. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/__init__.py +1 -1
  5. proofbundle-0.6.0/src/proofbundle/adapters/inspect_ai.py +65 -0
  6. proofbundle-0.6.0/src/proofbundle/adapters/lm_eval.py +76 -0
  7. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/evalclaim.py +4 -2
  8. proofbundle-0.6.0/src/proofbundle/intoto.py +63 -0
  9. proofbundle-0.6.0/src/proofbundle/sdjwt_issue.py +119 -0
  10. {proofbundle-0.4.1 → proofbundle-0.6.0/src/proofbundle.egg-info}/PKG-INFO +43 -8
  11. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/SOURCES.txt +4 -0
  12. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/requires.txt +5 -0
  13. proofbundle-0.6.0/tests/test_adapters.py +56 -0
  14. proofbundle-0.6.0/tests/test_intoto.py +63 -0
  15. proofbundle-0.6.0/tests/test_sdjwt_issue.py +98 -0
  16. proofbundle-0.4.1/src/proofbundle/adapters/inspect_ai.py +0 -36
  17. proofbundle-0.4.1/src/proofbundle/adapters/lm_eval.py +0 -32
  18. proofbundle-0.4.1/tests/test_adapters.py +0 -32
  19. {proofbundle-0.4.1 → proofbundle-0.6.0}/LICENSE +0 -0
  20. {proofbundle-0.4.1 → proofbundle-0.6.0}/setup.cfg +0 -0
  21. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/adapters/__init__.py +0 -0
  22. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/bundle.py +0 -0
  23. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/cli.py +0 -0
  24. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/emit.py +0 -0
  25. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/errors.py +0 -0
  26. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/merkle.py +0 -0
  27. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/py.typed +0 -0
  28. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/sdjwt.py +0 -0
  29. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/signature.py +0 -0
  30. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
  31. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
  32. {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/top_level.txt +0 -0
  33. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_bundle.py +0 -0
  34. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_cli.py +0 -0
  35. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_cli_eval.py +0 -0
  36. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_emit.py +0 -0
  37. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_eval_claim_schema.py +0 -0
  38. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_evalclaim.py +0 -0
  39. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_merkle.py +0 -0
  40. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_merkle_property.py +0 -0
  41. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_rekor_interop.py +0 -0
  42. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_rfc6962_external_vectors.py +0 -0
  43. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_schema.py +0 -0
  44. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_sdjwt_reference.py +0 -0
  45. {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_signature.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.4.1
3
+ Version: 0.6.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -27,6 +27,8 @@ Provides-Extra: sdjwt
27
27
  Provides-Extra: eval
28
28
  Requires-Dist: rfc8785>=0.1.4; extra == "eval"
29
29
  Provides-Extra: adapters
30
+ Provides-Extra: inspect
31
+ Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
30
32
  Provides-Extra: dev
31
33
  Requires-Dist: pytest>=7; extra == "dev"
32
34
  Requires-Dist: ruff>=0.5; extra == "dev"
@@ -35,6 +37,8 @@ Requires-Dist: mypy>=1.8; extra == "dev"
35
37
  Requires-Dist: build>=1; extra == "dev"
36
38
  Requires-Dist: hypothesis>=6; extra == "dev"
37
39
  Requires-Dist: rfc8785>=0.1.4; extra == "dev"
40
+ Requires-Dist: sd-jwt>=0.10; extra == "dev"
41
+ Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
38
42
  Dynamic: license-file
39
43
 
40
44
  <div align="center">
@@ -51,17 +55,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
51
55
  selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
52
56
 
53
57
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
54
- [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
55
- [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
58
+ [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
59
+ [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
60
+ [![Downloads](https://static.pepy.tech/badge/proofbundle)](https://pepy.tech/project/proofbundle)
56
61
  [![License: MIT](https://img.shields.io/badge/license-MIT-D6248A.svg)](LICENSE)
57
62
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
58
63
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
64
+ [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
59
65
 
60
66
  </div>
61
67
 
62
68
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
63
69
  verify` checks one self-contained `bundle.json` with three offline cryptographic
64
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 50 tests.
70
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
65
71
 
66
72
  ## Contents
67
73
 
@@ -282,15 +288,44 @@ commitments — it does **not** prove the evaluation was well designed or that t
282
288
  itself is correct. Those are human judgements; what it removes is the need to simply
283
289
  trust the number.
284
290
 
291
+ ### A verification layer for trustworthy eval logs
292
+
293
+ The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
294
+ a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
295
+ missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
296
+ aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
297
+ See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
298
+
299
+ - **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
300
+ [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
301
+ API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
302
+ [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
303
+ genuine `acc,none` filter-suffix format) and captures run provenance — without importing lm_eval.
304
+ - **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
305
+ emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
306
+ digest is an *honest salted commitment* under a custom key, never `sha256` (see
307
+ [PREDICATE.md](PREDICATE.md)).
308
+ - **SD-JWT issuance** (RFC 9901) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
309
+ root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
310
+ `threshold` while **withholding the exact score** and the identifier openings. The signed
311
+ bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
312
+ by proofbundle's own verifier **and** the `sd-jwt-python` reference.
313
+
314
+ Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
315
+ attestation — see [SECURITY.md](SECURITY.md).
316
+
285
317
  ## Roadmap
286
318
 
287
319
  - **v0.1** — the offline verifier plus a real example bundle.
288
320
  - **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
289
321
  - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
290
- - **v0.4 (current release)** — the eval-receipt emitter (`emit_eval_receipt` /
291
- `proofbundle emit-eval`), salted commitments, issuer binding, file-based adapters.
292
- - **v0.5** — selective disclosure of the exact score via SD-JWT **issuance** (the issuer
293
- reveals identifier + salt on demand) and full SD-JWT VC conformance.
322
+ - **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
323
+ salted commitments, issuer binding.
324
+ - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
325
+ - **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
326
+ INTEROP.md, CITATION.cff, PEP 740 attestations documented.
327
+ - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
328
+ Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
294
329
 
295
330
  ## Contributing
296
331
 
@@ -12,17 +12,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
12
12
  selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
13
13
 
14
14
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
15
- [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
16
- [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
15
+ [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
16
+ [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
17
+ [![Downloads](https://static.pepy.tech/badge/proofbundle)](https://pepy.tech/project/proofbundle)
17
18
  [![License: MIT](https://img.shields.io/badge/license-MIT-D6248A.svg)](LICENSE)
18
19
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
19
20
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
21
+ [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
20
22
 
21
23
  </div>
22
24
 
23
25
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
24
26
  verify` checks one self-contained `bundle.json` with three offline cryptographic
25
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 50 tests.
27
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
26
28
 
27
29
  ## Contents
28
30
 
@@ -243,15 +245,44 @@ commitments — it does **not** prove the evaluation was well designed or that t
243
245
  itself is correct. Those are human judgements; what it removes is the need to simply
244
246
  trust the number.
245
247
 
248
+ ### A verification layer for trustworthy eval logs
249
+
250
+ The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
251
+ a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
252
+ missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
253
+ aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
254
+ See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
255
+
256
+ - **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
257
+ [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
258
+ API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
259
+ [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
260
+ genuine `acc,none` filter-suffix format) and captures run provenance — without importing lm_eval.
261
+ - **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
262
+ emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
263
+ digest is an *honest salted commitment* under a custom key, never `sha256` (see
264
+ [PREDICATE.md](PREDICATE.md)).
265
+ - **SD-JWT issuance** (RFC 9901) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
266
+ root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
267
+ `threshold` while **withholding the exact score** and the identifier openings. The signed
268
+ bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
269
+ by proofbundle's own verifier **and** the `sd-jwt-python` reference.
270
+
271
+ Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
272
+ attestation — see [SECURITY.md](SECURITY.md).
273
+
246
274
  ## Roadmap
247
275
 
248
276
  - **v0.1** — the offline verifier plus a real example bundle.
249
277
  - **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
250
278
  - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
251
- - **v0.4 (current release)** — the eval-receipt emitter (`emit_eval_receipt` /
252
- `proofbundle emit-eval`), salted commitments, issuer binding, file-based adapters.
253
- - **v0.5** — selective disclosure of the exact score via SD-JWT **issuance** (the issuer
254
- reveals identifier + salt on demand) and full SD-JWT VC conformance.
279
+ - **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
280
+ salted commitments, issuer binding.
281
+ - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
282
+ - **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
283
+ INTEROP.md, CITATION.cff, PEP 740 attestations documented.
284
+ - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
285
+ Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
255
286
 
256
287
  ## Contributing
257
288
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "proofbundle"
7
- version = "0.4.1"
7
+ version = "0.6.0"
8
8
  description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -43,9 +43,14 @@ sdjwt = []
43
43
  # path (verify_bundle / decode_eval_claim) never canonicalizes — it checks stored bytes — so the
44
44
  # verifier stays dependency-free. `pip install proofbundle[eval]` adds emit-side canonicalization.
45
45
  eval = ["rfc8785>=0.1.4"]
46
- # Framework adapters read exported result JSON only (no framework import) → pure stdlib today.
46
+ # The lm-eval adapter reads exported results.json (no import) → pure stdlib.
47
47
  adapters = []
48
- dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6", "rfc8785>=0.1.4"]
48
+ # The inspect_ai adapter uses the STABLE read_eval_log API (lazy import). Pinned with an UPPER bound:
49
+ # the .eval format + pydantic schema change between versions (inspect_ai issue 834), and the fixture
50
+ # test is bound to this range. `pip install "proofbundle[inspect]"`.
51
+ inspect = ["inspect_ai>=0.3.100,<0.4"]
52
+ dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6",
53
+ "rfc8785>=0.1.4", "sd-jwt>=0.10", "inspect_ai>=0.3.100,<0.4"]
49
54
 
50
55
  [project.urls]
51
56
  Homepage = "https://b7n0de.com"
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
13
13
  from .errors import Check, ProofBundleError, VerificationResult
14
14
  from .merkle import verify_consistency, verify_inclusion
15
15
 
16
- __version__ = "0.4.1"
16
+ __version__ = "0.6.0"
17
17
 
18
18
  __all__ = [
19
19
  "__version__",
@@ -0,0 +1,65 @@
1
+ """Adapter for UK AISI inspect_ai eval logs — via the STABLE API, optional extra `proofbundle[inspect]`.
2
+
3
+ Unlike the v0.4 file-based reader, this uses the stable `inspect_ai.log.read_eval_log(path,
4
+ header_only=True)` API (the `.eval` on-disk format + its pydantic schema change between versions, see
5
+ inspect_ai issue 834; the stable API is robust). inspect_ai is imported LAZILY inside the function, so
6
+ the proofbundle core stays dependency-free — only `pip install "proofbundle[inspect]"` pulls it.
7
+
8
+ Object model (inspect_ai): `log.eval.task` is the suite; `log.results.scores` is a list of EvalScore;
9
+ `EvalScore.metrics` is a dict name→EvalMetric; `EvalMetric.value` is the number. threshold, comparator
10
+ and thus `passed` are set by proofbundle, NOT read from the log. model_id/dataset_id become salted
11
+ commitments (never plaintext in the payload).
12
+ """
13
+ from __future__ import annotations
14
+
15
+ from typing import Optional
16
+
17
+ from ..evalclaim import build_eval_claim
18
+
19
+
20
class InspectAdapterError(RuntimeError):
    """Raised when inspect_ai is missing or the log lacks the expected structure (no bare AttributeError)."""


def _attr_text(obj, name: str, fallback: str) -> str:
    """Return `str(obj.<name>)`, or `fallback` when the attribute is absent, None or empty."""
    # getattr's own default only covers a MISSING attribute; a present-but-None one would otherwise be
    # stringified to the literal "None" and end up inside a commitment.
    return str(getattr(obj, name, None) or fallback)


def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, timestamp: str,
                        model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
    """Read an inspect_ai eval log via `read_eval_log(..., header_only=True)` and build an eval claim.

    Args:
        path: location of the log; passed to inspect_ai as `str(path)`.
        metric: key looked up in each `score.metrics` mapping of `log.results.scores`; the first score
            that contains it wins.
        comparator, threshold, timestamp: forwarded unchanged to `build_eval_claim`.
        model_salt, dataset_salt: optional salts forwarded unchanged to `build_eval_claim`.

    Returns:
        Whatever `build_eval_claim` returns (documented there as `(claim, salts)`).

    Raises:
        InspectAdapterError: inspect_ai cannot be imported, the log cannot be read, `.eval`/`.results`
            is missing, or the metric is absent / carries no value.
    """
    try:
        from inspect_ai.log import read_eval_log  # noqa: PLC0415 — lazy: keeps the core dependency-free
    except ImportError as e:
        raise InspectAdapterError(
            "inspect_ai is required for this adapter — install with: pip install \"proofbundle[inspect]\"") from e

    try:
        log = read_eval_log(str(path), header_only=True)
    except Exception as e:  # noqa: BLE001 — surface any read/parse failure as a clear adapter error
        raise InspectAdapterError(f"could not read inspect_ai log {path!r}: {e}") from e

    ev = getattr(log, "eval", None)
    results = getattr(log, "results", None)
    if ev is None or results is None:
        raise InspectAdapterError("inspect_ai log missing .eval or .results (empty or malformed log)")

    found = False
    value = None
    for score in (getattr(results, "scores", None) or []):
        metrics = getattr(score, "metrics", None) or {}
        if metric in metrics:
            found = True
            value = getattr(metrics[metric], "value", None)
            break
    if not found:
        raise InspectAdapterError(f"metric {metric!r} not found in any score.metrics of the log")
    if value is None:
        # Present but empty is a different failure from "absent" — say so instead of "not found".
        raise InspectAdapterError(f"metric {metric!r} is present in the log but has no value")

    suite = _attr_text(ev, "task", "inspect_ai")
    model_id = _attr_text(ev, "model", "unknown")
    dataset_id = _attr_text(getattr(ev, "dataset", None), "name", suite)
    # A task_version of 0 is a legitimate value, so only None (or a missing attribute) falls back to "1".
    task_version = getattr(ev, "task_version", None)
    suite_version = "1" if task_version is None else str(task_version)

    return build_eval_claim(
        suite=suite, suite_version=suite_version,
        metric=metric, comparator=comparator, threshold=threshold, score=repr(value),
        n=int(getattr(results, "total_samples", 0) or 0),
        model_id=model_id, dataset_id=dataset_id, issuer="", timestamp=timestamp,
        model_salt=model_salt, dataset_salt=dataset_salt)
@@ -0,0 +1,76 @@
1
+ """Adapter for EleutherAI lm-evaluation-harness results_*.json (file-based, NO lm_eval import).
2
+
3
+ Parses the exported result JSON only — no runtime dependency on lm_eval, no runner rebuild.
4
+
5
+ Real 0.4.x format (validated against a genuine harness run, see tests/fixtures/lm_eval_arc_easy_real.json):
6
+ the metric keys carry a *filter suffix*, e.g. `"acc,none"`, and the standard error is a **sibling** key
7
+ `"acc_stderr,none"` (not nested). So a caller asking for metric `"acc"` is matched against `"acc,none"`
8
+ (or `"acc,<filter>"`). Provenance (git_hash, harness/task version, n-shot) is copied into the receipt's
9
+ optional `provenance` field so a verifier can trace exactly which run produced it.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ from ..evalclaim import build_eval_claim
18
+
19
+
20
+ def _find_metric(res: dict, metric: str):
21
+ """Return (value, stderr, matched_key) for `metric`, handling the `metric,<filter>` suffix format.
22
+
23
+ Prefers an exact `metric` key, then `metric,none`, then any `metric,<filter>`. The stderr sibling is
24
+ `metric_stderr,<same filter>`."""
25
+ if metric in res: # bare key (older/simple exports)
26
+ stderr = res.get(f"{metric}_stderr")
27
+ return res[metric], stderr, metric
28
+ if f"{metric},none" in res:
29
+ return res[f"{metric},none"], res.get(f"{metric}_stderr,none"), f"{metric},none"
30
+ for key in res: # any filter, e.g. metric,custom-filter
31
+ if key == metric or (key.startswith(f"{metric},") and not key.startswith(f"{metric}_stderr")):
32
+ flt = key.split(",", 1)[1] if "," in key else "none"
33
+ return res[key], res.get(f"{metric}_stderr,{flt}"), key
34
+ return None, None, None
35
+
36
+
37
def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
                         timestamp: str, model_salt: Optional[bytes] = None,
                         dataset_salt: Optional[bytes] = None):
    """Read an lm-evaluation-harness results_*.json and build an eval claim for `task`/`metric`.

    `metric` is the bare name (e.g. "acc"); the key actually present may be "acc,none" (resolved by
    `_find_metric`). A non-string score is passed on as its `repr()`, a string score unchanged.

    The sample count is the first truthy of `n-samples[task].effective`, `n-samples[task].original`,
    `results[task].sample_len`, else 0. The model identifier is `config.model_name` or `config.model`
    (else "unknown"), with `::<model_args>` appended when `config.model_args` is set.

    Returns whatever `build_eval_claim` returns (documented there as `(claim, salts)`).

    Raises:
        ValueError: `task` is not in `results`, or `metric` cannot be matched for that task.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))

    def section(name: str) -> dict:
        # `data.get(name, {})` only defaults when the key is ABSENT; an explicit JSON null would
        # otherwise raise AttributeError on the chained `.get(task)`.
        return data.get(name) or {}

    res = section("results").get(task)
    if res is None:
        raise ValueError(f"task not found in results: {task!r}")
    value, stderr, matched = _find_metric(res, metric)
    if value is None:
        raise ValueError(f"metric {metric!r} not found in results[{task!r}] "
                         f"(available: {sorted(k for k in res if ',' in k)})")
    score = value if isinstance(value, str) else repr(value)

    n_samples = section("n-samples").get(task) or {}
    n = int(n_samples.get("effective") or n_samples.get("original") or res.get("sample_len") or 0)

    cfg = section("config")
    model_id = str(cfg.get("model_name") or cfg.get("model") or "unknown")
    if cfg.get("model_args"):
        model_id = f"{model_id}::{cfg['model_args']}"  # include args so the commitment pins the exact model

    task_version = section("versions").get(task)
    n_shot = section("n-shot").get(task)

    # Run provenance copied into the claim's optional `provenance` field.
    provenance = {"harness": "lm-evaluation-harness", "matched_metric_key": matched}
    if data.get("git_hash"):
        provenance["git_hash"] = str(data["git_hash"])
    if task_version is not None:
        provenance["task_version"] = str(task_version)
    if n_shot is not None:
        provenance["n_shot"] = str(n_shot)
    if stderr is not None:
        provenance["stderr"] = stderr if isinstance(stderr, str) else repr(stderr)

    return build_eval_claim(
        # A missing OR null task version falls back to "lm-eval" (never the literal string "None").
        suite=task, suite_version="lm-eval" if task_version is None else str(task_version),
        metric=metric, comparator=comparator, threshold=threshold, score=score, n=n,
        model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
        provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
@@ -37,7 +37,7 @@ _MAX_SAFE_INT = 2 ** 53 - 1
37
37
  # The exact key set of an eval claim; decode/validate reject anything else.
38
38
  _REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
39
39
  "passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
40
- _OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256"}
40
+ _OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256", "provenance"}
41
41
 
42
42
  __all__ = [
43
43
  "EVAL_CLAIM_SCHEMA", "COMMIT_ALG", "canonicalize", "build_eval_claim",
@@ -126,7 +126,7 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
126
126
  threshold: str, score: str, n: int, model_id: str, dataset_id: str,
127
127
  issuer: str, timestamp: str, context_binding: Optional[str] = None,
128
128
  ci95: Optional[Sequence[str]] = None, multiple_testing: Optional[str] = None,
129
- prereg_sha256: Optional[str] = None,
129
+ prereg_sha256: Optional[str] = None, provenance: Optional[dict] = None,
130
130
  model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
131
131
  """Build a valid eval claim from raw values. Computes `passed` ITSELF from the comparator
132
132
  (never trusts the caller), creates salted commitments, and returns (claim, salts) with the
@@ -163,6 +163,8 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
163
163
  claim["multiple_testing"] = multiple_testing
164
164
  if prereg_sha256 is not None:
165
165
  claim["prereg_sha256"] = prereg_sha256
166
+ if provenance is not None:
167
+ claim["provenance"] = provenance
166
168
  _reject_non_jcs(claim)
167
169
  return claim, {"model_salt": m_salt, "dataset_salt": d_salt}
168
170
 
@@ -0,0 +1,63 @@
1
+ """in-toto Statement v1 view of an eval receipt (self-hosted predicate type).
2
+
3
+ A self-hosted `predicateType` URI is fully in-toto-spec-conform and the right choice for a solo v0.x
4
+ (no official in-toto/attestation PR needed). See PREDICATE.md.
5
+
6
+ HONESTY (important): the `subject.digest` here is a SALTED COMMITMENT to the model identifier, NOT the
7
+ content hash of an artifact. Placing it under the standard `sha256` key would suggest an artifact hash
8
+ and mislead generic in-toto verifiers. in-toto permits arbitrary digest keys, so we use a unique custom
9
+ key `proofbundleModelCommitV1`; the `subject.name` is the descriptive `model-id-commitment`; and the
10
+ predicate mirrors the note in `subject_digest_note`. Full artifact digests come only once a model artifact
11
+ exists (deferred, see the roadmap).
12
+ """
13
+ from __future__ import annotations
14
+
15
+ from typing import Optional
16
+
17
+ STATEMENT_TYPE = "https://in-toto.io/Statement/v1"
18
+ PREDICATE_TYPE = "https://b7n0de.com/proofbundle/eval-receipt/v0.1"
19
+ VERIFIER_ID = "https://b7n0de.com/proofbundle"
20
+ MODEL_COMMIT_DIGEST_KEY = "proofbundleModelCommitV1"
21
+
22
+ _SUBJECT_DIGEST_NOTE = (
23
+ "subject.digest is a salted commitment to the model identifier (key "
24
+ f"{MODEL_COMMIT_DIGEST_KEY}), NOT an artifact content hash — do not treat it as sha256.")
25
+
26
+
27
+ def _commit_hex(commit: str) -> str:
28
+ """Extract the hex of a `sha256:<hex>` salted commitment (the value that goes into the digest)."""
29
+ return commit.split(":", 1)[1] if ":" in commit else commit
30
+
31
+
32
def to_intoto_statement(claim: dict, *, root_b64: Optional[str] = None,
                        harness: Optional[dict] = None) -> dict:
    """Wrap an eval claim in an in-toto Statement v1 dict.

    The single subject is named `model-id-commitment`; its digest holds the hex of
    `claim["model_id_commit"]` under the custom key `MODEL_COMMIT_DIGEST_KEY` (not `sha256`).
    The predicate carries the timestamp, suite, one metric/comparator/threshold/passed entry, the
    dataset commitment (None if the claim has none) and the subject-digest note.

    `harness` (e.g. {"name": "inspect_ai", "version": "0.3.217"}) and `root_b64` are optional; each is
    added to the predicate only when truthy — `harness` as given, `root_b64` inside a `receipt` entry.
    """
    model_subject = {
        "name": "model-id-commitment",
        "digest": {MODEL_COMMIT_DIGEST_KEY: _commit_hex(claim["model_id_commit"])},
    }

    predicate = {
        "verifier": {"id": VERIFIER_ID},
        "evaluatedAt": claim["timestamp"],
        "suite": claim["suite"],
        "claims": [{field: claim[field] for field in ("metric", "comparator", "threshold", "passed")}],
        "datasetCommit": claim.get("dataset_id_commit"),
        "subject_digest_note": _SUBJECT_DIGEST_NOTE,
    }
    if harness:
        predicate["harness"] = harness
    if root_b64:
        predicate["receipt"] = {"schema": "proofbundle/v0.1", "root_b64": root_b64}

    return {
        "_type": STATEMENT_TYPE,
        "subject": [model_subject],
        "predicateType": PREDICATE_TYPE,
        "predicate": predicate,
    }
@@ -0,0 +1,119 @@
1
+ """SD-JWT issuance per RFC 9901 — the differentiation feature (v0.5).
2
+
3
+ Issue an eval receipt so a holder can disclose `passed` + `threshold` while WITHHOLDING the exact score
4
+ and the identifier openings. The existing verifier (proofbundle.sdjwt) stays; this adds issuance.
5
+
6
+ Source of truth: the signed canonical bundle payload (evalclaim) is the ONLY truth. This SD-JWT is a
7
+ derived view — its always-open claims are copied bit-exact from that payload, and it binds the bundle
8
+ anchor via `receipt.root_b64`. Sign the SD-JWT with the SAME Ed25519 key that signed the bundle (matching
9
+ the `issuer` field). A holder cannot lift a claim under a different key.
10
+
11
+ Always-open (plaintext JWT claims, NEVER a disclosure): passed, threshold, comparator, suite, issuer,
12
+ receipt.root_b64. Selectively-disclosable (via `_sd` + disclosures): the exact metric value, ci95, and
13
+ the identifier-commitment openings (identifier + salt).
14
+
15
+ RFC 9901 §4.2.4.1 digest byte-chain (the subtle, load-bearing detail): for each disclosable field, a
16
+ CSPRNG salt of ≥128 bit (base64url); the disclosure is base64url(UTF-8(JSON array [salt, name, value]));
17
+ the digest placed in `_sd` is **base64url(SHA-256(ASCII bytes of the base64url-ENCODED disclosure
18
+ string))** — hashed over the ENCODED string, NOT over the JSON bytes. `_sd_alg` = "sha-256" at the top
19
+ level. The JWT is signed with EdDSA. Compact form is tilde-separated: JWT~disclosure1~...~ (trailing ~).
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import base64
24
+ import hashlib
25
+ import json
26
+ import os
27
+ from typing import Optional, Sequence
28
+
29
+ from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey
30
+ from cryptography.hazmat.primitives.serialization import Encoding, PublicFormat
31
+
32
+ SD_ALG = "sha-256"
33
+ _SALT_BYTES = 16 # 128 bit
34
+
35
+
36
+ def _b64url(data: bytes) -> str:
37
+ return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii")
38
+
39
+
40
+ def _make_disclosure(name: str, value, salt_b64: str) -> tuple[str, str]:
41
+ """Return (disclosure_b64url, digest_b64url) per RFC 9901 §4.2.4.1.
42
+
43
+ The digest hashes the ASCII bytes of the base64url-ENCODED disclosure string (not the JSON bytes)."""
44
+ disclosure_json = json.dumps([salt_b64, name, value]) # array [salt, name, value]
45
+ disclosure_b64 = _b64url(disclosure_json.encode("utf-8"))
46
+ digest = _b64url(hashlib.sha256(disclosure_b64.encode("ascii")).digest())
47
+ return disclosure_b64, digest
48
+
49
+
50
+ def issue_sd_jwt(claim: dict, signer: Ed25519PrivateKey, *, root_b64: str,
51
+ exact_score: Optional[str] = None, ci95: Optional[Sequence[str]] = None,
52
+ model_id_opening: Optional[Sequence] = None,
53
+ dataset_id_opening: Optional[Sequence] = None) -> str:
54
+ """Issue a compact SD-JWT for the eval claim, signed with `signer` (must match claim['issuer']).
55
+
56
+ Openings are (identifier, salt_hex) pairs the issuer may later reveal; `exact_score`/`ci95` are the
57
+ withheld numeric detail. All extras are selectively-disclosable; the pass/threshold facts are open.
58
+ """
59
+ always_open = {
60
+ "passed": claim["passed"], "threshold": claim["threshold"],
61
+ "comparator": claim["comparator"], "suite": claim["suite"],
62
+ "issuer": claim["issuer"], "receipt": {"root_b64": root_b64},
63
+ }
64
+ disclosures: list[str] = []
65
+ sd_digests: list[str] = []
66
+
67
+ def _add(name: str, value):
68
+ d, dig = _make_disclosure(name, value, _b64url(os.urandom(_SALT_BYTES)))
69
+ disclosures.append(d)
70
+ sd_digests.append(dig)
71
+
72
+ if exact_score is not None:
73
+ _add("exact_score", exact_score)
74
+ if ci95 is not None:
75
+ _add("ci95", list(ci95))
76
+ if model_id_opening is not None:
77
+ _add("model_id_opening", list(model_id_opening))
78
+ if dataset_id_opening is not None:
79
+ _add("dataset_id_opening", list(dataset_id_opening))
80
+
81
+ payload = dict(always_open)
82
+ if sd_digests:
83
+ payload["_sd"] = sd_digests
84
+ payload["_sd_alg"] = SD_ALG
85
+
86
+ header = {"alg": "EdDSA", "typ": "sd-jwt"}
87
+ signing_input = _b64url(json.dumps(header).encode("utf-8")) + "." + _b64url(json.dumps(payload).encode("utf-8"))
88
+ signature = signer.sign(signing_input.encode("ascii"))
89
+ jwt = signing_input + "." + _b64url(signature)
90
+
91
+ # compact: JWT ~ disclosure1 ~ ... ~ (trailing tilde, no key-binding JWT in v0.5)
92
+ return "~".join([jwt, *disclosures]) + "~"
93
+
94
+
95
+ def issuer_matches(claim: dict, signer: Ed25519PrivateKey) -> bool:
96
+ """True iff the claim's issuer fingerprint equals the signer's public key (bundle↔SD-JWT same key)."""
97
+ raw = signer.public_key().public_bytes(Encoding.Raw, PublicFormat.Raw)
98
+ return claim.get("issuer") == "ed25519:" + base64.b64encode(raw).decode("ascii")
99
+
100
+
101
+ def _jwt_payload(compact: str) -> dict:
102
+ """Decode the always-open JWT payload of a compact SD-JWT (the part before the first '~')."""
103
+ jwt = compact.split("~", 1)[0]
104
+ payload_b64 = jwt.split(".")[1]
105
+ padded = payload_b64 + "=" * (-len(payload_b64) % 4)
106
+ return json.loads(base64.urlsafe_b64decode(padded).decode("utf-8"))
107
+
108
+
109
+ def check_binds_bundle(compact: str, claim: dict, root_b64: str) -> bool:
110
+ """No-Fake binding: the SD-JWT's always-open claims MUST match the signed bundle payload bit-exact and
111
+ bind its merkle root. A derived SD-JWT that diverges from its bundle source of truth is rejected."""
112
+ try:
113
+ p = _jwt_payload(compact)
114
+ except (ValueError, KeyError, IndexError):
115
+ return False
116
+ return (p.get("passed") == claim["passed"] and p.get("threshold") == claim["threshold"]
117
+ and p.get("comparator") == claim["comparator"] and p.get("suite") == claim["suite"]
118
+ and p.get("issuer") == claim["issuer"]
119
+ and (p.get("receipt") or {}).get("root_b64") == root_b64)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.4.1
3
+ Version: 0.6.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -27,6 +27,8 @@ Provides-Extra: sdjwt
27
27
  Provides-Extra: eval
28
28
  Requires-Dist: rfc8785>=0.1.4; extra == "eval"
29
29
  Provides-Extra: adapters
30
+ Provides-Extra: inspect
31
+ Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
30
32
  Provides-Extra: dev
31
33
  Requires-Dist: pytest>=7; extra == "dev"
32
34
  Requires-Dist: ruff>=0.5; extra == "dev"
@@ -35,6 +37,8 @@ Requires-Dist: mypy>=1.8; extra == "dev"
35
37
  Requires-Dist: build>=1; extra == "dev"
36
38
  Requires-Dist: hypothesis>=6; extra == "dev"
37
39
  Requires-Dist: rfc8785>=0.1.4; extra == "dev"
40
+ Requires-Dist: sd-jwt>=0.10; extra == "dev"
41
+ Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
38
42
  Dynamic: license-file
39
43
 
40
44
  <div align="center">
@@ -51,17 +55,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
51
55
  selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
52
56
 
53
57
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
54
- [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
55
- [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
58
+ [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
59
+ [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
60
+ [![Downloads](https://static.pepy.tech/badge/proofbundle)](https://pepy.tech/project/proofbundle)
56
61
  [![License: MIT](https://img.shields.io/badge/license-MIT-D6248A.svg)](LICENSE)
57
62
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
58
63
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
64
+ [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
59
65
 
60
66
  </div>
61
67
 
62
68
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
63
69
  verify` checks one self-contained `bundle.json` with three offline cryptographic
64
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 50 tests.
70
+ checks → `OK` or `FAILED`. No network, no daemon, no home-grown crypto. 63 tests.
65
71
 
66
72
  ## Contents
67
73
 
@@ -282,15 +288,44 @@ commitments — it does **not** prove the evaluation was well designed or that t
282
288
  itself is correct. Those are human judgements; what it removes is the need to simply
283
289
  trust the number.
284
290
 
291
+ ### A verification layer for trustworthy eval logs
292
+
293
+ The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
294
+ a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
295
+ missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
296
+ aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
297
+ See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
298
+
299
+ - **Two framework adapters** — with `pip install "proofbundle[inspect]"`, proofbundle reads a UK AISI
300
+ [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
301
+ API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
302
+ [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
303
+ genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
304
+ - **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
305
+ emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
306
+ digest is an *honest salted commitment* under a custom key, never `sha256` (see
307
+ [PREDICATE.md](PREDICATE.md)).
308
+ - **SD-JWT issuance** (RFC 9901) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
309
+ root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
310
+ `threshold` while **withholding the exact score** and the identifier openings. The signed
311
+ bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
312
+ by proofbundle's own verifier **and** the `sd-jwt-python` reference.
313
+
314
+ Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
315
+ attestation — see [SECURITY.md](SECURITY.md).
316
+
285
317
  ## Roadmap
286
318
 
287
319
  - **v0.1** — the offline verifier plus a real example bundle.
288
320
  - **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
289
321
  - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
290
- - **v0.4 (current release)** — the eval-receipt emitter (`emit_eval_receipt` /
291
- `proofbundle emit-eval`), salted commitments, issuer binding, file-based adapters.
292
- - **v0.5** — selective disclosure of the exact score via SD-JWT **issuance** (the issuer
293
- reveals identifier + salt on demand) and full SD-JWT VC conformance.
322
+ - **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
323
+ salted commitments, issuer binding.
324
+ - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
325
+ - **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
326
+ INTEROP.md, CITATION.cff, PEP 740 attestations documented.
327
+ - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
328
+ Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
294
329
 
295
330
  ## Contributing
296
331
 
@@ -7,9 +7,11 @@ src/proofbundle/cli.py
7
7
  src/proofbundle/emit.py
8
8
  src/proofbundle/errors.py
9
9
  src/proofbundle/evalclaim.py
10
+ src/proofbundle/intoto.py
10
11
  src/proofbundle/merkle.py
11
12
  src/proofbundle/py.typed
12
13
  src/proofbundle/sdjwt.py
14
+ src/proofbundle/sdjwt_issue.py
13
15
  src/proofbundle/signature.py
14
16
  src/proofbundle.egg-info/PKG-INFO
15
17
  src/proofbundle.egg-info/SOURCES.txt
@@ -27,10 +29,12 @@ tests/test_cli_eval.py
27
29
  tests/test_emit.py
28
30
  tests/test_eval_claim_schema.py
29
31
  tests/test_evalclaim.py
32
+ tests/test_intoto.py
30
33
  tests/test_merkle.py
31
34
  tests/test_merkle_property.py
32
35
  tests/test_rekor_interop.py
33
36
  tests/test_rfc6962_external_vectors.py
34
37
  tests/test_schema.py
38
+ tests/test_sdjwt_issue.py
35
39
  tests/test_sdjwt_reference.py
36
40
  tests/test_signature.py
@@ -10,8 +10,13 @@ mypy>=1.8
10
10
  build>=1
11
11
  hypothesis>=6
12
12
  rfc8785>=0.1.4
13
+ sd-jwt>=0.10
14
+ inspect_ai<0.4,>=0.3.100
13
15
 
14
16
  [eval]
15
17
  rfc8785>=0.1.4
16
18
 
19
+ [inspect]
20
+ inspect_ai<0.4,>=0.3.100
21
+
17
22
  [sdjwt]
@@ -0,0 +1,56 @@
1
+ """Adapters map real exported eval JSON to a valid claim (file-based, no framework import)."""
2
+ import unittest
3
+ from pathlib import Path
4
+
5
+ from proofbundle.adapters import from_inspect_ai_log, from_lm_eval_results
6
+
7
+ FX = Path(__file__).resolve().parent / "fixtures"
8
+ TS = "2026-07-01T12:00:00Z"
9
+
10
+
11
+ class TestAdapters(unittest.TestCase):
12
+ def test_lm_eval_real_acc_none_format(self):
13
+ # REAL lm-evaluation-harness 0.4.12 export: metric key is "acc,none", stderr sibling "acc_stderr,none".
14
+ claim, salts = from_lm_eval_results(FX / "lm_eval_arc_easy_real.json", "arc_easy", "acc",
15
+ comparator=">=", threshold="0.30", timestamp=TS,
16
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
17
+ self.assertEqual(claim["suite"], "arc_easy")
18
+ self.assertTrue(claim["passed"]) # acc 0.5 >= 0.30
19
+ self.assertEqual(claim["provenance"]["matched_metric_key"], "acc,none") # suffix handled
20
+ self.assertIn("git_hash", claim["provenance"]) # provenance captured
21
+ self.assertEqual(claim["provenance"]["n_shot"], "0")
22
+ self.assertIn("stderr", claim["provenance"]) # sibling stderr, not nested
23
+
24
+ def test_lm_eval_missing_metric_lists_available(self):
25
+ with self.assertRaises(ValueError):
26
+ from_lm_eval_results(FX / "lm_eval_arc_easy_real.json", "arc_easy", "nonexistent",
27
+ comparator=">=", threshold="0.5", timestamp=TS,
28
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
29
+
30
+ def test_inspect_ai_stable_api(self):
31
+ # Real .eval log fixture, read via the stable inspect_ai.log.read_eval_log API (proofbundle[inspect]).
32
+ try:
33
+ import inspect_ai.log # noqa: F401
34
+ except ImportError:
35
+ self.skipTest("inspect_ai not installed (pip install proofbundle[inspect])")
36
+ claim, salts = from_inspect_ai_log(FX / "inspect_logs" / "safety_refusal_demo.eval", "accuracy",
37
+ comparator=">=", threshold="0.00", timestamp=TS,
38
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
39
+ self.assertEqual(claim["suite"], "safety_refusal_demo")
40
+ self.assertTrue(claim["passed"]) # accuracy 0.0 >= 0.00
41
+ self.assertNotIn("mockllm/model", str(claim)) # model id only as salted commitment
42
+
43
+ def test_inspect_ai_missing_metric_clear_error(self):
44
+ from proofbundle.adapters.inspect_ai import InspectAdapterError
45
+ try:
46
+ import inspect_ai.log # noqa: F401
47
+ except ImportError:
48
+ self.skipTest("inspect_ai not installed")
49
+ with self.assertRaises(InspectAdapterError):
50
+ from_inspect_ai_log(FX / "inspect_logs" / "safety_refusal_demo.eval", "nonexistent_metric",
51
+ comparator=">=", threshold="0.5", timestamp=TS,
52
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
53
+
54
+
55
+ if __name__ == "__main__":
56
+ unittest.main()
@@ -0,0 +1,63 @@
1
+ """in-toto Statement v1 view of an eval receipt — structurally valid + honest salted-commitment digest."""
2
+ import json
3
+ import unittest
4
+ from pathlib import Path
5
+
6
+ try:
7
+ import jsonschema
8
+ except ImportError:
9
+ jsonschema = None
10
+
11
+ from proofbundle.emit import generate_signer
12
+ from proofbundle.evalclaim import build_eval_claim, issuer_fingerprint
13
+ from proofbundle.intoto import MODEL_COMMIT_DIGEST_KEY, PREDICATE_TYPE, to_intoto_statement
14
+
15
+ ROOT = Path(__file__).resolve().parents[1]
16
+ TS = "2026-07-01T12:00:00Z"
17
+
18
+
19
+ def _claim():
20
+ signer = generate_signer()
21
+ claim, _ = build_eval_claim(
22
+ suite="safety-refusal", suite_version="v1", metric="accuracy", comparator=">=",
23
+ threshold="0.65", score="0.92", n=500, model_id="acme/model-x", dataset_id="acme/set",
24
+ issuer=issuer_fingerprint(signer), timestamp=TS, model_salt=b"0" * 16, dataset_salt=b"1" * 16)
25
+ return claim
26
+
27
+
28
+ class TestInToto(unittest.TestCase):
29
+ def test_structure(self):
30
+ stmt = to_intoto_statement(_claim(), root_b64="cm9vdA==",
31
+ harness={"name": "inspect_ai", "version": "0.3.217"})
32
+ self.assertEqual(stmt["_type"], "https://in-toto.io/Statement/v1")
33
+ self.assertEqual(stmt["predicateType"], PREDICATE_TYPE)
34
+ self.assertEqual(len(stmt["subject"]), 1)
35
+ self.assertIn("digest", stmt["subject"][0])
36
+ # honest custom digest key, NOT sha256 (would mislead generic verifiers about an artifact hash)
37
+ self.assertIn(MODEL_COMMIT_DIGEST_KEY, stmt["subject"][0]["digest"])
38
+ self.assertNotIn("sha256", stmt["subject"][0]["digest"])
39
+ self.assertIn("salted commitment", stmt["predicate"]["subject_digest_note"])
40
+ self.assertEqual(stmt["predicate"]["receipt"]["root_b64"], "cm9vdA==")
41
+
42
+ def test_digest_is_commit_hex(self):
43
+ claim = _claim()
44
+ stmt = to_intoto_statement(claim)
45
+ expected_hex = claim["model_id_commit"].split(":", 1)[1]
46
+ self.assertEqual(stmt["subject"][0]["digest"][MODEL_COMMIT_DIGEST_KEY], expected_hex)
47
+
48
+ @unittest.skipIf(jsonschema is None, "jsonschema not installed (pip install proofbundle[dev])")
49
+ def test_validates_against_official_intoto_v1_schema(self):
50
+ schema = json.loads((ROOT / "schemas" / "in_toto_statement_v1.schema.json").read_text(encoding="utf-8"))
51
+ stmt = to_intoto_statement(_claim(), root_b64="cm9vdA==")
52
+ jsonschema.validate(instance=stmt, schema=schema) # raises if invalid
53
+
54
+ @unittest.skipIf(jsonschema is None, "jsonschema not installed")
55
+ def test_schema_rejects_missing_subject(self):
56
+ schema = json.loads((ROOT / "schemas" / "in_toto_statement_v1.schema.json").read_text(encoding="utf-8"))
57
+ bad = {"_type": "https://in-toto.io/Statement/v1", "predicateType": "x", "subject": []}
58
+ with self.assertRaises(jsonschema.ValidationError):
59
+ jsonschema.validate(instance=bad, schema=schema)
60
+
61
+
62
+ if __name__ == "__main__":
63
+ unittest.main()
@@ -0,0 +1,98 @@
1
+ """SD-JWT issuance (v0.5, RFC 9901) — own verifier + reference interop + red-tests. No-Fake."""
2
+ import base64
3
+ import json
4
+ import unittest
5
+ from pathlib import Path
6
+
7
+ from cryptography.hazmat.primitives.serialization import Encoding, PublicFormat
8
+
9
+ from proofbundle.emit import generate_signer
10
+ from proofbundle.evalclaim import build_eval_claim, issuer_fingerprint
11
+ from proofbundle.sdjwt import verify_sd_jwt
12
+ from proofbundle.sdjwt_issue import (
13
+ _make_disclosure,
14
+ check_binds_bundle,
15
+ issue_sd_jwt,
16
+ )
17
+
18
+ FX = Path(__file__).resolve().parent / "fixtures"
19
+ TS = "2026-07-01T12:00:00Z"
20
+ ROOT_B64 = "cm9vdA=="
21
+
22
+
23
+ def _claim(signer):
24
+ claim, _ = build_eval_claim(suite="safety", suite_version="v1", metric="accuracy", comparator=">=",
25
+ threshold="0.65", score="0.92", n=500, model_id="acme/model-x", dataset_id="acme/set",
26
+ issuer=issuer_fingerprint(signer), timestamp=TS, model_salt=b"0" * 16, dataset_salt=b"1" * 16)
27
+ return claim
28
+
29
+
30
+ def _raw_pub(signer):
31
+ return signer.public_key().public_bytes(Encoding.Raw, PublicFormat.Raw)
32
+
33
+
34
+ class TestSdJwtIssue(unittest.TestCase):
35
+ def test_own_verifier_accepts(self):
36
+ signer = generate_signer()
37
+ compact = issue_sd_jwt(_claim(signer), signer, root_b64=ROOT_B64, exact_score="0.92", ci95=["0.90", "0.94"])
38
+ res = verify_sd_jwt(compact, _raw_pub(signer))
39
+ self.assertTrue(res["structure_ok"], res)
40
+ self.assertTrue(res["sig_ok"], res)
41
+
42
+ def test_reference_verifier_accepts(self):
43
+ try:
44
+ from jwcrypto.jwk import JWK
45
+ from sd_jwt.verifier import SDJWTVerifier
46
+ except ImportError:
47
+ self.skipTest("sd-jwt-python not installed (dev extra)")
48
+ signer = generate_signer()
49
+ compact = issue_sd_jwt(_claim(signer), signer, root_b64=ROOT_B64, exact_score="0.92")
50
+ jwk = JWK(kty="OKP", crv="Ed25519", x=base64.urlsafe_b64encode(_raw_pub(signer)).rstrip(b"=").decode())
51
+ payload = SDJWTVerifier(compact, lambda *_a, **_k: jwk).get_verified_payload()
52
+ self.assertEqual(payload["passed"], True) # always-open
53
+ self.assertEqual(payload["exact_score"], "0.92") # selectively disclosed
54
+
55
+ def test_digest_byte_chain_vector(self):
56
+ # RFC 9901 §4.2.4.1: digest over the base64url-ENCODED disclosure string, not the JSON bytes.
57
+ v = json.loads((FX / "sdjwt_disclosure_vector.json").read_text(encoding="utf-8"))
58
+ d_b64, dig = _make_disclosure(v["name"], v["value"], v["salt_b64url"])
59
+ self.assertEqual(d_b64, v["disclosure_b64url"])
60
+ self.assertEqual(dig, v["expected_digest_b64url"])
61
+
62
+ def test_always_open_vs_selective(self):
63
+ signer = generate_signer()
64
+ compact = issue_sd_jwt(_claim(signer), signer, root_b64=ROOT_B64, exact_score="0.92")
65
+ jwt_payload = json.loads(base64.urlsafe_b64decode(
66
+ compact.split("~")[0].split(".")[1] + "==").decode("utf-8"))
67
+ # passed/threshold are plaintext; exact_score is NOT (only its digest is in _sd)
68
+ self.assertEqual(jwt_payload["passed"], True)
69
+ self.assertIn("threshold", jwt_payload)
70
+ self.assertNotIn("exact_score", jwt_payload)
71
+ self.assertIn("_sd", jwt_payload)
72
+
73
+ def test_binds_bundle(self):
74
+ signer = generate_signer()
75
+ claim = _claim(signer)
76
+ compact = issue_sd_jwt(claim, signer, root_b64=ROOT_B64, exact_score="0.92")
77
+ self.assertTrue(check_binds_bundle(compact, claim, ROOT_B64))
78
+
79
+ def test_divergence_red(self): # SD-JWT claims diverge from bundle → rejected
80
+ signer = generate_signer()
81
+ claim = _claim(signer)
82
+ compact = issue_sd_jwt(claim, signer, root_b64=ROOT_B64, exact_score="0.92")
83
+ diverged = dict(claim, passed=False) # bundle says passed=False, SD-JWT says True
84
+ self.assertFalse(check_binds_bundle(compact, diverged, ROOT_B64))
85
+ self.assertFalse(check_binds_bundle(compact, claim, "d3Jvbmc=")) # wrong root
86
+
87
+ def test_tamper_disclosure_red(self): # tampered disclosure → digest mismatch → own verifier fails
88
+ signer = generate_signer()
89
+ compact = issue_sd_jwt(_claim(signer), signer, root_b64=ROOT_B64, exact_score="0.92")
90
+ jwt, *disc = compact.rstrip("~").split("~")
91
+ tampered_d, _ = _make_disclosure("exact_score", "0.99", "AAAAAAAAAAAAAAAAAAAAAA") # not committed in _sd
92
+ tampered = "~".join([jwt, tampered_d]) + "~"
93
+ res = verify_sd_jwt(tampered, _raw_pub(signer))
94
+ self.assertFalse(res.get("structure_ok") and res.get("sig_ok") and "1 disclosure" in res.get("detail", ""))
95
+
96
+
97
+ if __name__ == "__main__":
98
+ unittest.main()
@@ -1,36 +0,0 @@
1
- """Adapter for UK AISI inspect_ai eval-log JSON (file-based, no framework import)."""
2
- from __future__ import annotations
3
-
4
- import json
5
- from pathlib import Path
6
- from typing import Optional
7
-
8
- from ..evalclaim import build_eval_claim
9
-
10
-
11
- def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, timestamp: str,
12
- model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
13
- """Read an inspect_ai eval-log JSON and build an eval claim.
14
-
15
- Expects: {"eval": {"task": ..., "model": ..., "dataset": {"name": ...}},
16
- "results": {"total_samples": n, "scores": [{"metrics": {metric: {"value": <number>}}}]}}.
17
- Returns (claim, salts).
18
- """
19
- data = json.loads(Path(path).read_text(encoding="utf-8"))
20
- ev = data.get("eval", {})
21
- scores = data.get("results", {}).get("scores", [])
22
- value = None
23
- for s in scores:
24
- m = s.get("metrics", {})
25
- if metric in m:
26
- value = m[metric].get("value")
27
- break
28
- if value is None:
29
- raise ValueError(f"metric {metric!r} not found in inspect_ai scores")
30
- n = int(data.get("results", {}).get("total_samples") or 0)
31
- return build_eval_claim(
32
- suite=str(ev.get("task", "inspect_ai")), suite_version=str(ev.get("task_version", "1")),
33
- metric=metric, comparator=comparator, threshold=threshold, score=repr(value), n=n,
34
- model_id=str(ev.get("model", "unknown")),
35
- dataset_id=str(ev.get("dataset", {}).get("name", ev.get("task", "unknown"))),
36
- issuer="", timestamp=timestamp, model_salt=model_salt, dataset_salt=dataset_salt)
@@ -1,32 +0,0 @@
1
- """Adapter for EleutherAI lm-evaluation-harness results.json (file-based, no framework import)."""
2
- from __future__ import annotations
3
-
4
- import json
5
- from pathlib import Path
6
- from typing import Optional
7
-
8
- from ..evalclaim import build_eval_claim
9
-
10
-
11
- def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
12
- timestamp: str, model_salt: Optional[bytes] = None,
13
- dataset_salt: Optional[bytes] = None):
14
- """Read an lm-evaluation-harness results.json and build an eval claim for `task`/`metric`.
15
-
16
- Expects the standard shape: {"results": {task: {metric: <number>, ...}, ...},
17
- "n-samples": {task: {"effective": n}}, "config"/"model_name": ...}. The score is read as a
18
- STRING to avoid float canonicalization issues. Returns (claim, salts).
19
- """
20
- data = json.loads(Path(path).read_text(encoding="utf-8"))
21
- res = data.get("results", {}).get(task)
22
- if res is None or metric not in res:
23
- raise ValueError(f"task/metric not found in results: {task}/{metric}")
24
- score = repr(res[metric]) if not isinstance(res[metric], str) else res[metric]
25
- n = int(data.get("n-samples", {}).get(task, {}).get("effective")
26
- or data.get("n-samples", {}).get(task, {}).get("original") or 0)
27
- model_id = str(data.get("model_name") or data.get("config", {}).get("model") or "unknown")
28
- return build_eval_claim(
29
- suite=task, suite_version=str(data.get("config", {}).get("model_source", "lm-eval")),
30
- metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
31
- model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
32
- model_salt=model_salt, dataset_salt=dataset_salt)
@@ -1,32 +0,0 @@
1
- """Adapters map real exported eval JSON to a valid claim (file-based, no framework import)."""
2
- import unittest
3
- from pathlib import Path
4
-
5
- from proofbundle.adapters import from_inspect_ai_log, from_lm_eval_results
6
-
7
- FX = Path(__file__).resolve().parent / "fixtures"
8
- TS = "2026-07-01T12:00:00Z"
9
-
10
-
11
- class TestAdapters(unittest.TestCase):
12
- def test_lm_eval(self):
13
- claim, salts = from_lm_eval_results(FX / "lm_eval_results.json", "hellaswag", "acc",
14
- comparator=">=", threshold="0.70", timestamp=TS,
15
- model_salt=b"0" * 16, dataset_salt=b"1" * 16)
16
- self.assertEqual(claim["suite"], "hellaswag")
17
- self.assertEqual(claim["threshold"], "0.70")
18
- self.assertTrue(claim["passed"]) # 0.7534 >= 0.70
19
- self.assertNotIn("acme/model-x", str(claim)) # id only as salted commitment
20
- self.assertEqual(claim["n"], 10042)
21
-
22
- def test_inspect_ai(self):
23
- claim, salts = from_inspect_ai_log(FX / "inspect_ai_log.json", "accuracy",
24
- comparator=">=", threshold="0.80", timestamp=TS,
25
- model_salt=b"0" * 16, dataset_salt=b"1" * 16)
26
- self.assertEqual(claim["suite"], "safety_refusal")
27
- self.assertTrue(claim["passed"]) # 0.92 >= 0.80
28
- self.assertEqual(claim["n"], 500)
29
-
30
-
31
- if __name__ == "__main__":
32
- unittest.main()
File without changes
File without changes