proofbundle 0.6.0__tar.gz → 0.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {proofbundle-0.6.0/src/proofbundle.egg-info → proofbundle-0.7.1}/PKG-INFO +13 -7
- {proofbundle-0.6.0 → proofbundle-0.7.1}/README.md +10 -4
- {proofbundle-0.6.0 → proofbundle-0.7.1}/pyproject.toml +5 -4
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/__init__.py +1 -1
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/adapters/inspect_ai.py +15 -1
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/bundle.py +46 -6
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/evalclaim.py +17 -6
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/intoto.py +17 -16
- {proofbundle-0.6.0 → proofbundle-0.7.1/src/proofbundle.egg-info}/PKG-INFO +13 -7
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle.egg-info/SOURCES.txt +1 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle.egg-info/requires.txt +4 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_adapters.py +2 -0
- proofbundle-0.7.1/tests/test_bundle_robustness.py +74 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/LICENSE +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/setup.cfg +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/adapters/__init__.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/adapters/lm_eval.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/cli.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/emit.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/errors.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/merkle.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/py.typed +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/sdjwt.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/sdjwt_issue.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle/signature.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle.egg-info/dependency_links.txt +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle.egg-info/entry_points.txt +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/src/proofbundle.egg-info/top_level.txt +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_bundle.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_cli.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_cli_eval.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_emit.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_eval_claim_schema.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_evalclaim.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_intoto.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_merkle.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_merkle_property.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_rekor_interop.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_rfc6962_external_vectors.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_schema.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_sdjwt_issue.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_sdjwt_reference.py +0 -0
- {proofbundle-0.6.0 → proofbundle-0.7.1}/tests/test_signature.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.7.1
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -28,7 +28,7 @@ Provides-Extra: eval
|
|
|
28
28
|
Requires-Dist: rfc8785>=0.1.4; extra == "eval"
|
|
29
29
|
Provides-Extra: adapters
|
|
30
30
|
Provides-Extra: inspect
|
|
31
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
|
|
31
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "inspect"
|
|
32
32
|
Provides-Extra: dev
|
|
33
33
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
34
34
|
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
@@ -38,7 +38,7 @@ Requires-Dist: build>=1; extra == "dev"
|
|
|
38
38
|
Requires-Dist: hypothesis>=6; extra == "dev"
|
|
39
39
|
Requires-Dist: rfc8785>=0.1.4; extra == "dev"
|
|
40
40
|
Requires-Dist: sd-jwt>=0.10; extra == "dev"
|
|
41
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
|
|
41
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "dev"
|
|
42
42
|
Dynamic: license-file
|
|
43
43
|
|
|
44
44
|
<div align="center">
|
|
@@ -62,12 +62,15 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
|
|
|
62
62
|
[](https://github.com/astral-sh/ruff)
|
|
63
63
|
[](https://slsa.dev)
|
|
64
64
|
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
65
|
+
<!-- DOI badge placeholder: enable Zenodo archiving for this repo, then add the Zenodo concept-DOI
|
|
66
|
+
badge here (and the DOI to CITATION.cff) once Zenodo assigns one on the next release. No DOI has
|
|
67
|
+
been assigned yet (no archived record exists at build time) — tracked in the human checklist. -->
|
|
65
68
|
|
|
66
69
|
</div>
|
|
67
70
|
|
|
68
71
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
69
72
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
70
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
73
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 72 tests.
|
|
71
74
|
|
|
72
75
|
## Contents
|
|
73
76
|
|
|
@@ -290,7 +293,8 @@ trust the number.
|
|
|
290
293
|
|
|
291
294
|
### A verification layer for trustworthy eval logs
|
|
292
295
|
|
|
293
|
-
The UK
|
|
296
|
+
The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
|
|
297
|
+
gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
294
298
|
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
295
299
|
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
296
300
|
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
@@ -322,8 +326,10 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
322
326
|
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
323
327
|
salted commitments, issuer binding.
|
|
324
328
|
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
325
|
-
- **v0.6
|
|
326
|
-
|
|
329
|
+
- **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
|
|
330
|
+
CITATION.cff, PEP 740 attestations documented.
|
|
331
|
+
- **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
|
|
332
|
+
(assigned on release), and a draft in-toto ML-eval predicate proposal.
|
|
327
333
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
328
334
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
329
335
|
|
|
@@ -19,12 +19,15 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
|
|
|
19
19
|
[](https://github.com/astral-sh/ruff)
|
|
20
20
|
[](https://slsa.dev)
|
|
21
21
|
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
22
|
+
<!-- DOI badge placeholder: enable Zenodo archiving for this repo, then add the Zenodo concept-DOI
|
|
23
|
+
badge here (and the DOI to CITATION.cff) once Zenodo assigns one on the next release. No DOI has
|
|
24
|
+
been assigned yet (no archived record exists at build time) — tracked in the human checklist. -->
|
|
22
25
|
|
|
23
26
|
</div>
|
|
24
27
|
|
|
25
28
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
26
29
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
27
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
30
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 72 tests.
|
|
28
31
|
|
|
29
32
|
## Contents
|
|
30
33
|
|
|
@@ -247,7 +250,8 @@ trust the number.
|
|
|
247
250
|
|
|
248
251
|
### A verification layer for trustworthy eval logs
|
|
249
252
|
|
|
250
|
-
The UK
|
|
253
|
+
The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
|
|
254
|
+
gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
251
255
|
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
252
256
|
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
253
257
|
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
@@ -279,8 +283,10 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
279
283
|
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
280
284
|
salted commitments, issuer binding.
|
|
281
285
|
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
282
|
-
- **v0.6
|
|
283
|
-
|
|
286
|
+
- **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
|
|
287
|
+
CITATION.cff, PEP 740 attestations documented.
|
|
288
|
+
- **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
|
|
289
|
+
(assigned on release), and a draft in-toto ML-eval predicate proposal.
|
|
284
290
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
285
291
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
286
292
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "proofbundle"
|
|
7
|
-
version = "0.6.0"
|
|
7
|
+
version = "0.7.1"
|
|
8
8
|
description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -47,10 +47,11 @@ eval = ["rfc8785>=0.1.4"]
|
|
|
47
47
|
adapters = []
|
|
48
48
|
# The inspect_ai adapter uses the STABLE read_eval_log API (lazy import). Pinned with an UPPER bound:
|
|
49
49
|
# the .eval format + pydantic schema change between versions (inspect_ai issue 834), and the fixture
|
|
50
|
-
# test is bound to this range.
|
|
51
|
-
|
|
50
|
+
# test is bound to this range. inspect_ai requires Python >= 3.10, so the marker gates it out on 3.9
|
|
51
|
+
# (base + [eval]/[sdjwt] still work on 3.9; the inspect adapter test skips there). Fixes the red 3.9 CI.
|
|
52
|
+
inspect = ['inspect_ai>=0.3.100,<0.4; python_version >= "3.10"']
|
|
52
53
|
dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6",
|
|
53
|
-
"rfc8785>=0.1.4", "sd-jwt>=0.10",
|
|
54
|
+
"rfc8785>=0.1.4", "sd-jwt>=0.10", 'inspect_ai>=0.3.100,<0.4; python_version >= "3.10"']
|
|
54
55
|
|
|
55
56
|
[project.urls]
|
|
56
57
|
Homepage = "https://b7n0de.com"
|
|
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
|
|
|
13
13
|
from .errors import Check, ProofBundleError, VerificationResult
|
|
14
14
|
from .merkle import verify_consistency, verify_inclusion
|
|
15
15
|
|
|
16
|
-
__version__ = "0.6.0"
|
|
16
|
+
__version__ = "0.7.1"
|
|
17
17
|
|
|
18
18
|
__all__ = [
|
|
19
19
|
"__version__",
|
|
@@ -57,9 +57,23 @@ def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, t
|
|
|
57
57
|
model_id = str(getattr(ev, "model", "unknown"))
|
|
58
58
|
dataset = getattr(ev, "dataset", None)
|
|
59
59
|
dataset_id = str(getattr(dataset, "name", None) or suite)
|
|
60
|
+
|
|
61
|
+
# Provenance parity with the lm-eval adapter: inspect_ai exposes the same run provenance for free.
|
|
62
|
+
provenance = {"harness": "inspect_ai"}
|
|
63
|
+
revision = getattr(ev, "revision", None)
|
|
64
|
+
commit = getattr(revision, "commit", None)
|
|
65
|
+
if commit:
|
|
66
|
+
provenance["git_hash"] = str(commit)
|
|
67
|
+
packages = getattr(ev, "packages", None) or {}
|
|
68
|
+
if isinstance(packages, dict) and packages.get("inspect_ai"):
|
|
69
|
+
provenance["harness_version"] = str(packages["inspect_ai"])
|
|
70
|
+
tv = getattr(ev, "task_version", None)
|
|
71
|
+
if tv is not None:
|
|
72
|
+
provenance["task_version"] = str(tv)
|
|
73
|
+
|
|
60
74
|
return build_eval_claim(
|
|
61
75
|
suite=suite, suite_version=str(getattr(ev, "task_version", "1")),
|
|
62
76
|
metric=metric, comparator=comparator, threshold=threshold, score=repr(value),
|
|
63
77
|
n=int(getattr(results, "total_samples", 0) or 0),
|
|
64
78
|
model_id=model_id, dataset_id=dataset_id, issuer="", timestamp=timestamp,
|
|
65
|
-
model_salt=model_salt, dataset_salt=dataset_salt)
|
|
79
|
+
provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
|
|
@@ -12,7 +12,11 @@ checks, fully offline and without any running log server:
|
|
|
12
12
|
The verifier treats ``payload`` as opaque bytes: it proves *that these exact
|
|
13
13
|
bytes were signed and anchored*, not what they mean. That keeps v0.1 small and
|
|
14
14
|
correct. Turning a reproducible eval run into such a payload is the job of the
|
|
15
|
-
emitter (see
|
|
15
|
+
eval-receipt emitter (see :mod:`proofbundle.evalclaim`, since v0.4).
|
|
16
|
+
|
|
17
|
+
Malformed input (wrong types, missing or unknown fields) is rejected with a
|
|
18
|
+
``BundleFormatError`` — never a raw traceback — so a caller gets the documented
|
|
19
|
+
malformed exit code, not a crash.
|
|
16
20
|
"""
|
|
17
21
|
|
|
18
22
|
from __future__ import annotations
|
|
@@ -30,6 +34,13 @@ __all__ = ["SCHEMA", "verify_bundle", "load_bundle"]
|
|
|
30
34
|
|
|
31
35
|
SCHEMA = "proofbundle/v0.1"
|
|
32
36
|
|
|
37
|
+
# Allowed keys per object — SPEC.md §3: a verifier MUST reject unknown fields (schema is
|
|
38
|
+
# additionalProperties: false). Enforced here so the code matches its own normative spec.
|
|
39
|
+
_TOP_KEYS = {"schema", "payload_b64", "signature", "merkle", "sd_jwt_vc"}
|
|
40
|
+
_SIG_KEYS = {"alg", "public_key_b64", "sig_b64"}
|
|
41
|
+
_MERKLE_KEYS = {"hash_alg", "leaf_index", "tree_size", "inclusion_proof_b64", "root_b64"}
|
|
42
|
+
_SD_KEYS = {"compact", "issuer_public_key_b64"}
|
|
43
|
+
|
|
33
44
|
|
|
34
45
|
def _b64d(value: str, field: str) -> bytes:
|
|
35
46
|
try:
|
|
@@ -44,6 +55,27 @@ def _require(obj: dict, key: str, field: str):
|
|
|
44
55
|
return obj[key]
|
|
45
56
|
|
|
46
57
|
|
|
58
|
+
def _require_dict(obj, field: str) -> dict:
|
|
59
|
+
"""The value must be a JSON object — a string/list/number is malformed, not a crash."""
|
|
60
|
+
if not isinstance(obj, dict):
|
|
61
|
+
raise BundleFormatError(f"field {field} must be a JSON object")
|
|
62
|
+
return obj
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _require_int(obj: dict, key: str, field: str) -> int:
|
|
66
|
+
"""The value must be a JSON integer — reject floats (SPEC §2) and non-numeric strings/None."""
|
|
67
|
+
val = _require(obj, key, field)
|
|
68
|
+
if isinstance(val, bool) or not isinstance(val, int): # bool is an int subclass; a float/str/None is not
|
|
69
|
+
raise BundleFormatError(f"field {field} must be an integer, got {type(val).__name__}")
|
|
70
|
+
return val
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _reject_unknown(obj: dict, allowed: set, field: str) -> None:
|
|
74
|
+
extra = set(obj) - allowed
|
|
75
|
+
if extra:
|
|
76
|
+
raise BundleFormatError(f"unknown field(s) in {field}: {sorted(extra)}")
|
|
77
|
+
|
|
78
|
+
|
|
47
79
|
def load_bundle(path: str) -> dict:
|
|
48
80
|
"""Read and JSON-parse a bundle file."""
|
|
49
81
|
with open(path, "r", encoding="utf-8") as handle:
|
|
@@ -60,12 +92,14 @@ def verify_bundle(bundle: Union[dict, str]) -> VerificationResult:
|
|
|
60
92
|
schema = bundle.get("schema")
|
|
61
93
|
if schema != SCHEMA:
|
|
62
94
|
raise UnsupportedError(f"unsupported schema {schema!r}, expected {SCHEMA!r}")
|
|
95
|
+
_reject_unknown(bundle, _TOP_KEYS, "bundle")
|
|
63
96
|
|
|
64
97
|
result = VerificationResult()
|
|
65
98
|
payload = _b64d(_require(bundle, "payload_b64", "payload_b64"), "payload_b64")
|
|
66
99
|
|
|
67
100
|
# 1. signature over the payload
|
|
68
|
-
sig = _require(bundle, "signature", "signature")
|
|
101
|
+
sig = _require_dict(_require(bundle, "signature", "signature"), "signature")
|
|
102
|
+
_reject_unknown(sig, _SIG_KEYS, "signature")
|
|
69
103
|
alg = sig.get("alg")
|
|
70
104
|
if alg != "ed25519":
|
|
71
105
|
raise UnsupportedError(f"signature alg {alg!r} not supported in v0.1")
|
|
@@ -75,13 +109,17 @@ def verify_bundle(bundle: Union[dict, str]) -> VerificationResult:
|
|
|
75
109
|
result.add("ed25519-signature", sig_ok, "payload signed by stated key" if sig_ok else "invalid signature")
|
|
76
110
|
|
|
77
111
|
# 2. merkle inclusion of the payload
|
|
78
|
-
mk = _require(bundle, "merkle", "merkle")
|
|
112
|
+
mk = _require_dict(_require(bundle, "merkle", "merkle"), "merkle")
|
|
113
|
+
_reject_unknown(mk, _MERKLE_KEYS, "merkle")
|
|
79
114
|
hash_alg = mk.get("hash_alg", "sha256-rfc6962")
|
|
80
115
|
if hash_alg != "sha256-rfc6962":
|
|
81
116
|
raise UnsupportedError(f"merkle hash_alg {hash_alg!r} not supported in v0.1")
|
|
82
|
-
leaf_index =
|
|
83
|
-
tree_size =
|
|
84
|
-
|
|
117
|
+
leaf_index = _require_int(mk, "leaf_index", "merkle.leaf_index")
|
|
118
|
+
tree_size = _require_int(mk, "tree_size", "merkle.tree_size")
|
|
119
|
+
proof_list = _require(mk, "inclusion_proof_b64", "merkle.inclusion_proof_b64") # required per SPEC §5
|
|
120
|
+
if not isinstance(proof_list, list):
|
|
121
|
+
raise BundleFormatError("field merkle.inclusion_proof_b64 must be a list")
|
|
122
|
+
proof = [_b64d(p, "merkle.inclusion_proof_b64[]") for p in proof_list]
|
|
85
123
|
root = _b64d(_require(mk, "root_b64", "merkle.root_b64"), "merkle.root_b64")
|
|
86
124
|
incl_ok = merkle.verify_inclusion(payload, leaf_index, tree_size, proof, root)
|
|
87
125
|
result.add(
|
|
@@ -93,6 +131,8 @@ def verify_bundle(bundle: Union[dict, str]) -> VerificationResult:
|
|
|
93
131
|
# 3. optional SD-JWT selective disclosure credential
|
|
94
132
|
sd = bundle.get("sd_jwt_vc")
|
|
95
133
|
if sd is not None:
|
|
134
|
+
sd = _require_dict(sd, "sd_jwt_vc")
|
|
135
|
+
_reject_unknown(sd, _SD_KEYS, "sd_jwt_vc")
|
|
96
136
|
compact = _require(sd, "compact", "sd_jwt_vc.compact")
|
|
97
137
|
issuer_pub = None
|
|
98
138
|
if sd.get("issuer_public_key_b64"):
|
|
@@ -21,6 +21,7 @@ import base64
|
|
|
21
21
|
import hashlib
|
|
22
22
|
import json
|
|
23
23
|
import os
|
|
24
|
+
import re
|
|
24
25
|
import unicodedata
|
|
25
26
|
from typing import Optional, Sequence
|
|
26
27
|
|
|
@@ -34,6 +35,8 @@ EVAL_CLAIM_SCHEMA = "proofbundle/eval-claim/v0.1"
|
|
|
34
35
|
COMMIT_ALG = "sha256-salted-v1"
|
|
35
36
|
_COMPARATORS = {">=", ">", "<=", "<"}
|
|
36
37
|
_MAX_SAFE_INT = 2 ** 53 - 1
|
|
38
|
+
# The published eval-claim schema's decimal pattern for threshold/score (no exponent, no sign+, no spaces).
|
|
39
|
+
_DECIMAL_RE = re.compile(r"^-?[0-9]+(\.[0-9]+)?$")
|
|
37
40
|
# The exact key set of an eval claim; decode/validate reject anything else.
|
|
38
41
|
_REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
|
|
39
42
|
"passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
|
|
@@ -103,7 +106,12 @@ def canonicalize(claim: dict) -> bytes:
|
|
|
103
106
|
for the UTF-16 code-unit key sort + compact UTF-8 serialization.
|
|
104
107
|
"""
|
|
105
108
|
_reject_non_jcs(claim)
|
|
106
|
-
|
|
109
|
+
try:
|
|
110
|
+
import rfc8785 # noqa: PLC0415 — lazy: only the emit path pulls the JCS dependency
|
|
111
|
+
except ImportError as e:
|
|
112
|
+
raise EvalClaimError(
|
|
113
|
+
"emitting eval receipts needs an RFC 8785 canonicalizer — install with: "
|
|
114
|
+
"pip install \"proofbundle[eval]\"") from e
|
|
107
115
|
try:
|
|
108
116
|
return rfc8785.dumps(claim)
|
|
109
117
|
except (rfc8785.FloatDomainError, rfc8785.IntegerDomainError, rfc8785.CanonicalizationError) as e:
|
|
@@ -137,14 +145,17 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
|
|
|
137
145
|
"""
|
|
138
146
|
if comparator not in _COMPARATORS:
|
|
139
147
|
raise EvalClaimError(f"comparator must be one of {sorted(_COMPARATORS)}")
|
|
148
|
+
# threshold/score must match the PUBLISHED schema's decimal pattern exactly — reject "1e2",
|
|
149
|
+
# "Infinity", "+5", " 5 " etc. that Decimal() would accept but jsonschema rejects (schema-conformance).
|
|
140
150
|
for name, val in (("threshold", threshold), ("score", score)):
|
|
141
151
|
if not isinstance(val, str):
|
|
142
152
|
raise EvalClaimError(f"{name} must be a decimal STRING, not {type(val).__name__}")
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
153
|
+
if not _DECIMAL_RE.match(val):
|
|
154
|
+
raise EvalClaimError(f"{name} must be a plain decimal string (^-?[0-9]+(\\.[0-9]+)?$), got {val!r}")
|
|
155
|
+
if not isinstance(n, int) or isinstance(n, bool) or n < 0 or n > _MAX_SAFE_INT:
|
|
156
|
+
raise EvalClaimError(f"n must be a non-negative integer <= 2**53-1, got {n!r}")
|
|
157
|
+
from decimal import Decimal # noqa: PLC0415
|
|
158
|
+
s, t = Decimal(score), Decimal(threshold)
|
|
148
159
|
passed = {">=": s >= t, ">": s > t, "<=": s <= t, "<": s < t}[comparator]
|
|
149
160
|
m_salt = model_salt if model_salt is not None else os.urandom(16)
|
|
150
161
|
d_salt = dataset_salt if dataset_salt is not None else os.urandom(16)
|
|
@@ -12,7 +12,7 @@ exists (deferred, see the roadmap).
|
|
|
12
12
|
"""
|
|
13
13
|
from __future__ import annotations
|
|
14
14
|
|
|
15
|
-
from typing import Optional
|
|
15
|
+
from typing import Any, Optional
|
|
16
16
|
|
|
17
17
|
STATEMENT_TYPE = "https://in-toto.io/Statement/v1"
|
|
18
18
|
PREDICATE_TYPE = "https://b7n0de.com/proofbundle/eval-receipt/v0.1"
|
|
@@ -37,6 +37,21 @@ def to_intoto_statement(claim: dict, *, root_b64: Optional[str] = None,
|
|
|
37
37
|
(e.g. {"name": "inspect_ai", "version": "0.3.217"}) is optional. The subject digest is the model
|
|
38
38
|
commitment under a custom key (never `sha256`).
|
|
39
39
|
"""
|
|
40
|
+
predicate: dict[str, Any] = {
|
|
41
|
+
"verifier": {"id": VERIFIER_ID},
|
|
42
|
+
"evaluatedAt": claim["timestamp"],
|
|
43
|
+
"suite": claim["suite"],
|
|
44
|
+
"claims": [{
|
|
45
|
+
"metric": claim["metric"], "comparator": claim["comparator"],
|
|
46
|
+
"threshold": claim["threshold"], "passed": claim["passed"],
|
|
47
|
+
}],
|
|
48
|
+
"datasetCommit": claim.get("dataset_id_commit"),
|
|
49
|
+
"subject_digest_note": _SUBJECT_DIGEST_NOTE,
|
|
50
|
+
}
|
|
51
|
+
if harness:
|
|
52
|
+
predicate["harness"] = harness
|
|
53
|
+
if root_b64:
|
|
54
|
+
predicate["receipt"] = {"schema": "proofbundle/v0.1", "root_b64": root_b64}
|
|
40
55
|
statement = {
|
|
41
56
|
"_type": STATEMENT_TYPE,
|
|
42
57
|
"subject": [{
|
|
@@ -44,20 +59,6 @@ def to_intoto_statement(claim: dict, *, root_b64: Optional[str] = None,
|
|
|
44
59
|
"digest": {MODEL_COMMIT_DIGEST_KEY: _commit_hex(claim["model_id_commit"])},
|
|
45
60
|
}],
|
|
46
61
|
"predicateType": PREDICATE_TYPE,
|
|
47
|
-
"predicate":
|
|
48
|
-
"verifier": {"id": VERIFIER_ID},
|
|
49
|
-
"evaluatedAt": claim["timestamp"],
|
|
50
|
-
"suite": claim["suite"],
|
|
51
|
-
"claims": [{
|
|
52
|
-
"metric": claim["metric"], "comparator": claim["comparator"],
|
|
53
|
-
"threshold": claim["threshold"], "passed": claim["passed"],
|
|
54
|
-
}],
|
|
55
|
-
"datasetCommit": claim.get("dataset_id_commit"),
|
|
56
|
-
"subject_digest_note": _SUBJECT_DIGEST_NOTE,
|
|
57
|
-
},
|
|
62
|
+
"predicate": predicate,
|
|
58
63
|
}
|
|
59
|
-
if harness:
|
|
60
|
-
statement["predicate"]["harness"] = harness
|
|
61
|
-
if root_b64:
|
|
62
|
-
statement["predicate"]["receipt"] = {"schema": "proofbundle/v0.1", "root_b64": root_b64}
|
|
63
64
|
return statement
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.7.1
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -28,7 +28,7 @@ Provides-Extra: eval
|
|
|
28
28
|
Requires-Dist: rfc8785>=0.1.4; extra == "eval"
|
|
29
29
|
Provides-Extra: adapters
|
|
30
30
|
Provides-Extra: inspect
|
|
31
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
|
|
31
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "inspect"
|
|
32
32
|
Provides-Extra: dev
|
|
33
33
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
34
34
|
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
@@ -38,7 +38,7 @@ Requires-Dist: build>=1; extra == "dev"
|
|
|
38
38
|
Requires-Dist: hypothesis>=6; extra == "dev"
|
|
39
39
|
Requires-Dist: rfc8785>=0.1.4; extra == "dev"
|
|
40
40
|
Requires-Dist: sd-jwt>=0.10; extra == "dev"
|
|
41
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
|
|
41
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "dev"
|
|
42
42
|
Dynamic: license-file
|
|
43
43
|
|
|
44
44
|
<div align="center">
|
|
@@ -62,12 +62,15 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
|
|
|
62
62
|
[](https://github.com/astral-sh/ruff)
|
|
63
63
|
[](https://slsa.dev)
|
|
64
64
|
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
65
|
+
<!-- DOI badge placeholder: enable Zenodo archiving for this repo, then add the Zenodo concept-DOI
|
|
66
|
+
badge here (and the DOI to CITATION.cff) once Zenodo assigns one on the next release. No DOI has
|
|
67
|
+
been assigned yet (no archived record exists at build time) — tracked in the human checklist. -->
|
|
65
68
|
|
|
66
69
|
</div>
|
|
67
70
|
|
|
68
71
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
69
72
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
70
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
73
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 72 tests.
|
|
71
74
|
|
|
72
75
|
## Contents
|
|
73
76
|
|
|
@@ -290,7 +293,8 @@ trust the number.
|
|
|
290
293
|
|
|
291
294
|
### A verification layer for trustworthy eval logs
|
|
292
295
|
|
|
293
|
-
The UK
|
|
296
|
+
The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
|
|
297
|
+
gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
294
298
|
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
295
299
|
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
296
300
|
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
@@ -322,8 +326,10 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
322
326
|
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
323
327
|
salted commitments, issuer binding.
|
|
324
328
|
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
325
|
-
- **v0.6
|
|
326
|
-
|
|
329
|
+
- **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
|
|
330
|
+
CITATION.cff, PEP 740 attestations documented.
|
|
331
|
+
- **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
|
|
332
|
+
(assigned on release), and a draft in-toto ML-eval predicate proposal.
|
|
327
333
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
328
334
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
329
335
|
|
|
@@ -11,12 +11,16 @@ build>=1
|
|
|
11
11
|
hypothesis>=6
|
|
12
12
|
rfc8785>=0.1.4
|
|
13
13
|
sd-jwt>=0.10
|
|
14
|
+
|
|
15
|
+
[dev:python_version >= "3.10"]
|
|
14
16
|
inspect_ai<0.4,>=0.3.100
|
|
15
17
|
|
|
16
18
|
[eval]
|
|
17
19
|
rfc8785>=0.1.4
|
|
18
20
|
|
|
19
21
|
[inspect]
|
|
22
|
+
|
|
23
|
+
[inspect:python_version >= "3.10"]
|
|
20
24
|
inspect_ai<0.4,>=0.3.100
|
|
21
25
|
|
|
22
26
|
[sdjwt]
|
|
@@ -39,6 +39,8 @@ class TestAdapters(unittest.TestCase):
|
|
|
39
39
|
self.assertEqual(claim["suite"], "safety_refusal_demo")
|
|
40
40
|
self.assertTrue(claim["passed"]) # accuracy 0.0 >= 0.00
|
|
41
41
|
self.assertNotIn("mockllm/model", str(claim)) # model id only as salted commitment
|
|
42
|
+
self.assertEqual(claim["provenance"]["harness"], "inspect_ai") # provenance parity with lm-eval
|
|
43
|
+
self.assertIn("harness_version", claim["provenance"])
|
|
42
44
|
|
|
43
45
|
def test_inspect_ai_missing_metric_clear_error(self):
|
|
44
46
|
from proofbundle.adapters.inspect_ai import InspectAdapterError
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Malformed-input robustness of verify_bundle + build_eval_claim (holistic-review findings, 0.7.1).
|
|
2
|
+
|
|
3
|
+
The verifier's contract is OK/FAILED/malformed — never a raw traceback. build_eval_claim must not emit a
|
|
4
|
+
receipt that fails its own published schema. One red-test per finding."""
|
|
5
|
+
import copy
|
|
6
|
+
import unittest
|
|
7
|
+
|
|
8
|
+
from proofbundle import verify_bundle
|
|
9
|
+
from proofbundle.emit import emit_bundle, generate_signer
|
|
10
|
+
from proofbundle.errors import BundleFormatError
|
|
11
|
+
from proofbundle.evalclaim import EvalClaimError, build_eval_claim
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _bundle():
|
|
15
|
+
return emit_bundle(b"payload", generate_signer())
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _mut(mut):
|
|
19
|
+
b = copy.deepcopy(_bundle())
|
|
20
|
+
mut(b)
|
|
21
|
+
return b
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TestBundleRobustness(unittest.TestCase):
|
|
25
|
+
def test_leaf_index_non_numeric_raises_format_error(self): # D1
|
|
26
|
+
with self.assertRaises(BundleFormatError):
|
|
27
|
+
verify_bundle(_mut(lambda b: b["merkle"].__setitem__("leaf_index", "abc")))
|
|
28
|
+
|
|
29
|
+
def test_signature_non_object_raises_format_error(self): # D2
|
|
30
|
+
with self.assertRaises(BundleFormatError):
|
|
31
|
+
verify_bundle(_mut(lambda b: b.__setitem__("signature", "notadict")))
|
|
32
|
+
with self.assertRaises(BundleFormatError):
|
|
33
|
+
verify_bundle(_mut(lambda b: b.__setitem__("merkle", ["x"])))
|
|
34
|
+
|
|
35
|
+
def test_tree_size_float_rejected(self): # D3 (SPEC §2: integers only)
|
|
36
|
+
with self.assertRaises(BundleFormatError):
|
|
37
|
+
verify_bundle(_mut(lambda b: b["merkle"].__setitem__("tree_size", 1.5)))
|
|
38
|
+
|
|
39
|
+
def test_missing_inclusion_proof_rejected(self): # D4 (SPEC §5: required)
|
|
40
|
+
with self.assertRaises(BundleFormatError):
|
|
41
|
+
verify_bundle(_mut(lambda b: b["merkle"].pop("inclusion_proof_b64")))
|
|
42
|
+
|
|
43
|
+
def test_unknown_fields_rejected(self): # SPEC §3: additionalProperties false
|
|
44
|
+
with self.assertRaises(BundleFormatError):
|
|
45
|
+
verify_bundle(_mut(lambda b: b.__setitem__("evil", "x")))
|
|
46
|
+
with self.assertRaises(BundleFormatError):
|
|
47
|
+
verify_bundle(_mut(lambda b: b["signature"].__setitem__("evil", "x")))
|
|
48
|
+
with self.assertRaises(BundleFormatError):
|
|
49
|
+
verify_bundle(_mut(lambda b: b["merkle"].__setitem__("evil", "x")))
|
|
50
|
+
|
|
51
|
+
def test_well_formed_still_ok(self): # no false positive
|
|
52
|
+
self.assertTrue(verify_bundle(_bundle()).ok)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class TestEvalClaimSchemaConformance(unittest.TestCase):
|
|
56
|
+
def _build(self, **kw):
|
|
57
|
+
base = dict(suite="s", suite_version="v1", metric="acc", comparator=">=", threshold="0.8",
|
|
58
|
+
score="0.9", n=1, model_id="m", dataset_id="d", issuer="",
|
|
59
|
+
timestamp="2026-07-01T12:00:00Z", model_salt=b"0" * 16, dataset_salt=b"1" * 16)
|
|
60
|
+
base.update(kw)
|
|
61
|
+
return build_eval_claim(**base)
|
|
62
|
+
|
|
63
|
+
def test_negative_n_rejected(self): # schema minimum 0
|
|
64
|
+
with self.assertRaises(EvalClaimError):
|
|
65
|
+
self._build(n=-5)
|
|
66
|
+
|
|
67
|
+
def test_exponent_and_sign_threshold_rejected(self): # schema decimal pattern
|
|
68
|
+
for bad in ("1e2", "Infinity", "+5", " 0.9 "):
|
|
69
|
+
with self.assertRaises(EvalClaimError):
|
|
70
|
+
self._build(threshold=bad)
|
|
71
|
+
|
|
72
|
+
def test_plain_decimal_accepted(self):
|
|
73
|
+
claim, _ = self._build(threshold="0.80", score="0.92")
|
|
74
|
+
self.assertTrue(claim["passed"])
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|