proofbundle 0.5.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {proofbundle-0.5.0/src/proofbundle.egg-info → proofbundle-0.7.0}/PKG-INFO +28 -12
- {proofbundle-0.5.0 → proofbundle-0.7.0}/README.md +27 -11
- {proofbundle-0.5.0 → proofbundle-0.7.0}/pyproject.toml +1 -1
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/__init__.py +1 -1
- proofbundle-0.7.0/src/proofbundle/adapters/lm_eval.py +76 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/evalclaim.py +4 -2
- {proofbundle-0.5.0 → proofbundle-0.7.0/src/proofbundle.egg-info}/PKG-INFO +28 -12
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_adapters.py +16 -8
- proofbundle-0.5.0/src/proofbundle/adapters/lm_eval.py +0 -32
- {proofbundle-0.5.0 → proofbundle-0.7.0}/LICENSE +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/setup.cfg +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/adapters/__init__.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/adapters/inspect_ai.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/bundle.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/cli.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/emit.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/errors.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/intoto.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/merkle.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/py.typed +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/sdjwt.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/sdjwt_issue.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/signature.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/SOURCES.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/requires.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/top_level.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_bundle.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_cli.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_cli_eval.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_emit.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_eval_claim_schema.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_evalclaim.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_intoto.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_merkle.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_merkle_property.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_rekor_interop.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_rfc6962_external_vectors.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_schema.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_sdjwt_issue.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_sdjwt_reference.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_signature.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -55,17 +55,21 @@ signed and anchored in a tamper-evident log — and optionally carries a
|
|
|
55
55
|
selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
|
|
56
56
|
|
|
57
57
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
58
|
-
[](https://pypi.org/project/proofbundle/)
|
|
59
|
-
[](https://pypi.org/project/proofbundle/)
|
|
58
|
+
[](https://pypi.org/project/proofbundle/)
|
|
59
|
+
[](https://pypi.org/project/proofbundle/)
|
|
60
|
+
[](https://pepy.tech/project/proofbundle)
|
|
60
61
|
[](LICENSE)
|
|
61
62
|
[](https://github.com/astral-sh/ruff)
|
|
62
63
|
[](https://slsa.dev)
|
|
64
|
+
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
65
|
+
<!-- DOI badge placeholder: Zenodo is linked and archives each release. Add the Zenodo concept-DOI badge
|
|
66
|
+
here (and the DOI to CITATION.cff) once Zenodo assigns it — it does not exist at build time. -->
|
|
63
67
|
|
|
64
68
|
</div>
|
|
65
69
|
|
|
66
70
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
67
71
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
68
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
72
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
69
73
|
|
|
70
74
|
## Contents
|
|
71
75
|
|
|
@@ -286,13 +290,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
|
|
|
286
290
|
itself is correct. Those are human judgements; what it removes is the need to simply
|
|
287
291
|
trust the number.
|
|
288
292
|
|
|
289
|
-
###
|
|
293
|
+
### A verification layer for trustworthy eval logs
|
|
290
294
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
295
|
+
The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
296
|
+
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
297
|
+
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
298
|
+
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
299
|
+
See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
|
|
300
|
+
|
|
301
|
+
- **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
302
|
+
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
303
|
+
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
304
|
+
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
305
|
+
genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
|
|
296
306
|
- **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
|
|
297
307
|
emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
|
|
298
308
|
digest is an *honest salted commitment* under a custom key, never `sha256` (see
|
|
@@ -303,6 +313,9 @@ trust the number.
|
|
|
303
313
|
bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
|
|
304
314
|
by proofbundle's own verifier **and** the `sd-jwt-python` reference.
|
|
305
315
|
|
|
316
|
+
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
317
|
+
attestation — see [SECURITY.md](SECURITY.md).
|
|
318
|
+
|
|
306
319
|
## Roadmap
|
|
307
320
|
|
|
308
321
|
- **v0.1** — the offline verifier plus a real example bundle.
|
|
@@ -310,8 +323,11 @@ trust the number.
|
|
|
310
323
|
- **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
|
|
311
324
|
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
312
325
|
salted commitments, issuer binding.
|
|
313
|
-
- **v0.5
|
|
314
|
-
|
|
326
|
+
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
327
|
+
- **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
|
|
328
|
+
CITATION.cff, PEP 740 attestations documented.
|
|
329
|
+
- **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
|
|
330
|
+
(assigned on release), and a draft in-toto ML-eval predicate proposal.
|
|
315
331
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
316
332
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
317
333
|
|
|
@@ -12,17 +12,21 @@ signed and anchored in a tamper-evident log — and optionally carries a
|
|
|
12
12
|
selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
|
|
13
13
|
|
|
14
14
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
15
|
-
[](https://pypi.org/project/proofbundle/)
|
|
16
|
-
[](https://pypi.org/project/proofbundle/)
|
|
15
|
+
[](https://pypi.org/project/proofbundle/)
|
|
16
|
+
[](https://pypi.org/project/proofbundle/)
|
|
17
|
+
[](https://pepy.tech/project/proofbundle)
|
|
17
18
|
[](LICENSE)
|
|
18
19
|
[](https://github.com/astral-sh/ruff)
|
|
19
20
|
[](https://slsa.dev)
|
|
21
|
+
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
22
|
+
<!-- DOI badge placeholder: Zenodo is linked and archives each release. Add the Zenodo concept-DOI badge
|
|
23
|
+
here (and the DOI to CITATION.cff) once Zenodo assigns it — it does not exist at build time. -->
|
|
20
24
|
|
|
21
25
|
</div>
|
|
22
26
|
|
|
23
27
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
24
28
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
25
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
29
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
26
30
|
|
|
27
31
|
## Contents
|
|
28
32
|
|
|
@@ -243,13 +247,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
|
|
|
243
247
|
itself is correct. Those are human judgements; what it removes is the need to simply
|
|
244
248
|
trust the number.
|
|
245
249
|
|
|
246
|
-
###
|
|
250
|
+
### A verification layer for trustworthy eval logs
|
|
247
251
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
252
|
+
The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
253
|
+
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
254
|
+
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
255
|
+
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
256
|
+
See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
|
|
257
|
+
|
|
258
|
+
- **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
259
|
+
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
260
|
+
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
261
|
+
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
262
|
+
genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
|
|
253
263
|
- **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
|
|
254
264
|
emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
|
|
255
265
|
digest is an *honest salted commitment* under a custom key, never `sha256` (see
|
|
@@ -260,6 +270,9 @@ trust the number.
|
|
|
260
270
|
bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
|
|
261
271
|
by proofbundle's own verifier **and** the `sd-jwt-python` reference.
|
|
262
272
|
|
|
273
|
+
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
274
|
+
attestation — see [SECURITY.md](SECURITY.md).
|
|
275
|
+
|
|
263
276
|
## Roadmap
|
|
264
277
|
|
|
265
278
|
- **v0.1** — the offline verifier plus a real example bundle.
|
|
@@ -267,8 +280,11 @@ trust the number.
|
|
|
267
280
|
- **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
|
|
268
281
|
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
269
282
|
salted commitments, issuer binding.
|
|
270
|
-
- **v0.5
|
|
271
|
-
|
|
283
|
+
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
284
|
+
- **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
|
|
285
|
+
CITATION.cff, PEP 740 attestations documented.
|
|
286
|
+
- **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
|
|
287
|
+
(assigned on release), and a draft in-toto ML-eval predicate proposal.
|
|
272
288
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
273
289
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
274
290
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "proofbundle"
|
|
7
|
-
version = "0.5.0"
|
|
7
|
+
version = "0.7.0"
|
|
8
8
|
description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
|
|
|
13
13
|
from .errors import Check, ProofBundleError, VerificationResult
|
|
14
14
|
from .merkle import verify_consistency, verify_inclusion
|
|
15
15
|
|
|
16
|
-
__version__ = "0.5.0"
|
|
16
|
+
__version__ = "0.7.0"
|
|
17
17
|
|
|
18
18
|
__all__ = [
|
|
19
19
|
"__version__",
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Adapter for EleutherAI lm-evaluation-harness results_*.json (file-based, NO lm_eval import).
|
|
2
|
+
|
|
3
|
+
Parses the exported result JSON only — no runtime dependency on lm_eval, no runner rebuild.
|
|
4
|
+
|
|
5
|
+
Real 0.4.x format (validated against a genuine harness run, see tests/fixtures/lm_eval_arc_easy_real.json):
|
|
6
|
+
the metric keys carry a *filter suffix*, e.g. `"acc,none"`, and the standard error is a **sibling** key
|
|
7
|
+
`"acc_stderr,none"` (not nested). So a caller asking for metric `"acc"` is matched against `"acc,none"`
|
|
8
|
+
(or `"acc,<filter>"`). Provenance (git_hash, harness/task version, n-shot) is copied into the receipt's
|
|
9
|
+
optional `provenance` field so a verifier can trace exactly which run produced it.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from ..evalclaim import build_eval_claim
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _find_metric(res: dict, metric: str):
|
|
21
|
+
"""Return (value, stderr, matched_key) for `metric`, handling the `metric,<filter>` suffix format.
|
|
22
|
+
|
|
23
|
+
Prefers an exact `metric` key, then `metric,none`, then any `metric,<filter>`. The stderr sibling is
|
|
24
|
+
`metric_stderr,<same filter>`."""
|
|
25
|
+
if metric in res: # bare key (older/simple exports)
|
|
26
|
+
stderr = res.get(f"{metric}_stderr")
|
|
27
|
+
return res[metric], stderr, metric
|
|
28
|
+
if f"{metric},none" in res:
|
|
29
|
+
return res[f"{metric},none"], res.get(f"{metric}_stderr,none"), f"{metric},none"
|
|
30
|
+
for key in res: # any filter, e.g. metric,custom-filter
|
|
31
|
+
if key == metric or (key.startswith(f"{metric},") and not key.startswith(f"{metric}_stderr")):
|
|
32
|
+
flt = key.split(",", 1)[1] if "," in key else "none"
|
|
33
|
+
return res[key], res.get(f"{metric}_stderr,{flt}"), key
|
|
34
|
+
return None, None, None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
                         timestamp: str, model_salt: Optional[bytes] = None,
                         dataset_salt: Optional[bytes] = None):
    """Read an lm-evaluation-harness results_*.json and build an eval claim for `task`/`metric`.

    `metric` is the bare name (e.g. "acc"); the real key may be "acc,none". The score is read as a STRING
    to avoid float canonicalization issues.

    Args:
        path: location of the exported results JSON file (read as UTF-8).
        task: task name; used as the key into the export's per-task sections and as the claim's
            `suite` and `dataset_id`.
        metric: metric name to look up in `results[task]`.
        comparator, threshold, timestamp: forwarded unchanged to `build_eval_claim`.
        model_salt, dataset_salt: optional salts, forwarded unchanged to `build_eval_claim`.

    Returns:
        Whatever `build_eval_claim` returns, i.e. `(claim, salts)`.

    Raises:
        ValueError: if `task` is missing from `results`, or no key for `metric` is found.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    res = data.get("results", {}).get(task)
    if res is None:
        raise ValueError(f"task not found in results: {task!r}")
    value, stderr, matched = _find_metric(res, metric)
    if value is None:
        # Prefer the "metric,<filter>" keys in the hint; fall back to every key so that bare-key
        # exports do not produce an empty, unhelpful list.
        available = sorted(k for k in res if "," in k) or sorted(res)
        raise ValueError(f"metric {metric!r} not found in results[{task!r}] "
                         f"(available: {available})")
    score = value if isinstance(value, str) else repr(value)

    # Sample count: first truthy of n-samples.effective, n-samples.original, results.sample_len, else 0.
    n_samples = data.get("n-samples", {}).get(task, {})
    n = int(n_samples.get("effective") or n_samples.get("original") or res.get("sample_len") or 0)
    cfg = data.get("config", {})
    model_id = str(cfg.get("model_name") or cfg.get("model") or "unknown")
    if cfg.get("model_args"):
        model_id = f"{model_id}::{cfg['model_args']}"  # include args so the commitment pins the exact model

    # Read the task version once; an explicit null is treated as "absent" for BOTH the provenance
    # entry and suite_version (previously suite_version could become the literal string "None").
    task_version = data.get("versions", {}).get(task)
    n_shot = data.get("n-shot", {}).get(task)

    provenance = {"harness": "lm-evaluation-harness", "matched_metric_key": matched}
    if data.get("git_hash"):
        provenance["git_hash"] = str(data["git_hash"])
    if task_version is not None:
        provenance["task_version"] = str(task_version)
    if n_shot is not None:
        provenance["n_shot"] = str(n_shot)
    if stderr is not None:
        provenance["stderr"] = repr(stderr) if not isinstance(stderr, str) else stderr

    suite_version = "lm-eval" if task_version is None else str(task_version)
    return build_eval_claim(
        suite=task, suite_version=suite_version,
        metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
        model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
        provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
|
|
@@ -37,7 +37,7 @@ _MAX_SAFE_INT = 2 ** 53 - 1
|
|
|
37
37
|
# The exact key set of an eval claim; decode/validate reject anything else.
|
|
38
38
|
_REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
|
|
39
39
|
"passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
|
|
40
|
-
_OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256"}
|
|
40
|
+
_OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256", "provenance"}
|
|
41
41
|
|
|
42
42
|
__all__ = [
|
|
43
43
|
"EVAL_CLAIM_SCHEMA", "COMMIT_ALG", "canonicalize", "build_eval_claim",
|
|
@@ -126,7 +126,7 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
|
|
|
126
126
|
threshold: str, score: str, n: int, model_id: str, dataset_id: str,
|
|
127
127
|
issuer: str, timestamp: str, context_binding: Optional[str] = None,
|
|
128
128
|
ci95: Optional[Sequence[str]] = None, multiple_testing: Optional[str] = None,
|
|
129
|
-
prereg_sha256: Optional[str] = None,
|
|
129
|
+
prereg_sha256: Optional[str] = None, provenance: Optional[dict] = None,
|
|
130
130
|
model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
|
|
131
131
|
"""Build a valid eval claim from raw values. Computes `passed` ITSELF from the comparator
|
|
132
132
|
(never trusts the caller), creates salted commitments, and returns (claim, salts) with the
|
|
@@ -163,6 +163,8 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
|
|
|
163
163
|
claim["multiple_testing"] = multiple_testing
|
|
164
164
|
if prereg_sha256 is not None:
|
|
165
165
|
claim["prereg_sha256"] = prereg_sha256
|
|
166
|
+
if provenance is not None:
|
|
167
|
+
claim["provenance"] = provenance
|
|
166
168
|
_reject_non_jcs(claim)
|
|
167
169
|
return claim, {"model_salt": m_salt, "dataset_salt": d_salt}
|
|
168
170
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -55,17 +55,21 @@ signed and anchored in a tamper-evident log — and optionally carries a
|
|
|
55
55
|
selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
|
|
56
56
|
|
|
57
57
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
58
|
-
[](https://pypi.org/project/proofbundle/)
|
|
59
|
-
[](https://pypi.org/project/proofbundle/)
|
|
58
|
+
[](https://pypi.org/project/proofbundle/)
|
|
59
|
+
[](https://pypi.org/project/proofbundle/)
|
|
60
|
+
[](https://pepy.tech/project/proofbundle)
|
|
60
61
|
[](LICENSE)
|
|
61
62
|
[](https://github.com/astral-sh/ruff)
|
|
62
63
|
[](https://slsa.dev)
|
|
64
|
+
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
65
|
+
<!-- DOI badge placeholder: Zenodo is linked and archives each release. Add the Zenodo concept-DOI badge
|
|
66
|
+
here (and the DOI to CITATION.cff) once Zenodo assigns it — it does not exist at build time. -->
|
|
63
67
|
|
|
64
68
|
</div>
|
|
65
69
|
|
|
66
70
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
67
71
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
68
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
72
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
69
73
|
|
|
70
74
|
## Contents
|
|
71
75
|
|
|
@@ -286,13 +290,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
|
|
|
286
290
|
itself is correct. Those are human judgements; what it removes is the need to simply
|
|
287
291
|
trust the number.
|
|
288
292
|
|
|
289
|
-
###
|
|
293
|
+
### A verification layer for trustworthy eval logs
|
|
290
294
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
295
|
+
The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
296
|
+
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
297
|
+
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
298
|
+
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
299
|
+
See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
|
|
300
|
+
|
|
301
|
+
- **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
302
|
+
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
303
|
+
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
304
|
+
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
305
|
+
genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
|
|
296
306
|
- **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
|
|
297
307
|
emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
|
|
298
308
|
digest is an *honest salted commitment* under a custom key, never `sha256` (see
|
|
@@ -303,6 +313,9 @@ trust the number.
|
|
|
303
313
|
bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
|
|
304
314
|
by proofbundle's own verifier **and** the `sd-jwt-python` reference.
|
|
305
315
|
|
|
316
|
+
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
317
|
+
attestation — see [SECURITY.md](SECURITY.md).
|
|
318
|
+
|
|
306
319
|
## Roadmap
|
|
307
320
|
|
|
308
321
|
- **v0.1** — the offline verifier plus a real example bundle.
|
|
@@ -310,8 +323,11 @@ trust the number.
|
|
|
310
323
|
- **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
|
|
311
324
|
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
312
325
|
salted commitments, issuer binding.
|
|
313
|
-
- **v0.5
|
|
314
|
-
|
|
326
|
+
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
327
|
+
- **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
|
|
328
|
+
CITATION.cff, PEP 740 attestations documented.
|
|
329
|
+
- **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
|
|
330
|
+
(assigned on release), and a draft in-toto ML-eval predicate proposal.
|
|
315
331
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
316
332
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
317
333
|
|
|
@@ -9,15 +9,23 @@ TS = "2026-07-01T12:00:00Z"
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestAdapters(unittest.TestCase):
|
|
12
|
-
def
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
def test_lm_eval_real_acc_none_format(self):
|
|
13
|
+
# REAL lm-evaluation-harness 0.4.12 export: metric key is "acc,none", stderr sibling "acc_stderr,none".
|
|
14
|
+
claim, salts = from_lm_eval_results(FX / "lm_eval_arc_easy_real.json", "arc_easy", "acc",
|
|
15
|
+
comparator=">=", threshold="0.30", timestamp=TS,
|
|
15
16
|
model_salt=b"0" * 16, dataset_salt=b"1" * 16)
|
|
16
|
-
self.assertEqual(claim["suite"], "
|
|
17
|
-
self.
|
|
18
|
-
self.
|
|
19
|
-
self.
|
|
20
|
-
self.assertEqual(claim["
|
|
17
|
+
self.assertEqual(claim["suite"], "arc_easy")
|
|
18
|
+
self.assertTrue(claim["passed"]) # acc 0.5 >= 0.30
|
|
19
|
+
self.assertEqual(claim["provenance"]["matched_metric_key"], "acc,none") # suffix handled
|
|
20
|
+
self.assertIn("git_hash", claim["provenance"]) # provenance captured
|
|
21
|
+
self.assertEqual(claim["provenance"]["n_shot"], "0")
|
|
22
|
+
self.assertIn("stderr", claim["provenance"]) # sibling stderr, not nested
|
|
23
|
+
|
|
24
|
+
def test_lm_eval_missing_metric_lists_available(self):
|
|
25
|
+
with self.assertRaises(ValueError):
|
|
26
|
+
from_lm_eval_results(FX / "lm_eval_arc_easy_real.json", "arc_easy", "nonexistent",
|
|
27
|
+
comparator=">=", threshold="0.5", timestamp=TS,
|
|
28
|
+
model_salt=b"0" * 16, dataset_salt=b"1" * 16)
|
|
21
29
|
|
|
22
30
|
def test_inspect_ai_stable_api(self):
|
|
23
31
|
# Real .eval log fixture, read via the stable inspect_ai.log.read_eval_log API (proofbundle[inspect]).
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
"""Adapter for EleutherAI lm-evaluation-harness results.json (file-based, no framework import)."""
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
import json
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Optional
|
|
7
|
-
|
|
8
|
-
from ..evalclaim import build_eval_claim
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
|
|
12
|
-
timestamp: str, model_salt: Optional[bytes] = None,
|
|
13
|
-
dataset_salt: Optional[bytes] = None):
|
|
14
|
-
"""Read an lm-evaluation-harness results.json and build an eval claim for `task`/`metric`.
|
|
15
|
-
|
|
16
|
-
Expects the standard shape: {"results": {task: {metric: <number>, ...}, ...},
|
|
17
|
-
"n-samples": {task: {"effective": n}}, "config"/"model_name": ...}. The score is read as a
|
|
18
|
-
STRING to avoid float canonicalization issues. Returns (claim, salts).
|
|
19
|
-
"""
|
|
20
|
-
data = json.loads(Path(path).read_text(encoding="utf-8"))
|
|
21
|
-
res = data.get("results", {}).get(task)
|
|
22
|
-
if res is None or metric not in res:
|
|
23
|
-
raise ValueError(f"task/metric not found in results: {task}/{metric}")
|
|
24
|
-
score = repr(res[metric]) if not isinstance(res[metric], str) else res[metric]
|
|
25
|
-
n = int(data.get("n-samples", {}).get(task, {}).get("effective")
|
|
26
|
-
or data.get("n-samples", {}).get(task, {}).get("original") or 0)
|
|
27
|
-
model_id = str(data.get("model_name") or data.get("config", {}).get("model") or "unknown")
|
|
28
|
-
return build_eval_claim(
|
|
29
|
-
suite=task, suite_version=str(data.get("config", {}).get("model_source", "lm-eval")),
|
|
30
|
-
metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
|
|
31
|
-
model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
|
|
32
|
-
model_salt=model_salt, dataset_salt=dataset_salt)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|