proofbundle 0.5.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {proofbundle-0.5.0/src/proofbundle.egg-info → proofbundle-0.6.0}/PKG-INFO +24 -12
- {proofbundle-0.5.0 → proofbundle-0.6.0}/README.md +23 -11
- {proofbundle-0.5.0 → proofbundle-0.6.0}/pyproject.toml +1 -1
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/__init__.py +1 -1
- proofbundle-0.6.0/src/proofbundle/adapters/lm_eval.py +76 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/evalclaim.py +4 -2
- {proofbundle-0.5.0 → proofbundle-0.6.0/src/proofbundle.egg-info}/PKG-INFO +24 -12
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_adapters.py +16 -8
- proofbundle-0.5.0/src/proofbundle/adapters/lm_eval.py +0 -32
- {proofbundle-0.5.0 → proofbundle-0.6.0}/LICENSE +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/setup.cfg +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/adapters/__init__.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/adapters/inspect_ai.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/bundle.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/cli.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/emit.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/errors.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/intoto.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/merkle.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/py.typed +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/sdjwt.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/sdjwt_issue.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/signature.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/SOURCES.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/requires.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/top_level.txt +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_bundle.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_cli.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_cli_eval.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_emit.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_eval_claim_schema.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_evalclaim.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_intoto.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_merkle.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_merkle_property.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_rekor_interop.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_rfc6962_external_vectors.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_schema.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_sdjwt_issue.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_sdjwt_reference.py +0 -0
- {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_signature.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -55,17 +55,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
|
|
|
55
55
|
selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
|
|
56
56
|
|
|
57
57
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
58
|
-
[](https://pypi.org/project/proofbundle/)
|
|
59
|
-
[](https://pypi.org/project/proofbundle/)
|
|
58
|
+
[](https://pypi.org/project/proofbundle/)
|
|
59
|
+
[](https://pypi.org/project/proofbundle/)
|
|
60
|
+
[](https://pepy.tech/project/proofbundle)
|
|
60
61
|
[](LICENSE)
|
|
61
62
|
[](https://github.com/astral-sh/ruff)
|
|
62
63
|
[](https://slsa.dev)
|
|
64
|
+
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
63
65
|
|
|
64
66
|
</div>
|
|
65
67
|
|
|
66
68
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
67
69
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
68
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
70
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
69
71
|
|
|
70
72
|
## Contents
|
|
71
73
|
|
|
@@ -286,13 +288,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
|
|
|
286
288
|
itself is correct. Those are human judgements; what it removes is the need to simply
|
|
287
289
|
trust the number.
|
|
288
290
|
|
|
289
|
-
###
|
|
291
|
+
### A verification layer for trustworthy eval logs
|
|
290
292
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
293
|
+
The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
294
|
+
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
295
|
+
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
296
|
+
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
297
|
+
See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
|
|
298
|
+
|
|
299
|
+
- **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
300
|
+
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
301
|
+
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
302
|
+
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
303
|
+
genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
|
|
296
304
|
- **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
|
|
297
305
|
emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
|
|
298
306
|
digest is an *honest salted commitment* under a custom key, never `sha256` (see
|
|
@@ -303,6 +311,9 @@ trust the number.
|
|
|
303
311
|
bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
|
|
304
312
|
by proofbundle's own verifier **and** the `sd-jwt-python` reference.
|
|
305
313
|
|
|
314
|
+
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
315
|
+
attestation — see [SECURITY.md](SECURITY.md).
|
|
316
|
+
|
|
306
317
|
## Roadmap
|
|
307
318
|
|
|
308
319
|
- **v0.1** — the offline verifier plus a real example bundle.
|
|
@@ -310,8 +321,9 @@ trust the number.
|
|
|
310
321
|
- **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
|
|
311
322
|
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
312
323
|
salted commitments, issuer binding.
|
|
313
|
-
- **v0.5
|
|
314
|
-
|
|
324
|
+
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
325
|
+
- **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
|
|
326
|
+
INTEROP.md, CITATION.cff, PEP 740 attestations documented.
|
|
315
327
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
316
328
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
317
329
|
|
|
@@ -12,17 +12,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
|
|
|
12
12
|
selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
|
|
13
13
|
|
|
14
14
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
15
|
-
[](https://pypi.org/project/proofbundle/)
|
|
16
|
-
[](https://pypi.org/project/proofbundle/)
|
|
15
|
+
[](https://pypi.org/project/proofbundle/)
|
|
16
|
+
[](https://pypi.org/project/proofbundle/)
|
|
17
|
+
[](https://pepy.tech/project/proofbundle)
|
|
17
18
|
[](LICENSE)
|
|
18
19
|
[](https://github.com/astral-sh/ruff)
|
|
19
20
|
[](https://slsa.dev)
|
|
21
|
+
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
20
22
|
|
|
21
23
|
</div>
|
|
22
24
|
|
|
23
25
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
24
26
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
25
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
27
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
26
28
|
|
|
27
29
|
## Contents
|
|
28
30
|
|
|
@@ -243,13 +245,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
|
|
|
243
245
|
itself is correct. Those are human judgements; what it removes is the need to simply
|
|
244
246
|
trust the number.
|
|
245
247
|
|
|
246
|
-
###
|
|
248
|
+
### A verification layer for trustworthy eval logs
|
|
247
249
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
250
|
+
The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
251
|
+
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
252
|
+
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
253
|
+
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
254
|
+
See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
|
|
255
|
+
|
|
256
|
+
- **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
257
|
+
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
258
|
+
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
259
|
+
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
260
|
+
genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
|
|
253
261
|
- **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
|
|
254
262
|
emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
|
|
255
263
|
digest is an *honest salted commitment* under a custom key, never `sha256` (see
|
|
@@ -260,6 +268,9 @@ trust the number.
|
|
|
260
268
|
bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
|
|
261
269
|
by proofbundle's own verifier **and** the `sd-jwt-python` reference.
|
|
262
270
|
|
|
271
|
+
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
272
|
+
attestation — see [SECURITY.md](SECURITY.md).
|
|
273
|
+
|
|
263
274
|
## Roadmap
|
|
264
275
|
|
|
265
276
|
- **v0.1** — the offline verifier plus a real example bundle.
|
|
@@ -267,8 +278,9 @@ trust the number.
|
|
|
267
278
|
- **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
|
|
268
279
|
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
269
280
|
salted commitments, issuer binding.
|
|
270
|
-
- **v0.5
|
|
271
|
-
|
|
281
|
+
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
282
|
+
- **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
|
|
283
|
+
INTEROP.md, CITATION.cff, PEP 740 attestations documented.
|
|
272
284
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
273
285
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
274
286
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "proofbundle"
|
|
7
|
-
version = "0.5.0"
|
|
7
|
+
version = "0.6.0"
|
|
8
8
|
description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
|
|
|
13
13
|
from .errors import Check, ProofBundleError, VerificationResult
|
|
14
14
|
from .merkle import verify_consistency, verify_inclusion
|
|
15
15
|
|
|
16
|
-
__version__ = "0.5.0"
|
|
16
|
+
__version__ = "0.6.0"
|
|
17
17
|
|
|
18
18
|
__all__ = [
|
|
19
19
|
"__version__",
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Adapter for EleutherAI lm-evaluation-harness results_*.json (file-based, NO lm_eval import).
|
|
2
|
+
|
|
3
|
+
Parses the exported result JSON only — no runtime dependency on lm_eval, no runner rebuild.
|
|
4
|
+
|
|
5
|
+
Real 0.4.x format (validated against a genuine harness run, see tests/fixtures/lm_eval_arc_easy_real.json):
|
|
6
|
+
the metric keys carry a *filter suffix*, e.g. `"acc,none"`, and the standard error is a **sibling** key
|
|
7
|
+
`"acc_stderr,none"` (not nested). So a caller asking for metric `"acc"` is matched against `"acc,none"`
|
|
8
|
+
(or `"acc,<filter>"`). Provenance (git_hash, harness/task version, n-shot) is copied into the receipt's
|
|
9
|
+
optional `provenance` field so a verifier can trace exactly which run produced it.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from ..evalclaim import build_eval_claim
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _find_metric(res: dict, metric: str):
|
|
21
|
+
"""Return (value, stderr, matched_key) for `metric`, handling the `metric,<filter>` suffix format.
|
|
22
|
+
|
|
23
|
+
Prefers an exact `metric` key, then `metric,none`, then any `metric,<filter>`. The stderr sibling is
|
|
24
|
+
`metric_stderr,<same filter>`."""
|
|
25
|
+
if metric in res: # bare key (older/simple exports)
|
|
26
|
+
stderr = res.get(f"{metric}_stderr")
|
|
27
|
+
return res[metric], stderr, metric
|
|
28
|
+
if f"{metric},none" in res:
|
|
29
|
+
return res[f"{metric},none"], res.get(f"{metric}_stderr,none"), f"{metric},none"
|
|
30
|
+
for key in res: # any filter, e.g. metric,custom-filter
|
|
31
|
+
if key == metric or (key.startswith(f"{metric},") and not key.startswith(f"{metric}_stderr")):
|
|
32
|
+
flt = key.split(",", 1)[1] if "," in key else "none"
|
|
33
|
+
return res[key], res.get(f"{metric}_stderr,{flt}"), key
|
|
34
|
+
return None, None, None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
                         timestamp: str, model_salt: Optional[bytes] = None,
                         dataset_salt: Optional[bytes] = None):
    """Read an lm-evaluation-harness results_*.json and build an eval claim for `task`/`metric`.

    `metric` is the bare name (e.g. "acc"); the key actually present may be "acc,none". The score
    is passed on as a STRING (``repr`` of the JSON number) to avoid float canonicalization issues.
    The sample count comes from ``n-samples[task]`` ("effective", then "original"), falling back to
    the task's ``sample_len`` and finally 0. ``config.model_args`` is appended to the model id when
    present. The matched metric key, git hash, task version, n-shot and stderr are copied into the
    claim's optional ``provenance`` field when available. The issuer is left empty.

    Returns whatever `build_eval_claim` returns, i.e. ``(claim, salts)``.

    Raises:
        ValueError: the file is not a JSON object, `task` is absent from ``results``, or `metric`
            has no matching key for that task.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError("lm-eval results file must contain a JSON object")

    def as_dict(value) -> dict:
        # A missing, null or non-object section is treated as empty so a malformed file ends in
        # the ValueErrors below instead of an incidental AttributeError on `.get`.
        return value if isinstance(value, dict) else {}

    res = as_dict(data.get("results")).get(task)
    if res is None:
        raise ValueError(f"task not found in results: {task!r}")
    value, stderr, matched = _find_metric(res, metric)
    if value is None:
        # List every non-stderr key so the hint is also useful for bare-key (no filter suffix) exports.
        available = sorted(k for k in res if "_stderr" not in k)
        raise ValueError(f"metric {metric!r} not found in results[{task!r}] "
                         f"(available: {available})")
    score = value if isinstance(value, str) else repr(value)

    n_samples = as_dict(as_dict(data.get("n-samples")).get(task))
    n = int(n_samples.get("effective") or n_samples.get("original") or res.get("sample_len") or 0)
    cfg = as_dict(data.get("config"))
    model_id = str(cfg.get("model_name") or cfg.get("model") or "unknown")
    if cfg.get("model_args"):
        model_id = f"{model_id}::{cfg['model_args']}"  # include args so the commitment pins the exact model

    versions = as_dict(data.get("versions"))
    provenance = {"harness": "lm-evaluation-harness", "matched_metric_key": matched}
    if data.get("git_hash"):
        provenance["git_hash"] = str(data["git_hash"])
    if versions.get(task) is not None:
        provenance["task_version"] = str(versions[task])
    n_shot = as_dict(data.get("n-shot")).get(task)
    if n_shot is not None:
        provenance["n_shot"] = str(n_shot)
    if stderr is not None:
        provenance["stderr"] = stderr if isinstance(stderr, str) else repr(stderr)

    return build_eval_claim(
        suite=task, suite_version=str(versions.get(task, "lm-eval")),
        metric=metric, comparator=comparator, threshold=threshold, score=score, n=n,
        model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
        provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
|
|
@@ -37,7 +37,7 @@ _MAX_SAFE_INT = 2 ** 53 - 1
|
|
|
37
37
|
# The exact key set of an eval claim; decode/validate reject anything else.
|
|
38
38
|
_REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
|
|
39
39
|
"passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
|
|
40
|
-
_OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256"}
|
|
40
|
+
_OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256", "provenance"}
|
|
41
41
|
|
|
42
42
|
__all__ = [
|
|
43
43
|
"EVAL_CLAIM_SCHEMA", "COMMIT_ALG", "canonicalize", "build_eval_claim",
|
|
@@ -126,7 +126,7 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
|
|
|
126
126
|
threshold: str, score: str, n: int, model_id: str, dataset_id: str,
|
|
127
127
|
issuer: str, timestamp: str, context_binding: Optional[str] = None,
|
|
128
128
|
ci95: Optional[Sequence[str]] = None, multiple_testing: Optional[str] = None,
|
|
129
|
-
prereg_sha256: Optional[str] = None,
|
|
129
|
+
prereg_sha256: Optional[str] = None, provenance: Optional[dict] = None,
|
|
130
130
|
model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
|
|
131
131
|
"""Build a valid eval claim from raw values. Computes `passed` ITSELF from the comparator
|
|
132
132
|
(never trusts the caller), creates salted commitments, and returns (claim, salts) with the
|
|
@@ -163,6 +163,8 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
|
|
|
163
163
|
claim["multiple_testing"] = multiple_testing
|
|
164
164
|
if prereg_sha256 is not None:
|
|
165
165
|
claim["prereg_sha256"] = prereg_sha256
|
|
166
|
+
if provenance is not None:
|
|
167
|
+
claim["provenance"] = provenance
|
|
166
168
|
_reject_non_jcs(claim)
|
|
167
169
|
return claim, {"model_salt": m_salt, "dataset_salt": d_salt}
|
|
168
170
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -55,17 +55,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
|
|
|
55
55
|
selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
|
|
56
56
|
|
|
57
57
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
58
|
-
[](https://pypi.org/project/proofbundle/)
|
|
59
|
-
[](https://pypi.org/project/proofbundle/)
|
|
58
|
+
[](https://pypi.org/project/proofbundle/)
|
|
59
|
+
[](https://pypi.org/project/proofbundle/)
|
|
60
|
+
[](https://pepy.tech/project/proofbundle)
|
|
60
61
|
[](LICENSE)
|
|
61
62
|
[](https://github.com/astral-sh/ruff)
|
|
62
63
|
[](https://slsa.dev)
|
|
64
|
+
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
63
65
|
|
|
64
66
|
</div>
|
|
65
67
|
|
|
66
68
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
67
69
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
68
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
70
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
69
71
|
|
|
70
72
|
## Contents
|
|
71
73
|
|
|
@@ -286,13 +288,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
|
|
|
286
288
|
itself is correct. Those are human judgements; what it removes is the need to simply
|
|
287
289
|
trust the number.
|
|
288
290
|
|
|
289
|
-
###
|
|
291
|
+
### A verification layer for trustworthy eval logs
|
|
290
292
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
293
|
+
The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
294
|
+
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
295
|
+
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
296
|
+
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
297
|
+
See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
|
|
298
|
+
|
|
299
|
+
- **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
300
|
+
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
301
|
+
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
302
|
+
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
303
|
+
genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
|
|
296
304
|
- **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
|
|
297
305
|
emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
|
|
298
306
|
digest is an *honest salted commitment* under a custom key, never `sha256` (see
|
|
@@ -303,6 +311,9 @@ trust the number.
|
|
|
303
311
|
bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
|
|
304
312
|
by proofbundle's own verifier **and** the `sd-jwt-python` reference.
|
|
305
313
|
|
|
314
|
+
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
315
|
+
attestation — see [SECURITY.md](SECURITY.md).
|
|
316
|
+
|
|
306
317
|
## Roadmap
|
|
307
318
|
|
|
308
319
|
- **v0.1** — the offline verifier plus a real example bundle.
|
|
@@ -310,8 +321,9 @@ trust the number.
|
|
|
310
321
|
- **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
|
|
311
322
|
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
312
323
|
salted commitments, issuer binding.
|
|
313
|
-
- **v0.5
|
|
314
|
-
|
|
324
|
+
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
325
|
+
- **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
|
|
326
|
+
INTEROP.md, CITATION.cff, PEP 740 attestations documented.
|
|
315
327
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
316
328
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
317
329
|
|
|
@@ -9,15 +9,23 @@ TS = "2026-07-01T12:00:00Z"
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestAdapters(unittest.TestCase):
|
|
12
|
-
def
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
def test_lm_eval_real_acc_none_format(self):
    # The fixture is a genuine lm-evaluation-harness 0.4.12 export: the metric lives under
    # "acc,none" and its standard error under the sibling key "acc_stderr,none".
    fixture = FX / "lm_eval_arc_easy_real.json"
    receipt, _salts = from_lm_eval_results(
        fixture, "arc_easy", "acc",
        comparator=">=", threshold="0.30", timestamp=TS,
        model_salt=b"0" * 16, dataset_salt=b"1" * 16,
    )
    self.assertEqual(receipt["suite"], "arc_easy")
    self.assertTrue(receipt["passed"])  # acc 0.5 >= 0.30
    # The bare name "acc" must resolve to the filter-suffixed key.
    self.assertEqual(receipt["provenance"]["matched_metric_key"], "acc,none")
    # Run provenance is carried into the receipt.
    self.assertIn("git_hash", receipt["provenance"])
    self.assertEqual(receipt["provenance"]["n_shot"], "0")
    # The stderr comes from the sibling key, not a nested object.
    self.assertIn("stderr", receipt["provenance"])
|
|
23
|
+
|
|
24
|
+
def test_lm_eval_missing_metric_lists_available(self):
    # An unknown metric must fail with ValueError, and (as the test name promises) the message
    # must name the metric keys that do exist so the caller can correct the request.
    with self.assertRaises(ValueError) as ctx:
        from_lm_eval_results(FX / "lm_eval_arc_easy_real.json", "arc_easy", "nonexistent",
                             comparator=">=", threshold="0.5", timestamp=TS,
                             model_salt=b"0" * 16, dataset_salt=b"1" * 16)
    message = str(ctx.exception)
    self.assertIn("nonexistent", message)
    self.assertIn("acc,none", message)  # the fixture's real metric key is listed as available
|
|
21
29
|
|
|
22
30
|
def test_inspect_ai_stable_api(self):
|
|
23
31
|
# Real .eval log fixture, read via the stable inspect_ai.log.read_eval_log API (proofbundle[inspect]).
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
"""Adapter for EleutherAI lm-evaluation-harness results.json (file-based, no framework import)."""
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
import json
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Optional
|
|
7
|
-
|
|
8
|
-
from ..evalclaim import build_eval_claim
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
                         timestamp: str, model_salt: Optional[bytes] = None,
                         dataset_salt: Optional[bytes] = None):
    """Build an eval claim for `task`/`metric` from an lm-evaluation-harness results.json.

    Reads ``results[task][metric]`` as the score (kept as a STRING to avoid float
    canonicalization issues), ``n-samples[task]`` ("effective", then "original", else 0) as the
    sample count, and the top-level ``model_name`` (else ``config.model``, else "unknown") as the
    model id. Returns whatever `build_eval_claim` returns, i.e. (claim, salts).

    Raises ValueError when the task or the exact metric key is absent.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))

    task_results = payload.get("results", {}).get(task)
    if task_results is None or metric not in task_results:
        raise ValueError(f"task/metric not found in results: {task}/{metric}")

    raw_score = task_results[metric]
    score = raw_score if isinstance(raw_score, str) else repr(raw_score)

    counts = payload.get("n-samples", {}).get(task, {})
    n = int(counts.get("effective") or counts.get("original") or 0)

    # The config fallback is only consulted when there is no usable top-level model_name.
    model_id = str(payload.get("model_name") or payload.get("config", {}).get("model") or "unknown")
    suite_version = str(payload.get("config", {}).get("model_source", "lm-eval"))

    return build_eval_claim(
        suite=task,
        suite_version=suite_version,
        metric=metric,
        comparator=comparator,
        threshold=threshold,
        score=score,
        n=n,
        model_id=model_id,
        dataset_id=task,
        issuer="",
        timestamp=timestamp,
        model_salt=model_salt,
        dataset_salt=dataset_salt,
    )
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|