proofbundle 0.5.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. {proofbundle-0.5.0/src/proofbundle.egg-info → proofbundle-0.7.0}/PKG-INFO +28 -12
  2. {proofbundle-0.5.0 → proofbundle-0.7.0}/README.md +27 -11
  3. {proofbundle-0.5.0 → proofbundle-0.7.0}/pyproject.toml +1 -1
  4. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/__init__.py +1 -1
  5. proofbundle-0.7.0/src/proofbundle/adapters/lm_eval.py +76 -0
  6. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/evalclaim.py +4 -2
  7. {proofbundle-0.5.0 → proofbundle-0.7.0/src/proofbundle.egg-info}/PKG-INFO +28 -12
  8. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_adapters.py +16 -8
  9. proofbundle-0.5.0/src/proofbundle/adapters/lm_eval.py +0 -32
  10. {proofbundle-0.5.0 → proofbundle-0.7.0}/LICENSE +0 -0
  11. {proofbundle-0.5.0 → proofbundle-0.7.0}/setup.cfg +0 -0
  12. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/adapters/__init__.py +0 -0
  13. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/adapters/inspect_ai.py +0 -0
  14. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/bundle.py +0 -0
  15. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/cli.py +0 -0
  16. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/emit.py +0 -0
  17. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/errors.py +0 -0
  18. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/intoto.py +0 -0
  19. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/merkle.py +0 -0
  20. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/py.typed +0 -0
  21. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/sdjwt.py +0 -0
  22. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/sdjwt_issue.py +0 -0
  23. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle/signature.py +0 -0
  24. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/SOURCES.txt +0 -0
  25. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
  26. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
  27. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/requires.txt +0 -0
  28. {proofbundle-0.5.0 → proofbundle-0.7.0}/src/proofbundle.egg-info/top_level.txt +0 -0
  29. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_bundle.py +0 -0
  30. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_cli.py +0 -0
  31. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_cli_eval.py +0 -0
  32. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_emit.py +0 -0
  33. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_eval_claim_schema.py +0 -0
  34. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_evalclaim.py +0 -0
  35. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_intoto.py +0 -0
  36. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_merkle.py +0 -0
  37. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_merkle_property.py +0 -0
  38. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_rekor_interop.py +0 -0
  39. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_rfc6962_external_vectors.py +0 -0
  40. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_schema.py +0 -0
  41. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_sdjwt_issue.py +0 -0
  42. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_sdjwt_reference.py +0 -0
  43. {proofbundle-0.5.0 → proofbundle-0.7.0}/tests/test_signature.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.5.0
3
+ Version: 0.7.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -55,17 +55,21 @@ signed and anchored in a tamper-evident log — and optionally carries a
55
55
  selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
56
56
 
57
57
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
58
- [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
59
- [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
58
+ [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
59
+ [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
60
+ [![Downloads](https://static.pepy.tech/badge/proofbundle)](https://pepy.tech/project/proofbundle)
60
61
  [![License: MIT](https://img.shields.io/badge/license-MIT-D6248A.svg)](LICENSE)
61
62
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
62
63
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
64
+ [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
65
+ <!-- DOI badge placeholder: Zenodo is linked and archives each release. Add the Zenodo concept-DOI badge
66
+ here (and the DOI to CITATION.cff) once Zenodo assigns it — it does not exist at build time. -->
63
67
 
64
68
  </div>
65
69
 
66
70
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
67
71
  verify` checks one self-contained `bundle.json` with three offline cryptographic
68
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 62 tests.
72
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
69
73
 
70
74
  ## Contents
71
75
 
@@ -286,13 +290,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
286
290
  itself is correct. Those are human judgements; what it removes is the need to simply
287
291
  trust the number.
288
292
 
289
- ### Since v0.5: framework adapter, in-toto, selective disclosure
293
+ ### A verification layer for trustworthy eval logs
290
294
 
291
- - **inspect_ai adapter** (`pip install "proofbundle[inspect]"`) reads a UK AISI
292
- [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable
293
- `read_eval_log` API (lazy import; the core stays dependency-free) and maps it to a claim.
294
- `proofbundle.adapters.from_lm_eval_results` reads lm-evaluation-harness `results.json`
295
- without importing anything.
295
+ The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
296
+ a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
297
+ missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
298
+ aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
299
+ See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
300
+
301
+ - **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
302
+ [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
303
+ API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
304
+ [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
305
+ genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
296
306
  - **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
297
307
  emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
298
308
  digest is an *honest salted commitment* under a custom key, never `sha256` (see
@@ -303,6 +313,9 @@ trust the number.
303
313
  bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
304
314
  by proofbundle's own verifier **and** the `sd-jwt-python` reference.
305
315
 
316
+ Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
317
+ attestation — see [SECURITY.md](SECURITY.md).
318
+
306
319
  ## Roadmap
307
320
 
308
321
  - **v0.1** — the offline verifier plus a real example bundle.
@@ -310,8 +323,11 @@ trust the number.
310
323
  - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
311
324
  - **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
312
325
  salted commitments, issuer binding.
313
- - **v0.5 (current release)** — inspect_ai adapter (stable API), in-toto Statement v1 view,
314
- and SD-JWT **issuance** per RFC 9901 (selective disclosure of the exact score).
326
+ - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
327
+ - **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
328
+ CITATION.cff, PEP 740 attestations documented.
329
+ - **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
330
+ (assigned on release), and a draft in-toto ML-eval predicate proposal.
315
331
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
316
332
  Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
317
333
 
@@ -12,17 +12,21 @@ signed and anchored in a tamper-evident log — and optionally carries a
12
12
  selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
13
13
 
14
14
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
15
- [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
16
- [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
15
+ [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
16
+ [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
17
+ [![Downloads](https://static.pepy.tech/badge/proofbundle)](https://pepy.tech/project/proofbundle)
17
18
  [![License: MIT](https://img.shields.io/badge/license-MIT-D6248A.svg)](LICENSE)
18
19
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
19
20
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
21
+ [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
22
+ <!-- DOI badge placeholder: Zenodo is linked and archives each release. Add the Zenodo concept-DOI badge
23
+ here (and the DOI to CITATION.cff) once Zenodo assigns it — it does not exist at build time. -->
20
24
 
21
25
  </div>
22
26
 
23
27
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
24
28
  verify` checks one self-contained `bundle.json` with three offline cryptographic
25
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 62 tests.
29
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
26
30
 
27
31
  ## Contents
28
32
 
@@ -243,13 +247,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
243
247
  itself is correct. Those are human judgements; what it removes is the need to simply
244
248
  trust the number.
245
249
 
246
- ### Since v0.5: framework adapter, in-toto, selective disclosure
250
+ ### A verification layer for trustworthy eval logs
247
251
 
248
- - **inspect_ai adapter** (`pip install "proofbundle[inspect]"`) reads a UK AISI
249
- [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable
250
- `read_eval_log` API (lazy import; the core stays dependency-free) and maps it to a claim.
251
- `proofbundle.adapters.from_lm_eval_results` reads lm-evaluation-harness `results.json`
252
- without importing anything.
252
+ The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
253
+ a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
254
+ missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
255
+ aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
256
+ See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
257
+
258
+ - **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
259
+ [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
260
+ API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
261
+ [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
262
+ genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
253
263
  - **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
254
264
  emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
255
265
  digest is an *honest salted commitment* under a custom key, never `sha256` (see
@@ -260,6 +270,9 @@ trust the number.
260
270
  bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
261
271
  by proofbundle's own verifier **and** the `sd-jwt-python` reference.
262
272
 
273
+ Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
274
+ attestation — see [SECURITY.md](SECURITY.md).
275
+
263
276
  ## Roadmap
264
277
 
265
278
  - **v0.1** — the offline verifier plus a real example bundle.
@@ -267,8 +280,11 @@ trust the number.
267
280
  - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
268
281
  - **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
269
282
  salted commitments, issuer binding.
270
- - **v0.5 (current release)** — inspect_ai adapter (stable API), in-toto Statement v1 view,
271
- and SD-JWT **issuance** per RFC 9901 (selective disclosure of the exact score).
283
+ - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
284
+ - **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
285
+ CITATION.cff, PEP 740 attestations documented.
286
+ - **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
287
+ (assigned on release), and a draft in-toto ML-eval predicate proposal.
272
288
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
273
289
  Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
274
290
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "proofbundle"
7
- version = "0.5.0"
7
+ version = "0.7.0"
8
8
  description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
13
13
  from .errors import Check, ProofBundleError, VerificationResult
14
14
  from .merkle import verify_consistency, verify_inclusion
15
15
 
16
- __version__ = "0.5.0"
16
+ __version__ = "0.7.0"
17
17
 
18
18
  __all__ = [
19
19
  "__version__",
@@ -0,0 +1,76 @@
1
+ """Adapter for EleutherAI lm-evaluation-harness results_*.json (file-based, NO lm_eval import).
2
+
3
+ Parses the exported result JSON only — no runtime dependency on lm_eval, no runner rebuild.
4
+
5
+ Real 0.4.x format (validated against a genuine harness run, see tests/fixtures/lm_eval_arc_easy_real.json):
6
+ the metric keys carry a *filter suffix*, e.g. `"acc,none"`, and the standard error is a **sibling** key
7
+ `"acc_stderr,none"` (not nested). So a caller asking for metric `"acc"` is matched against `"acc,none"`
8
+ (or `"acc,<filter>"`). Provenance (git_hash, harness/task version, n-shot) is copied into the receipt's
9
+ optional `provenance` field so a verifier can trace exactly which run produced it.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ from ..evalclaim import build_eval_claim
18
+
19
+
20
+ def _find_metric(res: dict, metric: str):
21
+ """Return (value, stderr, matched_key) for `metric`, handling the `metric,<filter>` suffix format.
22
+
23
+ Prefers an exact `metric` key, then `metric,none`, then any `metric,<filter>`. The stderr sibling is
24
+ `metric_stderr,<same filter>`."""
25
+ if metric in res: # bare key (older/simple exports)
26
+ stderr = res.get(f"{metric}_stderr")
27
+ return res[metric], stderr, metric
28
+ if f"{metric},none" in res:
29
+ return res[f"{metric},none"], res.get(f"{metric}_stderr,none"), f"{metric},none"
30
+ for key in res: # any filter, e.g. metric,custom-filter
31
+ if key == metric or (key.startswith(f"{metric},") and not key.startswith(f"{metric}_stderr")):
32
+ flt = key.split(",", 1)[1] if "," in key else "none"
33
+ return res[key], res.get(f"{metric}_stderr,{flt}"), key
34
+ return None, None, None
35
+
36
+
37
+ def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
38
+ timestamp: str, model_salt: Optional[bytes] = None,
39
+ dataset_salt: Optional[bytes] = None):
40
+ """Read an lm-evaluation-harness results_*.json and build an eval claim for `task`/`metric`.
41
+
42
+ `metric` is the bare name (e.g. "acc"); the real key may be "acc,none". The score is read as a STRING
43
+ to avoid float canonicalization issues. Returns (claim, salts).
44
+ """
45
+ data = json.loads(Path(path).read_text(encoding="utf-8"))
46
+ res = data.get("results", {}).get(task)
47
+ if res is None:
48
+ raise ValueError(f"task not found in results: {task!r}")
49
+ value, stderr, matched = _find_metric(res, metric)
50
+ if value is None:
51
+ raise ValueError(f"metric {metric!r} not found in results[{task!r}] "
52
+ f"(available: {sorted(k for k in res if ',' in k)})")
53
+ score = value if isinstance(value, str) else repr(value)
54
+
55
+ n_samples = data.get("n-samples", {}).get(task, {})
56
+ n = int(n_samples.get("effective") or n_samples.get("original") or res.get("sample_len") or 0)
57
+ cfg = data.get("config", {})
58
+ model_id = str(cfg.get("model_name") or cfg.get("model") or "unknown")
59
+ if cfg.get("model_args"):
60
+ model_id = f"{model_id}::{cfg['model_args']}" # include args so the commitment pins the exact model
61
+
62
+ provenance = {"harness": "lm-evaluation-harness", "matched_metric_key": matched}
63
+ if data.get("git_hash"):
64
+ provenance["git_hash"] = str(data["git_hash"])
65
+ if data.get("versions", {}).get(task) is not None:
66
+ provenance["task_version"] = str(data["versions"][task])
67
+ if data.get("n-shot", {}).get(task) is not None:
68
+ provenance["n_shot"] = str(data["n-shot"][task])
69
+ if stderr is not None:
70
+ provenance["stderr"] = repr(stderr) if not isinstance(stderr, str) else stderr
71
+
72
+ return build_eval_claim(
73
+ suite=task, suite_version=str(data.get("versions", {}).get(task, "lm-eval")),
74
+ metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
75
+ model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
76
+ provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
@@ -37,7 +37,7 @@ _MAX_SAFE_INT = 2 ** 53 - 1
37
37
  # The exact key set of an eval claim; decode/validate reject anything else.
38
38
  _REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
39
39
  "passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
40
- _OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256"}
40
+ _OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256", "provenance"}
41
41
 
42
42
  __all__ = [
43
43
  "EVAL_CLAIM_SCHEMA", "COMMIT_ALG", "canonicalize", "build_eval_claim",
@@ -126,7 +126,7 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
126
126
  threshold: str, score: str, n: int, model_id: str, dataset_id: str,
127
127
  issuer: str, timestamp: str, context_binding: Optional[str] = None,
128
128
  ci95: Optional[Sequence[str]] = None, multiple_testing: Optional[str] = None,
129
- prereg_sha256: Optional[str] = None,
129
+ prereg_sha256: Optional[str] = None, provenance: Optional[dict] = None,
130
130
  model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
131
131
  """Build a valid eval claim from raw values. Computes `passed` ITSELF from the comparator
132
132
  (never trusts the caller), creates salted commitments, and returns (claim, salts) with the
@@ -163,6 +163,8 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
163
163
  claim["multiple_testing"] = multiple_testing
164
164
  if prereg_sha256 is not None:
165
165
  claim["prereg_sha256"] = prereg_sha256
166
+ if provenance is not None:
167
+ claim["provenance"] = provenance
166
168
  _reject_non_jcs(claim)
167
169
  return claim, {"model_salt": m_salt, "dataset_salt": d_salt}
168
170
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.5.0
3
+ Version: 0.7.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -55,17 +55,21 @@ signed and anchored in a tamper-evident log — and optionally carries a
55
55
  selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
56
56
 
57
57
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
58
- [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
59
- [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
58
+ [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
59
+ [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
60
+ [![Downloads](https://static.pepy.tech/badge/proofbundle)](https://pepy.tech/project/proofbundle)
60
61
  [![License: MIT](https://img.shields.io/badge/license-MIT-D6248A.svg)](LICENSE)
61
62
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
62
63
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
64
+ [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
65
+ <!-- DOI badge placeholder: Zenodo is linked and archives each release. Add the Zenodo concept-DOI badge
66
+ here (and the DOI to CITATION.cff) once Zenodo assigns it — it does not exist at build time. -->
63
67
 
64
68
  </div>
65
69
 
66
70
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
67
71
  verify` checks one self-contained `bundle.json` with three offline cryptographic
68
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 62 tests.
72
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
69
73
 
70
74
  ## Contents
71
75
 
@@ -286,13 +290,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
286
290
  itself is correct. Those are human judgements; what it removes is the need to simply
287
291
  trust the number.
288
292
 
289
- ### Since v0.5: framework adapter, in-toto, selective disclosure
293
+ ### A verification layer for trustworthy eval logs
290
294
 
291
- - **inspect_ai adapter** (`pip install "proofbundle[inspect]"`) reads a UK AISI
292
- [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable
293
- `read_eval_log` API (lazy import; the core stays dependency-free) and maps it to a claim.
294
- `proofbundle.adapters.from_lm_eval_results` reads lm-evaluation-harness `results.json`
295
- without importing anything.
295
+ The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
296
+ a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
297
+ missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
298
+ aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
299
+ See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
300
+
301
+ - **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
302
+ [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
303
+ API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
304
+ [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
305
+ genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
296
306
  - **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
297
307
  emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
298
308
  digest is an *honest salted commitment* under a custom key, never `sha256` (see
@@ -303,6 +313,9 @@ trust the number.
303
313
  bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
304
314
  by proofbundle's own verifier **and** the `sd-jwt-python` reference.
305
315
 
316
+ Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
317
+ attestation — see [SECURITY.md](SECURITY.md).
318
+
306
319
  ## Roadmap
307
320
 
308
321
  - **v0.1** — the offline verifier plus a real example bundle.
@@ -310,8 +323,11 @@ trust the number.
310
323
  - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
311
324
  - **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
312
325
  salted commitments, issuer binding.
313
- - **v0.5 (current release)** — inspect_ai adapter (stable API), in-toto Statement v1 view,
314
- and SD-JWT **issuance** per RFC 9901 (selective disclosure of the exact score).
326
+ - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
327
+ - **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
328
+ CITATION.cff, PEP 740 attestations documented.
329
+ - **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
330
+ (assigned on release), and a draft in-toto ML-eval predicate proposal.
315
331
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
316
332
  Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
317
333
 
@@ -9,15 +9,23 @@ TS = "2026-07-01T12:00:00Z"
9
9
 
10
10
 
11
11
  class TestAdapters(unittest.TestCase):
12
- def test_lm_eval(self):
13
- claim, salts = from_lm_eval_results(FX / "lm_eval_results.json", "hellaswag", "acc",
14
- comparator=">=", threshold="0.70", timestamp=TS,
12
+ def test_lm_eval_real_acc_none_format(self):
13
+ # REAL lm-evaluation-harness 0.4.12 export: metric key is "acc,none", stderr sibling "acc_stderr,none".
14
+ claim, salts = from_lm_eval_results(FX / "lm_eval_arc_easy_real.json", "arc_easy", "acc",
15
+ comparator=">=", threshold="0.30", timestamp=TS,
15
16
  model_salt=b"0" * 16, dataset_salt=b"1" * 16)
16
- self.assertEqual(claim["suite"], "hellaswag")
17
- self.assertEqual(claim["threshold"], "0.70")
18
- self.assertTrue(claim["passed"]) # 0.7534 >= 0.70
19
- self.assertNotIn("acme/model-x", str(claim)) # id only as salted commitment
20
- self.assertEqual(claim["n"], 10042)
17
+ self.assertEqual(claim["suite"], "arc_easy")
18
+ self.assertTrue(claim["passed"]) # acc 0.5 >= 0.30
19
+ self.assertEqual(claim["provenance"]["matched_metric_key"], "acc,none") # suffix handled
20
+ self.assertIn("git_hash", claim["provenance"]) # provenance captured
21
+ self.assertEqual(claim["provenance"]["n_shot"], "0")
22
+ self.assertIn("stderr", claim["provenance"]) # sibling stderr, not nested
23
+
24
+ def test_lm_eval_missing_metric_lists_available(self):
25
+ with self.assertRaises(ValueError):
26
+ from_lm_eval_results(FX / "lm_eval_arc_easy_real.json", "arc_easy", "nonexistent",
27
+ comparator=">=", threshold="0.5", timestamp=TS,
28
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
21
29
 
22
30
  def test_inspect_ai_stable_api(self):
23
31
  # Real .eval log fixture, read via the stable inspect_ai.log.read_eval_log API (proofbundle[inspect]).
@@ -1,32 +0,0 @@
1
- """Adapter for EleutherAI lm-evaluation-harness results.json (file-based, no framework import)."""
2
- from __future__ import annotations
3
-
4
- import json
5
- from pathlib import Path
6
- from typing import Optional
7
-
8
- from ..evalclaim import build_eval_claim
9
-
10
-
11
- def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
12
- timestamp: str, model_salt: Optional[bytes] = None,
13
- dataset_salt: Optional[bytes] = None):
14
- """Read an lm-evaluation-harness results.json and build an eval claim for `task`/`metric`.
15
-
16
- Expects the standard shape: {"results": {task: {metric: <number>, ...}, ...},
17
- "n-samples": {task: {"effective": n}}, "config"/"model_name": ...}. The score is read as a
18
- STRING to avoid float canonicalization issues. Returns (claim, salts).
19
- """
20
- data = json.loads(Path(path).read_text(encoding="utf-8"))
21
- res = data.get("results", {}).get(task)
22
- if res is None or metric not in res:
23
- raise ValueError(f"task/metric not found in results: {task}/{metric}")
24
- score = repr(res[metric]) if not isinstance(res[metric], str) else res[metric]
25
- n = int(data.get("n-samples", {}).get(task, {}).get("effective")
26
- or data.get("n-samples", {}).get(task, {}).get("original") or 0)
27
- model_id = str(data.get("model_name") or data.get("config", {}).get("model") or "unknown")
28
- return build_eval_claim(
29
- suite=task, suite_version=str(data.get("config", {}).get("model_source", "lm-eval")),
30
- metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
31
- model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
32
- model_salt=model_salt, dataset_salt=dataset_salt)
File without changes
File without changes