proofbundle 0.5.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. {proofbundle-0.5.0/src/proofbundle.egg-info → proofbundle-0.6.0}/PKG-INFO +24 -12
  2. {proofbundle-0.5.0 → proofbundle-0.6.0}/README.md +23 -11
  3. {proofbundle-0.5.0 → proofbundle-0.6.0}/pyproject.toml +1 -1
  4. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/__init__.py +1 -1
  5. proofbundle-0.6.0/src/proofbundle/adapters/lm_eval.py +76 -0
  6. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/evalclaim.py +4 -2
  7. {proofbundle-0.5.0 → proofbundle-0.6.0/src/proofbundle.egg-info}/PKG-INFO +24 -12
  8. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_adapters.py +16 -8
  9. proofbundle-0.5.0/src/proofbundle/adapters/lm_eval.py +0 -32
  10. {proofbundle-0.5.0 → proofbundle-0.6.0}/LICENSE +0 -0
  11. {proofbundle-0.5.0 → proofbundle-0.6.0}/setup.cfg +0 -0
  12. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/adapters/__init__.py +0 -0
  13. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/adapters/inspect_ai.py +0 -0
  14. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/bundle.py +0 -0
  15. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/cli.py +0 -0
  16. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/emit.py +0 -0
  17. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/errors.py +0 -0
  18. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/intoto.py +0 -0
  19. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/merkle.py +0 -0
  20. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/py.typed +0 -0
  21. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/sdjwt.py +0 -0
  22. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/sdjwt_issue.py +0 -0
  23. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle/signature.py +0 -0
  24. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/SOURCES.txt +0 -0
  25. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
  26. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
  27. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/requires.txt +0 -0
  28. {proofbundle-0.5.0 → proofbundle-0.6.0}/src/proofbundle.egg-info/top_level.txt +0 -0
  29. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_bundle.py +0 -0
  30. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_cli.py +0 -0
  31. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_cli_eval.py +0 -0
  32. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_emit.py +0 -0
  33. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_eval_claim_schema.py +0 -0
  34. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_evalclaim.py +0 -0
  35. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_intoto.py +0 -0
  36. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_merkle.py +0 -0
  37. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_merkle_property.py +0 -0
  38. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_rekor_interop.py +0 -0
  39. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_rfc6962_external_vectors.py +0 -0
  40. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_schema.py +0 -0
  41. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_sdjwt_issue.py +0 -0
  42. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_sdjwt_reference.py +0 -0
  43. {proofbundle-0.5.0 → proofbundle-0.6.0}/tests/test_signature.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -55,17 +55,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
55
55
  selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
56
56
 
57
57
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
58
- [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
59
- [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
58
+ [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
59
+ [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
60
+ [![Downloads](https://static.pepy.tech/badge/proofbundle)](https://pepy.tech/project/proofbundle)
60
61
  [![License: MIT](https://img.shields.io/badge/license-MIT-D6248A.svg)](LICENSE)
61
62
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
62
63
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
64
+ [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
63
65
 
64
66
  </div>
65
67
 
66
68
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
67
69
  verify` checks one self-contained `bundle.json` with three offline cryptographic
68
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 62 tests.
70
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
69
71
 
70
72
  ## Contents
71
73
 
@@ -286,13 +288,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
286
288
  itself is correct. Those are human judgements; what it removes is the need to simply
287
289
  trust the number.
288
290
 
289
- ### Since v0.5: framework adapter, in-toto, selective disclosure
291
+ ### A verification layer for trustworthy eval logs
290
292
 
291
- - **inspect_ai adapter** (`pip install "proofbundle[inspect]"`) reads a UK AISI
292
- [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable
293
- `read_eval_log` API (lazy import; the core stays dependency-free) and maps it to a claim.
294
- `proofbundle.adapters.from_lm_eval_results` reads lm-evaluation-harness `results.json`
295
- without importing anything.
293
+ The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
294
+ a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
295
+ missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
296
+ aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
297
+ See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
298
+
299
+ - **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
300
+ [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
301
+ API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
302
+ [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
303
+ genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
296
304
  - **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
297
305
  emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
298
306
  digest is an *honest salted commitment* under a custom key, never `sha256` (see
@@ -303,6 +311,9 @@ trust the number.
303
311
  bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
304
312
  by proofbundle's own verifier **and** the `sd-jwt-python` reference.
305
313
 
314
+ Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
315
+ attestation — see [SECURITY.md](SECURITY.md).
316
+
306
317
  ## Roadmap
307
318
 
308
319
  - **v0.1** — the offline verifier plus a real example bundle.
@@ -310,8 +321,9 @@ trust the number.
310
321
  - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
311
322
  - **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
312
323
  salted commitments, issuer binding.
313
- - **v0.5 (current release)** — inspect_ai adapter (stable API), in-toto Statement v1 view,
314
- and SD-JWT **issuance** per RFC 9901 (selective disclosure of the exact score).
324
+ - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
325
+ - **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
326
+ INTEROP.md, CITATION.cff, PEP 740 attestations documented.
315
327
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
316
328
  Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
317
329
 
@@ -12,17 +12,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
12
12
  selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
13
13
 
14
14
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
15
- [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
16
- [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
15
+ [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
16
+ [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
17
+ [![Downloads](https://static.pepy.tech/badge/proofbundle)](https://pepy.tech/project/proofbundle)
17
18
  [![License: MIT](https://img.shields.io/badge/license-MIT-D6248A.svg)](LICENSE)
18
19
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
19
20
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
21
+ [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
20
22
 
21
23
  </div>
22
24
 
23
25
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
24
26
  verify` checks one self-contained `bundle.json` with three offline cryptographic
25
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 62 tests.
27
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
26
28
 
27
29
  ## Contents
28
30
 
@@ -243,13 +245,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
243
245
  itself is correct. Those are human judgements; what it removes is the need to simply
244
246
  trust the number.
245
247
 
246
- ### Since v0.5: framework adapter, in-toto, selective disclosure
248
+ ### A verification layer for trustworthy eval logs
247
249
 
248
- - **inspect_ai adapter** (`pip install "proofbundle[inspect]"`) reads a UK AISI
249
- [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable
250
- `read_eval_log` API (lazy import; the core stays dependency-free) and maps it to a claim.
251
- `proofbundle.adapters.from_lm_eval_results` reads lm-evaluation-harness `results.json`
252
- without importing anything.
250
+ The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
251
+ a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
252
+ missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
253
+ aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
254
+ See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
255
+
256
+ - **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
257
+ [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
258
+ API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
259
+ [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
260
+ genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
253
261
  - **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
254
262
  emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
255
263
  digest is an *honest salted commitment* under a custom key, never `sha256` (see
@@ -260,6 +268,9 @@ trust the number.
260
268
  bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
261
269
  by proofbundle's own verifier **and** the `sd-jwt-python` reference.
262
270
 
271
+ Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
272
+ attestation — see [SECURITY.md](SECURITY.md).
273
+
263
274
  ## Roadmap
264
275
 
265
276
  - **v0.1** — the offline verifier plus a real example bundle.
@@ -267,8 +278,9 @@ trust the number.
267
278
  - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
268
279
  - **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
269
280
  salted commitments, issuer binding.
270
- - **v0.5 (current release)** — inspect_ai adapter (stable API), in-toto Statement v1 view,
271
- and SD-JWT **issuance** per RFC 9901 (selective disclosure of the exact score).
281
+ - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
282
+ - **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
283
+ INTEROP.md, CITATION.cff, PEP 740 attestations documented.
272
284
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
273
285
  Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
274
286
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "proofbundle"
7
- version = "0.5.0"
7
+ version = "0.6.0"
8
8
  description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
13
13
  from .errors import Check, ProofBundleError, VerificationResult
14
14
  from .merkle import verify_consistency, verify_inclusion
15
15
 
16
- __version__ = "0.5.0"
16
+ __version__ = "0.6.0"
17
17
 
18
18
  __all__ = [
19
19
  "__version__",
@@ -0,0 +1,76 @@
1
+ """Adapter for EleutherAI lm-evaluation-harness results_*.json (file-based, NO lm_eval import).
2
+
3
+ Parses the exported result JSON only — no runtime dependency on lm_eval, no runner rebuild.
4
+
5
+ Real 0.4.x format (validated against a genuine harness run, see tests/fixtures/lm_eval_arc_easy_real.json):
6
+ the metric keys carry a *filter suffix*, e.g. `"acc,none"`, and the standard error is a **sibling** key
7
+ `"acc_stderr,none"` (not nested). So a caller asking for metric `"acc"` is matched against `"acc,none"`
8
+ (or `"acc,<filter>"`). Provenance (git_hash, harness/task version, n-shot) is copied into the receipt's
9
+ optional `provenance` field so a verifier can trace exactly which run produced it.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ from ..evalclaim import build_eval_claim
18
+
19
+
20
+ def _find_metric(res: dict, metric: str):
21
+ """Return (value, stderr, matched_key) for `metric`, handling the `metric,<filter>` suffix format.
22
+
23
+ Prefers an exact `metric` key, then `metric,none`, then any `metric,<filter>`. The stderr sibling is
24
+ `metric_stderr,<same filter>`."""
25
+ if metric in res: # bare key (older/simple exports)
26
+ stderr = res.get(f"{metric}_stderr")
27
+ return res[metric], stderr, metric
28
+ if f"{metric},none" in res:
29
+ return res[f"{metric},none"], res.get(f"{metric}_stderr,none"), f"{metric},none"
30
+ for key in res: # any filter, e.g. metric,custom-filter
31
+ if key == metric or (key.startswith(f"{metric},") and not key.startswith(f"{metric}_stderr")):
32
+ flt = key.split(",", 1)[1] if "," in key else "none"
33
+ return res[key], res.get(f"{metric}_stderr,{flt}"), key
34
+ return None, None, None
35
+
36
+
37
+ def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
38
+ timestamp: str, model_salt: Optional[bytes] = None,
39
+ dataset_salt: Optional[bytes] = None):
40
+ """Read an lm-evaluation-harness results_*.json and build an eval claim for `task`/`metric`.
41
+
42
+ `metric` is the bare name (e.g. "acc"); the real key may be "acc,none". The score is read as a STRING
43
+ to avoid float canonicalization issues. Returns (claim, salts).
44
+ """
45
+ data = json.loads(Path(path).read_text(encoding="utf-8"))
46
+ res = data.get("results", {}).get(task)
47
+ if res is None:
48
+ raise ValueError(f"task not found in results: {task!r}")
49
+ value, stderr, matched = _find_metric(res, metric)
50
+ if value is None:
51
+ raise ValueError(f"metric {metric!r} not found in results[{task!r}] "
52
+ f"(available: {sorted(k for k in res if ',' in k)})")
53
+ score = value if isinstance(value, str) else repr(value)
54
+
55
+ n_samples = data.get("n-samples", {}).get(task, {})
56
+ n = int(n_samples.get("effective") or n_samples.get("original") or res.get("sample_len") or 0)
57
+ cfg = data.get("config", {})
58
+ model_id = str(cfg.get("model_name") or cfg.get("model") or "unknown")
59
+ if cfg.get("model_args"):
60
+ model_id = f"{model_id}::{cfg['model_args']}" # include args so the commitment pins the exact model
61
+
62
+ provenance = {"harness": "lm-evaluation-harness", "matched_metric_key": matched}
63
+ if data.get("git_hash"):
64
+ provenance["git_hash"] = str(data["git_hash"])
65
+ if data.get("versions", {}).get(task) is not None:
66
+ provenance["task_version"] = str(data["versions"][task])
67
+ if data.get("n-shot", {}).get(task) is not None:
68
+ provenance["n_shot"] = str(data["n-shot"][task])
69
+ if stderr is not None:
70
+ provenance["stderr"] = repr(stderr) if not isinstance(stderr, str) else stderr
71
+
72
+ return build_eval_claim(
73
+ suite=task, suite_version=str(data.get("versions", {}).get(task, "lm-eval")),
74
+ metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
75
+ model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
76
+ provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
@@ -37,7 +37,7 @@ _MAX_SAFE_INT = 2 ** 53 - 1
37
37
  # The exact key set of an eval claim; decode/validate reject anything else.
38
38
  _REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
39
39
  "passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
40
- _OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256"}
40
+ _OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256", "provenance"}
41
41
 
42
42
  __all__ = [
43
43
  "EVAL_CLAIM_SCHEMA", "COMMIT_ALG", "canonicalize", "build_eval_claim",
@@ -126,7 +126,7 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
126
126
  threshold: str, score: str, n: int, model_id: str, dataset_id: str,
127
127
  issuer: str, timestamp: str, context_binding: Optional[str] = None,
128
128
  ci95: Optional[Sequence[str]] = None, multiple_testing: Optional[str] = None,
129
- prereg_sha256: Optional[str] = None,
129
+ prereg_sha256: Optional[str] = None, provenance: Optional[dict] = None,
130
130
  model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
131
131
  """Build a valid eval claim from raw values. Computes `passed` ITSELF from the comparator
132
132
  (never trusts the caller), creates salted commitments, and returns (claim, salts) with the
@@ -163,6 +163,8 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
163
163
  claim["multiple_testing"] = multiple_testing
164
164
  if prereg_sha256 is not None:
165
165
  claim["prereg_sha256"] = prereg_sha256
166
+ if provenance is not None:
167
+ claim["provenance"] = provenance
166
168
  _reject_non_jcs(claim)
167
169
  return claim, {"model_salt": m_salt, "dataset_salt": d_salt}
168
170
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -55,17 +55,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
55
55
  selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
56
56
 
57
57
  [![CI](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml/badge.svg)](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
58
- [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
59
- [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A)](https://pypi.org/project/proofbundle/)
58
+ [![PyPI](https://img.shields.io/pypi/v/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
59
+ [![Python](https://img.shields.io/pypi/pyversions/proofbundle.svg?color=D6248A&cacheSeconds=3600)](https://pypi.org/project/proofbundle/)
60
+ [![Downloads](https://static.pepy.tech/badge/proofbundle)](https://pepy.tech/project/proofbundle)
60
61
  [![License: MIT](https://img.shields.io/badge/license-MIT-D6248A.svg)](LICENSE)
61
62
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
62
63
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
64
+ [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
63
65
 
64
66
  </div>
65
67
 
66
68
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
67
69
  verify` checks one self-contained `bundle.json` with three offline cryptographic
68
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 62 tests.
70
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
69
71
 
70
72
  ## Contents
71
73
 
@@ -286,13 +288,19 @@ commitments — it does **not** prove the evaluation was well designed or that t
286
288
  itself is correct. Those are human judgements; what it removes is the need to simply
287
289
  trust the number.
288
290
 
289
- ### Since v0.5: framework adapter, in-toto, selective disclosure
291
+ ### A verification layer for trustworthy eval logs
290
292
 
291
- - **inspect_ai adapter** (`pip install "proofbundle[inspect]"`) reads a UK AISI
292
- [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable
293
- `read_eval_log` API (lazy import; the core stays dependency-free) and maps it to a claim.
294
- `proofbundle.adapters.from_lm_eval_results` reads lm-evaluation-harness `results.json`
295
- without importing anything.
293
+ The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
294
+ a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
295
+ missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
296
+ aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
297
+ See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
298
+
299
+ - **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
300
+ [inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
301
+ API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
302
+ [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
303
+ genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
296
304
  - **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
297
305
  emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
298
306
  digest is an *honest salted commitment* under a custom key, never `sha256` (see
@@ -303,6 +311,9 @@ trust the number.
303
311
  bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
304
312
  by proofbundle's own verifier **and** the `sd-jwt-python` reference.
305
313
 
314
+ Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
315
+ attestation — see [SECURITY.md](SECURITY.md).
316
+
306
317
  ## Roadmap
307
318
 
308
319
  - **v0.1** — the offline verifier plus a real example bundle.
@@ -310,8 +321,9 @@ trust the number.
310
321
  - **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
311
322
  - **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
312
323
  salted commitments, issuer binding.
313
- - **v0.5 (current release)** — inspect_ai adapter (stable API), in-toto Statement v1 view,
314
- and SD-JWT **issuance** per RFC 9901 (selective disclosure of the exact score).
324
+ - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
325
+ - **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
326
+ INTEROP.md, CITATION.cff, PEP 740 attestations documented.
315
327
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
316
328
  Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
317
329
 
@@ -9,15 +9,23 @@ TS = "2026-07-01T12:00:00Z"
9
9
 
10
10
 
11
11
  class TestAdapters(unittest.TestCase):
12
- def test_lm_eval(self):
13
- claim, salts = from_lm_eval_results(FX / "lm_eval_results.json", "hellaswag", "acc",
14
- comparator=">=", threshold="0.70", timestamp=TS,
12
+ def test_lm_eval_real_acc_none_format(self):
13
+ # REAL lm-evaluation-harness 0.4.12 export: metric key is "acc,none", stderr sibling "acc_stderr,none".
14
+ claim, salts = from_lm_eval_results(FX / "lm_eval_arc_easy_real.json", "arc_easy", "acc",
15
+ comparator=">=", threshold="0.30", timestamp=TS,
15
16
  model_salt=b"0" * 16, dataset_salt=b"1" * 16)
16
- self.assertEqual(claim["suite"], "hellaswag")
17
- self.assertEqual(claim["threshold"], "0.70")
18
- self.assertTrue(claim["passed"]) # 0.7534 >= 0.70
19
- self.assertNotIn("acme/model-x", str(claim)) # id only as salted commitment
20
- self.assertEqual(claim["n"], 10042)
17
+ self.assertEqual(claim["suite"], "arc_easy")
18
+ self.assertTrue(claim["passed"]) # acc 0.5 >= 0.30
19
+ self.assertEqual(claim["provenance"]["matched_metric_key"], "acc,none") # suffix handled
20
+ self.assertIn("git_hash", claim["provenance"]) # provenance captured
21
+ self.assertEqual(claim["provenance"]["n_shot"], "0")
22
+ self.assertIn("stderr", claim["provenance"]) # sibling stderr, not nested
23
+
24
+ def test_lm_eval_missing_metric_lists_available(self):
25
+ with self.assertRaises(ValueError):
26
+ from_lm_eval_results(FX / "lm_eval_arc_easy_real.json", "arc_easy", "nonexistent",
27
+ comparator=">=", threshold="0.5", timestamp=TS,
28
+ model_salt=b"0" * 16, dataset_salt=b"1" * 16)
21
29
 
22
30
  def test_inspect_ai_stable_api(self):
23
31
  # Real .eval log fixture, read via the stable inspect_ai.log.read_eval_log API (proofbundle[inspect]).
@@ -1,32 +0,0 @@
1
- """Adapter for EleutherAI lm-evaluation-harness results.json (file-based, no framework import)."""
2
- from __future__ import annotations
3
-
4
- import json
5
- from pathlib import Path
6
- from typing import Optional
7
-
8
- from ..evalclaim import build_eval_claim
9
-
10
-
11
- def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
12
- timestamp: str, model_salt: Optional[bytes] = None,
13
- dataset_salt: Optional[bytes] = None):
14
- """Read an lm-evaluation-harness results.json and build an eval claim for `task`/`metric`.
15
-
16
- Expects the standard shape: {"results": {task: {metric: <number>, ...}, ...},
17
- "n-samples": {task: {"effective": n}}, "config"/"model_name": ...}. The score is read as a
18
- STRING to avoid float canonicalization issues. Returns (claim, salts).
19
- """
20
- data = json.loads(Path(path).read_text(encoding="utf-8"))
21
- res = data.get("results", {}).get(task)
22
- if res is None or metric not in res:
23
- raise ValueError(f"task/metric not found in results: {task}/{metric}")
24
- score = repr(res[metric]) if not isinstance(res[metric], str) else res[metric]
25
- n = int(data.get("n-samples", {}).get(task, {}).get("effective")
26
- or data.get("n-samples", {}).get(task, {}).get("original") or 0)
27
- model_id = str(data.get("model_name") or data.get("config", {}).get("model") or "unknown")
28
- return build_eval_claim(
29
- suite=task, suite_version=str(data.get("config", {}).get("model_source", "lm-eval")),
30
- metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
31
- model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
32
- model_salt=model_salt, dataset_salt=dataset_salt)
File without changes
File without changes