proofbundle 0.4.1__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {proofbundle-0.4.1/src/proofbundle.egg-info → proofbundle-0.6.0}/PKG-INFO +43 -8
- {proofbundle-0.4.1 → proofbundle-0.6.0}/README.md +38 -7
- {proofbundle-0.4.1 → proofbundle-0.6.0}/pyproject.toml +8 -3
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/__init__.py +1 -1
- proofbundle-0.6.0/src/proofbundle/adapters/inspect_ai.py +65 -0
- proofbundle-0.6.0/src/proofbundle/adapters/lm_eval.py +76 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/evalclaim.py +4 -2
- proofbundle-0.6.0/src/proofbundle/intoto.py +63 -0
- proofbundle-0.6.0/src/proofbundle/sdjwt_issue.py +119 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0/src/proofbundle.egg-info}/PKG-INFO +43 -8
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/SOURCES.txt +4 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/requires.txt +5 -0
- proofbundle-0.6.0/tests/test_adapters.py +56 -0
- proofbundle-0.6.0/tests/test_intoto.py +63 -0
- proofbundle-0.6.0/tests/test_sdjwt_issue.py +98 -0
- proofbundle-0.4.1/src/proofbundle/adapters/inspect_ai.py +0 -36
- proofbundle-0.4.1/src/proofbundle/adapters/lm_eval.py +0 -32
- proofbundle-0.4.1/tests/test_adapters.py +0 -32
- {proofbundle-0.4.1 → proofbundle-0.6.0}/LICENSE +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/setup.cfg +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/adapters/__init__.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/bundle.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/cli.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/emit.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/errors.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/merkle.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/py.typed +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/sdjwt.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle/signature.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/src/proofbundle.egg-info/top_level.txt +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_bundle.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_cli.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_cli_eval.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_emit.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_eval_claim_schema.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_evalclaim.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_merkle.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_merkle_property.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_rekor_interop.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_rfc6962_external_vectors.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_schema.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_sdjwt_reference.py +0 -0
- {proofbundle-0.4.1 → proofbundle-0.6.0}/tests/test_signature.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -27,6 +27,8 @@ Provides-Extra: sdjwt
|
|
|
27
27
|
Provides-Extra: eval
|
|
28
28
|
Requires-Dist: rfc8785>=0.1.4; extra == "eval"
|
|
29
29
|
Provides-Extra: adapters
|
|
30
|
+
Provides-Extra: inspect
|
|
31
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
|
|
30
32
|
Provides-Extra: dev
|
|
31
33
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
32
34
|
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
@@ -35,6 +37,8 @@ Requires-Dist: mypy>=1.8; extra == "dev"
|
|
|
35
37
|
Requires-Dist: build>=1; extra == "dev"
|
|
36
38
|
Requires-Dist: hypothesis>=6; extra == "dev"
|
|
37
39
|
Requires-Dist: rfc8785>=0.1.4; extra == "dev"
|
|
40
|
+
Requires-Dist: sd-jwt>=0.10; extra == "dev"
|
|
41
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
|
|
38
42
|
Dynamic: license-file
|
|
39
43
|
|
|
40
44
|
<div align="center">
|
|
@@ -51,17 +55,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
|
|
|
51
55
|
selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
|
|
52
56
|
|
|
53
57
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
54
|
-
[](https://pypi.org/project/proofbundle/)
|
|
55
|
-
[](https://pypi.org/project/proofbundle/)
|
|
58
|
+
[](https://pypi.org/project/proofbundle/)
|
|
59
|
+
[](https://pypi.org/project/proofbundle/)
|
|
60
|
+
[](https://pepy.tech/project/proofbundle)
|
|
56
61
|
[](LICENSE)
|
|
57
62
|
[](https://github.com/astral-sh/ruff)
|
|
58
63
|
[](https://slsa.dev)
|
|
64
|
+
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
59
65
|
|
|
60
66
|
</div>
|
|
61
67
|
|
|
62
68
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
63
69
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
64
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
70
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
65
71
|
|
|
66
72
|
## Contents
|
|
67
73
|
|
|
@@ -282,15 +288,44 @@ commitments — it does **not** prove the evaluation was well designed or that t
|
|
|
282
288
|
itself is correct. Those are human judgements; what it removes is the need to simply
|
|
283
289
|
trust the number.
|
|
284
290
|
|
|
291
|
+
### A verification layer for trustworthy eval logs
|
|
292
|
+
|
|
293
|
+
The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
294
|
+
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
295
|
+
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
296
|
+
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
297
|
+
See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
|
|
298
|
+
|
|
299
|
+
- **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
300
|
+
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
301
|
+
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
302
|
+
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
303
|
+
genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
|
|
304
|
+
- **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
|
|
305
|
+
emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
|
|
306
|
+
digest is an *honest salted commitment* under a custom key, never `sha256` (see
|
|
307
|
+
[PREDICATE.md](PREDICATE.md)).
|
|
308
|
+
- **SD-JWT issuance** (RFC 9901) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
|
|
309
|
+
root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
|
|
310
|
+
`threshold` while **withholding the exact score** and the identifier openings. The signed
|
|
311
|
+
bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
|
|
312
|
+
by proofbundle's own verifier **and** the `sd-jwt-python` reference.
|
|
313
|
+
|
|
314
|
+
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
315
|
+
attestation — see [SECURITY.md](SECURITY.md).
|
|
316
|
+
|
|
285
317
|
## Roadmap
|
|
286
318
|
|
|
287
319
|
- **v0.1** — the offline verifier plus a real example bundle.
|
|
288
320
|
- **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
|
|
289
321
|
- **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
|
|
290
|
-
- **v0.4
|
|
291
|
-
|
|
292
|
-
- **v0.5** —
|
|
293
|
-
|
|
322
|
+
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
323
|
+
salted commitments, issuer binding.
|
|
324
|
+
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
325
|
+
- **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
|
|
326
|
+
INTEROP.md, CITATION.cff, PEP 740 attestations documented.
|
|
327
|
+
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
328
|
+
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
294
329
|
|
|
295
330
|
## Contributing
|
|
296
331
|
|
|
@@ -12,17 +12,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
|
|
|
12
12
|
selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
|
|
13
13
|
|
|
14
14
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
15
|
-
[](https://pypi.org/project/proofbundle/)
|
|
16
|
-
[](https://pypi.org/project/proofbundle/)
|
|
15
|
+
[](https://pypi.org/project/proofbundle/)
|
|
16
|
+
[](https://pypi.org/project/proofbundle/)
|
|
17
|
+
[](https://pepy.tech/project/proofbundle)
|
|
17
18
|
[](LICENSE)
|
|
18
19
|
[](https://github.com/astral-sh/ruff)
|
|
19
20
|
[](https://slsa.dev)
|
|
21
|
+
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
20
22
|
|
|
21
23
|
</div>
|
|
22
24
|
|
|
23
25
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
24
26
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
25
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
27
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
26
28
|
|
|
27
29
|
## Contents
|
|
28
30
|
|
|
@@ -243,15 +245,44 @@ commitments — it does **not** prove the evaluation was well designed or that t
|
|
|
243
245
|
itself is correct. Those are human judgements; what it removes is the need to simply
|
|
244
246
|
trust the number.
|
|
245
247
|
|
|
248
|
+
### A verification layer for trustworthy eval logs
|
|
249
|
+
|
|
250
|
+
The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
251
|
+
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
252
|
+
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
253
|
+
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
254
|
+
See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
|
|
255
|
+
|
|
256
|
+
- **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
257
|
+
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
258
|
+
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
259
|
+
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
260
|
+
genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
|
|
261
|
+
- **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
|
|
262
|
+
emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
|
|
263
|
+
digest is an *honest salted commitment* under a custom key, never `sha256` (see
|
|
264
|
+
[PREDICATE.md](PREDICATE.md)).
|
|
265
|
+
- **SD-JWT issuance** (RFC 9901) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
|
|
266
|
+
root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
|
|
267
|
+
`threshold` while **withholding the exact score** and the identifier openings. The signed
|
|
268
|
+
bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
|
|
269
|
+
by proofbundle's own verifier **and** the `sd-jwt-python` reference.
|
|
270
|
+
|
|
271
|
+
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
272
|
+
attestation — see [SECURITY.md](SECURITY.md).
|
|
273
|
+
|
|
246
274
|
## Roadmap
|
|
247
275
|
|
|
248
276
|
- **v0.1** — the offline verifier plus a real example bundle.
|
|
249
277
|
- **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
|
|
250
278
|
- **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
|
|
251
|
-
- **v0.4
|
|
252
|
-
|
|
253
|
-
- **v0.5** —
|
|
254
|
-
|
|
279
|
+
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
280
|
+
salted commitments, issuer binding.
|
|
281
|
+
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
282
|
+
- **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
|
|
283
|
+
INTEROP.md, CITATION.cff, PEP 740 attestations documented.
|
|
284
|
+
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
285
|
+
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
255
286
|
|
|
256
287
|
## Contributing
|
|
257
288
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "proofbundle"
|
|
7
|
-
version = "0.4.1"
|
|
7
|
+
version = "0.6.0"
|
|
8
8
|
description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -43,9 +43,14 @@ sdjwt = []
|
|
|
43
43
|
# path (verify_bundle / decode_eval_claim) never canonicalizes — it checks stored bytes — so the
|
|
44
44
|
# verifier stays dependency-free. `pip install proofbundle[eval]` adds emit-side canonicalization.
|
|
45
45
|
eval = ["rfc8785>=0.1.4"]
|
|
46
|
-
#
|
|
46
|
+
# The lm-eval adapter reads exported results.json (no import) → pure stdlib.
|
|
47
47
|
adapters = []
|
|
48
|
-
|
|
48
|
+
# The inspect_ai adapter uses the STABLE read_eval_log API (lazy import). Pinned with an UPPER bound:
|
|
49
|
+
# the .eval format + pydantic schema change between versions (inspect_ai issue 834), and the fixture
|
|
50
|
+
# test is bound to this range. `pip install "proofbundle[inspect]"`.
|
|
51
|
+
inspect = ["inspect_ai>=0.3.100,<0.4"]
|
|
52
|
+
dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6",
|
|
53
|
+
"rfc8785>=0.1.4", "sd-jwt>=0.10", "inspect_ai>=0.3.100,<0.4"]
|
|
49
54
|
|
|
50
55
|
[project.urls]
|
|
51
56
|
Homepage = "https://b7n0de.com"
|
|
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
|
|
|
13
13
|
from .errors import Check, ProofBundleError, VerificationResult
|
|
14
14
|
from .merkle import verify_consistency, verify_inclusion
|
|
15
15
|
|
|
16
|
-
__version__ = "0.4.1"
|
|
16
|
+
__version__ = "0.6.0"
|
|
17
17
|
|
|
18
18
|
__all__ = [
|
|
19
19
|
"__version__",
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Adapter for UK AISI inspect_ai eval logs — via the STABLE API, optional extra `proofbundle[inspect]`.
|
|
2
|
+
|
|
3
|
+
Unlike the v0.4 file-based reader, this uses the stable `inspect_ai.log.read_eval_log(path,
|
|
4
|
+
header_only=True)` API (the `.eval` on-disk format + its pydantic schema change between versions, see
|
|
5
|
+
inspect_ai issue 834; the stable API is robust). inspect_ai is imported LAZILY inside the function, so
|
|
6
|
+
the proofbundle core stays dependency-free — only `pip install "proofbundle[inspect]"` pulls it.
|
|
7
|
+
|
|
8
|
+
Object model (inspect_ai): `log.eval.task` is the suite; `log.results.scores` is a list of EvalScore;
|
|
9
|
+
`EvalScore.metrics` is a dict name→EvalMetric; `EvalMetric.value` is the number. threshold, comparator
|
|
10
|
+
and thus `passed` are set by proofbundle, NOT read from the log. model_id/dataset_id become salted
|
|
11
|
+
commitments (never plaintext in the payload).
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from ..evalclaim import build_eval_claim
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class InspectAdapterError(RuntimeError):
    """Single error type for the inspect_ai adapter.

    Signals that inspect_ai is not installed or that the eval log does not have
    the structure the adapter expects, so callers never see a bare AttributeError.
    """
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, timestamp: str,
                        model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
    """Build an eval claim for `metric` from an inspect_ai eval log.

    The log header is loaded with `inspect_ai.log.read_eval_log(..., header_only=True)`;
    inspect_ai is imported at call time, not at module import.

    Args:
        path: Location of the eval log; passed to the reader as `str(path)`.
        metric: Name looked up in each score's `metrics` mapping; the first score
            that contains it supplies the value.
        comparator, threshold, timestamp: Forwarded unchanged to `build_eval_claim`.
        model_salt, dataset_salt: Optional salts forwarded to `build_eval_claim`.

    Returns:
        The `(claim, salts)` result of `build_eval_claim`.

    Raises:
        InspectAdapterError: inspect_ai cannot be imported, the log cannot be read,
            the log has no `.eval` / `.results`, or no value is found for `metric`.
    """
    try:
        from inspect_ai.log import read_eval_log  # noqa: PLC0415 — lazy: keeps the core dependency-free
    except ImportError as exc:
        raise InspectAdapterError(
            "inspect_ai is required for this adapter — install with: pip install \"proofbundle[inspect]\"") from exc

    try:
        log = read_eval_log(str(path), header_only=True)
    except Exception as exc:  # noqa: BLE001 — any read/parse failure becomes one clear adapter error
        raise InspectAdapterError(f"could not read inspect_ai log {path!r}: {exc}") from exc

    header = getattr(log, "eval", None)
    results = getattr(log, "results", None)
    if header is None or results is None:
        raise InspectAdapterError("inspect_ai log missing .eval or .results (empty or malformed log)")

    # First score whose metrics mapping contains `metric` wins; later scores are not consulted.
    metric_maps = (
        (getattr(score, "metrics", None) or {})
        for score in (getattr(results, "scores", None) or [])
    )
    entry = next((mapping[metric] for mapping in metric_maps if metric in mapping), None)
    value = getattr(entry, "value", None)
    if value is None:
        raise InspectAdapterError(f"metric {metric!r} not found in any score.metrics of the log")

    suite = str(getattr(header, "task", "inspect_ai"))
    model_id = str(getattr(header, "model", "unknown"))
    # Fall back to the suite name when the log carries no dataset name.
    dataset_name = getattr(getattr(header, "dataset", None), "name", None)
    dataset_id = str(dataset_name or suite)
    suite_version = str(getattr(header, "task_version", "1"))
    sample_count = int(getattr(results, "total_samples", 0) or 0)

    return build_eval_claim(
        suite=suite,
        suite_version=suite_version,
        metric=metric,
        comparator=comparator,
        threshold=threshold,
        score=repr(value),
        n=sample_count,
        model_id=model_id,
        dataset_id=dataset_id,
        issuer="",
        timestamp=timestamp,
        model_salt=model_salt,
        dataset_salt=dataset_salt,
    )
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Adapter for EleutherAI lm-evaluation-harness results_*.json (file-based, NO lm_eval import).
|
|
2
|
+
|
|
3
|
+
Parses the exported result JSON only — no runtime dependency on lm_eval, no runner rebuild.
|
|
4
|
+
|
|
5
|
+
Real 0.4.x format (validated against a genuine harness run, see tests/fixtures/lm_eval_arc_easy_real.json):
|
|
6
|
+
the metric keys carry a *filter suffix*, e.g. `"acc,none"`, and the standard error is a **sibling** key
|
|
7
|
+
`"acc_stderr,none"` (not nested). So a caller asking for metric `"acc"` is matched against `"acc,none"`
|
|
8
|
+
(or `"acc,<filter>"`). Provenance (git_hash, harness/task version, n-shot) is copied into the receipt's
|
|
9
|
+
optional `provenance` field so a verifier can trace exactly which run produced it.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from ..evalclaim import build_eval_claim
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _find_metric(res: dict, metric: str):
|
|
21
|
+
"""Return (value, stderr, matched_key) for `metric`, handling the `metric,<filter>` suffix format.
|
|
22
|
+
|
|
23
|
+
Prefers an exact `metric` key, then `metric,none`, then any `metric,<filter>`. The stderr sibling is
|
|
24
|
+
`metric_stderr,<same filter>`."""
|
|
25
|
+
if metric in res: # bare key (older/simple exports)
|
|
26
|
+
stderr = res.get(f"{metric}_stderr")
|
|
27
|
+
return res[metric], stderr, metric
|
|
28
|
+
if f"{metric},none" in res:
|
|
29
|
+
return res[f"{metric},none"], res.get(f"{metric}_stderr,none"), f"{metric},none"
|
|
30
|
+
for key in res: # any filter, e.g. metric,custom-filter
|
|
31
|
+
if key == metric or (key.startswith(f"{metric},") and not key.startswith(f"{metric}_stderr")):
|
|
32
|
+
flt = key.split(",", 1)[1] if "," in key else "none"
|
|
33
|
+
return res[key], res.get(f"{metric}_stderr,{flt}"), key
|
|
34
|
+
return None, None, None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
                         timestamp: str, model_salt: Optional[bytes] = None,
                         dataset_salt: Optional[bytes] = None):
    """Read an lm-evaluation-harness results_*.json and build an eval claim for `task`/`metric`.

    Args:
        path: Path of the exported results JSON (read as UTF-8).
        task: Key under the file's top-level `results` mapping.
        metric: Bare metric name (e.g. "acc"); the stored key may be "acc,none" or
            "acc,<filter>" and is resolved by `_find_metric`.
        comparator, threshold, timestamp: Forwarded unchanged to `build_eval_claim`.
        model_salt, dataset_salt: Optional salts forwarded to `build_eval_claim`.

    Returns:
        The `(claim, salts)` result of `build_eval_claim`. The score is passed as a
        string: a string value is used as-is, anything else is converted with `repr()`.

    Raises:
        ValueError: `task` is not present in `results`, or `metric` has no value there.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))

    # `or {}` also covers a section that is present but JSON null.
    res = (data.get("results") or {}).get(task)
    if res is None:
        raise ValueError(f"task not found in results: {task!r}")
    value, stderr, matched = _find_metric(res, metric)
    if value is None:
        raise ValueError(f"metric {metric!r} not found in results[{task!r}] "
                         f"(available: {sorted(k for k in res if ',' in k)})")
    score = value if isinstance(value, str) else repr(value)

    # Sample count: prefer the effective count, then the original count, then `sample_len`.
    n_samples = (data.get("n-samples") or {}).get(task) or {}
    n = int(n_samples.get("effective") or n_samples.get("original") or res.get("sample_len") or 0)

    cfg = data.get("config") or {}
    model_id = str(cfg.get("model_name") or cfg.get("model") or "unknown")
    if cfg.get("model_args"):
        model_id = f"{model_id}::{cfg['model_args']}"  # include args so the commitment pins the exact model

    task_version = (data.get("versions") or {}).get(task)
    n_shot = (data.get("n-shot") or {}).get(task)

    provenance = {"harness": "lm-evaluation-harness", "matched_metric_key": matched}
    if data.get("git_hash"):
        provenance["git_hash"] = str(data["git_hash"])
    if task_version is not None:
        provenance["task_version"] = str(task_version)
    if n_shot is not None:
        provenance["n_shot"] = str(n_shot)
    if stderr is not None:
        provenance["stderr"] = stderr if isinstance(stderr, str) else repr(stderr)

    # A missing OR null task version falls back to "lm-eval" (never the literal "None"),
    # matching how the provenance block above treats null as absent.
    suite_version = "lm-eval" if task_version is None else str(task_version)

    return build_eval_claim(
        suite=task, suite_version=suite_version,
        metric=metric, comparator=comparator, threshold=threshold, score=score, n=n,
        model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
        provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
|
|
@@ -37,7 +37,7 @@ _MAX_SAFE_INT = 2 ** 53 - 1
|
|
|
37
37
|
# The exact key set of an eval claim; decode/validate reject anything else.
|
|
38
38
|
_REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
|
|
39
39
|
"passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
|
|
40
|
-
_OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256"}
|
|
40
|
+
_OPTIONAL = {"context_binding", "ci95", "multiple_testing", "prereg_sha256", "provenance"}
|
|
41
41
|
|
|
42
42
|
__all__ = [
|
|
43
43
|
"EVAL_CLAIM_SCHEMA", "COMMIT_ALG", "canonicalize", "build_eval_claim",
|
|
@@ -126,7 +126,7 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
|
|
|
126
126
|
threshold: str, score: str, n: int, model_id: str, dataset_id: str,
|
|
127
127
|
issuer: str, timestamp: str, context_binding: Optional[str] = None,
|
|
128
128
|
ci95: Optional[Sequence[str]] = None, multiple_testing: Optional[str] = None,
|
|
129
|
-
prereg_sha256: Optional[str] = None,
|
|
129
|
+
prereg_sha256: Optional[str] = None, provenance: Optional[dict] = None,
|
|
130
130
|
model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
|
|
131
131
|
"""Build a valid eval claim from raw values. Computes `passed` ITSELF from the comparator
|
|
132
132
|
(never trusts the caller), creates salted commitments, and returns (claim, salts) with the
|
|
@@ -163,6 +163,8 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
|
|
|
163
163
|
claim["multiple_testing"] = multiple_testing
|
|
164
164
|
if prereg_sha256 is not None:
|
|
165
165
|
claim["prereg_sha256"] = prereg_sha256
|
|
166
|
+
if provenance is not None:
|
|
167
|
+
claim["provenance"] = provenance
|
|
166
168
|
_reject_non_jcs(claim)
|
|
167
169
|
return claim, {"model_salt": m_salt, "dataset_salt": d_salt}
|
|
168
170
|
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""in-toto Statement v1 view of an eval receipt (self-hosted predicate type).
|
|
2
|
+
|
|
3
|
+
A self-hosted `predicateType` URI is fully in-toto-spec-conform and the right choice for a solo v0.x
|
|
4
|
+
(no official in-toto/attestation PR needed). See PREDICATE.md.
|
|
5
|
+
|
|
6
|
+
HONESTY (important): the `subject.digest` here is a SALTED COMMITMENT to the model identifier, NOT the
|
|
7
|
+
content hash of an artifact. Placing it under the standard `sha256` key would suggest an artifact hash
|
|
8
|
+
and mislead generic in-toto verifiers. in-toto permits arbitrary digest keys, so we use a unique custom
|
|
9
|
+
key `proofbundleModelCommitV1`; the `subject.name` is the descriptive `model-id-commitment`; and the
|
|
10
|
+
predicate mirrors the note in `subject_digest_note`. Full artifact digests come only once a model artifact
|
|
11
|
+
exists (deferred, see the roadmap).
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
STATEMENT_TYPE = "https://in-toto.io/Statement/v1"
|
|
18
|
+
PREDICATE_TYPE = "https://b7n0de.com/proofbundle/eval-receipt/v0.1"
|
|
19
|
+
VERIFIER_ID = "https://b7n0de.com/proofbundle"
|
|
20
|
+
MODEL_COMMIT_DIGEST_KEY = "proofbundleModelCommitV1"
|
|
21
|
+
|
|
22
|
+
_SUBJECT_DIGEST_NOTE = (
|
|
23
|
+
"subject.digest is a salted commitment to the model identifier (key "
|
|
24
|
+
f"{MODEL_COMMIT_DIGEST_KEY}), NOT an artifact content hash — do not treat it as sha256.")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _commit_hex(commit: str) -> str:
|
|
28
|
+
"""Extract the hex of a `sha256:<hex>` salted commitment (the value that goes into the digest)."""
|
|
29
|
+
return commit.split(":", 1)[1] if ":" in commit else commit
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def to_intoto_statement(claim: dict, *, root_b64: Optional[str] = None,
                        harness: Optional[dict] = None) -> dict:
    """Render an eval claim as an in-toto Statement v1 dictionary.

    Args:
        claim: Eval claim; `model_id_commit`, `timestamp`, `suite`, `metric`,
            `comparator`, `threshold` and `passed` are read by key, and
            `dataset_id_commit` via `.get()`.
        root_b64: When truthy, added under `predicate.receipt` to tie the statement
            to the signed bundle's merkle root.
        harness: When truthy, stored as-is under `predicate.harness`
            (e.g. {"name": "inspect_ai", "version": "0.3.217"}).

    Returns:
        The statement dict. Its single subject carries the model commitment under
        the custom digest key `MODEL_COMMIT_DIGEST_KEY`, never under `sha256`.
    """
    subject = {
        "name": "model-id-commitment",
        "digest": {MODEL_COMMIT_DIGEST_KEY: _commit_hex(claim["model_id_commit"])},
    }

    predicate = {
        "verifier": {"id": VERIFIER_ID},
        "evaluatedAt": claim["timestamp"],
        "suite": claim["suite"],
        "claims": [
            {field: claim[field] for field in ("metric", "comparator", "threshold", "passed")},
        ],
        "datasetCommit": claim.get("dataset_id_commit"),
        "subject_digest_note": _SUBJECT_DIGEST_NOTE,
    }
    # Optional sections are appended only when the caller supplied them.
    if harness:
        predicate["harness"] = harness
    if root_b64:
        predicate["receipt"] = {"schema": "proofbundle/v0.1", "root_b64": root_b64}

    return {
        "_type": STATEMENT_TYPE,
        "subject": [subject],
        "predicateType": PREDICATE_TYPE,
        "predicate": predicate,
    }
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""SD-JWT issuance per RFC 9901 — the differentiation feature (v0.5).
|
|
2
|
+
|
|
3
|
+
Issue an eval receipt so a holder can disclose `passed` + `threshold` while WITHHOLDING the exact score
|
|
4
|
+
and the identifier openings. The existing verifier (proofbundle.sdjwt) stays; this adds issuance.
|
|
5
|
+
|
|
6
|
+
Source of truth: the signed canonical bundle payload (evalclaim) is the ONLY truth. This SD-JWT is a
|
|
7
|
+
derived view — its always-open claims are copied bit-exact from that payload, and it binds the bundle
|
|
8
|
+
anchor via `receipt.root_b64`. Sign the SD-JWT with the SAME Ed25519 key that signed the bundle (matching
|
|
9
|
+
the `issuer` field). A holder cannot lift a claim under a different key.
|
|
10
|
+
|
|
11
|
+
Always-open (plaintext JWT claims, NEVER a disclosure): passed, threshold, comparator, suite, issuer,
|
|
12
|
+
receipt.root_b64. Selectively-disclosable (via `_sd` + disclosures): the exact metric value, ci95, and
|
|
13
|
+
the identifier-commitment openings (identifier + salt).
|
|
14
|
+
|
|
15
|
+
RFC 9901 §4.2.4.1 digest byte-chain (the subtle, load-bearing detail): for each disclosable field, a
|
|
16
|
+
CSPRNG salt of ≥128 bit (base64url); the disclosure is base64url(UTF-8(JSON array [salt, name, value]));
|
|
17
|
+
the digest placed in `_sd` is **base64url(SHA-256(ASCII bytes of the base64url-ENCODED disclosure
|
|
18
|
+
string)))** — hashed over the ENCODED string, NOT over the JSON bytes. `_sd_alg` = "sha-256" at the top
|
|
19
|
+
level. The JWT is signed with EdDSA. Compact form is tilde-separated: JWT~disclosure1~...~ (trailing ~).
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import base64
|
|
24
|
+
import hashlib
|
|
25
|
+
import json
|
|
26
|
+
import os
|
|
27
|
+
from typing import Optional, Sequence
|
|
28
|
+
|
|
29
|
+
from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey
|
|
30
|
+
from cryptography.hazmat.primitives.serialization import Encoding, PublicFormat
|
|
31
|
+
|
|
32
|
+
SD_ALG = "sha-256"
|
|
33
|
+
_SALT_BYTES = 16 # 128 bit
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _b64url(data: bytes) -> str:
|
|
37
|
+
return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _make_disclosure(name: str, value, salt_b64: str) -> tuple[str, str]:
    """Build one disclosure and its `_sd` digest per RFC 9901 §4.2.4.1.

    Returns ``(disclosure_b64url, digest_b64url)``. The disclosure is the base64url
    encoding of the UTF-8 JSON array ``[salt, name, value]``; the digest is SHA-256 over
    the ASCII bytes of that base64url-ENCODED string (not over the JSON bytes).
    """
    encoded = _b64url(json.dumps([salt_b64, name, value]).encode("utf-8"))
    hashed = hashlib.sha256(encoded.encode("ascii"))
    return encoded, _b64url(hashed.digest())
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def issue_sd_jwt(claim: dict, signer: Ed25519PrivateKey, *, root_b64: str,
                 exact_score: Optional[str] = None, ci95: Optional[Sequence[str]] = None,
                 model_id_opening: Optional[Sequence] = None,
                 dataset_id_opening: Optional[Sequence] = None) -> str:
    """Issue a compact SD-JWT for the eval claim, signed with `signer` (must match claim['issuer']).

    Openings are (identifier, salt_hex) pairs the issuer may later reveal; `exact_score`/`ci95` are the
    withheld numeric detail. All extras are selectively-disclosable; the pass/threshold facts are open.

    Args:
        claim: eval claim; its "passed", "threshold", "comparator", "suite" and "issuer" entries are
            copied verbatim into the plaintext JWT payload.
        signer: key whose ``sign(bytes)`` produces the JWT signature. This function does not itself
            check that the key matches ``claim["issuer"]``; ``issuer_matches`` does that check.
        root_b64: bundle anchor, stored in the open payload as ``receipt.root_b64``.
        exact_score, ci95, model_id_opening, dataset_id_opening: optional values; each one that is
            not None becomes one salted disclosure whose digest is listed in ``_sd``.

    Returns:
        The compact serialization ``JWT~disclosure1~...~`` (always with a trailing ``~``).

    Raises:
        KeyError: if `claim` lacks one of the always-open keys.
    """
    always_open = {
        "passed": claim["passed"], "threshold": claim["threshold"],
        "comparator": claim["comparator"], "suite": claim["suite"],
        "issuer": claim["issuer"], "receipt": {"root_b64": root_b64},
    }
    disclosures: list[str] = []
    sd_digests: list[str] = []

    def _add(name: str, value):
        # Every disclosure gets its own fresh random salt.
        d, dig = _make_disclosure(name, value, _b64url(os.urandom(_SALT_BYTES)))
        disclosures.append(d)
        sd_digests.append(dig)

    if exact_score is not None:
        _add("exact_score", exact_score)
    if ci95 is not None:
        _add("ci95", list(ci95))
    if model_id_opening is not None:
        _add("model_id_opening", list(model_id_opening))
    if dataset_id_opening is not None:
        _add("dataset_id_opening", list(dataset_id_opening))

    payload = dict(always_open)
    if sd_digests:
        # RFC 9901 §4.2.4.1: the issuer must hide the original claim order in `_sd`. The digests are
        # collected in a fixed field order, so their position would reveal which hidden claim each
        # digest belongs to. Sorting removes that dependency; verifiers match digests by value.
        payload["_sd"] = sorted(sd_digests)
    payload["_sd_alg"] = SD_ALG

    header = {"alg": "EdDSA", "typ": "sd-jwt"}
    signing_input = _b64url(json.dumps(header).encode("utf-8")) + "." + _b64url(json.dumps(payload).encode("utf-8"))
    signature = signer.sign(signing_input.encode("ascii"))
    jwt = signing_input + "." + _b64url(signature)

    # compact: JWT ~ disclosure1 ~ ... ~ (trailing tilde, no key-binding JWT in v0.5)
    return "~".join([jwt, *disclosures]) + "~"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def issuer_matches(claim: dict, signer: Ed25519PrivateKey) -> bool:
    """Report whether ``claim["issuer"]`` names the signer's public key (bundle and SD-JWT share one key).

    The expected fingerprint is ``"ed25519:"`` followed by the standard (padded) base64 of the
    signer's raw public-key bytes. A claim without an "issuer" entry never matches.
    """
    public_raw = signer.public_key().public_bytes(Encoding.Raw, PublicFormat.Raw)
    expected = "ed25519:" + base64.b64encode(public_raw).decode("ascii")
    return claim.get("issuer") == expected
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _jwt_payload(compact: str) -> dict:
    """Decode the always-open JWT payload of a compact SD-JWT (the part before the first '~').

    Raises:
        IndexError: if the JWT part has no '.'-separated payload segment.
        ValueError: if the segment is not valid base64 / UTF-8 / JSON, or if the decoded JSON
            value is not an object (a payload must be a JSON object to carry claims).
    """
    jwt = compact.split("~", 1)[0]
    payload_b64 = jwt.split(".")[1]
    # Restore the '=' padding that base64url encoding stripped.
    padded = payload_b64 + "=" * (-len(payload_b64) % 4)
    payload = json.loads(base64.urlsafe_b64decode(padded).decode("utf-8"))
    if not isinstance(payload, dict):
        raise ValueError("SD-JWT payload is not a JSON object")
    return payload


def check_binds_bundle(compact: str, claim: dict, root_b64: str) -> bool:
    """No-Fake binding: the SD-JWT's always-open claims MUST match the signed bundle payload bit-exact and
    bind its merkle root. A derived SD-JWT that diverges from its bundle source of truth is rejected.

    Returns False (never raises) for a token that cannot be decoded, whose payload is not a JSON
    object, or whose ``receipt`` is not an object. This function only compares payload fields; it
    does not verify the JWT signature.

    Raises:
        KeyError: if `claim` itself lacks one of the compared keys.
    """
    try:
        p = _jwt_payload(compact)
    except (ValueError, KeyError, IndexError):
        return False
    # A non-object `receipt` (e.g. a string) cannot bind a root; treat it as absent instead of crashing.
    receipt = p.get("receipt")
    bound_root = receipt.get("root_b64") if isinstance(receipt, dict) else None
    return (p.get("passed") == claim["passed"] and p.get("threshold") == claim["threshold"]
            and p.get("comparator") == claim["comparator"] and p.get("suite") == claim["suite"]
            and p.get("issuer") == claim["issuer"]
            and bound_root == root_b64)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -27,6 +27,8 @@ Provides-Extra: sdjwt
|
|
|
27
27
|
Provides-Extra: eval
|
|
28
28
|
Requires-Dist: rfc8785>=0.1.4; extra == "eval"
|
|
29
29
|
Provides-Extra: adapters
|
|
30
|
+
Provides-Extra: inspect
|
|
31
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
|
|
30
32
|
Provides-Extra: dev
|
|
31
33
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
32
34
|
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
@@ -35,6 +37,8 @@ Requires-Dist: mypy>=1.8; extra == "dev"
|
|
|
35
37
|
Requires-Dist: build>=1; extra == "dev"
|
|
36
38
|
Requires-Dist: hypothesis>=6; extra == "dev"
|
|
37
39
|
Requires-Dist: rfc8785>=0.1.4; extra == "dev"
|
|
40
|
+
Requires-Dist: sd-jwt>=0.10; extra == "dev"
|
|
41
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
|
|
38
42
|
Dynamic: license-file
|
|
39
43
|
|
|
40
44
|
<div align="center">
|
|
@@ -51,17 +55,19 @@ signed and anchored in a tamper-evident log — and optionally carries a
|
|
|
51
55
|
selectively disclosable credential. Pure Python, no server, no daemon, one JSON file.**
|
|
52
56
|
|
|
53
57
|
[](https://github.com/b7n0de/proofbundle/actions/workflows/ci.yml)
|
|
54
|
-
[](https://pypi.org/project/proofbundle/)
|
|
55
|
-
[](https://pypi.org/project/proofbundle/)
|
|
58
|
+
[](https://pypi.org/project/proofbundle/)
|
|
59
|
+
[](https://pypi.org/project/proofbundle/)
|
|
60
|
+
[](https://pepy.tech/project/proofbundle)
|
|
56
61
|
[](LICENSE)
|
|
57
62
|
[](https://github.com/astral-sh/ruff)
|
|
58
63
|
[](https://slsa.dev)
|
|
64
|
+
[-D6248A.svg)](https://pypi.org/project/proofbundle/)
|
|
59
65
|
|
|
60
66
|
</div>
|
|
61
67
|
|
|
62
68
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
63
69
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
64
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto.
|
|
70
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
|
|
65
71
|
|
|
66
72
|
## Contents
|
|
67
73
|
|
|
@@ -282,15 +288,44 @@ commitments — it does **not** prove the evaluation was well designed or that t
|
|
|
282
288
|
itself is correct. Those are human judgements; what it removes is the need to simply
|
|
283
289
|
trust the number.
|
|
284
290
|
|
|
291
|
+
### A verification layer for trustworthy eval logs
|
|
292
|
+
|
|
293
|
+
The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
|
|
294
|
+
a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
|
|
295
|
+
missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
|
|
296
|
+
aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
|
|
297
|
+
See [INTEROP.md](INTEROP.md) for how it maps to OpenSSF Model Signing, CycloneDX ML-BOM, and in-toto.
|
|
298
|
+
|
|
299
|
+
- **Two framework adapters** — `pip install "proofbundle[inspect]"` reads a UK AISI
|
|
300
|
+
[inspect_ai](https://github.com/UKGovernmentBEIS/inspect_ai) eval log via the stable `read_eval_log`
|
|
301
|
+
API (lazy import). `proofbundle.adapters.from_lm_eval_results` reads a real EleutherAI
|
|
302
|
+
[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) `results_*.json` (the
|
|
303
|
+
genuine `acc,none` filter-suffix format) and captures run provenance — no framework import either way.
|
|
304
|
+
- **in-toto Statement v1** — `proofbundle.intoto.to_intoto_statement(claim, root_b64=…)`
|
|
305
|
+
emits the receipt as an in-toto statement with a self-hosted predicate type. The subject
|
|
306
|
+
digest is an *honest salted commitment* under a custom key, never `sha256` (see
|
|
307
|
+
[PREDICATE.md](PREDICATE.md)).
|
|
308
|
+
- **SD-JWT issuance** (RFC 9901) — `proofbundle.sdjwt_issue.issue_sd_jwt(claim, signer,
|
|
309
|
+
root_b64=…, exact_score=…)` issues the receipt so a holder can disclose `passed` +
|
|
310
|
+
`threshold` while **withholding the exact score** and the identifier openings. The signed
|
|
311
|
+
bundle payload is the source of truth; the SD-JWT is a derived, bundle-bound view, verified
|
|
312
|
+
by proofbundle's own verifier **and** the `sd-jwt-python` reference.
|
|
313
|
+
|
|
314
|
+
Every release ships **PEP 740 attestations** (Trusted Publishing) + an SLSA build-provenance
|
|
315
|
+
attestation — see [SECURITY.md](SECURITY.md).
|
|
316
|
+
|
|
285
317
|
## Roadmap
|
|
286
318
|
|
|
287
319
|
- **v0.1** — the offline verifier plus a real example bundle.
|
|
288
320
|
- **v0.2** — the emitter: `emit_bundle` / `proofbundle emit`.
|
|
289
321
|
- **v0.3** — external RFC 6962 conformance vectors + real Sigstore Rekor interop.
|
|
290
|
-
- **v0.4
|
|
291
|
-
|
|
292
|
-
- **v0.5** —
|
|
293
|
-
|
|
322
|
+
- **v0.4** — the eval-receipt emitter (`emit_eval_receipt` / `proofbundle emit-eval`),
|
|
323
|
+
salted commitments, issuer binding.
|
|
324
|
+
- **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
|
|
325
|
+
- **v0.6 (current release)** — a second eval adapter (lm-evaluation-harness, real format + provenance),
|
|
326
|
+
INTEROP.md, CITATION.cff, PEP 740 attestations documented.
|
|
327
|
+
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
328
|
+
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
294
329
|
|
|
295
330
|
## Contributing
|
|
296
331
|
|
|
@@ -7,9 +7,11 @@ src/proofbundle/cli.py
|
|
|
7
7
|
src/proofbundle/emit.py
|
|
8
8
|
src/proofbundle/errors.py
|
|
9
9
|
src/proofbundle/evalclaim.py
|
|
10
|
+
src/proofbundle/intoto.py
|
|
10
11
|
src/proofbundle/merkle.py
|
|
11
12
|
src/proofbundle/py.typed
|
|
12
13
|
src/proofbundle/sdjwt.py
|
|
14
|
+
src/proofbundle/sdjwt_issue.py
|
|
13
15
|
src/proofbundle/signature.py
|
|
14
16
|
src/proofbundle.egg-info/PKG-INFO
|
|
15
17
|
src/proofbundle.egg-info/SOURCES.txt
|
|
@@ -27,10 +29,12 @@ tests/test_cli_eval.py
|
|
|
27
29
|
tests/test_emit.py
|
|
28
30
|
tests/test_eval_claim_schema.py
|
|
29
31
|
tests/test_evalclaim.py
|
|
32
|
+
tests/test_intoto.py
|
|
30
33
|
tests/test_merkle.py
|
|
31
34
|
tests/test_merkle_property.py
|
|
32
35
|
tests/test_rekor_interop.py
|
|
33
36
|
tests/test_rfc6962_external_vectors.py
|
|
34
37
|
tests/test_schema.py
|
|
38
|
+
tests/test_sdjwt_issue.py
|
|
35
39
|
tests/test_sdjwt_reference.py
|
|
36
40
|
tests/test_signature.py
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Adapters map real exported eval logs to a valid claim (lm-eval: file-based JSON; inspect_ai: via the optional inspect_ai.log API)."""
|
|
2
|
+
import unittest
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from proofbundle.adapters import from_inspect_ai_log, from_lm_eval_results
|
|
6
|
+
|
|
7
|
+
FX = Path(__file__).resolve().parent / "fixtures"
|
|
8
|
+
TS = "2026-07-01T12:00:00Z"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TestAdapters(unittest.TestCase):
    """Adapter tests run against exported fixtures: an lm-evaluation-harness JSON and an inspect_ai .eval log."""

    def test_lm_eval_real_acc_none_format(self):
        # REAL lm-evaluation-harness 0.4.12 export: metric key is "acc,none", stderr sibling "acc_stderr,none".
        fixture = FX / "lm_eval_arc_easy_real.json"
        claim, _salts = from_lm_eval_results(
            fixture, "arc_easy", "acc",
            comparator=">=", threshold="0.30", timestamp=TS,
            model_salt=b"0" * 16, dataset_salt=b"1" * 16,
        )
        self.assertEqual(claim["suite"], "arc_easy")
        self.assertTrue(claim["passed"])  # acc 0.5 >= 0.30
        provenance = claim["provenance"]
        self.assertEqual(provenance["matched_metric_key"], "acc,none")  # suffix handled
        self.assertIn("git_hash", provenance)  # provenance captured
        self.assertEqual(provenance["n_shot"], "0")
        self.assertIn("stderr", provenance)  # sibling stderr, not nested

    def test_lm_eval_missing_metric_lists_available(self):
        fixture = FX / "lm_eval_arc_easy_real.json"
        with self.assertRaises(ValueError):
            from_lm_eval_results(
                fixture, "arc_easy", "nonexistent",
                comparator=">=", threshold="0.5", timestamp=TS,
                model_salt=b"0" * 16, dataset_salt=b"1" * 16,
            )

    def test_inspect_ai_stable_api(self):
        # Real .eval log fixture, read via the stable inspect_ai.log.read_eval_log API (proofbundle[inspect]).
        try:
            import inspect_ai.log  # noqa: F401
        except ImportError:
            self.skipTest("inspect_ai not installed (pip install proofbundle[inspect])")
        log_path = FX / "inspect_logs" / "safety_refusal_demo.eval"
        claim, _salts = from_inspect_ai_log(
            log_path, "accuracy",
            comparator=">=", threshold="0.00", timestamp=TS,
            model_salt=b"0" * 16, dataset_salt=b"1" * 16,
        )
        self.assertEqual(claim["suite"], "safety_refusal_demo")
        self.assertTrue(claim["passed"])  # accuracy 0.0 >= 0.00
        self.assertNotIn("mockllm/model", str(claim))  # model id only as salted commitment

    def test_inspect_ai_missing_metric_clear_error(self):
        from proofbundle.adapters.inspect_ai import InspectAdapterError
        try:
            import inspect_ai.log  # noqa: F401
        except ImportError:
            self.skipTest("inspect_ai not installed")
        log_path = FX / "inspect_logs" / "safety_refusal_demo.eval"
        with self.assertRaises(InspectAdapterError):
            from_inspect_ai_log(
                log_path, "nonexistent_metric",
                comparator=">=", threshold="0.5", timestamp=TS,
                model_salt=b"0" * 16, dataset_salt=b"1" * 16,
            )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
if __name__ == "__main__":
|
|
56
|
+
unittest.main()
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""in-toto Statement v1 view of an eval receipt — structurally valid + honest salted-commitment digest."""
|
|
2
|
+
import json
|
|
3
|
+
import unittest
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
import jsonschema
|
|
8
|
+
except ImportError:
|
|
9
|
+
jsonschema = None
|
|
10
|
+
|
|
11
|
+
from proofbundle.emit import generate_signer
|
|
12
|
+
from proofbundle.evalclaim import build_eval_claim, issuer_fingerprint
|
|
13
|
+
from proofbundle.intoto import MODEL_COMMIT_DIGEST_KEY, PREDICATE_TYPE, to_intoto_statement
|
|
14
|
+
|
|
15
|
+
ROOT = Path(__file__).resolve().parents[1]
|
|
16
|
+
TS = "2026-07-01T12:00:00Z"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _claim():
    """Build a fixed sample eval claim whose issuer is a freshly generated signer; the salts are discarded."""
    signer = generate_signer()
    fields = dict(
        suite="safety-refusal", suite_version="v1", metric="accuracy", comparator=">=",
        threshold="0.65", score="0.92", n=500, model_id="acme/model-x", dataset_id="acme/set",
        issuer=issuer_fingerprint(signer), timestamp=TS,
        model_salt=b"0" * 16, dataset_salt=b"1" * 16,
    )
    claim, _salts = build_eval_claim(**fields)
    return claim
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TestInToto(unittest.TestCase):
    """Checks the in-toto Statement v1 view: structure, the custom digest key, and schema validity."""

    @staticmethod
    def _official_schema():
        # The in-toto Statement v1 JSON schema shipped under <repo>/schemas.
        schema_path = ROOT / "schemas" / "in_toto_statement_v1.schema.json"
        return json.loads(schema_path.read_text(encoding="utf-8"))

    def test_structure(self):
        harness = {"name": "inspect_ai", "version": "0.3.217"}
        stmt = to_intoto_statement(_claim(), root_b64="cm9vdA==", harness=harness)
        self.assertEqual(stmt["_type"], "https://in-toto.io/Statement/v1")
        self.assertEqual(stmt["predicateType"], PREDICATE_TYPE)
        self.assertEqual(len(stmt["subject"]), 1)
        subject = stmt["subject"][0]
        self.assertIn("digest", subject)
        # honest custom digest key, NOT sha256 (would mislead generic verifiers about an artifact hash)
        digest = subject["digest"]
        self.assertIn(MODEL_COMMIT_DIGEST_KEY, digest)
        self.assertNotIn("sha256", digest)
        predicate = stmt["predicate"]
        self.assertIn("salted commitment", predicate["subject_digest_note"])
        self.assertEqual(predicate["receipt"]["root_b64"], "cm9vdA==")

    def test_digest_is_commit_hex(self):
        claim = _claim()
        stmt = to_intoto_statement(claim)
        # The commitment is "<prefix>:<hex>"; the subject digest carries only the hex part.
        _prefix, expected_hex = claim["model_id_commit"].split(":", 1)
        self.assertEqual(stmt["subject"][0]["digest"][MODEL_COMMIT_DIGEST_KEY], expected_hex)

    @unittest.skipIf(jsonschema is None, "jsonschema not installed (pip install proofbundle[dev])")
    def test_validates_against_official_intoto_v1_schema(self):
        schema = self._official_schema()
        stmt = to_intoto_statement(_claim(), root_b64="cm9vdA==")
        jsonschema.validate(instance=stmt, schema=schema)  # raises if invalid

    @unittest.skipIf(jsonschema is None, "jsonschema not installed")
    def test_schema_rejects_missing_subject(self):
        schema = self._official_schema()
        bad = {"_type": "https://in-toto.io/Statement/v1", "predicateType": "x", "subject": []}
        with self.assertRaises(jsonschema.ValidationError):
            jsonschema.validate(instance=bad, schema=schema)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
if __name__ == "__main__":
|
|
63
|
+
unittest.main()
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""SD-JWT issuance (v0.5, RFC 9901) — own verifier + reference interop + red-tests. No-Fake."""
|
|
2
|
+
import base64
|
|
3
|
+
import json
|
|
4
|
+
import unittest
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from cryptography.hazmat.primitives.serialization import Encoding, PublicFormat
|
|
8
|
+
|
|
9
|
+
from proofbundle.emit import generate_signer
|
|
10
|
+
from proofbundle.evalclaim import build_eval_claim, issuer_fingerprint
|
|
11
|
+
from proofbundle.sdjwt import verify_sd_jwt
|
|
12
|
+
from proofbundle.sdjwt_issue import (
|
|
13
|
+
_make_disclosure,
|
|
14
|
+
check_binds_bundle,
|
|
15
|
+
issue_sd_jwt,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
FX = Path(__file__).resolve().parent / "fixtures"
|
|
19
|
+
TS = "2026-07-01T12:00:00Z"
|
|
20
|
+
ROOT_B64 = "cm9vdA=="
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _claim(signer):
    """Build a fixed sample eval claim whose issuer fingerprint is derived from `signer`; the salts are discarded."""
    fields = dict(
        suite="safety", suite_version="v1", metric="accuracy", comparator=">=",
        threshold="0.65", score="0.92", n=500, model_id="acme/model-x", dataset_id="acme/set",
        issuer=issuer_fingerprint(signer), timestamp=TS,
        model_salt=b"0" * 16, dataset_salt=b"1" * 16,
    )
    claim, _salts = build_eval_claim(**fields)
    return claim
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _raw_pub(signer):
    """Return the signer's public key serialized with Encoding.Raw / PublicFormat.Raw."""
    public_key = signer.public_key()
    return public_key.public_bytes(Encoding.Raw, PublicFormat.Raw)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class TestSdJwtIssue(unittest.TestCase):
    """SD-JWT issuance tests: own verifier, reference verifier, digest vector, bundle binding, red tests."""

    @staticmethod
    def _issued(**disclosable):
        # Fresh signer + matching claim + SD-JWT bound to ROOT_B64; returns (signer, claim, compact).
        signer = generate_signer()
        claim = _claim(signer)
        compact = issue_sd_jwt(claim, signer, root_b64=ROOT_B64, **disclosable)
        return signer, claim, compact

    def test_own_verifier_accepts(self):
        signer, _claim_dict, compact = self._issued(exact_score="0.92", ci95=["0.90", "0.94"])
        res = verify_sd_jwt(compact, _raw_pub(signer))
        self.assertTrue(res["structure_ok"], res)
        self.assertTrue(res["sig_ok"], res)

    def test_reference_verifier_accepts(self):
        try:
            from jwcrypto.jwk import JWK
            from sd_jwt.verifier import SDJWTVerifier
        except ImportError:
            self.skipTest("sd-jwt-python not installed (dev extra)")
        signer, _claim_dict, compact = self._issued(exact_score="0.92")
        x_coord = base64.urlsafe_b64encode(_raw_pub(signer)).rstrip(b"=").decode()
        jwk = JWK(kty="OKP", crv="Ed25519", x=x_coord)
        verifier = SDJWTVerifier(compact, lambda *_a, **_k: jwk)
        payload = verifier.get_verified_payload()
        self.assertEqual(payload["passed"], True)  # always-open
        self.assertEqual(payload["exact_score"], "0.92")  # selectively disclosed

    def test_digest_byte_chain_vector(self):
        # RFC 9901 §4.2.4.1: digest over the base64url-ENCODED disclosure string, not the JSON bytes.
        vector_path = FX / "sdjwt_disclosure_vector.json"
        v = json.loads(vector_path.read_text(encoding="utf-8"))
        d_b64, dig = _make_disclosure(v["name"], v["value"], v["salt_b64url"])
        self.assertEqual(d_b64, v["disclosure_b64url"])
        self.assertEqual(dig, v["expected_digest_b64url"])

    def test_always_open_vs_selective(self):
        _signer, _claim_dict, compact = self._issued(exact_score="0.92")
        jwt_part = compact.split("~")[0]
        payload_segment = jwt_part.split(".")[1]
        jwt_payload = json.loads(base64.urlsafe_b64decode(payload_segment + "==").decode("utf-8"))
        # passed/threshold are plaintext; exact_score is NOT (only its digest is in _sd)
        self.assertEqual(jwt_payload["passed"], True)
        self.assertIn("threshold", jwt_payload)
        self.assertNotIn("exact_score", jwt_payload)
        self.assertIn("_sd", jwt_payload)

    def test_binds_bundle(self):
        _signer, claim, compact = self._issued(exact_score="0.92")
        self.assertTrue(check_binds_bundle(compact, claim, ROOT_B64))

    def test_divergence_red(self):  # SD-JWT claims diverge from bundle → rejected
        _signer, claim, compact = self._issued(exact_score="0.92")
        diverged = dict(claim, passed=False)  # bundle says passed=False, SD-JWT says True
        self.assertFalse(check_binds_bundle(compact, diverged, ROOT_B64))
        self.assertFalse(check_binds_bundle(compact, claim, "d3Jvbmc="))  # wrong root

    def test_tamper_disclosure_red(self):  # tampered disclosure → digest mismatch → own verifier fails
        signer, _claim_dict, compact = self._issued(exact_score="0.92")
        jwt = compact.rstrip("~").split("~")[0]
        # A disclosure built with a salt/value the issuer never committed to in _sd.
        tampered_d, _digest = _make_disclosure("exact_score", "0.99", "AAAAAAAAAAAAAAAAAAAAAA")
        tampered = "~".join([jwt, tampered_d]) + "~"
        res = verify_sd_jwt(tampered, _raw_pub(signer))
        accepted = (res.get("structure_ok") and res.get("sig_ok")
                    and "1 disclosure" in res.get("detail", ""))
        self.assertFalse(accepted)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
if __name__ == "__main__":
|
|
98
|
+
unittest.main()
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
"""Adapter for UK AISI inspect_ai eval-log JSON (file-based, no framework import)."""
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
import json
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Optional
|
|
7
|
-
|
|
8
|
-
from ..evalclaim import build_eval_claim
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, timestamp: str,
                        model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
    """Build an eval claim from an inspect_ai eval-log JSON file.

    Expects: {"eval": {"task": ..., "model": ..., "dataset": {"name": ...}},
    "results": {"total_samples": n, "scores": [{"metrics": {metric: {"value": <number>}}}]}}.
    The first score entry whose metrics contain `metric` supplies the value.

    Returns (claim, salts) as produced by ``build_eval_claim``; the issuer is left empty.

    Raises:
        ValueError: if no score entry carries `metric`, or its "value" is missing/None.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    ev = data.get("eval", {})
    results = data.get("results", {})

    # Lazily scan the score entries and stop at the first one that reports the metric.
    metric_sets = (entry.get("metrics", {}) for entry in results.get("scores", []))
    hit = next((metrics for metrics in metric_sets if metric in metrics), None)
    value = hit[metric].get("value") if hit is not None else None
    if value is None:
        raise ValueError(f"metric {metric!r} not found in inspect_ai scores")

    n = int(results.get("total_samples") or 0)
    suite = str(ev.get("task", "inspect_ai"))
    suite_version = str(ev.get("task_version", "1"))
    model_id = str(ev.get("model", "unknown"))
    # The dataset name falls back to the task name, then to "unknown".
    dataset_id = str(ev.get("dataset", {}).get("name", ev.get("task", "unknown")))
    return build_eval_claim(
        suite=suite, suite_version=suite_version,
        metric=metric, comparator=comparator, threshold=threshold, score=repr(value), n=n,
        model_id=model_id, dataset_id=dataset_id,
        issuer="", timestamp=timestamp, model_salt=model_salt, dataset_salt=dataset_salt)
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
"""Adapter for EleutherAI lm-evaluation-harness results.json (file-based, no framework import)."""
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
import json
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Optional
|
|
7
|
-
|
|
8
|
-
from ..evalclaim import build_eval_claim
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def from_lm_eval_results(path, task: str, metric: str, *, comparator: str, threshold: str,
                         timestamp: str, model_salt: Optional[bytes] = None,
                         dataset_salt: Optional[bytes] = None):
    """Build an eval claim for `task`/`metric` from an lm-evaluation-harness results.json.

    Expects the standard shape: {"results": {task: {metric: <number>, ...}, ...},
    "n-samples": {task: {"effective": n}}, "config"/"model_name": ...}. The score is passed on
    as a STRING to avoid float canonicalization issues.

    Returns (claim, salts) as produced by ``build_eval_claim``; the issuer is left empty.

    Raises:
        ValueError: if `task` is absent from "results" or does not report `metric`.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    task_results = data.get("results", {}).get(task)
    if task_results is None or metric not in task_results:
        raise ValueError(f"task/metric not found in results: {task}/{metric}")

    raw_score = task_results[metric]
    score = raw_score if isinstance(raw_score, str) else repr(raw_score)

    # Prefer the "effective" sample count, then "original", then 0.
    counts = data.get("n-samples", {}).get(task, {})
    n = int(counts.get("effective") or counts.get("original") or 0)

    model_id = str(data.get("model_name") or data.get("config", {}).get("model") or "unknown")
    suite_version = str(data.get("config", {}).get("model_source", "lm-eval"))
    return build_eval_claim(
        suite=task, suite_version=suite_version,
        metric=metric, comparator=comparator, threshold=threshold, score=str(score), n=n,
        model_id=model_id, dataset_id=task, issuer="", timestamp=timestamp,
        model_salt=model_salt, dataset_salt=dataset_salt)
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
"""Adapters map real exported eval JSON to a valid claim (file-based, no framework import)."""
|
|
2
|
-
import unittest
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
from proofbundle.adapters import from_inspect_ai_log, from_lm_eval_results
|
|
6
|
-
|
|
7
|
-
FX = Path(__file__).resolve().parent / "fixtures"
|
|
8
|
-
TS = "2026-07-01T12:00:00Z"
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class TestAdapters(unittest.TestCase):
    """Adapter tests run against the JSON fixtures under tests/fixtures."""

    def test_lm_eval(self):
        fixed_salts = {"model_salt": b"0" * 16, "dataset_salt": b"1" * 16}
        claim, _salts = from_lm_eval_results(
            FX / "lm_eval_results.json", "hellaswag", "acc",
            comparator=">=", threshold="0.70", timestamp=TS, **fixed_salts)
        self.assertEqual(claim["suite"], "hellaswag")
        self.assertEqual(claim["threshold"], "0.70")
        self.assertTrue(claim["passed"])  # 0.7534 >= 0.70
        self.assertNotIn("acme/model-x", str(claim))  # id only as salted commitment
        self.assertEqual(claim["n"], 10042)

    def test_inspect_ai(self):
        fixed_salts = {"model_salt": b"0" * 16, "dataset_salt": b"1" * 16}
        claim, _salts = from_inspect_ai_log(
            FX / "inspect_ai_log.json", "accuracy",
            comparator=">=", threshold="0.80", timestamp=TS, **fixed_salts)
        self.assertEqual(claim["suite"], "safety_refusal")
        self.assertTrue(claim["passed"])  # 0.92 >= 0.80
        self.assertEqual(claim["n"], 500)
|
30
|
-
|
|
31
|
-
if __name__ == "__main__":
|
|
32
|
-
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|