proofbundle 0.7.0__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {proofbundle-0.7.0/src/proofbundle.egg-info → proofbundle-0.8.0}/PKG-INFO +47 -14
  2. {proofbundle-0.7.0 → proofbundle-0.8.0}/README.md +44 -11
  3. {proofbundle-0.7.0 → proofbundle-0.8.0}/pyproject.toml +5 -4
  4. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/__init__.py +1 -1
  5. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/adapters/inspect_ai.py +15 -1
  6. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/bundle.py +46 -6
  7. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/evalclaim.py +17 -6
  8. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/intoto.py +17 -16
  9. {proofbundle-0.7.0 → proofbundle-0.8.0/src/proofbundle.egg-info}/PKG-INFO +47 -14
  10. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/SOURCES.txt +2 -0
  11. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/requires.txt +4 -0
  12. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_adapters.py +2 -0
  13. proofbundle-0.8.0/tests/test_bundle_robustness.py +74 -0
  14. proofbundle-0.8.0/tests/test_examples.py +28 -0
  15. {proofbundle-0.7.0 → proofbundle-0.8.0}/LICENSE +0 -0
  16. {proofbundle-0.7.0 → proofbundle-0.8.0}/setup.cfg +0 -0
  17. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/adapters/__init__.py +0 -0
  18. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/adapters/lm_eval.py +0 -0
  19. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/cli.py +0 -0
  20. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/emit.py +0 -0
  21. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/errors.py +0 -0
  22. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/merkle.py +0 -0
  23. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/py.typed +0 -0
  24. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/sdjwt.py +0 -0
  25. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/sdjwt_issue.py +0 -0
  26. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle/signature.py +0 -0
  27. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
  28. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/entry_points.txt +0 -0
  29. {proofbundle-0.7.0 → proofbundle-0.8.0}/src/proofbundle.egg-info/top_level.txt +0 -0
  30. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_bundle.py +0 -0
  31. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_cli.py +0 -0
  32. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_cli_eval.py +0 -0
  33. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_emit.py +0 -0
  34. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_eval_claim_schema.py +0 -0
  35. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_evalclaim.py +0 -0
  36. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_intoto.py +0 -0
  37. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_merkle.py +0 -0
  38. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_merkle_property.py +0 -0
  39. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_rekor_interop.py +0 -0
  40. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_rfc6962_external_vectors.py +0 -0
  41. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_schema.py +0 -0
  42. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_sdjwt_issue.py +0 -0
  43. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_sdjwt_reference.py +0 -0
  44. {proofbundle-0.7.0 → proofbundle-0.8.0}/tests/test_signature.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: eval
28
28
  Requires-Dist: rfc8785>=0.1.4; extra == "eval"
29
29
  Provides-Extra: adapters
30
30
  Provides-Extra: inspect
31
- Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
31
+ Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "inspect"
32
32
  Provides-Extra: dev
33
33
  Requires-Dist: pytest>=7; extra == "dev"
34
34
  Requires-Dist: ruff>=0.5; extra == "dev"
@@ -38,7 +38,7 @@ Requires-Dist: build>=1; extra == "dev"
38
38
  Requires-Dist: hypothesis>=6; extra == "dev"
39
39
  Requires-Dist: rfc8785>=0.1.4; extra == "dev"
40
40
  Requires-Dist: sd-jwt>=0.10; extra == "dev"
41
- Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
41
+ Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "dev"
42
42
  Dynamic: license-file
43
43
 
44
44
  <div align="center">
@@ -62,14 +62,15 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
62
62
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
63
63
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
64
64
  [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
65
- <!-- DOI badge placeholder: Zenodo is linked and archives each release. Add the Zenodo concept-DOI badge
66
- here (and the DOI to CITATION.cff) once Zenodo assigns it — it does not exist at build time. -->
65
+ <!-- DOI badge placeholder: enable Zenodo archiving for this repo, then add the Zenodo concept-DOI
66
+ badge here (and the DOI to CITATION.cff) once Zenodo assigns one on the next release. No DOI has
67
+ been assigned yet (no archived record exists at build time) — tracked in the human checklist. -->
67
68
 
68
69
  </div>
69
70
 
70
71
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
71
72
  verify` checks one self-contained `bundle.json` with three offline cryptographic
72
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
73
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 74 tests.
73
74
 
74
75
  ## Contents
75
76
 
@@ -78,6 +79,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
78
79
  - [How it fits together](#how-it-fits-together)
79
80
  - [Install](#install)
80
81
  - [Quickstart](#quickstart)
82
+ - [Demo](#demo--a-real-eval-log-to-a-verified-receipt-offline)
81
83
  - [Interoperability](#interoperability)
82
84
  - [Bundle format](#bundle-format-proofbundlev01)
83
85
  - [Eval receipts](#eval-receipts)
@@ -208,6 +210,21 @@ from proofbundle import verify_consistency
208
210
  verify_consistency(first_size, second_size, proof, first_root, second_root) # -> bool
209
211
  ```
210
212
 
213
+ ## Demo — a real eval log to a verified receipt, offline
214
+
215
+ ```bash
216
+ pip install "proofbundle[eval,inspect]"
217
+ make demo # or: bash scripts/demo.sh
218
+ ```
219
+
220
+ `make demo` runs end-to-end with **no network, no API key, no GPU**: it takes genuine eval logs — an
221
+ inspect_ai `mockllm/model` `.eval` log and an lm-evaluation-harness `--model dummy` `results.json`
222
+ (committed under `tests/fixtures/`, generated offline) — turns each into a signed, Merkle-anchored
223
+ proofbundle receipt, and verifies it to `=> OK`. The scores are random (a dummy model); the point is
224
+ that the *artifact* is signed and offline-verifiable, with model and dataset kept as salted commitments.
225
+ See [`examples/inspect_receipt.py`](examples/inspect_receipt.py) and
226
+ [`examples/lm_eval_receipt.py`](examples/lm_eval_receipt.py).
227
+
211
228
  ## Interoperability
212
229
 
213
230
  proofbundle uses the same RFC 6962 / RFC 9162 Merkle primitive as
@@ -284,15 +301,29 @@ proofbundle show-eval receipt.json # verify + print the claim (issuer-boun
284
301
  ```
285
302
 
286
303
  The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
287
- RFC 8785 JCS canonicalization, the verify path stays dependency-free. **Honest scope:**
288
- a receipt proves `passed` against `threshold` and hides the model/dataset via salted
289
- commitments — it does **not** prove the evaluation was well designed or that the score
290
- itself is correct. Those are human judgements; what it removes is the need to simply
291
- trust the number.
304
+ RFC 8785 JCS canonicalization, the verify path stays dependency-free.
305
+
306
+ **Honesty guardrail (the exact scope).** A receipt attests the **authenticity and integrity** of a
307
+ *claimed* result and its context — these exact bytes, signed by this key, anchored under this root, with
308
+ model/dataset kept as salted commitments. It does **not** attest the **correctness of the computation**,
309
+ and it cannot detect **cherry-picking** of the eval. Whether the eval was well designed, whether the
310
+ suite measures what it claims, and whether the number was computed honestly are separate questions.
311
+ Trusted-execution approaches such as [Attestable Audits](https://arxiv.org/abs/2506.23706) target
312
+ computation-correctness with a different (hardware) trust model; proofbundle is the lightweight,
313
+ hardware-free path to a portable, tamper-evident, selectively disclosable *result artifact*.
314
+
315
+ **How this differs from a bare hash or a TEE.** A plain SHA-256 of a log commits to bytes but carries no
316
+ signature, no tamper-evident anchor, and no selective disclosure (an attestation-exporter idea along
317
+ those lines,
318
+ [inspect_evals PR #1610](https://github.com/UKGovernmentBEIS/inspect_evals/pull/1610), was closed as
319
+ belonging *a layer above* the framework — which is exactly where proofbundle sits). A TEE proves the
320
+ computation ran untampered but needs specific hardware. proofbundle adds Ed25519 + RFC 6962 Merkle +
321
+ SD-JWT selective disclosure over one portable file, offline.
292
322
 
293
323
  ### A verification layer for trustworthy eval logs
294
324
 
295
- The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
325
+ The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
326
+ gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
296
327
  a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
297
328
  missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
298
329
  aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
@@ -326,8 +357,10 @@ attestation — see [SECURITY.md](SECURITY.md).
326
357
  - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
327
358
  - **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
328
359
  CITATION.cff, PEP 740 attestations documented.
329
- - **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
330
- (assigned on release), and a draft in-toto ML-eval predicate proposal.
360
+ - **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
361
+ verifier robustness + CI on Python 3.9 after a holistic review.
362
+ - **v0.8 (current release)** — an offline `make demo` (real eval log -> signed receipt -> verified),
363
+ a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
331
364
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
332
365
  Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
333
366
 
@@ -19,14 +19,15 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
19
19
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
20
20
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
21
21
  [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
22
- <!-- DOI badge placeholder: Zenodo is linked and archives each release. Add the Zenodo concept-DOI badge
23
- here (and the DOI to CITATION.cff) once Zenodo assigns it — it does not exist at build time. -->
22
+ <!-- DOI badge placeholder: enable Zenodo archiving for this repo, then add the Zenodo concept-DOI
23
+ badge here (and the DOI to CITATION.cff) once Zenodo assigns one on the next release. No DOI has
24
+ been assigned yet (no archived record exists at build time) — tracked in the human checklist. -->
24
25
 
25
26
  </div>
26
27
 
27
28
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
28
29
  verify` checks one self-contained `bundle.json` with three offline cryptographic
29
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
30
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 74 tests.
30
31
 
31
32
  ## Contents
32
33
 
@@ -35,6 +36,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
35
36
  - [How it fits together](#how-it-fits-together)
36
37
  - [Install](#install)
37
38
  - [Quickstart](#quickstart)
39
+ - [Demo](#demo--a-real-eval-log-to-a-verified-receipt-offline)
38
40
  - [Interoperability](#interoperability)
39
41
  - [Bundle format](#bundle-format-proofbundlev01)
40
42
  - [Eval receipts](#eval-receipts)
@@ -165,6 +167,21 @@ from proofbundle import verify_consistency
165
167
  verify_consistency(first_size, second_size, proof, first_root, second_root) # -> bool
166
168
  ```
167
169
 
170
+ ## Demo — a real eval log to a verified receipt, offline
171
+
172
+ ```bash
173
+ pip install "proofbundle[eval,inspect]"
174
+ make demo # or: bash scripts/demo.sh
175
+ ```
176
+
177
+ `make demo` runs end-to-end with **no network, no API key, no GPU**: it takes genuine eval logs — an
178
+ inspect_ai `mockllm/model` `.eval` log and an lm-evaluation-harness `--model dummy` `results.json`
179
+ (committed under `tests/fixtures/`, generated offline) — turns each into a signed, Merkle-anchored
180
+ proofbundle receipt, and verifies it to `=> OK`. The scores are random (a dummy model); the point is
181
+ that the *artifact* is signed and offline-verifiable, with model and dataset kept as salted commitments.
182
+ See [`examples/inspect_receipt.py`](examples/inspect_receipt.py) and
183
+ [`examples/lm_eval_receipt.py`](examples/lm_eval_receipt.py).
184
+
168
185
  ## Interoperability
169
186
 
170
187
  proofbundle uses the same RFC 6962 / RFC 9162 Merkle primitive as
@@ -241,15 +258,29 @@ proofbundle show-eval receipt.json # verify + print the claim (issuer-boun
241
258
  ```
242
259
 
243
260
  The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
244
- RFC 8785 JCS canonicalization, the verify path stays dependency-free. **Honest scope:**
245
- a receipt proves `passed` against `threshold` and hides the model/dataset via salted
246
- commitments — it does **not** prove the evaluation was well designed or that the score
247
- itself is correct. Those are human judgements; what it removes is the need to simply
248
- trust the number.
261
+ RFC 8785 JCS canonicalization, the verify path stays dependency-free.
262
+
263
+ **Honesty guardrail (the exact scope).** A receipt attests the **authenticity and integrity** of a
264
+ *claimed* result and its context — these exact bytes, signed by this key, anchored under this root, with
265
+ model/dataset kept as salted commitments. It does **not** attest the **correctness of the computation**,
266
+ and it cannot detect **cherry-picking** of the eval. Whether the eval was well designed, whether the
267
+ suite measures what it claims, and whether the number was computed honestly are separate questions.
268
+ Trusted-execution approaches such as [Attestable Audits](https://arxiv.org/abs/2506.23706) target
269
+ computation-correctness with a different (hardware) trust model; proofbundle is the lightweight,
270
+ hardware-free path to a portable, tamper-evident, selectively disclosable *result artifact*.
271
+
272
+ **How this differs from a bare hash or a TEE.** A plain SHA-256 of a log commits to bytes but carries no
273
+ signature, no tamper-evident anchor, and no selective disclosure (an attestation-exporter idea along
274
+ those lines,
275
+ [inspect_evals PR #1610](https://github.com/UKGovernmentBEIS/inspect_evals/pull/1610), was closed as
276
+ belonging *a layer above* the framework — which is exactly where proofbundle sits). A TEE proves the
277
+ computation ran untampered but needs specific hardware. proofbundle adds Ed25519 + RFC 6962 Merkle +
278
+ SD-JWT selective disclosure over one portable file, offline.
249
279
 
250
280
  ### A verification layer for trustworthy eval logs
251
281
 
252
- The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
282
+ The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
283
+ gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
253
284
  a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
254
285
  missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
255
286
  aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
@@ -283,8 +314,10 @@ attestation — see [SECURITY.md](SECURITY.md).
283
314
  - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
284
315
  - **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
285
316
  CITATION.cff, PEP 740 attestations documented.
286
- - **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
287
- (assigned on release), and a draft in-toto ML-eval predicate proposal.
317
+ - **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
318
+ verifier robustness + CI on Python 3.9 after a holistic review.
319
+ - **v0.8 (current release)** — an offline `make demo` (real eval log -> signed receipt -> verified),
320
+ a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
288
321
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
289
322
  Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
290
323
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "proofbundle"
7
- version = "0.7.0"
7
+ version = "0.8.0"
8
8
  description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -47,10 +47,11 @@ eval = ["rfc8785>=0.1.4"]
47
47
  adapters = []
48
48
  # The inspect_ai adapter uses the STABLE read_eval_log API (lazy import). Pinned with an UPPER bound:
49
49
  # the .eval format + pydantic schema change between versions (inspect_ai issue 834), and the fixture
50
- # test is bound to this range. `pip install "proofbundle[inspect]"`.
51
- inspect = ["inspect_ai>=0.3.100,<0.4"]
50
+ # test is bound to this range. inspect_ai requires Python >= 3.10, so the marker gates it out on 3.9
51
+ # (base + [eval]/[sdjwt] still work on 3.9; the inspect adapter test skips there). Fixes the red 3.9 CI.
52
+ inspect = ['inspect_ai>=0.3.100,<0.4; python_version >= "3.10"']
52
53
  dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6",
53
- "rfc8785>=0.1.4", "sd-jwt>=0.10", "inspect_ai>=0.3.100,<0.4"]
54
+ "rfc8785>=0.1.4", "sd-jwt>=0.10", 'inspect_ai>=0.3.100,<0.4; python_version >= "3.10"']
54
55
 
55
56
  [project.urls]
56
57
  Homepage = "https://b7n0de.com"
@@ -13,7 +13,7 @@ from .emit import emit_bundle, generate_signer
13
13
  from .errors import Check, ProofBundleError, VerificationResult
14
14
  from .merkle import verify_consistency, verify_inclusion
15
15
 
16
- __version__ = "0.7.0"
16
+ __version__ = "0.8.0"
17
17
 
18
18
  __all__ = [
19
19
  "__version__",
@@ -57,9 +57,23 @@ def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, t
57
57
  model_id = str(getattr(ev, "model", "unknown"))
58
58
  dataset = getattr(ev, "dataset", None)
59
59
  dataset_id = str(getattr(dataset, "name", None) or suite)
60
+
61
+ # Provenance parity with the lm-eval adapter: inspect_ai exposes the same run provenance for free.
62
+ provenance = {"harness": "inspect_ai"}
63
+ revision = getattr(ev, "revision", None)
64
+ commit = getattr(revision, "commit", None)
65
+ if commit:
66
+ provenance["git_hash"] = str(commit)
67
+ packages = getattr(ev, "packages", None) or {}
68
+ if isinstance(packages, dict) and packages.get("inspect_ai"):
69
+ provenance["harness_version"] = str(packages["inspect_ai"])
70
+ tv = getattr(ev, "task_version", None)
71
+ if tv is not None:
72
+ provenance["task_version"] = str(tv)
73
+
60
74
  return build_eval_claim(
61
75
  suite=suite, suite_version=str(getattr(ev, "task_version", "1")),
62
76
  metric=metric, comparator=comparator, threshold=threshold, score=repr(value),
63
77
  n=int(getattr(results, "total_samples", 0) or 0),
64
78
  model_id=model_id, dataset_id=dataset_id, issuer="", timestamp=timestamp,
65
- model_salt=model_salt, dataset_salt=dataset_salt)
79
+ provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
@@ -12,7 +12,11 @@ checks, fully offline and without any running log server:
12
12
  The verifier treats ``payload`` as opaque bytes: it proves *that these exact
13
13
  bytes were signed and anchored*, not what they mean. That keeps v0.1 small and
14
14
  correct. Turning a reproducible eval run into such a payload is the job of the
15
- emitter (see ``emit.py``, roadmap).
15
+ eval-receipt emitter (see :mod:`proofbundle.evalclaim`, since v0.4).
16
+
17
+ Malformed input (wrong types, missing or unknown fields) is rejected with a
18
+ ``BundleFormatError`` — never a raw traceback — so a caller gets the documented
19
+ malformed exit code, not a crash.
16
20
  """
17
21
 
18
22
  from __future__ import annotations
@@ -30,6 +34,13 @@ __all__ = ["SCHEMA", "verify_bundle", "load_bundle"]
30
34
 
31
35
  SCHEMA = "proofbundle/v0.1"
32
36
 
37
+ # Allowed keys per object — SPEC.md §3: a verifier MUST reject unknown fields (schema is
38
+ # additionalProperties: false). Enforced here so the code matches its own normative spec.
39
+ _TOP_KEYS = {"schema", "payload_b64", "signature", "merkle", "sd_jwt_vc"}
40
+ _SIG_KEYS = {"alg", "public_key_b64", "sig_b64"}
41
+ _MERKLE_KEYS = {"hash_alg", "leaf_index", "tree_size", "inclusion_proof_b64", "root_b64"}
42
+ _SD_KEYS = {"compact", "issuer_public_key_b64"}
43
+
33
44
 
34
45
  def _b64d(value: str, field: str) -> bytes:
35
46
  try:
@@ -44,6 +55,27 @@ def _require(obj: dict, key: str, field: str):
44
55
  return obj[key]
45
56
 
46
57
 
58
+ def _require_dict(obj, field: str) -> dict:
59
+ """The value must be a JSON object — a string/list/number is malformed, not a crash."""
60
+ if not isinstance(obj, dict):
61
+ raise BundleFormatError(f"field {field} must be a JSON object")
62
+ return obj
63
+
64
+
65
+ def _require_int(obj: dict, key: str, field: str) -> int:
66
+ """The value must be a JSON integer — reject floats (SPEC §2) and non-numeric strings/None."""
67
+ val = _require(obj, key, field)
68
+ if isinstance(val, bool) or not isinstance(val, int): # bool is an int subclass; a float/str/None is not
69
+ raise BundleFormatError(f"field {field} must be an integer, got {type(val).__name__}")
70
+ return val
71
+
72
+
73
+ def _reject_unknown(obj: dict, allowed: set, field: str) -> None:
74
+ extra = set(obj) - allowed
75
+ if extra:
76
+ raise BundleFormatError(f"unknown field(s) in {field}: {sorted(extra)}")
77
+
78
+
47
79
  def load_bundle(path: str) -> dict:
48
80
  """Read and JSON-parse a bundle file."""
49
81
  with open(path, "r", encoding="utf-8") as handle:
@@ -60,12 +92,14 @@ def verify_bundle(bundle: Union[dict, str]) -> VerificationResult:
60
92
  schema = bundle.get("schema")
61
93
  if schema != SCHEMA:
62
94
  raise UnsupportedError(f"unsupported schema {schema!r}, expected {SCHEMA!r}")
95
+ _reject_unknown(bundle, _TOP_KEYS, "bundle")
63
96
 
64
97
  result = VerificationResult()
65
98
  payload = _b64d(_require(bundle, "payload_b64", "payload_b64"), "payload_b64")
66
99
 
67
100
  # 1. signature over the payload
68
- sig = _require(bundle, "signature", "signature")
101
+ sig = _require_dict(_require(bundle, "signature", "signature"), "signature")
102
+ _reject_unknown(sig, _SIG_KEYS, "signature")
69
103
  alg = sig.get("alg")
70
104
  if alg != "ed25519":
71
105
  raise UnsupportedError(f"signature alg {alg!r} not supported in v0.1")
@@ -75,13 +109,17 @@ def verify_bundle(bundle: Union[dict, str]) -> VerificationResult:
75
109
  result.add("ed25519-signature", sig_ok, "payload signed by stated key" if sig_ok else "invalid signature")
76
110
 
77
111
  # 2. merkle inclusion of the payload
78
- mk = _require(bundle, "merkle", "merkle")
112
+ mk = _require_dict(_require(bundle, "merkle", "merkle"), "merkle")
113
+ _reject_unknown(mk, _MERKLE_KEYS, "merkle")
79
114
  hash_alg = mk.get("hash_alg", "sha256-rfc6962")
80
115
  if hash_alg != "sha256-rfc6962":
81
116
  raise UnsupportedError(f"merkle hash_alg {hash_alg!r} not supported in v0.1")
82
- leaf_index = int(_require(mk, "leaf_index", "merkle.leaf_index"))
83
- tree_size = int(_require(mk, "tree_size", "merkle.tree_size"))
84
- proof = [_b64d(p, "merkle.inclusion_proof_b64[]") for p in mk.get("inclusion_proof_b64", [])]
117
+ leaf_index = _require_int(mk, "leaf_index", "merkle.leaf_index")
118
+ tree_size = _require_int(mk, "tree_size", "merkle.tree_size")
119
+ proof_list = _require(mk, "inclusion_proof_b64", "merkle.inclusion_proof_b64") # required per SPEC §5
120
+ if not isinstance(proof_list, list):
121
+ raise BundleFormatError("field merkle.inclusion_proof_b64 must be a list")
122
+ proof = [_b64d(p, "merkle.inclusion_proof_b64[]") for p in proof_list]
85
123
  root = _b64d(_require(mk, "root_b64", "merkle.root_b64"), "merkle.root_b64")
86
124
  incl_ok = merkle.verify_inclusion(payload, leaf_index, tree_size, proof, root)
87
125
  result.add(
@@ -93,6 +131,8 @@ def verify_bundle(bundle: Union[dict, str]) -> VerificationResult:
93
131
  # 3. optional SD-JWT selective disclosure credential
94
132
  sd = bundle.get("sd_jwt_vc")
95
133
  if sd is not None:
134
+ sd = _require_dict(sd, "sd_jwt_vc")
135
+ _reject_unknown(sd, _SD_KEYS, "sd_jwt_vc")
96
136
  compact = _require(sd, "compact", "sd_jwt_vc.compact")
97
137
  issuer_pub = None
98
138
  if sd.get("issuer_public_key_b64"):
@@ -21,6 +21,7 @@ import base64
21
21
  import hashlib
22
22
  import json
23
23
  import os
24
+ import re
24
25
  import unicodedata
25
26
  from typing import Optional, Sequence
26
27
 
@@ -34,6 +35,8 @@ EVAL_CLAIM_SCHEMA = "proofbundle/eval-claim/v0.1"
34
35
  COMMIT_ALG = "sha256-salted-v1"
35
36
  _COMPARATORS = {">=", ">", "<=", "<"}
36
37
  _MAX_SAFE_INT = 2 ** 53 - 1
38
+ # The published eval-claim schema's decimal pattern for threshold/score (no exponent, no sign+, no spaces).
39
+ _DECIMAL_RE = re.compile(r"^-?[0-9]+(\.[0-9]+)?$")
37
40
  # The exact key set of an eval claim; decode/validate reject anything else.
38
41
  _REQUIRED = {"schema", "suite", "suite_version", "metric", "comparator", "threshold",
39
42
  "passed", "n", "model_id_commit", "dataset_id_commit", "commit_alg", "issuer", "timestamp"}
@@ -103,7 +106,12 @@ def canonicalize(claim: dict) -> bytes:
103
106
  for the UTF-16 code-unit key sort + compact UTF-8 serialization.
104
107
  """
105
108
  _reject_non_jcs(claim)
106
- import rfc8785 # noqa: PLC0415 — lazy: only the emit path pulls the JCS dependency
109
+ try:
110
+ import rfc8785 # noqa: PLC0415 — lazy: only the emit path pulls the JCS dependency
111
+ except ImportError as e:
112
+ raise EvalClaimError(
113
+ "emitting eval receipts needs an RFC 8785 canonicalizer — install with: "
114
+ "pip install \"proofbundle[eval]\"") from e
107
115
  try:
108
116
  return rfc8785.dumps(claim)
109
117
  except (rfc8785.FloatDomainError, rfc8785.IntegerDomainError, rfc8785.CanonicalizationError) as e:
@@ -137,14 +145,17 @@ def build_eval_claim(*, suite: str, suite_version: str, metric: str, comparator:
137
145
  """
138
146
  if comparator not in _COMPARATORS:
139
147
  raise EvalClaimError(f"comparator must be one of {sorted(_COMPARATORS)}")
148
+ # threshold/score must match the PUBLISHED schema's decimal pattern exactly — reject "1e2",
149
+ # "Infinity", "+5", " 5 " etc. that Decimal() would accept but jsonschema rejects (schema-conformance).
140
150
  for name, val in (("threshold", threshold), ("score", score)):
141
151
  if not isinstance(val, str):
142
152
  raise EvalClaimError(f"{name} must be a decimal STRING, not {type(val).__name__}")
143
- from decimal import Decimal, InvalidOperation # noqa: PLC0415
144
- try:
145
- s, t = Decimal(score), Decimal(threshold)
146
- except InvalidOperation as e:
147
- raise EvalClaimError(f"threshold/score are not valid decimals: {e}") from e
153
+ if not _DECIMAL_RE.match(val):
154
+ raise EvalClaimError(f"{name} must be a plain decimal string (^-?[0-9]+(\\.[0-9]+)?$), got {val!r}")
155
+ if not isinstance(n, int) or isinstance(n, bool) or n < 0 or n > _MAX_SAFE_INT:
156
+ raise EvalClaimError(f"n must be a non-negative integer <= 2**53-1, got {n!r}")
157
+ from decimal import Decimal # noqa: PLC0415
158
+ s, t = Decimal(score), Decimal(threshold)
148
159
  passed = {">=": s >= t, ">": s > t, "<=": s <= t, "<": s < t}[comparator]
149
160
  m_salt = model_salt if model_salt is not None else os.urandom(16)
150
161
  d_salt = dataset_salt if dataset_salt is not None else os.urandom(16)
@@ -12,7 +12,7 @@ exists (deferred, see the roadmap).
12
12
  """
13
13
  from __future__ import annotations
14
14
 
15
- from typing import Optional
15
+ from typing import Any, Optional
16
16
 
17
17
  STATEMENT_TYPE = "https://in-toto.io/Statement/v1"
18
18
  PREDICATE_TYPE = "https://b7n0de.com/proofbundle/eval-receipt/v0.1"
@@ -37,6 +37,21 @@ def to_intoto_statement(claim: dict, *, root_b64: Optional[str] = None,
37
37
  (e.g. {"name": "inspect_ai", "version": "0.3.217"}) is optional. The subject digest is the model
38
38
  commitment under a custom key (never `sha256`).
39
39
  """
40
+ predicate: dict[str, Any] = {
41
+ "verifier": {"id": VERIFIER_ID},
42
+ "evaluatedAt": claim["timestamp"],
43
+ "suite": claim["suite"],
44
+ "claims": [{
45
+ "metric": claim["metric"], "comparator": claim["comparator"],
46
+ "threshold": claim["threshold"], "passed": claim["passed"],
47
+ }],
48
+ "datasetCommit": claim.get("dataset_id_commit"),
49
+ "subject_digest_note": _SUBJECT_DIGEST_NOTE,
50
+ }
51
+ if harness:
52
+ predicate["harness"] = harness
53
+ if root_b64:
54
+ predicate["receipt"] = {"schema": "proofbundle/v0.1", "root_b64": root_b64}
40
55
  statement = {
41
56
  "_type": STATEMENT_TYPE,
42
57
  "subject": [{
@@ -44,20 +59,6 @@ def to_intoto_statement(claim: dict, *, root_b64: Optional[str] = None,
44
59
  "digest": {MODEL_COMMIT_DIGEST_KEY: _commit_hex(claim["model_id_commit"])},
45
60
  }],
46
61
  "predicateType": PREDICATE_TYPE,
47
- "predicate": {
48
- "verifier": {"id": VERIFIER_ID},
49
- "evaluatedAt": claim["timestamp"],
50
- "suite": claim["suite"],
51
- "claims": [{
52
- "metric": claim["metric"], "comparator": claim["comparator"],
53
- "threshold": claim["threshold"], "passed": claim["passed"],
54
- }],
55
- "datasetCommit": claim.get("dataset_id_commit"),
56
- "subject_digest_note": _SUBJECT_DIGEST_NOTE,
57
- },
62
+ "predicate": predicate,
58
63
  }
59
- if harness:
60
- statement["predicate"]["harness"] = harness
61
- if root_b64:
62
- statement["predicate"]["receipt"] = {"schema": "proofbundle/v0.1", "root_b64": root_b64}
63
64
  return statement
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: proofbundle
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
5
5
  Author: Konrad Gruszka
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: eval
28
28
  Requires-Dist: rfc8785>=0.1.4; extra == "eval"
29
29
  Provides-Extra: adapters
30
30
  Provides-Extra: inspect
31
- Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "inspect"
31
+ Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "inspect"
32
32
  Provides-Extra: dev
33
33
  Requires-Dist: pytest>=7; extra == "dev"
34
34
  Requires-Dist: ruff>=0.5; extra == "dev"
@@ -38,7 +38,7 @@ Requires-Dist: build>=1; extra == "dev"
38
38
  Requires-Dist: hypothesis>=6; extra == "dev"
39
39
  Requires-Dist: rfc8785>=0.1.4; extra == "dev"
40
40
  Requires-Dist: sd-jwt>=0.10; extra == "dev"
41
- Requires-Dist: inspect_ai<0.4,>=0.3.100; extra == "dev"
41
+ Requires-Dist: inspect_ai<0.4,>=0.3.100; python_version >= "3.10" and extra == "dev"
42
42
  Dynamic: license-file
43
43
 
44
44
  <div align="center">
@@ -62,14 +62,15 @@ selectively disclosable credential. Pure Python, no server, no daemon, one JSON
62
62
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
63
63
  [![SLSA build provenance](https://img.shields.io/badge/SLSA-build_provenance-D6248A.svg)](https://slsa.dev)
64
64
  [![PyPI attestations](https://img.shields.io/badge/PyPI-attestations_(PEP_740)-D6248A.svg)](https://pypi.org/project/proofbundle/)
65
- <!-- DOI badge placeholder: Zenodo is linked and archives each release. Add the Zenodo concept-DOI badge
66
- here (and the DOI to CITATION.cff) once Zenodo assigns it — it does not exist at build time. -->
65
+ <!-- DOI badge placeholder: enable Zenodo archiving for this repo, then add the Zenodo concept-DOI
66
+ badge here (and the DOI to CITATION.cff) once Zenodo assigns one on the next release. No DOI has
67
+ been assigned yet (no archived record exists at build time) — tracked in the human checklist. -->
67
68
 
68
69
  </div>
69
70
 
70
71
  **At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
71
72
  verify` checks one self-contained `bundle.json` with three offline cryptographic
72
- checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
73
+ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 74 tests.
73
74
 
74
75
  ## Contents
75
76
 
@@ -78,6 +79,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 63 tests.
78
79
  - [How it fits together](#how-it-fits-together)
79
80
  - [Install](#install)
80
81
  - [Quickstart](#quickstart)
82
+ - [Demo](#demo--a-real-eval-log-to-a-verified-receipt-offline)
81
83
  - [Interoperability](#interoperability)
82
84
  - [Bundle format](#bundle-format-proofbundlev01)
83
85
  - [Eval receipts](#eval-receipts)
@@ -208,6 +210,21 @@ from proofbundle import verify_consistency
208
210
  verify_consistency(first_size, second_size, proof, first_root, second_root) # -> bool
209
211
  ```
210
212
 
213
+ ## Demo — a real eval log to a verified receipt, offline
214
+
215
+ ```bash
216
+ pip install "proofbundle[eval,inspect]"
217
+ make demo # or: bash scripts/demo.sh
218
+ ```
219
+
220
+ `make demo` runs end-to-end with **no network, no API key, no GPU**: it takes genuine eval logs — an
221
+ inspect_ai `mockllm/model` `.eval` log and an lm-evaluation-harness `--model dummy` `results.json`
222
+ (committed under `tests/fixtures/`, generated offline) — turns each into a signed, Merkle-anchored
223
+ proofbundle receipt, and verifies it to `=> OK`. The scores are random (a dummy model); the point is
224
+ that the *artifact* is signed and offline-verifiable, with model and dataset kept as salted commitments.
225
+ See [`examples/inspect_receipt.py`](examples/inspect_receipt.py) and
226
+ [`examples/lm_eval_receipt.py`](examples/lm_eval_receipt.py).
227
+
211
228
  ## Interoperability
212
229
 
213
230
  proofbundle uses the same RFC 6962 / RFC 9162 Merkle primitive as
@@ -284,15 +301,29 @@ proofbundle show-eval receipt.json # verify + print the claim (issuer-boun
284
301
  ```
285
302
 
286
303
  The claim format is specified in [EVAL_CLAIM.md](EVAL_CLAIM.md); the emit path uses
287
- RFC 8785 JCS canonicalization, the verify path stays dependency-free. **Honest scope:**
288
- a receipt proves `passed` against `threshold` and hides the model/dataset via salted
289
- commitments — it does **not** prove the evaluation was well designed or that the score
290
- itself is correct. Those are human judgements; what it removes is the need to simply
291
- trust the number.
304
+ RFC 8785 JCS canonicalization, the verify path stays dependency-free.
305
+
306
+ **Honesty guardrail (the exact scope).** A receipt attests the **authenticity and integrity** of a
307
+ *claimed* result and its context — these exact bytes, signed by this key, anchored under this root, with
308
+ model/dataset kept as salted commitments. It does **not** attest the **correctness of the computation**,
309
+ and it cannot detect **cherry-picking** of the eval. Whether the eval was well designed, whether the
310
+ suite measures what it claims, and whether the number was computed honestly are separate questions.
311
+ Trusted-execution approaches such as [Attestable Audits](https://arxiv.org/abs/2506.23706) target
312
+ computation-correctness with a different (hardware) trust model; proofbundle is the lightweight,
313
+ hardware-free path to a portable, tamper-evident, selectively disclosable *result artifact*.
314
+
315
+ **How this differs from a bare hash or a TEE.** A plain SHA-256 of a log commits to bytes but carries no
316
+ signature, no tamper-evident anchor, and no selective disclosure (an attestation-exporter idea along
317
+ those lines,
318
+ [inspect_evals PR #1610](https://github.com/UKGovernmentBEIS/inspect_evals/pull/1610), was closed as
319
+ belonging *a layer above* the framework — which is exactly where proofbundle sits). A TEE proves the
320
+ computation ran untampered but needs specific hardware. proofbundle adds Ed25519 + RFC 6962 Merkle +
321
+ SD-JWT selective disclosure over one portable file, offline.
292
322
 
293
323
  ### A verification layer for trustworthy eval logs
294
324
 
295
- The UK AISI inspect_ai team names an open gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
325
+ The maintainers of inspect_evals (Arcadia Impact, funded by the UK AI Safety Institute) name an open
326
+ gap ([arXiv:2507.06893](https://arxiv.org/abs/2507.06893)):
296
327
  a database of trustworthy evaluation results with proper provenance tracking. proofbundle is the
297
328
  missing **signature + selective-disclosure layer** for exactly that — complementary to metadata
298
329
  aggregation (Every Eval Ever) and documentation taxonomies (Eval Factsheets), not a competitor.
@@ -326,8 +357,10 @@ attestation — see [SECURITY.md](SECURITY.md).
326
357
  - **v0.5** — inspect_ai adapter (stable API), in-toto Statement v1 view, SD-JWT **issuance** (RFC 9901).
327
358
  - **v0.6** — a second eval adapter (lm-evaluation-harness, real format + provenance), INTEROP.md,
328
359
  CITATION.cff, PEP 740 attestations documented.
329
- - **v0.7 (current release)** — citability polish: ORCID in CITATION.cff, a Zenodo DOI placeholder
330
- (assigned on release), and a draft in-toto ML-eval predicate proposal.
360
+ - **v0.7** — citability polish (ORCID, Zenodo DOI placeholder, in-toto proposal draft); v0.7.1 hardened
361
+ verifier robustness + CI on Python 3.9 after a holistic review.
362
+ - **v0.8 (current release)** — an offline `make demo` (real eval log -> signed receipt -> verified),
363
+ a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
331
364
  - **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
332
365
  Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
333
366
 
@@ -24,11 +24,13 @@ src/proofbundle/adapters/inspect_ai.py
24
24
  src/proofbundle/adapters/lm_eval.py
25
25
  tests/test_adapters.py
26
26
  tests/test_bundle.py
27
+ tests/test_bundle_robustness.py
27
28
  tests/test_cli.py
28
29
  tests/test_cli_eval.py
29
30
  tests/test_emit.py
30
31
  tests/test_eval_claim_schema.py
31
32
  tests/test_evalclaim.py
33
+ tests/test_examples.py
32
34
  tests/test_intoto.py
33
35
  tests/test_merkle.py
34
36
  tests/test_merkle_property.py
@@ -11,12 +11,16 @@ build>=1
11
11
  hypothesis>=6
12
12
  rfc8785>=0.1.4
13
13
  sd-jwt>=0.10
14
+
15
+ [dev:python_version >= "3.10"]
14
16
  inspect_ai<0.4,>=0.3.100
15
17
 
16
18
  [eval]
17
19
  rfc8785>=0.1.4
18
20
 
19
21
  [inspect]
22
+
23
+ [inspect:python_version >= "3.10"]
20
24
  inspect_ai<0.4,>=0.3.100
21
25
 
22
26
  [sdjwt]
@@ -39,6 +39,8 @@ class TestAdapters(unittest.TestCase):
39
39
  self.assertEqual(claim["suite"], "safety_refusal_demo")
40
40
  self.assertTrue(claim["passed"]) # accuracy 0.0 >= 0.00
41
41
  self.assertNotIn("mockllm/model", str(claim)) # model id only as salted commitment
42
+ self.assertEqual(claim["provenance"]["harness"], "inspect_ai") # provenance parity with lm-eval
43
+ self.assertIn("harness_version", claim["provenance"])
42
44
 
43
45
  def test_inspect_ai_missing_metric_clear_error(self):
44
46
  from proofbundle.adapters.inspect_ai import InspectAdapterError
@@ -0,0 +1,74 @@
1
+ """Malformed-input robustness of verify_bundle + build_eval_claim (holistic-review findings, 0.7.1).
2
+
3
+ The verifier's contract is OK/FAILED/malformed — never a raw traceback. build_eval_claim must not emit a
4
+ receipt that fails its own published schema. One red-test per finding."""
5
+ import copy
6
+ import unittest
7
+
8
+ from proofbundle import verify_bundle
9
+ from proofbundle.emit import emit_bundle, generate_signer
10
+ from proofbundle.errors import BundleFormatError
11
+ from proofbundle.evalclaim import EvalClaimError, build_eval_claim
12
+
13
+
14
+ def _bundle():
15
+ return emit_bundle(b"payload", generate_signer())
16
+
17
+
18
+ def _mut(mut):
19
+ b = copy.deepcopy(_bundle())
20
+ mut(b)
21
+ return b
22
+
23
+
24
+ class TestBundleRobustness(unittest.TestCase):
25
+ def test_leaf_index_non_numeric_raises_format_error(self): # D1
26
+ with self.assertRaises(BundleFormatError):
27
+ verify_bundle(_mut(lambda b: b["merkle"].__setitem__("leaf_index", "abc")))
28
+
29
+ def test_signature_non_object_raises_format_error(self): # D2
30
+ with self.assertRaises(BundleFormatError):
31
+ verify_bundle(_mut(lambda b: b.__setitem__("signature", "notadict")))
32
+ with self.assertRaises(BundleFormatError):
33
+ verify_bundle(_mut(lambda b: b.__setitem__("merkle", ["x"])))
34
+
35
+ def test_tree_size_float_rejected(self): # D3 (SPEC §2: integers only)
36
+ with self.assertRaises(BundleFormatError):
37
+ verify_bundle(_mut(lambda b: b["merkle"].__setitem__("tree_size", 1.5)))
38
+
39
+ def test_missing_inclusion_proof_rejected(self): # D4 (SPEC §5: required)
40
+ with self.assertRaises(BundleFormatError):
41
+ verify_bundle(_mut(lambda b: b["merkle"].pop("inclusion_proof_b64")))
42
+
43
+ def test_unknown_fields_rejected(self): # SPEC §3: additionalProperties false
44
+ with self.assertRaises(BundleFormatError):
45
+ verify_bundle(_mut(lambda b: b.__setitem__("evil", "x")))
46
+ with self.assertRaises(BundleFormatError):
47
+ verify_bundle(_mut(lambda b: b["signature"].__setitem__("evil", "x")))
48
+ with self.assertRaises(BundleFormatError):
49
+ verify_bundle(_mut(lambda b: b["merkle"].__setitem__("evil", "x")))
50
+
51
+ def test_well_formed_still_ok(self): # no false positive
52
+ self.assertTrue(verify_bundle(_bundle()).ok)
53
+
54
+
55
+ class TestEvalClaimSchemaConformance(unittest.TestCase):
56
+ def _build(self, **kw):
57
+ base = dict(suite="s", suite_version="v1", metric="acc", comparator=">=", threshold="0.8",
58
+ score="0.9", n=1, model_id="m", dataset_id="d", issuer="",
59
+ timestamp="2026-07-01T12:00:00Z", model_salt=b"0" * 16, dataset_salt=b"1" * 16)
60
+ base.update(kw)
61
+ return build_eval_claim(**base)
62
+
63
+ def test_negative_n_rejected(self): # schema minimum 0
64
+ with self.assertRaises(EvalClaimError):
65
+ self._build(n=-5)
66
+
67
+ def test_exponent_and_sign_threshold_rejected(self): # schema decimal pattern
68
+ for bad in ("1e2", "Infinity", "+5", " 0.9 "):
69
+ with self.assertRaises(EvalClaimError):
70
+ self._build(threshold=bad)
71
+
72
+ def test_plain_decimal_accepted(self):
73
+ claim, _ = self._build(threshold="0.80", score="0.92")
74
+ self.assertTrue(claim["passed"])
@@ -0,0 +1,28 @@
1
+ """The demo examples run end-to-end (real fixtures -> receipt -> verify). Covers `make demo` (Phase B)."""
2
+ import importlib.util
3
+ import sys
4
+ import unittest
5
+ from pathlib import Path
6
+
7
+ REPO = Path(__file__).resolve().parents[1]
8
+
9
+
10
+ def _run_example(name):
11
+ try:
12
+ import inspect_ai.log # noqa: F401 (inspect example needs it)
13
+ except ImportError:
14
+ if name == "inspect_receipt":
15
+ raise unittest.SkipTest("inspect_ai not installed")
16
+ spec = importlib.util.spec_from_file_location(name, REPO / "examples" / f"{name}.py")
17
+ m = importlib.util.module_from_spec(spec)
18
+ sys.modules[name] = m
19
+ spec.loader.exec_module(m)
20
+ return m.main()
21
+
22
+
23
+ class TestExamples(unittest.TestCase):
24
+ def test_lm_eval_receipt_example(self):
25
+ self.assertEqual(_run_example("lm_eval_receipt"), 0)
26
+
27
+ def test_inspect_receipt_example(self):
28
+ self.assertEqual(_run_example("inspect_receipt"), 0)
File without changes
File without changes