proofbundle 0.9.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {proofbundle-0.9.0/src/proofbundle.egg-info → proofbundle-1.1.0}/PKG-INFO +62 -9
- {proofbundle-0.9.0 → proofbundle-1.1.0}/README.md +57 -6
- {proofbundle-0.9.0 → proofbundle-1.1.0}/pyproject.toml +12 -3
- proofbundle-1.1.0/src/proofbundle/__init__.py +56 -0
- proofbundle-1.1.0/src/proofbundle/_inspect_registry.py +3 -0
- proofbundle-1.1.0/src/proofbundle/_integration.py +84 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/adapters/inspect_ai.py +32 -13
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/cli.py +12 -1
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/evalclaim.py +104 -5
- proofbundle-1.1.0/src/proofbundle/inspect_hook.py +63 -0
- proofbundle-1.1.0/src/proofbundle/pytest_plugin.py +67 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0/src/proofbundle.egg-info}/PKG-INFO +62 -9
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle.egg-info/SOURCES.txt +7 -0
- proofbundle-1.1.0/src/proofbundle.egg-info/entry_points.txt +8 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle.egg-info/requires.txt +5 -2
- proofbundle-1.1.0/tests/test_adversarial.py +95 -0
- proofbundle-1.1.0/tests/test_inspect_hook.py +57 -0
- proofbundle-1.1.0/tests/test_pytest_plugin.py +62 -0
- proofbundle-0.9.0/src/proofbundle/__init__.py +0 -30
- proofbundle-0.9.0/src/proofbundle.egg-info/entry_points.txt +0 -2
- {proofbundle-0.9.0 → proofbundle-1.1.0}/LICENSE +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/setup.cfg +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/adapters/__init__.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/adapters/eee.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/adapters/lm_eval.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/bundle.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/checkpoint.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/dsse.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/eee_eval_schema.json +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/emit.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/errors.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/intoto.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/merkle.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/py.typed +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/sdjwt.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/sdjwt_issue.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle/signature.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle.egg-info/dependency_links.txt +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/src/proofbundle.egg-info/top_level.txt +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_adapters.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_bundle.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_bundle_robustness.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_checkpoint.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_cli.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_cli_eval.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_eee.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_emit.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_eval_claim_schema.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_evalclaim.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_examples.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_intoto.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_intoto_dsse.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_merkle.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_merkle_property.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_rekor_interop.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_rfc6962_external_vectors.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_schema.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_sdjwt_issue.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_sdjwt_reference.py +0 -0
- {proofbundle-0.9.0 → proofbundle-1.1.0}/tests/test_signature.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: proofbundle
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT.
|
|
5
5
|
Author: Konrad Gruszka
|
|
6
6
|
License: MIT
|
|
@@ -27,8 +27,10 @@ Provides-Extra: sdjwt
|
|
|
27
27
|
Provides-Extra: eval
|
|
28
28
|
Requires-Dist: rfc8785>=0.1.4; extra == "eval"
|
|
29
29
|
Provides-Extra: adapters
|
|
30
|
+
Provides-Extra: pytest
|
|
31
|
+
Requires-Dist: pytest>=7; extra == "pytest"
|
|
30
32
|
Provides-Extra: inspect
|
|
31
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.
|
|
33
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.112; python_version >= "3.10" and extra == "inspect"
|
|
32
34
|
Provides-Extra: dev
|
|
33
35
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
34
36
|
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
@@ -38,7 +40,7 @@ Requires-Dist: build>=1; extra == "dev"
|
|
|
38
40
|
Requires-Dist: hypothesis>=6; extra == "dev"
|
|
39
41
|
Requires-Dist: rfc8785>=0.1.4; extra == "dev"
|
|
40
42
|
Requires-Dist: sd-jwt>=0.10; extra == "dev"
|
|
41
|
-
Requires-Dist: inspect_ai<0.4,>=0.3.
|
|
43
|
+
Requires-Dist: inspect_ai<0.4,>=0.3.112; python_version >= "3.10" and extra == "dev"
|
|
42
44
|
Dynamic: license-file
|
|
43
45
|
|
|
44
46
|
<div align="center">
|
|
@@ -71,7 +73,7 @@ no server, no network.**
|
|
|
71
73
|
|
|
72
74
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
73
75
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
74
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 96 tests.
|
|
76
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 102 tests.
|
|
75
77
|
|
|
76
78
|
## Contents
|
|
77
79
|
|
|
@@ -81,6 +83,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 96 tests.
|
|
|
81
83
|
- [Install](#install)
|
|
82
84
|
- [Quickstart](#quickstart)
|
|
83
85
|
- [Demo](#demo--a-real-eval-log-to-a-verified-receipt-offline)
|
|
86
|
+
- [Integrations](#integrations--a-signed-receipt-of-your-eval-or-test-run-automatically-v10)
|
|
84
87
|
- [Interoperability](#interoperability)
|
|
85
88
|
- [Bundle format](#bundle-format-proofbundlev01)
|
|
86
89
|
- [Eval receipts](#eval-receipts)
|
|
@@ -108,6 +111,33 @@ disclosable receipt. The verifier shipped first, small and correct, so it could
|
|
|
108
111
|
be reviewed and trusted on its own; `emit_bundle` now creates bundles that
|
|
109
112
|
`verify_bundle` accepts, fully offline on both sides.
|
|
110
113
|
|
|
114
|
+
## What a receipt proves (and what it does not)
|
|
115
|
+
|
|
116
|
+
A receipt is a **tamper-evident, signed statement of authorship and integrity** over an eval or test result —
|
|
117
|
+
not a proof that the number is *true* or that the evaluation was well designed. Hold these apart:
|
|
118
|
+
|
|
119
|
+
- **It proves:** the payload was signed by the stated issuer (authorship), no byte changed since (integrity,
|
|
120
|
+
Ed25519 + RFC 6962), the model/dataset sit behind salted commitments, and — since v1.1 — the **assurance level**
|
|
121
|
+
is signed in — tamper-evident and bound to the issuer, so a third party cannot alter it. `show-eval`
|
|
122
|
+
displays the level, warns on the weakest combination (self_attested with no pre-registration), and shows
|
|
123
|
+
withheld SD-JWT fields + receipt age; the `verify_commitment` library call (the holder presents the
|
|
124
|
+
identifier + salt out of band) makes a model-swap visible.
|
|
125
|
+
- **It does not prove:** that a *self-attested* issuer is honest. The level is issuer-DECLARED: a dishonest
|
|
126
|
+
issuer can sign `reproduced` on a self-run eval — the signature binds *who claimed it* to them, it does not
|
|
127
|
+
make the claim true (same as the score). The warning catches the honest self_attested case; a higher level
|
|
128
|
+
is only as trustworthy as the process behind it.
|
|
129
|
+
- **Also not proven:** that a result was not cherry-picked from many runs without pre-registration, or that
|
|
130
|
+
the suite measures what it claims. Those need a pre-registered protocol or independent reproduction.
|
|
131
|
+
|
|
132
|
+
| assurance_level | meaning |
|
|
133
|
+
|---|---|
|
|
134
|
+
| `self_attested` | issuer ran + signed it (default); trust rests on the issuer |
|
|
135
|
+
| `third_party` | a third party checked before signing |
|
|
136
|
+
| `reproduced` | independently re-run and matched |
|
|
137
|
+
| `enclave_attested` | produced in an attested trusted execution environment |
|
|
138
|
+
|
|
139
|
+
Full detail: **[THREAT_MODEL.md](THREAT_MODEL.md)** — what `verify` catches and what it structurally cannot.
|
|
140
|
+
|
|
111
141
|
## What it verifies
|
|
112
142
|
|
|
113
143
|
A bundle is a single JSON document. `proofbundle` checks, offline:
|
|
@@ -226,6 +256,25 @@ that the *artifact* is signed and offline-verifiable, with model and dataset kep
|
|
|
226
256
|
See [`examples/inspect_receipt.py`](examples/inspect_receipt.py) and
|
|
227
257
|
[`examples/lm_eval_receipt.py`](examples/lm_eval_receipt.py).
|
|
228
258
|
|
|
259
|
+
## Integrations — a signed receipt of your eval or test run, automatically (v1.0)
|
|
260
|
+
|
|
261
|
+
Since v1.0, proofbundle can **auto-emit** a signed receipt of an **inspect_ai eval** or a **pytest run** via
|
|
262
|
+
each framework's native plugin API — installed and ready, but strictly **opt-in** (it emits only when you set
|
|
263
|
+
`PROOFBUNDLE_EMIT=1` or pass a flag; never silently, never failing your run):
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
pip install "proofbundle[inspect,eval]" && PROOFBUNDLE_EMIT=1 inspect eval task.py --model mockllm/model
|
|
267
|
+
pip install "proofbundle[pytest,eval]" && PROOFBUNDLE_EMIT=1 pytest
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
The distinguishing angle is exactly this opt-in **auto-emit of an Ed25519-signed receipt via the framework's
|
|
271
|
+
own plugin** (an inspect_ai end-of-task hook + a pytest11 plugin), on top of the standards stack. Named
|
|
272
|
+
fairly: [ai-audit-trail](https://pypi.org/project/ai-audit-trail/) records *runtime* agent Decision Receipts
|
|
273
|
+
(FastAPI/LangChain, ISO 42001), a different layer; [ValiChord](https://github.com/topeuph-ai/ValiChord)
|
|
274
|
+
builds attestation bundles from inspect_ai logs *post-hoc* (its v1 library is JCS + SHA-256 Merkle + HMAC
|
|
275
|
+
challenge-response, **not digitally signed** — signatures are v2 scope). See
|
|
276
|
+
[INTEGRATIONS.md](INTEGRATIONS.md) (+ a prepared composite GitHub Action under [`action/`](action/action.yml)).
|
|
277
|
+
|
|
229
278
|
## Interoperability
|
|
230
279
|
|
|
231
280
|
proofbundle uses the same RFC 6962 / RFC 9162 Merkle primitive as
|
|
@@ -336,9 +385,9 @@ computation is the domain of TEE approaches such as
|
|
|
336
385
|
fairly: [Every Eval Ever](https://github.com/evaleval/every_eval_ever) standardizes eval *metadata* but
|
|
337
386
|
adds no cryptography (proofbundle ships an EEE→receipt converter);
|
|
338
387
|
[OpenSSF Model Signing](https://github.com/ossf/model-signing-spec) signs *model weights*, not eval
|
|
339
|
-
results; [ValiChord](https://github.com/topeuph-ai/ValiChord)
|
|
340
|
-
attested log
|
|
341
|
-
signature, no SD-JWT, no in-toto
|
|
388
|
+
results; [ValiChord](https://github.com/topeuph-ai/ValiChord) plans blind peer consensus and a
|
|
389
|
+
Holochain attested log (v2 scope); its current v1 attestation library uses a simple SHA-256 Merkle tree with
|
|
390
|
+
no digital signature, no SD-JWT, no in-toto. proofbundle is the lightweight, **standards-native** piece between them:
|
|
342
391
|
a portable receipt a third party verifies offline, with selective disclosure so an auditor can prove a
|
|
343
392
|
threshold was met without revealing the model or the data. See [INTEROP.md](INTEROP.md).
|
|
344
393
|
|
|
@@ -384,8 +433,12 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
384
433
|
verifier robustness + CI on Python 3.9 after a holistic review.
|
|
385
434
|
- **v0.8** — an offline `make demo` (real eval log -> signed receipt -> verified),
|
|
386
435
|
a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
|
|
387
|
-
- **v0.9
|
|
388
|
-
|
|
436
|
+
- **v0.9** — the standards moat: a DSSE-signed in-toto `test-result` export, a C2SP tlog-checkpoint over
|
|
437
|
+
the RFC 6962 root, an Every Eval Ever converter, and standards-native repositioning.
|
|
438
|
+
- **v1.0** — distribution: opt-in framework integrations that auto-emit a signed receipt of an inspect_ai
|
|
439
|
+
eval (end-of-task hook) or a pytest run (pytest11 plugin), plus a composite GitHub Action.
|
|
440
|
+
- **v1.1 (current release)** — trust hardening: a signed `assurance_level`, a THREAT_MODEL, a self_attested-
|
|
441
|
+
without-prereg warning, model-swap + replay + withheld-field checks, and an adversarial No-Fake-PASS suite.
|
|
389
442
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
390
443
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
391
444
|
|
|
@@ -28,7 +28,7 @@ no server, no network.**
|
|
|
28
28
|
|
|
29
29
|
**At a glance:** `proofbundle emit` signs and anchors a payload; `proofbundle
|
|
30
30
|
verify` checks one self-contained `bundle.json` with three offline cryptographic
|
|
31
|
-
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 96 tests.
|
|
31
|
+
checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 102 tests.
|
|
32
32
|
|
|
33
33
|
## Contents
|
|
34
34
|
|
|
@@ -38,6 +38,7 @@ checks → `OK` or `FAILED`. No network, no daemon, no own crypto. 96 tests.
|
|
|
38
38
|
- [Install](#install)
|
|
39
39
|
- [Quickstart](#quickstart)
|
|
40
40
|
- [Demo](#demo--a-real-eval-log-to-a-verified-receipt-offline)
|
|
41
|
+
- [Integrations](#integrations--a-signed-receipt-of-your-eval-or-test-run-automatically-v10)
|
|
41
42
|
- [Interoperability](#interoperability)
|
|
42
43
|
- [Bundle format](#bundle-format-proofbundlev01)
|
|
43
44
|
- [Eval receipts](#eval-receipts)
|
|
@@ -65,6 +66,33 @@ disclosable receipt. The verifier shipped first, small and correct, so it could
|
|
|
65
66
|
be reviewed and trusted on its own; `emit_bundle` now creates bundles that
|
|
66
67
|
`verify_bundle` accepts, fully offline on both sides.
|
|
67
68
|
|
|
69
|
+
## What a receipt proves (and what it does not)
|
|
70
|
+
|
|
71
|
+
A receipt is a **tamper-evident, signed statement of authorship and integrity** over an eval or test result —
|
|
72
|
+
not a proof that the number is *true* or that the evaluation was well designed. Hold these apart:
|
|
73
|
+
|
|
74
|
+
- **It proves:** the payload was signed by the stated issuer (authorship), no byte changed since (integrity,
|
|
75
|
+
Ed25519 + RFC 6962), the model/dataset sit behind salted commitments, and — since v1.1 — the **assurance level**
|
|
76
|
+
is signed in — tamper-evident and bound to the issuer, so a third party cannot alter it. `show-eval`
|
|
77
|
+
displays the level, warns on the weakest combination (self_attested with no pre-registration), and shows
|
|
78
|
+
withheld SD-JWT fields + receipt age; the `verify_commitment` library call (the holder presents the
|
|
79
|
+
identifier + salt out of band) makes a model-swap visible.
|
|
80
|
+
- **It does not prove:** that a *self-attested* issuer is honest. The level is issuer-DECLARED: a dishonest
|
|
81
|
+
issuer can sign `reproduced` on a self-run eval — the signature binds *who claimed it* to them, it does not
|
|
82
|
+
make the claim true (same as the score). The warning catches the honest self_attested case; a higher level
|
|
83
|
+
is only as trustworthy as the process behind it.
|
|
84
|
+
- **Also not proven:** that a result was not cherry-picked from many runs without pre-registration, or that
|
|
85
|
+
the suite measures what it claims. Those need a pre-registered protocol or independent reproduction.
|
|
86
|
+
|
|
87
|
+
| assurance_level | meaning |
|
|
88
|
+
|---|---|
|
|
89
|
+
| `self_attested` | issuer ran + signed it (default); trust rests on the issuer |
|
|
90
|
+
| `third_party` | a third party checked before signing |
|
|
91
|
+
| `reproduced` | independently re-run and matched |
|
|
92
|
+
| `enclave_attested` | produced in an attested trusted execution environment |
|
|
93
|
+
|
|
94
|
+
Full detail: **[THREAT_MODEL.md](THREAT_MODEL.md)** — what `verify` catches and what it structurally cannot.
|
|
95
|
+
|
|
68
96
|
## What it verifies
|
|
69
97
|
|
|
70
98
|
A bundle is a single JSON document. `proofbundle` checks, offline:
|
|
@@ -183,6 +211,25 @@ that the *artifact* is signed and offline-verifiable, with model and dataset kep
|
|
|
183
211
|
See [`examples/inspect_receipt.py`](examples/inspect_receipt.py) and
|
|
184
212
|
[`examples/lm_eval_receipt.py`](examples/lm_eval_receipt.py).
|
|
185
213
|
|
|
214
|
+
## Integrations — a signed receipt of your eval or test run, automatically (v1.0)
|
|
215
|
+
|
|
216
|
+
Since v1.0, proofbundle can **auto-emit** a signed receipt of an **inspect_ai eval** or a **pytest run** via
|
|
217
|
+
each framework's native plugin API — installed and ready, but strictly **opt-in** (it emits only when you set
|
|
218
|
+
`PROOFBUNDLE_EMIT=1` or pass a flag; never silently, never failing your run):
|
|
219
|
+
|
|
220
|
+
```bash
|
|
221
|
+
pip install "proofbundle[inspect,eval]" && PROOFBUNDLE_EMIT=1 inspect eval task.py --model mockllm/model
|
|
222
|
+
pip install "proofbundle[pytest,eval]" && PROOFBUNDLE_EMIT=1 pytest
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
The distinguishing angle is exactly this opt-in **auto-emit of an Ed25519-signed receipt via the framework's
|
|
226
|
+
own plugin** (an inspect_ai end-of-task hook + a pytest11 plugin), on top of the standards stack. Named
|
|
227
|
+
fairly: [ai-audit-trail](https://pypi.org/project/ai-audit-trail/) records *runtime* agent Decision Receipts
|
|
228
|
+
(FastAPI/LangChain, ISO 42001), a different layer; [ValiChord](https://github.com/topeuph-ai/ValiChord)
|
|
229
|
+
builds attestation bundles from inspect_ai logs *post-hoc* (its v1 library is JCS + SHA-256 Merkle + HMAC
|
|
230
|
+
challenge-response, **not digitally signed** — signatures are v2 scope). See
|
|
231
|
+
[INTEGRATIONS.md](INTEGRATIONS.md) (+ a prepared composite GitHub Action under [`action/`](action/action.yml)).
|
|
232
|
+
|
|
186
233
|
## Interoperability
|
|
187
234
|
|
|
188
235
|
proofbundle uses the same RFC 6962 / RFC 9162 Merkle primitive as
|
|
@@ -293,9 +340,9 @@ computation is the domain of TEE approaches such as
|
|
|
293
340
|
fairly: [Every Eval Ever](https://github.com/evaleval/every_eval_ever) standardizes eval *metadata* but
|
|
294
341
|
adds no cryptography (proofbundle ships an EEE→receipt converter);
|
|
295
342
|
[OpenSSF Model Signing](https://github.com/ossf/model-signing-spec) signs *model weights*, not eval
|
|
296
|
-
results; [ValiChord](https://github.com/topeuph-ai/ValiChord)
|
|
297
|
-
attested log
|
|
298
|
-
signature, no SD-JWT, no in-toto
|
|
343
|
+
results; [ValiChord](https://github.com/topeuph-ai/ValiChord) plans blind peer consensus and a
|
|
344
|
+
Holochain attested log (v2 scope); its current v1 attestation library uses a simple SHA-256 Merkle tree with
|
|
345
|
+
no digital signature, no SD-JWT, no in-toto. proofbundle is the lightweight, **standards-native** piece between them:
|
|
299
346
|
a portable receipt a third party verifies offline, with selective disclosure so an auditor can prove a
|
|
300
347
|
threshold was met without revealing the model or the data. See [INTEROP.md](INTEROP.md).
|
|
301
348
|
|
|
@@ -341,8 +388,12 @@ attestation — see [SECURITY.md](SECURITY.md).
|
|
|
341
388
|
verifier robustness + CI on Python 3.9 after a holistic review.
|
|
342
389
|
- **v0.8** — an offline `make demo` (real eval log -> signed receipt -> verified),
|
|
343
390
|
a sharpened honesty guardrail (authenticity/integrity, not computation-correctness), and outreach drafts.
|
|
344
|
-
- **v0.9
|
|
345
|
-
|
|
391
|
+
- **v0.9** — the standards moat: a DSSE-signed in-toto `test-result` export, a C2SP tlog-checkpoint over
|
|
392
|
+
the RFC 6962 root, an Every Eval Ever converter, and standards-native repositioning.
|
|
393
|
+
- **v1.0** — distribution: opt-in framework integrations that auto-emit a signed receipt of an inspect_ai
|
|
394
|
+
eval (end-of-task hook) or a pytest run (pytest11 plugin), plus a composite GitHub Action.
|
|
395
|
+
- **v1.1 (current release)** — trust hardening: a signed `assurance_level`, a THREAT_MODEL, a self_attested-
|
|
396
|
+
without-prereg warning, model-swap + replay + withheld-field checks, and an adversarial No-Fake-PASS suite.
|
|
346
397
|
- **Deferred** (explicitly not yet built) — SD-JWT VC conformance + `vct` metadata,
|
|
347
398
|
Key-Binding JWT, status lists / revocation, an official in-toto PR, DSSE / a full in-toto client.
|
|
348
399
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "proofbundle"
|
|
7
|
-
version = "0.9.0"
|
|
7
|
+
version = "1.1.0"
|
|
8
8
|
description = "Emit and verify portable cryptographic evidence bundles, offline: Ed25519 + RFC 6962 Merkle + optional SD-JWT."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -45,13 +45,16 @@ sdjwt = []
|
|
|
45
45
|
eval = ["rfc8785>=0.1.4"]
|
|
46
46
|
# The lm-eval adapter reads exported results.json (no import) → pure stdlib.
|
|
47
47
|
adapters = []
|
|
48
|
+
# The pytest plugin (opt-in test-run receipt via the pytest11 entry-point) — pytest is an optional extra,
|
|
49
|
+
# never a core dependency. Floor 7.0: the hooks/stats API is signature-stable across pytest 7/8/9.
|
|
50
|
+
pytest = ["pytest>=7"]
|
|
48
51
|
# The inspect_ai adapter uses the STABLE read_eval_log API (lazy import). Pinned with an UPPER bound:
|
|
49
52
|
# the .eval format + pydantic schema change between versions (inspect_ai issue 834), and the fixture
|
|
50
53
|
# test is bound to this range. inspect_ai requires Python >= 3.10, so the marker gates it out on 3.9
|
|
51
54
|
# (base + [eval]/[sdjwt] still work on 3.9; the inspect adapter test skips there). Fixes the red 3.9 CI.
|
|
52
|
-
inspect = ['inspect_ai>=0.3.
|
|
55
|
+
inspect = ['inspect_ai>=0.3.112,<0.4; python_version >= "3.10"']
|
|
53
56
|
dev = ["pytest>=7", "ruff>=0.5", "jsonschema>=4", "mypy>=1.8", "build>=1", "hypothesis>=6",
|
|
54
|
-
"rfc8785>=0.1.4", "sd-jwt>=0.10", 'inspect_ai>=0.3.
|
|
57
|
+
"rfc8785>=0.1.4", "sd-jwt>=0.10", 'inspect_ai>=0.3.112,<0.4; python_version >= "3.10"']
|
|
55
58
|
|
|
56
59
|
[project.urls]
|
|
57
60
|
Homepage = "https://b7n0de.com"
|
|
@@ -63,6 +66,12 @@ Documentation = "https://github.com/b7n0de/proofbundle#readme"
|
|
|
63
66
|
[project.scripts]
|
|
64
67
|
proofbundle = "proofbundle.cli:main"
|
|
65
68
|
|
|
69
|
+
# Framework integrations (opt-in auto-emit; gated on PROOFBUNDLE_EMIT / a flag, never silent).
|
|
70
|
+
[project.entry-points.inspect_ai]
|
|
71
|
+
proofbundle = "proofbundle._inspect_registry"
|
|
72
|
+
[project.entry-points.pytest11]
|
|
73
|
+
proofbundle = "proofbundle.pytest_plugin"
|
|
74
|
+
|
|
66
75
|
[tool.setuptools.packages.find]
|
|
67
76
|
where = ["src"]
|
|
68
77
|
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""proofbundle — emit and verify portable, offline cryptographic evidence bundles for AI eval receipts.
|
|
2
|
+
|
|
3
|
+
Verify, fully offline and in pure Python, that a payload was Ed25519 signed and anchored under an RFC 6962
|
|
4
|
+
Merkle root, with optional SD-JWT selective disclosure — plus opt-in framework integrations that auto-emit a
|
|
5
|
+
signed receipt of an inspect_ai eval or a pytest run.
|
|
6
|
+
|
|
7
|
+
The public API is loaded LAZILY (PEP 562): ``import proofbundle`` — and, via the entry points, loading the
|
|
8
|
+
pytest plugin / inspect_ai hook — does NOT pull the crypto core until a name like ``verify_bundle`` is
|
|
9
|
+
actually used. ``from proofbundle import verify_bundle`` works exactly as before; it just imports the backing
|
|
10
|
+
module on first access. This keeps the framework integrations light at startup.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
__version__ = "1.1.0"
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"__version__",
|
|
20
|
+
"SCHEMA",
|
|
21
|
+
"verify_bundle",
|
|
22
|
+
"load_bundle",
|
|
23
|
+
"emit_bundle",
|
|
24
|
+
"generate_signer",
|
|
25
|
+
"verify_inclusion",
|
|
26
|
+
"verify_consistency",
|
|
27
|
+
"VerificationResult",
|
|
28
|
+
"Check",
|
|
29
|
+
"ProofBundleError",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
# name → backing submodule (relative). Loaded on first attribute access.
|
|
33
|
+
_LAZY = {
|
|
34
|
+
"SCHEMA": ".bundle", "load_bundle": ".bundle", "verify_bundle": ".bundle",
|
|
35
|
+
"emit_bundle": ".emit", "generate_signer": ".emit",
|
|
36
|
+
"Check": ".errors", "ProofBundleError": ".errors", "VerificationResult": ".errors",
|
|
37
|
+
"verify_consistency": ".merkle", "verify_inclusion": ".merkle",
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
if TYPE_CHECKING: # static analysers + IDEs see the real names/types; runtime stays lazy
|
|
41
|
+
from .bundle import SCHEMA, load_bundle, verify_bundle
|
|
42
|
+
from .emit import emit_bundle, generate_signer
|
|
43
|
+
from .errors import Check, ProofBundleError, VerificationResult
|
|
44
|
+
from .merkle import verify_consistency, verify_inclusion
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def __getattr__(name: str):
    """Resolve a lazily exported public name on first access (PEP 562).

    Looks ``name`` up in ``_LAZY``, imports the backing submodule relative to this package and returns
    the attribute of the same name from it. The resolved object is then stored in the module namespace,
    so later accesses are ordinary attribute lookups and never re-enter this hook.

    Raises:
        AttributeError: if ``name`` is not one of the lazily exported names.
    """
    module = _LAZY.get(name)
    if module is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    import importlib  # noqa: PLC0415

    value = getattr(importlib.import_module(module, __name__), name)
    # Module-level __getattr__ only runs when the normal lookup fails, so binding the name here means
    # every subsequent access skips the import machinery entirely.
    globals()[name] = value
    return value
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def __dir__():
    """List the public API names (the contents of ``__all__``) in sorted order."""
    names = list(__all__)
    names.sort()
    return names
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
"""Entry-point target for the inspect_ai hook group. Importing this module registers ProofbundleHooks as an
|
|
2
|
+
import side-effect. Kept intentionally minimal (no crypto) so inspect's startup discovery stays fast."""
|
|
3
|
+
from .inspect_hook import ProofbundleHooks # noqa: F401 — import side-effect registers the hook
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Shared opt-in helper for the framework integrations (inspect_ai hook, pytest plugin) — v1.0.
|
|
2
|
+
|
|
3
|
+
THE TOP RULE (opt-in safety): an integration must NEVER silently write a file or alter a host run. It emits
|
|
4
|
+
a receipt ONLY when the user explicitly turns it on — the ``PROOFBUNDLE_EMIT=1`` environment variable, or a
|
|
5
|
+
framework flag that maps to it. A security tool that surprises you loses trust. Every function here is a
|
|
6
|
+
no-op unless emission is enabled, catches its own errors (an integration must never fail the host run), and
|
|
7
|
+
imports the crypto lazily (this module is only imported from inside a hook body, never at framework startup).
|
|
8
|
+
|
|
9
|
+
Configuration (all optional, all env):
|
|
10
|
+
PROOFBUNDLE_EMIT "1" to enable emission (the master opt-in). Anything else = disabled.
|
|
11
|
+
PROOFBUNDLE_KEY path to a 32-byte raw Ed25519 seed to sign with. If unset, an EPHEMERAL key is
|
|
12
|
+
generated (a warning is printed; the receipt is self-verifiable but not tied to a
|
|
13
|
+
durable identity).
|
|
14
|
+
PROOFBUNDLE_OUT output path: a file, or a directory (the default file name is written into it).
|
|
15
|
+
Default: the default file name in the current directory.
|
|
16
|
+
PROOFBUNDLE_METRIC which metric to bind (else the integration's first/most-relevant metric).
|
|
17
|
+
PROOFBUNDLE_COMPARATOR ">=" | ">" | "<=" | "<" (default ">=").
|
|
18
|
+
PROOFBUNDLE_THRESHOLD decimal string (default "0") — the pass/fail threshold to assert.
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import os
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Optional
|
|
25
|
+
|
|
26
|
+
DEFAULT_COMPARATOR = ">="
|
|
27
|
+
DEFAULT_THRESHOLD = "0"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def emit_enabled(flag: bool = False) -> bool:
    """The master opt-in gate for receipt emission.

    Args:
        flag: an explicit framework-level switch (e.g. a command-line option). Any truthy value enables
            emission.

    Returns:
        ``True`` if ``flag`` is truthy or the ``PROOFBUNDLE_EMIT`` environment variable is exactly
        ``"1"``; ``False`` otherwise. The result is always a real ``bool``, even when a framework hands
        in a truthy non-bool option value.
    """
    return bool(flag) or os.environ.get("PROOFBUNDLE_EMIT") == "1"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def emit_config() -> dict:
    """Collect the emission settings from the environment.

    Returns a dict with the keys ``metric``, ``comparator`` and ``threshold``. ``metric`` is the raw
    ``PROOFBUNDLE_METRIC`` value (``None`` when unset); an unset or empty ``PROOFBUNDLE_COMPARATOR`` /
    ``PROOFBUNDLE_THRESHOLD`` falls back to ``DEFAULT_COMPARATOR`` / ``DEFAULT_THRESHOLD``.
    """
    env = os.environ

    comparator = env.get("PROOFBUNDLE_COMPARATOR")
    if not comparator:
        comparator = DEFAULT_COMPARATOR

    threshold = env.get("PROOFBUNDLE_THRESHOLD")
    if not threshold:
        threshold = DEFAULT_THRESHOLD

    return dict(metric=env.get("PROOFBUNDLE_METRIC"), comparator=comparator, threshold=threshold)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _resolve_signer():
    """Pick the signing key for an emission and report whether it is ephemeral.

    Returns a ``(signer, is_ephemeral)`` pair: the signer loaded from the path in ``PROOFBUNDLE_KEY``
    with ``False`` when that variable is set to a non-empty value, otherwise a freshly generated signer
    with ``True``.
    """
    from .emit import generate_signer, load_signer  # noqa: PLC0415 — lazy: only on actual emit

    configured = os.environ.get("PROOFBUNDLE_KEY")
    if not configured:
        return generate_signer(), True
    return load_signer(configured), False
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _output_path(default_name: str) -> Path:
    """Resolve the output file path from PROOFBUNDLE_OUT (file or directory) or the default name in cwd.

    Args:
        default_name: file name to use when PROOFBUNDLE_OUT is unset/empty or names a directory.

    Returns:
        ``cwd / default_name`` when PROOFBUNDLE_OUT is unset or empty; ``PROOFBUNDLE_OUT / default_name``
        when the value is an existing directory or ends with a path separator; otherwise the value itself
        as a file path.
    """
    out = os.environ.get("PROOFBUNDLE_OUT")
    if not out:
        return Path.cwd() / default_name
    p = Path(out)
    # A trailing separator marks a (possibly not-yet-existing) directory. Accept the platform's alternate
    # separator too (os.altsep is "/" on Windows, None on POSIX) so "receipts/" works everywhere.
    separators = tuple(sep for sep in (os.sep, os.altsep) if sep)
    if p.is_dir() or out.endswith(separators):
        return p / default_name
    return p
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def emit_claim_receipt(claim: dict, default_name: str) -> Optional[str]:
    """Sign ``claim`` as an eval receipt and write it as indented JSON to the resolved output path.

    The caller is expected to have checked ``emit_enabled`` already. Any exception raised while
    importing, signing or writing is caught and reported on stdout, so this never raises into the host
    run.

    Returns:
        The written path as a string, or ``None`` if emission failed for any reason.
    """
    try:
        import json  # noqa: PLC0415

        from .evalclaim import emit_eval_receipt  # noqa: PLC0415 — lazy

        signer, is_ephemeral = _resolve_signer()
        if is_ephemeral:
            print("[proofbundle] PROOFBUNDLE_KEY not set — signing with an EPHEMERAL key "
                  "(receipt is self-verifiable but not bound to a durable identity).")

        receipt = emit_eval_receipt(claim, signer)

        target = _output_path(default_name)
        target.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(receipt, indent=2)
        target.write_text(serialized, encoding="utf-8")

        print(f"[proofbundle] wrote signed eval receipt → {target}")
        return str(target)
    except Exception as exc:  # noqa: BLE001 — never let emission break the host run
        print(f"[proofbundle] receipt emission skipped ({type(exc).__name__}: {exc})")
        return None
|
|
@@ -21,23 +21,42 @@ class InspectAdapterError(RuntimeError):
|
|
|
21
21
|
"""Raised when inspect_ai is missing or the log lacks the expected structure (no bare AttributeError)."""
|
|
22
22
|
|
|
23
23
|
|
|
24
|
+
def _score_str(value) -> str:
|
|
25
|
+
"""Render a metric value as a PLAIN decimal string (no scientific notation) that build_eval_claim
|
|
26
|
+
accepts. ``repr(float)`` emits '1e-05'/'1e+20' for very small/large values, which the claim's decimal
|
|
27
|
+
pattern rejects — so numbers are formatted fixed-point (like the pytest plugin's ``_fmt``)."""
|
|
28
|
+
if isinstance(value, bool):
|
|
29
|
+
return "1" if value else "0"
|
|
30
|
+
if isinstance(value, int):
|
|
31
|
+
return str(value)
|
|
32
|
+
if isinstance(value, float):
|
|
33
|
+
if value != value or value in (float("inf"), float("-inf")):
|
|
34
|
+
raise InspectAdapterError("metric value must be finite")
|
|
35
|
+
return format(value, ".12f").rstrip("0").rstrip(".") or "0"
|
|
36
|
+
return str(value)
|
|
37
|
+
|
|
38
|
+
|
|
24
39
|
def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, timestamp: str,
|
|
25
40
|
model_salt: Optional[bytes] = None, dataset_salt: Optional[bytes] = None):
|
|
26
41
|
"""Read an inspect_ai eval log via the stable API and build an eval claim for `metric`.
|
|
27
42
|
|
|
28
|
-
|
|
29
|
-
|
|
43
|
+
``path`` may be a path/str to a ``.eval`` log OR an already-loaded EvalLog object (e.g. the inspect_ai
|
|
44
|
+
hook's ``data.log``). Returns (claim, salts). Raises InspectAdapterError if inspect_ai is unavailable
|
|
45
|
+
or the log is missing the expected attributes — a clear error instead of an opaque AttributeError.
|
|
30
46
|
"""
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
47
|
+
# An already-loaded EvalLog (has .eval + .results) is used directly — no re-read from disk.
|
|
48
|
+
if hasattr(path, "eval") and hasattr(path, "results"):
|
|
49
|
+
log = path
|
|
50
|
+
else:
|
|
51
|
+
try:
|
|
52
|
+
from inspect_ai.log import read_eval_log # noqa: PLC0415 — lazy: keeps the core dependency-free
|
|
53
|
+
except ImportError as e:
|
|
54
|
+
raise InspectAdapterError(
|
|
55
|
+
"inspect_ai is required for this adapter — install with: pip install \"proofbundle[inspect]\"") from e
|
|
56
|
+
try:
|
|
57
|
+
log = read_eval_log(str(path), header_only=True)
|
|
58
|
+
except Exception as e: # noqa: BLE001 — surface any read/parse failure as a clear adapter error
|
|
59
|
+
raise InspectAdapterError(f"could not read inspect_ai log {path!r}: {e}") from e
|
|
41
60
|
|
|
42
61
|
ev = getattr(log, "eval", None)
|
|
43
62
|
results = getattr(log, "results", None)
|
|
@@ -73,7 +92,7 @@ def from_inspect_ai_log(path, metric: str, *, comparator: str, threshold: str, t
|
|
|
73
92
|
|
|
74
93
|
return build_eval_claim(
|
|
75
94
|
suite=suite, suite_version=str(getattr(ev, "task_version", "1")),
|
|
76
|
-
metric=metric, comparator=comparator, threshold=threshold, score=
|
|
95
|
+
metric=metric, comparator=comparator, threshold=threshold, score=_score_str(value),
|
|
77
96
|
n=int(getattr(results, "total_samples", 0) or 0),
|
|
78
97
|
model_id=model_id, dataset_id=dataset_id, issuer="", timestamp=timestamp,
|
|
79
98
|
provenance=provenance, model_salt=model_salt, dataset_salt=dataset_salt)
|
|
@@ -48,7 +48,9 @@ def _cmd_emit_eval(args: argparse.Namespace) -> int:
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
def _cmd_show_eval(args: argparse.Namespace) -> int:
|
|
51
|
-
from .evalclaim import
|
|
51
|
+
from .evalclaim import ( # noqa: PLC0415
|
|
52
|
+
DEFAULT_ASSURANCE, check_freshness, claim_warnings, decode_eval_claim, sd_jwt_hidden_count,
|
|
53
|
+
)
|
|
52
54
|
claim = decode_eval_claim(args.receipt)
|
|
53
55
|
if claim is None:
|
|
54
56
|
print("=> FAILED: not a valid, issuer-bound eval receipt", file=sys.stderr)
|
|
@@ -56,10 +58,19 @@ def _cmd_show_eval(args: argparse.Namespace) -> int:
|
|
|
56
58
|
print(f"suite {claim['suite']} ({claim['suite_version']})")
|
|
57
59
|
print(f"metric {claim['metric']} {claim['comparator']} {claim['threshold']}")
|
|
58
60
|
print(f"passed {claim['passed']} (n={claim['n']})")
|
|
61
|
+
print(f"assurance {claim.get('assurance_level', DEFAULT_ASSURANCE)}")
|
|
59
62
|
print(f"model commit {claim['model_id_commit']}")
|
|
60
63
|
print(f"dataset commit {claim['dataset_id_commit']}")
|
|
61
64
|
print(f"issuer {claim['issuer']}")
|
|
62
65
|
print(f"timestamp {claim['timestamp']}")
|
|
66
|
+
hidden = sd_jwt_hidden_count(args.receipt)
|
|
67
|
+
if hidden is not None:
|
|
68
|
+
print(f"sd-jwt {hidden} field(s) withheld (selective disclosure)")
|
|
69
|
+
fresh = check_freshness(claim)
|
|
70
|
+
if fresh["parsed"]:
|
|
71
|
+
print(f"age {fresh['age_seconds']}s")
|
|
72
|
+
for w in claim_warnings(claim):
|
|
73
|
+
print(f"WARNING {w}")
|
|
63
74
|
print("=> OK")
|
|
64
75
|
return 0
|
|
65
76
|
|