verifyhash 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +883 -0
- package/cli/abi/ContributionRegistry.json +881 -0
- package/cli/agent.js +2173 -0
- package/cli/anchor-artifact.js +853 -0
- package/cli/anchor.js +400 -0
- package/cli/claim.js +881 -0
- package/cli/core/agent-commit.js +448 -0
- package/cli/core/agent-session.js +598 -0
- package/cli/core/anchor-binding.js +663 -0
- package/cli/core/attestation.js +580 -0
- package/cli/core/evidence-plans.js +495 -0
- package/cli/core/fixtures/evidence-plans/baseline.json +19 -0
- package/cli/core/fulfill-intake.js +1082 -0
- package/cli/core/go-live-preflight.js +481 -0
- package/cli/core/license.js +534 -0
- package/cli/core/manifest.js +243 -0
- package/cli/core/packetseal.js +591 -0
- package/cli/core/registryArtifact.js +49 -0
- package/cli/core/revocation.js +539 -0
- package/cli/core/rfc3161.js +389 -0
- package/cli/core/timestamp.js +482 -0
- package/cli/core/trust-asof.js +479 -0
- package/cli/dataset.js +2950 -0
- package/cli/evidence.js +2227 -0
- package/cli/fulfill-webhook-http.js +438 -0
- package/cli/git.js +220 -0
- package/cli/hash.js +550 -0
- package/cli/identity.js +1072 -0
- package/cli/journal-cli.js +1110 -0
- package/cli/journal-log.js +454 -0
- package/cli/journal.js +334 -0
- package/cli/lineage.js +447 -0
- package/cli/list.js +287 -0
- package/cli/parcel.js +1509 -0
- package/cli/proof.js +578 -0
- package/cli/prove.js +300 -0
- package/cli/receipt.js +631 -0
- package/cli/registry.js +331 -0
- package/cli/reputation.js +344 -0
- package/cli/revocation.js +495 -0
- package/cli/serve-verify-http.js +298 -0
- package/cli/serve-verify.js +333 -0
- package/cli/show.js +339 -0
- package/cli/verify.js +383 -0
- package/cli/vh.js +3927 -0
- package/docs/ADOPT.md +183 -0
- package/docs/ADOPTION.json +11 -0
- package/docs/AGENTTRACE.md +247 -0
- package/docs/ANCHORING.md +167 -0
- package/docs/AUDIT.md +55 -0
- package/docs/CONFORMANCE.md +107 -0
- package/docs/DATALEDGER.md +638 -0
- package/docs/DECIDE.md +47 -0
- package/docs/DECISIONS-PENDING.md +27 -0
- package/docs/DEPLOY-PUBLIC-SITE.md +301 -0
- package/docs/ENGINE-LEDGER.json +12 -0
- package/docs/EVIDENCE.md +519 -0
- package/docs/GO-LIVE.md +66 -0
- package/docs/IDENTITY.md +123 -0
- package/docs/INDEPENDENT-VERIFICATION.md +377 -0
- package/docs/INTEGRITY-JOURNAL.md +337 -0
- package/docs/KEY-LIFECYCLE.md +179 -0
- package/docs/LICENSING.md +46 -0
- package/docs/LINEAGE.md +307 -0
- package/docs/LOOP-AUDIT-2026-07-03.json +580 -0
- package/docs/LOOP-HARDENING-PLAN.md +44 -0
- package/docs/MERKLE-LEAVES.md +113 -0
- package/docs/METRICS.jsonl +31 -0
- package/docs/MORNING.md +204 -0
- package/docs/PILOT.md +444 -0
- package/docs/PROOFPARCEL.md +227 -0
- package/docs/PROOFS.md +262 -0
- package/docs/RECEIPTS.md +341 -0
- package/docs/REPUTATION.md +158 -0
- package/docs/SDK.md +301 -0
- package/docs/STRATEGY-ARCHIVE.md +5055 -0
- package/docs/SUPERVISOR-RUNBOOK.md +52 -0
- package/docs/TRUST-BOUNDARIES.md +335 -0
- package/docs/TRUSTLEDGER.md +1976 -0
- package/docs/USAGE-BUDGET.json +121 -0
- package/docs/VERIFY-SERVICE.md +168 -0
- package/index.js +160 -0
- package/package.json +41 -0
- package/trustledger/build-standalone.js +796 -0
- package/trustledger/cli.js +3179 -0
- package/trustledger/close.js +391 -0
- package/trustledger/corpus.js +159 -0
- package/trustledger/dist/BUILD-PROVENANCE.json +99 -0
- package/trustledger/dist/trustledger-standalone.html +6197 -0
- package/trustledger/dist/trustledger-standalone.html.sha256 +1 -0
- package/trustledger/door-core.js +442 -0
- package/trustledger/fixtures/bank.csv +7 -0
- package/trustledger/fixtures/bank.malformed.csv +3 -0
- package/trustledger/fixtures/bank.noalias.csv +5 -0
- package/trustledger/fixtures/bank.ofx +34 -0
- package/trustledger/fixtures/bank.real.csv +5 -0
- package/trustledger/fixtures/corpus/_shared/prior-close.json +22 -0
- package/trustledger/fixtures/corpus/bank-book-mismatch--benign-twin/inputs.json +14 -0
- package/trustledger/fixtures/corpus/bank-book-mismatch--benign-twin/meta.json +7 -0
- package/trustledger/fixtures/corpus/bank-book-mismatch--out-of-trust/inputs.json +14 -0
- package/trustledger/fixtures/corpus/bank-book-mismatch--out-of-trust/meta.json +7 -0
- package/trustledger/fixtures/corpus/continuity-break--benign-twin/inputs.json +15 -0
- package/trustledger/fixtures/corpus/continuity-break--benign-twin/meta.json +7 -0
- package/trustledger/fixtures/corpus/continuity-break--out-of-trust/inputs.json +15 -0
- package/trustledger/fixtures/corpus/continuity-break--out-of-trust/meta.json +7 -0
- package/trustledger/fixtures/corpus/negative-tenant-ledger--benign-twin/inputs.json +13 -0
- package/trustledger/fixtures/corpus/negative-tenant-ledger--benign-twin/meta.json +7 -0
- package/trustledger/fixtures/corpus/negative-tenant-ledger--out-of-trust/inputs.json +13 -0
- package/trustledger/fixtures/corpus/negative-tenant-ledger--out-of-trust/meta.json +7 -0
- package/trustledger/fixtures/corpus/owner-overdraw--benign-twin/inputs.json +15 -0
- package/trustledger/fixtures/corpus/owner-overdraw--benign-twin/meta.json +7 -0
- package/trustledger/fixtures/corpus/owner-overdraw--out-of-trust/inputs.json +15 -0
- package/trustledger/fixtures/corpus/owner-overdraw--out-of-trust/meta.json +7 -0
- package/trustledger/fixtures/corpus/security-deposit-segregation--benign-twin/inputs.json +16 -0
- package/trustledger/fixtures/corpus/security-deposit-segregation--benign-twin/meta.json +7 -0
- package/trustledger/fixtures/corpus/security-deposit-segregation--out-of-trust/inputs.json +13 -0
- package/trustledger/fixtures/corpus/security-deposit-segregation--out-of-trust/meta.json +7 -0
- package/trustledger/fixtures/corpus/subledger-out-of-balance--benign-twin/inputs.json +13 -0
- package/trustledger/fixtures/corpus/subledger-out-of-balance--benign-twin/meta.json +7 -0
- package/trustledger/fixtures/corpus/subledger-out-of-balance--out-of-trust/inputs.json +13 -0
- package/trustledger/fixtures/corpus/subledger-out-of-balance--out-of-trust/meta.json +7 -0
- package/trustledger/fixtures/e2e/bank.aliased.csv +4 -0
- package/trustledger/fixtures/e2e/bank.csv +4 -0
- package/trustledger/fixtures/e2e/bank.nsf.csv +4 -0
- package/trustledger/fixtures/e2e/quickbooks.csv +6 -0
- package/trustledger/fixtures/e2e/quickbooks.nsf.csv +8 -0
- package/trustledger/fixtures/e2e/rentroll.csv +6 -0
- package/trustledger/fixtures/e2e/rentroll.nsf.csv +8 -0
- package/trustledger/fixtures/e2e/rentroll.short.csv +5 -0
- package/trustledger/fixtures/plans/baseline.json +25 -0
- package/trustledger/fixtures/plans/price-binding.example.json +27 -0
- package/trustledger/fixtures/policy/ambiguous-deposit-example.json +12 -0
- package/trustledger/fixtures/policy/baseline.json +19 -0
- package/trustledger/fixtures/policy/ca-example.json +12 -0
- package/trustledger/fixtures/policy/negative-tenant-ledger-example.json +12 -0
- package/trustledger/fixtures/policy/owner-overdraw-example.json +12 -0
- package/trustledger/fixtures/quickbooks.csv +7 -0
- package/trustledger/fixtures/quickbooks.real.csv +5 -0
- package/trustledger/fixtures/rentroll.csv +6 -0
- package/trustledger/fixtures/rentroll.real.csv +4 -0
- package/trustledger/ingest.js +1163 -0
- package/trustledger/lib/policy-bundled-loader.js +44 -0
- package/trustledger/lib/sha256-vendored.js +227 -0
- package/trustledger/license.js +563 -0
- package/trustledger/match.js +551 -0
- package/trustledger/plans.js +551 -0
- package/trustledger/policy.js +398 -0
- package/trustledger/public/index.html +512 -0
- package/trustledger/reconcile.js +1486 -0
- package/trustledger/report.js +887 -0
- package/trustledger/seal.js +854 -0
- package/trustledger/server.js +391 -0
- package/trustledger/valueproof.js +350 -0
package/cli/dataset.js
ADDED
|
@@ -0,0 +1,2950 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
// cli/dataset.js — a tamper-evident, versioned DATASET MANIFEST for verifyhash (DataLedger).
|
|
4
|
+
//
|
|
5
|
+
// WHY THIS EXISTS
|
|
6
|
+
// AI/ML training-data provenance needs one portable, verifiable artifact that pins EXACTLY what a
|
|
7
|
+
// dataset directory contained — byte-for-byte, including file names/paths — at the moment it was
|
|
8
|
+
// manifested. `vh dataset build <dir> --out <manifest>` walks the dataset tree and writes a strict,
|
|
9
|
+
// versioned JSON manifest: the Merkle ROOT plus a sorted per-file list of { relPath, contentHash,
|
|
10
|
+
// leaf }, so a later reader can (a) re-derive the same root from the same tree, (b) prove any single
|
|
11
|
+
// file is a member of the anchored root (the leaf is exactly what the on-chain verifyLeaf consumes),
|
|
12
|
+
// and (c) detect ANY edit/rename/add/remove as a root change.
|
|
13
|
+
//
|
|
14
|
+
// It reuses the EXISTING path-bound, domain-separated Merkle convention from cli/hash.js verbatim
|
|
15
|
+
// (pathLeaf / leafHash / nodeHash, the same DIR_LEAF_DOMAIN/LEAF_TAG/NODE_TAG the contract uses) —
|
|
16
|
+
// NO new hashing convention, so a dataset root is the SAME value `vh hash <dir>` and the contract's
|
|
17
|
+
// verifyLeaf produce for the same tree. The only difference from `hashDir` is that the tree is built
|
|
18
|
+
// by STREAMING each file (cli/hash.js › hashDirStream) so a multi-gigabyte dataset is hashed without
|
|
19
|
+
// ever holding all file content in memory at once — at most one ~1 MiB chunk plus the array of
|
|
20
|
+
// 32-byte per-file hashes.
|
|
21
|
+
//
|
|
22
|
+
// UNTRUSTED PROVENANCE HINTS
|
|
23
|
+
// A caller may attach OPTIONAL per-file { source, license } strings (where a file came from, under
|
|
24
|
+
// what license). These are recorded under an explicitly-labeled `hints` object and are UNTRUSTED
|
|
25
|
+
// self-asserted metadata — consistent with docs/TRUST-BOUNDARIES.md. They are NOT bound into the
|
|
26
|
+
// Merkle root and prove NOTHING; editing them does not change the root. The manifest's `note` field
|
|
27
|
+
// says so in-band so a downstream reader can never mistake a license hint for a verified fact.
|
|
28
|
+
//
|
|
29
|
+
// STRICTNESS
|
|
30
|
+
// A corrupt/edited manifest must never be silently half-accepted: `readManifest` validates strictly
|
|
31
|
+
// and throws on the FIRST deviation (wrong kind/schemaVersion, missing/non-hex root, a file entry whose
|
|
32
|
+
// contentHash/leaf is missing or not 0x 32-byte hex, an empty relPath, a non-array files list)
|
|
33
|
+
// rather than filling defaults — mirroring cli/receipt.js and cli/proof.js. It deliberately does NOT
|
|
34
|
+
// re-verify the leaves against the content (it has no content); it guarantees only that the manifest
|
|
35
|
+
// is structurally sound. Re-deriving the root from the actual tree is the authoritative check.
|
|
36
|
+
|
|
37
|
+
const fs = require("fs");
|
|
38
|
+
const path = require("path");
|
|
39
|
+
const { keccak256, toUtf8Bytes } = require("ethers");
|
|
40
|
+
const { hashDirStream, hashFileStream, pathLeaf, buildTree, proofForIndex } = require("./hash");
|
|
41
|
+
const { diffManifest } = require("./receipt");
|
|
42
|
+
const {
|
|
43
|
+
buildProofArtifact,
|
|
44
|
+
writeProofArtifact,
|
|
45
|
+
readProofArtifact,
|
|
46
|
+
recomputeFold,
|
|
47
|
+
} = require("./proof");
|
|
48
|
+
// The GENERIC, product-agnostic provenance engine. DataLedger is a THIN adapter over it: the manifest
|
|
49
|
+
// builder/validator + the signed-attestation envelope live ONCE in cli/core/ and are shared with
|
|
50
|
+
// ProofParcel (and AttestKit later) so the Merkle/manifest/attest math and — critically — the TRUST
|
|
51
|
+
// caveats can NEVER drift between products. The dependency points dataset → core (never the reverse).
|
|
52
|
+
const coreManifest = require("./core/manifest");
|
|
53
|
+
const coreAttestation = require("./core/attestation");
|
|
54
|
+
const coreTimestamp = require("./core/timestamp");
|
|
55
|
+
const coreTrustAsOf = require("./core/trust-asof");
|
|
56
|
+
|
|
57
|
+
// ---- On-disk schema discriminators ------------------------------------------------------------
// A dataset manifest declares its OWN kind + schema version (separate from the receipt kinds in
// cli/receipt.js and the proof-artifact kind in cli/proof.js), so that an arbitrary JSON file, a
// receipt, a proof artifact, or a future/foreign manifest is never misread as a current manifest.
const MANIFEST_KIND = "verifyhash.dataset-manifest";
const MANIFEST_SCHEMA_VERSION = 1;
const SUPPORTED_MANIFEST_SCHEMA_VERSIONS = Object.freeze([1]);

// ---- Values shared with the rest of the product family ------------------------------------------
// Both are taken from cli/core/manifest.js instead of being redefined here, so they cannot drift:
//   HEX32_RE   — the hex-shape regex used for the per-file hex check (the same shape
//                cli/receipt.js and cli/proof.js validate against).
//   TRUST_NOTE — the in-band note written into the manifest so a reader of the raw JSON cannot
//                mistake an untrusted license hint for a fact, nor the root for proof of anything
//                more than set-membership of (relPath, content) pairs. ProofParcel imports the
//                same text.
const { HEX32_RE, TRUST_NOTE } = coreManifest;

// ---- DataLedger framing for the generic core builder/validator ----------------------------------
// The core performs the shared math and structural validation; this frozen object contributes only
// the DataLedger-specific framing: kind, schema version(s), note, and the human-readable
// "dataset manifest" label that keeps error strings byte-identical to the pre-extraction code.
// ProofParcel passes its own config to the same core.
const MANIFEST_CFG = Object.freeze({
  kind: MANIFEST_KIND,
  schemaVersion: MANIFEST_SCHEMA_VERSION,
  supportedSchemaVersions: SUPPORTED_MANIFEST_SCHEMA_VERSIONS,
  note: TRUST_NOTE,
  label: "dataset manifest",
});
|
|
85
|
+
|
|
86
|
+
/**
 * Produce a dataset-manifest object from the result of a streamed directory hash, optionally
 * carrying untrusted per-file hints.
 *
 * This is a thin adapter: all hint normalization, manifest assembly and strict validation happen
 * in the generic core (`coreManifest.buildItemManifest`); the only thing contributed here is the
 * DataLedger framing in `MANIFEST_CFG`. Any error the core raises propagates unchanged, so a
 * malformed result never yields a manifest that could be written to disk.
 *
 * @param {{ root: string, leaves: {path:string,contentHash:string,leaf:string}[] }} built
 *   the object returned by cli/hash.js › hashDirStream
 * @param {object} [opts] forwarded as-is to the core builder (defaults to `{}`)
 * @param {Object<string,{source?:string,license?:string}>} [opts.hints]
 *   OPTIONAL untrusted per-file hints keyed by relPath. Per the original notes, only the
 *   `source`/`license` string fields are recorded and a hint for a relPath that is not in the
 *   tree is rejected rather than kept as dangling metadata.
 * @returns {object} the manifest object produced by the core builder
 */
function buildManifest(built, opts = {}) {
  // Same core, DataLedger framing: kind, note, fields and error strings all come from MANIFEST_CFG.
  const manifest = coreManifest.buildItemManifest(built, MANIFEST_CFG, opts);
  return manifest;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Strictly validate an already-parsed dataset-manifest object.
 *
 * Thin adapter over the generic core validator (`coreManifest.validateItemManifest`) using
 * DataLedger's framing (`MANIFEST_CFG`). Per the original notes, the core enforces the shared
 * structural rules (kind/schemaVersion, hex root, per-file leaf == pathLeaf(relPath, contentHash),
 * hint shape), throws on the FIRST problem, never mutates, and never fills in defaults; the
 * "dataset manifest" label keeps every error string byte-identical.
 *
 * @param {any} obj the candidate manifest
 * @returns {object} whatever the core validator returns (the same object, if valid)
 */
function validateManifest(obj) {
  const checked = coreManifest.validateItemManifest(obj, MANIFEST_CFG);
  return checked;
}
|
|
118
|
+
|
|
119
|
+
/**
 * Read, parse, and STRICTLY validate the manifest stored at `manifestPath`.
 *
 * Fails loudly at every stage so a malformed or edited manifest is rejected, never half-accepted:
 *   1. a missing / non-string / empty path,
 *   2. an unreadable file,
 *   3. content that is not valid JSON,
 *   4. any schema deviation reported by `validateManifest`.
 *
 * For stages 2 and 3 the thrown Error keeps the original human-readable message AND carries the
 * underlying error as `cause`, so a caller can still inspect e.g. `err.cause.code === "ENOENT"`
 * or tell a `SyntaxError` apart without parsing the message text.
 *
 * @param {string} manifestPath path of the manifest file to load
 * @returns {object} the validated manifest object
 * @throws {Error} on a bad path argument, unreadable file, invalid JSON, or invalid manifest
 */
function readManifest(manifestPath) {
  if (typeof manifestPath !== "string" || manifestPath === "") {
    throw new Error("readManifest requires a manifest file path");
  }

  let raw;
  try {
    raw = fs.readFileSync(manifestPath, "utf8");
  } catch (e) {
    // Preserve the fs error (code/errno/stack) as `cause`; the message is unchanged.
    throw new Error(`cannot read dataset manifest at ${manifestPath}: ${e.message}`, { cause: e });
  }

  let obj;
  try {
    obj = JSON.parse(raw);
  } catch (e) {
    // Preserve the parser's SyntaxError as `cause`; the message is unchanged.
    throw new Error(`dataset manifest at ${manifestPath} is not valid JSON: ${e.message}`, {
      cause: e,
    });
  }

  // Validation errors propagate untouched — they already name the exact deviation.
  return validateManifest(obj);
}
|
|
143
|
+
|
|
144
|
+
/**
 * Validate a dataset manifest and persist it to `outPath` as 2-space-indented JSON followed by a
 * single trailing newline.
 *
 * Validation runs BEFORE the write, so an invalid manifest never reaches the disk. The only side
 * effect is the single file write at the caller-supplied path; nothing is defaulted to the cwd.
 *
 * @param {object} obj     a manifest (typically the result of buildManifest)
 * @param {string} outPath destination file path chosen by the caller
 * @returns {object} the same object that was validated and written
 * @throws {Error} if `outPath` is missing/empty/non-string, or if validation rejects `obj`
 */
function writeManifest(obj, outPath) {
  const hasUsablePath = typeof outPath === "string" && outPath.length > 0;
  if (!hasUsablePath) {
    throw new Error("writeManifest requires an --out path");
  }

  // Throws on the first structural problem; nothing has been written at this point.
  validateManifest(obj);

  const serialized = `${JSON.stringify(obj, null, 2)}\n`;
  fs.writeFileSync(outPath, serialized);
  return obj;
}
|
|
160
|
+
|
|
161
|
+
/**
 * Entry point for `vh dataset build <dir> --out <manifest>`.
 *
 * Steps, in order: check the arguments, confirm `dir` is a directory, stream-hash the tree with
 * `hashDirStream`, assemble the manifest (with optional untrusted hints) via `buildManifest`,
 * write it with `writeManifest`, then report a short summary on the stdout sink. The manifest file
 * at `out` is the only thing written — no cwd litter.
 *
 * @param {object} opts
 * @param {string} opts.dir dataset directory to manifest
 * @param {string} opts.out where to write the manifest (REQUIRED — never defaulted to cwd)
 * @param {Object<string,{source?:string,license?:string}>} [opts.hints] optional untrusted hints
 * @param {boolean} [opts.json] emit one machine-readable JSON line instead of the human summary
 * @param {(s:string)=>void} [opts.stdout] stdout sink (default process.stdout.write); injectable for tests
 * @returns {{ root: string, fileCount: number, out: string }} `out` is the resolved absolute path
 * @throws {Error} on missing options/dir/out, a non-directory target, or any error raised by
 *   fs.statSync, hashing, manifest building, or writing
 */
function runDatasetBuild(opts) {
  if (!opts || typeof opts !== "object") throw new Error("runDatasetBuild requires options");

  const { dir: datasetDir, out: manifestOut, hints } = opts;
  const emit = opts.stdout || ((s) => process.stdout.write(s));
  if (!datasetDir) throw new Error("runDatasetBuild requires a dataset <dir>");
  if (!manifestOut) throw new Error("runDatasetBuild requires an --out <manifest> path");

  // Work with absolute paths so the result does not depend on the cwd. statSync raises its own
  // clear error (e.g. ENOENT) before any tree walk starts.
  const dirAbs = path.resolve(datasetDir);
  if (!fs.statSync(dirAbs).isDirectory()) {
    throw new Error(`dataset target is not a directory: ${datasetDir}`);
  }
  const outAbs = path.resolve(manifestOut);

  // hashDirStream streams each file, so the whole dataset is never held in memory at once.
  const manifest = buildManifest(hashDirStream(dirAbs), { hints });
  writeManifest(manifest, outAbs);

  const summary = { root: manifest.root, fileCount: manifest.fileCount, out: outAbs };
  if (opts.json) {
    emit(JSON.stringify(summary) + "\n");
    return summary;
  }

  // Human-readable report: one sink call per line.
  const humanLines = [
    `dataset manifest written: ${outAbs}\n`,
    ` root: ${manifest.root}\n`,
    ` files: ${manifest.fileCount}\n`,
    " NOTE: the root commits to file names AND bytes; per-file source/license hints are UNTRUSTED.\n",
  ];
  for (const line of humanLines) {
    emit(line);
  }
  return summary;
}
|
|
208
|
+
|
|
209
|
+
/**
 * The two possible outcomes of a `vh dataset verify` run.
 *
 * The AUTHORITATIVE verdict is always recomputed-root vs manifest-root — never the per-file diff,
 * which only LOCALIZES which file moved.
 *
 *   MATCH    — the root re-derived from the FRESH tree equals the manifest's recorded root.
 *   MISMATCH — it does NOT: a file was added/removed/changed/renamed since the manifest was built.
 */
const VERIFY_STATUS = Object.freeze({ MATCH: "MATCH", MISMATCH: "MISMATCH" });
|
|
215
|
+
|
|
216
|
+
/**
 * Entry point for `vh dataset verify`: re-derive the dataset root from a FRESH copy of the dataset
 * at `dir`, compare it with the root recorded in the (UNTRUSTED) manifest, and localize any
 * divergence to individual files.
 *
 * TRUST POSTURE (docs/TRUST-BOUNDARIES.md). The manifest is only a hint. The verdict is
 * `recomputedRoot === manifestRoot` (compared case-insensitively), where `recomputedRoot` comes
 * from `hashDirStream` over the files on disk — so a manifest with a hand-edited `root` cannot
 * fake a MATCH. The per-file ADDED/REMOVED/CHANGED diff is a convenience that says WHICH file
 * diverged; it never decides the verdict. Nothing in this function uses a provider, key, or
 * network.
 *
 * The diff is produced by `diffManifest` (cli/receipt.js). Manifest entries are keyed by
 * `relPath` while `diffManifest` expects `path`, so entries are re-keyed before the call. Per the
 * original notes, a rename surfaces as one REMOVED (old path) plus one ADDED (new path), because
 * the path is bound into the leaf.
 *
 * @param {object} opts
 * @param {string} opts.dir      dataset directory to re-derive the root from (the FRESH copy)
 * @param {string} opts.manifest path to a manifest written by `vh dataset build` (UNTRUSTED hint)
 * @param {boolean}[opts.json]   emit one machine-readable JSON line instead of the human block
 * @param {(s:string)=>void}[opts.stdout] stdout sink (default process.stdout.write); injectable for tests
 * @returns {{
 *   status: "MATCH"|"MISMATCH",
 *   recomputedRoot: string,
 *   manifestRoot: string,
 *   fileCount: number,
 *   diff: { added: any[], removed: any[], changed: any[], unchanged: any[], identical: boolean }
 * }} `fileCount` is the number of files found in the fresh tree
 * @throws {Error} on missing options/dir/manifest, a non-directory target, an unreadable or
 *   structurally invalid manifest, or any hashing error
 */
function runDatasetVerify(opts) {
  if (!opts || typeof opts !== "object") throw new Error("runDatasetVerify requires options");

  const { dir: datasetDir, manifest: manifestPath } = opts;
  const emit = opts.stdout || ((s) => process.stdout.write(s));
  if (!datasetDir) throw new Error("runDatasetVerify requires a dataset <dir>");
  if (!manifestPath) throw new Error("runDatasetVerify requires a --manifest <p> path");

  // Check the directory first — before the manifest is read or trusted in any way. statSync
  // raises its own clear error (e.g. ENOENT) if the path does not exist.
  const dirAbs = path.resolve(datasetDir);
  if (!fs.statSync(dirAbs).isDirectory()) {
    throw new Error(`dataset target is not a directory: ${datasetDir}`);
  }

  // Untrusted, but it must be structurally sound to diff against; readManifest rejects otherwise.
  const recorded = readManifest(manifestPath);

  // Fresh root + per-file leaves, streamed from the files on disk.
  const fresh = hashDirStream(dirAbs);
  const recomputedRoot = fresh.root;
  const manifestRoot = recorded.root;

  // AUTHORITATIVE verdict. Lower-casing both sides means a hex case difference can never flip it.
  const rootsAgree = recomputedRoot.toLowerCase() === manifestRoot.toLowerCase();
  const status = rootsAgree ? VERIFY_STATUS.MATCH : VERIFY_STATUS.MISMATCH;

  // Localization only: re-key relPath → path for diffManifest, which compares leaves.
  const recordedEntries = recorded.files.map(({ relPath, contentHash, leaf }) => ({
    path: relPath,
    contentHash,
    leaf,
  }));
  const diff = diffManifest(recordedEntries, fresh.leaves);

  const result = { status, recomputedRoot, manifestRoot, fileCount: fresh.leaves.length, diff };
  if (opts.json) {
    emit(JSON.stringify(result) + "\n");
    return result;
  }

  const humanLines = formatDatasetVerify({ status, recomputedRoot, manifestRoot, diff });
  for (const line of humanLines) {
    emit(line + "\n");
  }
  return result;
}
|
|
303
|
+
|
|
304
|
+
/**
 * Turn a dataset-verify result into the lines of the human-readable block the CLI prints.
 *
 * Layout: the status and both roots first (the authoritative comparison), then one or two
 * sentences explaining the verdict, then a per-file section that is explicitly labeled as
 * localization rather than the verdict. When `diff.identical` is truthy the per-file section is a
 * single "IDENTICAL" line; otherwise it is a counts line followed by every CHANGED entry (with old
 * and new content hashes), every ADDED entry, and every REMOVED entry, in that order.
 *
 * @param {{status:string,recomputedRoot:string,manifestRoot:string,diff:object}} r
 * @returns {string[]} lines, without trailing newlines
 */
function formatDatasetVerify(r) {
  const { status, recomputedRoot, manifestRoot } = r;

  const verdictText =
    status === VERIFY_STATUS.MATCH
      ? [" The dataset is byte-for-byte (and name-for-name) what the manifest committed to."]
      : [
          " The dataset does NOT match the manifest: a file was added, removed, changed, or renamed",
          " since the manifest was built (the root commits to file NAMES and bytes).",
        ];

  const out = [
    ` dataset verify: ${status}`,
    ` recomputed root: ${recomputedRoot} (re-derived from the files on disk — AUTHORITATIVE)`,
    ` manifest root: ${manifestRoot} (untrusted hint)`,
    ...verdictText,
  ];

  const { diff } = r;
  out.push("", " --- per-file diff (localization; the root comparison above is the verdict) ---");

  if (diff.identical) {
    out.push(" files: IDENTICAL — every file matches the manifest (no ADDED/REMOVED/CHANGED).");
    return out;
  }

  const counts =
    ` files: ${diff.changed.length} CHANGED, ${diff.added.length} ADDED, ${diff.removed.length} REMOVED` +
    ` (${diff.unchanged.length} unchanged)`;
  out.push(counts);

  for (const entry of diff.changed) {
    out.push(
      ` CHANGED ${entry.path}`,
      ` old: ${entry.oldContentHash}`,
      ` new: ${entry.newContentHash}`
    );
  }
  for (const entry of diff.added) {
    out.push(` ADDED ${entry.path} (${entry.contentHash}) present now, not in the manifest`);
  }
  for (const entry of diff.removed) {
    out.push(` REMOVED ${entry.path} (${entry.contentHash}) in the manifest, gone now`);
  }
  return out;
}
|
|
349
|
+
|
|
350
|
+
// =================================================================================================
|
|
351
|
+
// `vh dataset diff <manifestA> <manifestB>` — OFFLINE manifest-to-manifest change report.
|
|
352
|
+
//
|
|
353
|
+
// WHY THIS EXISTS
|
|
354
|
+
// `vh dataset verify` answers "does this manifest still match the live tree on disk?". But a CI
|
|
355
|
+
// pipeline (or a data scientist comparing two dataset SNAPSHOTS) often holds TWO manifests and no
|
|
356
|
+
// tree at all, and wants to answer "what changed between version A and version B of the training
|
|
357
|
+
// set?" — purely from the two portable artifacts, with NO dataset copy, NO provider, NO key, NO
|
|
358
|
+
// network. `vh dataset diff A B` reads both via the SAME strict `readManifest` (a corrupt/edited
|
|
359
|
+
// manifest is rejected, never half-accepted) and computes the change set by REUSING the EXACT diff
|
|
360
|
+
// core `vh dataset verify` uses — `cli/receipt.js › diffManifest` — verbatim. NO new diff logic.
|
|
361
|
+
//
|
|
362
|
+
// The diff compares what each manifest CLAIMS; it does NOT re-derive content (there is no tree to
|
|
363
|
+
// read). To actually re-derive a root from bytes, run `vh dataset verify` against the live tree.
|
|
364
|
+
//
|
|
365
|
+
// EXIT CODES (mirror the dataset family): 0 when the two manifests are IDENTICAL, 3 when they DIFFER
|
|
366
|
+
// (so a pipeline can `fail if the training set changed unexpectedly`), 2 usage, 1 runtime.
|
|
367
|
+
|
|
368
|
+
/**
 * Diff two dataset manifests, entirely OFFLINE, and report the change set going from A to B.
 *
 * Both manifests are loaded with `readManifest` (A first, then B) before any comparison is made.
 * Their `files` entries are reshaped from `{relPath, contentHash, leaf}` into the
 * `{path, contentHash, leaf}` records handed to `diffManifest(A, B)`, and that function's result is
 * passed through untouched — no diff logic lives here. With A as the baseline and B as the
 * comparison, the intended reading is ADDED = in B not A, REMOVED = in A not B, CHANGED = same
 * relPath with a different leaf, so a rename shows up as REMOVED + ADDED.
 * NOTE(review): that direction is `diffManifest`'s contract; confirm against `cli/receipt.js`.
 *
 * The verdict (`identical`) is `diffManifest`'s own `identical` flag — the change set — and NOT
 * equality of the recorded `root` strings. `rootA`, `rootB` and `rootsIdentical` (a case-insensitive
 * string comparison of the two recorded roots) are carried along as displayed metadata only, so a
 * hand-edited `root` can never flip the verdict away from what the per-file body shows.
 *
 * The returned object is the same object that `--json` mode serializes (one line, newline
 * terminated); otherwise the lines from `formatDatasetDiff` are written one per line.
 *
 * @param {object} opts
 * @param {string} opts.manifestA  path to the BASELINE manifest (the "from")
 * @param {string} opts.manifestB  path to the COMPARISON manifest (the "to")
 * @param {boolean}[opts.json]     emit a machine-readable object instead of the human block
 * @param {(s:string)=>void}[opts.stdout] sink for stdout (default process.stdout.write); injectable for tests
 * @returns {{
 *   rootA: string, rootB: string, rootsIdentical: boolean, identical: boolean,
 *   added: any[], removed: any[], changed: any[], unchanged: any[],
 *   counts: { added: number, removed: number, changed: number, unchanged: number }
 * }}
 * @throws {Error} when `opts` is not an object or either manifest path is missing
 */
function runDatasetDiff(opts) {
  if (!opts || typeof opts !== "object") throw new Error("runDatasetDiff requires options");
  const { manifestA, manifestB } = opts;
  const write = opts.stdout || ((s) => process.stdout.write(s));
  if (!manifestA) throw new Error("runDatasetDiff requires a <manifestA> path");
  if (!manifestB) throw new Error("runDatasetDiff requires a <manifestB> path");

  // Load BOTH manifests up front (A, then B) so a rejected manifest aborts before any diffing.
  const [baseline, comparison] = [manifestA, manifestB].map((manifestPath) =>
    readManifest(manifestPath)
  );

  // Recorded roots are display-only metadata; their (case-insensitive) equality is reported but
  // never decides the verdict.
  const rootA = baseline.root;
  const rootB = comparison.root;
  const rootsIdentical = rootA.toLowerCase() === rootB.toLowerCase();

  // diffManifest keys its records by `path`; the manifest stores that same value as `relPath`.
  const toDiffRecords = (files) =>
    files.map(({ relPath, contentHash, leaf }) => ({ path: relPath, contentHash, leaf }));
  const diff = diffManifest(toDiffRecords(baseline.files), toDiffRecords(comparison.files));

  // One result object serves as the return value AND the --json payload, so the two cannot drift.
  const { added, removed, changed, unchanged } = diff;
  const result = {
    rootA,
    rootB,
    rootsIdentical,
    identical: diff.identical, // authoritative verdict: the change set, not root-string equality
    added,
    removed,
    changed,
    unchanged,
    counts: {
      added: added.length,
      removed: removed.length,
      changed: changed.length,
      unchanged: unchanged.length,
    },
  };

  if (opts.json) {
    write(JSON.stringify(result) + "\n");
    return result;
  }

  const humanLines = formatDatasetDiff({
    rootA,
    rootB,
    rootsIdentical,
    identical: result.identical,
    diff,
    counts: result.counts,
  });
  humanLines.forEach((line) => write(line + "\n"));
  return result;
}
|
|
471
|
+
|
|
472
|
+
/**
 * Render a dataset-diff result as the human-readable block the CLI prints.
 *
 * Layout: the TRUST note first (reusing the dataset TRUST_NOTE wording), then the two recorded
 * roots, then a headline stating whether the FILE SETS are IDENTICAL or DIFFERENT, a count line,
 * and — when different — the per-file CHANGED / ADDED / REMOVED entries (in that order). The
 * DIFFERENT headline also states that a rename surfaces as REMOVED + ADDED.
 *
 * The headline is driven by `r.identical` (the change set), never by root-string equality, so it
 * cannot contradict the per-file body. The recorded roots are metadata; whenever they DISAGREE
 * with the change set the discrepancy is called out with a NOTE, in BOTH directions:
 *   - file sets identical but recorded roots differ, and
 *   - file sets different but recorded roots are the same string.
 * Either case means a recorded `root` does not summarize the leaves beside it.
 *
 * @param {{rootA:string,rootB:string,rootsIdentical:boolean,identical:boolean,diff:object,counts:object}} r
 *   `r.diff` must expose `changed` ({path, oldContentHash, newContentHash}), `added` and `removed`
 *   ({path, contentHash}) arrays; `r.counts` the four numeric tallies.
 * @returns {string[]} lines (no trailing newlines)
 */
function formatDatasetDiff(r) {
  const lines = [
    // TRUST note FIRST: a diff compares what each manifest CLAIMS; it does not re-derive content.
    " TRUST: this compares what each manifest CLAIMS — it does NOT re-derive content. " + TRUST_NOTE,
    " (run `vh dataset verify` against the live tree to re-derive a root from bytes).",
    "",
    ` manifest A root: ${r.rootA}`,
    ` manifest B root: ${r.rootB}`,
  ];

  if (r.identical) {
    lines.push(
      " files: IDENTICAL — the two manifests commit to the SAME set of (relPath, content) pairs;",
      " the file sets are identical (no ADDED / REMOVED / CHANGED).",
      ` +0 / -0 / ~0 / ${r.counts.unchanged} unchanged`
    );
    // Identical file sets but different recorded roots: a `root` field does not match its leaves.
    // Flag it so a reader is not surprised by the mismatched root lines printed above.
    if (!r.rootsIdentical) {
      lines.push(
        " NOTE: the two manifests' recorded `root` fields DIFFER even though their file sets are",
        " identical — a `root` was hand-edited (a manifest's root is not re-derived from its",
        " leaves on read). Run `vh dataset verify` against the live tree to re-derive a root.",
        " The IDENTICAL verdict above is the file-set change set, which is authoritative here."
      );
    }
    return lines;
  }

  lines.push(
    " files: DIFFERENT — the manifests commit to different (relPath, content) sets. Per-file changes",
    " (A→B). A rename surfaces as REMOVED(old path) + ADDED(new path) — the path is bound into",
    " the leaf — NOT as two unrelated edits.",
    ` +${r.counts.added} / -${r.counts.removed} / ~${r.counts.changed} / ${r.counts.unchanged} unchanged`
  );
  for (const c of r.diff.changed) {
    lines.push(` CHANGED ${c.path}`);
    lines.push(` old: ${c.oldContentHash}`);
    lines.push(` new: ${c.newContentHash}`);
  }
  for (const a of r.diff.added) {
    lines.push(` ADDED ${a.path} (${a.contentHash}) in B, not in A`);
  }
  for (const rm of r.diff.removed) {
    lines.push(` REMOVED ${rm.path} (${rm.contentHash}) in A, not in B`);
  }
  // The converse discrepancy: the file sets differ, yet both manifests record the SAME root string.
  // Two different leaf sets cannot both be summarized by one root, so at least one recorded `root`
  // is not the root of its own leaves. Call it out (last, mirroring the IDENTICAL branch) instead
  // of leaving two matching root lines above a DIFFERENT verdict unexplained.
  if (r.rootsIdentical) {
    lines.push(
      " NOTE: the two manifests' recorded `root` fields are the SAME even though their file sets",
      " DIFFER — at least one recorded `root` cannot match its own leaves (a manifest's root is",
      " not re-derived from its leaves on read). Run `vh dataset verify` against the live tree to",
      " re-derive a root. The DIFFERENT verdict above is the file-set change set, which is",
      " authoritative here."
    );
  }
  return lines;
}
|
|
532
|
+
|
|
533
|
+
// =================================================================================================
|
|
534
|
+
// `vh dataset summary <manifest> [--json]` — provenance/license roll-up the due-diligence reviewer reads.
|
|
535
|
+
//
|
|
536
|
+
// WHY THIS EXISTS
|
|
537
|
+
// A compliance/due-diligence reviewer holding a manifest wants a one-glance aggregate: how many files,
|
|
538
|
+
// what root, and a histogram of the self-asserted {source, license} hints — "what does this dataset
|
|
539
|
+
// CLAIM about where its files came from and under what license?". This is PURELY OFFLINE: it reads the
|
|
540
|
+
// manifest via the SAME strict `readManifest` (a corrupt/foreign manifest is rejected) and counts what
|
|
541
|
+
// the manifest records. NO dataset tree, NO provider, NO key, NO network.
|
|
542
|
+
//
|
|
543
|
+
// TRUST POSTURE (carried verbatim into output). The file SET (relPath + content) is bound into the root
|
|
544
|
+
// and is trustworthy; the {source, license} hints are UNTRUSTED, self-asserted metadata NOT bound into
|
|
545
|
+
// the root. The summary counts what the dataset CLAIMS — it does NOT verify any license/source is
|
|
546
|
+
// correct. A file with NO license hint lands in the explicit "(no license hint)" bucket: that means the
|
|
547
|
+
// manifest ASSERTS NOTHING, not that the file is unlicensed.
|
|
548
|
+
|
|
549
|
+
// Explicit bucket labels for files that carry no hint, so the histogram never silently drops them and a
// reader can never mistake "no claim" for a real license/source value.
const NO_LICENSE_BUCKET = "(no license hint)";
const NO_SOURCE_BUCKET = "(no source hint)";

/**
 * PURE aggregation core shared by `vh dataset summary` AND `vh dataset report`: given a validated
 * manifest object, roll up the (UNTRUSTED) per-file {source, license} hints into histograms + counts.
 * This is the SINGLE source of truth for the roll-up math, so `vh dataset report`'s histogram can never
 * diverge from `vh dataset summary`'s (same buckets, same counts). It takes an already-validated
 * manifest object (no I/O) and never mutates it.
 *
 * A hint counts only when it is a string; a file with no `hints`, or a non-string hint, is tallied
 * under NO_LICENSE_BUCKET / NO_SOURCE_BUCKET, so each histogram always sums to `fileCount`.
 *
 * Hint values are attacker-controllable text, so they are tallied in a `Map` and only then turned
 * into plain objects with `Object.fromEntries`. Counting directly on a `{}` would read inherited
 * members for keys such as "constructor" or "toString" (yielding a garbage string instead of a
 * number) and would silently DROP a "__proto__" key — breaking the sums-to-fileCount guarantee.
 *
 * @param {object} manifest a manifest object that has passed validateManifest/readManifest
 * @returns {{
 *   root: string, fileCount: number,
 *   licenses: Object<string,number>, sources: Object<string,number>,
 *   filesWithLicenseHint: number, filesWithSourceHint: number,
 * }}
 */
function aggregateManifest(manifest) {
  const licenseCounts = new Map();
  const sourceCounts = new Map();
  let filesWithLicenseHint = 0;
  let filesWithSourceHint = 0;

  const bump = (counts, key) => counts.set(key, (counts.get(key) || 0) + 1);

  for (const f of manifest.files) {
    const license = f.hints && typeof f.hints.license === "string" ? f.hints.license : null;
    const source = f.hints && typeof f.hints.source === "string" ? f.hints.source : null;

    // Every file lands in exactly one bucket per histogram — never omitted.
    bump(licenseCounts, license === null ? NO_LICENSE_BUCKET : license);
    bump(sourceCounts, source === null ? NO_SOURCE_BUCKET : source);

    if (license !== null) filesWithLicenseHint++;
    if (source !== null) filesWithSourceHint++;
  }

  return {
    root: manifest.root,
    // Derive fileCount from the files array (not the OPTIONAL manifest.fileCount passthrough): a
    // valid third-party manifest may omit fileCount, and this keeps the field always present and
    // always self-consistent with the histograms (which sum to manifest.files.length).
    fileCount: manifest.files.length,
    // Object.fromEntries defines OWN data properties, so even a "__proto__" key is kept as a
    // countable entry that Object.entries / JSON.stringify will see.
    licenses: Object.fromEntries(licenseCounts),
    sources: Object.fromEntries(sourceCounts),
    filesWithLicenseHint,
    filesWithSourceHint,
  };
}

/**
 * Compute (purely, OFFLINE) the provenance/license roll-up for one manifest and print it. Reads the
 * manifest via `readManifest` BEFORE any aggregation, delegates all roll-up math to the shared
 * `aggregateManifest`, then writes either one JSON line (`opts.json`) or the human block produced
 * by `formatDatasetSummary`, one line per write.
 *
 * The hints are UNTRUSTED self-asserted metadata (NOT bound into the root); this counts CLAIMS, it does
 * not verify them. NO dataset tree, NO provider, NO key, NO network.
 *
 * @param {object} opts
 * @param {string} opts.manifest  path to a manifest written by `vh dataset build`
 * @param {boolean}[opts.json]    emit a machine-readable object instead of the human block
 * @param {(s:string)=>void}[opts.stdout] sink for stdout (default process.stdout.write); injectable for tests
 * @returns {{
 *   root: string,
 *   fileCount: number,
 *   licenses: Object<string,number>,
 *   sources: Object<string,number>,
 *   filesWithLicenseHint: number,
 *   filesWithSourceHint: number,
 * }}
 * @throws {Error} when `opts` is not an object or `opts.manifest` is missing
 */
function runDatasetSummary(opts) {
  if (!opts || typeof opts !== "object") throw new Error("runDatasetSummary requires options");
  const { manifest: manifestPath } = opts;
  const write = opts.stdout || ((s) => process.stdout.write(s));
  if (!manifestPath) throw new Error("runDatasetSummary requires a <manifest> path");

  // Read first: a rejected manifest aborts before anything is aggregated or printed.
  const manifest = readManifest(manifestPath);

  // The roll-up math lives in the SHARED pure aggregator so summary and report can never diverge.
  const result = aggregateManifest(manifest);

  if (opts.json) {
    write(JSON.stringify(result) + "\n");
  } else {
    for (const line of formatDatasetSummary(result)) {
      write(line + "\n");
    }
  }
  return result;
}
|
|
645
|
+
|
|
646
|
+
/**
 * Turn a dataset-summary result into the lines of the human-readable block the CLI prints.
 *
 * The block opens with the trust caveat (ending in the shared TRUST_NOTE wording): the file SET is
 * bound into the root, while the {source, license} hints are only what the dataset CLAIMS, and
 * "(no license hint)" / "(no source hint)" mean the manifest asserts nothing for that file. It
 * then prints the root and file count, followed by the license histogram and the source histogram,
 * each introduced by a "how many files carry a hint" heading and rendered by `_histogramLines`.
 *
 * @param {{root:string,fileCount:number,licenses:object,sources:object,filesWithLicenseHint:number,filesWithSourceHint:number}} r
 * @returns {string[]} lines (no trailing newlines)
 */
function formatDatasetSummary(r) {
  const { root, fileCount, licenses, sources, filesWithLicenseHint, filesWithSourceHint } = r;

  // Caveat FIRST: everything below counts CLAIMS, not verified facts.
  const trustCaveat = [
    " TRUST: the file SET (relPath + content) is bound into the root and is trustworthy. " + TRUST_NOTE,
    " This summary counts what the dataset CLAIMS — it does NOT verify any license/source is",
    ' correct. "(no license hint)" means the manifest ASSERTS NOTHING for that file, NOT that',
    ' the file is unlicensed; likewise "(no source hint)".',
  ];

  return [
    ...trustCaveat,
    "",
    ` root: ${root}`,
    ` files: ${fileCount}`,
    "",
    ` licenses (CLAIMED; ${filesWithLicenseHint}/${fileCount} files carry a license hint):`,
    ..._histogramLines(licenses),
    "",
    ` sources (CLAIMED; ${filesWithSourceHint}/${fileCount} files carry a source hint):`,
    ..._histogramLines(sources),
  ];
}
|
|
676
|
+
|
|
677
|
+
/**
 * Render a histogram `{ value -> count }` as display lines in a deterministic order.
 *
 * Ordering rules, applied in sequence:
 *   1. a no-hint bucket (NO_LICENSE_BUCKET / NO_SOURCE_BUCKET) always sorts LAST, so the asserted
 *      values are read before the "no claim" tally;
 *   2. higher counts come first;
 *   3. ties are broken by plain `<` / `>` string comparison of the value, for a stable order.
 * Each line is the count right-aligned in a 6-character column followed by the value. An empty
 * histogram renders as the single placeholder line " (no files)".
 *
 * @param {Object<string,number>} hist
 * @returns {string[]} lines (no trailing newlines)
 */
function _histogramLines(hist) {
  const rows = Object.entries(hist);
  if (rows.length === 0) return [" (no files)"];

  // 1 for a no-hint bucket, 0 for a real value — so subtracting ranks pushes no-hint rows last.
  const noHintRank = (label) =>
    label === NO_LICENSE_BUCKET || label === NO_SOURCE_BUCKET ? 1 : 0;

  rows.sort(([labelA, countA], [labelB, countB]) => {
    const rankDelta = noHintRank(labelA) - noHintRank(labelB);
    if (rankDelta !== 0) return rankDelta;
    if (countB !== countA) return countB - countA; // higher count first
    if (labelA < labelB) return -1; // then by value, for a stable order
    return labelA > labelB ? 1 : 0;
  });

  return rows.map(([label, count]) => ` ${String(count).padStart(6)} ${label}`);
}
|
|
695
|
+
|
|
696
|
+
// =================================================================================================
|
|
697
|
+
// `vh dataset report <manifest> [--verify <dir>] [--json] [--out <p>]` — ONE self-contained,
|
|
698
|
+
// deterministic evidence document a compliance/due-diligence reviewer files.
|
|
699
|
+
//
|
|
700
|
+
// WHY THIS EXISTS
|
|
701
|
+
// A reviewer (or an automated compliance pipeline) needs ONE portable artifact that consolidates
|
|
702
|
+
// everything a manifest already proves: the dataset IDENTITY (root + fileCount), the provenance/
|
|
703
|
+
// license roll-up, the standing trust caveats, and — optionally — a live-tree verification verdict.
|
|
704
|
+
// Today that takes several commands (`vh dataset summary`, `vh dataset verify`); `vh dataset report`
|
|
705
|
+
// produces the single document to attach to a filing.
|
|
706
|
+
//
|
|
707
|
+
// IT INVENTS NO NEW MATH. The dataset identity comes from the strict `readManifest`; the
|
|
708
|
+
// provenance/license roll-up REUSES the SAME pure `aggregateManifest` core `vh dataset summary` uses
|
|
709
|
+
// (the histogram orders identically via `_histogramLines`); the optional verification REUSES
|
|
710
|
+
// `runDatasetVerify` VERBATIM. So the report can never drift from the commands it consolidates.
|
|
711
|
+
//
|
|
712
|
+
// PURELY OFFLINE for the manifest-only path: no dataset tree, no provider, no key, no network. With
|
|
713
|
+
// `--verify <dir>` it re-derives the root from the live tree (still offline — no network) and embeds
|
|
714
|
+
// the MATCH/MISMATCH verdict + per-file ADDED/REMOVED/CHANGED localization.
|
|
715
|
+
//
|
|
716
|
+
// DETERMINISM
|
|
717
|
+
// The default human output is a Markdown document with a STABLE section order and a histogram ordered
|
|
718
|
+
// by the SAME `_histogramLines` rule, so two runs over the same manifest produce byte-identical
|
|
719
|
+
// Markdown — suitable to attach to a filing and to diff in CI.
|
|
720
|
+
|
|
721
|
+
/**
 * Compose (purely) the consolidated report MODEL — the exact object `--json` mode emits.
 *
 * No I/O and no math of its own: the identity + roll-up fields are copied from
 * `aggregateManifest(manifest)`, and the two optional sections are trimmed copies of results the
 * caller has ALREADY computed:
 *   - `verify` (only when `verifyResult` is truthy): its `status` plus the `added` / `removed` /
 *     `changed` arrays taken from `verifyResult.diff`;
 *   - `policy` (only when `policyResult` is truthy): its `verdict`, `rulesEvaluated` and
 *     `violations`, taken as-is.
 * Because nothing is recomputed here, the report cannot disagree with the commands that produced
 * those results. The arrays are shared by reference, not cloned.
 *
 * @param {object} manifest             a validated manifest object (from readManifest)
 * @param {object|null} [verifyResult]  the object runDatasetVerify returns, or null when no --verify
 * @param {object|null} [policyResult]  the object evaluatePolicy returns, or null when no --policy
 * @returns {{
 *   root: string, fileCount: number,
 *   licenses: Object<string,number>, sources: Object<string,number>,
 *   filesWithLicenseHint: number, filesWithSourceHint: number,
 *   verify?: { status: string, added: any[], removed: any[], changed: any[] },
 *   policy?: { verdict: string, rulesEvaluated: number, violations: {relPath:string,rule:string,value:string}[] }
 * }}
 */
function buildDatasetReport(manifest, verifyResult, policyResult) {
  // SAME roll-up as `vh dataset summary` — copied field by field, never re-derived.
  const {
    root,
    fileCount,
    licenses,
    sources,
    filesWithLicenseHint,
    filesWithSourceHint,
  } = aggregateManifest(manifest);
  const model = { root, fileCount, licenses, sources, filesWithLicenseHint, filesWithSourceHint };

  if (verifyResult) {
    // Keep only the verdict and the localization arrays the report documents.
    const { added, removed, changed } = verifyResult.diff;
    model.verify = { status: verifyResult.status, added, removed, changed };
  }

  if (policyResult) {
    // Keep only the verdict, the rule count, and the exact violating files.
    const { verdict, rulesEvaluated, violations } = policyResult;
    model.policy = { verdict, rulesEvaluated, violations };
  }

  return model;
}
|
|
774
|
+
|
|
775
|
+
/**
|
|
776
|
+
* Render the consolidated report MODEL as a DETERMINISTIC Markdown document. Stable section order
|
|
777
|
+
* (Trust posture, Dataset identity, Verification status, Policy compliance [only with --policy],
|
|
778
|
+
* Provenance roll-up); the histogram reuses `_histogramLines` and policy violations are pre-sorted by
|
|
779
|
+
* `evaluatePolicy`, so two runs over the same manifest + policy produce byte-identical Markdown. LEADS
|
|
780
|
+
* with the trust posture (reusing TRUST_NOTE verbatim) so the caveats can never drift; the Policy
|
|
781
|
+
* compliance section repeats the SAME UNTRUSTED-hints caveat as `vh dataset check`; and the document
|
|
782
|
+
* NEVER implies a live-tree verify (or a real license check) happened when it did not.
|
|
783
|
+
* @param {object} model the object buildDatasetReport returns
|
|
784
|
+
* @returns {string} the full Markdown document (newline-terminated)
|
|
785
|
+
*/
|
|
786
|
+
function formatDatasetReportMarkdown(model) {
|
|
787
|
+
const lines = [];
|
|
788
|
+
lines.push("# verifyhash dataset report");
|
|
789
|
+
lines.push("");
|
|
790
|
+
|
|
791
|
+
// --- 1. Trust posture FIRST (reuse TRUST_NOTE verbatim; do NOT overclaim). -----------------------
|
|
792
|
+
lines.push("## Trust posture");
|
|
793
|
+
lines.push("");
|
|
794
|
+
lines.push("The file SET (relPath + content) is bound into the Merkle root and is trustworthy.");
|
|
795
|
+
lines.push(TRUST_NOTE);
|
|
796
|
+
lines.push("");
|
|
797
|
+
lines.push(
|
|
798
|
+
"This report is NOT a timestamp: it does NOT prove the dataset is \"unaltered since date T\", nor " +
|
|
799
|
+
"authorship/licensing. That time-anchored claim needs the human-owned signing/timestamp " +
|
|
800
|
+
"trust-root (needs-human, P-3)."
|
|
801
|
+
);
|
|
802
|
+
lines.push("");
|
|
803
|
+
|
|
804
|
+
// --- 2. Dataset identity (root + fileCount), from the strict readManifest. -----------------------
|
|
805
|
+
lines.push("## Dataset identity");
|
|
806
|
+
lines.push("");
|
|
807
|
+
lines.push(`- root: \`${model.root}\``);
|
|
808
|
+
lines.push(`- fileCount: ${model.fileCount}`);
|
|
809
|
+
lines.push("");
|
|
810
|
+
|
|
811
|
+
// --- 3. Verification status. Either the embedded --verify verdict, or a PLAIN statement that NO ---
|
|
812
|
+
// live-tree verification was performed (so the report never implies a verify that didn't run).
|
|
813
|
+
lines.push("## Verification status");
|
|
814
|
+
lines.push("");
|
|
815
|
+
if (!model.verify) {
|
|
816
|
+
lines.push(
|
|
817
|
+
"NO live-tree verification was performed. The root above is the manifest's CLAIM until it is " +
|
|
818
|
+
"re-derived from the live tree (run `vh dataset report <manifest> --verify <dir>`, or " +
|
|
819
|
+
"`vh dataset verify <dir> --manifest <manifest>`)."
|
|
820
|
+
);
|
|
821
|
+
} else {
|
|
822
|
+
const v = model.verify;
|
|
823
|
+
lines.push(`- verdict: **${v.status}** (re-derived from the live tree — AUTHORITATIVE)`);
|
|
824
|
+
if (v.status === VERIFY_STATUS.MATCH) {
|
|
825
|
+
lines.push(
|
|
826
|
+
"- The live tree is byte-for-byte (and name-for-name) what the manifest committed to " +
|
|
827
|
+
"(no ADDED / REMOVED / CHANGED)."
|
|
828
|
+
);
|
|
829
|
+
} else {
|
|
830
|
+
lines.push(
|
|
831
|
+
`- changes: ${v.changed.length} CHANGED, ${v.added.length} ADDED, ${v.removed.length} REMOVED ` +
|
|
832
|
+
"(a rename surfaces as REMOVED + ADDED — the root commits to file NAMES)."
|
|
833
|
+
);
|
|
834
|
+
for (const c of v.changed) {
|
|
835
|
+
lines.push(` - CHANGED \`${c.path}\``);
|
|
836
|
+
lines.push(` - old: \`${c.oldContentHash}\``);
|
|
837
|
+
lines.push(` - new: \`${c.newContentHash}\``);
|
|
838
|
+
}
|
|
839
|
+
for (const a of v.added) {
|
|
840
|
+
lines.push(` - ADDED \`${a.path}\` (\`${a.contentHash}\`) — present now, not in the manifest`);
|
|
841
|
+
}
|
|
842
|
+
for (const rm of v.removed) {
|
|
843
|
+
lines.push(` - REMOVED \`${rm.path}\` (\`${rm.contentHash}\`) — in the manifest, gone now`);
|
|
844
|
+
}
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
lines.push("");
|
|
848
|
+
|
|
849
|
+
// --- 4. Policy compliance. ONLY when --policy was given; the verdict is the SAME pure `evaluatePolicy`
|
|
850
|
+
// the `vh dataset check` command uses (no re-implementation), so PASS/FAIL can never diverge.
|
|
851
|
+
// LEADS with the SAME UNTRUSTED-hints caveat as `vh dataset check` so the report never implies
|
|
852
|
+
// the licenses were verified to be genuinely correct. Violations are already sorted (relPath,
|
|
853
|
+
// then rule) by evaluatePolicy, so this section is byte-identical across runs.
|
|
854
|
+
if (model.policy) {
|
|
855
|
+
const p = model.policy;
|
|
856
|
+
lines.push("## Policy compliance");
|
|
857
|
+
lines.push("");
|
|
858
|
+
lines.push(
|
|
859
|
+
"The {source, license} hints evaluated below are UNTRUSTED, self-asserted metadata NOT bound into " +
|
|
860
|
+
"the root. A PASS means the dataset's SELF-ASSERTED hints satisfy this policy — NOT that the " +
|
|
861
|
+
"licenses are genuinely correct. \"(no license hint)\" asserts NOTHING (requireLicense flags it). " +
|
|
862
|
+
"This does NOT verify any license/source is real."
|
|
863
|
+
);
|
|
864
|
+
lines.push("");
|
|
865
|
+
lines.push(`- verdict: **${p.verdict}**`);
|
|
866
|
+
lines.push(`- rules evaluated: ${p.rulesEvaluated}`);
|
|
867
|
+
if (p.rulesEvaluated === 0) {
|
|
868
|
+
lines.push(
|
|
869
|
+
"- This policy declares NO rules, so it trivially PASSes — every dataset satisfies a policy with " +
|
|
870
|
+
"no constraints."
|
|
871
|
+
);
|
|
872
|
+
} else if (p.verdict === POLICY_VERDICT.PASS) {
|
|
873
|
+
lines.push("- No file's self-asserted hints violate any rule in this policy.");
|
|
874
|
+
} else {
|
|
875
|
+
lines.push(
|
|
876
|
+
`- violations: ${p.violations.length} ` +
|
|
877
|
+
"(each line: the file, the rule it broke, and the offending hint value)"
|
|
878
|
+
);
|
|
879
|
+
for (const v of p.violations) {
|
|
880
|
+
lines.push(` - \`${v.relPath}\` [${v.rule}] value: ${v.value}`);
|
|
881
|
+
}
|
|
882
|
+
}
|
|
883
|
+
lines.push("");
|
|
884
|
+
}
|
|
885
|
+
|
|
886
|
+
// --- 5. Provenance / license roll-up. SAME aggregation + SAME histogram ordering as summary. ------
|
|
887
|
+
lines.push("## Provenance / license roll-up (CLAIMED — untrusted hints)");
|
|
888
|
+
lines.push("");
|
|
889
|
+
lines.push(
|
|
890
|
+
"The {source, license} hints below are UNTRUSTED, self-asserted metadata NOT bound into the root. " +
|
|
891
|
+
"This counts what the dataset CLAIMS; it does NOT verify any license/source is correct. " +
|
|
892
|
+
"\"(no license hint)\" means the manifest ASSERTS NOTHING for that file, NOT that it is unlicensed."
|
|
893
|
+
);
|
|
894
|
+
lines.push("");
|
|
895
|
+
lines.push(
|
|
896
|
+
`### Licenses (${model.filesWithLicenseHint}/${model.fileCount} files carry a license hint)`
|
|
897
|
+
);
|
|
898
|
+
lines.push("");
|
|
899
|
+
lines.push("```");
|
|
900
|
+
for (const line of _histogramLines(model.licenses)) lines.push(line);
|
|
901
|
+
lines.push("```");
|
|
902
|
+
lines.push("");
|
|
903
|
+
lines.push(
|
|
904
|
+
`### Sources (${model.filesWithSourceHint}/${model.fileCount} files carry a source hint)`
|
|
905
|
+
);
|
|
906
|
+
lines.push("");
|
|
907
|
+
lines.push("```");
|
|
908
|
+
for (const line of _histogramLines(model.sources)) lines.push(line);
|
|
909
|
+
lines.push("```");
|
|
910
|
+
lines.push("");
|
|
911
|
+
|
|
912
|
+
// Trailing newline so the document ends cleanly; join with \n for byte-stable output.
|
|
913
|
+
return lines.join("\n") + "\n";
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
/**
|
|
917
|
+
* Orchestrate `vh dataset report <manifest> [--verify <dir>] [--policy <p>] [--json] [--out <p>]`. Reads
|
|
918
|
+
* the manifest via the strict `readManifest`, OPTIONALLY runs `runDatasetVerify` against a live tree
|
|
919
|
+
* (REUSED verbatim) and OPTIONALLY reads `--policy` (strict `readPolicy`) and evaluates it via the SAME
|
|
920
|
+
* pure `evaluatePolicy` `vh dataset check` uses (REUSED verbatim — the report verdict can never diverge
|
|
921
|
+
* from `vh dataset check`'s), composes the consolidated report MODEL (reusing `aggregateManifest`), and
|
|
922
|
+
* emits it as deterministic Markdown (default) or a machine-readable JSON object (`--json`). With
|
|
923
|
+
* `--out <p>` it writes the report to the caller's EXPLICIT path (never cwd) and names the file; without
|
|
924
|
+
* `--out` it prints to stdout.
|
|
925
|
+
*
|
|
926
|
+
* EXIT-CODE PRECEDENCE (the caller in cli/vh.js maps these). The report is a COMBINED CI gate: it is
|
|
927
|
+
* non-zero whenever ANY embedded gate fails, and 0 only when ALL pass.
|
|
928
|
+
* - with `--verify`: the embedded verification returns its MATCH/MISMATCH verdict (MISMATCH => fail).
|
|
929
|
+
* - with `--policy`: the embedded policy returns its PASS/FAIL verdict (FAIL => fail).
|
|
930
|
+
* - with BOTH: fail (exit 3) if EITHER the verify is MISMATCH OR the policy is FAIL; 0 only when the
|
|
931
|
+
* verify is MATCH AND the policy is PASS. So a single invocation gates data integrity AND policy.
|
|
932
|
+
* This function returns `verifyStatus` and `policyVerdict`; the CLI derives exit 3 from either failing.
|
|
933
|
+
*
|
|
934
|
+
* @param {object} opts
|
|
935
|
+
* @param {string} opts.manifest path to a manifest written by `vh dataset build`
|
|
936
|
+
* @param {string} [opts.verifyDir] when given, re-derive the root from this live tree (reuses runDatasetVerify)
|
|
937
|
+
* @param {string} [opts.policy] when given, evaluate the manifest against this policy (reuses evaluatePolicy)
|
|
938
|
+
* @param {boolean}[opts.json] emit a machine-readable object instead of the Markdown document
|
|
939
|
+
* @param {string} [opts.out] write the report to this explicit path (caller-chosen; never cwd)
|
|
940
|
+
* @param {(s:string)=>void}[opts.stdout] sink for stdout (default process.stdout.write); injectable for tests
|
|
941
|
+
* @returns {{
|
|
942
|
+
* model: object,
|
|
943
|
+
* verifyStatus: string|null,
|
|
944
|
+
* policyVerdict: string|null,
|
|
945
|
+
* out: string|null,
|
|
946
|
+
* }}
|
|
947
|
+
*/
|
|
948
|
+
function runDatasetReport(opts) {
  if (!opts || typeof opts !== "object") {
    throw new Error("runDatasetReport requires options");
  }
  const manifestPath = opts.manifest;
  const verifyDir = opts.verifyDir;
  const policyPath = opts.policy;
  // Injectable sink so tests can capture output; falls back to the process's stdout.
  const emit = opts.stdout || ((text) => process.stdout.write(text));
  if (!manifestPath) {
    throw new Error("runDatasetReport requires a <manifest> path");
  }

  // The manifest is read (and rejected if readManifest throws) before anything is composed.
  const manifest = readManifest(manifestPath);

  // Optional live-tree verification. A no-op stdout is handed to runDatasetVerify so that it
  // prints nothing itself; only its returned result is embedded in the report.
  const verifyResult = verifyDir
    ? runDatasetVerify({ dir: verifyDir, manifest: manifestPath, stdout: () => {} })
    : null;

  // Optional policy evaluation: the policy file is read first, then evaluated against the manifest.
  const policyResult = policyPath ? evaluatePolicy(manifest, readPolicy(policyPath)) : null;

  const model = buildDatasetReport(manifest, verifyResult, policyResult);

  // JSON (newline-terminated) when --json is set, otherwise the Markdown rendering.
  let rendered;
  if (opts.json) {
    rendered = `${JSON.stringify(model)}\n`;
  } else {
    rendered = formatDatasetReportMarkdown(model);
  }

  // With --out the document goes to the caller's path (made absolute so the confirmation line
  // names exactly the file written) and only that confirmation is emitted; otherwise the
  // document itself is emitted.
  const outAbs = opts.out ? path.resolve(opts.out) : null;
  if (outAbs !== null) {
    fs.writeFileSync(outAbs, rendered);
    emit(`dataset report written: ${outAbs}\n`);
  } else {
    emit(rendered);
  }

  return {
    model,
    verifyStatus: verifyResult ? verifyResult.status : null,
    policyVerdict: policyResult ? policyResult.verdict : null,
    out: outAbs,
  };
}
|
|
998
|
+
|
|
999
|
+
// =================================================================================================
|
|
1000
|
+
// `vh dataset prove --file <p> --manifest <m>` + `vh dataset verify-proof <proof>`
|
|
1001
|
+
// Offline set-membership of ONE file in a manifested dataset.
|
|
1002
|
+
//
|
|
1003
|
+
// WHY THIS EXISTS
|
|
1004
|
+
// `vh dataset build` commits a whole dataset to one Merkle root + a per-file leaf list. `vh dataset
|
|
1005
|
+
// verify` re-derives that root from a FULL fresh copy of the dataset. But a recipient often holds
|
|
1006
|
+
// only ONE file (a single training image, one document) and the manifest — NOT the whole multi-GB
|
|
1007
|
+
// dataset — and wants to answer "was THIS exact file a member of that dataset?" without re-walking
|
|
1008
|
+
// the entire tree, without a network, and without any key.
|
|
1009
|
+
//
|
|
1010
|
+
// `vh dataset prove` answers that by emitting a SELF-CONTAINED proof artifact: it recomputes the
|
|
1011
|
+
// one file's contentHash + path-bound leaf, finds that leaf in the manifest's committed leaf set,
|
|
1012
|
+
// and builds the Merkle proof (the sibling path) that folds the leaf back up to the manifest root —
|
|
1013
|
+
// reusing the EXACT buildTree/proofForIndex from cli/hash.js (the same construction `vh prove` uses)
|
|
1014
|
+
// and emitting the SAME `verifyhash.merkle-proof` artifact cli/proof.js reads. NO new crypto.
|
|
1015
|
+
//
|
|
1016
|
+
// `vh dataset verify-proof <proof>` then folds that artifact PURELY OFFLINE via cli/proof.js's
|
|
1017
|
+
// recomputeFold — NO dataset copy, NO manifest, NO key, NO network — and confirms the leaf folds to
|
|
1018
|
+
// the recorded root. A fabricated or altered file's proof does NOT fold to the root and is REJECTED.
|
|
1019
|
+
//
|
|
1020
|
+
// TRUST BOUNDARY (carried verbatim into output/docs — do NOT overclaim).
|
|
1021
|
+
// This proves SET-MEMBERSHIP: that the named file (its relPath + bytes) was a leaf of the manifest's
|
|
1022
|
+
// Merkle root. It does NOT prove "unaltered since date T", authorship, or licensing — that stronger,
|
|
1023
|
+
// time-anchored claim needs the human-owned signing/timestamp trust-root (a needs-human step). The
|
|
1024
|
+
// proof binds a file to a ROOT; whether that root is itself trustworthy/anchored is a separate layer.
|
|
1025
|
+
// Three-line, newline-separated caveat printed alongside membership results: membership in the
// manifest's root is all that is claimed (no date, authorship, or licensing claim).
const MEMBERSHIP_TRUST_NOTE =
  "NOTE: this proves SET-MEMBERSHIP only — that the named file (its relPath + bytes) is a leaf of the\n" +
  "dataset manifest's Merkle root. It does NOT prove the file is UNALTERED SINCE a date, nor authorship\n" +
  "or licensing: that time-anchored claim needs a signing/timestamp trust-root (a separate, human step).";
|
|
1030
|
+
|
|
1031
|
+
/**
 * Build, without any network access, a set-membership proof that the file at `opts.file` is
 * committed by the manifest at `opts.manifest`.
 *
 * Membership is decided by CONTENT: the file's digest (from `hashFileStream`) is compared
 * case-insensitively against each manifest entry's `contentHash`, and the FIRST matching entry
 * wins. The caller's own file name plays no part; the proof binds the manifest's recorded
 * `relPath` and `leaf` for the matched entry. When no entry matches, a negative result is
 * returned and no artifact is built.
 *
 * For a match, the tree is rebuilt from the manifest's `leaf` values with `buildTree`, the
 * rebuilt root is checked against the manifest's recorded root, and the sibling path comes from
 * `proofForIndex`. The artifact is produced by `buildProofArtifact`.
 *
 * @param {object} opts
 * @param {string} opts.file      path to the single file to prove membership of
 * @param {string} opts.manifest  path to a dataset manifest
 * @returns {{
 *   member: boolean,
 *   contentHash: string,      // digest of the file's bytes as returned by hashFileStream
 *   relPath: string|null,     // the matched entry's recorded relPath (null if non-member)
 *   leaf: string|null,        // the matched entry's leaf (null if non-member)
 *   root: string,             // the manifest's recorded root
 *   proof: string[]|null,     // sibling path from proofForIndex (null if non-member)
 *   artifact: object|null,    // result of buildProofArtifact (null if non-member)
 * }}
 * @throws {Error} on missing options, a `--file` that is not a regular file (or does not exist;
 *   `fs.statSync` throws), a manifest whose leaves do not fold to its recorded root, or anything
 *   `readManifest` / `hashFileStream` / `buildProofArtifact` throw.
 */
function buildDatasetProof(opts) {
  if (!opts || typeof opts !== "object") throw new Error("buildDatasetProof requires options");
  const { file, manifest: manifestPath } = opts;
  if (!file) throw new Error("buildDatasetProof requires a --file <p>");
  if (!manifestPath) throw new Error("buildDatasetProof requires a --manifest <m> path");

  // Stat the file before touching the manifest so a missing or non-regular file fails first.
  const fileAbs = path.resolve(file);
  const stat = fs.statSync(fileAbs);
  if (!stat.isFile()) {
    throw new Error(`--file must be a regular file (the single member to prove), got: ${file}`);
  }

  const manifest = readManifest(manifestPath);
  const contentHash = hashFileStream(fileAbs);

  // Normalize the needle ONCE; the original re-lowercased it for every manifest entry.
  const wantedHash = contentHash.toLowerCase();
  const entry = manifest.files.find((f) => f.contentHash.toLowerCase() === wantedHash);

  if (!entry) {
    // Negative result: these bytes are not committed by the manifest, so nothing is built.
    return {
      member: false,
      contentHash,
      relPath: null,
      leaf: null,
      root: manifest.root,
      proof: null,
      artifact: null,
    };
  }

  // Rebuild the tree from the manifest's own leaf list and locate this entry inside it.
  const leaves = manifest.files.map((f) => f.leaf);
  const { root, layers, sortedLeaves } = buildTree(leaves);

  // Defense in depth: a manifest whose leaves do not reproduce its recorded root is
  // self-contradictory, and a proof derived from it would be meaningless.
  if (root.toLowerCase() !== manifest.root.toLowerCase()) {
    throw new Error(
      `manifest is internally inconsistent: its leaf set folds to ${root}, not its recorded root ` +
        `${manifest.root}. Refusing to build a proof against a self-contradictory manifest.`
    );
  }

  // Compare numerically (BigInt) so hex case cannot affect the match; the target value is
  // converted once instead of once per leaf.
  const wantedLeaf = BigInt(entry.leaf);
  const index = sortedLeaves.findIndex((l) => BigInt(l) === wantedLeaf);
  if (index < 0) {
    // Expected to be unreachable (entry.leaf came from manifest.files), but never emit a bogus proof.
    throw new Error(`internal: manifest leaf ${entry.leaf} not found in its own tree`);
  }
  const proof = proofForIndex(layers, index);

  // The artifact carries the manifest's recorded values (root, leaf, contentHash, relPath),
  // not the caller's path.
  const artifact = buildProofArtifact({
    root: manifest.root,
    leaf: entry.leaf,
    contentHash: entry.contentHash,
    proof,
    file: entry.relPath,
  });

  return {
    member: true,
    contentHash,
    relPath: entry.relPath,
    leaf: entry.leaf,
    root: manifest.root,
    proof,
    artifact,
  };
}
|
|
1140
|
+
|
|
1141
|
+
/**
 * Drive `vh dataset prove`: build the membership proof via `buildDatasetProof`, optionally
 * persist the artifact, and report the outcome.
 *
 * The artifact is written (through `writeProofArtifact`) only when the file IS a member AND
 * `opts.out` is set; the path is made absolute first. For a non-member nothing is written and
 * the returned `out` is null. Output is either one JSON line (`opts.json`) or a human-readable
 * block followed by the membership trust note.
 *
 * @param {object} opts
 * @param {string} opts.file       forwarded to buildDatasetProof
 * @param {string} opts.manifest   forwarded to buildDatasetProof
 * @param {string} [opts.out]      where to write the proof artifact (only used for a member)
 * @param {boolean} [opts.json]    emit a single JSON line instead of the text block
 * @param {(s:string)=>void} [opts.stdout]  output sink (defaults to process.stdout.write)
 * @returns {{ member: boolean, contentHash: string, relPath: string|null, root: string, out: string|null }}
 */
function runDatasetProve(opts) {
  if (!opts || typeof opts !== "object") {
    throw new Error("runDatasetProve requires options");
  }
  const emit = opts.stdout || ((text) => process.stdout.write(text));
  const built = buildDatasetProof({ file: opts.file, manifest: opts.manifest });

  // Persist only for a member, and only at the caller's explicit path.
  let outAbs = null;
  if (built.member && opts.out) {
    outAbs = path.resolve(opts.out);
    writeProofArtifact(built.artifact, outAbs);
  }

  if (opts.json) {
    const summary = {
      member: built.member,
      contentHash: built.contentHash,
      relPath: built.relPath,
      root: built.root,
      proofLength: built.proof ? built.proof.length : null,
      out: outAbs,
    };
    emit(`${JSON.stringify(summary)}\n`);
  } else {
    // Each array element is handed to the sink as its own call.
    let chunks;
    if (built.member) {
      const siblingCount = built.proof.length;
      const plural = siblingCount === 1 ? "" : "s";
      const artifactLine = outAbs
        ? ` proof artifact written: ${outAbs} (verify with \`vh dataset verify-proof <p>\`)\n`
        : ` (pass --out <p> to write a portable proof artifact for offline verification)\n`;
      chunks = [
        `dataset membership: MEMBER\n`,
        ` relPath: ${built.relPath} (the manifest's committed path for this content)\n`,
        ` contentHash: ${built.contentHash}\n`,
        ` leaf: ${built.leaf}\n`,
        ` root: ${built.root}\n`,
        ` proof: ${siblingCount} sibling${plural}\n`,
        artifactLine,
        MEMBERSHIP_TRUST_NOTE + "\n",
      ];
    } else {
      chunks = [
        `dataset membership: NOT A MEMBER\n`,
        ` contentHash: ${built.contentHash}\n`,
        ` root: ${built.root}\n`,
        ` The file's bytes are NOT committed by this manifest (it was never in the dataset, or it was\n` +
          ` altered/fabricated). No proof artifact is written for a non-member.\n`,
        MEMBERSHIP_TRUST_NOTE + "\n",
      ];
    }
    for (const chunk of chunks) {
      emit(chunk);
    }
  }

  return {
    member: built.member,
    contentHash: built.contentHash,
    relPath: built.relPath,
    root: built.root,
    out: outAbs,
  };
}
|
|
1210
|
+
|
|
1211
|
+
// Outcomes of `vh dataset verify-proof`. Distinct from cli/proof.js's on-chain STATUS: this command is
|
|
1212
|
+
// PURELY OFFLINE (no anchored-root check), so the only verdicts are CONFIRMED (folds to the root) or
|
|
1213
|
+
// REJECTED (does not). Confirming the root is itself anchored on-chain is `vh verify-proof`'s job.
|
|
1214
|
+
const MEMBERSHIP_STATUS = Object.freeze({
  // The proof folds offline to its recorded root (set-membership holds).
  CONFIRMED: "CONFIRMED",
  // It does not (a fabricated/altered file, or a tampered proof/leaf/root).
  REJECTED: "REJECTED",
});
|
|
1218
|
+
|
|
1219
|
+
/**
 * Drive `vh dataset verify-proof <proof>`: read a proof artifact with `readProofArtifact`, fold
 * it with `recomputeFold`, and report CONFIRMED or REJECTED.
 *
 * The verdict follows `fold.offlineOk`. The text output additionally shows `fold.leafMatches`
 * and `fold.foldsToRoot` and, on rejection, says which of the two failed (the leaf check is
 * reported in preference to the fold check).
 *
 * @param {object} opts
 * @param {string} opts.artifact   path to a proof artifact
 * @param {boolean} [opts.json]    emit the result object as a single JSON line
 * @param {(s:string)=>void} [opts.stdout]  output sink (defaults to process.stdout.write)
 * @returns {{
 *   status: "CONFIRMED"|"REJECTED",
 *   leafMatches: boolean, foldsToRoot: boolean,
 *   relPath: string, contentHash: string, leaf: string, root: string,
 *   computedRoot: string, proofLength: number,
 * }}
 */
function runDatasetVerifyProof(opts) {
  if (!opts || typeof opts !== "object") {
    throw new Error("runDatasetVerifyProof requires options");
  }
  const emit = opts.stdout || ((text) => process.stdout.write(text));
  if (!opts.artifact) {
    throw new Error("runDatasetVerifyProof requires a <proof> artifact path");
  }

  // Reading the artifact and folding it is the entire verification.
  const artifact = readProofArtifact(opts.artifact);
  const fold = recomputeFold(artifact);

  let status;
  if (fold.offlineOk) {
    status = MEMBERSHIP_STATUS.CONFIRMED;
  } else {
    status = MEMBERSHIP_STATUS.REJECTED;
  }

  // Key order is kept stable because this object is serialized as-is for --json.
  const result = {
    status,
    leafMatches: fold.leafMatches,
    foldsToRoot: fold.foldsToRoot,
    relPath: artifact.relPath,
    contentHash: artifact.contentHash,
    leaf: artifact.leaf,
    root: artifact.root,
    computedRoot: fold.computedRoot,
    proofLength: artifact.proof.length,
  };

  if (opts.json) {
    emit(`${JSON.stringify(result)}\n`);
    return result;
  }

  const yesNo = (flag) => (flag ? "yes" : "NO");

  // Closing explanation: confirmed, or which of the two checks failed.
  let verdict;
  if (status === MEMBERSHIP_STATUS.CONFIRMED) {
    verdict =
      " CONFIRMED: the file is a leaf of the dataset manifest's Merkle root (set-membership proven\n" +
      " OFFLINE). This binds the file's relPath + bytes to that root; it does NOT prove the file is\n" +
      " unaltered since a date, nor authorship/licensing.\n";
  } else if (!result.leafMatches) {
    verdict =
      " REJECTED: the artifact's leaf does NOT equal pathLeaf(relPath, contentHash) — the leaf,\n" +
      " contentHash, or relPath was altered. A fabricated/tampered member is caught here offline.\n";
  } else {
    verdict =
      " REJECTED: the proof does NOT fold to the recorded root — a proof sibling (or the root) was\n" +
      " altered. The file is NOT a member of that root. Caught here offline, no dataset needed.\n";
  }

  // Each array element is handed to the sink as its own call.
  const chunks = [
    MEMBERSHIP_TRUST_NOTE + "\n\n",
    ` proof artifact: ${opts.artifact}\n`,
    ` relPath: ${result.relPath}\n`,
    ` contentHash: ${result.contentHash}\n`,
    ` leaf: ${result.leaf}\n`,
    ` root: ${result.root}\n`,
    ` proof siblings: ${result.proofLength}\n\n`,
    " offline recompute (no dataset, no network, no key):\n",
    ` leaf re-derived from contentHash+relPath: ${yesNo(result.leafMatches)}\n`,
    ` proof folds to the recorded root: ${yesNo(result.foldsToRoot)}\n\n`,
    ` result: ${result.status}\n`,
    verdict,
  ];
  for (const chunk of chunks) {
    emit(chunk);
  }

  return result;
}
|
|
1296
|
+
|
|
1297
|
+
// =================================================================================================
|
|
1298
|
+
// `vh dataset attest <manifest> [--json] [--out <p>]` — the deterministic, canonical UNSIGNED
|
|
1299
|
+
// attestation payload the human signing/timestamp trust-root (P-3) will sign.
|
|
1300
|
+
//
|
|
1301
|
+
// WHY THIS EXISTS
|
|
1302
|
+
// DataLedger's most-repeated limit is that a manifest is NOT a timestamp: until someone with a real
|
|
1303
|
+
// signing key / timestamp anchor signs it, a manifest proves only set-membership/identity — the same
|
|
1304
|
+
// thing it already proves — NOT "unaltered since date T". Standing up that key/timestamp anchor is a
|
|
1305
|
+
// HUMAN-owned trust-root (P-3, needs-human). But the deterministic, canonical BYTES that human/service
|
|
1306
|
+
// would sign are fully buildable NOW, purely offline. Producing them turns the future human signing
|
|
1307
|
+
// step from "design AND sign a payload" into "sign THIS exact file" — a one-liner.
|
|
1308
|
+
//
|
|
1309
|
+
// `vh dataset attest <manifest>` reads the manifest via the SAME strict `readManifest` (a corrupt/
|
|
1310
|
+
// foreign manifest is rejected, never half-accepted) and emits a versioned, strictly-validated
|
|
1311
|
+
// attestation ENVELOPE that commits to the dataset IDENTITY a signer signs over:
|
|
1312
|
+
// - `root` : the manifest's Merkle root (commits to file NAMES and bytes)
|
|
1313
|
+
// - `fileCount` : the number of committed files
|
|
1314
|
+
// - `manifestDigest`: keccak256 over a CANONICAL serialization of the manifest's `files` array
|
|
1315
|
+
// (see canonicalization below) — so the same committed file set always yields
|
|
1316
|
+
// the same digest, and ANY edit to the committed set changes it.
|
|
1317
|
+
// - `note` : the standing trust caveat (NOT a timestamp; signing is human-owned, P-3).
|
|
1318
|
+
// PURELY OFFLINE: no tree, no provider, no key, no network.
|
|
1319
|
+
//
|
|
1320
|
+
// CANONICALIZATION (documented exactly so signing the bytes is well-defined)
|
|
1321
|
+
// The `manifestDigest` is keccak256(utf8(canonicalFiles)), where canonicalFiles is the manifest's
|
|
1322
|
+
// `files` entries projected to ONLY the root-committed fields { relPath, contentHash, leaf } (the
|
|
1323
|
+
// UNTRUSTED `hints` are deliberately EXCLUDED — they are not bound into the root, so they must not
|
|
1324
|
+
// change the identity a signer commits to), each entry serialized with its keys in the FIXED order
|
|
1325
|
+
// [relPath, contentHash, leaf], the entries ORDERED by relPath ascending (a total, deterministic
|
|
1326
|
+
// order), and the whole array JSON-serialized with NO insignificant whitespace. So two runs over the
|
|
1327
|
+
// same committed file set produce byte-identical canonical bytes regardless of the on-disk manifest's
|
|
1328
|
+
// key order or whitespace — which is the property that makes signing the bytes well-defined.
|
|
1329
|
+
//
|
|
1330
|
+
// The ENVELOPE itself is then serialized canonically the same way (fixed top-level key order, no
|
|
1331
|
+
// insignificant whitespace, trailing newline) so `--json` / `--out` emit byte-deterministic bytes.
|
|
1332
|
+
//
|
|
1333
|
+
// UNSIGNED MARKER (never imply a signature/timestamp exists)
|
|
1334
|
+
// The envelope carries an explicit `signed: false` and a `signature: null` slot the human/timestamp
|
|
1335
|
+
// step fills in. Until a signature is attached, the artifact proves only the same set-membership/
|
|
1336
|
+
// identity the manifest already does — NOT "unaltered since date T". This is stated in-band in `note`.
|
|
1337
|
+
|
|
1338
|
+
// `kind` tag carried by every dataset attestation envelope.
const ATTESTATION_KIND = "verifyhash.dataset-attestation";
// Schema version written into newly built envelopes.
const ATTESTATION_SCHEMA_VERSION = 1;
// Schema versions accepted on validation (currently just the one that is emitted).
const SUPPORTED_ATTESTATION_SCHEMA_VERSIONS = Object.freeze([ATTESTATION_SCHEMA_VERSION]);
|
|
1341
|
+
|
|
1342
|
+
// The standing trust caveat carried IN-BAND in every attestation envelope. Load-bearing, not
|
|
1343
|
+
// decorative: a reader (or the future human signer) must never mistake this UNSIGNED payload for a
|
|
1344
|
+
// time-anchored proof. It states plainly that signing is the human-owned trust-root (P-3, needs-human).
|
|
1345
|
+
// Single-line caveat; the fragments below are joined with exactly one space between them.
const ATTESTATION_TRUST_NOTE = [
  "This is the UNSIGNED attestation payload. It commits to the dataset IDENTITY (Merkle root,",
  "fileCount, and a canonical manifestDigest over the committed file set). It is NOT signed and NOT",
  "timestamped: `signed` is false and `signature` is null until a human/timestamp trust-root fills",
  "them in. Standing up a real signing key / timestamp anchor is the human-owned trust-root",
  "(needs-human, P-3). Until a signature is attached, this proves only the same set-membership /",
  "identity the manifest already does — NOT that the dataset is unaltered since a date T.",
].join(" ");
|
|
1352
|
+
|
|
1353
|
+
/**
 * Produce the canonical JSON text of a manifest's file set — the string the manifest digest is
 * computed over.
 *
 * Every entry of `manifest.files` is reduced to exactly three keys, written in the order
 * relPath, contentHash, leaf (any other property, such as `hints`, is dropped). The reduced
 * entries are ordered by `relPath` ascending using plain `<` / `>` string comparison, and the
 * array is serialized with `JSON.stringify` without indentation. The input manifest and its
 * `files` array are not modified.
 *
 * @param {object} manifest  an object with a `files` array of { relPath, contentHash, leaf }
 * @returns {string} compact JSON for the ordered, reduced entries
 */
function canonicalManifestFiles(manifest) {
  // Plain string comparison on relPath; equal paths compare as 0.
  const byRelPath = (left, right) => {
    if (left.relPath < right.relPath) return -1;
    if (left.relPath > right.relPath) return 1;
    return 0;
  };

  // `map` builds fresh objects in a fresh array, so the in-place sort cannot touch the caller's
  // data. Each literal's key order (relPath, contentHash, leaf) is what JSON.stringify emits.
  const projected = manifest.files
    .map(({ relPath, contentHash, leaf }) => ({ relPath, contentHash, leaf }))
    .sort(byRelPath);

  return JSON.stringify(projected);
}
|
|
1377
|
+
|
|
1378
|
+
/**
 * Compute the manifest digest: `keccak256` over the UTF-8 bytes (`toUtf8Bytes`) of the string
 * returned by `canonicalManifestFiles(manifest)`.
 *
 * @param {object} manifest  a manifest object accepted by canonicalManifestFiles
 * @returns {string} whatever `keccak256` returns for those bytes (per the original docs, a
 *   0x-prefixed 32-byte hex digest)
 */
function manifestDigest(manifest) {
  const canonicalJson = canonicalManifestFiles(manifest);
  const canonicalBytes = toUtf8Bytes(canonicalJson);
  return keccak256(canonicalBytes);
}
|
|
1388
|
+
|
|
1389
|
+
/**
 * Assemble the UNSIGNED attestation envelope for a manifest.
 *
 * The manifest is first passed through `validateManifest`. The envelope then records the
 * attestation kind and schema version, the standing trust note, the manifest's `root`, the
 * number of entries in `manifest.files`, and `manifestDigest(manifest)`, with `signed` fixed to
 * false and `signature` fixed to null. The finished envelope is passed through
 * `validateAttestation` before being returned. No file or network access happens in this
 * function itself.
 *
 * @param {object} manifest  a manifest object
 * @returns {object} the envelope, with keys in the order kind, schemaVersion, note, root,
 *   fileCount, manifestDigest, signed, signature
 * @throws {Error} whatever validateManifest / validateAttestation throw
 */
function buildAttestation(manifest) {
  // Re-validate here so a hand-built manifest from a programmatic caller is checked too.
  validateManifest(manifest);

  // Dataset identity the envelope commits to.
  const root = manifest.root;
  const fileCount = manifest.files.length;
  const digest = manifestDigest(manifest);

  const envelope = {
    kind: ATTESTATION_KIND,
    schemaVersion: ATTESTATION_SCHEMA_VERSION,
    note: ATTESTATION_TRUST_NOTE,
    root,
    fileCount,
    manifestDigest: digest,
    // Explicit unsigned markers: this function never produces a signed envelope.
    signed: false,
    signature: null,
  };

  validateAttestation(envelope);
  return envelope;
}
|
|
1419
|
+
|
|
1420
|
+
/**
 * Strictly check a parsed attestation envelope, throwing an Error for the FIRST problem found.
 * The object is never modified and no defaults are filled in.
 *
 * Checks, in order:
 *   1. the value is a non-null, non-array object;
 *   2. `kind` equals ATTESTATION_KIND;
 *   3. `schemaVersion` is listed in SUPPORTED_ATTESTATION_SCHEMA_VERSIONS;
 *   4. `root`, then `manifestDigest`, is a string matching HEX32_RE;
 *   5. `fileCount` is an integer of at least 1;
 *   6. `signed` is exactly false;
 *   7. `signature` is exactly null.
 * The `note` field is not inspected.
 *
 * @param {any} obj  the candidate envelope
 * @returns {object} the same object, when every check passes
 * @throws {Error} describing the first failed check
 */
function validateAttestation(obj) {
  const isPlainObject = obj !== null && typeof obj === "object" && !Array.isArray(obj);
  if (!isPlainObject) {
    throw new Error("dataset attestation must be a JSON object");
  }

  if (obj.kind !== ATTESTATION_KIND) {
    const got = JSON.stringify(obj.kind);
    const expected = JSON.stringify(ATTESTATION_KIND);
    throw new Error(`not a verifyhash dataset attestation (kind: ${got}; expected ${expected})`);
  }

  if (!SUPPORTED_ATTESTATION_SCHEMA_VERSIONS.includes(obj.schemaVersion)) {
    throw new Error(
      `unsupported dataset attestation schemaVersion: ${JSON.stringify(obj.schemaVersion)} ` +
        `(this build understands ${JSON.stringify(SUPPORTED_ATTESTATION_SCHEMA_VERSIONS)})`
    );
  }

  // Both identity hashes share one shape check; `root` is examined before `manifestDigest`.
  ["root", "manifestDigest"].forEach((field) => {
    const value = obj[field];
    if (typeof value !== "string" || !HEX32_RE.test(value)) {
      throw new Error(
        `dataset attestation ${field} must be a 0x-prefixed 32-byte hex string, got: ${String(obj[field])}`
      );
    }
  });

  const fileCountOk = Number.isInteger(obj.fileCount) && obj.fileCount >= 1;
  if (!fileCountOk) {
    throw new Error(
      `dataset attestation fileCount must be a positive integer, got: ${String(obj.fileCount)}`
    );
  }

  // An envelope claiming to be signed is rejected, not believed: only the exact values
  // `false` and `null` pass.
  if (obj.signed !== false) {
    throw new Error(
      `dataset attestation signed must be false (this build emits/reads only the UNSIGNED payload; ` +
        `attaching a real signature is the human-owned trust-root, P-3), got: ${String(obj.signed)}`
    );
  }
  if (obj.signature !== null) {
    throw new Error(
      `dataset attestation signature must be null in the UNSIGNED payload, got: ${String(obj.signature)}`
    );
  }

  return obj;
}
|
|
1474
|
+
|
|
1475
|
+
/**
 * Produce the canonical, byte-deterministic serialization of an UNSIGNED attestation envelope.
 *
 * The envelope is validated first (validateAttestation throws on anything malformed). The output is
 * `JSON.stringify` of a fresh object whose top-level keys are inserted in one fixed order, with no
 * spacing argument (so no insignificant whitespace), followed by exactly one trailing newline. The same
 * envelope therefore always yields the same string, which is what makes "sign these bytes"
 * well-defined. This string is what the `--json` form emits and what the `--out` file holds.
 *
 * @param {object} env a validated attestation envelope
 * @returns {string} the canonical serialization (newline-terminated)
 */
function serializeAttestation(env) {
  validateAttestation(env);

  // The canonical top-level key order. String keys keep insertion order, so copying the fields in this
  // sequence pins the order JSON.stringify emits them in.
  const canonicalKeyOrder = [
    "kind",
    "schemaVersion",
    "note",
    "root",
    "fileCount",
    "manifestDigest",
    "signed",
    "signature",
  ];

  const ordered = {};
  for (const key of canonicalKeyOrder) {
    ordered[key] = env[key];
  }

  return `${JSON.stringify(ordered)}\n`;
}
|
|
1499
|
+
|
|
1500
|
+
/**
 * Read, parse, and STRICTLY validate the UNSIGNED attestation envelope stored at `attestationPath`.
 *
 * Steps, in order: (1) reject a missing/non-string path, (2) read the file as UTF-8, (3) parse it as
 * JSON, (4) hand the parsed value to validateAttestation, which throws on any malformed/edited envelope.
 * The envelope is returned only if every step succeeds — it is never half-accepted.
 *
 * The read and parse failures are re-thrown with a message that names the file; the underlying error
 * (e.g. the fs error carrying its `code`, or the JSON `SyntaxError`) is preserved on the new error's
 * `cause` so callers can still inspect it. On runtimes that do not support the `cause` option the extra
 * constructor argument is simply ignored, so the messages are unchanged everywhere.
 *
 * @param {string} attestationPath path to the attestation file
 * @returns {object} the validated envelope
 * @throws {Error} if the path is missing/not a string, the file cannot be read, the content is not
 *   valid JSON, or the parsed value fails validateAttestation
 */
function readAttestation(attestationPath) {
  if (!attestationPath || typeof attestationPath !== "string") {
    throw new Error("readAttestation requires an attestation file path");
  }

  let raw;
  try {
    raw = fs.readFileSync(attestationPath, "utf8");
  } catch (e) {
    // Keep the original fs error reachable (errno code, stack) instead of flattening it to a string.
    throw new Error(`cannot read dataset attestation at ${attestationPath}: ${e.message}`, { cause: e });
  }

  let obj;
  try {
    obj = JSON.parse(raw);
  } catch (e) {
    // Same discipline for the parse failure: the SyntaxError stays attached as `cause`.
    throw new Error(`dataset attestation at ${attestationPath} is not valid JSON: ${e.message}`, {
      cause: e,
    });
  }

  return validateAttestation(obj);
}
|
|
1526
|
+
|
|
1527
|
+
// =================================================================================================
|
|
1528
|
+
// SIGNED-attestation envelope (T-17.1) — a detached signature WRAPPED AROUND the canonical UNSIGNED
|
|
1529
|
+
// payload, never an edit of it.
|
|
1530
|
+
//
|
|
1531
|
+
// WHY A SEPARATE KIND
|
|
1532
|
+
// The UNSIGNED attestation (above) deliberately hard-asserts `signed:false`/`signature:null`: that
|
|
1533
|
+
// guarantee must NEVER be loosened, because a reader who trusts `serializeAttestation`'s bytes is
|
|
1534
|
+
// trusting that they carry NO signature claim. So instead of mutating that payload to add a
|
|
1535
|
+
// signature, we WRAP it: a new, separately-versioned container kind that embeds the EXACT canonical
|
|
1536
|
+
// unsigned bytes (byte-for-byte the string `serializeAttestation` emits) as a string, alongside a
|
|
1537
|
+
// detached `signature` block. The embedded unsigned bytes are re-parsed and re-validated by the SAME
|
|
1538
|
+
// `validateAttestation`, so the wrapped payload is still provably `signed:false`/`signature:null` —
|
|
1539
|
+
// wrapping adds a vouch, it never edits the thing vouched for.
|
|
1540
|
+
//
|
|
1541
|
+
// THE SCHEME (detached, NOT EIP-712)
|
|
1542
|
+
// `eip191-personal-sign` means: the signer ran `personal_sign` (EIP-191) over the EXACT canonical
|
|
1543
|
+
// unsigned bytes (the UTF-8 of the embedded `attestation` string, including its single trailing
|
|
1544
|
+
// newline). We use a detached signature — not EIP-712 typed data — precisely so the signed message
|
|
1545
|
+
// IS the canonical payload bytes verbatim, with no separate domain/struct encoding to drift from
|
|
1546
|
+
// them. This container does NOT itself verify the signature (the loop holds no key and does no
|
|
1547
|
+
// crypto recovery — see T-17.2); it asserts the STRUCTURE is well-formed and the embedded payload is
|
|
1548
|
+
// a valid UNSIGNED attestation.
|
|
1549
|
+
//
|
|
1550
|
+
// WHAT IT PROVES / DOES NOT PROVE
|
|
1551
|
+
// A valid signed container asserts: the holder of `signer`'s key vouched for THIS dataset identity
|
|
1552
|
+
// (the embedded root/fileCount/manifestDigest) at signing time. It does NOT prove a timestamp — there
|
|
1553
|
+
// is no "unaltered since date T" unless `scheme` is a timestamp authority (still P-3, needs-human) —
|
|
1554
|
+
// and EVERY caveat of the embedded UNSIGNED payload (the {source,license} hints are untrusted, the
|
|
1555
|
+
// digest commits to the CLAIMED file set, not re-derived content) still applies verbatim.
|
|
1556
|
+
|
|
1557
|
+
// Identity of the SIGNED container: its own kind string and its own, separately-versioned schema.
const SIGNED_ATTESTATION_KIND = "verifyhash.dataset-attestation-signed";
const SIGNED_ATTESTATION_SCHEMA_VERSION = 1;
const SUPPORTED_SIGNED_ATTESTATION_SCHEMA_VERSIONS = Object.freeze([1]);

// Detached signature schemes understood by this build. The list is taken from cli/core rather than
// redefined here, so it is the IDENTICAL set shared across the product family. `eip191-personal-sign`
// means EIP-191 personal_sign over the canonical UNSIGNED attestation bytes (a 65-byte r||s||v
// secp256k1 signature).
const SIGNED_ATTESTATION_SCHEMES = coreAttestation.SIGNED_ATTESTATION_SCHEMES;

// Standing trust caveat carried IN-BAND in every signed container. It ends with the dataset TRUST_NOTE
// VERBATIM (so the dataset caveats never drift) and prefixes only the signed-container-specific
// statement: the holder of `signer`'s key vouched for THIS dataset identity at signing time; that is
// NOT a timestamp proof (no "unaltered since date T" unless `scheme` is a timestamp authority — still
// P-3); and EVERY caveat of the embedded UNSIGNED payload still applies.
const SIGNED_ATTESTATION_TRUST_NOTE = "".concat(
  "This is a SIGNED attestation container: it wraps (never edits) the EXACT canonical UNSIGNED ",
  "attestation bytes in `attestation` and attaches a detached signature. It asserts that the holder of ",
  "the `signer` key vouched for THIS dataset identity (the embedded root, fileCount, manifestDigest) at ",
  "signing time. It does NOT prove a timestamp: there is no \"unaltered since a date T\" unless the ",
  "scheme is a timestamp authority (still needs-human, P-3). Every caveat of the embedded UNSIGNED ",
  "payload still applies. ",
  TRUST_NOTE
);

// DataLedger's framing for the GENERIC core signed-container machinery. The core owns the envelope
// logic (the wrap-don't-edit invariant, the scheme list, signer recovery); this frozen object hands it
// ONLY the DataLedger-specific pieces: kind/schema/note, the "signed dataset attestation" label (so
// error strings stay byte-identical), and the UNSIGNED-payload codec (validate/serialize) the core uses
// to re-validate the embedded payload — the core never needs dataset-specific knowledge (no back-edge).
const SIGNED_ATTESTATION_CFG = Object.freeze({
  kind: SIGNED_ATTESTATION_KIND,
  schemaVersion: SIGNED_ATTESTATION_SCHEMA_VERSION,
  supportedSchemaVersions: SUPPORTED_SIGNED_ATTESTATION_SCHEMA_VERSIONS,
  note: SIGNED_ATTESTATION_TRUST_NOTE,
  label: "signed dataset attestation",
  validateUnsigned: validateAttestation,
  serializeUnsigned: serializeAttestation,
});
|
|
1594
|
+
|
|
1595
|
+
/**
 * Strictly validate a parsed SIGNED-attestation container.
 *
 * This is a thin wrapper: all the work is delegated to the generic core validator, called with
 * DataLedger's framing (SIGNED_ATTESTATION_CFG). Per the core's documented contract it throws an Error
 * describing the FIRST problem, never mutates and never fills defaults, and rejects: a wrong
 * kind/schemaVersion, a non-string embedded `attestation`, a missing/non-object `signature` block, an
 * unknown `scheme`, a malformed `signer` address, a missing/!hex `signature` value, or an embedded
 * `attestation` that does not re-validate as a sound UNSIGNED attestation (it must STILL be
 * `signed:false`/`signature:null` — wrapping never edits). It never half-accepts.
 *
 * @param {any} obj
 * @returns {object} the same object, if valid
 */
function validateSignedAttestation(obj) {
  // The "signed dataset attestation" label inside the config keeps every error string byte-identical;
  // the embedded UNSIGNED payload is re-checked through the validate/serialize codec the config carries.
  const validated = coreAttestation.validateSignedAttestation(obj, SIGNED_ATTESTATION_CFG);
  return validated;
}
|
|
1613
|
+
|
|
1614
|
+
/**
 * Assemble and validate a SIGNED-attestation container from a validated UNSIGNED attestation envelope
 * plus a detached signature triple.
 *
 * Thin wrapper over the generic core builder, called with DataLedger's framing
 * (SIGNED_ATTESTATION_CFG). This function itself performs NO signing and NO key handling — it only
 * forwards `params`. Per the core's documented contract, the container embeds the EXACT canonical
 * unsigned bytes (serializeAttestation(attestation)) as a string so the signed-over bytes are
 * unambiguous, attaches { scheme, signer, signature }, and is strictly validated as a whole, so a
 * corrupt container is never produced (the call throws instead).
 *
 * What the result asserts: the holder of `signer`'s key vouched for THIS dataset identity at signing
 * time. What it does NOT prove: a timestamp (no "unaltered since date T" unless `scheme` is a timestamp
 * authority — still P-3, needs-human). EVERY caveat of the embedded UNSIGNED payload applies verbatim
 * (the {source,license} hints are untrusted; the digest commits to the CLAIMED file set, not re-derived
 * content). Signing WRAPS the unsigned payload, it never edits it.
 *
 * @param {object} params
 * @param {object} params.attestation  a validated UNSIGNED attestation envelope (from buildAttestation/readAttestation)
 * @param {string} params.scheme       one of SIGNED_ATTESTATION_SCHEMES (e.g. "eip191-personal-sign")
 * @param {string} params.signer       the claimed 0x-address of the signer
 * @param {string} params.signature    the 0x-hex detached signature over serializeAttestation(attestation)
 * @returns {object} a validated signed-attestation container
 */
function buildSignedAttestation(params) {
  // Pure delegation: the DataLedger serializer/validator reach the core through SIGNED_ATTESTATION_CFG.
  const container = coreAttestation.buildSignedAttestation(params, SIGNED_ATTESTATION_CFG);
  return container;
}
|
|
1641
|
+
|
|
1642
|
+
/**
 * Serialize a signed-attestation container to its canonical, byte-deterministic string.
 *
 * Thin wrapper over the generic core serializer, called with DataLedger's framing
 * (SIGNED_ATTESTATION_CFG). Per the core's documented contract the output follows the same discipline
 * as serializeAttestation: a FIXED top-level (and signature-block) key order, NO insignificant
 * whitespace, and a single trailing newline — so two runs over the same inputs yield an identical
 * string.
 *
 * @param {object} container a validated signed-attestation container
 * @returns {string} the canonical serialization (newline-terminated)
 */
function serializeSignedAttestation(container) {
  const canonicalBytes = coreAttestation.serializeSignedAttestation(container, SIGNED_ATTESTATION_CFG);
  return canonicalBytes;
}
|
|
1654
|
+
|
|
1655
|
+
/**
 * Read, parse, and STRICTLY validate the signed-attestation container stored at `signedPath`.
 *
 * Thin wrapper over the generic core reader, called with DataLedger's framing (SIGNED_ATTESTATION_CFG;
 * its label keeps the I/O error strings byte-identical). Per the core's documented contract this
 * round-trips with serializeSignedAttestation: a malformed/edited/foreign container (wrong
 * kind/schemaVersion, unknown scheme, malformed signer, missing/!hex signature, a non-canonical or
 * itself-"signed" embedded payload) is rejected, never half-accepted, and a missing file or invalid
 * JSON throws too.
 *
 * @param {string} signedPath
 * @returns {object} the validated container
 */
function readSignedAttestation(signedPath) {
  const container = coreAttestation.readSignedAttestation(signedPath, SIGNED_ATTESTATION_CFG);
  return container;
}
|
|
1669
|
+
|
|
1670
|
+
/**
 * Orchestrate `vh dataset attest <manifest> [--json] [--out <p>]`.
 *
 * Flow: read the manifest through the strict `readManifest`, build the UNSIGNED attestation envelope
 * with `buildAttestation`, serialize it to its canonical bytes, then emit:
 *   - with `--out`: the exact canonical bytes are written to the caller's EXPLICIT path (resolved to an
 *     absolute path; never silently the cwd). This file write is the function's only filesystem effect.
 *   - stdout receives exactly ONE chunk per call: the canonical bytes when `--json` is set or when no
 *     `--out` was given; otherwise the human success line naming the file written.
 * `--json` is the machine form AND is itself the canonical bytes, so its stdout can be piped straight
 * into a signer. Nothing in this function touches a tree, a provider, a key, or the network.
 *
 * @param {object}  opts
 * @param {string}  opts.manifest  path to a manifest written by `vh dataset build`
 * @param {boolean}[opts.json]     emit the canonical machine form (which is the same canonical bytes)
 * @param {string} [opts.out]      write the canonical payload to this explicit path (caller-chosen; never cwd)
 * @param {(s:string)=>void}[opts.stdout]  sink for stdout (default process.stdout.write); injectable for tests
 * @returns {{ envelope: object, canonical: string, out: string|null }}
 */
function runDatasetAttest(opts) {
  if (!opts || typeof opts !== "object") throw new Error("runDatasetAttest requires options");

  const manifestPath = opts.manifest;
  const emit = opts.stdout || ((chunk) => process.stdout.write(chunk));
  if (!manifestPath) throw new Error("runDatasetAttest requires a <manifest> path");

  // Strict read BEFORE any payload is built: a corrupt/edited/foreign manifest is rejected here, never
  // half-accepted. The file SET it commits to is the TRUSTED basis of the attestation identity.
  const manifest = readManifest(manifestPath);
  const envelope = buildAttestation(manifest);

  // One canonical string serves every output form (printed, written, `--json`).
  const canonical = serializeAttestation(envelope);

  // Resolve the caller-chosen path to absolute so the success line names precisely the file written.
  const outAbs = opts.out ? path.resolve(opts.out) : null;
  if (outAbs !== null) {
    fs.writeFileSync(outAbs, canonical);
  }

  // `--json` stays pure canonical bytes (no extra lines) so its stdout IS the signable payload. Without
  // `--json`, a written file is acknowledged with the success line; with neither flag the payload
  // itself is printed.
  const wantsPayloadOnStdout = Boolean(opts.json) || outAbs === null;
  emit(wantsPayloadOnStdout ? canonical : `dataset attestation written: ${outAbs}\n`);

  return { envelope, canonical, out: outAbs };
}
|
|
1721
|
+
|
|
1722
|
+
// =================================================================================================
|
|
1723
|
+
// `vh dataset sign <manifest> --key-env <VAR> | --key-file <path> [--out <p>] [--json]` — read a
|
|
1724
|
+
// HUMAN-supplied key, sign the UNSIGNED dataset attestation, write the SIGNED container (T-19.2).
|
|
1725
|
+
//
|
|
1726
|
+
// WHY THIS EXISTS
|
|
1727
|
+
// `vh dataset attest` emits the canonical UNSIGNED identity bytes a signer signs; T-19.1 added the pure
|
|
1728
|
+
// `signAttestation` core that turns a payload + a signer OBJECT into a wrapped, signed container. This
|
|
1729
|
+
// command is the CLI glue that lets a HUMAN actually sign: it reads a key the human provisioned OUTSIDE
|
|
1730
|
+
// this tool (an env var or a key file), constructs an in-process ethers Wallet from it, and routes it
|
|
1731
|
+
// through the SAME `signAttestation` core. The loop itself never generates or holds a key — the key is
|
|
1732
|
+
// 100% caller-supplied.
|
|
1733
|
+
//
|
|
1734
|
+
// KEY HYGIENE (load-bearing). The key source is EXACTLY ONE of `--key-env`/`--key-file`; neither, both, a
|
|
1735
|
+
// missing env var, an unreadable file, or a malformed/zero key HARD-ERRORS BEFORE any signing, with a
|
|
1736
|
+
// message that NEVER includes the key material. The key is read, used to build the Wallet, used to sign,
|
|
1737
|
+
// and discarded; success/`--json` output prints ONLY the signer ADDRESS (public), the output path, and
|
|
1738
|
+
// the scheme — never the key.
|
|
1739
|
+
//
|
|
1740
|
+
// TRUST POSTURE (P-3, verbatim). This signs the dataset IDENTITY with the key YOU supplied. A self-managed
|
|
1741
|
+
// key attests "the signer says so" — it is NOT an independent, trusted TIMESTAMP ("existed/unaltered
|
|
1742
|
+
// since date T" still needs the human-owned signing/timestamp trust-root, P-3). The in-band container
|
|
1743
|
+
// note (SIGNED_ATTESTATION_TRUST_NOTE) and the human output both say so plainly.
|
|
1744
|
+
|
|
1745
|
+
// Caveat the human-readable `sign` output LEADS with (and the `--json` form carries in `note`). It
// states the P-3 posture verbatim: the dataset identity is signed with the caller's OWN key, and "the
// signer says so" is NOT a trusted timestamp.
const SIGN_TRUST_NOTE = [
  "This signs the dataset IDENTITY (root, fileCount, manifestDigest) with the key YOU supplied. A ",
  "self-managed key attests \"the signer says so\" — it is NOT an independent, trusted TIMESTAMP: ",
  '"existed/unaltered since a date T" still needs the human-owned signing/timestamp trust-root ',
  "(needs-human, P-3). The key must be one YOU provisioned OUTSIDE this tool.",
].join("");
|
|
1752
|
+
|
|
1753
|
+
/**
 * Orchestrate `vh dataset sign <manifest> --key-env <VAR> | --key-file <path> [--out <p>] [--json]`.
 *
 * Flow, in this exact order:
 *   1. resolve the HUMAN-supplied key into a Wallet via the shared `coreAttestation.loadSigningWallet`
 *      (which is expected to hard-error, key-free, on neither/both sources or a bad key);
 *   2. read the manifest via the strict `readManifest`;
 *   3. build the UNSIGNED payload via the EXISTING `buildAttestation` path (NO re-implementation);
 *   4. sign through the T-19.1 `coreAttestation.signAttestation` core with SIGNED_ATTESTATION_CFG;
 *   5. serialize the SIGNED container and emit it (to `--out` and/or stdout).
 * The key is resolved FIRST so a bad key fails before the manifest is even read.
 *
 * KEY HYGIENE: this function never returns, writes, or prints the key or the Wallet. Output carries
 * ONLY the signer address (public), the output path, and the scheme.
 *
 * @param {object}  opts
 * @param {string}  opts.manifest  path to a manifest written by `vh dataset build`
 * @param {string} [opts.keyEnv]   env var holding the signing key (EXACTLY ONE of keyEnv/keyFile)
 * @param {string} [opts.keyFile]  path to a key file the human created (EXACTLY ONE of keyEnv/keyFile)
 * @param {boolean}[opts.json]     emit a machine-readable { signer, out, scheme, container, ... } object;
 *                                 with NO --out the `container` field carries the canonical signed bytes so
 *                                 `--json` never silently drops the artifact (parity with `attest --json`)
 * @param {string} [opts.out]      write the signed container to this explicit path (caller-chosen; never cwd)
 * @param {(s:string)=>void}[opts.stdout]  sink for stdout (default process.stdout.write); injectable for tests
 * @returns {Promise<{ container: object, canonical: string, signer: string, scheme: string, out: string|null }>}
 */
async function runDatasetSign(opts) {
  if (!opts || typeof opts !== "object") throw new Error("runDatasetSign requires options");

  const manifestPath = opts.manifest;
  const keySource = { keyEnv: opts.keyEnv, keyFile: opts.keyFile };
  const emit = opts.stdout || ((chunk) => process.stdout.write(chunk));
  if (!manifestPath) throw new Error("runDatasetSign requires a <manifest> path");

  // Key first: we never read the manifest only to fail on a bad key, and never sign with junk.
  const { wallet } = coreAttestation.loadSigningWallet(keySource);

  // Strict read: a corrupt/edited/foreign manifest is rejected here, never half-accepted. The file SET
  // it commits to is the TRUSTED basis of the attestation identity.
  const manifest = readManifest(manifestPath);

  // Same unsigned payload `vh dataset attest` would build, routed with the Wallet through the shared
  // signing core, so the container is meant to round-trip through `vh dataset verify-attest`.
  const unsigned = buildAttestation(manifest);
  const container = await coreAttestation.signAttestation(
    { attestation: unsigned, signer: wallet },
    SIGNED_ATTESTATION_CFG
  );
  const canonical = serializeSignedAttestation(container);

  // Both values are PUBLIC: the signer's 0x-address and the scheme name — never the key.
  const { signer, scheme } = container.signature;

  // Optional file output: the EXACT canonical signed bytes go to the caller-chosen path (resolved
  // absolute), never the cwd. This is the only filesystem effect, and nothing about the key is written.
  const outAbs = opts.out ? path.resolve(opts.out) : null;
  if (outAbs !== null) {
    fs.writeFileSync(outAbs, canonical);
  }

  if (opts.json) {
    // Machine form: public fields only. ARTIFACT PARITY with `attest --json`: when there is no --out the
    // canonical signed bytes ride in `container` so the artifact is never dropped; with --out the bytes
    // already live on disk at `out`, so `container` is null (no redundant copy).
    const report = {
      signed: true,
      signer,
      scheme,
      out: outAbs,
      kind: container.kind,
      container: outAbs ? null : canonical,
      note: SIGN_TRUST_NOTE,
    };
    emit(`${JSON.stringify(report)}\n`);
  } else {
    // Human form, one chunk per line: the trust caveat leads, then WHICH key signed (by its PUBLIC
    // address) and the scheme, then either the path written or — with no --out — the canonical bytes.
    const chunks = [
      ` TRUST: ${SIGN_TRUST_NOTE}\n`,
      `signed by ${signer}\n`,
      ` scheme: ${scheme}\n`,
      outAbs ? ` signed dataset attestation written: ${outAbs}\n` : canonical,
    ];
    for (const chunk of chunks) {
      emit(chunk);
    }
  }

  return { container, canonical, signer, scheme, out: outAbs };
}
|
|
1845
|
+
|
|
1846
|
+
// =================================================================================================
|
|
1847
|
+
// `vh dataset verify-attest <signed> [--manifest <m>] [--signer <addr>] [--json]` — an OFFLINE verifier
|
|
1848
|
+
// that confirms a SIGNED attestation container (T-17.1) is genuinely signed and (optionally) binds the
|
|
1849
|
+
// buyer's own dataset.
|
|
1850
|
+
//
|
|
1851
|
+
// WHY THIS EXISTS
|
|
1852
|
+
// A buyer handed a "signed by the publisher" attestation needs ONE command that answers, with no key
|
|
1853
|
+
// and no network: (1) is the embedded signature genuine — i.e. does it recover to the address the
|
|
1854
|
+
// container CLAIMS as `signer`? Without this check a `signer` field is just a self-asserted label.
|
|
1855
|
+
// (2) Optionally: is the recovered signer the SPECIFIC publisher I expected (`--signer <addr>`)? — so a
|
|
1856
|
+
// buyer pins WHO must have signed, not merely that SOMEONE did. (3) Optionally: does the signature bind
|
|
1857
|
+
// the dataset I actually hold (`--manifest <m>`)? — by recomputing the canonical UNSIGNED bytes from MY
|
|
1858
|
+
// manifest via the EXISTING build path and confirming they are byte-identical to the embedded payload.
|
|
1859
|
+
//
|
|
1860
|
+
// PURELY OFFLINE: no tree walk, no provider, no key, no network. The signature recovery is ethers'
|
|
1861
|
+
// `verifyMessage` over the EXACT embedded canonical bytes (the wire is `eip191-personal-sign` = EIP-191
|
|
1862
|
+
// personal_sign over those bytes), so the message recovered-over IS the signed-over payload verbatim.
|
|
1863
|
+
//
|
|
1864
|
+
// TRUST POSTURE (carried verbatim into output). A valid signature proves the HOLDER OF `signer`'s KEY
|
|
1865
|
+
// vouched for THIS dataset identity. It does NOT by itself prove a trustworthy TIMESTAMP ("unaltered
|
|
1866
|
+
// since date T" still needs the human-owned trust-root, P-3), and it does NOT validate that the
|
|
1867
|
+
// dataset's license/source HINTS are genuinely correct (that is the `check` policy gate's untrusted-hint
|
|
1868
|
+
// caveat). The verdict never overclaims past P-3.
|
|
1869
|
+
//
|
|
1870
|
+
// EXIT CODES (mirror the dataset family's data-divergence convention): 0 on ACCEPTED, 3 on REJECTED (so a
|
|
1871
|
+
// buyer's CI can gate "attestation is genuinely signed by our publisher and binds this dataset"), 2 on a
|
|
1872
|
+
// usage error, 1 on a runtime error (missing/corrupt container/manifest). The CLI derives 3 from the
|
|
1873
|
+
// returned `accepted` boolean.
|
|
1874
|
+
|
|
1875
|
+
// The two verdicts `verify-attest` can return. ACCEPTED means every REQUESTED check passed; REJECTED
// means at least one of them failed. Frozen so the shared table cannot be altered at runtime.
const VERIFY_ATTEST_VERDICT = Object.freeze({
  ACCEPTED: "ACCEPTED",
  REJECTED: "REJECTED",
});
|
|
1877
|
+
|
|
1878
|
+
// Standing trust caveat the verify-attest output LEADS with. It ends with the dataset TRUST_NOTE
// verbatim (so the dataset caveats never drift) and prefixes the signing-specific caveat: a valid
// signature proves the key-holder vouched for this dataset IDENTITY; it does NOT prove a timestamp
// (P-3, needs-human) and does NOT validate the license/source hints (the `check` policy gate's
// untrusted-hint caveat). It never overclaims.
const VERIFY_ATTEST_TRUST_NOTE = "".concat(
  "A valid signature proves the HOLDER OF `signer`'s key vouched for THIS dataset identity (the embedded ",
  "root, fileCount, manifestDigest). It does NOT by itself prove a trustworthy TIMESTAMP: \"unaltered ",
  "since a date T\" still needs the human-owned signing/timestamp trust-root (needs-human, P-3). It does ",
  "NOT validate that the dataset's license/source HINTS are genuinely correct (that is the `vh dataset ",
  "check` policy gate's untrusted-hint caveat). ",
  TRUST_NOTE
);
|
|
1889
|
+
|
|
1890
|
+
/**
 * Recover the signing address from a signed-attestation container.
 *
 * Thin wrapper: the container is passed straight to `coreAttestation.recoverSigner`. Per the core's
 * documented contract the recovery uses the container's embedded canonical bytes + signature according
 * to the declared `scheme`; for `eip191-personal-sign` that is ethers'
 * `verifyMessage(<embedded canonical bytes>, signature)` — EIP-191 personal_sign recovery over the EXACT
 * bytes that were signed. The address comes back as a LOWERCASE 0x-hex string, so it compares directly
 * against the container's lowercase `signer` and a lowercased `--signer`. The core throws on an unknown
 * scheme (defense-in-depth: validateSignedAttestation already rejects one) or an unrecoverable
 * signature. This wrapper itself does no I/O and handles no key.
 *
 * @param {object} container a validated signed-attestation container (from readSignedAttestation)
 * @returns {string} the recovered signer address, 0x-prefixed lowercase
 */
function recoverSignedAttestationSigner(container) {
  const recoveredAddress = coreAttestation.recoverSigner(container);
  return recoveredAddress;
}
|
|
1906
|
+
|
|
1907
|
+
/**
 * Offline verification of a signed-attestation container.
 *
 * This wrapper contributes exactly one dataset-specific step: when a `manifest` is supplied (anything
 * other than `undefined`/`null`), it rebuilds the canonical UNSIGNED attestation bytes from that manifest
 * with the same `buildAttestation` + `serializeAttestation` path used elsewhere, and passes them to the
 * generic core as `expectedCanonical`. Everything else (signer recovery, the claimed-signer comparison,
 * the optional expected-signer pin, the byte-identity binding comparison, and the shape of the returned
 * verdict) is done by `coreAttestation.verifySignedAttestation`, whose result is returned untouched.
 *
 * @param {object} params
 * @param {object} params.container a validated signed-attestation container (from readSignedAttestation)
 * @param {string} [params.expectedSigner] OPTIONAL expected publisher 0x-address (--signer)
 * @param {object} [params.manifest] OPTIONAL validated manifest object (from readManifest)
 * @returns {object} the verdict object produced by the core verifier (verdict, accepted, recoveredSigner,
 *   claimedSigner, scheme, checks, expectedSigner, manifestChecked, failedChecks, per the core's contract)
 * @throws {Error} when `params` is missing or not an object; core errors propagate unchanged
 */
function verifySignedAttestation(params) {
  const paramsUsable = Boolean(params) && typeof params === "object";
  if (!paramsUsable) {
    throw new Error("verifySignedAttestation requires { container, [expectedSigner], [manifest] }");
  }

  const { container, expectedSigner, manifest: buyerManifest } = params;

  // Only a manifest that was actually provided triggers the binding check; otherwise the core receives
  // `expectedCanonical: undefined` and (per its contract) skips that check.
  const manifestProvided = !(buyerManifest === undefined || buyerManifest === null);
  const expectedCanonical = manifestProvided
    ? serializeAttestation(buildAttestation(buyerManifest))
    : undefined;

  return coreAttestation.verifySignedAttestation({ container, expectedSigner, expectedCanonical });
}
|
|
1955
|
+
|
|
1956
|
+
/**
 * Build the human-readable report for a verify-attest result, one array entry per output line.
 *
 * Layout, in order: the standing trust caveat (VERIFY_ATTEST_TRUST_NOTE), a blank line, the verdict,
 * the scheme, the recovered and claimed signer, then one line per check: PASS/FAIL for the signature
 * check (always), and PASS/FAIL or `[skip]` for the expected-signer pin and the dataset binding
 * (a check value of `null` means "not requested"). The report ends with an ACCEPTED line, or a REJECTED
 * line naming `failedChecks`, plus a two-line explanation when the manifest binding is among them.
 *
 * NOTE(review): when `r.accepted` is false but `r.failedChecks` is empty, the REJECTED line prints an
 * empty list. Confirm whether any caller (e.g. a revocation downgrade) can produce that combination.
 *
 * @param {object} r the object verifySignedAttestation returns
 * @returns {string[]} lines (no trailing newlines)
 */
function formatVerifyAttest(r) {
  const badge = (passed) => (passed ? "PASS" : "FAIL");

  // Header: caveat first, then the verdict and the signer facts.
  const report = [];
  report.push(" TRUST: " + VERIFY_ATTEST_TRUST_NOTE);
  report.push("");
  report.push(` verify-attest: ${r.verdict}`);
  report.push(` scheme: ${r.scheme}`);
  report.push(` recovered signer: ${r.recoveredSigner} (from the embedded canonical bytes + signature)`);
  report.push(` claimed signer: ${r.claimedSigner} (the container's \`signer\` field)`);

  // Check 1 is unconditional.
  report.push(` [${badge(r.checks.signatureMatchesSigner)}] signature recovers to the claimed signer`);

  // Check 2 is reported only when a publisher was pinned; null means it was never requested.
  const pinRequested = r.checks.signerMatchesExpected !== null;
  report.push(
    pinRequested
      ? ` [${badge(r.checks.signerMatchesExpected)}] recovered signer matches the expected ` +
          `publisher (${r.expectedSigner})`
      : " [skip] expected-signer pin: not requested (pass --signer <addr> to pin the publisher)"
  );

  // Check 3 is reported only when a manifest was supplied; null means it was never requested.
  const bindingRequested = r.checks.manifestBindsAttestation !== null;
  report.push(
    bindingRequested
      ? ` [${badge(r.checks.manifestBindsAttestation)}] the signature binds YOUR manifest ` +
          "(its canonical bytes are byte-identical to the signed payload)"
      : " [skip] dataset binding: not requested (pass --manifest <m> to bind the signature to YOUR dataset)"
  );

  if (r.accepted) {
    report.push(" ACCEPTED: every requested check passed.");
    return report;
  }

  report.push(` REJECTED: failed check(s): ${r.failedChecks.join(", ")}.`);
  if (r.failedChecks.includes("manifestBindsAttestation")) {
    report.push(
      " binding-mismatch: the signed payload does NOT match YOUR manifest — the signature vouches for a",
      " DIFFERENT dataset identity than the one you hold."
    );
  }
  return report;
}
|
|
2011
|
+
|
|
2012
|
+
/**
 * Entry point for `vh dataset verify-attest <signed> [--manifest <m>] [--signer <addr>] [--json]`.
 *
 * Steps, in order:
 *   1. Load the signed container with the strict `readSignedAttestation` (a malformed container throws
 *      there, before any recovery is attempted).
 *   2. If a manifest path was given, load it with the strict `readManifest`.
 *   3. Run `verifySignedAttestation` on the loaded objects.
 *   4. Only when `opts.revocations` is truthy: hand the result to `coreTrustAsOf.loadAndApply`, which may
 *      replace it (per the surrounding design notes, a publisher key revoked before the as-of time turns
 *      an ACCEPTED result into a non-accepted one). Without the flag the result from step 3 is used as-is.
 *   5. Print either one JSON line (`--json`) or the human block from `formatVerifyAttest`, followed by the
 *      `coreTrustAsOf.renderTrustAsOf` lines when the result carries a `trustAsOf` section.
 *
 * The only file reads are the container, the optional manifest, and the optional revocations file.
 *
 * @param {object} opts
 * @param {string} opts.signed path to a signed-attestation container
 * @param {string} [opts.manifest] OPTIONAL path to the buyer's manifest (binds the signature to it)
 * @param {string} [opts.signer] OPTIONAL expected publisher 0x-address to pin
 * @param {string} [opts.revocations] OPTIONAL path to a revocations file; enables the as-of trust step
 * @param {string} [opts.asOf] OPTIONAL as-of instant, forwarded unchanged to the trust-as-of core
 * @param {string} [opts.nowISO] OPTIONAL "now" override (ISO string); defaults to the current time
 * @param {boolean} [opts.json] emit the machine-readable verdict instead of the human block
 * @param {(s:string)=>void} [opts.stdout] sink for stdout (default process.stdout.write); injectable for tests
 * @returns {object} the (possibly trust-as-of adjusted) verdict object
 */
function runDatasetVerifyAttest(opts) {
  if (!opts || typeof opts !== "object") throw new Error("runDatasetVerifyAttest requires options");

  const signedPath = opts.signed;
  const manifestPath = opts.manifest;
  const expectedSigner = opts.signer;
  const write = opts.stdout || ((s) => process.stdout.write(s));
  if (!signedPath) throw new Error("runDatasetVerifyAttest requires a <signed> path");

  const emitLine = (line) => write(line + "\n");

  // Strict reads first: a bad container or manifest fails here, never half-way through verification.
  const container = readSignedAttestation(signedPath);
  const manifestGiven = manifestPath !== undefined && manifestPath !== null;
  const manifest = manifestGiven ? readManifest(manifestPath) : undefined;

  let result = verifySignedAttestation({ container, expectedSigner, manifest });

  // Optional recipient-side trust-decision-as-of; skipped entirely unless --revocations was passed.
  let defaulted = false;
  if (opts.revocations) {
    ({ result, defaulted } = coreTrustAsOf.loadAndApply({
      result,
      revocationsPath: opts.revocations,
      asOf: opts.asOf,
      nowISO: opts.nowISO || new Date().toISOString(),
      readFile: (p) => fs.readFileSync(path.resolve(p), "utf8"),
    }));
  }

  if (opts.json) {
    write(JSON.stringify(result) + "\n");
    return result;
  }

  for (const line of formatVerifyAttest(result)) emitLine(line);
  if (result.trustAsOf) {
    for (const line of coreTrustAsOf.renderTrustAsOf(result.trustAsOf, { indent: " ", defaulted })) {
      emitLine(line);
    }
  }
  return result;
}
|
|
2079
|
+
|
|
2080
|
+
// =================================================================================================
|
|
2081
|
+
// DETACHED TIMESTAMP container (T-20.2, EPIC-20) — an INDEPENDENT RFC-3161 TSA timestamp WRAPPED AROUND
|
|
2082
|
+
// the canonical UNSIGNED dataset attestation, over the SAME generic timestamp core ProofParcel uses.
|
|
2083
|
+
//
|
|
2084
|
+
// WHY A SEPARATE KIND (the EPIC-17 move applied to the TIMESTAMP dimension)
|
|
2085
|
+
// The signed container proves "the publisher SAYS this dataset identity existed". The honestly-stronger
|
|
2086
|
+
// claim a due-diligence / EU-AI-Act reviewer wants is "an INDEPENDENT TSA saw this exact digest by time
|
|
2087
|
+
// T". This container delivers the FORMAT for that: it wraps (never edits) the EXACT canonical UNSIGNED
|
|
2088
|
+
// attestation bytes and attaches an RFC-3161 TimeStampToken bound to the SHA-256 digest OF those bytes.
|
|
2089
|
+
//
|
|
2090
|
+
// THE DIGEST IS SHA-256 — NOT the keccak256 manifestDigest. RFC-3161 TSAs stamp a messageImprint over a
|
|
2091
|
+
// STANDARD hash; SHA-256 is universal, keccak256 non-standard (most TSAs reject it). So the timestamp
|
|
2092
|
+
// digest is a FRESH sha256(utf8(canonical attestation string)) — the digest the buyer re-derives and the
|
|
2093
|
+
// human submits to their TSA — NOT the keccak `manifestDigest` that lives inside the payload.
|
|
2094
|
+
|
|
2095
|
+
// Format identity of the timestamped-attestation container: its `kind` tag, the current schema version,
// and the frozen list of schema versions treated as supported.
const TIMESTAMPED_ATTESTATION_KIND = "verifyhash.dataset-attestation-timestamped";
const TIMESTAMPED_ATTESTATION_SCHEMA_VERSION = 1;
const SUPPORTED_TIMESTAMPED_ATTESTATION_SCHEMA_VERSIONS = Object.freeze([1]);
|
|
2098
|
+
|
|
2099
|
+
// Trust caveat embedded in every timestamped container. It is the timestamp-specific wording below (one
// sentence per array entry, joined by single spaces) followed by the shared dataset TRUST_NOTE verbatim,
// so the dataset caveats cannot drift. The timestamp wording says what the token asserts (a TSA saw this
// digest by genTime) and what is NOT checked here (the TSA certificate chain / CMS signature).
const TIMESTAMPED_ATTESTATION_TRUST_NOTE =
  [
    "This is a TIMESTAMPED attestation container: it wraps (never edits) the EXACT canonical UNSIGNED " +
      "attestation bytes in `attestation` and attaches an RFC-3161 timestamp token over the SHA-256 " +
      "digest of those exact bytes.",
    "It asserts that an INDEPENDENT Time-Stamping Authority (TSA) saw THIS digest by the token's " +
      "genTime — to the strength of the TSA you TRUST.",
    "It does NOT validate the TSA's certificate chain or the token's CMS signature (verify those " +
      "out-of-band, e.g. `openssl ts -verify`, exactly as you pin a signer address).",
    "The digest is a STANDARD sha256(canonical attestation bytes) — NOT the project's internal " +
      "keccak256 manifestDigest.",
    "Every caveat of the embedded UNSIGNED payload still applies.",
  ].join(" ") +
  " " +
  TRUST_NOTE;
|
|
2113
|
+
|
|
2114
|
+
// Framing object handed to the generic timestamp core by every wrapper below. The core supplies the
// machinery; this frozen object supplies only the dataset-specific identity and codec.
const TIMESTAMPED_ATTESTATION_CFG = Object.freeze({
  // Container identity.
  kind: TIMESTAMPED_ATTESTATION_KIND,
  schemaVersion: TIMESTAMPED_ATTESTATION_SCHEMA_VERSION,
  supportedSchemaVersions: SUPPORTED_TIMESTAMPED_ATTESTATION_SCHEMA_VERSIONS,
  // In-band trust caveat and the human label.
  note: TIMESTAMPED_ATTESTATION_TRUST_NOTE,
  label: "timestamped dataset attestation",
  // Codec for the embedded UNSIGNED payload: the same validate/serialize pair used for plain attestations.
  validateUnsigned: validateAttestation,
  serializeUnsigned: serializeAttestation,
});
|
|
2127
|
+
|
|
2128
|
+
/**
 * Strict validation of an already-parsed timestamped-attestation container.
 *
 * Delegates to `coreTimestamp.validateTimestampContainer`, passing the dataset framing
 * (TIMESTAMPED_ATTESTATION_CFG). Acceptance or rejection, and the error text, are decided entirely by
 * the core; this wrapper adds nothing.
 *
 * @param {any} obj the candidate container
 * @returns {object} whatever the core validator returns (documented as the same object when valid)
 */
function validateTimestampedAttestation(obj) {
  const validated = coreTimestamp.validateTimestampContainer(obj, TIMESTAMPED_ATTESTATION_CFG);
  return validated;
}
|
|
2138
|
+
|
|
2139
|
+
/**
 * Build a timestamped-attestation container from an UNSIGNED attestation envelope plus an RFC-3161 token.
 *
 * Delegates to `coreTimestamp.buildTimestampContainer` with the dataset framing
 * (TIMESTAMPED_ATTESTATION_CFG). Per the core's documented contract, the result is validated and a token
 * that does not bind the re-derived SHA-256 digest is a hard error; that error propagates unchanged.
 *
 * @param {object} params { attestation, token }
 * @returns {object} the container produced by the core
 */
function buildTimestampedAttestation(params) {
  const builtContainer = coreTimestamp.buildTimestampContainer(params, TIMESTAMPED_ATTESTATION_CFG);
  return builtContainer;
}
|
|
2149
|
+
|
|
2150
|
+
/**
 * Produce the canonical serialization of a timestamped-attestation container.
 *
 * Delegates to `coreTimestamp.serializeTimestampContainer` with the dataset framing
 * (TIMESTAMPED_ATTESTATION_CFG); the exact byte layout is owned by the core.
 *
 * @param {object} container a validated timestamped-attestation container
 * @returns {string} the serialization returned by the core (documented as newline-terminated)
 */
function serializeTimestampedAttestation(container) {
  const serialized = coreTimestamp.serializeTimestampContainer(container, TIMESTAMPED_ATTESTATION_CFG);
  return serialized;
}
|
|
2158
|
+
|
|
2159
|
+
/**
 * Load a timestamped-attestation container from disk and strictly validate it.
 *
 * Delegates to `coreTimestamp.readTimestampContainer` with the dataset framing
 * (TIMESTAMPED_ATTESTATION_CFG). Reading, parsing, and rejection of malformed or foreign containers are
 * all performed by the core; its errors propagate unchanged.
 *
 * @param {string} containerPath path of the container file
 * @returns {object} the container returned by the core reader
 */
function readTimestampedAttestation(containerPath) {
  const loaded = coreTimestamp.readTimestampContainer(containerPath, TIMESTAMPED_ATTESTATION_CFG);
  return loaded;
}
|
|
2168
|
+
|
|
2169
|
+
// Caveat shown by `timestamp-request` (and stored in its descriptor). It explains what the emitted digest
// is for and what this tool does NOT do: it neither obtains the token nor validates the TSA certificate
// chain. One sentence per array entry, joined by single spaces.
const TIMESTAMP_REQUEST_TRUST_NOTE = [
  "This emits the SHA-256 digest of the canonical UNSIGNED attestation bytes — the EXACT digest you " +
    "submit to your RFC-3161 Time-Stamping Authority (TSA).",
  "A timestamp token will attest an INDEPENDENT TSA saw THIS digest by its genTime — to the strength " +
    "of the TSA you TRUST; this tool does NOT obtain the token (that is a human/network step) and does " +
    "NOT validate the TSA cert chain.",
  "The digest is a STANDARD SHA-256 (universal across TSAs) — NOT the project's internal keccak256 " +
    "manifestDigest.",
].join(" ");
|
|
2177
|
+
|
|
2178
|
+
/**
 * Produce the copy-pasteable instructions for turning a SHA-256 digest into an RFC-3161 token and then
 * wrapping that token: build the query with `openssl ts -query`, send it to a TSA, extract the token with
 * `openssl ts -reply`, and finish with `vh dataset timestamp-wrap`.
 *
 * The digest is interpolated as-is into the `openssl ts -query` line; no validation is performed here.
 *
 * @param {string} digestHex the lowercase SHA-256 digest (no 0x)
 * @returns {string[]} recipe lines (no trailing newlines)
 */
function timestampRequestRecipe(digestHex) {
  // Part 1: the human/network steps that yield token.der.
  const obtainToken = [
    " To obtain an RFC-3161 timestamp token over this digest (a HUMAN/network step):",
    ` openssl ts -query -digest ${digestHex} -sha256 -cert -out request.tsq`,
    " # send request.tsq to your TSA (e.g. `curl` to its HTTP endpoint) -> response.tsr",
    " openssl ts -reply -in response.tsr -token_out -out token.der",
  ];
  // Part 2: the offline step that folds the token back into a container.
  const wrapToken = [
    " Then wrap it back into a verifiable container (no key, no network):",
    " vh dataset timestamp-wrap <manifest> --token token.der --out attestation.timestamped.json",
  ];
  return obtainToken.concat(wrapToken);
}
|
|
2194
|
+
|
|
2195
|
+
/**
 * Entry point for `vh dataset timestamp-request <manifest> [--out <p>] [--json]`.
 *
 * Reads the manifest strictly, derives the canonical UNSIGNED attestation bytes through the same
 * `buildAttestation` + `serializeAttestation` path that `vh dataset attest` uses, and computes their
 * SHA-256 digest with `coreTimestamp.sha256Hex`. That digest is what a human submits to an RFC-3161 TSA.
 *
 * Output:
 *   - with `--out`, a small JSON request descriptor (kind, hashAlgorithm, digest, the canonical
 *     attestation string, and the trust note) is written to the resolved path; this is the only file write;
 *   - with `--json`, one JSON line is printed; otherwise a human block is printed: trust note, digest,
 *     the `timestampRequestRecipe` lines, and the descriptor path when one was written.
 *
 * No key and no network are used by this function.
 *
 * @param {object} opts
 * @param {string} opts.manifest path to a manifest written by `vh dataset build`
 * @param {boolean} [opts.json] emit a machine-readable { hashAlgorithm, digest, canonical, out, note } object
 * @param {string} [opts.out] write the request descriptor to this path (resolved to absolute)
 * @param {(s:string)=>void} [opts.stdout] sink for stdout (default process.stdout.write); injectable for tests
 * @returns {{ digest: string, hashAlgorithm: string, canonical: string, out: string|null }}
 */
function runDatasetTimestampRequest(opts) {
  if (!opts || typeof opts !== "object") throw new Error("runDatasetTimestampRequest requires options");

  const manifestPath = opts.manifest;
  const write = opts.stdout || ((s) => process.stdout.write(s));
  if (!manifestPath) throw new Error("runDatasetTimestampRequest requires a <manifest> path");

  // Same build path as `attest`, so these are exactly the bytes a buyer would re-derive.
  const loadedManifest = readManifest(manifestPath);
  const canonical = serializeAttestation(buildAttestation(loadedManifest));
  const digest = coreTimestamp.sha256Hex(canonical);

  // Optional descriptor file, written only at the caller-supplied location.
  const outAbs = opts.out ? path.resolve(opts.out) : null;
  if (outAbs !== null) {
    const descriptor = {
      kind: "verifyhash.timestamp-request",
      hashAlgorithm: "sha256",
      digest,
      attestation: canonical,
      note: TIMESTAMP_REQUEST_TRUST_NOTE,
    };
    fs.writeFileSync(outAbs, JSON.stringify(descriptor, null, 2) + "\n");
  }

  const summary = { digest, hashAlgorithm: "sha256", canonical, out: outAbs };

  if (opts.json) {
    const machineReadable = {
      hashAlgorithm: "sha256",
      digest,
      canonical,
      out: outAbs,
      note: TIMESTAMP_REQUEST_TRUST_NOTE,
    };
    write(JSON.stringify(machineReadable) + "\n");
    return summary;
  }

  write(` TRUST: ${TIMESTAMP_REQUEST_TRUST_NOTE}\n`);
  write("\n");
  write(` sha256 digest (the messageImprint to stamp): ${digest}\n`);
  write("\n");
  for (const recipeLine of timestampRequestRecipe(digest)) write(recipeLine + "\n");
  if (outAbs) write(` timestamp request written: ${outAbs}\n`);
  return summary;
}
|
|
2263
|
+
|
|
2264
|
+
// `timestamp-wrap` prints exactly the caveat that is embedded in the container it produces, so this is
// an alias of TIMESTAMPED_ATTESTATION_TRUST_NOTE rather than a second copy of the text.
const TIMESTAMP_WRAP_TRUST_NOTE = TIMESTAMPED_ATTESTATION_TRUST_NOTE;
|
|
2266
|
+
|
|
2267
|
+
/**
 * Resolve the `--token` argument into RFC-3161 token material for the timestamp builder.
 *
 * The argument is tried as a filesystem path FIRST (the common case: the `token.der` written by
 * `openssl ts -reply -token_out`):
 *   - if the path can be stat'ed and is not a directory, the file's raw bytes are returned as a Buffer;
 *   - if the path is a directory, an explicit error is thrown (instead of a raw EISDIR from the read);
 *   - if the path cannot be stat'ed at all (nothing there, name too long, ...), the argument is returned
 *     UNCHANGED as a string and treated as an inline base64/hex token. Decoding and rejection of a
 *     non-token string are left to the timestamp core that consumes the value.
 *
 * A single `statSync` replaces the former `existsSync` + `readFileSync` pair, so there is one fewer
 * check-then-use step between deciding "this is a file" and reading it.
 *
 * @param {string} tokenArg a path to a DER token file OR an inline base64/hex token string
 * @returns {Buffer|string} the file's raw bytes (path case) or `tokenArg` itself (inline case)
 * @throws {Error} when `tokenArg` is not a non-empty string, when it names a directory, or when the
 *   file exists but cannot be read (the underlying fs error propagates)
 */
function resolveTimestampToken(tokenArg) {
  if (typeof tokenArg !== "string" || tokenArg.length === 0) {
    throw new Error("--token requires a path to an RFC-3161 token file OR an inline base64 token");
  }

  let stats;
  try {
    stats = fs.statSync(tokenArg);
  } catch (err) {
    // Not a usable path (same outcome as the previous `existsSync() === false`): treat the argument
    // as an inline token string and let the timestamp core decode or reject it.
    return tokenArg;
  }

  if (stats.isDirectory()) {
    throw new Error(`--token path is a directory, not an RFC-3161 token file: ${tokenArg}`);
  }

  // Regular files and other readable non-directory paths (e.g. /dev/stdin) are read as raw DER bytes.
  return fs.readFileSync(tokenArg);
}
|
|
2286
|
+
|
|
2287
|
+
/**
 * Entry point for `vh dataset timestamp-wrap <manifest> --token <path|base64> [--out <p>] [--json]`.
 *
 * Steps, in order:
 *   1. Read the manifest strictly and build the UNSIGNED attestation with `buildAttestation` (the same
 *      path `attest` / `timestamp-request` use, so the stamped bytes match).
 *   2. Resolve `--token` with `resolveTimestampToken`.
 *   3. Build the container with `buildTimestampedAttestation`; per that builder's contract a token that
 *      does not bind the re-derived digest throws here, so no bad container is ever written.
 *   4. Serialize the container and read its facts with `coreTimestamp.readTimestampFacts`.
 *   5. With `--out`, write the canonical serialization to the resolved path (the only file write).
 *   6. Print one JSON line (`--json`) or the human block. In JSON mode `container` carries the canonical
 *      serialization only when nothing was written to disk. In human mode without `--out`, the canonical
 *      serialization itself is written to stdout after the summary lines.
 *
 * No key and no network are used by this function.
 *
 * @param {object} opts
 * @param {string} opts.manifest path to a manifest written by `vh dataset build`
 * @param {string} opts.token path to an RFC-3161 token file OR an inline base64 token (REQUIRED)
 * @param {boolean} [opts.json] emit a machine-readable { kind, digest, genTime, ..., container } object
 * @param {string} [opts.out] write the timestamped container to this path (resolved to absolute)
 * @param {(s:string)=>void} [opts.stdout] sink for stdout (default process.stdout.write); injectable for tests
 * @returns {{ container: object, canonical: string, digest: string, genTime: string, out: string|null }}
 */
function runDatasetTimestampWrap(opts) {
  if (!opts || typeof opts !== "object") throw new Error("runDatasetTimestampWrap requires options");

  const manifestPath = opts.manifest;
  const tokenArg = opts.token;
  const write = opts.stdout || ((s) => process.stdout.write(s));
  if (!manifestPath) throw new Error("runDatasetTimestampWrap requires a <manifest> path");
  if (!tokenArg) throw new Error("runDatasetTimestampWrap requires a --token <path|base64>");

  // Inputs: the UNSIGNED payload rebuilt from the manifest, and the human-obtained token.
  const unsigned = buildAttestation(readManifest(manifestPath));
  const token = resolveTimestampToken(tokenArg);

  // Build (and thereby validate) the container, then derive everything that gets reported.
  const container = buildTimestampedAttestation({ attestation: unsigned, token });
  const canonical = serializeTimestampedAttestation(container);
  const facts = coreTimestamp.readTimestampFacts(container);

  const outAbs = opts.out ? path.resolve(opts.out) : null;
  if (outAbs !== null) {
    fs.writeFileSync(outAbs, canonical);
  }

  const summary = { container, canonical, digest: facts.digest, genTime: facts.genTime, out: outAbs };

  if (opts.json) {
    const machineReadable = {
      kind: container.kind,
      scheme: container.timestamp.scheme,
      hashAlgorithm: container.timestamp.hashAlgorithm,
      digest: facts.digest,
      genTime: facts.genTime,
      serialNumber: facts.serialNumber,
      policyOID: facts.policyOID,
      out: outAbs,
      // Artifact parity with `attest --json`: inline the bytes unless they already went to disk.
      container: outAbs ? null : canonical,
      note: TIMESTAMP_WRAP_TRUST_NOTE,
    };
    write(JSON.stringify(machineReadable) + "\n");
    return summary;
  }

  write(` TRUST: ${TIMESTAMP_WRAP_TRUST_NOTE}\n`);
  write("\n");
  write(` timestamped: an INDEPENDENT TSA stamped this digest by genTime\n`);
  write(` digest (sha256 of the canonical attestation bytes): ${facts.digest}\n`);
  write(` genTime (asserted by the TSA): ${facts.genTime}\n`);
  write(` TSA serial: ${facts.serialNumber.hex}\n`);
  write(` policy OID: ${facts.policyOID}\n`);
  if (outAbs) {
    write(` timestamped dataset attestation written: ${outAbs}\n`);
  } else {
    // No --out: the artifact itself goes to stdout so it is never dropped.
    write(canonical);
  }
  return summary;
}
|
|
2361
|
+
|
|
2362
|
+
// =================================================================================================
|
|
2363
|
+
// `vh dataset verify-timestamp <container> [--manifest <m>] [--json]` — the OFFLINE independent-timestamp
|
|
2364
|
+
// verifier (T-20.3, EPIC-20). The read-only sibling of `verify-attest`, for the TIMESTAMP dimension.
|
|
2365
|
+
//
|
|
2366
|
+
// WHY THIS EXISTS
|
|
2367
|
+
// A buyer handed a `*-attestation-timestamped` container needs ONE command that answers, with no key and
|
|
2368
|
+
// no network: does an INDEPENDENT RFC-3161 TSA's token genuinely bind THIS dataset's identity, and by
|
|
2369
|
+
// WHAT genTime? It re-derives the canonical attestation bytes from the embedded UNSIGNED payload, confirms
|
|
2370
|
+
// `digest === sha256(those bytes)`, parses the token (T-20.1), and confirms its messageImprint BINDS that
|
|
2371
|
+
// digest — printing ACCEPTED with the asserted genTime (ISO UTC) / TSA serialNumber / policy OID, or
|
|
2372
|
+
// REJECTED naming which check failed. With `--manifest` it ALSO re-derives the canonical bytes from the
|
|
2373
|
+
// buyer's OWN manifest and requires a byte-identical match (binding the token to the buyer's data, exactly
|
|
2374
|
+
// like verify-attest's `--manifest`).
|
|
2375
|
+
//
|
|
2376
|
+
// PURELY OFFLINE: no tree walk, no provider, no key, no network. A tampered token / mismatched digest /
|
|
2377
|
+
// edited embedded attestation REJECTS (never a false ACCEPT) — the same strict validator the build/read
|
|
2378
|
+
// path uses decides the structure + binding, so a verify-timestamp ACCEPT can never disagree with what
|
|
2379
|
+
// timestamp-wrap would have produced.
|
|
2380
|
+
//
|
|
2381
|
+
// BOUNDED, HONEST CLAIM (carried verbatim into output, never overclaims). ACCEPTED means an RFC-3161 TSA
|
|
2382
|
+
// ASSERTED this exact dataset identity (digest) existed by <genTime>; this is as trustworthy as the TSA
|
|
2383
|
+
// whose certificate YOU trust. This command does NOT validate the TSA's certificate chain / the token's
|
|
2384
|
+
// CMS signature — use your platform's CMS verifier (`openssl ts -verify`) for full PKI validation. It
|
|
2385
|
+
// NEVER prints "unaltered since date T" without that qualification.
|
|
2386
|
+
//
|
|
2387
|
+
// EXIT CODES (the family's 0/3 convention, shared with `vh dataset verify`/`verify-attest`): 0 ACCEPTED, 3
|
|
2388
|
+
// REJECTED, 2 usage error, 1 runtime error (missing/corrupt container or manifest).
|
|
2389
|
+
|
|
2390
|
+
// Re-export of the core's verdict constant under this module's own name.
const { VERIFY_TIMESTAMP_VERDICT } = coreTimestamp;

// Caveat that `verify-timestamp` output leads with: the bounded statement of what ACCEPTED means (a TSA
// asserted this digest existed by genTime, to the strength of the TSA you trust) and what is NOT checked
// (the TSA certificate chain / CMS signature). One sentence per array entry, joined by single spaces, then
// the shared dataset TRUST_NOTE verbatim so the dataset caveats never drift.
const VERIFY_TIMESTAMP_TRUST_NOTE =
  [
    "ACCEPTED means an RFC-3161 Time-Stamping Authority (TSA) asserted this exact dataset identity " +
      "(the SHA-256 digest of the canonical attestation bytes) existed by the asserted genTime.",
    "This is as trustworthy as the TSA whose certificate YOU trust — this command does NOT validate " +
      "the TSA's certificate chain or the token's CMS signature (use your platform's CMS verifier, " +
      "e.g. `openssl ts -verify`, for full PKI validation).",
    "It NEVER claims \"unaltered since date T\" without that qualification.",
    "The digest is a STANDARD sha256(canonical attestation bytes) — NOT the project's internal " +
      "keccak256 manifestDigest.",
    "Every caveat of the embedded UNSIGNED payload still applies.",
  ].join(" ") +
  " " +
  TRUST_NOTE;
|
|
2405
|
+
|
|
2406
|
+
/**
 * Offline verification of a timestamped dataset-attestation container.
 *
 * This is a thin adapter: it delegates to `coreTimestamp.verifyTimestampContainer`, passing the
 * dataset-specific `TIMESTAMPED_ATTESTATION_CFG`. When a manifest is supplied, the canonical attestation
 * bytes are re-derived from it with `serializeAttestation(buildAttestation(manifest))` and handed to the
 * core verifier as `expectedManifestCanonical`; when no manifest is supplied that field is `undefined`.
 *
 * @param {object} params
 * @param {object} params.container  the parsed container object
 * @param {object} [params.manifest] optional manifest object; `undefined`/`null` means "no binding check"
 * @returns {object} whatever `coreTimestamp.verifyTimestampContainer` returns
 * @throws {Error} when `params` is missing or not an object
 */
function verifyTimestampedAttestation(params) {
  const looksLikeObject = Boolean(params) && typeof params === "object";
  if (!looksLikeObject) {
    throw new Error("verifyTimestampedAttestation requires { container, [manifest] }");
  }

  const container = params.container;
  const manifest = params.manifest;

  // `== null` matches exactly undefined and null: both mean the caller did not ask for a binding check.
  const expectedManifestCanonical =
    manifest == null ? undefined : serializeAttestation(buildAttestation(manifest));

  return coreTimestamp.verifyTimestampContainer(
    { container, expectedManifestCanonical },
    TIMESTAMPED_ATTESTATION_CFG
  );
}
|
|
2430
|
+
|
|
2431
|
+
/**
 * Turn a verify-timestamp result into the lines of the human-readable CLI block.
 *
 * Layout, in order: the bounded trust caveat (VERIFY_TIMESTAMP_TRUST_NOTE), a blank line, the verdict,
 * one PASS/FAIL line for the structure+binding check, one line for the manifest binding check (a
 * "[skip]" line when `r.checks.manifestBindsAttestation` is exactly `null`), and finally either the
 * ACCEPTED details (genTime, TSA serial number, policy OID, digest) or the REJECTED summary naming the
 * failed checks plus the optional reason.
 *
 * @param {object} r the object verifyTimestampedAttestation returns; the fields read here are
 *   `verdict`, `checks.structureAndBinding`, `checks.manifestBindsAttestation`, `accepted`, and then
 *   `genTime` / `serialNumber.{hex,decimal}` / `policyOID` / `digest` (accepted) or
 *   `failedChecks` / `reason` (rejected)
 * @returns {string[]} lines (without trailing newlines)
 */
function formatVerifyTimestamp(r) {
  const passOrFail = (ok) => (ok ? "PASS" : "FAIL");
  const out = [];

  // The bounded claim always comes first so the verdict is never read without its qualification.
  out.push(" TRUST: " + VERIFY_TIMESTAMP_TRUST_NOTE);
  out.push("");
  out.push(` verify-timestamp: ${r.verdict}`);

  // Always-performed check: container structure, digest and the token's messageImprint binding.
  out.push(
    ` [${passOrFail(r.checks.structureAndBinding)}] the token binds sha256(canonical attestation ` +
      "bytes) under RFC-3161 (structure + digest + messageImprint)"
  );

  // Optional check: `null` means no manifest was supplied, so the binding was not evaluated.
  const manifestBinding = r.checks.manifestBindsAttestation;
  if (manifestBinding === null) {
    out.push(
      " [skip] dataset binding: not requested (pass --manifest <m> to bind the timestamp to YOUR dataset)"
    );
  } else {
    out.push(
      ` [${passOrFail(manifestBinding)}] the timestamp binds YOUR manifest ` +
        "(its canonical bytes are byte-identical to the timestamped payload)"
    );
  }

  if (!r.accepted) {
    out.push(` REJECTED: failed check(s): ${r.failedChecks.join(", ")}.`);
    if (r.reason) {
      out.push(` reason: ${r.reason}`);
    }
    return out;
  }

  out.push(" ACCEPTED: an RFC-3161 TSA asserted this dataset identity existed by:");
  out.push(` genTime (ISO UTC): ${r.genTime}`);
  out.push(` TSA serialNumber: ${r.serialNumber.hex} (decimal ${r.serialNumber.decimal})`);
  out.push(` policy OID: ${r.policyOID}`);
  out.push(` digest (sha256): ${r.digest}`);
  return out;
}
|
|
2474
|
+
|
|
2475
|
+
/**
 * Entry point for `vh dataset verify-timestamp <container> [--manifest <m>] [--json]`.
 *
 * Steps:
 *   1. Read the container file and JSON-parse it here, at the I/O boundary. An unreadable file or
 *      non-JSON content throws (a runtime error for the caller to report).
 *   2. When a manifest path is given, load it through `readManifest`.
 *   3. Hand both to `verifyTimestampedAttestation`. The raw parsed JSON is passed on purpose (rather
 *      than going through a validating reader first) so that a parseable-but-tampered container is
 *      reported by the verifier as a verdict instead of surfacing as a thrown error at read time.
 *   4. Write either one JSON line (`opts.json`) or the human block from `formatVerifyTimestamp`.
 *
 * @param {object} opts
 * @param {string} opts.container path to a timestamped-attestation container
 * @param {string} [opts.manifest] optional path to the buyer's manifest
 * @param {boolean} [opts.json] emit the machine-readable verdict instead of the human block
 * @param {(s:string)=>void} [opts.stdout] output sink; defaults to `process.stdout.write`
 * @returns {object} the object verifyTimestampedAttestation returns
 * @throws {Error} on missing options/path, unreadable file, or invalid JSON
 */
function runDatasetVerifyTimestamp(opts) {
  if (!opts || typeof opts !== "object") {
    throw new Error("runDatasetVerifyTimestamp requires options");
  }
  const containerPath = opts.container;
  const manifestPath = opts.manifest;
  const emit = opts.stdout ? opts.stdout : (chunk) => process.stdout.write(chunk);
  if (!containerPath) {
    throw new Error("runDatasetVerifyTimestamp requires a <container> path");
  }

  // I/O boundary: the two failure modes get distinct messages (cannot read vs. not JSON).
  const loadContainerJson = () => {
    let text;
    try {
      text = fs.readFileSync(containerPath, "utf8");
    } catch (readErr) {
      throw new Error(`cannot read timestamped dataset attestation at ${containerPath}: ${readErr.message}`);
    }
    try {
      return JSON.parse(text);
    } catch (parseErr) {
      throw new Error(`timestamped dataset attestation at ${containerPath} is not valid JSON: ${parseErr.message}`);
    }
  };
  const container = loadContainerJson();

  // The manifest is optional; it is only loaded when a path was actually supplied.
  const manifest = manifestPath == null ? undefined : readManifest(manifestPath);

  const result = verifyTimestampedAttestation({ container, manifest });

  const outputLines = opts.json ? [JSON.stringify(result)] : formatVerifyTimestamp(result);
  for (const line of outputLines) {
    emit(line + "\n");
  }
  return result;
}
|
|
2532
|
+
|
|
2533
|
+
// =================================================================================================
|
|
2534
|
+
// `vh dataset check <manifest> --policy <p> [--json]` — deterministic, OFFLINE license/source policy gate.
|
|
2535
|
+
//
|
|
2536
|
+
// WHY THIS EXISTS
|
|
2537
|
+
// `vh dataset summary` rolls up what a dataset CLAIMS about its files' {source, license}. But a CI
|
|
2538
|
+
// pipeline (or a compliance reviewer) wants the next step: a PASS/FAIL GATE — "does this dataset's
|
|
2539
|
+
// self-asserted provenance satisfy MY policy?" (e.g. "no GPL in my proprietary product", "only files
|
|
2540
|
+
// from this allowed corpus", "every file MUST carry a license"). `vh dataset check` reads the manifest
|
|
2541
|
+
// via the SAME strict `readManifest` (a corrupt/foreign manifest is rejected, never half-accepted) and a
|
|
2542
|
+
// new strict, versioned POLICY file, then evaluates the manifest's TRUSTED file set against the policy
|
|
2543
|
+
// in a PURE, deterministic function (no I/O, no provider, no key, no network) and returns a verdict.
|
|
2544
|
+
//
|
|
2545
|
+
// TRUST POSTURE (carried verbatim — reuses TRUST_NOTE so caveats never drift)
|
|
2546
|
+
// The {source, license} hints are UNTRUSTED, self-asserted metadata NOT bound into the root. A PASS
|
|
2547
|
+
// means "the dataset's self-asserted hints satisfy this policy" — NOT "the licenses are genuinely
|
|
2548
|
+
// correct". A `(no license hint)` file ASSERTS NOTHING (`requireLicense` is the rule that flags it).
|
|
2549
|
+
// This NEVER implies it verified a license is real.
|
|
2550
|
+
//
|
|
2551
|
+
// MATCH SEMANTICS (documented so a verdict is reproducible)
|
|
2552
|
+
// A file's "license hint value" is its `hints.license` string, or the absence of one (no `hints` at
|
|
2553
|
+
// all, or `hints` with no `license`). Likewise for `hints.source`. All comparisons against the policy's
|
|
2554
|
+
// lists are CASE-SENSITIVE EXACT STRING MATCHES on the hint value ("GPL-3.0" matches only "GPL-3.0",
|
|
2555
|
+
// never "gpl-3.0" or "GPL-3.0-or-later"). The rules:
|
|
2556
|
+
// - allowLicenses : any file whose license hint is NOT in the allowlist VIOLATES (a file with no
|
|
2557
|
+
// license hint also violates — it is not in any allowlist).
|
|
2558
|
+
// - denyLicenses : any file whose license hint IS in the denylist VIOLATES (a file with no license
|
|
2559
|
+
// hint does NOT violate — there is no value on the denylist to match).
|
|
2560
|
+
// - allowSources / denySources : the same, on the source hint.
|
|
2561
|
+
// - requireLicense: true : every file MUST carry a license hint; a `(no license hint)` file VIOLATES.
|
|
2562
|
+
// A policy with NO rules is valid and trivially PASSes (with a clear "no rules" note).
|
|
2563
|
+
|
|
2564
|
+
// The `kind` tag a policy file must carry, and the schema version(s) this build accepts.
const POLICY_KIND = "verifyhash.dataset-policy";
const POLICY_SCHEMA_VERSION = 1;
const SUPPORTED_POLICY_SCHEMA_VERSIONS = Object.freeze([POLICY_SCHEMA_VERSION]);

// Stable rule identifiers. Each value is both the policy field name of the rule and the string a
// violation reports in its `rule` field, so a consumer can gate on these exact strings.
const POLICY_RULE = Object.freeze({
  ALLOW_LICENSES: "allowLicenses",
  DENY_LICENSES: "denyLicenses",
  ALLOW_SOURCES: "allowSources",
  DENY_SOURCES: "denySources",
  REQUIRE_LICENSE: "requireLicense",
});
|
|
2577
|
+
|
|
2578
|
+
// Placeholder `value` reported by a violation when the file carries NO license / source hint. These
// are the summary's bucket labels (NO_LICENSE_BUCKET / NO_SOURCE_BUCKET), not real hint strings, so a
// reader can tell "the file made no claim" apart from an actual hint value.
const NO_HINT_VALUE = Object.freeze({ license: NO_LICENSE_BUCKET, source: NO_SOURCE_BUCKET });

// The two possible outcomes of a policy check: PASS when no file violates any rule, FAIL when at
// least one file violates at least one rule.
const POLICY_VERDICT = Object.freeze({
  PASS: "PASS",
  FAIL: "FAIL",
});
|
|
2588
|
+
|
|
2589
|
+
/**
 * Strictly validate a parsed policy object. Throws an Error describing the FIRST problem found; never
 * mutates the input and never fills in defaults.
 *
 * Checks, in order:
 *   1. the value is a non-null, non-array object;
 *   2. `kind` equals POLICY_KIND;
 *   3. `schemaVersion` is one of SUPPORTED_POLICY_SCHEMA_VERSIONS;
 *   4. each of allowLicenses / denyLicenses / allowSources / denySources, WHEN PRESENT (not `undefined`),
 *      is an array whose every index holds a non-empty string;
 *   5. `requireLicense`, WHEN PRESENT, is a strict boolean.
 * Every rule field is optional, so a policy with no rules at all is valid.
 *
 * Offending values are rendered in JSON form in the error messages (the same way the kind and
 * schemaVersion messages render theirs), so an object shows as `{"a":1}` rather than `[object Object]`
 * and an empty string shows as `""` rather than nothing.
 *
 * @param {any} obj the parsed policy candidate
 * @returns {object} the same object, if valid
 * @throws {Error} describing the first schema deviation
 */
function validatePolicy(obj) {
  // JSON rendering for diagnostics. Falls back to String() for values JSON cannot represent
  // (undefined, functions, symbols) or cannot serialize (cycles, BigInt), so building the error
  // message never masks the real validation error with a serialization error.
  const describe = (value) => {
    try {
      const json = JSON.stringify(value);
      return json === undefined ? String(value) : json;
    } catch (jsonErr) {
      return String(value);
    }
  };

  if (obj == null || typeof obj !== "object" || Array.isArray(obj)) {
    throw new Error("dataset policy must be a JSON object");
  }
  if (obj.kind !== POLICY_KIND) {
    throw new Error(
      `not a verifyhash dataset policy (kind: ${JSON.stringify(obj.kind)}; expected ${JSON.stringify(
        POLICY_KIND
      )})`
    );
  }
  if (!SUPPORTED_POLICY_SCHEMA_VERSIONS.includes(obj.schemaVersion)) {
    throw new Error(
      `unsupported dataset policy schemaVersion: ${JSON.stringify(obj.schemaVersion)} ` +
        `(this build understands ${JSON.stringify(SUPPORTED_POLICY_SCHEMA_VERSIONS)})`
    );
  }

  // The four list rules. A malformed list is rejected outright rather than coerced, so it can never
  // half-evaluate into a surprise verdict.
  const listRules = [
    POLICY_RULE.ALLOW_LICENSES,
    POLICY_RULE.DENY_LICENSES,
    POLICY_RULE.ALLOW_SOURCES,
    POLICY_RULE.DENY_SOURCES,
  ];
  for (const f of listRules) {
    const list = obj[f];
    if (list === undefined) continue;
    if (!Array.isArray(list)) {
      throw new Error(`dataset policy ${f} must be an array of strings when present, got: ${describe(list)}`);
    }
    // Index loop on purpose: forEach skips holes, which would let a sparse array slip through with
    // entries that are not strings at all.
    for (let i = 0; i < list.length; i++) {
      const v = list[i];
      if (typeof v !== "string" || v.length === 0) {
        throw new Error(`dataset policy ${f}[${i}] must be a non-empty string, got: ${describe(v)}`);
      }
    }
  }

  // A truthy string/number must not silently enable the rule: only a real boolean is accepted.
  if (obj.requireLicense !== undefined && typeof obj.requireLicense !== "boolean") {
    throw new Error(
      `dataset policy requireLicense must be a boolean when present, got: ${describe(obj.requireLicense)}`
    );
  }
  return obj;
}
|
|
2643
|
+
|
|
2644
|
+
/**
 * Load a policy file: read it as UTF-8, JSON-parse it, then run it through `validatePolicy`.
 *
 * Each stage fails with its own message — a missing/unreadable file, content that is not JSON, or
 * (from `validatePolicy`) any schema deviation — so a malformed or foreign policy is rejected rather
 * than partially accepted.
 *
 * @param {string} policyPath path of the policy file; must be a non-empty string
 * @returns {object} the validated policy object (the value `validatePolicy` returns)
 * @throws {Error} on a bad path argument, unreadable file, invalid JSON, or invalid policy
 */
function readPolicy(policyPath) {
  const pathIsUsable = typeof policyPath === "string" && policyPath.length > 0;
  if (!pathIsUsable) {
    throw new Error("readPolicy requires a policy file path");
  }

  let text;
  try {
    text = fs.readFileSync(policyPath, "utf8");
  } catch (readErr) {
    throw new Error(`cannot read dataset policy at ${policyPath}: ${readErr.message}`);
  }

  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch (parseErr) {
    throw new Error(`dataset policy at ${policyPath} is not valid JSON: ${parseErr.message}`);
  }

  return validatePolicy(parsed);
}
|
|
2669
|
+
|
|
2670
|
+
/**
 * Number of rules a policy actually imposes, used for the verdict's `rulesEvaluated` field.
 *
 * A list rule (allow/deny licenses/sources) is counted only when it is an array with at least one
 * entry — an empty list constrains nothing. `requireLicense` is counted only when it is exactly `true`.
 *
 * @param {object} policy a validated policy object
 * @returns {number} between 0 and 5
 */
function _countPolicyRules(policy) {
  const listRuleNames = [
    POLICY_RULE.ALLOW_LICENSES,
    POLICY_RULE.DENY_LICENSES,
    POLICY_RULE.ALLOW_SOURCES,
    POLICY_RULE.DENY_SOURCES,
  ];
  const activeListRules = listRuleNames.filter(
    (name) => Array.isArray(policy[name]) && policy[name].length > 0
  ).length;
  const licenseRequired = policy.requireLicense === true ? 1 : 0;
  return activeListRules + licenseRequired;
}
|
|
2690
|
+
|
|
2691
|
+
/**
 * Pure policy evaluation: compare every file of a manifest against a policy and return the verdict.
 * No I/O is performed here.
 *
 * Per file, the license / source hint is `hints.license` / `hints.source` when that is a string, and
 * "absent" otherwise. Membership tests are exact, case-sensitive string matches (Set lookups):
 *   - requireLicense (when `true`): an absent license hint is a violation.
 *   - allowLicenses / allowSources (when non-empty): a hint that is absent or not listed is a violation.
 *   - denyLicenses / denySources (when non-empty): a present hint that is listed is a violation; an
 *     absent hint is not.
 * An empty or missing list is treated as "rule not set". One file can contribute several violations.
 * For an absent hint the reported `value` is the NO_HINT_VALUE label, never a made-up hint string.
 *
 * Violations are sorted by relPath, then by rule name, so the same inputs always give the same output.
 *
 * @param {object} manifest a validated manifest object; only `manifest.files` is read
 * @param {object} policy   a validated policy object
 * @returns {{
 *   verdict: "PASS"|"FAIL",
 *   fileCount: number,
 *   rulesEvaluated: number,
 *   violations: { relPath: string, rule: string, value: string }[],
 * }}
 */
function evaluatePolicy(manifest, policy) {
  // A list rule is active only when it is a non-empty array; otherwise it is `null` (not set).
  const activeSet = (list) => (Array.isArray(list) && list.length > 0 ? new Set(list) : null);
  const allowLicenses = activeSet(policy.allowLicenses);
  const denyLicenses = activeSet(policy.denyLicenses);
  const allowSources = activeSet(policy.allowSources);
  const denySources = activeSet(policy.denySources);
  const requireLicense = policy.requireLicense === true;

  // A hint counts as present only when it is a string; anything else is treated as absent (null).
  const hintOf = (file, key) =>
    file.hints && typeof file.hints[key] === "string" ? file.hints[key] : null;

  const violations = [];
  for (const file of manifest.files) {
    const license = hintOf(file, "license");
    const source = hintOf(file, "source");
    const flag = (rule, value) => {
      violations.push({ relPath: file.relPath, rule, value });
    };

    // requireLicense is the one rule whose sole purpose is to flag a missing license hint.
    if (requireLicense && license === null) {
      flag(POLICY_RULE.REQUIRE_LICENSE, NO_HINT_VALUE.license);
    }
    // Allowlist: a missing hint is, by definition, not on the list.
    if (allowLicenses !== null && (license === null || !allowLicenses.has(license))) {
      flag(POLICY_RULE.ALLOW_LICENSES, license === null ? NO_HINT_VALUE.license : license);
    }
    // Denylist: only a present hint can match an entry.
    if (denyLicenses !== null && license !== null && denyLicenses.has(license)) {
      flag(POLICY_RULE.DENY_LICENSES, license);
    }
    if (allowSources !== null && (source === null || !allowSources.has(source))) {
      flag(POLICY_RULE.ALLOW_SOURCES, source === null ? NO_HINT_VALUE.source : source);
    }
    if (denySources !== null && source !== null && denySources.has(source)) {
      flag(POLICY_RULE.DENY_SOURCES, source);
    }
  }

  // Deterministic output order: relPath first, rule name as the tie-breaker.
  violations.sort((left, right) => {
    if (left.relPath !== right.relPath) {
      return left.relPath < right.relPath ? -1 : 1;
    }
    if (left.rule < right.rule) return -1;
    if (left.rule > right.rule) return 1;
    return 0;
  });

  const verdict = violations.length === 0 ? POLICY_VERDICT.PASS : POLICY_VERDICT.FAIL;
  return {
    verdict,
    fileCount: manifest.files.length,
    rulesEvaluated: _countPolicyRules(policy),
    violations,
  };
}
|
|
2785
|
+
|
|
2786
|
+
/**
 * Turn a policy-check result into the lines of the human-readable CLI block.
 *
 * The block always opens with the trust caveat (which embeds the shared TRUST_NOTE), followed by the
 * verdict, the file count and the number of rules evaluated. After that header exactly one of three
 * endings is appended:
 *   - `rulesEvaluated === 0`: a note that a rule-less policy passes trivially (checked first);
 *   - verdict PASS: a single confirmation line;
 *   - otherwise: a FAIL summary followed by one line per violation (file, rule, offending value).
 *
 * @param {object} r the object evaluatePolicy returns
 * @returns {string[]} lines (without trailing newlines)
 */
function formatDatasetCheck(r) {
  // Caveat first: a PASS speaks only about self-asserted hints, never about verified licenses.
  const header = [
    " TRUST: the {source, license} hints checked here are UNTRUSTED, self-asserted metadata. " +
      TRUST_NOTE,
    " A PASS means the dataset's SELF-ASSERTED hints satisfy this policy — NOT that the licenses",
    " are genuinely correct. \"(no license hint)\" asserts NOTHING (requireLicense flags it). This",
    " does NOT verify any license/source is real.",
    "",
    ` policy check: ${r.verdict}`,
    ` files: ${r.fileCount}`,
    ` rules evaluated: ${r.rulesEvaluated}`,
  ];

  if (r.rulesEvaluated === 0) {
    return header.concat([
      " NOTE: this policy declares NO rules, so it trivially PASSes — every dataset satisfies a policy",
      " with no constraints. Add allowLicenses/denyLicenses/allowSources/denySources/requireLicense.",
    ]);
  }

  if (r.verdict === POLICY_VERDICT.PASS) {
    return header.concat([" PASS: no file's self-asserted hints violate any rule in this policy."]);
  }

  const total = r.violations.length;
  const pluralSuffix = total === 1 ? "" : "s";
  const summary =
    ` FAIL: ${total} violation${pluralSuffix} ` +
    "(each line: the file, the rule it broke, and the offending hint value):";
  const detail = [...r.violations].map((v) => ` ${v.relPath} [${v.rule}] value: ${v.value}`);
  return header.concat([summary], detail);
}
|
|
2827
|
+
|
|
2828
|
+
/**
 * Entry point for `vh dataset check <manifest> --policy <p> [--json]`.
 *
 * Loads the manifest with `readManifest` and then the policy with `readPolicy` (either loader throwing
 * aborts the run before any evaluation), evaluates them with `evaluatePolicy`, and writes the verdict
 * either as one JSON line (`opts.json`) or as the human block from `formatDatasetCheck`.
 *
 * @param {object} opts
 * @param {string} opts.manifest path to the manifest file
 * @param {string} opts.policy   path to the policy file
 * @param {boolean} [opts.json]  emit the machine-readable object instead of the human block
 * @param {(s:string)=>void} [opts.stdout] output sink; defaults to `process.stdout.write`
 * @returns {{ verdict: "PASS"|"FAIL", fileCount: number, rulesEvaluated: number, violations: object[] }}
 *   the object `evaluatePolicy` returns
 * @throws {Error} when options or either path are missing, or when a loader rejects its file
 */
function runDatasetCheck(opts) {
  if (!opts || typeof opts !== "object") {
    throw new Error("runDatasetCheck requires options");
  }
  const manifestPath = opts.manifest;
  const policyPath = opts.policy;
  const emit = opts.stdout ? opts.stdout : (chunk) => process.stdout.write(chunk);
  if (!manifestPath) {
    throw new Error("runDatasetCheck requires a <manifest> path");
  }
  if (!policyPath) {
    throw new Error("runDatasetCheck requires a --policy <p> path");
  }

  // Both inputs go through their loaders up front; nothing is evaluated if either one is rejected.
  const manifest = readManifest(manifestPath);
  const policy = readPolicy(policyPath);

  // All verdict logic lives in the I/O-free evaluator.
  const result = evaluatePolicy(manifest, policy);

  const outputLines = opts.json ? [JSON.stringify(result)] : formatDatasetCheck(result);
  for (const line of outputLines) {
    emit(line + "\n");
  }
  return result;
}
|
|
2864
|
+
|
|
2865
|
+
module.exports = {
|
|
2866
|
+
MANIFEST_KIND,
|
|
2867
|
+
MANIFEST_SCHEMA_VERSION,
|
|
2868
|
+
SUPPORTED_MANIFEST_SCHEMA_VERSIONS,
|
|
2869
|
+
POLICY_KIND,
|
|
2870
|
+
POLICY_SCHEMA_VERSION,
|
|
2871
|
+
SUPPORTED_POLICY_SCHEMA_VERSIONS,
|
|
2872
|
+
POLICY_RULE,
|
|
2873
|
+
POLICY_VERDICT,
|
|
2874
|
+
NO_HINT_VALUE,
|
|
2875
|
+
validatePolicy,
|
|
2876
|
+
readPolicy,
|
|
2877
|
+
evaluatePolicy,
|
|
2878
|
+
formatDatasetCheck,
|
|
2879
|
+
runDatasetCheck,
|
|
2880
|
+
ATTESTATION_KIND,
|
|
2881
|
+
ATTESTATION_SCHEMA_VERSION,
|
|
2882
|
+
SUPPORTED_ATTESTATION_SCHEMA_VERSIONS,
|
|
2883
|
+
ATTESTATION_TRUST_NOTE,
|
|
2884
|
+
canonicalManifestFiles,
|
|
2885
|
+
manifestDigest,
|
|
2886
|
+
buildAttestation,
|
|
2887
|
+
validateAttestation,
|
|
2888
|
+
serializeAttestation,
|
|
2889
|
+
readAttestation,
|
|
2890
|
+
SIGNED_ATTESTATION_KIND,
|
|
2891
|
+
SIGNED_ATTESTATION_SCHEMA_VERSION,
|
|
2892
|
+
SUPPORTED_SIGNED_ATTESTATION_SCHEMA_VERSIONS,
|
|
2893
|
+
SIGNED_ATTESTATION_SCHEMES,
|
|
2894
|
+
SIGNED_ATTESTATION_TRUST_NOTE,
|
|
2895
|
+
buildSignedAttestation,
|
|
2896
|
+
validateSignedAttestation,
|
|
2897
|
+
serializeSignedAttestation,
|
|
2898
|
+
readSignedAttestation,
|
|
2899
|
+
runDatasetAttest,
|
|
2900
|
+
SIGN_TRUST_NOTE,
|
|
2901
|
+
runDatasetSign,
|
|
2902
|
+
VERIFY_ATTEST_VERDICT,
|
|
2903
|
+
VERIFY_ATTEST_TRUST_NOTE,
|
|
2904
|
+
recoverSignedAttestationSigner,
|
|
2905
|
+
verifySignedAttestation,
|
|
2906
|
+
formatVerifyAttest,
|
|
2907
|
+
runDatasetVerifyAttest,
|
|
2908
|
+
// timestamp (T-20.2) — detached RFC-3161 container over the SAME generic timestamp core.
|
|
2909
|
+
TIMESTAMPED_ATTESTATION_KIND,
|
|
2910
|
+
TIMESTAMPED_ATTESTATION_SCHEMA_VERSION,
|
|
2911
|
+
SUPPORTED_TIMESTAMPED_ATTESTATION_SCHEMA_VERSIONS,
|
|
2912
|
+
TIMESTAMPED_ATTESTATION_TRUST_NOTE,
|
|
2913
|
+
TIMESTAMP_REQUEST_TRUST_NOTE,
|
|
2914
|
+
validateTimestampedAttestation,
|
|
2915
|
+
buildTimestampedAttestation,
|
|
2916
|
+
serializeTimestampedAttestation,
|
|
2917
|
+
readTimestampedAttestation,
|
|
2918
|
+
runDatasetTimestampRequest,
|
|
2919
|
+
runDatasetTimestampWrap,
|
|
2920
|
+
// verify-timestamp (T-20.3) — OFFLINE independent-timestamp verifier over the SAME generic core.
|
|
2921
|
+
VERIFY_TIMESTAMP_VERDICT,
|
|
2922
|
+
VERIFY_TIMESTAMP_TRUST_NOTE,
|
|
2923
|
+
verifyTimestampedAttestation,
|
|
2924
|
+
formatVerifyTimestamp,
|
|
2925
|
+
runDatasetVerifyTimestamp,
|
|
2926
|
+
TRUST_NOTE,
|
|
2927
|
+
MEMBERSHIP_TRUST_NOTE,
|
|
2928
|
+
NO_LICENSE_BUCKET,
|
|
2929
|
+
NO_SOURCE_BUCKET,
|
|
2930
|
+
VERIFY_STATUS,
|
|
2931
|
+
MEMBERSHIP_STATUS,
|
|
2932
|
+
buildManifest,
|
|
2933
|
+
validateManifest,
|
|
2934
|
+
readManifest,
|
|
2935
|
+
writeManifest,
|
|
2936
|
+
runDatasetBuild,
|
|
2937
|
+
runDatasetVerify,
|
|
2938
|
+
formatDatasetVerify,
|
|
2939
|
+
runDatasetDiff,
|
|
2940
|
+
formatDatasetDiff,
|
|
2941
|
+
runDatasetSummary,
|
|
2942
|
+
formatDatasetSummary,
|
|
2943
|
+
aggregateManifest,
|
|
2944
|
+
buildDatasetReport,
|
|
2945
|
+
formatDatasetReportMarkdown,
|
|
2946
|
+
runDatasetReport,
|
|
2947
|
+
buildDatasetProof,
|
|
2948
|
+
runDatasetProve,
|
|
2949
|
+
runDatasetVerifyProof,
|
|
2950
|
+
};
|