audrey 0.23.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +81 -19
- package/LICENSE +21 -21
- package/README.md +209 -5
- package/SECURITY.md +2 -1
- package/benchmarks/adapter-kit.mjs +20 -0
- package/benchmarks/adapter-self-test.mjs +166 -0
- package/benchmarks/adapters/example-allow.mjs +28 -0
- package/benchmarks/adapters/mem0-platform.mjs +267 -0
- package/benchmarks/adapters/registry.json +51 -0
- package/benchmarks/adapters/zep-cloud.mjs +280 -0
- package/benchmarks/baselines.js +169 -0
- package/benchmarks/build-leaderboard.mjs +170 -0
- package/benchmarks/cases.js +537 -0
- package/benchmarks/create-conformance-card.mjs +139 -0
- package/benchmarks/create-submission-bundle.mjs +176 -0
- package/benchmarks/dry-run-external-adapters.mjs +165 -0
- package/benchmarks/guardbench.js +1035 -0
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/guardbench-manifest.json +414 -0
- package/benchmarks/output/guardbench-raw.json +1171 -0
- package/benchmarks/output/guardbench-summary.json +1981 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
- package/benchmarks/output/submission-bundle/guardbench-raw.json +1171 -0
- package/benchmarks/output/submission-bundle/guardbench-summary.json +1981 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +164 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +228 -0
- package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/benchmarks/output/summary.json +2354 -0
- package/benchmarks/perf-snapshot.js +304 -0
- package/benchmarks/perf.bench.js +161 -0
- package/benchmarks/public-paths.mjs +78 -0
- package/benchmarks/reference-results.js +70 -0
- package/benchmarks/report.js +259 -0
- package/benchmarks/run-external-guardbench.mjs +281 -0
- package/benchmarks/run.js +682 -0
- package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/schemas/guardbench-raw.schema.json +164 -0
- package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/schemas/guardbench-summary.schema.json +228 -0
- package/benchmarks/snapshots/perf-0.22.2.json +123 -0
- package/benchmarks/snapshots/perf-0.23.0.json +123 -0
- package/benchmarks/validate-adapter-module.mjs +104 -0
- package/benchmarks/validate-adapter-registry.mjs +134 -0
- package/benchmarks/validate-adapter-self-test.mjs +96 -0
- package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
- package/benchmarks/verify-external-evidence.mjs +296 -0
- package/benchmarks/verify-publication-artifacts.mjs +286 -0
- package/benchmarks/verify-submission-bundle.mjs +167 -0
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +1 -1
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +65 -3
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +675 -157
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts +9 -0
- package/dist/src/action-key.d.ts.map +1 -0
- package/dist/src/action-key.js +49 -0
- package/dist/src/action-key.js.map +1 -0
- package/dist/src/adaptive.js +5 -5
- package/dist/src/affect.js +8 -8
- package/dist/src/audrey.d.ts +3 -0
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +55 -3
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.js +4 -4
- package/dist/src/causal.js +3 -3
- package/dist/src/consolidate.js +48 -48
- package/dist/src/controller.d.ts +61 -5
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +230 -49
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.js +172 -172
- package/dist/src/decay.js +8 -8
- package/dist/src/embedding.d.ts +2 -1
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +39 -29
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.js +6 -6
- package/dist/src/feedback.d.ts +6 -0
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +6 -0
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.js +12 -12
- package/dist/src/hybrid-recall.js +9 -9
- package/dist/src/impact.js +6 -6
- package/dist/src/import.d.ts +3 -3
- package/dist/src/import.js +41 -41
- package/dist/src/index.d.ts +3 -3
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +2 -2
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.js +14 -14
- package/dist/src/introspect.js +18 -18
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +41 -0
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/promote.js +7 -7
- package/dist/src/prompts.js +118 -118
- package/dist/src/recall.js +30 -30
- package/dist/src/reflexes.d.ts +1 -0
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +3 -0
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.js +4 -4
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +67 -1
- package/dist/src/routes.js.map +1 -1
- package/dist/src/validate.js +25 -25
- package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/MEMORY_BENCHMARKING.md +59 -0
- package/docs/PRODUCTION_BACKLOG.md +304 -0
- package/docs/paper/00-master.md +48 -0
- package/docs/paper/01-introduction.md +27 -0
- package/docs/paper/02-related-work.md +47 -0
- package/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/04-design.md +164 -0
- package/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/06-implementation.md +113 -0
- package/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/claim-register.json +138 -0
- package/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/LICENSE +21 -0
- package/docs/paper/output/submission-bundle/README.md +533 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1171 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +1981 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +164 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +228 -0
- package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
- package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
- package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
- package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
- package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
- package/docs/paper/output/submission-bundle/package.json +212 -0
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
- package/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/publication-pack.json +81 -0
- package/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/references.bib +222 -0
- package/package.json +87 -4
- package/scripts/audit-release-completion.mjs +362 -0
- package/scripts/create-arxiv-source.mjs +362 -0
- package/scripts/create-paper-submission-bundle.mjs +210 -0
- package/scripts/finalize-release.mjs +526 -0
- package/scripts/prepare-release-cut.mjs +269 -0
- package/scripts/publish-release-bundle.mjs +209 -0
- package/scripts/publish-release-github-api.mjs +429 -0
- package/scripts/run-vitest.mjs +34 -0
- package/scripts/smoke-cli.js +72 -0
- package/scripts/sync-paper-artifacts.mjs +109 -0
- package/scripts/verify-arxiv-compile.mjs +440 -0
- package/scripts/verify-arxiv-source.mjs +194 -0
- package/scripts/verify-browser-launch-plan.mjs +237 -0
- package/scripts/verify-browser-launch-results.mjs +285 -0
- package/scripts/verify-paper-artifacts.mjs +338 -0
- package/scripts/verify-paper-claims.mjs +226 -0
- package/scripts/verify-paper-submission-bundle.mjs +207 -0
- package/scripts/verify-publication-pack.mjs +196 -0
- package/scripts/verify-python-package.py +201 -0
- package/scripts/verify-release-readiness.mjs +741 -0
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
2
|
+
import { resolve } from 'node:path';
|
|
3
|
+
import { verifyPaperClaims } from './verify-paper-claims.mjs';
|
|
4
|
+
import { verifyPaperSubmissionBundle } from './verify-paper-submission-bundle.mjs';
|
|
5
|
+
import { verifyPublicationPack } from './verify-publication-pack.mjs';
|
|
6
|
+
import { verifyBrowserLaunchPlan } from './verify-browser-launch-plan.mjs';
|
|
7
|
+
import { verifyBrowserLaunchResults } from './verify-browser-launch-results.mjs';
|
|
8
|
+
import { verifyArxivSourcePackage } from './verify-arxiv-source.mjs';
|
|
9
|
+
import { verifyArxivCompileReport } from './verify-arxiv-compile.mjs';
|
|
10
|
+
|
|
11
|
+
// Base directory for every relative path in this script: the process's current working directory.
const ROOT = process.cwd();
|
|
12
|
+
// Fixed placeholder credential string.
// NOTE(review): not referenced in the part of the script visible here — presumably used later to
// check that published artifacts do not contain the seeded secret; confirm against the rest of the file.
const SEEDED_SECRET = 'sk-guardbench-secret-0000000000000000000000000000';
|
|
13
|
+
|
|
14
|
+
/**
 * Loads a file as UTF-8 text.
 *
 * @param {string} path - File path, resolved against ROOT.
 * @returns {string} The file contents.
 * @throws {Error} When no file exists at the resolved location.
 */
function readText(path) {
  const target = resolve(ROOT, path);
  if (existsSync(target)) {
    return readFileSync(target, 'utf-8');
  }
  throw new Error(`Missing required file: ${path}`);
}
|
|
19
|
+
|
|
20
|
+
/**
 * Reads a file via readText and parses it as JSON.
 *
 * @param {string} path - File path, passed straight to readText.
 * @returns {*} The parsed JSON value.
 * @throws {Error} Propagated from readText when the file is missing.
 * @throws {SyntaxError} When the contents are not valid JSON; the message names the
 *   offending file and the original parser error is attached as `cause`.
 */
function readJson(path) {
  const text = readText(path);
  try {
    return JSON.parse(text);
  } catch (error) {
    // A bare JSON.parse error does not say which of the many artifacts is malformed.
    throw new SyntaxError(`Invalid JSON in ${path}: ${error.message}`, { cause: error });
  }
}
|
|
23
|
+
|
|
24
|
+
/**
 * Soft assertion: records a failure message instead of throwing.
 *
 * @param {*} condition - Treated as a boolean; truthy means the check passed.
 * @param {string} message - Text appended to `failures` when the check fails.
 * @param {string[]} failures - Accumulator that is mutated in place.
 */
function assert(condition, message, failures) {
  if (condition) return;
  failures.push(message);
}
|
|
27
|
+
|
|
28
|
+
/**
 * Renders a metric value as text using JavaScript's default string conversion.
 *
 * @param {*} value - Value to render.
 * @returns {string} `String(value)`.
 */
function formatMetric(value) {
  const rendered = String(value);
  return rendered;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Counts the lines in the ledger text that begin with an evidence-row prefix,
 * i.e. `| E<digits> - ` at the very start of the line.
 *
 * @param {string} ledger - Full ledger text; LF and CRLF line endings are both accepted.
 * @returns {number} Number of matching lines.
 */
function countEvidenceRows(ledger) {
  const rowPrefix = /^\| E\d+ - /;
  let rows = 0;
  for (const line of ledger.split(/\r?\n/)) {
    if (rowPrefix.test(line)) rows += 1;
  }
  return rows;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Counts occurrences of `@<word>` followed by optional whitespace and an opening brace.
 *
 * @param {string} bib - Bibliography text.
 * @returns {number} Number of matches (0 when there are none).
 */
function countBibEntries(bib) {
  const entryHeads = bib.match(/@\w+\s*\{/g) ?? [];
  return entryHeads.length;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Checks that every needle occurs verbatim (case-sensitive) in `text`.
 * Each needle is checked through `assert`, with a message of the form
 * "<label> is missing: <needle>".
 *
 * @param {string} text - Text to search.
 * @param {Iterable<string>} needles - Substrings that must all be present.
 * @param {string} label - Name of the document, used in messages.
 * @param {string[]} failures - Accumulator handed to `assert`.
 */
function ensureContainsAll(text, needles, label, failures) {
  Array.from(needles).forEach((needle) => {
    const present = text.includes(needle);
    assert(present, `${label} is missing: ${needle}`, failures);
  });
}
|
|
45
|
+
|
|
46
|
+
/**
 * Prose-tolerant variant of the containment check: whitespace runs in `text` are
 * collapsed to a single space and both sides are lower-cased before comparing.
 * Needles are lower-cased but not whitespace-normalized.
 *
 * @param {string} text - Text to search.
 * @param {Iterable<string>} needles - Phrases that must all be present.
 * @param {string} label - Name of the document, used in messages.
 * @param {string[]} failures - Accumulator handed to `assert`.
 */
function ensureContainsAllProse(text, needles, label, failures) {
  const collapsed = text.replace(/\s+/g, ' ');
  const haystack = collapsed.toLowerCase();
  Array.from(needles).forEach((needle) => {
    const wanted = needle.toLowerCase();
    assert(haystack.includes(wanted), `${label} is missing: ${needle}`, failures);
  });
}
|
|
52
|
+
|
|
53
|
+
/**
 * Names the JSON type of a value, distinguishing arrays and null from plain objects.
 *
 * @param {*} value - Any value.
 * @returns {string} 'array', 'null', or the result of `typeof value`.
 */
function typeOf(value) {
  if (Array.isArray(value)) return 'array';
  if (value === null) return 'null';
  return typeof value;
}

/**
 * Validates `value` against a small subset of JSON Schema and returns the violations.
 *
 * Supported keywords: $ref (local pointers only), anyOf, const, enum, type (a single
 * name or an array of names), minLength, pattern, minimum, maximum, minItems, items,
 * contains, required, additionalProperties: false, and properties.
 *
 * Notes on behavior:
 * - A schema with $ref or anyOf is evaluated through that keyword only; sibling
 *   keywords on the same schema node are ignored.
 * - const and enum use strict equality, so they only work for primitive values.
 * - Array keywords apply only when the schema declares type "array", and object
 *   keywords only when it declares type "object".
 * - After a type mismatch no further keywords are checked for that node.
 *
 * @param {*} value - Parsed JSON value to check.
 * @param {object} schema - Schema node to validate against.
 * @param {string} label - Path prefix used in error messages.
 * @param {object} [root=schema] - Document used to resolve $ref pointers.
 * @returns {string[]} Human-readable violations; empty when the value conforms.
 */
function validateSchema(value, schema, label, root = schema) {
  const errors = [];

  // Resolves a local JSON Pointer reference against the root schema.
  function resolveRef(ref) {
    if (ref === '#') return root; // pointer to the whole document
    return ref
      .replace(/^#\//, '')
      .split('/')
      // RFC 6901 escapes: "~1" encodes "/", "~0" encodes "~" (decode in that order).
      .map(token => token.replace(/~1/g, '/').replace(/~0/g, '~'))
      .reduce((node, key) => node?.[key], root);
  }

  // True when `current` satisfies one JSON Schema type name.
  function matchesType(current, type) {
    if (type === 'integer') return typeof current === 'number' && Number.isInteger(current);
    return typeOf(current) === type;
  }

  function validate(current, currentSchema, path) {
    if (currentSchema.$ref) {
      const resolved = resolveRef(currentSchema.$ref);
      if (!resolved) {
        errors.push(`${path}: unresolved schema ref ${currentSchema.$ref}`);
        return;
      }
      validate(current, resolved, path);
      return;
    }

    if (currentSchema.anyOf) {
      // Run each option, then pull its errors back out so that only the
      // summary message below can reach the caller.
      const nested = currentSchema.anyOf.map(option => {
        const before = errors.length;
        validate(current, option, path);
        return errors.splice(before);
      });
      const passed = nested.some(group => group.length === 0);
      if (!passed) errors.push(`${path}: did not match any allowed schema`);
      return;
    }

    if (currentSchema.const !== undefined && current !== currentSchema.const) {
      errors.push(`${path}: expected constant ${currentSchema.const}`);
    }
    if (currentSchema.enum && !currentSchema.enum.includes(current)) {
      errors.push(`${path}: expected one of ${currentSchema.enum.join(', ')}`);
    }

    // `type` may be a single name or a list of alternatives.
    const declaredTypes = currentSchema.type ? [].concat(currentSchema.type) : [];
    if (declaredTypes.length > 0 && !declaredTypes.some(type => matchesType(current, type))) {
      errors.push(`${path}: expected ${declaredTypes.join(' or ')}, got ${typeOf(current)}`);
      return;
    }

    // String and number keywords constrain only values of the matching kind.
    if (currentSchema.minLength != null && typeof current === 'string' && current.length < currentSchema.minLength) {
      errors.push(`${path}: shorter than minLength ${currentSchema.minLength}`);
    }
    if (currentSchema.pattern && typeof current === 'string' && !(new RegExp(currentSchema.pattern).test(current))) {
      errors.push(`${path}: does not match ${currentSchema.pattern}`);
    }
    if (currentSchema.minimum != null && typeof current === 'number' && current < currentSchema.minimum) {
      errors.push(`${path}: below minimum ${currentSchema.minimum}`);
    }
    if (currentSchema.maximum != null && typeof current === 'number' && current > currentSchema.maximum) {
      errors.push(`${path}: above maximum ${currentSchema.maximum}`);
    }

    if (declaredTypes.includes('array') && Array.isArray(current)) {
      if (currentSchema.minItems != null && current.length < currentSchema.minItems) {
        errors.push(`${path}: expected at least ${currentSchema.minItems} items`);
      }
      if (currentSchema.items) {
        current.forEach((item, index) => validate(item, currentSchema.items, `${path}[${index}]`));
      }
      if (currentSchema.contains) {
        // A fresh validateSchema call keeps per-item errors out of this run's list.
        const matched = current.some(item => validateSchema(item, currentSchema.contains, `${path}.contains`, root).length === 0);
        if (!matched) errors.push(`${path}: no item matched contains constraint`);
      }
    }

    if (declaredTypes.includes('object') && typeOf(current) === 'object') {
      for (const required of currentSchema.required ?? []) {
        if (!Object.hasOwn(current, required)) errors.push(`${path}: missing required property ${required}`);
      }
      if (currentSchema.additionalProperties === false) {
        for (const key of Object.keys(current)) {
          if (!Object.hasOwn(currentSchema.properties ?? {}, key)) {
            errors.push(`${path}: unexpected property ${key}`);
          }
        }
      }
      for (const [key, propertySchema] of Object.entries(currentSchema.properties ?? {})) {
        if (Object.hasOwn(current, key)) validate(current[key], propertySchema, `${path}.${key}`);
      }
    }
  }

  validate(value, schema, label);
  return errors;
}
|
|
149
|
+
|
|
150
|
+
const failures = [];
|
|
151
|
+
|
|
152
|
+
const summary = readJson('benchmarks/output/summary.json');
|
|
153
|
+
const guardSummary = readJson('benchmarks/output/guardbench-summary.json');
|
|
154
|
+
const guardManifest = readJson('benchmarks/output/guardbench-manifest.json');
|
|
155
|
+
const guardRaw = readJson('benchmarks/output/guardbench-raw.json');
|
|
156
|
+
const guardAdapterSelfTest = readJson('benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json');
|
|
157
|
+
const guardAdapterRegistry = readJson('benchmarks/adapters/registry.json');
|
|
158
|
+
const guardExternalDryRun = readJson('benchmarks/output/external/guardbench-external-dry-run.json');
|
|
159
|
+
const guardExternalEvidence = readJson('benchmarks/output/external/guardbench-external-evidence.json');
|
|
160
|
+
const guardManifestSchema = readJson('benchmarks/schemas/guardbench-manifest.schema.json');
|
|
161
|
+
const guardSummarySchema = readJson('benchmarks/schemas/guardbench-summary.schema.json');
|
|
162
|
+
const guardRawSchema = readJson('benchmarks/schemas/guardbench-raw.schema.json');
|
|
163
|
+
const guardAdapterSelfTestSchema = readJson('benchmarks/schemas/guardbench-adapter-self-test.schema.json');
|
|
164
|
+
const guardAdapterRegistrySchema = readJson('benchmarks/schemas/guardbench-adapter-registry.schema.json');
|
|
165
|
+
const guardExternalDryRunSchema = readJson('benchmarks/schemas/guardbench-external-dry-run.schema.json');
|
|
166
|
+
const guardExternalEvidenceSchema = readJson('benchmarks/schemas/guardbench-external-evidence.schema.json');
|
|
167
|
+
const guardPublicationVerificationSchema = readJson('benchmarks/schemas/guardbench-publication-verification.schema.json');
|
|
168
|
+
const packageJsonText = readText('package.json');
|
|
169
|
+
const readme = readText('README.md');
|
|
170
|
+
const evaluation = readText('docs/paper/07-evaluation.md');
|
|
171
|
+
const paper = readText('docs/paper/audrey-paper-v1.md');
|
|
172
|
+
const ledger = readText('docs/paper/evidence-ledger.md');
|
|
173
|
+
const submission = readText('docs/paper/SUBMISSION_README.md');
|
|
174
|
+
const references = readText('docs/paper/references.bib');
|
|
175
|
+
const browserPlan = readText('docs/paper/browser-launch-plan.json');
|
|
176
|
+
const browserLaunchResultsVerifier = readText('scripts/verify-browser-launch-results.mjs');
|
|
177
|
+
const claimReport = await verifyPaperClaims();
|
|
178
|
+
const publicationPackReport = await verifyPublicationPack();
|
|
179
|
+
const arxivSourceReport = verifyArxivSourcePackage();
|
|
180
|
+
const arxivCompileReport = verifyArxivCompileReport({ allowPending: true });
|
|
181
|
+
const browserLaunchReport = await verifyBrowserLaunchPlan();
|
|
182
|
+
const browserLaunchResultsReport = await verifyBrowserLaunchResults();
|
|
183
|
+
const paperBundleReport = verifyPaperSubmissionBundle();
|
|
184
|
+
|
|
185
|
+
// Index the rows of summary.local.overall by their `system` field so a row can be looked up by name.
const local = Object.fromEntries(summary.local.overall.map(row => [row.system, row]));
|
|
186
|
+
const evidenceRows = countEvidenceRows(ledger);
|
|
187
|
+
const bibEntries = countBibEntries(references);
|
|
188
|
+
|
|
189
|
+
assert(evidenceRows >= 97, `Expected at least 97 evidence ledger rows, found ${evidenceRows}`, failures);
|
|
190
|
+
assert(submission.includes(`Evidence ledger with ${evidenceRows} rows`), 'SUBMISSION_README ledger row count is stale', failures);
|
|
191
|
+
assert(bibEntries === 21, `Expected 21 bibliography entries, found ${bibEntries}`, failures);
|
|
192
|
+
assert(submission.includes(`Primary-source bibliography with ${bibEntries} entries`), 'SUBMISSION_README bibliography count is stale', failures);
|
|
193
|
+
|
|
194
|
+
ensureContainsAll(ledger, ['| E46 -', '| E47 -', '| E48 -', '| E49 -', '| E50 -', '| E51 -', '| E52 -', '| E53 -', '| E54 -', '| E55 -', '| E56 -', '| E57 -', '| E58 -', '| E59 -', '| E60 -', '| E61 -', '| E62 -', '| E63 -', '| E64 -', '| E65 -', '| E66 -', '| E67 -', '| E68 -', '| E69 -', '| E70 -', '| E71 -', '| E72 -', '| E73 -', '| E74 -', '| E75 -', '| E76 -', '| E77 -', '| E78 -', '| E79 -', '| E80 -', '| E81 -', '| E82 -', '| E83 -', '| E84 -', '| E85 -', '| E86 -', '| E87 -', '| E88 -', '| E89 -', '| E90 -', '| E91 -', '| E92 -', '| E93 -', '| E94 -', '| E95 -', '| E96 -', '| E97 -'], 'evidence-ledger.md', failures);
|
|
195
|
+
ensureContainsAll(submission, ['Ledger: E46-E51', 'artifact redaction sweep', 'local absolute-path sweep', 'public-paths.mjs', 'adapter-kit.mjs', 'registry.json', 'claim-register.json', 'publication-pack.json', 'reservedUrlChars', 'arxiv-source.schema.json', 'arxiv-compile-report.schema.json', 'arxiv-compile-report.json', 'docs/paper/output/arxiv', 'paper:arxiv', 'paper:arxiv:verify', 'paper:arxiv:compile', 'paper:arxiv:compile:strict', 'browser-launch-plan.json', 'browser-launch-plan.schema.json', 'browser-launch-results.json', 'browser-launch-results.schema.json', 'artifactUrl', 'x-counting-characters', 'paper-submission-bundle.schema.json', 'docs/paper/output/submission-bundle', 'paper:bundle', 'paper:bundle:verify', 'paper:launch-plan', 'paper:launch-results', 'paper:launch-results:strict', 'release:cut:plan', 'release:cut:apply', 'release:readiness', 'release:readiness:strict', 'python:release:check', 'Python package release verifier', 'npm audit --omit=dev --audit-level=moderate', 'bench:guard:adapter-registry:validate', 'bench:guard:adapter-module:validate', 'bench:guard:adapter-self-test', 'bench:guard:adapter-self-test:validate', 'bench:guard:publication:verify', 'bench:guard:external:dry-run', 'bench:guard:external:evidence', 'bench:guard:external:evidence:strict', 'paper:claims', 'paper:publication-pack', 'guardbench-adapter-self-test.schema.json', 'guardbench-adapter-registry.schema.json', 'guardbench-external-dry-run.schema.json', 'guardbench-external-evidence.schema.json', 'guardbench-publication-verification.schema.json', 'zep-cloud.mjs', 'bench:guard:zep', 'ZEP_API_KEY'], 'SUBMISSION_README.md', failures);
|
|
196
|
+
ensureContainsAllProse(submission, ['source-control release-state check', 'live remote-head verification', 'git ls-remote', 'npm registry/auth readiness', 'npm whoami', 'audrey@1.0.0', 'PyPI publish readiness'], 'SUBMISSION_README.md', failures);
|
|
197
|
+
ensureContainsAll(packageJsonText, ['"scripts/*.py"', '"python:release:check"', '"paper:arxiv:compile"', '"paper:arxiv:compile:strict"'], 'package.json', failures);
|
|
198
|
+
if (!claimReport.ok) {
|
|
199
|
+
failures.push(...claimReport.failures.map(failure => `Paper claim verification failed: ${failure}`));
|
|
200
|
+
}
|
|
201
|
+
if (!publicationPackReport.ok) {
|
|
202
|
+
failures.push(...publicationPackReport.failures.map(failure => `Publication pack verification failed: ${failure}`));
|
|
203
|
+
}
|
|
204
|
+
if (!arxivSourceReport.ok) {
|
|
205
|
+
failures.push(...arxivSourceReport.failures.map(failure => `arXiv source package verification failed: ${failure}`));
|
|
206
|
+
}
|
|
207
|
+
if (!arxivCompileReport.ok) {
|
|
208
|
+
failures.push(...arxivCompileReport.failures.map(failure => `arXiv compile report verification failed: ${failure}`));
|
|
209
|
+
}
|
|
210
|
+
if (!browserLaunchReport.ok) {
|
|
211
|
+
failures.push(...browserLaunchReport.failures.map(failure => `Browser launch plan verification failed: ${failure}`));
|
|
212
|
+
}
|
|
213
|
+
if (!browserLaunchResultsReport.ok) {
|
|
214
|
+
failures.push(...browserLaunchResultsReport.failures.map(failure => `Browser launch results verification failed: ${failure}`));
|
|
215
|
+
}
|
|
216
|
+
if (!paperBundleReport.ok) {
|
|
217
|
+
failures.push(...paperBundleReport.failures.map(failure => `Paper submission bundle verification failed: ${failure}`));
|
|
218
|
+
}
|
|
219
|
+
if (arxivCompileReport.status === 'passed') {
|
|
220
|
+
assert(paperBundleReport.files.includes('docs/paper/output/arxiv-compile/main.pdf'), 'Paper submission bundle missing compiled arXiv PDF', failures);
|
|
221
|
+
assert(paperBundleReport.files.includes('docs/paper/output/arxiv-compile/arxiv-compile.log'), 'Paper submission bundle missing arXiv compile log', failures);
|
|
222
|
+
}
|
|
223
|
+
const firstXPost = publicationPackReport.entries.find(entry => entry.id === 'x-post-1');
|
|
224
|
+
assert(firstXPost?.requiresArtifactUrl === true, 'x-post-1 must require an artifact URL', failures);
|
|
225
|
+
assert(firstXPost?.reservedUrlChars >= 24, 'x-post-1 must reserve at least 24 characters for an X URL plus separator', failures);
|
|
226
|
+
assert(firstXPost?.effectiveChars <= 280, 'x-post-1 text plus URL reserve must fit within 280 characters', failures);
|
|
227
|
+
ensureContainsAll(browserPlan, ['x-counting-characters', 'https://docs.x.com/fundamentals/counting-characters', 'reservedUrlChars'], 'browser-launch-plan.json', failures);
|
|
228
|
+
ensureContainsAll(browserLaunchResultsVerifier, ['submitted artifact-url target must record artifactUrl'], 'verify-browser-launch-results.mjs', failures);
|
|
229
|
+
|
|
230
|
+
const manifestSchemaErrors = validateSchema(guardManifest, guardManifestSchema, 'guardbench-manifest');
|
|
231
|
+
for (const error of manifestSchemaErrors) failures.push(`GuardBench manifest schema violation: ${error}`);
|
|
232
|
+
const summarySchemaErrors = validateSchema(guardSummary, guardSummarySchema, 'guardbench-summary');
|
|
233
|
+
for (const error of summarySchemaErrors) failures.push(`GuardBench summary schema violation: ${error}`);
|
|
234
|
+
const rawSchemaErrors = validateSchema(guardRaw, guardRawSchema, 'guardbench-raw');
|
|
235
|
+
for (const error of rawSchemaErrors) failures.push(`GuardBench raw schema violation: ${error}`);
|
|
236
|
+
const adapterSelfTestSchemaErrors = validateSchema(guardAdapterSelfTest, guardAdapterSelfTestSchema, 'guardbench-adapter-self-test');
|
|
237
|
+
for (const error of adapterSelfTestSchemaErrors) failures.push(`GuardBench adapter self-test schema violation: ${error}`);
|
|
238
|
+
const adapterRegistrySchemaErrors = validateSchema(guardAdapterRegistry, guardAdapterRegistrySchema, 'guardbench-adapter-registry');
|
|
239
|
+
for (const error of adapterRegistrySchemaErrors) failures.push(`GuardBench adapter registry schema violation: ${error}`);
|
|
240
|
+
const externalDryRunSchemaErrors = validateSchema(guardExternalDryRun, guardExternalDryRunSchema, 'guardbench-external-dry-run');
|
|
241
|
+
for (const error of externalDryRunSchemaErrors) failures.push(`GuardBench external dry-run schema violation: ${error}`);
|
|
242
|
+
const externalEvidenceSchemaErrors = validateSchema(guardExternalEvidence, guardExternalEvidenceSchema, 'guardbench-external-evidence');
|
|
243
|
+
for (const error of externalEvidenceSchemaErrors) failures.push(`GuardBench external evidence schema violation: ${error}`);
|
|
244
|
+
const registryIds = guardAdapterRegistry.adapters.map(adapter => adapter.id);
|
|
245
|
+
assert(registryIds.includes('mem0-platform'), 'GuardBench adapter registry missing mem0-platform', failures);
|
|
246
|
+
assert(registryIds.includes('zep-cloud'), 'GuardBench adapter registry missing zep-cloud', failures);
|
|
247
|
+
const dryRunIds = guardExternalDryRun.adapters.map(adapter => adapter.id);
|
|
248
|
+
assert(dryRunIds.includes('mem0-platform'), 'GuardBench external dry-run matrix missing mem0-platform', failures);
|
|
249
|
+
assert(dryRunIds.includes('zep-cloud'), 'GuardBench external dry-run matrix missing zep-cloud', failures);
|
|
250
|
+
assert(guardExternalDryRun.adapters.every(adapter => !JSON.stringify(adapter).includes('runtime-key')), 'GuardBench external dry-run matrix contains a test secret', failures);
|
|
251
|
+
const evidenceIds = guardExternalEvidence.adapters.map(adapter => adapter.id);
|
|
252
|
+
assert(guardExternalEvidence.allowPending === true, 'GuardBench external evidence report should allow pending live runs in the release gate', failures);
|
|
253
|
+
assert(evidenceIds.includes('mem0-platform'), 'GuardBench external evidence report missing mem0-platform', failures);
|
|
254
|
+
assert(evidenceIds.includes('zep-cloud'), 'GuardBench external evidence report missing zep-cloud', failures);
|
|
255
|
+
assert(guardExternalEvidence.adapters.every(adapter => ['pending', 'verified'].includes(adapter.status)), 'GuardBench external evidence report has an invalid adapter status', failures);
|
|
256
|
+
assert(guardExternalEvidence.adapters.every(adapter => !JSON.stringify(adapter).includes('runtime-key')), 'GuardBench external evidence report contains a test secret', failures);
|
|
257
|
+
const zepAdapter = guardAdapterRegistry.adapters.find(adapter => adapter.id === 'zep-cloud');
|
|
258
|
+
assert(zepAdapter?.credentialMode === 'runtime-env', 'Zep adapter must require runtime environment credentials', failures);
|
|
259
|
+
assert(zepAdapter?.requiredEnv?.includes('ZEP_API_KEY'), 'Zep adapter registry entry missing ZEP_API_KEY', failures);
|
|
260
|
+
assert(zepAdapter?.commands?.externalRun === 'npm run bench:guard:zep', 'Zep adapter external-run command is stale', failures);
|
|
261
|
+
const publicationVerificationFixture = {
|
|
262
|
+
schemaVersion: '1.0.0',
|
|
263
|
+
suite: 'GuardBench publication artifact verification',
|
|
264
|
+
generatedAt: '2026-05-13T00:00:00.000Z',
|
|
265
|
+
ok: true,
|
|
266
|
+
checks: {
|
|
267
|
+
registry: { ok: true, failures: [] },
|
|
268
|
+
adapterModule: { ok: true, failures: [] },
|
|
269
|
+
selfTest: { ok: true, failures: [] },
|
|
270
|
+
artifacts: { ok: true, failures: [] },
|
|
271
|
+
bundle: { ok: true, failures: [] },
|
|
272
|
+
externalDryRun: { ok: true, failures: [] },
|
|
273
|
+
externalEvidence: { ok: true, failures: [] },
|
|
274
|
+
leaderboard: { ok: true, failures: [] },
|
|
275
|
+
localPaths: { ok: true, failures: [] },
|
|
276
|
+
},
|
|
277
|
+
failures: [],
|
|
278
|
+
};
|
|
279
|
+
const publicationVerificationSchemaErrors = validateSchema(
|
|
280
|
+
publicationVerificationFixture,
|
|
281
|
+
guardPublicationVerificationSchema,
|
|
282
|
+
'guardbench-publication-verification',
|
|
283
|
+
);
|
|
284
|
+
for (const error of publicationVerificationSchemaErrors) failures.push(`GuardBench publication verifier schema violation: ${error}`);
|
|
285
|
+
|
|
286
|
+
const benchmarkNeedles = [
|
|
287
|
+
summary.generatedAt,
|
|
288
|
+
`| Audrey | ${local.Audrey.scorePercent} | ${local.Audrey.passRate} | ${formatMetric(local.Audrey.avgDurationMs)} |`,
|
|
289
|
+
`| Vector Only | ${local['Vector Only'].scorePercent} | ${local['Vector Only'].passRate} | ${formatMetric(local['Vector Only'].avgDurationMs)} |`,
|
|
290
|
+
`| Keyword + Recency | ${local['Keyword + Recency'].scorePercent} | ${local['Keyword + Recency'].passRate} | ${formatMetric(local['Keyword + Recency'].avgDurationMs)} |`,
|
|
291
|
+
];
|
|
292
|
+
ensureContainsAll(evaluation, benchmarkNeedles, '07-evaluation.md', failures);
|
|
293
|
+
ensureContainsAll(paper, benchmarkNeedles, 'audrey-paper-v1.md', failures);
|
|
294
|
+
|
|
295
|
+
const latency = guardSummary.latency;
|
|
296
|
+
const guardLatencyText = `${formatMetric(latency.p50Ms)} ms / ${formatMetric(latency.p95Ms)} ms`;
|
|
297
|
+
ensureContainsAll(evaluation, [guardLatencyText, '| Published artifact raw-secret leaks | 0 |'], '07-evaluation.md', failures);
|
|
298
|
+
ensureContainsAll(paper, [guardLatencyText, '| Published artifact raw-secret leaks | 0 |'], 'audrey-paper-v1.md', failures);
|
|
299
|
+
ensureContainsAll(readme, [`${formatMetric(latency.p50Ms)}ms / ${formatMetric(latency.p95Ms)}ms`, '0 published artifact leaks'], 'README.md', failures);
|
|
300
|
+
ensureContainsAll(readme, ['bench:guard:zep', 'bench:guard:external:dry-run', 'bench:guard:external:evidence', 'bench:guard:external:evidence:strict', 'paper:arxiv:compile', 'paper:arxiv:compile:strict', 'paper:launch-results', 'paper:launch-results:strict', 'release:cut:plan', 'release:cut:apply', 'release:readiness', 'release:readiness:strict', 'python:release:check', 'absolute-path sweep', 'X URL reserve', 'submitted artifact-url targets', 'external dry-run matrix', 'external evidence verification', 'ZEP_API_KEY', 'ZEP_GUARDBENCH_INGEST_DELAY_MS'], 'README.md', failures);
|
|
301
|
+
ensureContainsAllProse(readme, ['source-control state', 'live remote-head verification', 'npm registry/auth readiness', 'PyPI publish readiness'], 'README.md', failures);
|
|
302
|
+
ensureContainsAll(paper, ['Zep Cloud', 'ZEP_API_KEY', 'Mem0 and Zep adapters', 'external dry-run matrix', 'external evidence verification', 'reserved URL budget', 'submitted artifact-url targets', 'arXiv compile report', 'release-readiness verifier', 'release-cut planner', 'Python package verifier'], 'audrey-paper-v1.md', failures);
|
|
303
|
+
ensureContainsAllProse(paper, ['source-control release-state check', 'live remote-head verification', 'npm registry/auth readiness', 'npm whoami', 'audrey@1.0.0', 'PyPI publish readiness'], 'audrey-paper-v1.md', failures);
|
|
304
|
+
ensureContainsAll(ledger, [`${formatMetric(latency.p50Ms)}ms/${formatMetric(latency.p95Ms)}ms`, 'zero published artifact raw-secret leaks'], 'evidence-ledger.md', failures);
|
|
305
|
+
|
|
306
|
+
assert(guardSummary.passed === 10, `GuardBench expected 10 passed scenarios, got ${guardSummary.passed}`, failures);
|
|
307
|
+
assert(guardSummary.scenarios === 10, `GuardBench expected 10 scenarios, got ${guardSummary.scenarios}`, failures);
|
|
308
|
+
assert(guardSummary.redactionLeaks === 0, `GuardBench decision-output leaks expected 0, got ${guardSummary.redactionLeaks}`, failures);
|
|
309
|
+
assert(guardSummary.artifactRedactionSweep?.passed === true, 'GuardBench artifactRedactionSweep did not pass', failures);
|
|
310
|
+
assert(guardSummary.artifactRedactionSweep?.leakCount === 0, `GuardBench artifact leak count expected 0, got ${guardSummary.artifactRedactionSweep?.leakCount}`, failures);
|
|
311
|
+
assert(guardRaw.artifactRedactionSweep?.passed === true, 'Raw GuardBench artifactRedactionSweep did not pass', failures);
|
|
312
|
+
|
|
313
|
+
const manifestText = JSON.stringify(guardManifest);
|
|
314
|
+
const summaryText = JSON.stringify(guardSummary);
|
|
315
|
+
const rawText = JSON.stringify(guardRaw);
|
|
316
|
+
assert(!manifestText.includes(SEEDED_SECRET), 'GuardBench manifest contains the raw seeded secret', failures);
|
|
317
|
+
assert(!summaryText.includes(SEEDED_SECRET), 'GuardBench summary contains the raw seeded secret', failures);
|
|
318
|
+
assert(!rawText.includes(SEEDED_SECRET), 'GuardBench raw output contains the raw seeded secret', failures);
|
|
319
|
+
assert(manifestText.includes('seededSecretRefs'), 'GuardBench manifest missing seededSecretRefs', failures);
|
|
320
|
+
assert(!manifestText.includes('"seededSecrets"'), 'GuardBench manifest still publishes seededSecrets', failures);
|
|
321
|
+
|
|
322
|
+
if (failures.length) {
|
|
323
|
+
console.error('Paper artifact verification failed:');
|
|
324
|
+
for (const failure of failures) console.error(`- ${failure}`);
|
|
325
|
+
process.exit(1);
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
console.log('Paper artifact verification passed.');
|
|
329
|
+
console.log(`Evidence rows: ${evidenceRows}`);
|
|
330
|
+
console.log(`Bibliography entries: ${bibEntries}`);
|
|
331
|
+
console.log(`Paper claims: ${claimReport.claims.length}`);
|
|
332
|
+
console.log(`Publication pack entries: ${publicationPackReport.entries.length}`);
|
|
333
|
+
console.log(`arXiv source files: ${arxivSourceReport.files.length}, citations ${arxivSourceReport.citationCount}`);
|
|
334
|
+
console.log(`arXiv compile status: ${arxivCompileReport.status}`);
|
|
335
|
+
console.log(`Browser launch targets: ${browserLaunchReport.targets.length}`);
|
|
336
|
+
console.log(`Browser launch results: ${browserLaunchResultsReport.targets.length} targets, ready=${browserLaunchResultsReport.ready}`);
|
|
337
|
+
console.log(`Paper bundle files: ${paperBundleReport.files.length}`);
|
|
338
|
+
console.log(`GuardBench: ${guardSummary.passed}/${guardSummary.scenarios}, latency ${latency.p50Ms}ms/${latency.p95Ms}ms, artifact leaks ${guardSummary.artifactRedactionSweep.leakCount}`);
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
2
|
+
import { dirname, resolve } from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
import { validateSchema } from '../benchmarks/validate-guardbench-artifacts.mjs';
|
|
5
|
+
import { verifyGuardBenchPublicationArtifacts } from '../benchmarks/verify-publication-artifacts.mjs';
|
|
6
|
+
|
|
7
|
+
// Repository root: the parent of the directory that contains this script.
const ROOT = dirname(dirname(fileURLToPath(import.meta.url)));

// Default, repository-relative locations of the claim register and its schema.
const DEFAULT_REGISTER = 'docs/paper/claim-register.json';
const DEFAULT_SCHEMA = 'docs/paper/claim-register.schema.json';

// Literal that the published GuardBench artifacts are scanned for; it must
// never appear in them verbatim.
const SEEDED_SECRET = 'sk-guardbench-secret-0000000000000000000000000000';
|
|
11
|
+
|
|
12
|
+
/**
 * Resolve a path against the repository root.
 *
 * @param {string} path - Path to resolve; joined onto ROOT via `resolve`.
 * @returns {string} The resolved absolute path.
 */
function fromRoot(path) {
  const absolutePath = resolve(ROOT, path);
  return absolutePath;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Read a repository file as UTF-8 text.
 *
 * @param {string} path - Path passed through `fromRoot` before reading.
 * @returns {string} The file contents.
 * @throws {Error} `Missing required file: <path>` when the file does not exist.
 */
function readText(path) {
  const location = fromRoot(path);
  if (existsSync(location)) {
    return readFileSync(location, 'utf-8');
  }
  // The message carries the caller-supplied path, not the resolved one.
  throw new Error(`Missing required file: ${path}`);
}
|
|
21
|
+
|
|
22
|
+
/**
 * Read a repository file and parse it as JSON.
 *
 * @param {string} path - Path handed to `readText`.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} When the file content is not valid JSON.
 */
function readJson(path) {
  const raw = readText(path);
  return JSON.parse(raw);
}
|
|
25
|
+
|
|
26
|
+
/**
 * Parse the command-line options of the claim verifier.
 *
 * Recognised tokens: `--register <path>`, `--schema <path>`, `--json`,
 * `--help` and `-h`.
 *
 * @param {string[]} [argv] - Argument tokens; defaults to `process.argv.slice(2)`.
 * @returns {{register: string, schema: string, json: boolean, help?: boolean}}
 *   Parsed options. `help` is only present when a help flag was given.
 * @throws {Error} On an unrecognised token, or when `--register` / `--schema`
 *   is not followed by a value.
 */
function parseArgs(argv = process.argv.slice(2)) {
  const args = {
    register: DEFAULT_REGISTER,
    schema: DEFAULT_SCHEMA,
    json: false,
  };

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (token === '--register' || token === '--schema') {
      const value = argv[i + 1];
      // A value-taking flag with nothing after it used to be reported as an
      // "Unknown argument"; name the real problem instead.
      if (!value) throw new Error(`Missing value for ${token}`);
      args[token.slice(2)] = value;
      i += 1; // skip the value that was just consumed
    } else if (token === '--json') {
      args.json = true;
    } else if (token === '--help' || token === '-h') {
      args.help = true;
    } else {
      throw new Error(`Unknown argument: ${token}`);
    }
  }

  return args;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Build the command-line help text for this script.
 *
 * @returns {string} Multi-line usage text covering every option that
 *   `parseArgs` accepts, including the help flags.
 */
function usage() {
  return `Usage: node scripts/verify-paper-claims.mjs [options]

Options:
  --register <path>  Claim register JSON. Default: ${DEFAULT_REGISTER}.
  --schema <path>    Claim register schema. Default: ${DEFAULT_SCHEMA}.
  --json             Print the machine-readable claim verification report.
  --help, -h         Print this help text.
`;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Check that claim text is present in (or absent from) the files it names.
 *
 * A needle matches either verbatim or after every whitespace run on both
 * sides has been collapsed to a single space. A file that cannot be read is
 * recorded as a failure and that needle is skipped.
 *
 * @param {Array<{path: string, text: string}>} needles - Text to look for, per file.
 * @param {boolean} shouldExist - `true` to require the text, `false` to forbid it.
 * @param {string[]} failures - Array that failure messages are appended to.
 * @returns {void}
 */
function assertTextNeedles(needles, shouldExist, failures) {
  const collapseWhitespace = value => value.replace(/\s+/g, ' ');

  for (const needle of needles) {
    let haystack;
    try {
      haystack = readText(needle.path);
    } catch (error) {
      failures.push(error.message);
      continue;
    }

    const collapsedHaystack = collapseWhitespace(haystack);
    const collapsedNeedle = collapseWhitespace(needle.text);
    const present = haystack.includes(needle.text) || collapsedHaystack.includes(collapsedNeedle);

    if (shouldExist) {
      if (!present) failures.push(`${needle.path} is missing claim text: ${needle.text}`);
    } else if (present) {
      failures.push(`${needle.path} contains forbidden claim text: ${needle.text}`);
    }
  }
}
|
|
71
|
+
|
|
72
|
+
/**
 * Check the local GuardBench summary against the expected headline numbers:
 * 10 passed out of 10 scenarios, 0 decision redaction leaks, and an artifact
 * redaction sweep that passed with a leak count of 0.
 *
 * @returns {string[]} One message per mismatch; empty when everything matches.
 */
function guardbenchLocalPassed() {
  const summary = readJson('benchmarks/output/guardbench-summary.json');
  const sweep = summary.artifactRedactionSweep;

  // Each entry is [expectation holds, message to report when it does not].
  const expectations = [
    [summary.passed === 10, `GuardBench passed expected 10, got ${summary.passed}`],
    [summary.scenarios === 10, `GuardBench scenarios expected 10, got ${summary.scenarios}`],
    [summary.redactionLeaks === 0, `GuardBench decision redaction leaks expected 0, got ${summary.redactionLeaks}`],
    [sweep?.passed === true, 'GuardBench artifact redaction sweep did not pass'],
    [sweep?.leakCount === 0, `GuardBench artifact leak count expected 0, got ${sweep?.leakCount}`],
  ];

  return expectations.filter(([holds]) => !holds).map(([, message]) => message);
}
|
|
84
|
+
|
|
85
|
+
/**
 * Scan the published GuardBench manifest, summary and raw output for the
 * seeded secret literal.
 *
 * @returns {string[]} One message per artifact whose text contains SEEDED_SECRET.
 */
function noPublishedSecretLeaks() {
  const publishedArtifacts = [
    'benchmarks/output/guardbench-manifest.json',
    'benchmarks/output/guardbench-summary.json',
    'benchmarks/output/guardbench-raw.json',
  ];

  const leaks = [];
  for (const artifact of publishedArtifacts) {
    if (readText(artifact).includes(SEEDED_SECRET)) {
      leaks.push(`${artifact} contains the seeded raw secret`);
    }
  }
  return leaks;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Check that the adapter registry lists both the `mem0-platform` and the
 * `zep-cloud` adapter ids.
 *
 * @returns {string[]} One message per missing adapter id, Mem0 first.
 */
function adapterRegistryHasMem0Zep() {
  const { adapters } = readJson('benchmarks/adapters/registry.json');
  const registeredIds = new Set((adapters ?? []).map(entry => entry.id));

  return ['mem0-platform', 'zep-cloud']
    .filter(requiredId => !registeredIds.has(requiredId))
    .map(missingId => `Adapter registry missing ${missingId}`);
}
|
|
104
|
+
|
|
105
|
+
/**
 * Check that the external evidence report still marks the Mem0 and Zep
 * adapters as pending dry-run evidence, matching a claim register that
 * treats external scores as pending.
 *
 * @returns {string[]} Failure messages; empty when exactly two rows exist and
 *   both have status `pending` and evidenceKind `dry-run`.
 */
function externalEvidencePending() {
  const evidence = readJson('benchmarks/output/external/guardbench-external-evidence.json');
  const rows = (evidence.adapters ?? []).filter(adapter => ['mem0-platform', 'zep-cloud'].includes(adapter.id));
  const failures = [];

  if (rows.length !== 2) failures.push(`External evidence expected Mem0 and Zep rows, got ${rows.length}`);

  // `every` is vacuously true on an empty array, so require at least one row
  // before reporting the evidence as fully verified.
  if (rows.length > 0 && rows.every(row => row.status === 'verified')) {
    failures.push('External evidence is fully verified but claim register still marks external scores pending');
  }

  for (const row of rows) {
    if (row.status !== 'pending') failures.push(`External evidence row ${row.id} should remain pending until strict live evidence passes`);
    if (row.evidenceKind !== 'dry-run') failures.push(`External evidence row ${row.id} should be dry-run evidence before live credentials`);
  }

  return failures;
}
|
|
119
|
+
|
|
120
|
+
/**
 * Check the external evidence report for the `runtime-key` marker and for
 * adapter rows whose `secretLeakCount` is not exactly 0.
 *
 * @returns {string[]} Failure messages; empty when the report is clean.
 * @throws {SyntaxError} When the report is not valid JSON.
 */
function externalEvidenceNoSecrets() {
  const raw = readText('benchmarks/output/external/guardbench-external-evidence.json');
  const parsed = JSON.parse(raw);

  const problems = raw.includes('runtime-key')
    ? ['External evidence report contains test runtime-key']
    : [];

  const leakingRows = (parsed.adapters ?? []).filter(row => row.secretLeakCount !== 0);
  for (const row of leakingRows) {
    problems.push(`External evidence row ${row.id} reports ${row.secretLeakCount} credential leak(s)`);
  }
  return problems;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Check that the paper contains both sentences that keep external-system
 * scores out of the current stage.
 *
 * @returns {string[]} One message per sentence missing from the paper text.
 */
function paperStageBoundaryExcludesExternalScores() {
  const paperText = readText('docs/paper/audrey-paper-v1.md');

  // [sentence that must appear verbatim, message reported when it does not]
  const requiredSentences = [
    ['this paper does not report external-system GuardBench scores', 'Paper missing explicit external-score exclusion'],
    ['External scores added only when live adapter runs and raw outputs are published', 'Paper missing Stage-B external-score condition'],
  ];

  const missing = [];
  for (const [sentence, message] of requiredSentences) {
    if (!paperText.includes(sentence)) missing.push(message);
  }
  return missing;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Run the GuardBench publication artifact verifier and relay its failures.
 *
 * @returns {Promise<string[]>} Empty when the verifier report is ok; otherwise
 *   each reported failure prefixed with `publication verifier: `.
 */
async function publicationVerifierOk() {
  const { ok, failures } = await verifyGuardBenchPublicationArtifacts();
  if (ok) {
    return [];
  }
  return failures.map(failure => `publication verifier: ${failure}`);
}
|
|
147
|
+
|
|
148
|
+
/**
 * Run one named artifact check from the claim register.
 *
 * A check that throws (for example because an artifact file is missing or is
 * not valid JSON) is reported as a failure of that check instead of rejecting,
 * so one broken artifact does not abort the report for every other claim.
 *
 * @param {string} name - Check identifier from a claim's `artifactChecks`.
 * @returns {Promise<string[]>} Failure messages; empty when the check passes.
 *   An unrecognised name yields a single "Unknown claim artifact check" message.
 */
async function runArtifactCheck(name) {
  try {
    switch (name) {
      case 'adapter-registry-has-mem0-zep':
        return adapterRegistryHasMem0Zep();
      case 'external-evidence-no-secrets':
        return externalEvidenceNoSecrets();
      case 'external-evidence-pending':
        return externalEvidencePending();
      case 'guardbench-local-passed':
        return guardbenchLocalPassed();
      case 'no-published-secret-leaks':
        return noPublishedSecretLeaks();
      case 'paper-stage-boundary-excludes-external-scores':
        return paperStageBoundaryExcludesExternalScores();
      case 'publication-verifier-ok':
        // `await` here so a rejection is caught by the surrounding try block.
        return await publicationVerifierOk();
      default:
        return [`Unknown claim artifact check: ${name}`];
    }
  } catch (error) {
    return [`Claim artifact check ${name} could not run: ${error?.message ?? String(error)}`];
  }
}
|
|
158
|
+
|
|
159
|
+
/**
 * Verify every claim in the paper claim register.
 *
 * The register is validated against its schema; then, for each claim, the
 * required and forbidden text needles are checked, each evidence reference's
 * file (the part before `#`) must exist, and the named artifact checks are
 * run one after another.
 *
 * @param {{register?: string, schema?: string}} [options] - Optional paths
 *   overriding DEFAULT_REGISTER and DEFAULT_SCHEMA.
 * @returns {Promise<object>} Report with `schemaVersion`, `suite`,
 *   `generatedAt`, `ok`, the resolved `register` path, per-claim `claims`
 *   reports, and the flattened `failures` list.
 */
export async function verifyPaperClaims(options = {}) {
  const registerPath = options.register ?? DEFAULT_REGISTER;
  const schemaPath = options.schema ?? DEFAULT_SCHEMA;

  const register = readJson(registerPath);
  const schema = readJson(schemaPath);
  const schemaFailures = validateSchema(register, schema, 'audrey-paper-claim-register');

  const claimReports = [];
  for (const claim of register.claims ?? []) {
    const artifactChecks = claim.artifactChecks ?? [];
    const claimFailures = [];

    assertTextNeedles(claim.requiredText ?? [], true, claimFailures);
    assertTextNeedles(claim.forbiddenText ?? [], false, claimFailures);

    for (const reference of claim.evidence ?? []) {
      // Only the file part of a `path#fragment` reference is checked.
      const evidencePath = reference.split('#')[0];
      if (!existsSync(fromRoot(evidencePath))) {
        claimFailures.push(`Missing evidence file for ${claim.id}: ${evidencePath}`);
      }
    }

    for (const checkName of artifactChecks) {
      const checkFailures = await runArtifactCheck(checkName);
      claimFailures.push(...checkFailures);
    }

    claimReports.push({
      id: claim.id,
      status: claim.status,
      ok: claimFailures.length === 0,
      artifactChecks,
      failures: claimFailures,
    });
  }

  const failures = schemaFailures.map(failure => `claim register schema: ${failure}`);
  for (const report of claimReports) {
    for (const failure of report.failures) failures.push(`${report.id}: ${failure}`);
  }

  return {
    schemaVersion: '1.0.0',
    suite: 'Audrey paper claim verification',
    generatedAt: new Date().toISOString(),
    ok: failures.length === 0,
    register: fromRoot(registerPath),
    claims: claimReports,
    failures,
  };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Command-line entry point: parse arguments, run the claim verification and
 * print the outcome.
 *
 * With `--json` the full report is printed to stdout whether or not it
 * passed; otherwise a one-line success message goes to stdout or the failure
 * list goes to stderr. The process exits with code 1 when the report is not ok.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs();
  if (args.help) {
    console.log(usage());
    return;
  }

  const report = await verifyPaperClaims(args);
  const passed = Boolean(report.ok);

  if (args.json) {
    console.log(JSON.stringify(report, null, 2));
  } else if (!passed) {
    console.error('Paper claim verification failed:');
    for (const failure of report.failures) {
      console.error(`- ${failure}`);
    }
  } else {
    console.log(`Paper claim verification passed: ${report.claims.length} claim(s)`);
  }

  if (!passed) process.exit(1);
}
|
|
220
|
+
|
|
221
|
+
// Run the CLI only when this file is the script Node was started with, so
// importing the module for `verifyPaperClaims` has no side effects.
{
  const entryScript = process.argv[1];
  const isDirectRun = Boolean(entryScript) && resolve(entryScript) === fileURLToPath(import.meta.url);
  if (isDirectRun) {
    main().catch(error => {
      console.error(error.stack ?? error.message);
      process.exit(1);
    });
  }
}
|