audrey 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -0
- package/README.md +30 -6
- package/benchmarks/adapter-self-test.mjs +6 -2
- package/benchmarks/adapters/example-allow.mjs +5 -2
- package/benchmarks/adapters/mem0-platform.mjs +19 -12
- package/benchmarks/adapters/zep-cloud.mjs +51 -27
- package/benchmarks/baselines.js +11 -6
- package/benchmarks/build-leaderboard.mjs +36 -23
- package/benchmarks/cases.js +24 -12
- package/benchmarks/create-conformance-card.mjs +12 -3
- package/benchmarks/create-submission-bundle.mjs +22 -8
- package/benchmarks/dry-run-external-adapters.mjs +24 -12
- package/benchmarks/guardbench.js +354 -124
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
- package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/benchmarks/output/guardbench-conformance-card.json +12 -12
- package/benchmarks/output/guardbench-raw.json +243 -144
- package/benchmarks/output/guardbench-summary.json +354 -230
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +12 -12
- package/benchmarks/output/submission-bundle/guardbench-raw.json +243 -144
- package/benchmarks/output/submission-bundle/guardbench-summary.json +354 -230
- package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +21 -1
- package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +23 -2
- package/benchmarks/output/submission-bundle/submission-manifest.json +15 -15
- package/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/benchmarks/output/summary.json +58 -58
- package/benchmarks/perf-snapshot.js +12 -9
- package/benchmarks/perf.bench.js +14 -6
- package/benchmarks/public-paths.mjs +11 -5
- package/benchmarks/reference-results.js +10 -5
- package/benchmarks/report.js +48 -27
- package/benchmarks/run-external-guardbench.mjs +47 -25
- package/benchmarks/run.js +112 -59
- package/benchmarks/schemas/guardbench-raw.schema.json +21 -1
- package/benchmarks/schemas/guardbench-summary.schema.json +23 -2
- package/benchmarks/validate-adapter-module.mjs +13 -10
- package/benchmarks/validate-adapter-registry.mjs +16 -5
- package/benchmarks/validate-guardbench-artifacts.mjs +76 -19
- package/benchmarks/verify-external-evidence.mjs +86 -31
- package/benchmarks/verify-publication-artifacts.mjs +34 -11
- package/benchmarks/verify-submission-bundle.mjs +9 -4
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +5 -3
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +4 -3
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +479 -172
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts.map +1 -1
- package/dist/src/action-key.js +6 -2
- package/dist/src/action-key.js.map +1 -1
- package/dist/src/adaptive.d.ts.map +1 -1
- package/dist/src/adaptive.js +4 -2
- package/dist/src/adaptive.js.map +1 -1
- package/dist/src/affect.d.ts.map +1 -1
- package/dist/src/affect.js +8 -5
- package/dist/src/affect.js.map +1 -1
- package/dist/src/audrey.d.ts +11 -1
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +110 -53
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.d.ts.map +1 -1
- package/dist/src/capsule.js +37 -15
- package/dist/src/capsule.js.map +1 -1
- package/dist/src/causal.d.ts +1 -1
- package/dist/src/causal.d.ts.map +1 -1
- package/dist/src/causal.js +4 -2
- package/dist/src/causal.js.map +1 -1
- package/dist/src/confidence.d.ts.map +1 -1
- package/dist/src/confidence.js +5 -5
- package/dist/src/confidence.js.map +1 -1
- package/dist/src/consolidate.d.ts.map +1 -1
- package/dist/src/consolidate.js +17 -9
- package/dist/src/consolidate.js.map +1 -1
- package/dist/src/context.js +1 -1
- package/dist/src/context.js.map +1 -1
- package/dist/src/controller.d.ts +17 -1
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +73 -23
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.d.ts.map +1 -1
- package/dist/src/db.js +78 -27
- package/dist/src/db.js.map +1 -1
- package/dist/src/decay.d.ts +1 -1
- package/dist/src/decay.d.ts.map +1 -1
- package/dist/src/decay.js +1 -1
- package/dist/src/decay.js.map +1 -1
- package/dist/src/embedding.d.ts +12 -4
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +18 -16
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.d.ts.map +1 -1
- package/dist/src/encode.js +5 -4
- package/dist/src/encode.js.map +1 -1
- package/dist/src/events.d.ts +3 -2
- package/dist/src/events.d.ts.map +1 -1
- package/dist/src/events.js +7 -3
- package/dist/src/events.js.map +1 -1
- package/dist/src/export.d.ts.map +1 -1
- package/dist/src/export.js +21 -7
- package/dist/src/export.js.map +1 -1
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +1 -1
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.d.ts.map +1 -1
- package/dist/src/forget.js +12 -6
- package/dist/src/forget.js.map +1 -1
- package/dist/src/fts.d.ts.map +1 -1
- package/dist/src/fts.js +20 -8
- package/dist/src/fts.js.map +1 -1
- package/dist/src/hybrid-recall.d.ts.map +1 -1
- package/dist/src/hybrid-recall.js +12 -6
- package/dist/src/hybrid-recall.js.map +1 -1
- package/dist/src/impact.d.ts.map +1 -1
- package/dist/src/impact.js +26 -10
- package/dist/src/impact.js.map +1 -1
- package/dist/src/import.d.ts.map +1 -1
- package/dist/src/import.js +11 -6
- package/dist/src/import.js.map +1 -1
- package/dist/src/index.d.ts +5 -4
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +4 -4
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.d.ts.map +1 -1
- package/dist/src/interference.js +10 -5
- package/dist/src/interference.js.map +1 -1
- package/dist/src/introspect.d.ts.map +1 -1
- package/dist/src/introspect.js +12 -6
- package/dist/src/introspect.js.map +1 -1
- package/dist/src/llm.d.ts +2 -2
- package/dist/src/llm.d.ts.map +1 -1
- package/dist/src/llm.js +6 -6
- package/dist/src/llm.js.map +1 -1
- package/dist/src/migrate.d.ts.map +1 -1
- package/dist/src/migrate.js +10 -4
- package/dist/src/migrate.js.map +1 -1
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +6 -8
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/profile.d.ts.map +1 -1
- package/dist/src/profile.js.map +1 -1
- package/dist/src/promote.d.ts.map +1 -1
- package/dist/src/promote.js +16 -7
- package/dist/src/promote.js.map +1 -1
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +1 -2
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/recall.d.ts.map +1 -1
- package/dist/src/recall.js +85 -18
- package/dist/src/recall.js.map +1 -1
- package/dist/src/redact.d.ts.map +1 -1
- package/dist/src/redact.js +9 -4
- package/dist/src/redact.js.map +1 -1
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +1 -7
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.d.ts.map +1 -1
- package/dist/src/rollback.js +4 -2
- package/dist/src/rollback.js.map +1 -1
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +37 -14
- package/dist/src/routes.js.map +1 -1
- package/dist/src/rules-compiler.d.ts.map +1 -1
- package/dist/src/rules-compiler.js +24 -2
- package/dist/src/rules-compiler.js.map +1 -1
- package/dist/src/server.js +2 -2
- package/dist/src/server.js.map +1 -1
- package/dist/src/tool-trace.d.ts +2 -2
- package/dist/src/tool-trace.d.ts.map +1 -1
- package/dist/src/tool-trace.js +12 -4
- package/dist/src/tool-trace.js.map +1 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/ulid.js +1 -1
- package/dist/src/ulid.js.map +1 -1
- package/dist/src/utils.d.ts.map +1 -1
- package/dist/src/utils.js.map +1 -1
- package/dist/src/validate.d.ts.map +1 -1
- package/dist/src/validate.js +20 -10
- package/dist/src/validate.js.map +1 -1
- package/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/audrey-paper-v1.md +6 -6
- package/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/arxiv/main.tex +6 -6
- package/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/README.md +30 -6
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +12 -12
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +243 -144
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +354 -230
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +15 -15
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +52 -52
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +21 -1
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +23 -2
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +6 -6
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +6 -6
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/package.json +18 -5
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +40 -40
- package/examples/fintech-ops-demo.js +12 -5
- package/examples/healthcare-ops-demo.js +8 -4
- package/examples/ollama-memory-agent.js +41 -13
- package/examples/stripe-demo.js +12 -5
- package/package.json +18 -5
- package/scripts/audit-release-completion.mjs +179 -101
- package/scripts/create-arxiv-source.mjs +20 -14
- package/scripts/create-paper-submission-bundle.mjs +6 -2
- package/scripts/finalize-release.mjs +111 -36
- package/scripts/prepare-release-cut.mjs +14 -6
- package/scripts/publish-release-bundle.mjs +62 -23
- package/scripts/publish-release-github-api.mjs +89 -24
- package/scripts/smoke-cli.js +26 -6
- package/scripts/sync-paper-artifacts.mjs +5 -1
- package/scripts/verify-arxiv-compile.mjs +52 -16
- package/scripts/verify-arxiv-source.mjs +45 -15
- package/scripts/verify-browser-launch-plan.mjs +28 -11
- package/scripts/verify-browser-launch-results.mjs +32 -14
- package/scripts/verify-paper-artifacts.mjs +539 -79
- package/scripts/verify-paper-claims.mjs +48 -20
- package/scripts/verify-paper-submission-bundle.mjs +22 -11
- package/scripts/verify-publication-pack.mjs +23 -9
- package/scripts/verify-release-readiness.mjs +250 -71
package/benchmarks/run.js
CHANGED
|
@@ -68,7 +68,9 @@ function normalizeSuiteSelection(value = 'all') {
|
|
|
68
68
|
|
|
69
69
|
const invalid = selected.filter(token => !ALL_SUITE_IDS.includes(token));
|
|
70
70
|
if (invalid.length > 0) {
|
|
71
|
-
throw new Error(
|
|
71
|
+
throw new Error(
|
|
72
|
+
`Unknown benchmark suite(s): ${invalid.join(', ')}. Valid: all, ${ALL_SUITE_IDS.join(', ')}`,
|
|
73
|
+
);
|
|
72
74
|
}
|
|
73
75
|
return [...new Set(selected)];
|
|
74
76
|
}
|
|
@@ -94,12 +96,19 @@ function evaluateCase(benchmarkCase, results) {
|
|
|
94
96
|
const expected = (benchmarkCase.expectAny || []).map(normalize);
|
|
95
97
|
const required = (benchmarkCase.expectAll || []).map(normalize);
|
|
96
98
|
const forbidden = (benchmarkCase.forbid || []).map(normalize);
|
|
97
|
-
const firstMatchIndex =
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
99
|
+
const firstMatchIndex =
|
|
100
|
+
expected.length === 0
|
|
101
|
+
? -1
|
|
102
|
+
: normalizedContents.findIndex(content =>
|
|
103
|
+
expected.some(expectation => content.includes(expectation)),
|
|
104
|
+
);
|
|
105
|
+
const firstForbiddenIndex = normalizedContents.findIndex(content =>
|
|
106
|
+
forbidden.some(blocked => content.includes(blocked)),
|
|
107
|
+
);
|
|
101
108
|
const matched = firstMatchIndex !== -1;
|
|
102
|
-
const requiredMatches = required.filter(expectation =>
|
|
109
|
+
const requiredMatches = required.filter(expectation =>
|
|
110
|
+
normalizedContents.some(content => content.includes(expectation)),
|
|
111
|
+
);
|
|
103
112
|
const matchedRequired = required.length > 0 && requiredMatches.length === required.length;
|
|
104
113
|
const leakedForbidden = firstForbiddenIndex !== -1;
|
|
105
114
|
|
|
@@ -108,16 +117,21 @@ function evaluateCase(benchmarkCase, results) {
|
|
|
108
117
|
return {
|
|
109
118
|
passed: score === 1,
|
|
110
119
|
score,
|
|
111
|
-
summary: leakedForbidden
|
|
120
|
+
summary: leakedForbidden
|
|
121
|
+
? 'leaked restricted content'
|
|
122
|
+
: results.length === 0
|
|
123
|
+
? 'correct abstention'
|
|
124
|
+
: 'no leak, but retrieved tangential context',
|
|
112
125
|
};
|
|
113
126
|
}
|
|
114
127
|
|
|
115
128
|
if (required.length > 0) {
|
|
116
|
-
const score =
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
129
|
+
const score =
|
|
130
|
+
matchedRequired && !leakedForbidden
|
|
131
|
+
? 1
|
|
132
|
+
: leakedForbidden
|
|
133
|
+
? 0
|
|
134
|
+
: Math.min(0.5, requiredMatches.length / required.length);
|
|
121
135
|
const missing = required.filter(expectation => !requiredMatches.includes(expectation));
|
|
122
136
|
return {
|
|
123
137
|
passed: score === 1,
|
|
@@ -154,7 +168,9 @@ async function seedRetrievalCase(brain, benchmarkCase) {
|
|
|
154
168
|
const ids = [];
|
|
155
169
|
for (let index = 0; index < benchmarkCase.memory.length; index++) {
|
|
156
170
|
const memory = benchmarkCase.memory[index];
|
|
157
|
-
const supersedes = Number.isInteger(memory.supersedesIndex)
|
|
171
|
+
const supersedes = Number.isInteger(memory.supersedesIndex)
|
|
172
|
+
? ids[memory.supersedesIndex]
|
|
173
|
+
: undefined;
|
|
158
174
|
const id = await brain.encode({
|
|
159
175
|
content: memory.content,
|
|
160
176
|
source: memory.source,
|
|
@@ -264,7 +280,9 @@ async function executeGuardStep(brain, step, refs) {
|
|
|
264
280
|
if (step.type === 'expectGuardAfterError') {
|
|
265
281
|
const receiptId = step.receiptRef ? refs.get(step.receiptRef) : step.receiptId;
|
|
266
282
|
if (!receiptId) {
|
|
267
|
-
throw new Error(
|
|
283
|
+
throw new Error(
|
|
284
|
+
`Missing guard benchmark receipt reference: ${step.receiptRef || step.receiptId}`,
|
|
285
|
+
);
|
|
268
286
|
}
|
|
269
287
|
|
|
270
288
|
try {
|
|
@@ -278,15 +296,19 @@ async function executeGuardStep(brain, step, refs) {
|
|
|
278
296
|
} catch (err) {
|
|
279
297
|
const message = err instanceof Error ? err.message : String(err);
|
|
280
298
|
if (step.errorIncludes && !message.includes(step.errorIncludes)) {
|
|
281
|
-
throw new Error(`Guard hardening expected "${step.errorIncludes}" but got "${message}"
|
|
299
|
+
throw new Error(`Guard hardening expected "${step.errorIncludes}" but got "${message}"`, {
|
|
300
|
+
cause: err,
|
|
301
|
+
});
|
|
282
302
|
}
|
|
283
303
|
const label = step.label ?? 'after_error_rejected';
|
|
284
|
-
return [
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
304
|
+
return [
|
|
305
|
+
{
|
|
306
|
+
id: `${receiptId}:${label}`,
|
|
307
|
+
content: `guard_hardened:${label} error:${message}`,
|
|
308
|
+
type: 'guard_hardening',
|
|
309
|
+
score: 1,
|
|
310
|
+
},
|
|
311
|
+
];
|
|
290
312
|
}
|
|
291
313
|
|
|
292
314
|
throw new Error(`Guard hardening expected an error for receipt ${receiptId}`);
|
|
@@ -299,18 +321,20 @@ async function seedGuardCase(brain, benchmarkCase) {
|
|
|
299
321
|
const refs = new Map();
|
|
300
322
|
const diagnostics = [];
|
|
301
323
|
for (const step of benchmarkCase.steps || []) {
|
|
302
|
-
diagnostics.push(...await executeGuardStep(brain, step, refs));
|
|
324
|
+
diagnostics.push(...(await executeGuardStep(brain, step, refs)));
|
|
303
325
|
}
|
|
304
326
|
return diagnostics;
|
|
305
327
|
}
|
|
306
328
|
|
|
307
329
|
function guardDecisionRows(decision) {
|
|
308
|
-
const rows = [
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
330
|
+
const rows = [
|
|
331
|
+
{
|
|
332
|
+
id: decision.receipt_id,
|
|
333
|
+
content: `decision:${decision.decision} verdict:${decision.verdict} risk:${decision.risk_score} ${decision.summary}`,
|
|
334
|
+
type: 'guard_decision',
|
|
335
|
+
score: 1,
|
|
336
|
+
},
|
|
337
|
+
];
|
|
314
338
|
|
|
315
339
|
for (const [index, warning] of decision.warnings.entries()) {
|
|
316
340
|
rows.push({
|
|
@@ -380,12 +404,15 @@ async function runAudreyCase(benchmarkCase, providerConfig) {
|
|
|
380
404
|
|
|
381
405
|
async function runBaselineCase(system, benchmarkCase, providerConfig) {
|
|
382
406
|
if (benchmarkCase.kind === 'guard') {
|
|
383
|
-
return [
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
407
|
+
return [
|
|
408
|
+
{
|
|
409
|
+
id: `${system.toLowerCase().replace(/[^a-z0-9]+/g, '-')}-guard-baseline`,
|
|
410
|
+
content:
|
|
411
|
+
'decision:go verdict:clear summary:retrieval-only baseline has no before-action guard controller',
|
|
412
|
+
type: 'guard_decision',
|
|
413
|
+
score: 0,
|
|
414
|
+
},
|
|
415
|
+
];
|
|
389
416
|
}
|
|
390
417
|
|
|
391
418
|
return runBaselineScenario(system, benchmarkCase, providerConfig, 5);
|
|
@@ -394,9 +421,18 @@ async function runBaselineCase(system, benchmarkCase, providerConfig) {
|
|
|
394
421
|
async function runSystemsForCase(benchmarkCase, providerConfig) {
|
|
395
422
|
const systems = [
|
|
396
423
|
{ system: 'Audrey', run: () => runAudreyCase(benchmarkCase, providerConfig) },
|
|
397
|
-
{
|
|
398
|
-
|
|
399
|
-
|
|
424
|
+
{
|
|
425
|
+
system: 'Vector Only',
|
|
426
|
+
run: () => runBaselineCase('Vector Only', benchmarkCase, providerConfig),
|
|
427
|
+
},
|
|
428
|
+
{
|
|
429
|
+
system: 'Keyword + Recency',
|
|
430
|
+
run: () => runBaselineCase('Keyword + Recency', benchmarkCase, providerConfig),
|
|
431
|
+
},
|
|
432
|
+
{
|
|
433
|
+
system: 'Recent Window',
|
|
434
|
+
run: () => runBaselineCase('Recent Window', benchmarkCase, providerConfig),
|
|
435
|
+
},
|
|
400
436
|
];
|
|
401
437
|
|
|
402
438
|
const results = [];
|
|
@@ -504,13 +540,13 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
|
|
|
504
540
|
|
|
505
541
|
if (audrey.scorePercent < settings.minAudreyScore) {
|
|
506
542
|
failures.push(
|
|
507
|
-
`Audrey score ${audrey.scorePercent.toFixed(1)}% fell below ${settings.minAudreyScore.toFixed(1)}
|
|
543
|
+
`Audrey score ${audrey.scorePercent.toFixed(1)}% fell below ${settings.minAudreyScore.toFixed(1)}%.`,
|
|
508
544
|
);
|
|
509
545
|
}
|
|
510
546
|
|
|
511
547
|
if (audrey.passRate < settings.minAudreyPassRate) {
|
|
512
548
|
failures.push(
|
|
513
|
-
`Audrey pass rate ${audrey.passRate.toFixed(1)}% fell below ${settings.minAudreyPassRate.toFixed(1)}
|
|
549
|
+
`Audrey pass rate ${audrey.passRate.toFixed(1)}% fell below ${settings.minAudreyPassRate.toFixed(1)}%.`,
|
|
514
550
|
);
|
|
515
551
|
}
|
|
516
552
|
|
|
@@ -518,8 +554,8 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
|
|
|
518
554
|
const margin = audrey.scorePercent - strongestBaseline.scorePercent;
|
|
519
555
|
if (margin < settings.minMarginOverBaseline) {
|
|
520
556
|
failures.push(
|
|
521
|
-
`Audrey beat ${strongestBaseline.system} by ${margin.toFixed(1)} points, below the required `
|
|
522
|
-
|
|
557
|
+
`Audrey beat ${strongestBaseline.system} by ${margin.toFixed(1)} points, below the required ` +
|
|
558
|
+
`${settings.minMarginOverBaseline.toFixed(1)}-point margin.`,
|
|
523
559
|
);
|
|
524
560
|
}
|
|
525
561
|
}
|
|
@@ -531,7 +567,9 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
|
|
|
531
567
|
return {
|
|
532
568
|
audrey,
|
|
533
569
|
strongestBaseline,
|
|
534
|
-
marginOverBaseline: strongestBaseline
|
|
570
|
+
marginOverBaseline: strongestBaseline
|
|
571
|
+
? audrey.scorePercent - strongestBaseline.scorePercent
|
|
572
|
+
: null,
|
|
535
573
|
thresholds: settings,
|
|
536
574
|
};
|
|
537
575
|
}
|
|
@@ -563,7 +601,9 @@ export async function runBenchmarkSuite(options = {}) {
|
|
|
563
601
|
}
|
|
564
602
|
}
|
|
565
603
|
|
|
566
|
-
const comparableCaseResults = caseResults.filter(
|
|
604
|
+
const comparableCaseResults = caseResults.filter(
|
|
605
|
+
caseResult => caseResult.comparable_to_baselines,
|
|
606
|
+
);
|
|
567
607
|
const overallCaseResults = comparableCaseResults.length > 0 ? comparableCaseResults : caseResults;
|
|
568
608
|
const overallScope = comparableCaseResults.length > 0 ? 'comparable_suites' : 'selected_suites';
|
|
569
609
|
const overallSuiteIds = [...new Set(overallCaseResults.map(caseResult => caseResult.suite))];
|
|
@@ -579,10 +619,14 @@ export async function runBenchmarkSuite(options = {}) {
|
|
|
579
619
|
suites: suiteIds,
|
|
580
620
|
},
|
|
581
621
|
methodology: {
|
|
582
|
-
localBenchmark:
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
622
|
+
localBenchmark:
|
|
623
|
+
'Local regression suite inspired by LongMemEval-style retrieval, operation-level lifecycle, and agent guard-loop benchmarks',
|
|
624
|
+
retrievalBenchmark:
|
|
625
|
+
'Information extraction, updates, reasoning, procedural learning, privacy, abstention, and conflict handling',
|
|
626
|
+
operationsBenchmark:
|
|
627
|
+
'Update, overwrite, delete, merge, and abstention behavior after lifecycle operations',
|
|
628
|
+
guardBenchmark:
|
|
629
|
+
'Memory-before-action controller behavior: receipts, learned tool-failure cautions, strict blocking reflexes, and guard-after hardening',
|
|
586
630
|
externalLeaderboard: 'Published LoCoMo scores from official papers and project blogs',
|
|
587
631
|
},
|
|
588
632
|
local: {
|
|
@@ -615,10 +659,10 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
|
|
|
615
659
|
});
|
|
616
660
|
const gate = args.check
|
|
617
661
|
? assertBenchmarkGuardrails(summary, {
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
662
|
+
minAudreyScore: args.minAudreyScore,
|
|
663
|
+
minAudreyPassRate: args.minAudreyPassRate,
|
|
664
|
+
minMarginOverBaseline: args.minMarginOverBaseline,
|
|
665
|
+
})
|
|
622
666
|
: null;
|
|
623
667
|
|
|
624
668
|
if (args.jsonOnly) {
|
|
@@ -629,15 +673,22 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
|
|
|
629
673
|
const lines = [];
|
|
630
674
|
lines.push('Audrey benchmark complete.');
|
|
631
675
|
lines.push('');
|
|
632
|
-
lines.push(
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
676
|
+
lines.push(
|
|
677
|
+
`Suites: ${summary.config.suites.map(suiteId => SUITE_LABELS.get(suiteId) || suiteId).join(', ')}`,
|
|
678
|
+
);
|
|
679
|
+
lines.push(
|
|
680
|
+
`Scope: ${summary.local.overall_scope} (${summary.local.overall_suite_ids.join(', ')})`,
|
|
681
|
+
);
|
|
682
|
+
const comparableCaseCount = summary.local.cases.filter(testCase =>
|
|
683
|
+
summary.local.overall_suite_ids.includes(testCase.suite),
|
|
684
|
+
).length;
|
|
685
|
+
lines.push(
|
|
686
|
+
`Cases: ${summary.local.cases.length} total; ${comparableCaseCount} in combined local chart`,
|
|
687
|
+
);
|
|
637
688
|
for (const row of summary.local.overall) {
|
|
638
689
|
lines.push(
|
|
639
|
-
`${row.system}: ${row.scorePercent.toFixed(1)}% score, ${row.passRate.toFixed(1)}% pass rate, `
|
|
640
|
-
|
|
690
|
+
`${row.system}: ${row.scorePercent.toFixed(1)}% score, ${row.passRate.toFixed(1)}% pass rate, ` +
|
|
691
|
+
`${row.avgDurationMs.toFixed(1)} ms avg/case`,
|
|
641
692
|
);
|
|
642
693
|
}
|
|
643
694
|
lines.push('');
|
|
@@ -667,7 +718,9 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
|
|
|
667
718
|
? `${gate.strongestBaseline.system} by ${gate.marginOverBaseline.toFixed(1)} points`
|
|
668
719
|
: 'all local baselines';
|
|
669
720
|
lines.push('');
|
|
670
|
-
lines.push(
|
|
721
|
+
lines.push(
|
|
722
|
+
`Regression gate passed: Audrey stayed above ${gate.thresholds.minAudreyScore.toFixed(1)}% and ahead of ${baselineLabel}.`,
|
|
723
|
+
);
|
|
671
724
|
}
|
|
672
725
|
|
|
673
726
|
out(lines.join('\n'));
|
|
@@ -25,6 +25,23 @@
|
|
|
25
25
|
"artifactRedactionSweep": { "$ref": "#/$defs/artifactRedactionSweep" }
|
|
26
26
|
},
|
|
27
27
|
"$defs": {
|
|
28
|
+
"jsonValue": {
|
|
29
|
+
"anyOf": [
|
|
30
|
+
{ "type": "null" },
|
|
31
|
+
{ "type": "string" },
|
|
32
|
+
{ "type": "boolean" },
|
|
33
|
+
{ "type": "number" },
|
|
34
|
+
{
|
|
35
|
+
"type": "array",
|
|
36
|
+
"items": { "$ref": "#/$defs/jsonValue" }
|
|
37
|
+
},
|
|
38
|
+
{ "$ref": "#/$defs/jsonObject" }
|
|
39
|
+
]
|
|
40
|
+
},
|
|
41
|
+
"jsonObject": {
|
|
42
|
+
"type": "object",
|
|
43
|
+
"additionalProperties": { "$ref": "#/$defs/jsonValue" }
|
|
44
|
+
},
|
|
28
45
|
"provenance": {
|
|
29
46
|
"type": "object",
|
|
30
47
|
"additionalProperties": false,
|
|
@@ -129,11 +146,14 @@
|
|
|
129
146
|
},
|
|
130
147
|
"summary": { "type": "string", "minLength": 1 },
|
|
131
148
|
"recallErrors": { "type": "array" },
|
|
149
|
+
"adapterExtensions": { "$ref": "#/$defs/jsonObject" },
|
|
132
150
|
"leakedSecrets": {
|
|
133
151
|
"type": "array",
|
|
134
152
|
"items": { "type": "string" }
|
|
135
153
|
},
|
|
136
|
-
"requiredEvidenceMatched": { "type": "boolean" }
|
|
154
|
+
"requiredEvidenceMatched": { "type": "boolean" },
|
|
155
|
+
"hasEvidenceForDecision": { "type": "boolean" },
|
|
156
|
+
"lineageTextMatched": { "type": "boolean" }
|
|
137
157
|
}
|
|
138
158
|
},
|
|
139
159
|
"artifactRedactionSweep": {
|
|
@@ -84,6 +84,23 @@
|
|
|
84
84
|
"artifactRedactionSweep": { "$ref": "#/$defs/artifactRedactionSweep" }
|
|
85
85
|
},
|
|
86
86
|
"$defs": {
|
|
87
|
+
"jsonValue": {
|
|
88
|
+
"anyOf": [
|
|
89
|
+
{ "type": "null" },
|
|
90
|
+
{ "type": "string" },
|
|
91
|
+
{ "type": "boolean" },
|
|
92
|
+
{ "type": "number" },
|
|
93
|
+
{
|
|
94
|
+
"type": "array",
|
|
95
|
+
"items": { "$ref": "#/$defs/jsonValue" }
|
|
96
|
+
},
|
|
97
|
+
{ "$ref": "#/$defs/jsonObject" }
|
|
98
|
+
]
|
|
99
|
+
},
|
|
100
|
+
"jsonObject": {
|
|
101
|
+
"type": "object",
|
|
102
|
+
"additionalProperties": { "$ref": "#/$defs/jsonValue" }
|
|
103
|
+
},
|
|
87
104
|
"latency": {
|
|
88
105
|
"type": "object",
|
|
89
106
|
"additionalProperties": false,
|
|
@@ -132,7 +149,8 @@
|
|
|
132
149
|
"evidenceRecall": { "type": "number", "minimum": 0, "maximum": 1 },
|
|
133
150
|
"redactionLeaks": { "type": "integer", "minimum": 0 },
|
|
134
151
|
"recallDegradationDetectionRate": { "type": "number", "minimum": 0, "maximum": 1 },
|
|
135
|
-
"latency": { "$ref": "#/$defs/latency" }
|
|
152
|
+
"latency": { "$ref": "#/$defs/latency" },
|
|
153
|
+
"lineageRichness": { "type": "number", "minimum": 0, "maximum": 1 }
|
|
136
154
|
}
|
|
137
155
|
},
|
|
138
156
|
"resultRow": {
|
|
@@ -178,11 +196,14 @@
|
|
|
178
196
|
},
|
|
179
197
|
"summary": { "type": "string", "minLength": 1 },
|
|
180
198
|
"recallErrors": { "type": "array" },
|
|
199
|
+
"adapterExtensions": { "$ref": "#/$defs/jsonObject" },
|
|
181
200
|
"leakedSecrets": {
|
|
182
201
|
"type": "array",
|
|
183
202
|
"items": { "type": "string" }
|
|
184
203
|
},
|
|
185
|
-
"requiredEvidenceMatched": { "type": "boolean" }
|
|
204
|
+
"requiredEvidenceMatched": { "type": "boolean" },
|
|
205
|
+
"hasEvidenceForDecision": { "type": "boolean" },
|
|
206
|
+
"lineageTextMatched": { "type": "boolean" }
|
|
186
207
|
}
|
|
187
208
|
},
|
|
188
209
|
"caseResult": {
|
|
@@ -42,9 +42,10 @@ export async function validateAdapterModuleFile(options = {}) {
|
|
|
42
42
|
} else {
|
|
43
43
|
try {
|
|
44
44
|
const mod = await import(pathToFileURL(adapterPath).href);
|
|
45
|
-
const candidate =
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
const candidate =
|
|
46
|
+
typeof mod.createGuardBenchAdapter === 'function'
|
|
47
|
+
? await mod.createGuardBenchAdapter()
|
|
48
|
+
: (mod.default ?? mod.adapter);
|
|
48
49
|
adapter = validateGuardBenchAdapter(candidate, adapterPath);
|
|
49
50
|
} catch (error) {
|
|
50
51
|
failures.push(error.message);
|
|
@@ -57,12 +58,12 @@ export async function validateAdapterModuleFile(options = {}) {
|
|
|
57
58
|
moduleFile: basename(adapterPath),
|
|
58
59
|
adapter: adapter
|
|
59
60
|
? {
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
61
|
+
name: adapter.name,
|
|
62
|
+
description: adapter.description ?? null,
|
|
63
|
+
hasSetup: typeof adapter.setup === 'function',
|
|
64
|
+
hasDecide: typeof adapter.decide === 'function',
|
|
65
|
+
hasCleanup: typeof adapter.cleanup === 'function',
|
|
66
|
+
}
|
|
66
67
|
: null,
|
|
67
68
|
contract: {
|
|
68
69
|
moduleFormat: 'ESM',
|
|
@@ -87,7 +88,9 @@ async function main() {
|
|
|
87
88
|
} else if (validation.ok) {
|
|
88
89
|
console.log(`GuardBench adapter module validation passed: ${validation.adapterPath}`);
|
|
89
90
|
console.log(`Adapter: ${validation.adapter.name}`);
|
|
90
|
-
console.log(
|
|
91
|
+
console.log(
|
|
92
|
+
`Methods: setup=${validation.adapter.hasSetup}, decide=${validation.adapter.hasDecide}, cleanup=${validation.adapter.hasCleanup}`,
|
|
93
|
+
);
|
|
91
94
|
} else {
|
|
92
95
|
console.error('GuardBench adapter module validation failed:');
|
|
93
96
|
for (const failure of validation.failures) console.error(`- ${failure}`);
|
|
@@ -69,11 +69,18 @@ export async function validateAdapterRegistry(options = {}) {
|
|
|
69
69
|
failures.push(`Adapter ${adapter.id} has credentialMode=none but declares requiredEnv`);
|
|
70
70
|
}
|
|
71
71
|
if (adapter.credentialMode === 'runtime-env' && adapter.requiredEnv.length === 0) {
|
|
72
|
-
failures.push(
|
|
72
|
+
failures.push(
|
|
73
|
+
`Adapter ${adapter.id} has credentialMode=runtime-env but declares no requiredEnv`,
|
|
74
|
+
);
|
|
73
75
|
}
|
|
74
76
|
for (const [commandName, command] of Object.entries(adapter.commands ?? {})) {
|
|
75
|
-
if (
|
|
76
|
-
|
|
77
|
+
if (
|
|
78
|
+
(commandName === 'moduleValidate' || commandName === 'selfTest') &&
|
|
79
|
+
!command.includes(adapter.path)
|
|
80
|
+
) {
|
|
81
|
+
failures.push(
|
|
82
|
+
`Adapter ${adapter.id} command ${commandName} does not reference ${adapter.path}`,
|
|
83
|
+
);
|
|
77
84
|
}
|
|
78
85
|
}
|
|
79
86
|
if (!existsSync(resolve(adapter.path))) {
|
|
@@ -89,10 +96,14 @@ export async function validateAdapterRegistry(options = {}) {
|
|
|
89
96
|
failures: report.failures,
|
|
90
97
|
});
|
|
91
98
|
if (!report.ok) {
|
|
92
|
-
failures.push(
|
|
99
|
+
failures.push(
|
|
100
|
+
`Adapter ${adapter.id} failed module validation: ${report.failures.join('; ')}`,
|
|
101
|
+
);
|
|
93
102
|
}
|
|
94
103
|
if (report.adapter?.name && report.adapter.name !== adapter.name) {
|
|
95
|
-
failures.push(
|
|
104
|
+
failures.push(
|
|
105
|
+
`Adapter ${adapter.id} registry name ${adapter.name} does not match module name ${report.adapter.name}`,
|
|
106
|
+
);
|
|
96
107
|
}
|
|
97
108
|
}
|
|
98
109
|
|
|
@@ -134,13 +134,25 @@ export function validateSchema(value, schema, label, root = schema) {
|
|
|
134
134
|
if (currentSchema.minLength != null && String(current).length < currentSchema.minLength) {
|
|
135
135
|
errors.push(`${path}: shorter than minLength ${currentSchema.minLength}`);
|
|
136
136
|
}
|
|
137
|
-
if (
|
|
137
|
+
if (
|
|
138
|
+
currentSchema.pattern &&
|
|
139
|
+
typeof current === 'string' &&
|
|
140
|
+
!new RegExp(currentSchema.pattern).test(current)
|
|
141
|
+
) {
|
|
138
142
|
errors.push(`${path}: does not match ${currentSchema.pattern}`);
|
|
139
143
|
}
|
|
140
|
-
if (
|
|
144
|
+
if (
|
|
145
|
+
currentSchema.minimum != null &&
|
|
146
|
+
typeof current === 'number' &&
|
|
147
|
+
current < currentSchema.minimum
|
|
148
|
+
) {
|
|
141
149
|
errors.push(`${path}: below minimum ${currentSchema.minimum}`);
|
|
142
150
|
}
|
|
143
|
-
if (
|
|
151
|
+
if (
|
|
152
|
+
currentSchema.maximum != null &&
|
|
153
|
+
typeof current === 'number' &&
|
|
154
|
+
current > currentSchema.maximum
|
|
155
|
+
) {
|
|
144
156
|
errors.push(`${path}: above maximum ${currentSchema.maximum}`);
|
|
145
157
|
}
|
|
146
158
|
|
|
@@ -155,7 +167,8 @@ export function validateSchema(value, schema, label, root = schema) {
|
|
|
155
167
|
|
|
156
168
|
if (currentSchema.type === 'object') {
|
|
157
169
|
for (const required of currentSchema.required ?? []) {
|
|
158
|
-
if (!Object.hasOwn(current, required))
|
|
170
|
+
if (!Object.hasOwn(current, required))
|
|
171
|
+
errors.push(`${path}: missing required property ${required}`);
|
|
159
172
|
}
|
|
160
173
|
if (currentSchema.additionalProperties === false) {
|
|
161
174
|
for (const key of Object.keys(current)) {
|
|
@@ -177,7 +190,10 @@ export function validateSchema(value, schema, label, root = schema) {
|
|
|
177
190
|
function stableJson(value) {
|
|
178
191
|
if (Array.isArray(value)) return `[${value.map(stableJson).join(',')}]`;
|
|
179
192
|
if (value && typeof value === 'object') {
|
|
180
|
-
return `{${Object.keys(value)
|
|
193
|
+
return `{${Object.keys(value)
|
|
194
|
+
.sort()
|
|
195
|
+
.map(key => `${JSON.stringify(key)}:${stableJson(value[key])}`)
|
|
196
|
+
.join(',')}}`;
|
|
181
197
|
}
|
|
182
198
|
return JSON.stringify(value);
|
|
183
199
|
}
|
|
@@ -231,7 +247,11 @@ export function validateGuardBenchArtifacts(options = {}) {
|
|
|
231
247
|
failures.push(error.message);
|
|
232
248
|
continue;
|
|
233
249
|
}
|
|
234
|
-
for (const error of validateSchema(
|
|
250
|
+
for (const error of validateSchema(
|
|
251
|
+
optionalArtifacts[key],
|
|
252
|
+
schemas[key],
|
|
253
|
+
`guardbench-${key}`,
|
|
254
|
+
)) {
|
|
235
255
|
failures.push(`${basename(path)}: ${error}`);
|
|
236
256
|
}
|
|
237
257
|
}
|
|
@@ -243,7 +263,9 @@ export function validateGuardBenchArtifacts(options = {}) {
|
|
|
243
263
|
if (!Object.hasOwn(currentHashes, file)) {
|
|
244
264
|
failures.push(`external-run-metadata.json: artifactHashes includes unknown file ${file}`);
|
|
245
265
|
} else if (currentHashes[file] !== expectedHash) {
|
|
246
|
-
failures.push(
|
|
266
|
+
failures.push(
|
|
267
|
+
`external-run-metadata.json: artifactHashes.${file} does not match current artifact`,
|
|
268
|
+
);
|
|
247
269
|
}
|
|
248
270
|
}
|
|
249
271
|
for (const file of Object.values(ARTIFACT_FILES)) {
|
|
@@ -255,27 +277,58 @@ export function validateGuardBenchArtifacts(options = {}) {
|
|
|
255
277
|
const conformanceCard = optionalArtifacts.conformanceCard;
|
|
256
278
|
if (conformanceCard) {
|
|
257
279
|
const currentHashes = computeGuardBenchArtifactHashes(dir);
|
|
258
|
-
for (const [file, expectedHash] of Object.entries(
|
|
280
|
+
for (const [file, expectedHash] of Object.entries(
|
|
281
|
+
conformanceCard.integrity?.artifactHashes ?? {},
|
|
282
|
+
)) {
|
|
259
283
|
if (!Object.hasOwn(currentHashes, file)) {
|
|
260
|
-
failures.push(
|
|
284
|
+
failures.push(
|
|
285
|
+
`guardbench-conformance-card.json: integrity.artifactHashes includes unknown file ${file}`,
|
|
286
|
+
);
|
|
261
287
|
} else if (currentHashes[file] !== expectedHash) {
|
|
262
|
-
failures.push(
|
|
288
|
+
failures.push(
|
|
289
|
+
`guardbench-conformance-card.json: integrity.artifactHashes.${file} does not match current artifact`,
|
|
290
|
+
);
|
|
263
291
|
}
|
|
264
292
|
}
|
|
265
293
|
if (conformanceCard.manifestVersion !== artifacts.manifest.manifestVersion) {
|
|
266
|
-
failures.push(
|
|
294
|
+
failures.push(
|
|
295
|
+
'guardbench-conformance-card.json: manifestVersion does not match guardbench-manifest.json',
|
|
296
|
+
);
|
|
267
297
|
}
|
|
268
298
|
if (conformanceCard.suiteId !== artifacts.manifest.suiteId) {
|
|
269
|
-
failures.push(
|
|
299
|
+
failures.push(
|
|
300
|
+
'guardbench-conformance-card.json: suiteId does not match guardbench-manifest.json',
|
|
301
|
+
);
|
|
270
302
|
}
|
|
271
|
-
if (
|
|
272
|
-
|
|
303
|
+
if (
|
|
304
|
+
!artifacts.summary.systemSummaries?.some(
|
|
305
|
+
row => row.system === conformanceCard.subject?.name,
|
|
306
|
+
)
|
|
307
|
+
) {
|
|
308
|
+
failures.push(
|
|
309
|
+
'guardbench-conformance-card.json: subject.name is not present in guardbench-summary.json',
|
|
310
|
+
);
|
|
273
311
|
}
|
|
274
312
|
}
|
|
275
313
|
|
|
276
|
-
assertSameJson(
|
|
277
|
-
|
|
278
|
-
|
|
314
|
+
assertSameJson(
|
|
315
|
+
artifacts.summary.manifest,
|
|
316
|
+
artifacts.manifest,
|
|
317
|
+
'summary.manifest vs guardbench-manifest.json',
|
|
318
|
+
failures,
|
|
319
|
+
);
|
|
320
|
+
assertSameJson(
|
|
321
|
+
artifacts.summary.cases,
|
|
322
|
+
artifacts.raw.cases,
|
|
323
|
+
'summary.cases vs raw.cases',
|
|
324
|
+
failures,
|
|
325
|
+
);
|
|
326
|
+
assertSameJson(
|
|
327
|
+
artifacts.summary.provenance,
|
|
328
|
+
artifacts.raw.provenance,
|
|
329
|
+
'summary.provenance vs raw.provenance',
|
|
330
|
+
failures,
|
|
331
|
+
);
|
|
279
332
|
if (artifacts.summary.generatedAt !== artifacts.raw.generatedAt) {
|
|
280
333
|
failures.push('summary.generatedAt vs raw.generatedAt: cross-artifact mismatch');
|
|
281
334
|
}
|
|
@@ -290,7 +343,9 @@ export function validateGuardBenchArtifacts(options = {}) {
|
|
|
290
343
|
failures.push('guardbench-raw.json: artifactRedactionSweep did not pass');
|
|
291
344
|
}
|
|
292
345
|
|
|
293
|
-
const artifactText = Object.values(artifacts)
|
|
346
|
+
const artifactText = Object.values(artifacts)
|
|
347
|
+
.map(value => JSON.stringify(value))
|
|
348
|
+
.join('\n');
|
|
294
349
|
for (const secret of seededSecrets) {
|
|
295
350
|
if (secret && artifactText.includes(secret)) {
|
|
296
351
|
failures.push(`raw seeded secret leaked into GuardBench artifacts: ${secret}`);
|
|
@@ -310,7 +365,9 @@ export function validateGuardBenchArtifacts(options = {}) {
|
|
|
310
365
|
dir: publicPath(dir),
|
|
311
366
|
schemasDir: publicPath(schemasDir),
|
|
312
367
|
files: Object.values(ARTIFACT_FILES),
|
|
313
|
-
optionalFiles: Object.values(OPTIONAL_ARTIFACT_FILES).filter(file =>
|
|
368
|
+
optionalFiles: Object.values(OPTIONAL_ARTIFACT_FILES).filter(file =>
|
|
369
|
+
existsSync(join(dir, file)),
|
|
370
|
+
),
|
|
314
371
|
failures,
|
|
315
372
|
};
|
|
316
373
|
}
|