audrey 0.23.1 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +101 -15
- package/LICENSE +21 -21
- package/README.md +232 -6
- package/SECURITY.md +2 -1
- package/benchmarks/adapter-kit.mjs +20 -0
- package/benchmarks/adapter-self-test.mjs +166 -0
- package/benchmarks/adapters/example-allow.mjs +28 -0
- package/benchmarks/adapters/mem0-platform.mjs +267 -0
- package/benchmarks/adapters/registry.json +51 -0
- package/benchmarks/adapters/zep-cloud.mjs +280 -0
- package/benchmarks/baselines.js +169 -0
- package/benchmarks/build-leaderboard.mjs +170 -0
- package/benchmarks/cases.js +537 -0
- package/benchmarks/create-conformance-card.mjs +139 -0
- package/benchmarks/create-submission-bundle.mjs +176 -0
- package/benchmarks/dry-run-external-adapters.mjs +165 -0
- package/benchmarks/guardbench.js +1125 -0
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/guardbench-manifest.json +414 -0
- package/benchmarks/output/guardbench-raw.json +1271 -0
- package/benchmarks/output/guardbench-summary.json +2107 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
- package/benchmarks/output/submission-bundle/guardbench-raw.json +1271 -0
- package/benchmarks/output/submission-bundle/guardbench-summary.json +2107 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +184 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +249 -0
- package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/benchmarks/output/summary.json +2354 -0
- package/benchmarks/perf-snapshot.js +304 -0
- package/benchmarks/perf.bench.js +161 -0
- package/benchmarks/public-paths.mjs +78 -0
- package/benchmarks/reference-results.js +70 -0
- package/benchmarks/report.js +259 -0
- package/benchmarks/run-external-guardbench.mjs +281 -0
- package/benchmarks/run.js +682 -0
- package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/schemas/guardbench-raw.schema.json +184 -0
- package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/schemas/guardbench-summary.schema.json +249 -0
- package/benchmarks/snapshots/perf-0.22.2.json +123 -0
- package/benchmarks/snapshots/perf-0.23.0.json +123 -0
- package/benchmarks/validate-adapter-module.mjs +104 -0
- package/benchmarks/validate-adapter-registry.mjs +134 -0
- package/benchmarks/validate-adapter-self-test.mjs +96 -0
- package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
- package/benchmarks/verify-external-evidence.mjs +296 -0
- package/benchmarks/verify-publication-artifacts.mjs +286 -0
- package/benchmarks/verify-submission-bundle.mjs +167 -0
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +1 -1
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +65 -3
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +675 -157
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts +9 -0
- package/dist/src/action-key.d.ts.map +1 -0
- package/dist/src/action-key.js +49 -0
- package/dist/src/action-key.js.map +1 -0
- package/dist/src/adaptive.js +5 -5
- package/dist/src/affect.js +8 -8
- package/dist/src/audrey.d.ts +13 -0
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +68 -3
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.js +4 -4
- package/dist/src/causal.js +3 -3
- package/dist/src/consolidate.js +48 -48
- package/dist/src/controller.d.ts +78 -6
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +273 -53
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.js +172 -172
- package/dist/src/decay.js +8 -8
- package/dist/src/embedding.d.ts +2 -1
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +39 -29
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.js +6 -6
- package/dist/src/feedback.d.ts +6 -0
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +6 -0
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.js +12 -12
- package/dist/src/hybrid-recall.js +9 -9
- package/dist/src/impact.js +6 -6
- package/dist/src/import.d.ts +3 -3
- package/dist/src/import.js +41 -41
- package/dist/src/index.d.ts +5 -4
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +3 -3
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.js +14 -14
- package/dist/src/introspect.js +18 -18
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +41 -0
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/promote.js +7 -7
- package/dist/src/prompts.js +118 -118
- package/dist/src/recall.js +30 -30
- package/dist/src/reflexes.d.ts +1 -0
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +3 -0
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.js +4 -4
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +71 -2
- package/dist/src/routes.js.map +1 -1
- package/dist/src/validate.js +25 -25
- package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/MEMORY_BENCHMARKING.md +59 -0
- package/docs/PRODUCTION_BACKLOG.md +304 -0
- package/docs/paper/00-master.md +48 -0
- package/docs/paper/01-introduction.md +27 -0
- package/docs/paper/02-related-work.md +47 -0
- package/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/04-design.md +164 -0
- package/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/06-implementation.md +113 -0
- package/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/claim-register.json +138 -0
- package/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/LICENSE +21 -0
- package/docs/paper/output/submission-bundle/README.md +555 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1271 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +2107 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +184 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +249 -0
- package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
- package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
- package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
- package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
- package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
- package/docs/paper/output/submission-bundle/package.json +212 -0
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
- package/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/publication-pack.json +81 -0
- package/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/references.bib +222 -0
- package/package.json +87 -4
- package/scripts/audit-release-completion.mjs +362 -0
- package/scripts/create-arxiv-source.mjs +362 -0
- package/scripts/create-paper-submission-bundle.mjs +210 -0
- package/scripts/finalize-release.mjs +526 -0
- package/scripts/prepare-release-cut.mjs +269 -0
- package/scripts/publish-release-bundle.mjs +209 -0
- package/scripts/publish-release-github-api.mjs +429 -0
- package/scripts/run-vitest.mjs +34 -0
- package/scripts/smoke-cli.js +92 -0
- package/scripts/sync-paper-artifacts.mjs +109 -0
- package/scripts/verify-arxiv-compile.mjs +440 -0
- package/scripts/verify-arxiv-source.mjs +194 -0
- package/scripts/verify-browser-launch-plan.mjs +237 -0
- package/scripts/verify-browser-launch-results.mjs +285 -0
- package/scripts/verify-paper-artifacts.mjs +338 -0
- package/scripts/verify-paper-claims.mjs +226 -0
- package/scripts/verify-paper-submission-bundle.mjs +207 -0
- package/scripts/verify-publication-pack.mjs +196 -0
- package/scripts/verify-python-package.py +201 -0
- package/scripts/verify-release-readiness.mjs +785 -0
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from 'node:fs';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
|
|
4
|
+
/**
 * Colour palette shared by the SVG charts and the HTML report.
 *
 * The object is frozen so that a renderer cannot accidentally reassign a
 * colour and change every chart produced afterwards.
 */
const PALETTE = Object.freeze({
  audrey: '#0f766e',
  vector: '#0369a1',
  keyword: '#6d28d9',
  recent: '#b45309',
  external: '#1d4ed8',
  accent: '#111827',
  muted: '#6b7280',
  surface: '#f8fafc',
  border: '#cbd5e1',
});
|
|
15
|
+
|
|
16
|
+
/**
 * Escape a value for interpolation into HTML/SVG text or a double-quoted attribute.
 *
 * The value is coerced with `String()` first, so numbers, `null` and
 * `undefined` are accepted and rendered as their string form.
 *
 * @param {*} text - Value to escape.
 * @returns {string} The string with `&`, `<`, `>` and `"` replaced by entities.
 */
function escapeHtml(text) {
  return String(text)
    // `&` must be handled first; otherwise the ampersands introduced by the
    // later replacements would themselves be escaped a second time.
    .replaceAll('&', '&amp;')
    .replaceAll('<', '&lt;')
    .replaceAll('>', '&gt;')
    .replaceAll('"', '&quot;');
}
|
|
23
|
+
|
|
24
|
+
/**
 * Choose the fill colour for a chart bar from its system label.
 *
 * The exact label 'Audrey' gets the Audrey colour. Otherwise the first of
 * 'Vector', 'Keyword', 'Recent' found as a substring of the label decides;
 * labels matching none of them use the external colour.
 *
 * @param {string} label - System label shown under the bar.
 * @returns {string} A colour taken from PALETTE.
 */
function chartBarColor(label) {
  if (label === 'Audrey') {
    return PALETTE.audrey;
  }

  // Checked in order; the first substring match wins.
  const substringColors = [
    ['Vector', PALETTE.vector],
    ['Keyword', PALETTE.keyword],
    ['Recent', PALETTE.recent],
  ];
  const match = substringColors.find(([needle]) => label.includes(needle));
  return match ? match[1] : PALETTE.external;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Render a vertical bar chart as a standalone SVG document string.
 *
 * The canvas is fixed at 960x420. Each row becomes one bar with its value
 * printed above it and its label below the plot area. Values are clamped to
 * the range [0, maxValue]; a value that is not a number is drawn as 0 rather
 * than leaking `NaN` into the markup.
 *
 * Grid lines are placed at 0%, 25%, 50%, 75% and 100% of `maxValue`, so they
 * stay inside the plot for any scale (with the default `maxValue` of 100 the
 * ticks are 0, 25, 50, 75, 100).
 *
 * @param {object} options
 * @param {string} options.title - Chart heading, also used as the aria-label.
 * @param {Array<{label: string, value: number}>} options.rows - Bars to draw.
 * @param {string} [options.valueSuffix='%'] - Text appended to values and tick labels.
 * @param {number} [options.maxValue=100] - Value that maps to a full-height bar.
 * @returns {string} The SVG document, including the XML declaration.
 */
function renderBarChart({ title, rows, valueSuffix = '%', maxValue = 100 }) {
  const width = 960;
  const height = 420;
  const margin = { top: 56, right: 32, bottom: 88, left: 64 };
  const plotWidth = width - margin.left - margin.right;
  const plotHeight = height - margin.top - margin.bottom;
  const barWidth = Math.max(32, Math.floor(plotWidth / Math.max(rows.length, 1)) - 18);
  const gap = rows.length > 1 ? (plotWidth - barWidth * rows.length) / (rows.length - 1) : 0;
  const baseline = margin.top + plotHeight;

  const bars = rows.map((row, index) => {
    const numeric = Number(row.value);
    // NaN (missing or non-numeric value) is drawn as an empty bar.
    const value = Number.isNaN(numeric) ? 0 : Math.max(0, Math.min(maxValue, numeric));
    // A non-positive scale cannot be divided by; draw an empty bar instead.
    const barHeight = maxValue > 0 ? (value / maxValue) * plotHeight : 0;
    const x = margin.left + index * (barWidth + gap);
    const y = baseline - barHeight;
    const centerX = x + barWidth / 2;
    return [
      '',
      `<rect x="${x}" y="${y}" width="${barWidth}" height="${barHeight}" rx="8" fill="${chartBarColor(row.label)}" />`,
      `<text x="${centerX}" y="${y - 10}" text-anchor="middle" font-size="15" fill="${PALETTE.accent}">${value.toFixed(1)}${valueSuffix}</text>`,
      `<text x="${centerX}" y="${height - 42}" text-anchor="middle" font-size="14" fill="${PALETTE.muted}">${escapeHtml(row.label)}</text>`,
      '',
    ].join('\n');
  }).join('\n');

  // Ticks are fractions of the scale, not fixed percentages, so they follow maxValue.
  const grid = [0, 0.25, 0.5, 0.75, 1].map((fraction) => {
    const y = baseline - fraction * plotHeight;
    // Round to two decimals to keep labels short for scales that do not divide evenly.
    const tick = Number((fraction * maxValue).toFixed(2));
    return [
      '',
      `<line x1="${margin.left}" y1="${y}" x2="${width - margin.right}" y2="${y}" stroke="${PALETTE.border}" stroke-dasharray="4 4" />`,
      `<text x="${margin.left - 10}" y="${y + 5}" text-anchor="end" font-size="13" fill="${PALETTE.muted}">${tick}${valueSuffix}</text>`,
      '',
    ].join('\n');
  }).join('\n');

  return [
    '<?xml version="1.0" encoding="UTF-8"?>',
    `<svg xmlns="http://www.w3.org/2000/svg" width="${width}" height="${height}" viewBox="0 0 ${width} ${height}" role="img" aria-label="${escapeHtml(title)}">`,
    '<rect width="100%" height="100%" fill="white" />',
    `<text x="${margin.left}" y="34" font-size="24" font-weight="700" fill="${PALETTE.accent}">${escapeHtml(title)}</text>`,
    grid,
    bars,
    '</svg>',
  ].join('\n');
}
|
|
69
|
+
|
|
70
|
+
/**
 * Render trend entries as `<li>` fragments for the report's trend list.
 *
 * @param {Array<{title: *, summary: *, source: *}>} trends - Entries to render;
 *   `source` is used both as the link target and as the link text.
 * @returns {string} One fragment per entry, joined with newlines ('' for an empty array).
 */
function renderTrendList(trends) {
  return trends.map((trend) => {
    // The URL is escaped for the attribute as well as for the visible text, so
    // a quote or ampersand in it cannot terminate the href early.
    const source = escapeHtml(trend.source);
    return [
      '',
      '<li>',
      `<strong>${escapeHtml(trend.title)}</strong><br />`,
      `${escapeHtml(trend.summary)}<br />`,
      `<a href="${source}">${source}</a>`,
      '</li>',
      '',
    ].join('\n');
  }).join('\n');
}
|
|
79
|
+
|
|
80
|
+
/**
 * Render the body rows of the case matrix table.
 *
 * Each case produces one `<tr>` with its title, suite and family, followed by
 * one cell per entry in `caseResult.results`. A cell shows the score with two
 * decimals plus the result summary, and is tinted green when the result
 * passed, amber when it did not pass but scored at least 0.5, red otherwise.
 *
 * @param {Array<object>} localCases - Cases with `title`, `suite`, `family`
 *   and a `results` array of `{ passed, score, summary }`.
 * @returns {string} The rows joined with newlines.
 */
function renderCaseRows(localCases) {
  const cellColors = (result) => {
    if (result.passed) {
      return { bg: '#ecfdf5', fg: '#065f46' };
    }
    if (result.score >= 0.5) {
      return { bg: '#fff7ed', fg: '#9a3412' };
    }
    return { bg: '#fef2f2', fg: '#991b1b' };
  };

  const renderCell = (result) => {
    const { bg, fg } = cellColors(result);
    return `<td style="background:${bg};color:${fg}">${result.score.toFixed(2)}<br /><span style="font-size:12px">${escapeHtml(result.summary)}</span></td>`;
  };

  const rows = [];
  for (const caseResult of localCases) {
    const lines = [
      '',
      '<tr>',
      `<td>${escapeHtml(caseResult.title)}</td>`,
      `<td>${escapeHtml(caseResult.suite)}</td>`,
      `<td>${escapeHtml(caseResult.family)}</td>`,
      caseResult.results.map((result) => renderCell(result)).join(''),
      '</tr>',
      '',
    ];
    rows.push(lines.join('\n'));
  }
  return rows.join('\n');
}
|
|
94
|
+
|
|
95
|
+
/**
 * Render one report section per suite chart.
 *
 * Every section carries the chart title as heading, its description, and an
 * image that points at the chart file relative to the report.
 *
 * @param {Array<{title: *, description: *, fileName: *}>} suiteCharts - Chart descriptors.
 * @returns {string} The sections joined with newlines, or '' when there are none.
 */
function renderSuiteSections(suiteCharts) {
  if (suiteCharts.length === 0) {
    return '';
  }

  const sections = [];
  for (const chart of suiteCharts) {
    const lines = [
      '',
      '<section class="callout">',
      `<h2>${escapeHtml(chart.title)}</h2>`,
      `<p>${escapeHtml(chart.description)}</p>`,
      `<img src="./${escapeHtml(chart.fileName)}" alt="${escapeHtml(chart.title)} chart" />`,
      '</section>',
      '',
    ];
    sections.push(lines.join('\n'));
  }
  return sections.join('\n');
}
|
|
105
|
+
|
|
106
|
+
/**
 * Assemble the HTML report page.
 *
 * `summary.local` is treated as optional throughout: when it (or its
 * `overall` / `cases` lists) is absent, the case matrix is rendered with no
 * system columns and no rows instead of throwing.
 *
 * @param {object} options
 * @param {object} options.summary - Benchmark summary; `command`, `generatedAt` and `local` are read.
 * @param {Array<object>} options.suiteCharts - Chart descriptors passed to renderSuiteSections.
 * @param {Array<object>} options.trends - Entries passed to renderTrendList.
 * @returns {string} The complete HTML document.
 */
function renderReportHtml({ summary, suiteCharts, trends }) {
  const local = summary.local ?? {};
  const systemHeaders = (local.overall ?? [])
    .map(row => `<th>${escapeHtml(row.system)}</th>`)
    .join('');
  const scopeSuites = (local.overall_suite_ids ?? []).join(', ');

  return [
    '<!doctype html>',
    '<html lang="en">',
    '<head>',
    '<meta charset="utf-8" />',
    '<title>Audrey Memory Benchmark</title>',
    '<style>',
    `body { font-family: "Segoe UI", Arial, sans-serif; margin: 32px; color: ${PALETTE.accent}; background: ${PALETTE.surface}; }`,
    'main { max-width: 1120px; margin: 0 auto; }',
    'h1, h2 { margin-bottom: 12px; }',
    'p, li { line-height: 1.5; }',
    `.callout { background: white; border: 1px solid ${PALETTE.border}; border-radius: 16px; padding: 20px; margin-bottom: 24px; }`,
    '.grid { display: grid; gap: 24px; grid-template-columns: 1fr; }',
    `img { width: 100%; border: 1px solid ${PALETTE.border}; border-radius: 16px; background: white; }`,
    'table { width: 100%; border-collapse: collapse; background: white; border-radius: 16px; overflow: hidden; }',
    `th, td { border: 1px solid ${PALETTE.border}; padding: 12px; vertical-align: top; text-align: left; }`,
    'th { background: #e2e8f0; }',
    'code { background: #e2e8f0; padding: 2px 6px; border-radius: 6px; }',
    '</style>',
    '</head>',
    '<body>',
    '<main>',
    '<h1>Audrey Memory Benchmark</h1>',
    '<div class="callout">',
    '<p><strong>Method:</strong> Audrey is scored on a local regression suite inspired by LongMemEval-style retrieval, operation-level lifecycle behavior, and agent guard-loop benchmarks. The combined local chart uses comparable retrieval/lifecycle suites when available; the guard loop is reported as its own controller regression suite. Published external LoCoMo numbers stay separate so the comparison remains honest.</p>',
    `<p><strong>Scope:</strong> ${escapeHtml(local.overall_scope ?? 'unknown')} across ${escapeHtml(scopeSuites)}; ${local.cases?.length ?? 0} total cases.</p>`,
    `<p><strong>Run:</strong> <code>${escapeHtml(summary.command)}</code></p>`,
    `<p><strong>Generated:</strong> ${escapeHtml(summary.generatedAt)}</p>`,
    '</div>',
    '',
    '<div class="grid">',
    '<section class="callout">',
    '<h2>Combined Local Benchmark</h2>',
    '<img src="./local-overall.svg" alt="Combined local benchmark bar chart" />',
    '</section>',
    '',
    renderSuiteSections(suiteCharts),
    '',
    '<section class="callout">',
    '<h2>Published Leaderboard</h2>',
    '<img src="./published-locomo.svg" alt="Published LoCoMo leaderboard bar chart" />',
    '</section>',
    '</div>',
    '',
    '<section class="callout">',
    '<h2>Case Matrix</h2>',
    '<table>',
    '<thead>',
    '<tr>',
    '<th>Case</th>',
    '<th>Suite</th>',
    '<th>Family</th>',
    systemHeaders,
    '</tr>',
    '</thead>',
    '<tbody>',
    renderCaseRows(local.cases ?? []),
    '</tbody>',
    '</table>',
    '</section>',
    '',
    '<section class="callout">',
    '<h2>March 23, 2026 Memory Trends</h2>',
    '<ul>',
    renderTrendList(trends),
    '</ul>',
    '</section>',
    '</main>',
    '</body>',
    '</html>',
  ].join('\n');
}

/**
 * Write the benchmark report artifacts to disk.
 *
 * Files written into `outputDir` (created if missing):
 * `local-overall.svg`, `published-locomo.svg`, `summary.json`, one
 * `<suite.id>-overall.svg` per local suite, and `report.html`.
 * When `readmeAssetsDir` is given it is created as well and receives
 * `local-benchmark.svg`, `published-memory-standards.svg` and, if a suite
 * with id 'operations' exists, `operations-benchmark.svg`.
 *
 * @param {object} options
 * @param {string} options.outputDir - Directory for the report artifacts.
 * @param {object} options.summary - Summary serialised to summary.json and used for the report text.
 * @param {Array<{system: string, scorePercent: number}>} options.localOverall - Rows of the combined local chart.
 * @param {Array<object>} options.localSuites - Suites with `id`, `title`, `description` and `overall` rows.
 * @param {Array<{system: string, score: number}>} options.externalOverall - Rows of the published LoCoMo chart.
 * @param {Array<object>} options.trends - Entries for the trends section.
 * @param {string} [options.readmeAssetsDir] - Optional directory for README chart copies.
 * @returns {{json: string, html: string, localChart: string, suiteCharts: Array<object>,
 *   externalChart: string, readmeAssets: (object|null)}} Paths of what was written;
 *   `readmeAssets` is null when no `readmeAssetsDir` was given.
 */
export function writeBenchmarkArtifacts({
  outputDir,
  summary,
  localOverall,
  localSuites,
  externalOverall,
  trends,
  readmeAssetsDir,
}) {
  mkdirSync(outputDir, { recursive: true });

  const toPercentRows = rows => rows.map(row => ({ label: row.system, value: row.scorePercent }));

  const localChartTitle = summary.local?.overall_scope === 'comparable_suites'
    ? 'Audrey vs Comparable Local Memory Baselines'
    : 'Selected Audrey Regression Suite';
  const localChart = renderBarChart({
    title: localChartTitle,
    rows: toPercentRows(localOverall),
  });
  const externalChart = renderBarChart({
    title: 'Published LLM Memory Standards (LoCoMo)',
    rows: externalOverall.map(row => ({ label: row.system, value: row.score })),
  });

  writeFileSync(join(outputDir, 'local-overall.svg'), localChart, 'utf8');
  writeFileSync(join(outputDir, 'published-locomo.svg'), externalChart, 'utf8');
  writeFileSync(join(outputDir, 'summary.json'), JSON.stringify(summary, null, 2), 'utf8');

  const suiteCharts = localSuites.map((suite) => {
    const fileName = `${suite.id}-overall.svg`;
    const title = `${suite.title} Benchmark`;
    const chart = renderBarChart({ title, rows: toPercentRows(suite.overall) });
    writeFileSync(join(outputDir, fileName), chart, 'utf8');
    return {
      id: suite.id,
      title,
      description: suite.description,
      fileName,
      path: join(outputDir, fileName),
    };
  });

  let readmeAssets = null;
  if (readmeAssetsDir) {
    mkdirSync(readmeAssetsDir, { recursive: true });
    const localReadmeChart = join(readmeAssetsDir, 'local-benchmark.svg');
    const externalReadmeChart = join(readmeAssetsDir, 'published-memory-standards.svg');
    writeFileSync(localReadmeChart, localChart, 'utf8');
    writeFileSync(externalReadmeChart, externalChart, 'utf8');

    // The README copy of the operations chart is rendered again because it
    // carries its own title rather than the "<suite title> Benchmark" one.
    const operationsSuite = localSuites.find(suite => suite.id === 'operations');
    let operationsReadmeChart = null;
    if (operationsSuite) {
      operationsReadmeChart = join(readmeAssetsDir, 'operations-benchmark.svg');
      writeFileSync(
        operationsReadmeChart,
        renderBarChart({
          title: 'Audrey Memory Operations Benchmark',
          rows: toPercentRows(operationsSuite.overall ?? []),
        }),
        'utf8',
      );
    }

    readmeAssets = {
      localChart: localReadmeChart,
      operationsChart: operationsReadmeChart,
      externalChart: externalReadmeChart,
    };
  }

  const html = renderReportHtml({ summary, suiteCharts, trends });
  writeFileSync(join(outputDir, 'report.html'), html, 'utf8');

  return {
    json: join(outputDir, 'summary.json'),
    html: join(outputDir, 'report.html'),
    localChart: join(outputDir, 'local-overall.svg'),
    suiteCharts,
    externalChart: join(outputDir, 'published-locomo.svg'),
    readmeAssets,
  };
}
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
import { spawnSync } from 'node:child_process';
|
|
2
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
|
3
|
+
import { basename, dirname, resolve } from 'node:path';
|
|
4
|
+
import { fileURLToPath } from 'node:url';
|
|
5
|
+
import { writeGuardBenchConformanceCard } from './create-conformance-card.mjs';
|
|
6
|
+
import { computeGuardBenchArtifactHashes, validateGuardBenchArtifacts } from './validate-guardbench-artifacts.mjs';
|
|
7
|
+
import { publicArtifactValue } from './public-paths.mjs';
|
|
8
|
+
|
|
9
|
+
// Package root: the parent of the directory that contains this module.
const ROOT = resolve(dirname(fileURLToPath(import.meta.url)), '..');

// Each built-in adapter is described once and listed with every alias that
// selects it, so a short name and its full name cannot drift apart.
const BUILT_IN_ADAPTERS = [
  {
    aliases: ['mem0', 'mem0-platform'],
    spec: {
      name: 'mem0-platform',
      path: 'benchmarks/adapters/mem0-platform.mjs',
      requiredEnv: ['MEM0_API_KEY'],
    },
  },
  {
    aliases: ['zep', 'zep-cloud'],
    spec: {
      name: 'zep-cloud',
      path: 'benchmarks/adapters/zep-cloud.mjs',
      requiredEnv: ['ZEP_API_KEY'],
    },
  },
];

/**
 * Lookup from an adapter alias to its spec: `name`, module `path` relative to
 * ROOT, and the environment variables it requires (`requiredEnv`).
 */
const KNOWN_ADAPTERS = new Map(
  BUILT_IN_ADAPTERS.flatMap(({ aliases, spec }) => aliases.map(alias => [alias, spec])),
);
|
|
32
|
+
|
|
33
|
+
/**
 * Parse the command-line arguments of the external GuardBench runner.
 *
 * Value flags (`--adapter`, `--out-dir`, `--min-pass-rate`) consume the next
 * token verbatim as a string. Switches (`--check`, `--dry-run`, `--json`,
 * `--allow-missing-env`, `--help` / `-h`) set the matching property to true;
 * `help` is only present on the result when it was requested.
 *
 * @param {string[]} [argv=process.argv.slice(2)] - Arguments to parse.
 * @returns {{adapter: string, outDir: (string|null), check: boolean, dryRun: boolean,
 *   json: boolean, minPassRate: (string|null), allowMissingEnv: boolean, help?: boolean}}
 * @throws {Error} On an unrecognised argument, or when a value flag is not
 *   followed by a non-empty value.
 */
export function parseExternalArgs(argv = process.argv.slice(2)) {
  const args = {
    adapter: 'mem0-platform',
    outDir: null,
    check: false,
    dryRun: false,
    json: false,
    minPassRate: null,
    allowMissingEnv: false,
  };

  const valueFlags = new Map([
    ['--adapter', 'adapter'],
    ['--out-dir', 'outDir'],
    ['--min-pass-rate', 'minPassRate'],
  ]);
  const switches = new Map([
    ['--check', 'check'],
    ['--dry-run', 'dryRun'],
    ['--json', 'json'],
    ['--allow-missing-env', 'allowMissingEnv'],
    ['--help', 'help'],
    ['-h', 'help'],
  ]);

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (valueFlags.has(token)) {
      const value = argv[i + 1];
      // A known flag without a value is reported as such, not as an unknown argument.
      if (!value) throw new Error(`Missing value for ${token}`);
      args[valueFlags.get(token)] = value;
      i += 1;
    } else if (switches.has(token)) {
      args[switches.get(token)] = true;
    } else {
      throw new Error(`Unknown argument: ${token}`);
    }
  }

  return args;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Read a file as UTF-8 text and parse it as JSON.
 *
 * @param {string} path - File to read.
 * @returns {*} The parsed JSON value.
 * @throws {Error} When the file cannot be read.
 * @throws {SyntaxError} When the content is not valid JSON.
 */
function readJson(path) {
  const raw = readFileSync(path, 'utf-8');
  return JSON.parse(raw);
}
|
|
63
|
+
|
|
64
|
+
/**
 * Resolve an adapter identifier to a spec.
 *
 * A key of KNOWN_ADAPTERS returns the registered spec. Anything else is
 * treated as a module path: it is resolved against ROOT, its name is the
 * path's base name without a trailing `.js` / `.mjs` / `.cjs` extension
 * (matched case-insensitively), and no environment variables are required.
 *
 * @param {string} adapter - Known adapter alias or path to an adapter module.
 * @returns {{name: string, path: string, requiredEnv: string[]}} The adapter spec.
 */
function adapterSpec(adapter) {
  if (!KNOWN_ADAPTERS.has(adapter)) {
    const moduleName = basename(adapter).replace(/\.[cm]?js$/i, '');
    return {
      name: moduleName,
      path: resolve(ROOT, adapter),
      requiredEnv: [],
    };
  }
  return KNOWN_ADAPTERS.get(adapter);
}
|
|
75
|
+
|
|
76
|
+
/**
 * Describe an external GuardBench run without executing it.
 *
 * Resolves the adapter (default 'mem0-platform') and the output directory
 * (default `benchmarks/output/external/<adapter name>`) against ROOT, lists
 * the required environment variables that are unset or empty in `env`, and
 * builds the argv arrays for the benchmark and for artifact validation.
 * `--check`, `--json` and `--min-pass-rate` are forwarded to the benchmark
 * command only when set in `args`.
 *
 * @param {object} [args={}] - Options; reads `adapter`, `outDir`, `check`, `json`, `minPassRate`.
 * @param {object} [env=process.env] - Environment used to detect missing variables.
 * @returns {{adapter: string, adapterPath: string, outDir: string, requiredEnv: string[],
 *   missingEnv: string[], command: string[], validationCommand: string[]}}
 */
export function buildExternalGuardBenchRun(args = {}, env = process.env) {
  const { name, path, requiredEnv } = adapterSpec(args.adapter ?? 'mem0-platform');
  const fromRoot = (relative) => resolve(ROOT, relative);

  const adapterPath = fromRoot(path);
  const outDir = fromRoot(args.outDir ?? `benchmarks/output/external/${name}`);
  const missingEnv = requiredEnv.filter((key) => !env[key]);

  const forwardedFlags = [
    ...(args.check ? ['--check'] : []),
    ...(args.json ? ['--json'] : []),
    ...(args.minPassRate != null ? ['--min-pass-rate', String(args.minPassRate)] : []),
  ];

  return {
    adapter: name,
    adapterPath,
    outDir,
    requiredEnv,
    missingEnv,
    command: [
      process.execPath,
      fromRoot('benchmarks/guardbench.js'),
      '--adapter',
      adapterPath,
      '--out-dir',
      outDir,
      ...forwardedFlags,
    ],
    validationCommand: [
      process.execPath,
      fromRoot('benchmarks/validate-guardbench-artifacts.mjs'),
      '--dir',
      outDir,
    ],
  };
}
|
|
110
|
+
|
|
111
|
+
/**
 * Check a GuardBench summary for the conformance of one adapter.
 *
 * The adapter is matched against `summary.systemSummaries[].system`. When the
 * requested name has no system summary and the manifest lists exactly one
 * external subject, that subject's name is used instead.
 *
 * A failure is recorded, in this order, when:
 *  - no system summary exists for the resolved adapter;
 *  - the number of per-case result rows differs from the expected scenarios;
 *  - the system summary's `scenarios` differs from the expected scenarios;
 *  - the system summary's `redactionLeaks` is not exactly `0`;
 *  - any of the adapter's result rows lacks `external === true`.
 *
 * @param {object} summary - Parsed GuardBench summary.
 * @param {string} adapterName - Adapter name requested by the caller.
 * @returns {{ok: boolean, adapter: string, requestedAdapter: string,
 *   scenarios: number, expectedScenarios: number,
 *   fullContractPassRate: (number|null), decisionAccuracy: (number|null),
 *   redactionLeaks: (number|null), failures: string[]}} Conformance verdict.
 */
export function evaluateAdapterConformance(summary, adapterName) {
  const externalSubjects = (summary.manifest?.subjects ?? []).filter(subject => subject.external);

  // Prefer the requested name; fall back to the sole external subject, if any.
  let resolvedAdapterName = adapterName;
  const hasDirectMatch = summary.systemSummaries?.some(row => row.system === adapterName);
  if (!hasDirectMatch && externalSubjects.length === 1) {
    resolvedAdapterName = externalSubjects[0].name;
  }

  const belongsToAdapter = row => row.system === resolvedAdapterName;
  const systemSummary = summary.systemSummaries?.find(belongsToAdapter);
  const expectedScenarios = summary.scenarios ?? summary.manifest?.scenarios?.length ?? 0;
  const allRows = (summary.cases ?? []).flatMap(entry => entry.results ?? []);
  const adapterRows = allRows.filter(belongsToAdapter);

  const failures = [];
  if (!systemSummary) {
    failures.push(`Missing system summary for adapter ${resolvedAdapterName}`);
  }
  if (adapterRows.length !== expectedScenarios) {
    failures.push(`Adapter ${resolvedAdapterName} returned ${adapterRows.length}/${expectedScenarios} scenario rows`);
  }
  if (systemSummary) {
    if (systemSummary.scenarios !== expectedScenarios) {
      failures.push(`Adapter ${resolvedAdapterName} system summary has ${systemSummary.scenarios}/${expectedScenarios} scenarios`);
    }
    if (systemSummary.redactionLeaks !== 0) {
      failures.push(`Adapter ${resolvedAdapterName} leaked ${systemSummary.redactionLeaks} seeded secret(s) in decision output`);
    }
  }
  if (!adapterRows.every(row => row.external === true)) {
    failures.push(`Adapter ${resolvedAdapterName} rows are not marked external`);
  }

  return {
    ok: failures.length === 0,
    adapter: resolvedAdapterName,
    requestedAdapter: adapterName,
    scenarios: systemSummary?.scenarios ?? adapterRows.length,
    expectedScenarios,
    fullContractPassRate: systemSummary?.passRate ?? null,
    decisionAccuracy: systemSummary?.decisionAccuracy ?? null,
    redactionLeaks: systemSummary?.redactionLeaks ?? null,
    failures,
  };
}
|
|
154
|
+
|
|
155
|
+
/**
 * Build the command-line help text for this script.
 *
 * Lists every option the argument parser accepts, including `-h` / `--help`,
 * which the parser handles but the help text previously did not mention.
 *
 * @returns {string} Multi-line usage text ending with a newline.
 */
function usage() {
  return `Usage: node benchmarks/run-external-guardbench.mjs [options]

Options:
  --adapter <name|path>      Adapter alias or ESM adapter path. Default: mem0-platform.
  --out-dir <path>           Output directory. Default: benchmarks/output/external/<adapter>.
  --check                    Fail if Audrey Guard pass rate is below the threshold.
  --min-pass-rate <percent>  GuardBench pass-rate threshold for --check.
  --json                     Forward JSON output from GuardBench.
  --dry-run                  Print the resolved command and metadata without running.
  --allow-missing-env        Permit running even when known runtime env vars are absent.
  -h, --help                 Show this help text and exit.
`;
}
|
|
168
|
+
|
|
169
|
+
/**
 * Persist run metadata as `external-run-metadata.json` inside a directory.
 *
 * The directory is created (recursively) if needed. The metadata is passed
 * through `publicArtifactValue` before being serialized as 2-space indented
 * JSON followed by a newline.
 *
 * @param {string} path - Directory that receives the metadata file.
 * @param {object} metadata - Run metadata to serialize.
 * @returns {string} Resolved path of the written metadata file.
 */
export function writeExternalRunMetadata(path, metadata) {
  mkdirSync(path, { recursive: true });
  const target = resolve(path, 'external-run-metadata.json');
  const serialized = JSON.stringify(publicArtifactValue(metadata), null, 2);
  writeFileSync(target, `${serialized}\n`, 'utf-8');
  return target;
}
|
|
175
|
+
|
|
176
|
+
/**
 * CLI entry point: run GuardBench against an external adapter and record the
 * outcome.
 *
 * Flow:
 *  1. Parse arguments; print usage and return on `--help`.
 *  2. Resolve the run and fail if the adapter file does not exist.
 *  3. If required env vars are missing (and neither `--allow-missing-env` nor
 *     `--dry-run` is set), write `blocked` metadata and throw.
 *  4. On `--dry-run`, write metadata, print the resolved command, and return.
 *  5. Otherwise write `running` metadata, spawn GuardBench synchronously,
 *     validate the artifacts, evaluate adapter conformance from
 *     `guardbench-summary.json`, write the final metadata and, when the child
 *     exited 0, the conformance card.
 *
 * The process exit code is 0 only when the child exited 0 AND artifact
 * validation AND adapter conformance passed. Otherwise it is the child's
 * non-zero status, or 1 when the child exited 0 or has no status.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the adapter file is missing or required environment
 *   variables are absent.
 */
async function main() {
  const args = parseExternalArgs();
  if (args.help) {
    console.log(usage());
    return;
  }

  const run = buildExternalGuardBenchRun(args);
  const startedAt = new Date().toISOString();
  const metadata = {
    suite: 'GuardBench external adapter run',
    startedAt,
    adapter: run.adapter,
    adapterPath: run.adapterPath,
    outDir: run.outDir,
    requiredEnv: run.requiredEnv,
    missingEnv: run.missingEnv,
    command: run.command,
    validationCommand: run.validationCommand,
    dryRun: args.dryRun,
  };

  if (!existsSync(run.adapterPath)) {
    throw new Error(`Adapter not found: ${run.adapterPath}`);
  }

  if (run.missingEnv.length && !args.allowMissingEnv && !args.dryRun) {
    metadata.status = 'blocked';
    metadata.blockReason = `Missing runtime environment: ${run.missingEnv.join(', ')}`;
    const metadataPath = writeExternalRunMetadata(run.outDir, metadata);
    throw new Error(`${metadata.blockReason}. Metadata written to ${metadataPath}`);
  }

  if (args.dryRun) {
    metadata.status = run.missingEnv.length ? 'dry-run-missing-env' : 'dry-run-ready';
    const metadataPath = writeExternalRunMetadata(run.outDir, metadata);
    if (args.json) {
      console.log(JSON.stringify({ ...metadata, metadataPath }, null, 2));
    } else {
      console.log(`External GuardBench dry run: ${run.adapter}`);
      console.log(`Command: ${run.command.map(part => JSON.stringify(part)).join(' ')}`);
      console.log(`Metadata: ${metadataPath}`);
      if (run.missingEnv.length) console.log(`Missing runtime env: ${run.missingEnv.join(', ')}`);
    }
    return;
  }

  writeExternalRunMetadata(run.outDir, { ...metadata, status: 'running' });
  const child = spawnSync(run.command[0], run.command.slice(1), {
    cwd: ROOT,
    env: process.env,
    stdio: 'inherit',
  });
  // spawnSync reports launch failures via `error` instead of throwing; surface
  // them so a null exit status is not left unexplained.
  if (child.error) {
    console.error(`Failed to run GuardBench: ${child.error.message}`);
  }

  const validation = validateGuardBenchArtifacts({ dir: run.outDir });
  // Default verdict used when the child failed and no summary can be trusted.
  let adapterConformance = {
    ok: false,
    adapter: run.adapter,
    failures: ['GuardBench summary was not available for adapter conformance evaluation'],
  };
  if (child.status === 0) {
    try {
      const summary = readJson(resolve(run.outDir, 'guardbench-summary.json'));
      adapterConformance = evaluateAdapterConformance(summary, run.adapter);
    } catch (error) {
      adapterConformance = {
        ok: false,
        adapter: run.adapter,
        failures: [error.message],
      };
    }
  }

  if (validation.ok) {
    console.log(`External GuardBench artifact validation passed: ${run.outDir}`);
  } else {
    console.error('External GuardBench artifact validation failed:');
    for (const failure of validation.failures) console.error(`- ${failure}`);
  }
  if (adapterConformance.ok) {
    console.log(`External GuardBench adapter conformance passed: ${adapterConformance.adapter}`);
  } else {
    console.error('External GuardBench adapter conformance failed:');
    for (const failure of adapterConformance.failures) console.error(`- ${failure}`);
  }

  const passed = child.status === 0 && validation.ok && adapterConformance.ok;
  const completed = {
    ...metadata,
    completedAt: new Date().toISOString(),
    status: passed ? 'passed' : 'failed',
    exitCode: child.status,
    signal: child.signal,
    artifactHashes: child.status === 0 ? computeGuardBenchArtifactHashes(run.outDir) : undefined,
    artifactValidation: validation,
    adapterConformance,
  };
  const metadataPath = writeExternalRunMetadata(run.outDir, completed);
  const card = child.status === 0 ? writeGuardBenchConformanceCard({ dir: run.outDir }) : null;
  console.log(`External GuardBench metadata: ${metadataPath}`);
  if (card) console.log(`External GuardBench conformance card: ${card.path}`);

  // `||` rather than `??`: a child that exited 0 while validation or
  // conformance failed must still produce a non-zero exit code.
  process.exitCode = passed ? 0 : (child.status || 1);
}
|
|
275
|
+
|
|
276
|
+
// Run the CLI only when the script path handed to Node ends with this file's
// name; importing the module for its exports must not start a run.
if (process.argv[1]?.endsWith('run-external-guardbench.mjs')) {
  main().catch((error) => {
    // Print just the message and signal failure through the exit code.
    console.error(error.message);
    process.exit(1);
  });
}
|