audrey 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/README.md +5 -3
- package/benchmarks/adapter-self-test.mjs +6 -2
- package/benchmarks/adapters/example-allow.mjs +5 -2
- package/benchmarks/adapters/mem0-platform.mjs +19 -12
- package/benchmarks/adapters/zep-cloud.mjs +51 -27
- package/benchmarks/baselines.js +11 -6
- package/benchmarks/build-leaderboard.mjs +36 -23
- package/benchmarks/cases.js +24 -12
- package/benchmarks/create-conformance-card.mjs +12 -3
- package/benchmarks/create-submission-bundle.mjs +22 -8
- package/benchmarks/dry-run-external-adapters.mjs +24 -12
- package/benchmarks/guardbench.js +263 -123
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
- package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/benchmarks/output/guardbench-conformance-card.json +11 -11
- package/benchmarks/output/guardbench-raw.json +107 -108
- package/benchmarks/output/guardbench-summary.json +170 -172
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +11 -11
- package/benchmarks/output/submission-bundle/guardbench-raw.json +107 -108
- package/benchmarks/output/submission-bundle/guardbench-summary.json +170 -172
- package/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
- package/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/benchmarks/output/summary.json +57 -57
- package/benchmarks/perf-snapshot.js +12 -9
- package/benchmarks/perf.bench.js +14 -6
- package/benchmarks/public-paths.mjs +11 -5
- package/benchmarks/reference-results.js +10 -5
- package/benchmarks/report.js +48 -27
- package/benchmarks/run-external-guardbench.mjs +47 -25
- package/benchmarks/run.js +112 -59
- package/benchmarks/validate-adapter-module.mjs +13 -10
- package/benchmarks/validate-adapter-registry.mjs +16 -5
- package/benchmarks/validate-guardbench-artifacts.mjs +76 -19
- package/benchmarks/verify-external-evidence.mjs +86 -31
- package/benchmarks/verify-publication-artifacts.mjs +34 -11
- package/benchmarks/verify-submission-bundle.mjs +9 -4
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +5 -3
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +4 -3
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +479 -172
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts.map +1 -1
- package/dist/src/action-key.js +6 -2
- package/dist/src/action-key.js.map +1 -1
- package/dist/src/adaptive.d.ts.map +1 -1
- package/dist/src/adaptive.js +4 -2
- package/dist/src/adaptive.js.map +1 -1
- package/dist/src/affect.d.ts.map +1 -1
- package/dist/src/affect.js +8 -5
- package/dist/src/affect.js.map +1 -1
- package/dist/src/audrey.d.ts +1 -1
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +93 -49
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.d.ts.map +1 -1
- package/dist/src/capsule.js +37 -15
- package/dist/src/capsule.js.map +1 -1
- package/dist/src/causal.d.ts +1 -1
- package/dist/src/causal.d.ts.map +1 -1
- package/dist/src/causal.js +4 -2
- package/dist/src/causal.js.map +1 -1
- package/dist/src/confidence.d.ts.map +1 -1
- package/dist/src/confidence.js +5 -5
- package/dist/src/confidence.js.map +1 -1
- package/dist/src/consolidate.d.ts.map +1 -1
- package/dist/src/consolidate.js +17 -9
- package/dist/src/consolidate.js.map +1 -1
- package/dist/src/context.js +1 -1
- package/dist/src/context.js.map +1 -1
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +24 -13
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.d.ts.map +1 -1
- package/dist/src/db.js +78 -27
- package/dist/src/db.js.map +1 -1
- package/dist/src/decay.d.ts +1 -1
- package/dist/src/decay.d.ts.map +1 -1
- package/dist/src/decay.js +1 -1
- package/dist/src/decay.js.map +1 -1
- package/dist/src/embedding.d.ts +12 -4
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +18 -16
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.d.ts.map +1 -1
- package/dist/src/encode.js +5 -4
- package/dist/src/encode.js.map +1 -1
- package/dist/src/events.d.ts +3 -2
- package/dist/src/events.d.ts.map +1 -1
- package/dist/src/events.js +7 -3
- package/dist/src/events.js.map +1 -1
- package/dist/src/export.d.ts.map +1 -1
- package/dist/src/export.js +21 -7
- package/dist/src/export.js.map +1 -1
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +1 -1
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.d.ts.map +1 -1
- package/dist/src/forget.js +12 -6
- package/dist/src/forget.js.map +1 -1
- package/dist/src/fts.d.ts.map +1 -1
- package/dist/src/fts.js +20 -8
- package/dist/src/fts.js.map +1 -1
- package/dist/src/hybrid-recall.d.ts.map +1 -1
- package/dist/src/hybrid-recall.js +12 -6
- package/dist/src/hybrid-recall.js.map +1 -1
- package/dist/src/impact.d.ts.map +1 -1
- package/dist/src/impact.js +26 -10
- package/dist/src/impact.js.map +1 -1
- package/dist/src/import.d.ts.map +1 -1
- package/dist/src/import.js +11 -6
- package/dist/src/import.js.map +1 -1
- package/dist/src/index.d.ts +3 -3
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +3 -3
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.d.ts.map +1 -1
- package/dist/src/interference.js +10 -5
- package/dist/src/interference.js.map +1 -1
- package/dist/src/introspect.d.ts.map +1 -1
- package/dist/src/introspect.js +12 -6
- package/dist/src/introspect.js.map +1 -1
- package/dist/src/llm.d.ts +2 -2
- package/dist/src/llm.d.ts.map +1 -1
- package/dist/src/llm.js +6 -6
- package/dist/src/llm.js.map +1 -1
- package/dist/src/migrate.d.ts.map +1 -1
- package/dist/src/migrate.js +10 -4
- package/dist/src/migrate.js.map +1 -1
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +6 -8
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/profile.d.ts.map +1 -1
- package/dist/src/profile.js.map +1 -1
- package/dist/src/promote.d.ts.map +1 -1
- package/dist/src/promote.js +16 -7
- package/dist/src/promote.js.map +1 -1
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +1 -2
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/recall.d.ts.map +1 -1
- package/dist/src/recall.js +85 -18
- package/dist/src/recall.js.map +1 -1
- package/dist/src/redact.d.ts.map +1 -1
- package/dist/src/redact.js +9 -4
- package/dist/src/redact.js.map +1 -1
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +1 -7
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.d.ts.map +1 -1
- package/dist/src/rollback.js +4 -2
- package/dist/src/rollback.js.map +1 -1
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +33 -13
- package/dist/src/routes.js.map +1 -1
- package/dist/src/rules-compiler.d.ts.map +1 -1
- package/dist/src/rules-compiler.js +24 -2
- package/dist/src/rules-compiler.js.map +1 -1
- package/dist/src/server.js +2 -2
- package/dist/src/server.js.map +1 -1
- package/dist/src/tool-trace.d.ts +2 -2
- package/dist/src/tool-trace.d.ts.map +1 -1
- package/dist/src/tool-trace.js +12 -4
- package/dist/src/tool-trace.js.map +1 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/ulid.js +1 -1
- package/dist/src/ulid.js.map +1 -1
- package/dist/src/utils.d.ts.map +1 -1
- package/dist/src/utils.js.map +1 -1
- package/dist/src/validate.d.ts.map +1 -1
- package/dist/src/validate.js +20 -10
- package/dist/src/validate.js.map +1 -1
- package/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/audrey-paper-v1.md +5 -5
- package/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/arxiv/main.tex +5 -5
- package/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/README.md +5 -3
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +11 -11
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +107 -108
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +170 -172
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +58 -58
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/package.json +17 -4
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +36 -36
- package/examples/fintech-ops-demo.js +12 -5
- package/examples/healthcare-ops-demo.js +8 -4
- package/examples/ollama-memory-agent.js +41 -13
- package/examples/stripe-demo.js +12 -5
- package/package.json +17 -4
- package/scripts/audit-release-completion.mjs +179 -101
- package/scripts/create-arxiv-source.mjs +20 -14
- package/scripts/create-paper-submission-bundle.mjs +6 -2
- package/scripts/finalize-release.mjs +111 -36
- package/scripts/prepare-release-cut.mjs +14 -6
- package/scripts/publish-release-bundle.mjs +62 -23
- package/scripts/publish-release-github-api.mjs +89 -24
- package/scripts/smoke-cli.js +9 -9
- package/scripts/sync-paper-artifacts.mjs +5 -1
- package/scripts/verify-arxiv-compile.mjs +52 -16
- package/scripts/verify-arxiv-source.mjs +45 -15
- package/scripts/verify-browser-launch-plan.mjs +28 -11
- package/scripts/verify-browser-launch-results.mjs +32 -14
- package/scripts/verify-paper-artifacts.mjs +539 -79
- package/scripts/verify-paper-claims.mjs +48 -20
- package/scripts/verify-paper-submission-bundle.mjs +22 -11
- package/scripts/verify-publication-pack.mjs +23 -9
- package/scripts/verify-release-readiness.mjs +211 -76
|
@@ -44,27 +44,32 @@ export const PUBLISHED_LEADERBOARD = [
|
|
|
44
44
|
export const MEMORY_TRENDS = [
|
|
45
45
|
{
|
|
46
46
|
title: 'Memory is moving from flat retrieval to typed systems',
|
|
47
|
-
summary:
|
|
47
|
+
summary:
|
|
48
|
+
'Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.',
|
|
48
49
|
source: 'https://arxiv.org/abs/2507.03724',
|
|
49
50
|
},
|
|
50
51
|
{
|
|
51
52
|
title: 'Benchmarks now emphasize multi-session realism',
|
|
52
|
-
summary:
|
|
53
|
+
summary:
|
|
54
|
+
'LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.',
|
|
53
55
|
source: 'https://arxiv.org/abs/2410.10813',
|
|
54
56
|
},
|
|
55
57
|
{
|
|
56
58
|
title: 'Context engineering is now competing with retrieval-first designs',
|
|
57
|
-
summary:
|
|
59
|
+
summary:
|
|
60
|
+
'Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.',
|
|
58
61
|
source: 'https://www.letta.com/blog/memory-blocks',
|
|
59
62
|
},
|
|
60
63
|
{
|
|
61
64
|
title: 'Production teams care about latency and token footprint, not just recall quality',
|
|
62
|
-
summary:
|
|
65
|
+
summary:
|
|
66
|
+
'Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.',
|
|
63
67
|
source: 'https://arxiv.org/abs/2504.19413',
|
|
64
68
|
},
|
|
65
69
|
{
|
|
66
70
|
title: 'Temporal and multimodal memory are becoming table stakes',
|
|
67
|
-
summary:
|
|
71
|
+
summary:
|
|
72
|
+
'MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.',
|
|
68
73
|
source: 'https://arxiv.org/abs/2507.07957',
|
|
69
74
|
},
|
|
70
75
|
];
|
package/benchmarks/report.js
CHANGED
|
@@ -38,25 +38,29 @@ function renderBarChart({ title, rows, valueSuffix = '%', maxValue = 100 }) {
|
|
|
38
38
|
const barWidth = Math.max(32, Math.floor(plotWidth / Math.max(rows.length, 1)) - 18);
|
|
39
39
|
const gap = rows.length > 1 ? (plotWidth - barWidth * rows.length) / (rows.length - 1) : 0;
|
|
40
40
|
|
|
41
|
-
const bars = rows
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
41
|
+
const bars = rows
|
|
42
|
+
.map((row, index) => {
|
|
43
|
+
const value = Math.max(0, Math.min(maxValue, row.value));
|
|
44
|
+
const barHeight = (value / maxValue) * plotHeight;
|
|
45
|
+
const x = margin.left + index * (barWidth + gap);
|
|
46
|
+
const y = margin.top + plotHeight - barHeight;
|
|
47
|
+
return `
|
|
47
48
|
<rect x="${x}" y="${y}" width="${barWidth}" height="${barHeight}" rx="8" fill="${chartBarColor(row.label)}" />
|
|
48
49
|
<text x="${x + barWidth / 2}" y="${y - 10}" text-anchor="middle" font-size="15" fill="${PALETTE.accent}">${value.toFixed(1)}${valueSuffix}</text>
|
|
49
50
|
<text x="${x + barWidth / 2}" y="${height - 42}" text-anchor="middle" font-size="14" fill="${PALETTE.muted}">${escapeHtml(row.label)}</text>
|
|
50
51
|
`;
|
|
51
|
-
|
|
52
|
+
})
|
|
53
|
+
.join('\n');
|
|
52
54
|
|
|
53
|
-
const grid = [0, 25, 50, 75, 100]
|
|
54
|
-
|
|
55
|
-
|
|
55
|
+
const grid = [0, 25, 50, 75, 100]
|
|
56
|
+
.map(tick => {
|
|
57
|
+
const y = margin.top + plotHeight - (tick / maxValue) * plotHeight;
|
|
58
|
+
return `
|
|
56
59
|
<line x1="${margin.left}" y1="${y}" x2="${width - margin.right}" y2="${y}" stroke="${PALETTE.border}" stroke-dasharray="4 4" />
|
|
57
60
|
<text x="${margin.left - 10}" y="${y + 5}" text-anchor="end" font-size="13" fill="${PALETTE.muted}">${tick}${valueSuffix}</text>
|
|
58
61
|
`;
|
|
59
|
-
|
|
62
|
+
})
|
|
63
|
+
.join('\n');
|
|
60
64
|
|
|
61
65
|
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
62
66
|
<svg xmlns="http://www.w3.org/2000/svg" width="${width}" height="${height}" viewBox="0 0 ${width} ${height}" role="img" aria-label="${escapeHtml(title)}">
|
|
@@ -68,39 +72,53 @@ function renderBarChart({ title, rows, valueSuffix = '%', maxValue = 100 }) {
|
|
|
68
72
|
}
|
|
69
73
|
|
|
70
74
|
function renderTrendList(trends) {
|
|
71
|
-
return trends
|
|
75
|
+
return trends
|
|
76
|
+
.map(
|
|
77
|
+
trend => `
|
|
72
78
|
<li>
|
|
73
79
|
<strong>${escapeHtml(trend.title)}</strong><br />
|
|
74
80
|
${escapeHtml(trend.summary)}<br />
|
|
75
81
|
<a href="${trend.source}">${escapeHtml(trend.source)}</a>
|
|
76
82
|
</li>
|
|
77
|
-
|
|
83
|
+
`,
|
|
84
|
+
)
|
|
85
|
+
.join('\n');
|
|
78
86
|
}
|
|
79
87
|
|
|
80
88
|
function renderCaseRows(localCases) {
|
|
81
|
-
return localCases
|
|
89
|
+
return localCases
|
|
90
|
+
.map(
|
|
91
|
+
caseResult => `
|
|
82
92
|
<tr>
|
|
83
93
|
<td>${escapeHtml(caseResult.title)}</td>
|
|
84
94
|
<td>${escapeHtml(caseResult.suite)}</td>
|
|
85
95
|
<td>${escapeHtml(caseResult.family)}</td>
|
|
86
|
-
${caseResult.results
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
96
|
+
${caseResult.results
|
|
97
|
+
.map(result => {
|
|
98
|
+
const bg = result.passed ? '#ecfdf5' : result.score >= 0.5 ? '#fff7ed' : '#fef2f2';
|
|
99
|
+
const fg = result.passed ? '#065f46' : result.score >= 0.5 ? '#9a3412' : '#991b1b';
|
|
100
|
+
return `<td style="background:${bg};color:${fg}">${result.score.toFixed(2)}<br /><span style="font-size:12px">${escapeHtml(result.summary)}</span></td>`;
|
|
101
|
+
})
|
|
102
|
+
.join('')}
|
|
91
103
|
</tr>
|
|
92
|
-
|
|
104
|
+
`,
|
|
105
|
+
)
|
|
106
|
+
.join('\n');
|
|
93
107
|
}
|
|
94
108
|
|
|
95
109
|
function renderSuiteSections(suiteCharts) {
|
|
96
110
|
if (suiteCharts.length === 0) return '';
|
|
97
|
-
return suiteCharts
|
|
111
|
+
return suiteCharts
|
|
112
|
+
.map(
|
|
113
|
+
chart => `
|
|
98
114
|
<section class="callout">
|
|
99
115
|
<h2>${escapeHtml(chart.title)}</h2>
|
|
100
116
|
<p>${escapeHtml(chart.description)}</p>
|
|
101
117
|
<img src="./${escapeHtml(chart.fileName)}" alt="${escapeHtml(chart.title)} chart" />
|
|
102
118
|
</section>
|
|
103
|
-
|
|
119
|
+
`,
|
|
120
|
+
)
|
|
121
|
+
.join('\n');
|
|
104
122
|
}
|
|
105
123
|
|
|
106
124
|
export function writeBenchmarkArtifacts({
|
|
@@ -114,9 +132,10 @@ export function writeBenchmarkArtifacts({
|
|
|
114
132
|
}) {
|
|
115
133
|
mkdirSync(outputDir, { recursive: true });
|
|
116
134
|
|
|
117
|
-
const localChartTitle =
|
|
118
|
-
|
|
119
|
-
|
|
135
|
+
const localChartTitle =
|
|
136
|
+
summary.local?.overall_scope === 'comparable_suites'
|
|
137
|
+
? 'Audrey vs Comparable Local Memory Baselines'
|
|
138
|
+
: 'Selected Audrey Regression Suite';
|
|
120
139
|
const localChart = renderBarChart({
|
|
121
140
|
title: localChartTitle,
|
|
122
141
|
rows: localOverall.map(row => ({ label: row.system, value: row.scorePercent })),
|
|
@@ -162,8 +181,10 @@ export function writeBenchmarkArtifacts({
|
|
|
162
181
|
operationsReadmeChart,
|
|
163
182
|
renderBarChart({
|
|
164
183
|
title: 'Audrey Memory Operations Benchmark',
|
|
165
|
-
rows: (localSuites.find(suite => suite.id === 'operations')?.overall || [])
|
|
166
|
-
|
|
184
|
+
rows: (localSuites.find(suite => suite.id === 'operations')?.overall || []).map(row => ({
|
|
185
|
+
label: row.system,
|
|
186
|
+
value: row.scorePercent,
|
|
187
|
+
})),
|
|
167
188
|
}),
|
|
168
189
|
'utf8',
|
|
169
190
|
);
|
|
@@ -3,31 +3,46 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
|
|
3
3
|
import { basename, dirname, resolve } from 'node:path';
|
|
4
4
|
import { fileURLToPath } from 'node:url';
|
|
5
5
|
import { writeGuardBenchConformanceCard } from './create-conformance-card.mjs';
|
|
6
|
-
import {
|
|
6
|
+
import {
|
|
7
|
+
computeGuardBenchArtifactHashes,
|
|
8
|
+
validateGuardBenchArtifacts,
|
|
9
|
+
} from './validate-guardbench-artifacts.mjs';
|
|
7
10
|
import { publicArtifactValue } from './public-paths.mjs';
|
|
8
11
|
|
|
9
12
|
const ROOT = resolve(dirname(fileURLToPath(import.meta.url)), '..');
|
|
10
13
|
const KNOWN_ADAPTERS = new Map([
|
|
11
|
-
[
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
14
|
+
[
|
|
15
|
+
'mem0',
|
|
16
|
+
{
|
|
17
|
+
name: 'mem0-platform',
|
|
18
|
+
path: 'benchmarks/adapters/mem0-platform.mjs',
|
|
19
|
+
requiredEnv: ['MEM0_API_KEY'],
|
|
20
|
+
},
|
|
21
|
+
],
|
|
22
|
+
[
|
|
23
|
+
'mem0-platform',
|
|
24
|
+
{
|
|
25
|
+
name: 'mem0-platform',
|
|
26
|
+
path: 'benchmarks/adapters/mem0-platform.mjs',
|
|
27
|
+
requiredEnv: ['MEM0_API_KEY'],
|
|
28
|
+
},
|
|
29
|
+
],
|
|
30
|
+
[
|
|
31
|
+
'zep',
|
|
32
|
+
{
|
|
33
|
+
name: 'zep-cloud',
|
|
34
|
+
path: 'benchmarks/adapters/zep-cloud.mjs',
|
|
35
|
+
requiredEnv: ['ZEP_API_KEY'],
|
|
36
|
+
},
|
|
37
|
+
],
|
|
38
|
+
[
|
|
39
|
+
'zep-cloud',
|
|
40
|
+
{
|
|
41
|
+
name: 'zep-cloud',
|
|
42
|
+
path: 'benchmarks/adapters/zep-cloud.mjs',
|
|
43
|
+
requiredEnv: ['ZEP_API_KEY'],
|
|
44
|
+
},
|
|
45
|
+
],
|
|
31
46
|
]);
|
|
32
47
|
|
|
33
48
|
export function parseExternalArgs(argv = process.argv.slice(2)) {
|
|
@@ -127,13 +142,19 @@ export function evaluateAdapterConformance(summary, adapterName) {
|
|
|
127
142
|
.filter(row => row.system === resolvedAdapterName);
|
|
128
143
|
|
|
129
144
|
if (adapterRows.length !== expectedScenarios) {
|
|
130
|
-
failures.push(
|
|
145
|
+
failures.push(
|
|
146
|
+
`Adapter ${resolvedAdapterName} returned ${adapterRows.length}/${expectedScenarios} scenario rows`,
|
|
147
|
+
);
|
|
131
148
|
}
|
|
132
149
|
if (systemSummary && systemSummary.scenarios !== expectedScenarios) {
|
|
133
|
-
failures.push(
|
|
150
|
+
failures.push(
|
|
151
|
+
`Adapter ${resolvedAdapterName} system summary has ${systemSummary.scenarios}/${expectedScenarios} scenarios`,
|
|
152
|
+
);
|
|
134
153
|
}
|
|
135
154
|
if (systemSummary && systemSummary.redactionLeaks !== 0) {
|
|
136
|
-
failures.push(
|
|
155
|
+
failures.push(
|
|
156
|
+
`Adapter ${resolvedAdapterName} leaked ${systemSummary.redactionLeaks} seeded secret(s) in decision output`,
|
|
157
|
+
);
|
|
137
158
|
}
|
|
138
159
|
if (adapterRows.some(row => row.external !== true)) {
|
|
139
160
|
failures.push(`Adapter ${resolvedAdapterName} rows are not marked external`);
|
|
@@ -270,7 +291,8 @@ async function main() {
|
|
|
270
291
|
const card = child.status === 0 ? writeGuardBenchConformanceCard({ dir: run.outDir }) : null;
|
|
271
292
|
console.log(`External GuardBench metadata: ${metadataPath}`);
|
|
272
293
|
if (card) console.log(`External GuardBench conformance card: ${card.path}`);
|
|
273
|
-
process.exitCode =
|
|
294
|
+
process.exitCode =
|
|
295
|
+
child.status === 0 && validation.ok && adapterConformance.ok ? 0 : (child.status ?? 1);
|
|
274
296
|
}
|
|
275
297
|
|
|
276
298
|
if (process.argv[1] && process.argv[1].endsWith('run-external-guardbench.mjs')) {
|
package/benchmarks/run.js
CHANGED
|
@@ -68,7 +68,9 @@ function normalizeSuiteSelection(value = 'all') {
|
|
|
68
68
|
|
|
69
69
|
const invalid = selected.filter(token => !ALL_SUITE_IDS.includes(token));
|
|
70
70
|
if (invalid.length > 0) {
|
|
71
|
-
throw new Error(
|
|
71
|
+
throw new Error(
|
|
72
|
+
`Unknown benchmark suite(s): ${invalid.join(', ')}. Valid: all, ${ALL_SUITE_IDS.join(', ')}`,
|
|
73
|
+
);
|
|
72
74
|
}
|
|
73
75
|
return [...new Set(selected)];
|
|
74
76
|
}
|
|
@@ -94,12 +96,19 @@ function evaluateCase(benchmarkCase, results) {
|
|
|
94
96
|
const expected = (benchmarkCase.expectAny || []).map(normalize);
|
|
95
97
|
const required = (benchmarkCase.expectAll || []).map(normalize);
|
|
96
98
|
const forbidden = (benchmarkCase.forbid || []).map(normalize);
|
|
97
|
-
const firstMatchIndex =
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
99
|
+
const firstMatchIndex =
|
|
100
|
+
expected.length === 0
|
|
101
|
+
? -1
|
|
102
|
+
: normalizedContents.findIndex(content =>
|
|
103
|
+
expected.some(expectation => content.includes(expectation)),
|
|
104
|
+
);
|
|
105
|
+
const firstForbiddenIndex = normalizedContents.findIndex(content =>
|
|
106
|
+
forbidden.some(blocked => content.includes(blocked)),
|
|
107
|
+
);
|
|
101
108
|
const matched = firstMatchIndex !== -1;
|
|
102
|
-
const requiredMatches = required.filter(expectation =>
|
|
109
|
+
const requiredMatches = required.filter(expectation =>
|
|
110
|
+
normalizedContents.some(content => content.includes(expectation)),
|
|
111
|
+
);
|
|
103
112
|
const matchedRequired = required.length > 0 && requiredMatches.length === required.length;
|
|
104
113
|
const leakedForbidden = firstForbiddenIndex !== -1;
|
|
105
114
|
|
|
@@ -108,16 +117,21 @@ function evaluateCase(benchmarkCase, results) {
|
|
|
108
117
|
return {
|
|
109
118
|
passed: score === 1,
|
|
110
119
|
score,
|
|
111
|
-
summary: leakedForbidden
|
|
120
|
+
summary: leakedForbidden
|
|
121
|
+
? 'leaked restricted content'
|
|
122
|
+
: results.length === 0
|
|
123
|
+
? 'correct abstention'
|
|
124
|
+
: 'no leak, but retrieved tangential context',
|
|
112
125
|
};
|
|
113
126
|
}
|
|
114
127
|
|
|
115
128
|
if (required.length > 0) {
|
|
116
|
-
const score =
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
129
|
+
const score =
|
|
130
|
+
matchedRequired && !leakedForbidden
|
|
131
|
+
? 1
|
|
132
|
+
: leakedForbidden
|
|
133
|
+
? 0
|
|
134
|
+
: Math.min(0.5, requiredMatches.length / required.length);
|
|
121
135
|
const missing = required.filter(expectation => !requiredMatches.includes(expectation));
|
|
122
136
|
return {
|
|
123
137
|
passed: score === 1,
|
|
@@ -154,7 +168,9 @@ async function seedRetrievalCase(brain, benchmarkCase) {
|
|
|
154
168
|
const ids = [];
|
|
155
169
|
for (let index = 0; index < benchmarkCase.memory.length; index++) {
|
|
156
170
|
const memory = benchmarkCase.memory[index];
|
|
157
|
-
const supersedes = Number.isInteger(memory.supersedesIndex)
|
|
171
|
+
const supersedes = Number.isInteger(memory.supersedesIndex)
|
|
172
|
+
? ids[memory.supersedesIndex]
|
|
173
|
+
: undefined;
|
|
158
174
|
const id = await brain.encode({
|
|
159
175
|
content: memory.content,
|
|
160
176
|
source: memory.source,
|
|
@@ -264,7 +280,9 @@ async function executeGuardStep(brain, step, refs) {
|
|
|
264
280
|
if (step.type === 'expectGuardAfterError') {
|
|
265
281
|
const receiptId = step.receiptRef ? refs.get(step.receiptRef) : step.receiptId;
|
|
266
282
|
if (!receiptId) {
|
|
267
|
-
throw new Error(
|
|
283
|
+
throw new Error(
|
|
284
|
+
`Missing guard benchmark receipt reference: ${step.receiptRef || step.receiptId}`,
|
|
285
|
+
);
|
|
268
286
|
}
|
|
269
287
|
|
|
270
288
|
try {
|
|
@@ -278,15 +296,19 @@ async function executeGuardStep(brain, step, refs) {
|
|
|
278
296
|
} catch (err) {
|
|
279
297
|
const message = err instanceof Error ? err.message : String(err);
|
|
280
298
|
if (step.errorIncludes && !message.includes(step.errorIncludes)) {
|
|
281
|
-
throw new Error(`Guard hardening expected "${step.errorIncludes}" but got "${message}"
|
|
299
|
+
throw new Error(`Guard hardening expected "${step.errorIncludes}" but got "${message}"`, {
|
|
300
|
+
cause: err,
|
|
301
|
+
});
|
|
282
302
|
}
|
|
283
303
|
const label = step.label ?? 'after_error_rejected';
|
|
284
|
-
return [
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
304
|
+
return [
|
|
305
|
+
{
|
|
306
|
+
id: `${receiptId}:${label}`,
|
|
307
|
+
content: `guard_hardened:${label} error:${message}`,
|
|
308
|
+
type: 'guard_hardening',
|
|
309
|
+
score: 1,
|
|
310
|
+
},
|
|
311
|
+
];
|
|
290
312
|
}
|
|
291
313
|
|
|
292
314
|
throw new Error(`Guard hardening expected an error for receipt ${receiptId}`);
|
|
@@ -299,18 +321,20 @@ async function seedGuardCase(brain, benchmarkCase) {
|
|
|
299
321
|
const refs = new Map();
|
|
300
322
|
const diagnostics = [];
|
|
301
323
|
for (const step of benchmarkCase.steps || []) {
|
|
302
|
-
diagnostics.push(...await executeGuardStep(brain, step, refs));
|
|
324
|
+
diagnostics.push(...(await executeGuardStep(brain, step, refs)));
|
|
303
325
|
}
|
|
304
326
|
return diagnostics;
|
|
305
327
|
}
|
|
306
328
|
|
|
307
329
|
function guardDecisionRows(decision) {
|
|
308
|
-
const rows = [
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
330
|
+
const rows = [
|
|
331
|
+
{
|
|
332
|
+
id: decision.receipt_id,
|
|
333
|
+
content: `decision:${decision.decision} verdict:${decision.verdict} risk:${decision.risk_score} ${decision.summary}`,
|
|
334
|
+
type: 'guard_decision',
|
|
335
|
+
score: 1,
|
|
336
|
+
},
|
|
337
|
+
];
|
|
314
338
|
|
|
315
339
|
for (const [index, warning] of decision.warnings.entries()) {
|
|
316
340
|
rows.push({
|
|
@@ -380,12 +404,15 @@ async function runAudreyCase(benchmarkCase, providerConfig) {
|
|
|
380
404
|
|
|
381
405
|
async function runBaselineCase(system, benchmarkCase, providerConfig) {
|
|
382
406
|
if (benchmarkCase.kind === 'guard') {
|
|
383
|
-
return [
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
407
|
+
return [
|
|
408
|
+
{
|
|
409
|
+
id: `${system.toLowerCase().replace(/[^a-z0-9]+/g, '-')}-guard-baseline`,
|
|
410
|
+
content:
|
|
411
|
+
'decision:go verdict:clear summary:retrieval-only baseline has no before-action guard controller',
|
|
412
|
+
type: 'guard_decision',
|
|
413
|
+
score: 0,
|
|
414
|
+
},
|
|
415
|
+
];
|
|
389
416
|
}
|
|
390
417
|
|
|
391
418
|
return runBaselineScenario(system, benchmarkCase, providerConfig, 5);
|
|
@@ -394,9 +421,18 @@ async function runBaselineCase(system, benchmarkCase, providerConfig) {
|
|
|
394
421
|
async function runSystemsForCase(benchmarkCase, providerConfig) {
|
|
395
422
|
const systems = [
|
|
396
423
|
{ system: 'Audrey', run: () => runAudreyCase(benchmarkCase, providerConfig) },
|
|
397
|
-
{
|
|
398
|
-
|
|
399
|
-
|
|
424
|
+
{
|
|
425
|
+
system: 'Vector Only',
|
|
426
|
+
run: () => runBaselineCase('Vector Only', benchmarkCase, providerConfig),
|
|
427
|
+
},
|
|
428
|
+
{
|
|
429
|
+
system: 'Keyword + Recency',
|
|
430
|
+
run: () => runBaselineCase('Keyword + Recency', benchmarkCase, providerConfig),
|
|
431
|
+
},
|
|
432
|
+
{
|
|
433
|
+
system: 'Recent Window',
|
|
434
|
+
run: () => runBaselineCase('Recent Window', benchmarkCase, providerConfig),
|
|
435
|
+
},
|
|
400
436
|
];
|
|
401
437
|
|
|
402
438
|
const results = [];
|
|
@@ -504,13 +540,13 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
|
|
|
504
540
|
|
|
505
541
|
if (audrey.scorePercent < settings.minAudreyScore) {
|
|
506
542
|
failures.push(
|
|
507
|
-
`Audrey score ${audrey.scorePercent.toFixed(1)}% fell below ${settings.minAudreyScore.toFixed(1)}
|
|
543
|
+
`Audrey score ${audrey.scorePercent.toFixed(1)}% fell below ${settings.minAudreyScore.toFixed(1)}%.`,
|
|
508
544
|
);
|
|
509
545
|
}
|
|
510
546
|
|
|
511
547
|
if (audrey.passRate < settings.minAudreyPassRate) {
|
|
512
548
|
failures.push(
|
|
513
|
-
`Audrey pass rate ${audrey.passRate.toFixed(1)}% fell below ${settings.minAudreyPassRate.toFixed(1)}
|
|
549
|
+
`Audrey pass rate ${audrey.passRate.toFixed(1)}% fell below ${settings.minAudreyPassRate.toFixed(1)}%.`,
|
|
514
550
|
);
|
|
515
551
|
}
|
|
516
552
|
|
|
@@ -518,8 +554,8 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
|
|
|
518
554
|
const margin = audrey.scorePercent - strongestBaseline.scorePercent;
|
|
519
555
|
if (margin < settings.minMarginOverBaseline) {
|
|
520
556
|
failures.push(
|
|
521
|
-
`Audrey beat ${strongestBaseline.system} by ${margin.toFixed(1)} points, below the required `
|
|
522
|
-
|
|
557
|
+
`Audrey beat ${strongestBaseline.system} by ${margin.toFixed(1)} points, below the required ` +
|
|
558
|
+
`${settings.minMarginOverBaseline.toFixed(1)}-point margin.`,
|
|
523
559
|
);
|
|
524
560
|
}
|
|
525
561
|
}
|
|
@@ -531,7 +567,9 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
|
|
|
531
567
|
return {
|
|
532
568
|
audrey,
|
|
533
569
|
strongestBaseline,
|
|
534
|
-
marginOverBaseline: strongestBaseline
|
|
570
|
+
marginOverBaseline: strongestBaseline
|
|
571
|
+
? audrey.scorePercent - strongestBaseline.scorePercent
|
|
572
|
+
: null,
|
|
535
573
|
thresholds: settings,
|
|
536
574
|
};
|
|
537
575
|
}
|
|
@@ -563,7 +601,9 @@ export async function runBenchmarkSuite(options = {}) {
|
|
|
563
601
|
}
|
|
564
602
|
}
|
|
565
603
|
|
|
566
|
-
const comparableCaseResults = caseResults.filter(
|
|
604
|
+
const comparableCaseResults = caseResults.filter(
|
|
605
|
+
caseResult => caseResult.comparable_to_baselines,
|
|
606
|
+
);
|
|
567
607
|
const overallCaseResults = comparableCaseResults.length > 0 ? comparableCaseResults : caseResults;
|
|
568
608
|
const overallScope = comparableCaseResults.length > 0 ? 'comparable_suites' : 'selected_suites';
|
|
569
609
|
const overallSuiteIds = [...new Set(overallCaseResults.map(caseResult => caseResult.suite))];
|
|
@@ -579,10 +619,14 @@ export async function runBenchmarkSuite(options = {}) {
|
|
|
579
619
|
suites: suiteIds,
|
|
580
620
|
},
|
|
581
621
|
methodology: {
|
|
582
|
-
localBenchmark:
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
622
|
+
localBenchmark:
|
|
623
|
+
'Local regression suite inspired by LongMemEval-style retrieval, operation-level lifecycle, and agent guard-loop benchmarks',
|
|
624
|
+
retrievalBenchmark:
|
|
625
|
+
'Information extraction, updates, reasoning, procedural learning, privacy, abstention, and conflict handling',
|
|
626
|
+
operationsBenchmark:
|
|
627
|
+
'Update, overwrite, delete, merge, and abstention behavior after lifecycle operations',
|
|
628
|
+
guardBenchmark:
|
|
629
|
+
'Memory-before-action controller behavior: receipts, learned tool-failure cautions, strict blocking reflexes, and guard-after hardening',
|
|
586
630
|
externalLeaderboard: 'Published LoCoMo scores from official papers and project blogs',
|
|
587
631
|
},
|
|
588
632
|
local: {
|
|
@@ -615,10 +659,10 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
|
|
|
615
659
|
});
|
|
616
660
|
const gate = args.check
|
|
617
661
|
? assertBenchmarkGuardrails(summary, {
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
662
|
+
minAudreyScore: args.minAudreyScore,
|
|
663
|
+
minAudreyPassRate: args.minAudreyPassRate,
|
|
664
|
+
minMarginOverBaseline: args.minMarginOverBaseline,
|
|
665
|
+
})
|
|
622
666
|
: null;
|
|
623
667
|
|
|
624
668
|
if (args.jsonOnly) {
|
|
@@ -629,15 +673,22 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
|
|
|
629
673
|
const lines = [];
|
|
630
674
|
lines.push('Audrey benchmark complete.');
|
|
631
675
|
lines.push('');
|
|
632
|
-
lines.push(
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
676
|
+
lines.push(
|
|
677
|
+
`Suites: ${summary.config.suites.map(suiteId => SUITE_LABELS.get(suiteId) || suiteId).join(', ')}`,
|
|
678
|
+
);
|
|
679
|
+
lines.push(
|
|
680
|
+
`Scope: ${summary.local.overall_scope} (${summary.local.overall_suite_ids.join(', ')})`,
|
|
681
|
+
);
|
|
682
|
+
const comparableCaseCount = summary.local.cases.filter(testCase =>
|
|
683
|
+
summary.local.overall_suite_ids.includes(testCase.suite),
|
|
684
|
+
).length;
|
|
685
|
+
lines.push(
|
|
686
|
+
`Cases: ${summary.local.cases.length} total; ${comparableCaseCount} in combined local chart`,
|
|
687
|
+
);
|
|
637
688
|
for (const row of summary.local.overall) {
|
|
638
689
|
lines.push(
|
|
639
|
-
`${row.system}: ${row.scorePercent.toFixed(1)}% score, ${row.passRate.toFixed(1)}% pass rate, `
|
|
640
|
-
|
|
690
|
+
`${row.system}: ${row.scorePercent.toFixed(1)}% score, ${row.passRate.toFixed(1)}% pass rate, ` +
|
|
691
|
+
`${row.avgDurationMs.toFixed(1)} ms avg/case`,
|
|
641
692
|
);
|
|
642
693
|
}
|
|
643
694
|
lines.push('');
|
|
@@ -667,7 +718,9 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
|
|
|
667
718
|
? `${gate.strongestBaseline.system} by ${gate.marginOverBaseline.toFixed(1)} points`
|
|
668
719
|
: 'all local baselines';
|
|
669
720
|
lines.push('');
|
|
670
|
-
lines.push(
|
|
721
|
+
lines.push(
|
|
722
|
+
`Regression gate passed: Audrey stayed above ${gate.thresholds.minAudreyScore.toFixed(1)}% and ahead of ${baselineLabel}.`,
|
|
723
|
+
);
|
|
671
724
|
}
|
|
672
725
|
|
|
673
726
|
out(lines.join('\n'));
|
|
@@ -42,9 +42,10 @@ export async function validateAdapterModuleFile(options = {}) {
|
|
|
42
42
|
} else {
|
|
43
43
|
try {
|
|
44
44
|
const mod = await import(pathToFileURL(adapterPath).href);
|
|
45
|
-
const candidate =
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
const candidate =
|
|
46
|
+
typeof mod.createGuardBenchAdapter === 'function'
|
|
47
|
+
? await mod.createGuardBenchAdapter()
|
|
48
|
+
: (mod.default ?? mod.adapter);
|
|
48
49
|
adapter = validateGuardBenchAdapter(candidate, adapterPath);
|
|
49
50
|
} catch (error) {
|
|
50
51
|
failures.push(error.message);
|
|
@@ -57,12 +58,12 @@ export async function validateAdapterModuleFile(options = {}) {
|
|
|
57
58
|
moduleFile: basename(adapterPath),
|
|
58
59
|
adapter: adapter
|
|
59
60
|
? {
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
61
|
+
name: adapter.name,
|
|
62
|
+
description: adapter.description ?? null,
|
|
63
|
+
hasSetup: typeof adapter.setup === 'function',
|
|
64
|
+
hasDecide: typeof adapter.decide === 'function',
|
|
65
|
+
hasCleanup: typeof adapter.cleanup === 'function',
|
|
66
|
+
}
|
|
66
67
|
: null,
|
|
67
68
|
contract: {
|
|
68
69
|
moduleFormat: 'ESM',
|
|
@@ -87,7 +88,9 @@ async function main() {
|
|
|
87
88
|
} else if (validation.ok) {
|
|
88
89
|
console.log(`GuardBench adapter module validation passed: ${validation.adapterPath}`);
|
|
89
90
|
console.log(`Adapter: ${validation.adapter.name}`);
|
|
90
|
-
console.log(
|
|
91
|
+
console.log(
|
|
92
|
+
`Methods: setup=${validation.adapter.hasSetup}, decide=${validation.adapter.hasDecide}, cleanup=${validation.adapter.hasCleanup}`,
|
|
93
|
+
);
|
|
91
94
|
} else {
|
|
92
95
|
console.error('GuardBench adapter module validation failed:');
|
|
93
96
|
for (const failure of validation.failures) console.error(`- ${failure}`);
|