audrey 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -0
- package/README.md +30 -6
- package/benchmarks/adapter-self-test.mjs +6 -2
- package/benchmarks/adapters/example-allow.mjs +5 -2
- package/benchmarks/adapters/mem0-platform.mjs +19 -12
- package/benchmarks/adapters/zep-cloud.mjs +51 -27
- package/benchmarks/baselines.js +11 -6
- package/benchmarks/build-leaderboard.mjs +36 -23
- package/benchmarks/cases.js +24 -12
- package/benchmarks/create-conformance-card.mjs +12 -3
- package/benchmarks/create-submission-bundle.mjs +22 -8
- package/benchmarks/dry-run-external-adapters.mjs +24 -12
- package/benchmarks/guardbench.js +354 -124
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
- package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/benchmarks/output/guardbench-conformance-card.json +12 -12
- package/benchmarks/output/guardbench-raw.json +243 -144
- package/benchmarks/output/guardbench-summary.json +354 -230
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +12 -12
- package/benchmarks/output/submission-bundle/guardbench-raw.json +243 -144
- package/benchmarks/output/submission-bundle/guardbench-summary.json +354 -230
- package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +21 -1
- package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +23 -2
- package/benchmarks/output/submission-bundle/submission-manifest.json +15 -15
- package/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/benchmarks/output/summary.json +58 -58
- package/benchmarks/perf-snapshot.js +12 -9
- package/benchmarks/perf.bench.js +14 -6
- package/benchmarks/public-paths.mjs +11 -5
- package/benchmarks/reference-results.js +10 -5
- package/benchmarks/report.js +48 -27
- package/benchmarks/run-external-guardbench.mjs +47 -25
- package/benchmarks/run.js +112 -59
- package/benchmarks/schemas/guardbench-raw.schema.json +21 -1
- package/benchmarks/schemas/guardbench-summary.schema.json +23 -2
- package/benchmarks/validate-adapter-module.mjs +13 -10
- package/benchmarks/validate-adapter-registry.mjs +16 -5
- package/benchmarks/validate-guardbench-artifacts.mjs +76 -19
- package/benchmarks/verify-external-evidence.mjs +86 -31
- package/benchmarks/verify-publication-artifacts.mjs +34 -11
- package/benchmarks/verify-submission-bundle.mjs +9 -4
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +5 -3
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +4 -3
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +479 -172
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts.map +1 -1
- package/dist/src/action-key.js +6 -2
- package/dist/src/action-key.js.map +1 -1
- package/dist/src/adaptive.d.ts.map +1 -1
- package/dist/src/adaptive.js +4 -2
- package/dist/src/adaptive.js.map +1 -1
- package/dist/src/affect.d.ts.map +1 -1
- package/dist/src/affect.js +8 -5
- package/dist/src/affect.js.map +1 -1
- package/dist/src/audrey.d.ts +11 -1
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +110 -53
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.d.ts.map +1 -1
- package/dist/src/capsule.js +37 -15
- package/dist/src/capsule.js.map +1 -1
- package/dist/src/causal.d.ts +1 -1
- package/dist/src/causal.d.ts.map +1 -1
- package/dist/src/causal.js +4 -2
- package/dist/src/causal.js.map +1 -1
- package/dist/src/confidence.d.ts.map +1 -1
- package/dist/src/confidence.js +5 -5
- package/dist/src/confidence.js.map +1 -1
- package/dist/src/consolidate.d.ts.map +1 -1
- package/dist/src/consolidate.js +17 -9
- package/dist/src/consolidate.js.map +1 -1
- package/dist/src/context.js +1 -1
- package/dist/src/context.js.map +1 -1
- package/dist/src/controller.d.ts +17 -1
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +73 -23
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.d.ts.map +1 -1
- package/dist/src/db.js +78 -27
- package/dist/src/db.js.map +1 -1
- package/dist/src/decay.d.ts +1 -1
- package/dist/src/decay.d.ts.map +1 -1
- package/dist/src/decay.js +1 -1
- package/dist/src/decay.js.map +1 -1
- package/dist/src/embedding.d.ts +12 -4
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +18 -16
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.d.ts.map +1 -1
- package/dist/src/encode.js +5 -4
- package/dist/src/encode.js.map +1 -1
- package/dist/src/events.d.ts +3 -2
- package/dist/src/events.d.ts.map +1 -1
- package/dist/src/events.js +7 -3
- package/dist/src/events.js.map +1 -1
- package/dist/src/export.d.ts.map +1 -1
- package/dist/src/export.js +21 -7
- package/dist/src/export.js.map +1 -1
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +1 -1
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.d.ts.map +1 -1
- package/dist/src/forget.js +12 -6
- package/dist/src/forget.js.map +1 -1
- package/dist/src/fts.d.ts.map +1 -1
- package/dist/src/fts.js +20 -8
- package/dist/src/fts.js.map +1 -1
- package/dist/src/hybrid-recall.d.ts.map +1 -1
- package/dist/src/hybrid-recall.js +12 -6
- package/dist/src/hybrid-recall.js.map +1 -1
- package/dist/src/impact.d.ts.map +1 -1
- package/dist/src/impact.js +26 -10
- package/dist/src/impact.js.map +1 -1
- package/dist/src/import.d.ts.map +1 -1
- package/dist/src/import.js +11 -6
- package/dist/src/import.js.map +1 -1
- package/dist/src/index.d.ts +5 -4
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +4 -4
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.d.ts.map +1 -1
- package/dist/src/interference.js +10 -5
- package/dist/src/interference.js.map +1 -1
- package/dist/src/introspect.d.ts.map +1 -1
- package/dist/src/introspect.js +12 -6
- package/dist/src/introspect.js.map +1 -1
- package/dist/src/llm.d.ts +2 -2
- package/dist/src/llm.d.ts.map +1 -1
- package/dist/src/llm.js +6 -6
- package/dist/src/llm.js.map +1 -1
- package/dist/src/migrate.d.ts.map +1 -1
- package/dist/src/migrate.js +10 -4
- package/dist/src/migrate.js.map +1 -1
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +6 -8
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/profile.d.ts.map +1 -1
- package/dist/src/profile.js.map +1 -1
- package/dist/src/promote.d.ts.map +1 -1
- package/dist/src/promote.js +16 -7
- package/dist/src/promote.js.map +1 -1
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +1 -2
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/recall.d.ts.map +1 -1
- package/dist/src/recall.js +85 -18
- package/dist/src/recall.js.map +1 -1
- package/dist/src/redact.d.ts.map +1 -1
- package/dist/src/redact.js +9 -4
- package/dist/src/redact.js.map +1 -1
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +1 -7
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.d.ts.map +1 -1
- package/dist/src/rollback.js +4 -2
- package/dist/src/rollback.js.map +1 -1
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +37 -14
- package/dist/src/routes.js.map +1 -1
- package/dist/src/rules-compiler.d.ts.map +1 -1
- package/dist/src/rules-compiler.js +24 -2
- package/dist/src/rules-compiler.js.map +1 -1
- package/dist/src/server.js +2 -2
- package/dist/src/server.js.map +1 -1
- package/dist/src/tool-trace.d.ts +2 -2
- package/dist/src/tool-trace.d.ts.map +1 -1
- package/dist/src/tool-trace.js +12 -4
- package/dist/src/tool-trace.js.map +1 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/ulid.js +1 -1
- package/dist/src/ulid.js.map +1 -1
- package/dist/src/utils.d.ts.map +1 -1
- package/dist/src/utils.js.map +1 -1
- package/dist/src/validate.d.ts.map +1 -1
- package/dist/src/validate.js +20 -10
- package/dist/src/validate.js.map +1 -1
- package/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/audrey-paper-v1.md +6 -6
- package/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/arxiv/main.tex +6 -6
- package/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/README.md +30 -6
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +12 -12
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +243 -144
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +354 -230
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +15 -15
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +52 -52
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +21 -1
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +23 -2
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +6 -6
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +6 -6
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/package.json +18 -5
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +40 -40
- package/examples/fintech-ops-demo.js +12 -5
- package/examples/healthcare-ops-demo.js +8 -4
- package/examples/ollama-memory-agent.js +41 -13
- package/examples/stripe-demo.js +12 -5
- package/package.json +18 -5
- package/scripts/audit-release-completion.mjs +179 -101
- package/scripts/create-arxiv-source.mjs +20 -14
- package/scripts/create-paper-submission-bundle.mjs +6 -2
- package/scripts/finalize-release.mjs +111 -36
- package/scripts/prepare-release-cut.mjs +14 -6
- package/scripts/publish-release-bundle.mjs +62 -23
- package/scripts/publish-release-github-api.mjs +89 -24
- package/scripts/smoke-cli.js +26 -6
- package/scripts/sync-paper-artifacts.mjs +5 -1
- package/scripts/verify-arxiv-compile.mjs +52 -16
- package/scripts/verify-arxiv-source.mjs +45 -15
- package/scripts/verify-browser-launch-plan.mjs +28 -11
- package/scripts/verify-browser-launch-results.mjs +32 -14
- package/scripts/verify-paper-artifacts.mjs +539 -79
- package/scripts/verify-paper-claims.mjs +48 -20
- package/scripts/verify-paper-submission-bundle.mjs +22 -11
- package/scripts/verify-publication-pack.mjs +23 -9
- package/scripts/verify-release-readiness.mjs +250 -71
|
@@ -157,8 +157,8 @@ function parseArgs(argv = process.argv.slice(2)) {
|
|
|
157
157
|
if (token === '--sizes' && argv[i + 1]) {
|
|
158
158
|
args.sizes = argv[++i]
|
|
159
159
|
.split(',')
|
|
160
|
-
.map(
|
|
161
|
-
.filter(
|
|
160
|
+
.map(s => Number.parseInt(s.trim(), 10))
|
|
161
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
162
162
|
} else if (token === '--recall-runs' && argv[i + 1]) {
|
|
163
163
|
args.recallRuns = Number.parseInt(argv[++i], 10);
|
|
164
164
|
} else if (token === '--out' && argv[i + 1]) {
|
|
@@ -180,7 +180,7 @@ async function runOneSize({ size, recallRuns }) {
|
|
|
180
180
|
});
|
|
181
181
|
|
|
182
182
|
const queueProcessingTimes = [];
|
|
183
|
-
audrey.on('post-encode-complete',
|
|
183
|
+
audrey.on('post-encode-complete', event => {
|
|
184
184
|
queueProcessingTimes.push(event.processing_ms);
|
|
185
185
|
});
|
|
186
186
|
|
|
@@ -223,7 +223,10 @@ async function runOneSize({ size, recallRuns }) {
|
|
|
223
223
|
}
|
|
224
224
|
}
|
|
225
225
|
|
|
226
|
-
export async function runPerfSnapshot({
|
|
226
|
+
export async function runPerfSnapshot({
|
|
227
|
+
sizes = DEFAULT_SIZES,
|
|
228
|
+
recallRuns = DEFAULT_RECALL_RUNS,
|
|
229
|
+
} = {}) {
|
|
227
230
|
const startedAt = Date.now();
|
|
228
231
|
const sized = [];
|
|
229
232
|
for (const size of sizes) {
|
|
@@ -265,11 +268,11 @@ export function formatMarkdownTable(snapshot) {
|
|
|
265
268
|
lines.push(
|
|
266
269
|
`Node ${snapshot.machine.node} · ${snapshot.machine.cpuCount}x ${snapshot.machine.cpuModel} · ${snapshot.machine.memoryGb} GB RAM`,
|
|
267
270
|
);
|
|
271
|
+
lines.push(`Generated ${snapshot.generatedAt}${snapshot.gitSha ? ` (${snapshot.gitSha})` : ''}`);
|
|
272
|
+
lines.push('');
|
|
268
273
|
lines.push(
|
|
269
|
-
|
|
274
|
+
'| Corpus size | Encode p50 (ms) | Encode p95 (ms) | Recall p50 (ms) | Recall p95 (ms) | Recall p99 (ms) |',
|
|
270
275
|
);
|
|
271
|
-
lines.push('');
|
|
272
|
-
lines.push('| Corpus size | Encode p50 (ms) | Encode p95 (ms) | Recall p50 (ms) | Recall p95 (ms) | Recall p99 (ms) |');
|
|
273
276
|
lines.push('|---|---|---|---|---|---|');
|
|
274
277
|
for (const row of snapshot.sizes) {
|
|
275
278
|
lines.push(
|
|
@@ -287,7 +290,7 @@ export function formatMarkdownTable(snapshot) {
|
|
|
287
290
|
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
|
|
288
291
|
const args = parseArgs();
|
|
289
292
|
runPerfSnapshot({ sizes: args.sizes, recallRuns: args.recallRuns })
|
|
290
|
-
.then(
|
|
293
|
+
.then(snapshot => {
|
|
291
294
|
if (args.out) {
|
|
292
295
|
writeFileSync(args.out, JSON.stringify(snapshot, null, 2) + '\n');
|
|
293
296
|
}
|
|
@@ -297,7 +300,7 @@ if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href)
|
|
|
297
300
|
process.stdout.write(formatMarkdownTable(snapshot) + '\n');
|
|
298
301
|
}
|
|
299
302
|
})
|
|
300
|
-
.catch(
|
|
303
|
+
.catch(err => {
|
|
301
304
|
console.error('[audrey] perf snapshot failed:', err);
|
|
302
305
|
process.exit(1);
|
|
303
306
|
});
|
package/benchmarks/perf.bench.js
CHANGED
|
@@ -136,16 +136,24 @@ export async function runPerfBenchmark({
|
|
|
136
136
|
};
|
|
137
137
|
|
|
138
138
|
if (queueProcessingTimes.length !== runs) {
|
|
139
|
-
throw new Error(
|
|
139
|
+
throw new Error(
|
|
140
|
+
`expected ${runs} post-encode queue events, got ${queueProcessingTimes.length}`,
|
|
141
|
+
);
|
|
140
142
|
}
|
|
141
143
|
|
|
142
144
|
assertBudget('encode response p95', result.encode_response_ms.p95, budgets.encodeResponseP95Ms);
|
|
143
145
|
assertBudget('hybrid recall p95', result.hybrid_recall_ms.p95, budgets.hybridRecallP95Ms);
|
|
144
|
-
assertBudget(
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
146
|
+
assertBudget(
|
|
147
|
+
'queue processing p50',
|
|
148
|
+
result.queue_processing_ms.p50,
|
|
149
|
+
budgets.queueProcessingP50Ms,
|
|
150
|
+
);
|
|
151
|
+
|
|
152
|
+
out(
|
|
153
|
+
`Audrey perf gate passed: encode p95=${result.encode_response_ms.p95}ms, ` +
|
|
154
|
+
`hybrid recall p95=${result.hybrid_recall_ms.p95}ms, ` +
|
|
155
|
+
`queue p50=${result.queue_processing_ms.p50}ms`,
|
|
156
|
+
);
|
|
149
157
|
return result;
|
|
150
158
|
} finally {
|
|
151
159
|
audrey.close();
|
|
@@ -34,15 +34,19 @@ export function publicCommand(command = []) {
|
|
|
34
34
|
export function publicArtifactValue(value) {
|
|
35
35
|
if (Array.isArray(value)) return value.map(item => publicArtifactValue(item));
|
|
36
36
|
if (value && typeof value === 'object') {
|
|
37
|
-
return Object.fromEntries(
|
|
37
|
+
return Object.fromEntries(
|
|
38
|
+
Object.entries(value).map(([key, item]) => [key, publicArtifactValue(item)]),
|
|
39
|
+
);
|
|
38
40
|
}
|
|
39
41
|
return publicPath(value);
|
|
40
42
|
}
|
|
41
43
|
|
|
42
44
|
export function containsLocalPath(text) {
|
|
43
|
-
return
|
|
44
|
-
|
|
45
|
-
|
|
45
|
+
return (
|
|
46
|
+
WINDOWS_DRIVE_PATTERN.test(text) ||
|
|
47
|
+
EXTENDED_PATH_PATTERN.test(text) ||
|
|
48
|
+
FILE_URL_PATTERN.test(text)
|
|
49
|
+
);
|
|
46
50
|
}
|
|
47
51
|
|
|
48
52
|
export function findLocalPathLeaks(value, path = '$') {
|
|
@@ -53,7 +57,9 @@ export function findLocalPathLeaks(value, path = '$') {
|
|
|
53
57
|
return value.flatMap((item, index) => findLocalPathLeaks(item, `${path}[${index}]`));
|
|
54
58
|
}
|
|
55
59
|
if (value && typeof value === 'object') {
|
|
56
|
-
return Object.entries(value).flatMap(([key, item]) =>
|
|
60
|
+
return Object.entries(value).flatMap(([key, item]) =>
|
|
61
|
+
findLocalPathLeaks(item, `${path}.${key}`),
|
|
62
|
+
);
|
|
57
63
|
}
|
|
58
64
|
return [];
|
|
59
65
|
}
|
|
@@ -44,27 +44,32 @@ export const PUBLISHED_LEADERBOARD = [
|
|
|
44
44
|
export const MEMORY_TRENDS = [
|
|
45
45
|
{
|
|
46
46
|
title: 'Memory is moving from flat retrieval to typed systems',
|
|
47
|
-
summary:
|
|
47
|
+
summary:
|
|
48
|
+
'Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.',
|
|
48
49
|
source: 'https://arxiv.org/abs/2507.03724',
|
|
49
50
|
},
|
|
50
51
|
{
|
|
51
52
|
title: 'Benchmarks now emphasize multi-session realism',
|
|
52
|
-
summary:
|
|
53
|
+
summary:
|
|
54
|
+
'LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.',
|
|
53
55
|
source: 'https://arxiv.org/abs/2410.10813',
|
|
54
56
|
},
|
|
55
57
|
{
|
|
56
58
|
title: 'Context engineering is now competing with retrieval-first designs',
|
|
57
|
-
summary:
|
|
59
|
+
summary:
|
|
60
|
+
'Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.',
|
|
58
61
|
source: 'https://www.letta.com/blog/memory-blocks',
|
|
59
62
|
},
|
|
60
63
|
{
|
|
61
64
|
title: 'Production teams care about latency and token footprint, not just recall quality',
|
|
62
|
-
summary:
|
|
65
|
+
summary:
|
|
66
|
+
'Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.',
|
|
63
67
|
source: 'https://arxiv.org/abs/2504.19413',
|
|
64
68
|
},
|
|
65
69
|
{
|
|
66
70
|
title: 'Temporal and multimodal memory are becoming table stakes',
|
|
67
|
-
summary:
|
|
71
|
+
summary:
|
|
72
|
+
'MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.',
|
|
68
73
|
source: 'https://arxiv.org/abs/2507.07957',
|
|
69
74
|
},
|
|
70
75
|
];
|
package/benchmarks/report.js
CHANGED
|
@@ -38,25 +38,29 @@ function renderBarChart({ title, rows, valueSuffix = '%', maxValue = 100 }) {
|
|
|
38
38
|
const barWidth = Math.max(32, Math.floor(plotWidth / Math.max(rows.length, 1)) - 18);
|
|
39
39
|
const gap = rows.length > 1 ? (plotWidth - barWidth * rows.length) / (rows.length - 1) : 0;
|
|
40
40
|
|
|
41
|
-
const bars = rows
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
41
|
+
const bars = rows
|
|
42
|
+
.map((row, index) => {
|
|
43
|
+
const value = Math.max(0, Math.min(maxValue, row.value));
|
|
44
|
+
const barHeight = (value / maxValue) * plotHeight;
|
|
45
|
+
const x = margin.left + index * (barWidth + gap);
|
|
46
|
+
const y = margin.top + plotHeight - barHeight;
|
|
47
|
+
return `
|
|
47
48
|
<rect x="${x}" y="${y}" width="${barWidth}" height="${barHeight}" rx="8" fill="${chartBarColor(row.label)}" />
|
|
48
49
|
<text x="${x + barWidth / 2}" y="${y - 10}" text-anchor="middle" font-size="15" fill="${PALETTE.accent}">${value.toFixed(1)}${valueSuffix}</text>
|
|
49
50
|
<text x="${x + barWidth / 2}" y="${height - 42}" text-anchor="middle" font-size="14" fill="${PALETTE.muted}">${escapeHtml(row.label)}</text>
|
|
50
51
|
`;
|
|
51
|
-
|
|
52
|
+
})
|
|
53
|
+
.join('\n');
|
|
52
54
|
|
|
53
|
-
const grid = [0, 25, 50, 75, 100]
|
|
54
|
-
|
|
55
|
-
|
|
55
|
+
const grid = [0, 25, 50, 75, 100]
|
|
56
|
+
.map(tick => {
|
|
57
|
+
const y = margin.top + plotHeight - (tick / maxValue) * plotHeight;
|
|
58
|
+
return `
|
|
56
59
|
<line x1="${margin.left}" y1="${y}" x2="${width - margin.right}" y2="${y}" stroke="${PALETTE.border}" stroke-dasharray="4 4" />
|
|
57
60
|
<text x="${margin.left - 10}" y="${y + 5}" text-anchor="end" font-size="13" fill="${PALETTE.muted}">${tick}${valueSuffix}</text>
|
|
58
61
|
`;
|
|
59
|
-
|
|
62
|
+
})
|
|
63
|
+
.join('\n');
|
|
60
64
|
|
|
61
65
|
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
62
66
|
<svg xmlns="http://www.w3.org/2000/svg" width="${width}" height="${height}" viewBox="0 0 ${width} ${height}" role="img" aria-label="${escapeHtml(title)}">
|
|
@@ -68,39 +72,53 @@ function renderBarChart({ title, rows, valueSuffix = '%', maxValue = 100 }) {
|
|
|
68
72
|
}
|
|
69
73
|
|
|
70
74
|
function renderTrendList(trends) {
|
|
71
|
-
return trends
|
|
75
|
+
return trends
|
|
76
|
+
.map(
|
|
77
|
+
trend => `
|
|
72
78
|
<li>
|
|
73
79
|
<strong>${escapeHtml(trend.title)}</strong><br />
|
|
74
80
|
${escapeHtml(trend.summary)}<br />
|
|
75
81
|
<a href="${trend.source}">${escapeHtml(trend.source)}</a>
|
|
76
82
|
</li>
|
|
77
|
-
|
|
83
|
+
`,
|
|
84
|
+
)
|
|
85
|
+
.join('\n');
|
|
78
86
|
}
|
|
79
87
|
|
|
80
88
|
function renderCaseRows(localCases) {
|
|
81
|
-
return localCases
|
|
89
|
+
return localCases
|
|
90
|
+
.map(
|
|
91
|
+
caseResult => `
|
|
82
92
|
<tr>
|
|
83
93
|
<td>${escapeHtml(caseResult.title)}</td>
|
|
84
94
|
<td>${escapeHtml(caseResult.suite)}</td>
|
|
85
95
|
<td>${escapeHtml(caseResult.family)}</td>
|
|
86
|
-
${caseResult.results
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
96
|
+
${caseResult.results
|
|
97
|
+
.map(result => {
|
|
98
|
+
const bg = result.passed ? '#ecfdf5' : result.score >= 0.5 ? '#fff7ed' : '#fef2f2';
|
|
99
|
+
const fg = result.passed ? '#065f46' : result.score >= 0.5 ? '#9a3412' : '#991b1b';
|
|
100
|
+
return `<td style="background:${bg};color:${fg}">${result.score.toFixed(2)}<br /><span style="font-size:12px">${escapeHtml(result.summary)}</span></td>`;
|
|
101
|
+
})
|
|
102
|
+
.join('')}
|
|
91
103
|
</tr>
|
|
92
|
-
|
|
104
|
+
`,
|
|
105
|
+
)
|
|
106
|
+
.join('\n');
|
|
93
107
|
}
|
|
94
108
|
|
|
95
109
|
function renderSuiteSections(suiteCharts) {
|
|
96
110
|
if (suiteCharts.length === 0) return '';
|
|
97
|
-
return suiteCharts
|
|
111
|
+
return suiteCharts
|
|
112
|
+
.map(
|
|
113
|
+
chart => `
|
|
98
114
|
<section class="callout">
|
|
99
115
|
<h2>${escapeHtml(chart.title)}</h2>
|
|
100
116
|
<p>${escapeHtml(chart.description)}</p>
|
|
101
117
|
<img src="./${escapeHtml(chart.fileName)}" alt="${escapeHtml(chart.title)} chart" />
|
|
102
118
|
</section>
|
|
103
|
-
|
|
119
|
+
`,
|
|
120
|
+
)
|
|
121
|
+
.join('\n');
|
|
104
122
|
}
|
|
105
123
|
|
|
106
124
|
export function writeBenchmarkArtifacts({
|
|
@@ -114,9 +132,10 @@ export function writeBenchmarkArtifacts({
|
|
|
114
132
|
}) {
|
|
115
133
|
mkdirSync(outputDir, { recursive: true });
|
|
116
134
|
|
|
117
|
-
const localChartTitle =
|
|
118
|
-
|
|
119
|
-
|
|
135
|
+
const localChartTitle =
|
|
136
|
+
summary.local?.overall_scope === 'comparable_suites'
|
|
137
|
+
? 'Audrey vs Comparable Local Memory Baselines'
|
|
138
|
+
: 'Selected Audrey Regression Suite';
|
|
120
139
|
const localChart = renderBarChart({
|
|
121
140
|
title: localChartTitle,
|
|
122
141
|
rows: localOverall.map(row => ({ label: row.system, value: row.scorePercent })),
|
|
@@ -162,8 +181,10 @@ export function writeBenchmarkArtifacts({
|
|
|
162
181
|
operationsReadmeChart,
|
|
163
182
|
renderBarChart({
|
|
164
183
|
title: 'Audrey Memory Operations Benchmark',
|
|
165
|
-
rows: (localSuites.find(suite => suite.id === 'operations')?.overall || [])
|
|
166
|
-
|
|
184
|
+
rows: (localSuites.find(suite => suite.id === 'operations')?.overall || []).map(row => ({
|
|
185
|
+
label: row.system,
|
|
186
|
+
value: row.scorePercent,
|
|
187
|
+
})),
|
|
167
188
|
}),
|
|
168
189
|
'utf8',
|
|
169
190
|
);
|
|
@@ -3,31 +3,46 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
|
|
3
3
|
import { basename, dirname, resolve } from 'node:path';
|
|
4
4
|
import { fileURLToPath } from 'node:url';
|
|
5
5
|
import { writeGuardBenchConformanceCard } from './create-conformance-card.mjs';
|
|
6
|
-
import {
|
|
6
|
+
import {
|
|
7
|
+
computeGuardBenchArtifactHashes,
|
|
8
|
+
validateGuardBenchArtifacts,
|
|
9
|
+
} from './validate-guardbench-artifacts.mjs';
|
|
7
10
|
import { publicArtifactValue } from './public-paths.mjs';
|
|
8
11
|
|
|
9
12
|
const ROOT = resolve(dirname(fileURLToPath(import.meta.url)), '..');
|
|
10
13
|
const KNOWN_ADAPTERS = new Map([
|
|
11
|
-
[
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
14
|
+
[
|
|
15
|
+
'mem0',
|
|
16
|
+
{
|
|
17
|
+
name: 'mem0-platform',
|
|
18
|
+
path: 'benchmarks/adapters/mem0-platform.mjs',
|
|
19
|
+
requiredEnv: ['MEM0_API_KEY'],
|
|
20
|
+
},
|
|
21
|
+
],
|
|
22
|
+
[
|
|
23
|
+
'mem0-platform',
|
|
24
|
+
{
|
|
25
|
+
name: 'mem0-platform',
|
|
26
|
+
path: 'benchmarks/adapters/mem0-platform.mjs',
|
|
27
|
+
requiredEnv: ['MEM0_API_KEY'],
|
|
28
|
+
},
|
|
29
|
+
],
|
|
30
|
+
[
|
|
31
|
+
'zep',
|
|
32
|
+
{
|
|
33
|
+
name: 'zep-cloud',
|
|
34
|
+
path: 'benchmarks/adapters/zep-cloud.mjs',
|
|
35
|
+
requiredEnv: ['ZEP_API_KEY'],
|
|
36
|
+
},
|
|
37
|
+
],
|
|
38
|
+
[
|
|
39
|
+
'zep-cloud',
|
|
40
|
+
{
|
|
41
|
+
name: 'zep-cloud',
|
|
42
|
+
path: 'benchmarks/adapters/zep-cloud.mjs',
|
|
43
|
+
requiredEnv: ['ZEP_API_KEY'],
|
|
44
|
+
},
|
|
45
|
+
],
|
|
31
46
|
]);
|
|
32
47
|
|
|
33
48
|
export function parseExternalArgs(argv = process.argv.slice(2)) {
|
|
@@ -127,13 +142,19 @@ export function evaluateAdapterConformance(summary, adapterName) {
|
|
|
127
142
|
.filter(row => row.system === resolvedAdapterName);
|
|
128
143
|
|
|
129
144
|
if (adapterRows.length !== expectedScenarios) {
|
|
130
|
-
failures.push(
|
|
145
|
+
failures.push(
|
|
146
|
+
`Adapter ${resolvedAdapterName} returned ${adapterRows.length}/${expectedScenarios} scenario rows`,
|
|
147
|
+
);
|
|
131
148
|
}
|
|
132
149
|
if (systemSummary && systemSummary.scenarios !== expectedScenarios) {
|
|
133
|
-
failures.push(
|
|
150
|
+
failures.push(
|
|
151
|
+
`Adapter ${resolvedAdapterName} system summary has ${systemSummary.scenarios}/${expectedScenarios} scenarios`,
|
|
152
|
+
);
|
|
134
153
|
}
|
|
135
154
|
if (systemSummary && systemSummary.redactionLeaks !== 0) {
|
|
136
|
-
failures.push(
|
|
155
|
+
failures.push(
|
|
156
|
+
`Adapter ${resolvedAdapterName} leaked ${systemSummary.redactionLeaks} seeded secret(s) in decision output`,
|
|
157
|
+
);
|
|
137
158
|
}
|
|
138
159
|
if (adapterRows.some(row => row.external !== true)) {
|
|
139
160
|
failures.push(`Adapter ${resolvedAdapterName} rows are not marked external`);
|
|
@@ -270,7 +291,8 @@ async function main() {
|
|
|
270
291
|
const card = child.status === 0 ? writeGuardBenchConformanceCard({ dir: run.outDir }) : null;
|
|
271
292
|
console.log(`External GuardBench metadata: ${metadataPath}`);
|
|
272
293
|
if (card) console.log(`External GuardBench conformance card: ${card.path}`);
|
|
273
|
-
process.exitCode =
|
|
294
|
+
process.exitCode =
|
|
295
|
+
child.status === 0 && validation.ok && adapterConformance.ok ? 0 : (child.status ?? 1);
|
|
274
296
|
}
|
|
275
297
|
|
|
276
298
|
if (process.argv[1] && process.argv[1].endsWith('run-external-guardbench.mjs')) {
|