audrey 0.23.1 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +101 -15
- package/LICENSE +21 -21
- package/README.md +232 -6
- package/SECURITY.md +2 -1
- package/benchmarks/adapter-kit.mjs +20 -0
- package/benchmarks/adapter-self-test.mjs +166 -0
- package/benchmarks/adapters/example-allow.mjs +28 -0
- package/benchmarks/adapters/mem0-platform.mjs +267 -0
- package/benchmarks/adapters/registry.json +51 -0
- package/benchmarks/adapters/zep-cloud.mjs +280 -0
- package/benchmarks/baselines.js +169 -0
- package/benchmarks/build-leaderboard.mjs +170 -0
- package/benchmarks/cases.js +537 -0
- package/benchmarks/create-conformance-card.mjs +139 -0
- package/benchmarks/create-submission-bundle.mjs +176 -0
- package/benchmarks/dry-run-external-adapters.mjs +165 -0
- package/benchmarks/guardbench.js +1125 -0
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/guardbench-manifest.json +414 -0
- package/benchmarks/output/guardbench-raw.json +1271 -0
- package/benchmarks/output/guardbench-summary.json +2107 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
- package/benchmarks/output/submission-bundle/guardbench-raw.json +1271 -0
- package/benchmarks/output/submission-bundle/guardbench-summary.json +2107 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +184 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +249 -0
- package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/benchmarks/output/summary.json +2354 -0
- package/benchmarks/perf-snapshot.js +304 -0
- package/benchmarks/perf.bench.js +161 -0
- package/benchmarks/public-paths.mjs +78 -0
- package/benchmarks/reference-results.js +70 -0
- package/benchmarks/report.js +259 -0
- package/benchmarks/run-external-guardbench.mjs +281 -0
- package/benchmarks/run.js +682 -0
- package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/schemas/guardbench-raw.schema.json +184 -0
- package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/schemas/guardbench-summary.schema.json +249 -0
- package/benchmarks/snapshots/perf-0.22.2.json +123 -0
- package/benchmarks/snapshots/perf-0.23.0.json +123 -0
- package/benchmarks/validate-adapter-module.mjs +104 -0
- package/benchmarks/validate-adapter-registry.mjs +134 -0
- package/benchmarks/validate-adapter-self-test.mjs +96 -0
- package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
- package/benchmarks/verify-external-evidence.mjs +296 -0
- package/benchmarks/verify-publication-artifacts.mjs +286 -0
- package/benchmarks/verify-submission-bundle.mjs +167 -0
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +1 -1
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +65 -3
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +675 -157
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts +9 -0
- package/dist/src/action-key.d.ts.map +1 -0
- package/dist/src/action-key.js +49 -0
- package/dist/src/action-key.js.map +1 -0
- package/dist/src/adaptive.js +5 -5
- package/dist/src/affect.js +8 -8
- package/dist/src/audrey.d.ts +13 -0
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +68 -3
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.js +4 -4
- package/dist/src/causal.js +3 -3
- package/dist/src/consolidate.js +48 -48
- package/dist/src/controller.d.ts +78 -6
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +273 -53
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.js +172 -172
- package/dist/src/decay.js +8 -8
- package/dist/src/embedding.d.ts +2 -1
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +39 -29
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.js +6 -6
- package/dist/src/feedback.d.ts +6 -0
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +6 -0
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.js +12 -12
- package/dist/src/hybrid-recall.js +9 -9
- package/dist/src/impact.js +6 -6
- package/dist/src/import.d.ts +3 -3
- package/dist/src/import.js +41 -41
- package/dist/src/index.d.ts +5 -4
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +3 -3
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.js +14 -14
- package/dist/src/introspect.js +18 -18
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +41 -0
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/promote.js +7 -7
- package/dist/src/prompts.js +118 -118
- package/dist/src/recall.js +30 -30
- package/dist/src/reflexes.d.ts +1 -0
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +3 -0
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.js +4 -4
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +71 -2
- package/dist/src/routes.js.map +1 -1
- package/dist/src/validate.js +25 -25
- package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/MEMORY_BENCHMARKING.md +59 -0
- package/docs/PRODUCTION_BACKLOG.md +304 -0
- package/docs/paper/00-master.md +48 -0
- package/docs/paper/01-introduction.md +27 -0
- package/docs/paper/02-related-work.md +47 -0
- package/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/04-design.md +164 -0
- package/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/06-implementation.md +113 -0
- package/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/claim-register.json +138 -0
- package/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/LICENSE +21 -0
- package/docs/paper/output/submission-bundle/README.md +555 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1271 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +2107 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +184 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +249 -0
- package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
- package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
- package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
- package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
- package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
- package/docs/paper/output/submission-bundle/package.json +212 -0
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
- package/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/publication-pack.json +81 -0
- package/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/references.bib +222 -0
- package/package.json +87 -4
- package/scripts/audit-release-completion.mjs +362 -0
- package/scripts/create-arxiv-source.mjs +362 -0
- package/scripts/create-paper-submission-bundle.mjs +210 -0
- package/scripts/finalize-release.mjs +526 -0
- package/scripts/prepare-release-cut.mjs +269 -0
- package/scripts/publish-release-bundle.mjs +209 -0
- package/scripts/publish-release-github-api.mjs +429 -0
- package/scripts/run-vitest.mjs +34 -0
- package/scripts/smoke-cli.js +92 -0
- package/scripts/sync-paper-artifacts.mjs +109 -0
- package/scripts/verify-arxiv-compile.mjs +440 -0
- package/scripts/verify-arxiv-source.mjs +194 -0
- package/scripts/verify-browser-launch-plan.mjs +237 -0
- package/scripts/verify-browser-launch-results.mjs +285 -0
- package/scripts/verify-paper-artifacts.mjs +338 -0
- package/scripts/verify-paper-claims.mjs +226 -0
- package/scripts/verify-paper-submission-bundle.mjs +207 -0
- package/scripts/verify-publication-pack.mjs +196 -0
- package/scripts/verify-python-package.py +201 -0
- package/scripts/verify-release-readiness.mjs +785 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
|
2
|
+
import { basename, dirname, resolve } from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
import { loadExternalAdapters, runGuardBench } from './guardbench.js';
|
|
5
|
+
import { evaluateAdapterConformance } from './run-external-guardbench.mjs';
|
|
6
|
+
import { validateSchema } from './validate-guardbench-artifacts.mjs';
|
|
7
|
+
import { publicPath } from './public-paths.mjs';
|
|
8
|
+
|
|
9
|
+
// Parent of the directory that contains this module; relative paths below are resolved against it.
const ROOT = resolve(dirname(fileURLToPath(import.meta.url)), '..');

// Fallback paths used when the caller does not supply an adapter, output, or schema path.
const DEFAULT_ADAPTER = 'benchmarks/adapters/example-allow.mjs';
const DEFAULT_OUT = 'benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json';
const DEFAULT_SCHEMA = 'benchmarks/schemas/guardbench-adapter-self-test.schema.json';

// Field names published as `contract.requiredResultFields` in the self-test report.
const RESULT_FIELDS = [
  'decision', 'riskScore', 'evidenceIds',
  'recommendedActions', 'summary', 'recallErrors',
];
|
|
21
|
+
|
|
22
|
+
/**
 * Parse command-line tokens for the adapter self-test CLI.
 *
 * Recognized flags: `--adapter <path>`, `--out <path>`, `--json`, `--no-write`,
 * and `--help` / `-h`.
 *
 * @param {string[]} [argv] - Tokens to parse. Defaults to `process.argv.slice(2)`.
 * @returns {{adapter: string, out: string, json: boolean, noWrite: boolean, help?: boolean}}
 *   Parsed options; `help` is only present when a help flag was given.
 * @throws {Error} When a token is not recognized, or when `--adapter` / `--out`
 *   is not followed by a value.
 */
export function parseAdapterSelfTestArgs(argv = process.argv.slice(2)) {
  const args = {
    adapter: DEFAULT_ADAPTER,
    out: DEFAULT_OUT,
    json: false,
    noWrite: false,
  };

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (token === '--adapter' || token === '--out') {
      const value = argv[i + 1];
      // A value-taking flag with nothing after it used to fall through to
      // "Unknown argument", which blamed a perfectly valid flag name.
      if (!value) throw new Error(`Missing value for ${token}`);
      if (token === '--adapter') args.adapter = value;
      else args.out = value;
      i++; // Skip the value that was just consumed.
    } else if (token === '--json') args.json = true;
    else if (token === '--no-write') args.noWrite = true;
    else if (token === '--help' || token === '-h') args.help = true;
    else throw new Error(`Unknown argument: ${token}`);
  }

  return args;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Build the CLI help text, including the default adapter and report paths.
 *
 * Lists every flag the argument parser accepts, including `-h` / `--help`,
 * which the parser handled but the text previously omitted.
 *
 * @returns {string} Multi-line usage text ending with a newline.
 */
function usage() {
  return `Usage: node benchmarks/adapter-self-test.mjs [options]

Options:
  --adapter <path>  ESM GuardBench adapter path. Default: ${DEFAULT_ADAPTER}.
  --out <path>      JSON report path. Default: ${DEFAULT_OUT}.
  --json            Print the full JSON report.
  --no-write        Do not write the JSON report.
  -h, --help        Show this help text.
`;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Look up the summary row for one system in a GuardBench report.
 *
 * @param {{systemSummaries: Array<{system: string}>}} report - Report holding per-system rows.
 * @param {string} adapterName - Value to match against each row's `system` field.
 * @returns {object|null} The first matching row, or `null` when none matches.
 */
function systemSummary(report, adapterName) {
  const match = report.systemSummaries.find((row) => {
    return row.system === adapterName;
  });
  return match ?? null;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Extract the score fields for one adapter from a GuardBench report.
 *
 * @param {object} report - GuardBench report passed through to `systemSummary`.
 * @param {string} adapterName - System name whose summary row is read.
 * @returns {object} Score object; `scenarios` falls back to 0 and every other
 *   field falls back to `null` when the row or the field is missing.
 */
function scoreFromReport(report, adapterName) {
  const row = systemSummary(report, adapterName);
  // Read one field from the (possibly absent) row, substituting a fallback for null/undefined.
  const read = (field, fallback = null) => row?.[field] ?? fallback;

  return {
    scenarios: read('scenarios', 0),
    fullContractPassRate: read('passRate'),
    decisionAccuracy: read('decisionAccuracy'),
    evidenceRecall: read('evidenceRecall'),
    redactionLeaks: read('redactionLeaks'),
    latency: read('latency'),
  };
}
|
|
69
|
+
|
|
70
|
+
/**
 * Read a UTF-8 file synchronously and parse its contents as JSON.
 *
 * @param {string} path - File to read.
 * @returns {*} The parsed JSON value.
 */
function readJson(path) {
  const raw = readFileSync(path, 'utf-8');
  return JSON.parse(raw);
}
|
|
73
|
+
|
|
74
|
+
/**
 * Validate a self-test report against the adapter self-test JSON schema.
 *
 * @param {object} report - Self-test report to validate.
 * @param {object} [options]
 * @param {string} [options.schema] - Schema path resolved against the repository root;
 *   defaults to the bundled self-test schema.
 * @param {object} [options.schemaObject] - Already-parsed schema; when given, no file is read.
 * @returns {*} Whatever `validateSchema` returns (the caller in this file treats it as a list of errors).
 */
export function validateAdapterSelfTestReport(report, options = {}) {
  const schemaLocation = resolve(ROOT, options.schema ?? DEFAULT_SCHEMA);

  let schemaDefinition = options.schemaObject;
  if (schemaDefinition == null) {
    schemaDefinition = readJson(schemaLocation);
  }

  return validateSchema(report, schemaDefinition, 'guardbench-adapter-self-test');
}
|
|
79
|
+
|
|
80
|
+
/**
 * Load one GuardBench adapter, run the benchmark with it, and build a
 * schema-validated self-test report.
 *
 * @param {object} [options]
 * @param {string} [options.adapterPath] - Adapter module path (takes precedence over `adapter`).
 * @param {string} [options.adapter] - Adapter module path; defaults to the example adapter.
 * @param {string|null} [options.out] - Report path; the report is written only when this is truthy.
 * @param {boolean} [options.write] - Pass `false` to suppress writing even when `out` is set.
 * @returns {Promise<object>} The self-test report. When a file was written, `outPath` is added
 *   to the returned object after the write, so the file itself does not contain it.
 * @throws {Error} When the adapter file is missing, the loader does not return exactly one
 *   adapter, or the report fails schema validation.
 */
export async function runGuardBenchAdapterSelfTest(options = {}) {
  const requestedAdapter = options.adapterPath ?? options.adapter ?? DEFAULT_ADAPTER;
  const adapterPath = resolve(ROOT, requestedAdapter);
  if (!existsSync(adapterPath)) {
    throw new Error(`GuardBench adapter not found: ${adapterPath}`);
  }

  const adapters = await loadExternalAdapters([adapterPath]);
  if (adapters.length !== 1) {
    throw new Error(`GuardBench adapter self-test expected 1 adapter, got ${adapters.length}`);
  }
  const adapter = adapters[0];

  const report = await runGuardBench({ externalAdapters: adapters });
  const conformance = evaluateAdapterConformance(report, adapter.name);
  const score = scoreFromReport(report, conformance.adapter);

  // Key order is kept stable because the object is serialized verbatim below.
  const selfTest = {
    schemaVersion: '1.0.0',
    suite: 'GuardBench adapter self-test',
    generatedAt: new Date().toISOString(),
    ok: conformance.ok,
    adapter: {
      name: adapter.name,
      path: publicPath(adapterPath),
      moduleFile: basename(adapterPath),
      description: adapter.description ?? null,
    },
    conformance,
    score,
    contract: {
      expectedAnswersWithheld: true,
      lowScoreAllowed: true,
      requiredScenarioRows: report.scenarios,
      requiredResultFields: RESULT_FIELDS,
      redactionLeakTolerance: 0,
    },
    failures: conformance.failures,
  };

  const schemaErrors = validateAdapterSelfTestReport(selfTest);
  if (schemaErrors.length > 0) {
    throw new Error(`GuardBench adapter self-test schema validation failed: ${schemaErrors.join('; ')}`);
  }

  const shouldWrite = Boolean(options.out) && options.write !== false;
  if (!shouldWrite) {
    return selfTest;
  }

  const outPath = resolve(ROOT, options.out);
  mkdirSync(dirname(outPath), { recursive: true });
  const serialized = JSON.stringify(selfTest, null, 2);
  writeFileSync(outPath, `${serialized}\n`, 'utf-8');
  selfTest.outPath = publicPath(outPath);
  return selfTest;
}
|
|
131
|
+
|
|
132
|
+
/**
 * Render a score rate as a percentage with one decimal place.
 *
 * @param {number|null|undefined} rate - Rate that is multiplied by 100 for display.
 * @returns {string} e.g. `"87.5%"`, or `"n/a"` when the rate is null/undefined.
 */
function formatRate(rate) {
  // `null * 100` is 0, so a missing score would otherwise print as "0.0%".
  return rate == null ? 'n/a' : `${(rate * 100).toFixed(1)}%`;
}

/**
 * CLI entry point: parse arguments, run the adapter self-test, print the outcome,
 * and set the process exit code (0 when the self-test is ok, 1 otherwise).
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseAdapterSelfTestArgs();
  if (args.help) {
    console.log(usage());
    return;
  }

  const result = await runGuardBenchAdapterSelfTest({
    adapter: args.adapter,
    out: args.noWrite ? null : args.out,
    write: !args.noWrite,
  });

  if (args.json) {
    console.log(JSON.stringify(result, null, 2));
  } else if (result.ok) {
    console.log(`GuardBench adapter self-test passed: ${result.adapter.name}`);
    console.log(`Contract rows: ${result.conformance.scenarios}/${result.conformance.expectedScenarios}`);
    console.log(`Full-contract score: ${formatRate(result.score.fullContractPassRate)}`);
    console.log(`Decision accuracy: ${formatRate(result.score.decisionAccuracy)}`);
    if (result.outPath) console.log(`Self-test report: ${result.outPath}`);
  } else {
    console.error(`GuardBench adapter self-test failed: ${result.adapter.name}`);
    for (const failure of result.failures) console.error(`- ${failure}`);
  }

  // Set exitCode rather than calling exit() so pending output can flush.
  process.exitCode = result.ok ? 0 : 1;
}
|
|
160
|
+
|
|
161
|
+
// Run the CLI only when the script path given to Node resolves to this module's file.
if (process.argv[1]) {
  const invokedPath = resolve(process.argv[1]);
  if (invokedPath === fileURLToPath(import.meta.url)) {
    main().catch((error) => {
      console.error(error.message);
      process.exit(1);
    });
  }
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { defineGuardBenchAdapter } from '../adapter-kit.mjs';
|
|
2
|
+
|
|
3
|
+
/**
 * Example GuardBench adapter that needs no credentials: `setup` counts the seeded
 * inputs, and `decide` always returns an `allow` decision describing those counts.
 */
export default defineGuardBenchAdapter({
  name: 'Example Allow Adapter',
  description: 'Credential-free GuardBench adapter example. It always allows and is useful for adapter-loading smoke tests.',

  /** Record how much seed data the scenario carries; the result is read back as `state` in `decide`. */
  async setup({ scenario }) {
    const { seed } = scenario;
    const memories = seed.seededMemories ?? [];
    const toolEvents = seed.seededToolEvents ?? [];
    return {
      memoryCount: memories.length,
      toolEventCount: toolEvents.length,
      hasFaultInjection: Boolean(seed.faultInjection),
    };
  },

  /** Always allow, with a summary assembled from the counts captured in `setup`. */
  async decide({ scenario, state }) {
    const notes = [];
    notes.push(`Example adapter loaded ${state.memoryCount} seeded memories`);
    notes.push(`${state.toolEventCount} seeded tool events`);

    const noise = scenario.seed.seededNoise;
    notes.push(noise ? `${noise.count} noise memories` : 'no noise block');

    if (state.hasFaultInjection) {
      notes.push('fault injection present but unsupported');
    } else {
      notes.push('no fault injection');
    }

    return {
      decision: 'allow',
      riskScore: 0,
      evidenceIds: [],
      recommendedActions: [],
      summary: notes.join('; '),
    };
  },

  /** Nothing to release: this adapter holds no external resources. */
  async cleanup() {},
});
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
import { randomBytes } from 'node:crypto';
|
|
2
|
+
|
|
3
|
+
// Mem0 endpoint used when MEM0_BASE_URL is not set.
const DEFAULT_BASE_URL = 'https://api.mem0.ai';

// Event polling defaults, in milliseconds.
const DEFAULT_POLL_TIMEOUT_MS = 60 * 1000;
const DEFAULT_POLL_INTERVAL_MS = 1000;

// Maximum number of messages sent in one add request.
const BATCH_SIZE = 100;
|
|
7
|
+
|
|
8
|
+
/**
 * Read a mandatory environment variable.
 *
 * @param {string} name - Environment variable name.
 * @returns {string} The variable's value.
 * @throws {Error} When the variable is unset or empty.
 */
function requireEnv(name) {
  const { [name]: value } = process.env;
  if (value) {
    return value;
  }
  throw new Error(`${name} is required for the Mem0 GuardBench adapter.`);
}
|
|
15
|
+
|
|
16
|
+
/**
 * Return a promise that resolves after a timer delay.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
19
|
+
|
|
20
|
+
/**
 * Flatten an action descriptor into newline-separated text.
 *
 * @param {{action?: string, command?: string, tool?: string, cwd?: string, files?: string[]}} action
 * @returns {string} The truthy `action`, `command`, `tool`, `cwd`, and `files` entries,
 *   in that order, joined with `\n`.
 */
function actionText(action) {
  const parts = [action.action, action.command, action.tool, action.cwd];
  parts.push(...(action.files ?? []));
  const present = parts.filter((part) => Boolean(part));
  return present.join('\n');
}
|
|
25
|
+
|
|
26
|
+
/**
 * Lower-case a value as text; any falsy input yields an empty string.
 *
 * @param {*} text - Value to normalize.
 * @returns {string}
 */
function normalize(text) {
  const source = text ? text : '';
  return String(source).toLowerCase();
}
|
|
29
|
+
|
|
30
|
+
/**
 * Split text into lower-case alphanumeric tokens of three or more characters.
 *
 * @param {*} text - Value passed through `normalize` before splitting.
 * @returns {string[]} Tokens in input order; duplicates are kept.
 */
function tokenize(text) {
  // Every run of characters outside a-z / 0-9 becomes a single separator.
  const cleaned = normalize(text).replace(/[^a-z0-9]+/g, ' ').trim();
  const words = cleaned.split(/\s+/);
  return words.filter((word) => word.length > 2);
}
|
|
37
|
+
|
|
38
|
+
/**
 * Fraction of the tokens of `a` that also occur in `b`.
 *
 * @param {*} a - Text whose tokens are counted (repeated tokens count each time).
 * @param {*} b - Text whose distinct tokens form the lookup set.
 * @returns {number} A value from 0 to 1; 0 when `a` yields no tokens.
 */
function tokenOverlap(a, b) {
  const queryTokens = tokenize(a);
  if (queryTokens.length === 0) {
    return 0;
  }

  const candidateTokens = new Set(tokenize(b));
  const shared = queryTokens.filter((token) => candidateTokens.has(token));
  return shared.length / queryTokens.length;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Extract the text of a memory record.
 *
 * @param {object|null|undefined} memory - Record whose `memory`, `text`, then `content`
 *   fields are tried in that order.
 * @returns {string} The first non-nullish field as a string, or `''` when none is set.
 */
function memoryText(memory) {
  for (const field of ['memory', 'text', 'content']) {
    const candidate = memory?.[field];
    if (candidate !== undefined && candidate !== null) {
      return String(candidate);
    }
  }
  return '';
}
|
|
52
|
+
|
|
53
|
+
/**
 * Collect identifiers from a list of memory records.
 *
 * @param {Array<object|null|undefined>} memories - Records carrying `id` or `memory_id`.
 * @returns {Array} For each record, `id` (or `memory_id` when `id` is nullish),
 *   with falsy identifiers dropped.
 */
function evidenceIds(memories) {
  return memories.reduce((ids, memory) => {
    const identifier = memory?.id ?? memory?.memory_id;
    if (identifier) {
      ids.push(identifier);
    }
    return ids;
  }, []);
}
|
|
58
|
+
|
|
59
|
+
/**
 * Turn retrieved memories into a GuardBench decision for an action.
 *
 * Outcomes, checked in this order:
 *  - an unsupported fault was requested: `warn` (0.55);
 *  - the memory text contains a policy-like keyword: `block` (0.85) when any memory has
 *    a token overlap of at least 0.18 with the action text, otherwise `warn` (0.55);
 *  - at least one memory was retrieved: `warn` (0.35);
 *  - nothing was retrieved: `allow` (0).
 *
 * @param {object[]} memories - Memory records returned by the search.
 * @param {object} action - Action descriptor handed to `actionText`.
 * @param {string|null} [unsupportedFault] - Fault-injection label the adapter cannot emulate.
 * @returns {{decision: string, riskScore: number, evidenceIds: Array, recommendedActions: string[], summary: string}}
 */
function decisionFromMemories(memories, action, unsupportedFault = null) {
  if (unsupportedFault) {
    return {
      decision: 'warn',
      riskScore: 0.55,
      evidenceIds: evidenceIds(memories),
      recommendedActions: ['External adapter cannot inject storage faults into Mem0 Platform; verify memory health separately.'],
      summary: `Mem0 adapter cannot emulate fault injection: ${unsupportedFault}.`,
    };
  }

  const query = actionText(action);
  const texts = memories.map((memory) => memoryText(memory));
  const joined = texts.join('\n').toLowerCase();
  const mentionsPolicy = /\b(must-follow|never|do not|high-risk|conflicting)\b/i.test(joined);

  if (mentionsPolicy) {
    const relevant = texts.some((text) => tokenOverlap(query, text) >= 0.18);
    const recommendedActions = ['Review retrieved Mem0 memory before acting.'];

    if (relevant) {
      return {
        decision: 'block',
        riskScore: 0.85,
        evidenceIds: evidenceIds(memories),
        recommendedActions,
        summary: 'Mem0 retrieved policy-like memory with lexical overlap; adapter escalated from retrieval to a block.',
      };
    }
    return {
      decision: 'warn',
      riskScore: 0.55,
      evidenceIds: evidenceIds(memories),
      recommendedActions,
      summary: 'Mem0 retrieved policy-like memory without enough lexical overlap for a block.',
    };
  }

  if (memories.length === 0) {
    return {
      decision: 'allow',
      riskScore: 0,
      evidenceIds: [],
      recommendedActions: [],
      summary: 'Mem0 returned no relevant memories for this action.',
    };
  }

  return {
    decision: 'warn',
    riskScore: 0.35,
    evidenceIds: evidenceIds(memories),
    recommendedActions: ['Treat retrieved Mem0 memories as advisory context.'],
    summary: 'Mem0 retrieved related memory, but no policy-like control evidence was found.',
  };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Minimal HTTP client for the Mem0 endpoints this adapter uses:
 * adding messages, polling an event, searching memories, and deleting a user.
 */
class Mem0PlatformClient {
  /**
   * @param {object} [options]
   * @param {string} [options.apiKey] - API key; defaults to the MEM0_API_KEY environment
   *   variable (throws when that is unset).
   * @param {string} [options.baseUrl] - Base URL; defaults to MEM0_BASE_URL, then the built-in default.
   * @param {number|string} [options.pollTimeoutMs] - Maximum time to wait for an event, in ms;
   *   defaults to MEM0_EVENT_TIMEOUT_MS, then the built-in default.
   * @param {number|string} [options.pollIntervalMs] - Delay between event polls, in ms;
   *   defaults to MEM0_EVENT_POLL_INTERVAL_MS, then the built-in default.
   * @param {Function} [options.fetchImpl] - fetch-compatible function; defaults to `globalThis.fetch`.
   * @throws {TypeError} When no fetch implementation is available.
   */
  constructor({
    apiKey = requireEnv('MEM0_API_KEY'),
    // `||` rather than `??`: an environment variable set to the empty string counts as unset.
    baseUrl = process.env.MEM0_BASE_URL || DEFAULT_BASE_URL,
    pollTimeoutMs = process.env.MEM0_EVENT_TIMEOUT_MS || DEFAULT_POLL_TIMEOUT_MS,
    pollIntervalMs = process.env.MEM0_EVENT_POLL_INTERVAL_MS || DEFAULT_POLL_INTERVAL_MS,
    fetchImpl = globalThis.fetch,
  } = {}) {
    if (typeof fetchImpl !== 'function') {
      throw new TypeError('Mem0 GuardBench adapter requires a fetch implementation (globalThis.fetch or fetchImpl).');
    }

    // A non-numeric setting would become NaN: a NaN timeout makes waitForEvent give up
    // before its first poll, and a NaN interval turns polling into a ~1 ms loop.
    const timeout = Number(pollTimeoutMs);
    const interval = Number(pollIntervalMs);

    this.apiKey = apiKey;
    this.baseUrl = baseUrl.replace(/\/+$/, '');
    this.pollTimeoutMs = Number.isNaN(timeout) ? DEFAULT_POLL_TIMEOUT_MS : timeout;
    this.pollIntervalMs = Number.isNaN(interval) ? DEFAULT_POLL_INTERVAL_MS : interval;
    this.fetch = fetchImpl;
  }

  /**
   * Send an authenticated JSON request.
   *
   * @param {string} path - Path appended to the base URL.
   * @param {object} [options] - fetch init; `headers` are merged over the defaults.
   * @returns {Promise<*>} Parsed JSON body, or `null` for a 204 or an empty body.
   * @throws {Error} On a non-ok response; the message carries up to 500 characters of the body.
   */
  async request(path, options = {}) {
    // Call through a local binding: `this.fetch(...)` would pass the client as `this`,
    // which receiver-checking fetch implementations reject.
    const fetchImpl = this.fetch;
    const response = await fetchImpl(`${this.baseUrl}${path}`, {
      ...options,
      headers: {
        Authorization: `Token ${this.apiKey}`,
        'Content-Type': 'application/json',
        ...(options.headers ?? {}),
      },
    });

    if (!response.ok && response.status !== 204) {
      const body = await response.text();
      throw new Error(`Mem0 ${options.method ?? 'GET'} ${path} failed ${response.status}: ${body.slice(0, 500)}`);
    }

    if (response.status === 204) return null;
    const text = await response.text();
    return text ? JSON.parse(text) : null;
  }

  /**
   * Store messages for a user. When the response carries an `event_id`, wait for that event.
   *
   * @param {{userId: string, messages: object[], metadata?: object}} params
   * @returns {Promise<void>} Resolves immediately when `messages` is empty.
   */
  async addMessages({ userId, messages, metadata }) {
    if (messages.length === 0) return;
    const response = await this.request('/v3/memories/add/', {
      method: 'POST',
      body: JSON.stringify({
        user_id: userId,
        messages,
        metadata,
        infer: false,
      }),
    });
    if (response?.event_id) {
      await this.waitForEvent(response.event_id);
    }
  }

  /**
   * Poll an event until it reports SUCCEEDED or FAILED, or the poll timeout elapses.
   *
   * @param {string} eventId - Event identifier returned by an add request.
   * @returns {Promise<object>} The event payload once its status is SUCCEEDED.
   * @throws {Error} When the event reports FAILED or the timeout elapses first.
   */
  async waitForEvent(eventId) {
    // Encode the id the same way deleteUser encodes user ids.
    const eventPath = `/v1/event/${encodeURIComponent(eventId)}/`;
    const started = Date.now();
    while (Date.now() - started < this.pollTimeoutMs) {
      const event = await this.request(eventPath);
      if (event?.status === 'SUCCEEDED') return event;
      if (event?.status === 'FAILED') {
        throw new Error(`Mem0 event ${eventId} failed: ${JSON.stringify(event).slice(0, 500)}`);
      }
      await sleep(this.pollIntervalMs);
    }
    throw new Error(`Timed out waiting for Mem0 event ${eventId}.`);
  }

  /**
   * Search a user's memories.
   *
   * @param {{userId: string, query: string}} params
   * @returns {Promise<object[]>} The response when it is an array, else its `results`, else `[]`.
   */
  async search({ userId, query }) {
    // NOTE(review): unlike the other endpoints, this path has no trailing slash;
    // confirm against the Mem0 API reference before changing it.
    const response = await this.request('/v2/memories/search', {
      method: 'POST',
      body: JSON.stringify({
        query,
        version: 'v2',
        filters: { user_id: userId },
      }),
    });
    return Array.isArray(response) ? response : response?.results ?? [];
  }

  /**
   * Delete a user entity.
   *
   * @param {string} userId - User identifier; URL-encoded before use.
   * @returns {Promise<void>}
   */
  async deleteUser(userId) {
    await this.request(`/v2/entities/user/${encodeURIComponent(userId)}/`, {
      method: 'DELETE',
    });
  }
}
|
|
187
|
+
|
|
188
|
+
/**
 * Flatten a scenario's seed data into a list of `{ role: 'user', content }` messages.
 *
 * Messages are emitted in this order: one per seeded memory, one per seeded
 * tool event (a multi-line summary of the event's populated fields), then
 * `seed.seededNoise.count` filler messages.
 *
 * @param {object} scenario - Must have a `seed` object; `privateSeed` is optional.
 * @returns {Array<{role: string, content: string}>}
 */
function memoryMessagesFromScenario(scenario) {
  const asUserMessage = (content) => ({ role: 'user', content });
  const collected = [];

  for (const { content } of scenario.seed.seededMemories ?? []) {
    collected.push(asUserMessage(content));
  }

  for (const toolEvent of scenario.seed.seededToolEvents ?? []) {
    // Only events that declare an error pattern get the first private secret
    // appended, padded with 1990 filler characters in front of it.
    let paddedSecret = '';
    if (toolEvent.errorSummaryPattern && scenario.privateSeed?.seededSecrets?.[0]) {
      paddedSecret = `${'x'.repeat(1990)} ${scenario.privateSeed.seededSecrets[0]}`;
    }

    const lines = [`Tool event: ${toolEvent.tool ?? 'tool'}`];
    if (toolEvent.action) lines.push(`Action: ${toolEvent.action}`);
    if (toolEvent.outcome) lines.push(`Outcome: ${toolEvent.outcome}`);
    if (toolEvent.errorSummary) lines.push(`Error: ${toolEvent.errorSummary}`);
    if (toolEvent.errorSummaryPattern) lines.push(`Error pattern: ${toolEvent.errorSummaryPattern}`);
    if (paddedSecret) lines.push(`Error: ${paddedSecret}`);
    if (toolEvent.output) lines.push(`Output: ${toolEvent.output}`);

    collected.push(asUserMessage(lines.join('\n')));
  }

  // A missing or falsy count means no filler messages at all.
  const noiseCount = scenario.seed.seededNoise?.count || 0;
  for (let index = 0; index < noiseCount; index += 1) {
    collected.push(
      asUserMessage(`Irrelevant background memory ${index}: UI color preference, lunch note, or unrelated calendar detail.`),
    );
  }

  return collected;
}
|
|
220
|
+
|
|
221
|
+
/**
 * Upload `messages` through `client.addMessages` in consecutive slices of
 * `BATCH_SIZE`, one awaited call per slice.
 *
 * Each call carries metadata naming the benchmark, the scenario id and this
 * adapter. No call is made when `messages` is empty.
 *
 * @param {object} client - Object exposing an async `addMessages` method.
 * @param {object} params
 * @param {string} params.userId - Forwarded to every `addMessages` call.
 * @param {object} params.scenario - Its `id` is recorded as `scenario_id`.
 * @param {Array<object>} params.messages - Messages to upload, in order.
 * @returns {Promise<void>}
 */
async function addInBatches(client, { userId, scenario, messages }) {
  let offset = 0;
  while (offset < messages.length) {
    const batch = messages.slice(offset, offset + BATCH_SIZE);
    // Batches are sent strictly one after another, never in parallel.
    await client.addMessages({
      userId,
      messages: batch,
      metadata: {
        benchmark: 'guardbench',
        scenario_id: scenario.id,
        adapter: 'mem0-platform',
      },
    });
    offset += BATCH_SIZE;
  }
}
|
|
234
|
+
|
|
235
|
+
/**
 * Build the lower-cased Mem0 user id `<prefix>-<runId>-<scenario.id>` for a scenario.
 *
 * `MEM0_GUARDBENCH_USER_PREFIX` overrides the default prefix
 * `audrey-guardbench`. `MEM0_GUARDBENCH_RUN_ID` overrides the run id;
 * without it a new `<timestamp>-<16 hex chars>` value is generated on every call.
 *
 * @param {object} scenario - Its `id` forms the last segment.
 * @returns {string}
 */
function userIdForScenario(scenario) {
  const {
    MEM0_GUARDBENCH_USER_PREFIX: configuredPrefix,
    MEM0_GUARDBENCH_RUN_ID: configuredRunId,
  } = process.env;

  const prefix = configuredPrefix ?? 'audrey-guardbench';
  const runId = configuredRunId ?? `${Date.now()}-${randomBytes(8).toString('hex')}`;

  const rawId = `${prefix}-${runId}-${scenario.id}`;
  return rawId.toLowerCase();
}
|
|
240
|
+
|
|
241
|
+
/**
 * Build the GuardBench adapter object for the Mem0 Platform.
 *
 * The returned object exposes `name`, `description` and three async hooks:
 * - `setup({ scenario })` seeds the scenario's messages under a
 *   scenario-specific user id and returns `{ client, userId }`.
 * - `decide({ scenario, action, state })` searches that user's memories with
 *   the action text and turns the hits into a decision.
 * - `cleanup({ state })` deletes the seeded user unless
 *   `MEM0_GUARDBENCH_SKIP_CLEANUP` is `'1'`.
 *
 * @param {object} [options={}] - Passed unchanged to the `Mem0PlatformClient` constructor.
 * @returns {object} The adapter.
 */
export function createGuardBenchAdapter(options = {}) {
  return {
    name: 'Mem0 Platform',
    description: 'Mem0 Platform REST adapter using V3 add, V2 search, event polling, and entity cleanup.',
    async setup({ scenario }) {
      const client = new Mem0PlatformClient(options);
      const userId = userIdForScenario(scenario);
      const messages = memoryMessagesFromScenario(scenario);
      try {
        await addInBatches(client, { userId, scenario, messages });
      } catch (error) {
        // When seeding fails part-way, `{ client, userId }` is never returned,
        // so `cleanup` has no state to work with. Remove whatever was already
        // uploaded here, under the same opt-out switch `cleanup` honours.
        if (process.env.MEM0_GUARDBENCH_SKIP_CLEANUP !== '1') {
          try {
            await client.deleteUser(userId);
          } catch {
            // Deliberately best-effort: the user may not exist yet, and the
            // seeding error below is the one the caller needs to see.
          }
        }
        throw error;
      }
      return { client, userId };
    },
    async decide({ scenario, action, state }) {
      const memories = await state.client.search({
        userId: state.userId,
        query: actionText(action),
      });
      return decisionFromMemories(memories, action, scenario.seed.faultInjection);
    },
    async cleanup({ state }) {
      // `state` may be missing (e.g. setup threw), hence the optional chaining.
      if (state?.client && state?.userId && process.env.MEM0_GUARDBENCH_SKIP_CLEANUP !== '1') {
        await state.client.deleteUser(state.userId);
      }
    },
  };
}
|
|
266
|
+
|
|
267
|
+
// Default export: an adapter built by calling the factory with no arguments,
// so its `options` parameter falls back to `{}`.
export default createGuardBenchAdapter();
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.0.0",
|
|
3
|
+
"suite": "GuardBench adapter registry",
|
|
4
|
+
"adapters": [
|
|
5
|
+
{
|
|
6
|
+
"id": "example-allow",
|
|
7
|
+
"name": "Example Allow Adapter",
|
|
8
|
+
"path": "benchmarks/adapters/example-allow.mjs",
|
|
9
|
+
"status": "reference",
|
|
10
|
+
"credentialMode": "none",
|
|
11
|
+
"requiredEnv": [],
|
|
12
|
+
"description": "Credential-free reference adapter for validating module loading, self-test generation, and report validation.",
|
|
13
|
+
"commands": {
|
|
14
|
+
"moduleValidate": "npm run bench:guard:adapter-module:validate -- --adapter benchmarks/adapters/example-allow.mjs",
|
|
15
|
+
"selfTest": "npm run bench:guard:adapter-self-test -- --adapter benchmarks/adapters/example-allow.mjs",
|
|
16
|
+
"selfTestValidate": "npm run bench:guard:adapter-self-test:validate -- --report benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json",
|
|
17
|
+
"externalRun": "npm run bench:guard:adapter-conformance"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "mem0-platform",
|
|
22
|
+
"name": "Mem0 Platform",
|
|
23
|
+
"path": "benchmarks/adapters/mem0-platform.mjs",
|
|
24
|
+
"status": "external-system",
|
|
25
|
+
"credentialMode": "runtime-env",
|
|
26
|
+
"requiredEnv": ["MEM0_API_KEY"],
|
|
27
|
+
"description": "Mem0 Platform REST adapter using V3 add, V2 search, event polling, and entity cleanup.",
|
|
28
|
+
"commands": {
|
|
29
|
+
"moduleValidate": "npm run bench:guard:adapter-module:validate -- --adapter benchmarks/adapters/mem0-platform.mjs",
|
|
30
|
+
"selfTest": "npm run bench:guard:adapter-self-test -- --adapter benchmarks/adapters/mem0-platform.mjs",
|
|
31
|
+
"selfTestValidate": "npm run bench:guard:adapter-self-test:validate -- --report benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json",
|
|
32
|
+
"externalRun": "npm run bench:guard:mem0"
|
|
33
|
+
}
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
"id": "zep-cloud",
|
|
37
|
+
"name": "Zep Cloud",
|
|
38
|
+
"path": "benchmarks/adapters/zep-cloud.mjs",
|
|
39
|
+
"status": "external-system",
|
|
40
|
+
"credentialMode": "runtime-env",
|
|
41
|
+
"requiredEnv": ["ZEP_API_KEY"],
|
|
42
|
+
"description": "Zep Cloud REST adapter using v2 users, sessions, memory.add, graph.search, and user cleanup.",
|
|
43
|
+
"commands": {
|
|
44
|
+
"moduleValidate": "npm run bench:guard:adapter-module:validate -- --adapter benchmarks/adapters/zep-cloud.mjs",
|
|
45
|
+
"selfTest": "npm run bench:guard:adapter-self-test -- --adapter benchmarks/adapters/zep-cloud.mjs",
|
|
46
|
+
"selfTestValidate": "npm run bench:guard:adapter-self-test:validate -- --report benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json",
|
|
47
|
+
"externalRun": "npm run bench:guard:zep"
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
]
|
|
51
|
+
}
|