audrey 0.21.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +238 -0
- package/LICENSE +21 -21
- package/README.md +281 -33
- package/SECURITY.md +30 -0
- package/benchmarks/adapter-kit.mjs +20 -0
- package/benchmarks/adapter-self-test.mjs +166 -0
- package/benchmarks/adapters/example-allow.mjs +28 -0
- package/benchmarks/adapters/mem0-platform.mjs +267 -0
- package/benchmarks/adapters/registry.json +51 -0
- package/benchmarks/adapters/zep-cloud.mjs +280 -0
- package/benchmarks/baselines.js +169 -0
- package/benchmarks/build-leaderboard.mjs +170 -0
- package/benchmarks/cases.js +537 -0
- package/benchmarks/create-conformance-card.mjs +139 -0
- package/benchmarks/create-submission-bundle.mjs +176 -0
- package/benchmarks/dry-run-external-adapters.mjs +165 -0
- package/benchmarks/guardbench.js +1035 -0
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/guardbench-manifest.json +414 -0
- package/benchmarks/output/guardbench-raw.json +1171 -0
- package/benchmarks/output/guardbench-summary.json +1981 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
- package/benchmarks/output/submission-bundle/guardbench-raw.json +1171 -0
- package/benchmarks/output/submission-bundle/guardbench-summary.json +1981 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +164 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +228 -0
- package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/benchmarks/output/summary.json +2354 -0
- package/benchmarks/perf-snapshot.js +304 -0
- package/benchmarks/perf.bench.js +161 -0
- package/benchmarks/public-paths.mjs +78 -0
- package/benchmarks/reference-results.js +70 -0
- package/benchmarks/report.js +259 -0
- package/benchmarks/run-external-guardbench.mjs +281 -0
- package/benchmarks/run.js +682 -0
- package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/schemas/guardbench-raw.schema.json +164 -0
- package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/schemas/guardbench-summary.schema.json +228 -0
- package/benchmarks/snapshots/perf-0.22.2.json +123 -0
- package/benchmarks/snapshots/perf-0.23.0.json +123 -0
- package/benchmarks/validate-adapter-module.mjs +104 -0
- package/benchmarks/validate-adapter-registry.mjs +134 -0
- package/benchmarks/validate-adapter-self-test.mjs +96 -0
- package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
- package/benchmarks/verify-external-evidence.mjs +296 -0
- package/benchmarks/verify-publication-artifacts.mjs +286 -0
- package/benchmarks/verify-submission-bundle.mjs +167 -0
- package/dist/mcp-server/config.d.ts +5 -4
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +6 -8
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +281 -23
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +1186 -82
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts +9 -0
- package/dist/src/action-key.d.ts.map +1 -0
- package/dist/src/action-key.js +49 -0
- package/dist/src/action-key.js.map +1 -0
- package/dist/src/adaptive.d.ts.map +1 -1
- package/dist/src/adaptive.js +8 -6
- package/dist/src/adaptive.js.map +1 -1
- package/dist/src/affect.d.ts +4 -1
- package/dist/src/affect.d.ts.map +1 -1
- package/dist/src/affect.js +14 -12
- package/dist/src/affect.js.map +1 -1
- package/dist/src/audrey.d.ts +57 -4
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +512 -65
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.d.ts +2 -1
- package/dist/src/capsule.d.ts.map +1 -1
- package/dist/src/capsule.js +18 -8
- package/dist/src/capsule.js.map +1 -1
- package/dist/src/causal.d.ts.map +1 -1
- package/dist/src/causal.js +23 -5
- package/dist/src/causal.js.map +1 -1
- package/dist/src/confidence.d.ts.map +1 -1
- package/dist/src/confidence.js +3 -0
- package/dist/src/confidence.js.map +1 -1
- package/dist/src/consolidate.d.ts +1 -0
- package/dist/src/consolidate.d.ts.map +1 -1
- package/dist/src/consolidate.js +70 -54
- package/dist/src/consolidate.js.map +1 -1
- package/dist/src/controller.d.ts +94 -0
- package/dist/src/controller.d.ts.map +1 -0
- package/dist/src/controller.js +350 -0
- package/dist/src/controller.js.map +1 -0
- package/dist/src/db.d.ts.map +1 -1
- package/dist/src/db.js +181 -169
- package/dist/src/db.js.map +1 -1
- package/dist/src/decay.d.ts.map +1 -1
- package/dist/src/decay.js +62 -55
- package/dist/src/decay.js.map +1 -1
- package/dist/src/embedding.d.ts +2 -1
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +60 -22
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.d.ts +9 -2
- package/dist/src/encode.d.ts.map +1 -1
- package/dist/src/encode.js +25 -12
- package/dist/src/encode.js.map +1 -1
- package/dist/src/export.d.ts.map +1 -1
- package/dist/src/export.js +5 -3
- package/dist/src/export.js.map +1 -1
- package/dist/src/feedback.d.ts +35 -0
- package/dist/src/feedback.d.ts.map +1 -0
- package/dist/src/feedback.js +129 -0
- package/dist/src/feedback.js.map +1 -0
- package/dist/src/forget.d.ts.map +1 -1
- package/dist/src/forget.js +68 -60
- package/dist/src/forget.js.map +1 -1
- package/dist/src/fts.js +1 -1
- package/dist/src/fts.js.map +1 -1
- package/dist/src/hybrid-recall.d.ts +2 -1
- package/dist/src/hybrid-recall.d.ts.map +1 -1
- package/dist/src/hybrid-recall.js +41 -32
- package/dist/src/hybrid-recall.js.map +1 -1
- package/dist/src/impact.d.ts +47 -0
- package/dist/src/impact.d.ts.map +1 -0
- package/dist/src/impact.js +146 -0
- package/dist/src/impact.js.map +1 -0
- package/dist/src/import.d.ts +177 -1
- package/dist/src/import.d.ts.map +1 -1
- package/dist/src/import.js +235 -46
- package/dist/src/import.js.map +1 -1
- package/dist/src/index.d.ts +5 -1
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +3 -1
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.d.ts +5 -2
- package/dist/src/interference.d.ts.map +1 -1
- package/dist/src/interference.js +39 -32
- package/dist/src/interference.js.map +1 -1
- package/dist/src/introspect.js +18 -18
- package/dist/src/llm.d.ts.map +1 -1
- package/dist/src/llm.js +1 -0
- package/dist/src/llm.js.map +1 -1
- package/dist/src/migrate.d.ts.map +1 -1
- package/dist/src/migrate.js +21 -9
- package/dist/src/migrate.js.map +1 -1
- package/dist/src/preflight.d.ts +2 -1
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +66 -5
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/profile.d.ts +23 -0
- package/dist/src/profile.d.ts.map +1 -0
- package/dist/src/profile.js +51 -0
- package/dist/src/profile.js.map +1 -0
- package/dist/src/promote.d.ts.map +1 -1
- package/dist/src/promote.js +8 -9
- package/dist/src/promote.js.map +1 -1
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +165 -136
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/recall.d.ts +9 -6
- package/dist/src/recall.d.ts.map +1 -1
- package/dist/src/recall.js +204 -62
- package/dist/src/recall.js.map +1 -1
- package/dist/src/redact.d.ts +7 -1
- package/dist/src/redact.d.ts.map +1 -1
- package/dist/src/redact.js +94 -11
- package/dist/src/redact.js.map +1 -1
- package/dist/src/reflexes.d.ts +1 -0
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +3 -0
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.d.ts.map +1 -1
- package/dist/src/rollback.js +13 -8
- package/dist/src/rollback.js.map +1 -1
- package/dist/src/routes.d.ts +1 -0
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +251 -6
- package/dist/src/routes.js.map +1 -1
- package/dist/src/rules-compiler.d.ts.map +1 -1
- package/dist/src/rules-compiler.js +36 -6
- package/dist/src/rules-compiler.js.map +1 -1
- package/dist/src/server.d.ts +2 -1
- package/dist/src/server.d.ts.map +1 -1
- package/dist/src/server.js +42 -4
- package/dist/src/server.js.map +1 -1
- package/dist/src/tool-trace.d.ts.map +1 -1
- package/dist/src/tool-trace.js +42 -29
- package/dist/src/tool-trace.js.map +1 -1
- package/dist/src/types.d.ts +28 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/ulid.d.ts.map +1 -1
- package/dist/src/ulid.js +52 -2
- package/dist/src/ulid.js.map +1 -1
- package/dist/src/utils.d.ts.map +1 -1
- package/dist/src/utils.js +8 -1
- package/dist/src/utils.js.map +1 -1
- package/dist/src/validate.d.ts +2 -0
- package/dist/src/validate.d.ts.map +1 -1
- package/dist/src/validate.js +77 -46
- package/dist/src/validate.js.map +1 -1
- package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/MEMORY_BENCHMARKING.md +59 -0
- package/docs/PRODUCTION_BACKLOG.md +304 -0
- package/docs/paper/00-master.md +48 -0
- package/docs/paper/01-introduction.md +27 -0
- package/docs/paper/02-related-work.md +47 -0
- package/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/04-design.md +164 -0
- package/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/06-implementation.md +113 -0
- package/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/claim-register.json +138 -0
- package/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/LICENSE +21 -0
- package/docs/paper/output/submission-bundle/README.md +533 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1171 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +1981 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +164 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +228 -0
- package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
- package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
- package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
- package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
- package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
- package/docs/paper/output/submission-bundle/package.json +212 -0
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
- package/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/publication-pack.json +81 -0
- package/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/references.bib +222 -0
- package/package.json +103 -26
- package/scripts/audit-release-completion.mjs +362 -0
- package/scripts/create-arxiv-source.mjs +362 -0
- package/scripts/create-paper-submission-bundle.mjs +210 -0
- package/scripts/finalize-release.mjs +526 -0
- package/scripts/prepare-release-cut.mjs +269 -0
- package/scripts/publish-release-bundle.mjs +209 -0
- package/scripts/publish-release-github-api.mjs +429 -0
- package/scripts/run-vitest.mjs +34 -0
- package/scripts/smoke-cli.js +72 -0
- package/scripts/sync-paper-artifacts.mjs +109 -0
- package/scripts/verify-arxiv-compile.mjs +440 -0
- package/scripts/verify-arxiv-source.mjs +194 -0
- package/scripts/verify-browser-launch-plan.mjs +237 -0
- package/scripts/verify-browser-launch-results.mjs +285 -0
- package/scripts/verify-paper-artifacts.mjs +338 -0
- package/scripts/verify-paper-claims.mjs +226 -0
- package/scripts/verify-paper-submission-bundle.mjs +207 -0
- package/scripts/verify-publication-pack.mjs +196 -0
- package/scripts/verify-python-package.py +201 -0
- package/scripts/verify-release-readiness.mjs +741 -0
- package/docs/assets/benchmarks/local-benchmark.svg +0 -45
- package/docs/assets/benchmarks/operations-benchmark.svg +0 -45
- package/docs/assets/benchmarks/published-memory-standards.svg +0 -50
- package/docs/audrey-for-dummies.md +0 -670
- package/docs/benchmarking.md +0 -151
- package/docs/future-of-llm-memory.md +0 -452
- package/docs/mcp-hosts.md +0 -206
- package/docs/ollama-local-agents.md +0 -128
- package/docs/production-readiness.md +0 -128
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import { createEmbeddingProvider } from '../dist/src/embedding.js';
|
|
2
|
+
import { cosineSimilarity } from '../dist/src/utils.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Coerces a value to a lower-cased string.
 *
 * Every falsy input (null, undefined, '', 0, false, NaN) is treated as the
 * empty string before coercion.
 *
 * @param {*} text - Value to normalise.
 * @returns {string} Lower-cased string form of `text`.
 */
function normalize(text) {
  const raw = text ? text : '';
  return String(raw).toLowerCase();
}
|
|
7
|
+
|
|
8
|
+
/**
 * Splits text into lower-case alphanumeric tokens.
 *
 * The text is normalised first, then every run of characters outside
 * `a-z0-9` is treated as a separator. Empty pieces are discarded, so blank
 * or punctuation-only input yields an empty array.
 *
 * @param {*} text - Text to tokenise.
 * @returns {string[]} Tokens in their original order (duplicates kept).
 */
function tokenize(text) {
  const collapsed = normalize(text).replace(/[^a-z0-9]+/g, ' ').trim();
  const tokens = [];
  for (const piece of collapsed.split(/\s+/)) {
    if (piece) tokens.push(piece);
  }
  return tokens;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Fraction of query tokens that also occur in `content`.
 *
 * Each query token is counted individually, so a repeated query token that
 * matches contributes once per occurrence.
 *
 * @param {string[]} queryTokens - Tokens of the query.
 * @param {*} content - Memory content; tokenised here.
 * @returns {number} Value in [0, 1]; 0 when there are no query tokens.
 */
function keywordScore(queryTokens, content) {
  const vocabulary = new Set(tokenize(content));
  const total = queryTokens.length;
  if (total === 0) return 0;

  let hits = 0;
  for (let i = 0; i < total; i += 1) {
    if (vocabulary.has(queryTokens[i])) hits += 1;
  }
  return hits / total;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Returns the rows that have a finite numeric `score`, best first.
 *
 * Ties on score are broken by `createdAt` compared as strings, newest
 * (lexicographically greatest) first; a missing `createdAt` sorts as ''.
 * The input array is not modified.
 *
 * @param {Array<{score: *, createdAt?: *}>} rows - Candidate rows.
 * @returns {Array<object>} New array of the surviving rows in ranked order.
 */
function sortByScore(rows) {
  const byScoreThenNewest = (left, right) => {
    const scoreGap = right.score - left.score;
    if (scoreGap) return scoreGap;
    return String(right.createdAt || '').localeCompare(String(left.createdAt || ''));
  };

  // filter() copies, so the in-place sort below never touches `rows`.
  const ranked = rows.filter(row => Number.isFinite(row.score));
  ranked.sort(byScoreThenNewest);
  return ranked;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Expands a benchmark case's `memory` list into flat rows for the baselines.
 *
 * Id resolution order: an explicit `ids[index]`, then the memory's own `id`,
 * then the positional fallback `memory-<index + 1>`. Honouring `memory.id`
 * keeps identifiers stable when a case is rebuilt from a list that has had
 * entries removed; a purely positional id would then point at a different
 * memory than the one the caller stored.
 *
 * @param {{memory: Array<object>}} benchmarkCase - Case whose memories are flattened.
 * @param {string[]} [ids=[]] - Optional per-index id overrides.
 * @returns {Array<{id: string, content: *, source: *, createdAt: string, private: boolean}>}
 *   One row per memory. A missing `createdAt` defaults to midnight UTC on
 *   day `index + 1` of January 2026.
 */
function flattenMemories(benchmarkCase, ids = []) {
  return benchmarkCase.memory.map((memory, index) => ({
    id: ids[index] || memory.id || `memory-${index + 1}`,
    content: memory.content,
    source: memory.source,
    createdAt: memory.createdAt || new Date(Date.UTC(2026, 0, index + 1)).toISOString(),
    private: Boolean(memory.private),
  }));
}

/**
 * Wraps a query and a list of memories in the shape the baseline runners
 * read (`query`, `memory`, `options`).
 *
 * A memory's `id` is carried over when it has one, so retrieval results can
 * be matched back to the caller's own records. Memories without an id
 * produce exactly the same entry shape as before.
 *
 * @param {*} query - Query text for the synthetic case.
 * @param {Array<object>} memories - Memories to expose to the baseline.
 * @param {object} [options={}] - Passed through unchanged.
 * @returns {{query: *, memory: Array<object>, options: object}} Synthetic case.
 */
function buildSyntheticCase(query, memories, options = {}) {
  return {
    query,
    memory: memories.map(memory => ({
      // Only emit the key when an id exists so id-less cases keep their shape.
      ...(memory.id == null ? {} : { id: memory.id }),
      content: memory.content,
      source: memory.source,
      createdAt: memory.createdAt,
      private: memory.private,
    })),
    options,
  };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Runs the retrieval baseline selected by `system` against a case.
 *
 * @param {string} system - 'Vector Only', 'Keyword + Recency' or 'Recent Window'.
 * @param {object} syntheticCase - Case handed to the selected baseline.
 * @param {*} providerConfig - Only forwarded to the vector baseline.
 * @param {number} [limit=5] - Maximum number of rows requested.
 * @returns {Promise<Array<object>>} Rows returned by the selected baseline.
 * @throws {Error} When `system` is not one of the known baselines.
 */
async function runBaselineRetrieval(system, syntheticCase, providerConfig, limit = 5) {
  const runners = new Map([
    ['Vector Only', () => runVectorOnlyBaseline(syntheticCase, providerConfig, limit)],
    ['Keyword + Recency', () => runKeywordRecencyBaseline(syntheticCase, limit)],
    ['Recent Window', () => runRecentWindowBaseline(syntheticCase, limit)],
  ]);

  const runner = runners.get(system);
  if (!runner) {
    throw new Error(`Unknown baseline system: ${system}`);
  }
  return runner();
}
|
|
67
|
+
|
|
68
|
+
/**
 * Builds the memory record for an `encode` step and advances the counter.
 *
 * The id is `memory-<n>` where n is the 1-based count of memories created
 * through `state` so far. A missing `createdAt` defaults to midnight UTC on
 * day n of January 2026.
 *
 * @param {{counter: number}} state - Scenario state; `counter` is incremented.
 * @param {{memory: object}} step - Step carrying the memory payload.
 * @returns {{id: string, content: *, source: *, createdAt: string, private: boolean}} New record.
 */
function createOperationMemory(state, step) {
  // Post-increment first: the counter advances even if reading the payload throws.
  const ordinal = state.counter++ + 1;
  const payload = step.memory;
  const fallbackTimestamp = () => new Date(Date.UTC(2026, 0, ordinal)).toISOString();

  return {
    id: `memory-${ordinal}`,
    content: payload.content,
    source: payload.source,
    createdAt: payload.createdAt || fallbackTimestamp(),
    private: Boolean(payload.private),
  };
}
|
|
78
|
+
|
|
79
|
+
/**
 * Applies one scenario step to the in-memory baseline state.
 *
 * - `encode`: appends a new memory and, when `step.saveAs` is set, records
 *   its id under that alias.
 * - `forgetByQuery`: retrieves the single best match for `step.query` and
 *   removes the stored memory with that id, but only when the match has a
 *   finite, strictly positive score.
 * - `consolidate`: no-op for the baselines.
 *
 * NOTE(review): removal compares the id reported by retrieval with the ids
 * held in `state.memories`; confirm both sides use the same id scheme.
 *
 * @param {string} system - Baseline used for `forgetByQuery` retrieval.
 * @param {{counter: number, memories: Array<object>, aliases: Map}} state - Mutated in place.
 * @param {object} step - Step to apply; dispatched on `step.type`.
 * @param {*} providerConfig - Forwarded to retrieval.
 * @returns {Promise<void>}
 * @throws {Error} When `step.type` is not supported.
 */
async function applyBaselineStep(system, state, step, providerConfig) {
  switch (step.type) {
    case 'encode': {
      const created = createOperationMemory(state, step);
      state.memories.push(created);
      if (step.saveAs) {
        state.aliases.set(step.saveAs, created.id);
      }
      return;
    }

    case 'forgetByQuery': {
      const probe = buildSyntheticCase(step.query, state.memories, step.options);
      const [top] = await runBaselineRetrieval(system, probe, providerConfig, 1);
      const isConfidentHit = Boolean(top) && Number.isFinite(top.score) && top.score > 0;
      if (isConfidentHit) {
        state.memories = state.memories.filter(memory => memory.id !== top.id);
      }
      return;
    }

    case 'consolidate':
      return;

    default:
      throw new Error(`Unsupported baseline step: ${step.type}`);
  }
}
|
|
104
|
+
|
|
105
|
+
/**
 * Runs a baseline against a benchmark case.
 *
 * Cases whose `kind` is not 'operations' are retrieved from directly. For
 * 'operations' cases the steps are replayed in order against fresh state,
 * and retrieval then runs over whatever memories remain.
 *
 * @param {string} system - Baseline name.
 * @param {object} benchmarkCase - Case to evaluate.
 * @param {*} providerConfig - Forwarded to retrieval.
 * @param {number} [limit=5] - Maximum number of rows requested.
 * @returns {Promise<Array<object>>} Retrieved rows.
 */
export async function runBaselineScenario(system, benchmarkCase, providerConfig, limit = 5) {
  const isOperationsCase = benchmarkCase.kind === 'operations';
  if (!isOperationsCase) {
    return runBaselineRetrieval(system, benchmarkCase, providerConfig, limit);
  }

  const state = { counter: 0, memories: [], aliases: new Map() };
  const steps = benchmarkCase.steps || [];

  // Steps mutate shared state, so they are replayed strictly one at a time.
  for (const step of steps) {
    await applyBaselineStep(system, state, step, providerConfig);
  }

  const replayedCase = buildSyntheticCase(benchmarkCase.query, state.memories, benchmarkCase.options);
  return runBaselineRetrieval(system, replayedCase, providerConfig, limit);
}
|
|
127
|
+
|
|
128
|
+
/**
 * Keyword-overlap baseline with recency as the tie-breaker.
 *
 * Every memory is scored by the fraction of query tokens it contains, and
 * the ranked rows are truncated to `limit`.
 *
 * @param {object} benchmarkCase - Case providing `query` and `memory`.
 * @param {number} [limit=5] - Maximum number of rows returned.
 * @returns {Array<object>} Ranked rows, each tagged `type: 'episodic'`.
 */
export function runKeywordRecencyBaseline(benchmarkCase, limit = 5) {
  const queryTokens = tokenize(benchmarkCase.query);

  const scored = [];
  for (const memory of flattenMemories(benchmarkCase)) {
    scored.push({
      ...memory,
      type: 'episodic',
      score: keywordScore(queryTokens, memory.content),
    });
  }

  return sortByScore(scored).slice(0, limit);
}
|
|
136
|
+
|
|
137
|
+
/**
 * Recency-only baseline: returns the newest `limit` memories and ignores
 * the query.
 *
 * Memories are ordered by `createdAt` compared as strings, newest first.
 * Scores are synthetic: 1 for the newest row, dropping by 0.1 per position.
 *
 * @param {object} benchmarkCase - Case providing `memory`.
 * @param {number} [limit=3] - Window size.
 * @returns {Array<object>} Window rows, each tagged `type: 'episodic'`.
 */
export function runRecentWindowBaseline(benchmarkCase, limit = 3) {
  const newestFirst = flattenMemories(benchmarkCase);
  newestFirst.sort((a, b) => String(b.createdAt).localeCompare(String(a.createdAt)));

  const recent = newestFirst.slice(0, limit);
  return recent.map((memory, position) => ({
    ...memory,
    type: 'episodic',
    score: 1 - position * 0.1,
  }));
}
|
|
147
|
+
|
|
148
|
+
/**
 * Embedding-similarity baseline.
 *
 * Creates an embedding provider from `providerConfig`, awaits its `ready()`
 * hook when one exists, embeds the query and every memory, and ranks the
 * memories by `cosineSimilarity` against the query.
 *
 * @param {object} benchmarkCase - Case providing `query` and `memory`.
 * @param {*} providerConfig - Passed to `createEmbeddingProvider`.
 * @param {number} [limit=5] - Maximum number of rows returned.
 * @returns {Promise<Array<object>>} Ranked rows, each tagged `type: 'episodic'`.
 */
export async function runVectorOnlyBaseline(benchmarkCase, providerConfig, limit = 5) {
  const provider = createEmbeddingProvider(providerConfig);
  if (typeof provider.ready === 'function') {
    await provider.ready();
  }

  const queryBuffer = provider.vectorToBuffer(await provider.embed(benchmarkCase.query));

  const scored = [];
  // Memories are embedded one at a time, in order.
  for (const memory of flattenMemories(benchmarkCase)) {
    const memoryVector = await provider.embed(memory.content);
    const similarity = cosineSimilarity(queryBuffer, provider.vectorToBuffer(memoryVector), provider);
    scored.push({ ...memory, type: 'episodic', score: similarity });
  }

  return sortByScore(scored).slice(0, limit);
}
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
|
2
|
+
import { dirname, join, resolve } from 'node:path';
|
|
3
|
+
import { verifyGuardBenchSubmissionBundle } from './verify-submission-bundle.mjs';
|
|
4
|
+
import { validateSchema } from './validate-guardbench-artifacts.mjs';
|
|
5
|
+
import { publicPath } from './public-paths.mjs';
|
|
6
|
+
|
|
7
|
+
/**
 * Reads a UTF-8 file synchronously and parses it as JSON.
 *
 * @param {string} path - File to read.
 * @returns {*} Parsed JSON value.
 * @throws {Error} When the file cannot be read or is not valid JSON.
 */
function readJson(path) {
  const text = readFileSync(path, 'utf-8');
  return JSON.parse(text);
}
|
|
10
|
+
|
|
11
|
+
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param {number|null|undefined} value - Ratio, e.g. 0.5 for 50%.
 * @returns {string} e.g. '50.0%', or 'n/a' for null/undefined.
 */
function percent(value) {
  if (value == null) return 'n/a';
  const scaled = value * 100;
  return `${scaled.toFixed(1)}%`;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Renders a value for a table cell, with 'n/a' standing in for null/undefined.
 *
 * @param {*} value - Value to render.
 * @returns {string} `String(value)`, or 'n/a' when the value is nullish.
 */
function number(value) {
  if (value === null || value === undefined) {
    return 'n/a';
  }
  return String(value);
}
|
|
18
|
+
|
|
19
|
+
/**
 * Builds one (unranked) leaderboard row from a submission bundle directory.
 *
 * The bundle is verified first; the row is then filled from the bundle's
 * `submission-manifest.json`.
 *
 * @param {string} dir - Bundle directory (resolved against the cwd).
 * @returns {{subject: *, score: *, conformance: *, source: object, verification: *}} Row data.
 */
function rowFromBundle(dir) {
  const verification = verifyGuardBenchSubmissionBundle({ dir });

  const bundleRoot = resolve(dir);
  const manifest = readJson(join(bundleRoot, 'submission-manifest.json'));
  const { subject, score, conformance, generatedAt, files } = manifest;

  return {
    subject,
    score,
    conformance,
    source: {
      dir: publicPath(bundleRoot),
      manifestGeneratedAt: generatedAt,
      fileCount: files?.length ?? 0,
    },
    verification,
  };
}
|
|
34
|
+
|
|
35
|
+
/**
 * Sort comparator that defines leaderboard order.
 *
 * Criteria, applied in turn until one separates the rows:
 *   1. verified bundles first
 *   2. conformant adapters first
 *   3. higher fullContractPassRate, decisionAccuracy, evidenceRecall
 *      (a missing value counts as -1)
 *   4. fewer redactionLeaks, then lower latency.p95Ms
 *      (a missing value counts as Number.MAX_SAFE_INTEGER)
 *   5. subject name, ascending
 *
 * @param {object} a - Left row.
 * @param {object} b - Right row.
 * @returns {number} Negative when `a` ranks ahead of `b`.
 */
function compareRows(a, b) {
  const WORST = Number.MAX_SAFE_INTEGER;

  const verifiedGap = Number(b.verification.ok) - Number(a.verification.ok);
  if (verifiedGap) return verifiedGap;

  const conformanceGap = Number(b.conformance.ok) - Number(a.conformance.ok);
  if (conformanceGap) return conformanceGap;

  for (const metric of ['fullContractPassRate', 'decisionAccuracy', 'evidenceRecall']) {
    const gap = (b.score[metric] ?? -1) - (a.score[metric] ?? -1);
    if (gap) return gap;
  }

  const leakGap = (a.score.redactionLeaks ?? WORST) - (b.score.redactionLeaks ?? WORST);
  if (leakGap) return leakGap;

  const latencyGap = (a.score.latency?.p95Ms ?? WORST) - (b.score.latency?.p95Ms ?? WORST);
  if (latencyGap) return latencyGap;

  return a.subject.name.localeCompare(b.subject.name);
}
|
|
47
|
+
|
|
48
|
+
/**
 * Builds the in-memory leaderboard document from submission bundles.
 *
 * @param {object} [options={}]
 * @param {string[]} [options.bundleDirs] - Bundle directories; when empty or
 *   absent, 'benchmarks/output/submission-bundle' is used.
 * @returns {object} Leaderboard with `schemaVersion`, `suite`, `generatedAt`,
 *   the list of `ranking` criteria, the ranked `rows`, and `failures`
 *   (verification failures prefixed with the subject name).
 */
export function buildGuardBenchLeaderboard(options = {}) {
  const requested = options.bundleDirs;
  const bundleDirs = requested?.length ? requested : ['benchmarks/output/submission-bundle'];

  const ordered = bundleDirs.map(rowFromBundle);
  ordered.sort(compareRows);
  const rows = ordered.map((row, position) => ({ rank: position + 1, ...row }));

  const failures = [];
  rows.forEach(row => {
    row.verification.failures.forEach(failure => {
      failures.push(`${row.subject.name}: ${failure}`);
    });
  });

  return {
    schemaVersion: '1.0.0',
    suite: 'GuardBench leaderboard',
    generatedAt: new Date().toISOString(),
    ranking: [
      'verified bundle',
      'adapter conformance',
      'fullContractPassRate',
      'decisionAccuracy',
      'evidenceRecall',
      'redactionLeaks ascending',
      'latency.p95Ms ascending',
      'subject.name',
    ],
    rows,
    failures,
  };
}
|
|
72
|
+
|
|
73
|
+
/**
 * Builds the leaderboard, validates it against the leaderboard schema, and
 * writes the JSON and Markdown artifacts.
 *
 * Nothing is written when schema validation fails.
 *
 * @param {object} [options={}] - Also forwarded to `buildGuardBenchLeaderboard`.
 * @param {string} [options.outJson] - JSON output path.
 * @param {string} [options.outMd] - Markdown output path.
 * @param {string} [options.schemasDir] - Directory holding
 *   `guardbench-leaderboard.schema.json`.
 * @returns {{leaderboard: object, outJson: string, outMd: string}} The
 *   document and the resolved output paths.
 * @throws {Error} When the leaderboard does not satisfy the schema.
 */
export function writeGuardBenchLeaderboard(options = {}) {
  const resolveOption = (value, fallback) => resolve(value ?? fallback);
  const outJson = resolveOption(options.outJson, 'benchmarks/output/leaderboard/guardbench-leaderboard.json');
  const outMd = resolveOption(options.outMd, 'benchmarks/output/leaderboard/guardbench-leaderboard.md');
  const schemasDir = resolveOption(options.schemasDir, 'benchmarks/schemas');

  const leaderboard = buildGuardBenchLeaderboard(options);

  const schemaPath = join(schemasDir, 'guardbench-leaderboard.schema.json');
  const schemaErrors = validateSchema(leaderboard, readJson(schemaPath), 'guardbench-leaderboard');
  if (schemaErrors.length) {
    throw new Error(`GuardBench leaderboard schema validation failed: ${schemaErrors.join('; ')}`);
  }

  for (const target of [outJson, outMd]) {
    mkdirSync(dirname(target), { recursive: true });
  }

  const serialized = `${JSON.stringify(leaderboard, null, 2)}\n`;
  writeFileSync(outJson, serialized, 'utf-8');
  writeFileSync(outMd, renderMarkdown(leaderboard), 'utf-8');

  return { leaderboard, outJson, outMd };
}
|
|
89
|
+
|
|
90
|
+
/**
 * Makes free-form text safe to place inside one Markdown table cell.
 *
 * A literal `|` would end the cell and a line break would end the row, so
 * pipes are backslash-escaped and line breaks become spaces. Nullish values
 * render as an empty cell.
 *
 * @param {*} value - Cell content.
 * @returns {string} Escaped cell text.
 */
function escapeTableCell(value) {
  if (value == null) return '';
  return String(value).replace(/\|/g, '\\|').replace(/\r\n|\r|\n/g, ' ');
}

/**
 * Renders a leaderboard document as a Markdown report.
 *
 * The output is a title, the generation timestamp, one table row per
 * leaderboard row and, when present, a "Verification Failures" bullet list.
 * It ends with a trailing newline. Subject names and bundle paths come from
 * submission manifests, so they are escaped before being placed in the table.
 *
 * @param {{generatedAt: string, rows: Array<object>, failures: string[]}} leaderboard
 * @returns {string} Markdown text.
 */
export function renderMarkdown(leaderboard) {
  const lines = [
    '# GuardBench Leaderboard',
    '',
    `Generated: ${leaderboard.generatedAt}`,
    '',
    '| Rank | Subject | Verified | Conformant | Full Contract | Decision Accuracy | Evidence Recall | Redaction Leaks | p95 Latency | Bundle |',
    '|---:|---|---:|---:|---:|---:|---:|---:|---:|---|',
  ];

  for (const row of leaderboard.rows) {
    const p95 = row.score.latency?.p95Ms;
    const cells = [
      row.rank,
      escapeTableCell(row.subject.name),
      row.verification.ok ? 'yes' : 'no',
      row.conformance.ok ? 'yes' : 'no',
      percent(row.score.fullContractPassRate),
      percent(row.score.decisionAccuracy),
      percent(row.score.evidenceRecall),
      number(row.score.redactionLeaks),
      p95 == null ? 'n/a' : `${p95}ms`,
      escapeTableCell(row.source.dir),
    ];
    lines.push(`| ${cells.join(' | ')} |`);
  }

  if (leaderboard.failures.length) {
    lines.push('', '## Verification Failures', '');
    for (const failure of leaderboard.failures) lines.push(`- ${failure}`);
  }

  // Trailing empty entry gives the file a final newline.
  lines.push('');
  return lines.join('\n');
}
|
|
120
|
+
|
|
121
|
+
/**
 * Parses the command-line arguments of the leaderboard builder.
 *
 * Value flags: `--bundle`/`--dir` (repeatable), `--out-json`, `--out-md`,
 * `--schemas-dir`. Switches: `--json`, `--help`/`-h`.
 *
 * @param {string[]} [argv=process.argv.slice(2)] - Arguments to parse.
 * @returns {{bundleDirs: string[], outJson: string, outMd: string, json: boolean,
 *   schemasDir?: string, help?: boolean}} Parsed options.
 * @throws {Error} On an unrecognised argument, or when a value flag is not
 *   followed by a value.
 */
function parseArgs(argv = process.argv.slice(2)) {
  const args = {
    bundleDirs: [],
    outJson: 'benchmarks/output/leaderboard/guardbench-leaderboard.json',
    outMd: 'benchmarks/output/leaderboard/guardbench-leaderboard.md',
    json: false,
  };

  // Flags that consume the following argument, with how to store it.
  const valueFlags = new Map([
    ['--bundle', (value) => { args.bundleDirs.push(value); }],
    ['--dir', (value) => { args.bundleDirs.push(value); }],
    ['--out-json', (value) => { args.outJson = value; }],
    ['--out-md', (value) => { args.outMd = value; }],
    ['--schemas-dir', (value) => { args.schemasDir = value; }],
  ]);

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    const store = valueFlags.get(token);

    if (store) {
      const value = argv[i + 1];
      // A recognised flag without a value is a usage error, not an unknown argument.
      if (!value) throw new Error(`Missing value for argument: ${token}`);
      store(value);
      i += 1;
    } else if (token === '--json') {
      args.json = true;
    } else if (token === '--help' || token === '-h') {
      args.help = true;
    } else {
      throw new Error(`Unknown argument: ${token}`);
    }
  }

  return args;
}
|
|
140
|
+
|
|
141
|
+
/**
 * Returns the command-line help text, listing every flag the parser accepts.
 *
 * @returns {string} Multi-line usage message (no trailing newline).
 */
function usage() {
  return [
    'Usage: node benchmarks/build-leaderboard.mjs [--bundle <submission-bundle>] [--json]',
    '',
    'Builds ranked JSON and Markdown GuardBench leaderboard artifacts from verified',
    'submission bundles. Repeat --bundle for multiple systems.',
    '',
    'Options:',
    '  --bundle, --dir <path>  Submission bundle directory (default: benchmarks/output/submission-bundle)',
    '  --out-json <path>       JSON output (default: benchmarks/output/leaderboard/guardbench-leaderboard.json)',
    '  --out-md <path>         Markdown output (default: benchmarks/output/leaderboard/guardbench-leaderboard.md)',
    '  --schemas-dir <path>    Directory containing the schemas (default: benchmarks/schemas)',
    '  --json                  Print the leaderboard JSON to stdout',
    '  --help, -h              Show this help',
  ].join('\n');
}
|
|
149
|
+
|
|
150
|
+
/**
 * CLI entry point: parses arguments, writes the leaderboard artifacts and
 * reports where they went (or prints the JSON when `--json` is given).
 *
 * When any bundle failed verification the process exit code is set to 1.
 * The code is set through `process.exitCode` rather than `process.exit()`
 * so that output already queued on stdout is flushed before the process ends.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs();
  if (args.help) {
    console.log(usage());
    return;
  }

  const result = writeGuardBenchLeaderboard(args);
  if (args.json) {
    console.log(JSON.stringify(result.leaderboard, null, 2));
  } else {
    console.log(`GuardBench leaderboard JSON: ${result.outJson}`);
    console.log(`GuardBench leaderboard Markdown: ${result.outMd}`);
  }

  if (result.leaderboard.failures.length) {
    process.exitCode = 1;
  }
}
|
|
164
|
+
|
|
165
|
+
// Run the CLI only when this file is the script Node was started with, i.e.
// the resolved path of argv[1] ends with this module's file name.
const entryScriptPath = process.argv[1];
if (entryScriptPath && resolve(entryScriptPath).endsWith('build-leaderboard.mjs')) {
  main().catch((error) => {
    // Prefer the stack trace; fall back to the message when there is none.
    console.error(error.stack ?? error.message);
    process.exit(1);
  });
}
|