audrey 0.23.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250)
  1. package/CHANGELOG.md +81 -19
  2. package/LICENSE +21 -21
  3. package/README.md +209 -5
  4. package/SECURITY.md +2 -1
  5. package/benchmarks/adapter-kit.mjs +20 -0
  6. package/benchmarks/adapter-self-test.mjs +166 -0
  7. package/benchmarks/adapters/example-allow.mjs +28 -0
  8. package/benchmarks/adapters/mem0-platform.mjs +267 -0
  9. package/benchmarks/adapters/registry.json +51 -0
  10. package/benchmarks/adapters/zep-cloud.mjs +280 -0
  11. package/benchmarks/baselines.js +169 -0
  12. package/benchmarks/build-leaderboard.mjs +170 -0
  13. package/benchmarks/cases.js +537 -0
  14. package/benchmarks/create-conformance-card.mjs +139 -0
  15. package/benchmarks/create-submission-bundle.mjs +176 -0
  16. package/benchmarks/dry-run-external-adapters.mjs +165 -0
  17. package/benchmarks/guardbench.js +1035 -0
  18. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  19. package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  20. package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  21. package/benchmarks/output/guardbench-conformance-card.json +63 -0
  22. package/benchmarks/output/guardbench-manifest.json +414 -0
  23. package/benchmarks/output/guardbench-raw.json +1171 -0
  24. package/benchmarks/output/guardbench-summary.json +1981 -0
  25. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  26. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  27. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
  28. package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
  29. package/benchmarks/output/submission-bundle/guardbench-raw.json +1171 -0
  30. package/benchmarks/output/submission-bundle/guardbench-summary.json +1981 -0
  31. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
  32. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
  33. package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
  34. package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
  35. package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
  36. package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
  37. package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
  38. package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
  39. package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
  40. package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +164 -0
  41. package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
  42. package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +228 -0
  43. package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  44. package/benchmarks/output/submission-bundle/validation-report.json +31 -0
  45. package/benchmarks/output/summary.json +2354 -0
  46. package/benchmarks/perf-snapshot.js +304 -0
  47. package/benchmarks/perf.bench.js +161 -0
  48. package/benchmarks/public-paths.mjs +78 -0
  49. package/benchmarks/reference-results.js +70 -0
  50. package/benchmarks/report.js +259 -0
  51. package/benchmarks/run-external-guardbench.mjs +281 -0
  52. package/benchmarks/run.js +682 -0
  53. package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  54. package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  55. package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  56. package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  57. package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  58. package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  59. package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  60. package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  61. package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  62. package/benchmarks/schemas/guardbench-raw.schema.json +164 -0
  63. package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  64. package/benchmarks/schemas/guardbench-summary.schema.json +228 -0
  65. package/benchmarks/snapshots/perf-0.22.2.json +123 -0
  66. package/benchmarks/snapshots/perf-0.23.0.json +123 -0
  67. package/benchmarks/validate-adapter-module.mjs +104 -0
  68. package/benchmarks/validate-adapter-registry.mjs +134 -0
  69. package/benchmarks/validate-adapter-self-test.mjs +96 -0
  70. package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
  71. package/benchmarks/verify-external-evidence.mjs +296 -0
  72. package/benchmarks/verify-publication-artifacts.mjs +286 -0
  73. package/benchmarks/verify-submission-bundle.mjs +167 -0
  74. package/dist/mcp-server/config.d.ts +1 -1
  75. package/dist/mcp-server/config.d.ts.map +1 -1
  76. package/dist/mcp-server/config.js +1 -1
  77. package/dist/mcp-server/config.js.map +1 -1
  78. package/dist/mcp-server/index.d.ts +65 -3
  79. package/dist/mcp-server/index.d.ts.map +1 -1
  80. package/dist/mcp-server/index.js +675 -157
  81. package/dist/mcp-server/index.js.map +1 -1
  82. package/dist/src/action-key.d.ts +9 -0
  83. package/dist/src/action-key.d.ts.map +1 -0
  84. package/dist/src/action-key.js +49 -0
  85. package/dist/src/action-key.js.map +1 -0
  86. package/dist/src/adaptive.js +5 -5
  87. package/dist/src/affect.js +8 -8
  88. package/dist/src/audrey.d.ts +3 -0
  89. package/dist/src/audrey.d.ts.map +1 -1
  90. package/dist/src/audrey.js +55 -3
  91. package/dist/src/audrey.js.map +1 -1
  92. package/dist/src/capsule.js +4 -4
  93. package/dist/src/causal.js +3 -3
  94. package/dist/src/consolidate.js +48 -48
  95. package/dist/src/controller.d.ts +61 -5
  96. package/dist/src/controller.d.ts.map +1 -1
  97. package/dist/src/controller.js +230 -49
  98. package/dist/src/controller.js.map +1 -1
  99. package/dist/src/db.js +172 -172
  100. package/dist/src/decay.js +8 -8
  101. package/dist/src/embedding.d.ts +2 -1
  102. package/dist/src/embedding.d.ts.map +1 -1
  103. package/dist/src/embedding.js +39 -29
  104. package/dist/src/embedding.js.map +1 -1
  105. package/dist/src/encode.js +6 -6
  106. package/dist/src/feedback.d.ts +6 -0
  107. package/dist/src/feedback.d.ts.map +1 -1
  108. package/dist/src/feedback.js +6 -0
  109. package/dist/src/feedback.js.map +1 -1
  110. package/dist/src/forget.js +12 -12
  111. package/dist/src/hybrid-recall.js +9 -9
  112. package/dist/src/impact.js +6 -6
  113. package/dist/src/import.d.ts +3 -3
  114. package/dist/src/import.js +41 -41
  115. package/dist/src/index.d.ts +3 -3
  116. package/dist/src/index.d.ts.map +1 -1
  117. package/dist/src/index.js +2 -2
  118. package/dist/src/index.js.map +1 -1
  119. package/dist/src/interference.js +14 -14
  120. package/dist/src/introspect.js +18 -18
  121. package/dist/src/preflight.d.ts.map +1 -1
  122. package/dist/src/preflight.js +41 -0
  123. package/dist/src/preflight.js.map +1 -1
  124. package/dist/src/promote.js +7 -7
  125. package/dist/src/prompts.js +118 -118
  126. package/dist/src/recall.js +30 -30
  127. package/dist/src/reflexes.d.ts +1 -0
  128. package/dist/src/reflexes.d.ts.map +1 -1
  129. package/dist/src/reflexes.js +3 -0
  130. package/dist/src/reflexes.js.map +1 -1
  131. package/dist/src/rollback.js +4 -4
  132. package/dist/src/routes.d.ts.map +1 -1
  133. package/dist/src/routes.js +67 -1
  134. package/dist/src/routes.js.map +1 -1
  135. package/dist/src/validate.js +25 -25
  136. package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  137. package/docs/MEMORY_BENCHMARKING.md +59 -0
  138. package/docs/PRODUCTION_BACKLOG.md +304 -0
  139. package/docs/paper/00-master.md +48 -0
  140. package/docs/paper/01-introduction.md +27 -0
  141. package/docs/paper/02-related-work.md +47 -0
  142. package/docs/paper/03-problem-definition.md +108 -0
  143. package/docs/paper/04-design.md +164 -0
  144. package/docs/paper/05-guardbench-spec.md +412 -0
  145. package/docs/paper/06-implementation.md +113 -0
  146. package/docs/paper/07-evaluation.md +168 -0
  147. package/docs/paper/08-discussion-limitations.md +61 -0
  148. package/docs/paper/09-conclusion.md +11 -0
  149. package/docs/paper/SUBMISSION_README.md +162 -0
  150. package/docs/paper/appendix-a-demo-transcript.md +114 -0
  151. package/docs/paper/arxiv-compile-report.schema.json +116 -0
  152. package/docs/paper/arxiv-source.schema.json +61 -0
  153. package/docs/paper/audrey-paper-v1.md +1106 -0
  154. package/docs/paper/browser-launch-plan.json +209 -0
  155. package/docs/paper/browser-launch-plan.schema.json +100 -0
  156. package/docs/paper/browser-launch-results.json +86 -0
  157. package/docs/paper/browser-launch-results.schema.json +66 -0
  158. package/docs/paper/claim-register.json +138 -0
  159. package/docs/paper/claim-register.schema.json +81 -0
  160. package/docs/paper/evidence-ledger.md +103 -0
  161. package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  162. package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  163. package/docs/paper/output/arxiv/main.tex +949 -0
  164. package/docs/paper/output/arxiv/references.bib +222 -0
  165. package/docs/paper/output/arxiv-compile-report.json +24 -0
  166. package/docs/paper/output/submission-bundle/LICENSE +21 -0
  167. package/docs/paper/output/submission-bundle/README.md +533 -0
  168. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  169. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  170. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  171. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
  172. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
  173. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1171 -0
  174. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +1981 -0
  175. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  176. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  177. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  178. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
  179. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
  180. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  181. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  182. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  183. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  184. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  185. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  186. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  187. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  188. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  189. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +164 -0
  190. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  191. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +228 -0
  192. package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  193. package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
  194. package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
  195. package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
  196. package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
  197. package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
  198. package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
  199. package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
  200. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
  201. package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
  202. package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
  203. package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
  204. package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
  205. package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
  206. package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
  207. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
  208. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
  209. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
  210. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
  211. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
  212. package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
  213. package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
  214. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
  215. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  216. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  217. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
  218. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
  219. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
  220. package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
  221. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
  222. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
  223. package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
  224. package/docs/paper/output/submission-bundle/package.json +212 -0
  225. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
  226. package/docs/paper/paper-submission-bundle.schema.json +70 -0
  227. package/docs/paper/publication-pack.json +81 -0
  228. package/docs/paper/publication-pack.schema.json +60 -0
  229. package/docs/paper/references.bib +222 -0
  230. package/package.json +87 -4
  231. package/scripts/audit-release-completion.mjs +362 -0
  232. package/scripts/create-arxiv-source.mjs +362 -0
  233. package/scripts/create-paper-submission-bundle.mjs +210 -0
  234. package/scripts/finalize-release.mjs +526 -0
  235. package/scripts/prepare-release-cut.mjs +269 -0
  236. package/scripts/publish-release-bundle.mjs +209 -0
  237. package/scripts/publish-release-github-api.mjs +429 -0
  238. package/scripts/run-vitest.mjs +34 -0
  239. package/scripts/smoke-cli.js +72 -0
  240. package/scripts/sync-paper-artifacts.mjs +109 -0
  241. package/scripts/verify-arxiv-compile.mjs +440 -0
  242. package/scripts/verify-arxiv-source.mjs +194 -0
  243. package/scripts/verify-browser-launch-plan.mjs +237 -0
  244. package/scripts/verify-browser-launch-results.mjs +285 -0
  245. package/scripts/verify-paper-artifacts.mjs +338 -0
  246. package/scripts/verify-paper-claims.mjs +226 -0
  247. package/scripts/verify-paper-submission-bundle.mjs +207 -0
  248. package/scripts/verify-publication-pack.mjs +196 -0
  249. package/scripts/verify-python-package.py +201 -0
  250. package/scripts/verify-release-readiness.mjs +741 -0
@@ -0,0 +1,169 @@
1
+ import { createEmbeddingProvider } from '../dist/src/embedding.js';
2
+ import { cosineSimilarity } from '../dist/src/utils.js';
3
+
4
/**
 * Coerce a value to a lower-cased string.
 *
 * Any falsy input (null, undefined, '', 0, false, NaN) is treated as the
 * empty string before coercion.
 *
 * @param {*} text - Value to normalise.
 * @returns {string} Lower-cased string form of the input.
 */
function normalize(text) {
  const raw = text ? text : '';
  return String(raw).toLowerCase();
}
7
+
8
/**
 * Split text into lower-case alphanumeric tokens.
 *
 * Every run of characters outside [a-z0-9] (after normalisation) acts as a
 * separator; empty tokens are discarded.
 *
 * @param {*} text - Value to tokenise (normalised via `normalize`).
 * @returns {string[]} Tokens in order of appearance.
 */
function tokenize(text) {
  const cleaned = normalize(text).replace(/[^a-z0-9]+/g, ' ').trim();
  const tokens = [];
  for (const piece of cleaned.split(/\s+/)) {
    // Splitting an empty string yields [''], which must not become a token.
    if (piece) tokens.push(piece);
  }
  return tokens;
}
15
+
16
/**
 * Fraction of query tokens that occur in the tokenised content.
 *
 * Repeated query tokens are counted once per occurrence in `queryTokens`.
 *
 * @param {string[]} queryTokens - Already-tokenised query.
 * @param {*} content - Memory content; tokenised with `tokenize`.
 * @returns {number} Value in [0, 1]; 0 when the query has no tokens.
 */
function keywordScore(queryTokens, content) {
  const vocabulary = new Set(tokenize(content));
  const total = queryTokens.length;
  if (total === 0) return 0;
  const hits = queryTokens.reduce(
    (count, token) => (vocabulary.has(token) ? count + 1 : count),
    0,
  );
  return hits / total;
}
25
+
26
/**
 * Rank rows by score, highest first.
 *
 * Rows whose score is not a finite number are dropped. Ties are broken by
 * `createdAt` compared as strings, later values first (missing values sort
 * as the empty string). The input array is not modified.
 *
 * @param {Array<{score: number, createdAt?: string}>} rows
 * @returns {Array} New array containing the ranked rows.
 */
function sortByScore(rows) {
  const ranked = rows.filter(row => Number.isFinite(row.score));
  ranked.sort((left, right) => {
    const byScore = right.score - left.score;
    if (byScore) return byScore;
    const rightStamp = String(right.createdAt || '');
    const leftStamp = String(left.createdAt || '');
    return rightStamp.localeCompare(leftStamp);
  });
  return ranked;
}
31
+
32
/**
 * Turn the memories of a benchmark case into flat retrieval rows.
 *
 * @param {{memory: Array<object>}} benchmarkCase - Case whose `memory` array is flattened.
 * @param {string[]} [ids] - Optional ids by position; missing entries fall back
 *   to `memory-<position + 1>`.
 * @returns {Array<{id: string, content: *, source: *, createdAt: string, private: boolean}>}
 */
function flattenMemories(benchmarkCase, ids = []) {
  const toRow = (entry, position) => {
    const ordinal = position + 1;
    // Memories without a timestamp get a deterministic one: the Nth memory
    // is dated the Nth day of January 2026 (UTC).
    const fallbackStamp = () => new Date(Date.UTC(2026, 0, ordinal)).toISOString();
    return {
      id: ids[position] || `memory-${ordinal}`,
      content: entry.content,
      source: entry.source,
      createdAt: entry.createdAt || fallbackStamp(),
      private: Boolean(entry.private),
    };
  };
  return benchmarkCase.memory.map(toRow);
}
41
+
42
/**
 * Wrap a query and a list of memories in the shape of a benchmark case.
 *
 * Only `content`, `source`, `createdAt` and `private` are copied from each
 * memory; any other field (including `id`) is dropped.
 *
 * @param {*} query - Query to attach to the case.
 * @param {Array<object>} memories - Memories to copy.
 * @param {object} [options] - Options object attached to the case as-is.
 * @returns {{query: *, memory: Array<object>, options: object}}
 */
function buildSyntheticCase(query, memories, options = {}) {
  const memory = memories.map(
    ({ content, source, createdAt, private: isPrivate }) => ({
      content,
      source,
      createdAt,
      private: isPrivate,
    }),
  );
  return { query, memory, options };
}
54
+
55
/**
 * Dispatch a retrieval to the baseline named by `system`.
 *
 * @param {string} system - 'Vector Only', 'Keyword + Recency' or 'Recent Window'.
 * @param {object} syntheticCase - Benchmark case passed to the baseline.
 * @param {*} providerConfig - Forwarded to the vector baseline only.
 * @param {number} [limit=5] - Maximum number of rows requested.
 * @returns {Promise<Array>} Rows returned by the selected baseline.
 * @throws {Error} (as a rejection) when `system` is not a known baseline.
 */
async function runBaselineRetrieval(system, syntheticCase, providerConfig, limit = 5) {
  if (system === 'Vector Only') {
    return runVectorOnlyBaseline(syntheticCase, providerConfig, limit);
  }
  if (system === 'Keyword + Recency') {
    return runKeywordRecencyBaseline(syntheticCase, limit);
  }
  if (system === 'Recent Window') {
    return runRecentWindowBaseline(syntheticCase, limit);
  }
  throw new Error(`Unknown baseline system: ${system}`);
}
67
+
68
/**
 * Build the memory record for an `encode` step and advance the state counter.
 *
 * @param {{counter: number}} state - Scenario state; `counter` is incremented.
 * @param {{memory: object}} step - Step whose `memory` payload is recorded.
 * @returns {{id: string, content: *, source: *, createdAt: string, private: boolean}}
 */
function createOperationMemory(state, step) {
  const index = state.counter++;
  const ordinal = index + 1;
  const { content, source, createdAt, private: isPrivate } = step.memory;
  return {
    id: `memory-${ordinal}`,
    content,
    source,
    // Untimestamped memories are dated by creation order: Nth day of January 2026 (UTC).
    createdAt: createdAt || new Date(Date.UTC(2026, 0, ordinal)).toISOString(),
    private: Boolean(isPrivate),
  };
}
78
+
79
/**
 * Apply one scenario step to the in-memory baseline state.
 *
 * Supported step types:
 * - `encode`: append a new memory (and record `step.saveAs` as an alias for its id).
 * - `forgetByQuery`: run a top-1 retrieval for `step.query` and drop the matched
 *   memory when its score is finite and positive.
 * - `consolidate`: no-op for baselines.
 *
 * @param {string} system - Baseline name, forwarded to `runBaselineRetrieval`.
 * @param {{counter: number, memories: Array<object>, aliases: Map}} state - Mutated in place.
 * @param {object} step - Step descriptor with a `type` field.
 * @param {*} providerConfig - Forwarded to `runBaselineRetrieval`.
 * @returns {Promise<void>}
 * @throws {Error} (as a rejection) for an unsupported step type.
 */
async function applyBaselineStep(system, state, step, providerConfig) {
  if (step.type === 'encode') {
    const memory = createOperationMemory(state, step);
    state.memories.push(memory);
    if (step.saveAs) {
      state.aliases.set(step.saveAs, memory.id);
    }
    return;
  }

  if (step.type === 'forgetByQuery') {
    const syntheticCase = buildSyntheticCase(step.query, state.memories, step.options);
    const [match] = await runBaselineRetrieval(system, syntheticCase, providerConfig, 1);
    if (match && Number.isFinite(match.score) && match.score > 0) {
      // The synthetic case carries no ids, so the baselines label rows by
      // position (`memory-<index + 1>`). Those labels only coincide with the
      // ids stored in state until the first forget; resolve the match back to
      // the memory at that position instead of comparing ids directly.
      const target = state.memories.find((_, index) => `memory-${index + 1}` === match.id);
      if (target) {
        state.memories = state.memories.filter(memory => memory !== target);
      }
    }
    return;
  }

  if (step.type === 'consolidate') {
    return;
  }

  throw new Error(`Unsupported baseline step: ${step.type}`);
}
104
+
105
/**
 * Run a benchmark case against a baseline system.
 *
 * Cases whose `kind` is not 'operations' are retrieved directly. For
 * 'operations' cases, every step is applied in order to a fresh state and the
 * final query is then run against the memories that remain.
 *
 * @param {string} system - Baseline name.
 * @param {object} benchmarkCase - Case to evaluate.
 * @param {*} providerConfig - Forwarded to step application and retrieval.
 * @param {number} [limit=5] - Maximum number of rows requested.
 * @returns {Promise<Array>} Rows returned by the baseline retrieval.
 */
export async function runBaselineScenario(system, benchmarkCase, providerConfig, limit = 5) {
  let caseToQuery = benchmarkCase;

  if (benchmarkCase.kind === 'operations') {
    const state = { counter: 0, memories: [], aliases: new Map() };
    const steps = benchmarkCase.steps || [];
    // Steps depend on the state left by their predecessors, so run them one at a time.
    for (const step of steps) {
      await applyBaselineStep(system, state, step, providerConfig);
    }
    caseToQuery = buildSyntheticCase(benchmarkCase.query, state.memories, benchmarkCase.options);
  }

  return runBaselineRetrieval(system, caseToQuery, providerConfig, limit);
}
127
+
128
/**
 * Keyword-overlap baseline: score each memory by the fraction of query tokens
 * it contains, then rank via `sortByScore`.
 *
 * @param {{query: *, memory: Array<object>}} benchmarkCase
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {Array<object>} Ranked rows, each tagged `type: 'episodic'` with a `score`.
 */
export function runKeywordRecencyBaseline(benchmarkCase, limit = 5) {
  const queryTokens = tokenize(benchmarkCase.query);
  const scored = [];
  for (const memory of flattenMemories(benchmarkCase)) {
    const score = keywordScore(queryTokens, memory.content);
    scored.push({ ...memory, type: 'episodic', score });
  }
  return sortByScore(scored).slice(0, limit);
}
136
+
137
/**
 * Recency-only baseline: return the most recently created memories,
 * ignoring the query entirely.
 *
 * Scores are synthetic: the newest memory gets 1 and each following row
 * 0.1 less.
 *
 * @param {{memory: Array<object>}} benchmarkCase
 * @param {number} [limit=3] - Size of the recency window.
 * @returns {Array<object>} Newest-first rows tagged `type: 'episodic'` with a `score`.
 */
export function runRecentWindowBaseline(benchmarkCase, limit = 3) {
  const newestFirst = flattenMemories(benchmarkCase);
  newestFirst.sort((older, newer) => String(newer.createdAt).localeCompare(String(older.createdAt)));
  const recent = newestFirst.slice(0, limit);
  return recent.map((memory, index) => {
    const score = 1 - index * 0.1;
    return { ...memory, type: 'episodic', score };
  });
}
147
+
148
/**
 * Embedding-similarity baseline: embed the query and every memory with the
 * configured provider, score each memory with `cosineSimilarity`, and rank
 * via `sortByScore`.
 *
 * @param {{query: *, memory: Array<object>}} benchmarkCase
 * @param {*} providerConfig - Passed to `createEmbeddingProvider`.
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {Promise<Array<object>>} Ranked rows tagged `type: 'episodic'` with a `score`.
 */
export async function runVectorOnlyBaseline(benchmarkCase, providerConfig, limit = 5) {
  const provider = createEmbeddingProvider(providerConfig);
  const hasReadyHook = typeof provider.ready === 'function';
  if (hasReadyHook) {
    await provider.ready();
  }

  const queryEmbedding = await provider.embed(benchmarkCase.query);
  const queryBuffer = provider.vectorToBuffer(queryEmbedding);

  const scored = [];
  // Memories are embedded one after another, in case order.
  for (const memory of flattenMemories(benchmarkCase)) {
    const memoryEmbedding = await provider.embed(memory.content);
    const memoryBuffer = provider.vectorToBuffer(memoryEmbedding);
    const score = cosineSimilarity(queryBuffer, memoryBuffer, provider);
    scored.push({ ...memory, type: 'episodic', score });
  }

  return sortByScore(scored).slice(0, limit);
}
@@ -0,0 +1,170 @@
1
+ import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
2
+ import { dirname, join, resolve } from 'node:path';
3
+ import { verifyGuardBenchSubmissionBundle } from './verify-submission-bundle.mjs';
4
+ import { validateSchema } from './validate-guardbench-artifacts.mjs';
5
+ import { publicPath } from './public-paths.mjs';
6
+
7
/**
 * Read a UTF-8 file and parse its contents as JSON.
 *
 * @param {string} path - File to read.
 * @returns {*} Parsed JSON value.
 */
function readJson(path) {
  const raw = readFileSync(path, 'utf-8');
  return JSON.parse(raw);
}
10
+
11
/**
 * Format a ratio as a percentage with one decimal place.
 *
 * @param {?number} value - Ratio (1 means 100%); null/undefined means unknown.
 * @returns {string} e.g. '12.3%', or 'n/a' when the value is null/undefined.
 */
function percent(value) {
  if (value === null || value === undefined) {
    return 'n/a';
  }
  const scaled = value * 100;
  return `${scaled.toFixed(1)}%`;
}
14
+
15
/**
 * Render a possibly-missing value for display.
 *
 * @param {*} value - Value to render; null/undefined means unknown.
 * @returns {string} `String(value)`, or 'n/a' when the value is null/undefined.
 */
function number(value) {
  if (value === null || value === undefined) {
    return 'n/a';
  }
  return String(value);
}
18
+
19
/**
 * Build one leaderboard row from a submission bundle directory.
 *
 * The bundle is verified first, then its `submission-manifest.json` is read.
 *
 * @param {string} dir - Bundle directory (resolved against the current working directory).
 * @returns {{subject: *, score: *, conformance: *, source: object, verification: *}}
 */
function rowFromBundle(dir) {
  const verification = verifyGuardBenchSubmissionBundle({ dir });
  const absoluteDir = resolve(dir);
  const manifest = readJson(join(absoluteDir, 'submission-manifest.json'));
  const { subject, score, conformance, generatedAt, files } = manifest;

  const source = {
    dir: publicPath(absoluteDir),
    manifestGeneratedAt: generatedAt,
    fileCount: files?.length ?? 0,
  };

  return { subject, score, conformance, source, verification };
}
34
+
35
/**
 * Sort comparator for leaderboard rows.
 *
 * Criteria, in priority order: verified bundles first, conformant adapters
 * first, then higher fullContractPassRate, decisionAccuracy and
 * evidenceRecall, then fewer redactionLeaks, then lower p95 latency, and
 * finally subject name. Missing rate metrics rank below any real value;
 * missing leak/latency metrics rank above (worse than) any real value.
 *
 * @param {object} a - First row.
 * @param {object} b - Second row.
 * @returns {number} Negative when `a` ranks first, positive when `b` does.
 */
function compareRows(a, b) {
  const unknownCost = Number.MAX_SAFE_INTEGER;
  // Each entry is evaluated lazily so later criteria are only read when all
  // earlier ones tie.
  const criteria = [
    () => Number(b.verification.ok) - Number(a.verification.ok),
    () => Number(b.conformance.ok) - Number(a.conformance.ok),
    () => (b.score.fullContractPassRate ?? -1) - (a.score.fullContractPassRate ?? -1),
    () => (b.score.decisionAccuracy ?? -1) - (a.score.decisionAccuracy ?? -1),
    () => (b.score.evidenceRecall ?? -1) - (a.score.evidenceRecall ?? -1),
    () => (a.score.redactionLeaks ?? unknownCost) - (b.score.redactionLeaks ?? unknownCost),
    () => (a.score.latency?.p95Ms ?? unknownCost) - (b.score.latency?.p95Ms ?? unknownCost),
  ];
  for (const criterion of criteria) {
    const difference = criterion();
    if (difference) return difference;
  }
  return a.subject.name.localeCompare(b.subject.name);
}
47
+
48
/**
 * Build the leaderboard document from one or more submission bundles.
 *
 * @param {{bundleDirs?: string[]}} [options] - `bundleDirs` lists the bundles to
 *   rank; when absent or empty, 'benchmarks/output/submission-bundle' is used.
 * @returns {{schemaVersion: string, suite: string, generatedAt: string,
 *   ranking: string[], rows: object[], failures: string[]}} Leaderboard with
 *   ranked rows and every verification failure prefixed by its subject name.
 */
export function buildGuardBenchLeaderboard(options = {}) {
  const requested = options.bundleDirs;
  const bundleDirs = requested?.length ? requested : ['benchmarks/output/submission-bundle'];

  const ordered = bundleDirs.map(rowFromBundle).sort(compareRows);
  const rows = ordered.map((row, position) => ({ rank: position + 1, ...row }));

  const generatedAt = new Date().toISOString();

  const failures = [];
  for (const row of rows) {
    for (const failure of row.verification.failures) {
      failures.push(`${row.subject.name}: ${failure}`);
    }
  }

  return {
    schemaVersion: '1.0.0',
    suite: 'GuardBench leaderboard',
    generatedAt,
    // Human-readable description of the sort criteria, in priority order.
    ranking: [
      'verified bundle',
      'adapter conformance',
      'fullContractPassRate',
      'decisionAccuracy',
      'evidenceRecall',
      'redactionLeaks ascending',
      'latency.p95Ms ascending',
      'subject.name',
    ],
    rows,
    failures,
  };
}
72
+
73
/**
 * Build the leaderboard, validate it against its JSON schema, and write the
 * JSON and Markdown artifacts to disk.
 *
 * @param {object} [options]
 * @param {string} [options.outJson] - JSON output path
 *   (default 'benchmarks/output/leaderboard/guardbench-leaderboard.json').
 * @param {string} [options.outMd] - Markdown output path
 *   (default 'benchmarks/output/leaderboard/guardbench-leaderboard.md').
 * @param {string} [options.schemasDir] - Directory holding
 *   'guardbench-leaderboard.schema.json' (default 'benchmarks/schemas').
 * @returns {{leaderboard: object, outJson: string, outMd: string}} The
 *   leaderboard and the resolved output paths.
 * @throws {Error} When schema validation reports any error; nothing is written in that case.
 */
export function writeGuardBenchLeaderboard(options = {}) {
  const resolveOr = (candidate, fallback) => resolve(candidate ?? fallback);
  const outJson = resolveOr(options.outJson, 'benchmarks/output/leaderboard/guardbench-leaderboard.json');
  const outMd = resolveOr(options.outMd, 'benchmarks/output/leaderboard/guardbench-leaderboard.md');
  const schemasDir = resolveOr(options.schemasDir, 'benchmarks/schemas');

  const leaderboard = buildGuardBenchLeaderboard(options);
  const schemaPath = join(schemasDir, 'guardbench-leaderboard.schema.json');
  const schemaErrors = validateSchema(leaderboard, readJson(schemaPath), 'guardbench-leaderboard');
  if (schemaErrors.length) {
    throw new Error(`GuardBench leaderboard schema validation failed: ${schemaErrors.join('; ')}`);
  }

  for (const target of [outJson, outMd]) {
    mkdirSync(dirname(target), { recursive: true });
  }
  const serialized = JSON.stringify(leaderboard, null, 2);
  writeFileSync(outJson, `${serialized}\n`, 'utf-8');
  writeFileSync(outMd, renderMarkdown(leaderboard), 'utf-8');

  return { leaderboard, outJson, outMd };
}
89
+
90
/**
 * Render a leaderboard document as a Markdown report.
 *
 * The report contains a title, the generation timestamp, one table row per
 * leaderboard row and, when present, a bullet list of verification failures.
 * Pipe characters and line breaks inside cell values are neutralised so that
 * a subject name or bundle path cannot break the table layout.
 *
 * @param {{generatedAt: string, rows: object[], failures: string[]}} leaderboard
 * @returns {string} Markdown text ending with a newline.
 */
export function renderMarkdown(leaderboard) {
  // null/undefined render as an empty cell, matching Array.prototype.join.
  const escapeCell = value => (
    value == null
      ? ''
      : String(value).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ')
  );

  const lines = [
    '# GuardBench Leaderboard',
    '',
    `Generated: ${leaderboard.generatedAt}`,
    '',
    '| Rank | Subject | Verified | Conformant | Full Contract | Decision Accuracy | Evidence Recall | Redaction Leaks | p95 Latency | Bundle |',
    '|---:|---|---:|---:|---:|---:|---:|---:|---:|---|',
  ];

  for (const row of leaderboard.rows) {
    const cells = [
      row.rank,
      row.subject.name,
      row.verification.ok ? 'yes' : 'no',
      row.conformance.ok ? 'yes' : 'no',
      percent(row.score.fullContractPassRate),
      percent(row.score.decisionAccuracy),
      percent(row.score.evidenceRecall),
      number(row.score.redactionLeaks),
      row.score.latency?.p95Ms == null ? 'n/a' : `${row.score.latency.p95Ms}ms`,
      row.source.dir,
    ];
    lines.push(`| ${cells.map(escapeCell).join(' | ')} |`);
  }

  if (leaderboard.failures.length) {
    lines.push('', '## Verification Failures', '');
    for (const failure of leaderboard.failures) lines.push(`- ${failure}`);
  }

  // Trailing empty entry gives the document a final newline.
  lines.push('');
  return lines.join('\n');
}
120
+
121
/**
 * Parse command-line arguments for the leaderboard builder.
 *
 * Recognised flags: `--bundle`/`--dir <path>` (repeatable), `--out-json <path>`,
 * `--out-md <path>`, `--schemas-dir <path>`, `--json`, `--help`/`-h`.
 *
 * @param {string[]} [argv] - Arguments to parse; defaults to `process.argv.slice(2)`.
 * @returns {{bundleDirs: string[], outJson: string, outMd: string, json: boolean,
 *   schemasDir?: string, help?: boolean}} Parsed options.
 * @throws {Error} On an unknown argument, or when a flag that takes a value
 *   has none (missing or empty).
 */
function parseArgs(argv = process.argv.slice(2)) {
  const args = {
    bundleDirs: [],
    outJson: 'benchmarks/output/leaderboard/guardbench-leaderboard.json',
    outMd: 'benchmarks/output/leaderboard/guardbench-leaderboard.md',
    json: false,
  };

  // Report a missing value as such, rather than as an unknown argument.
  const valueAfter = (flag, index) => {
    const value = argv[index + 1];
    if (value === undefined || value === '') {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    switch (token) {
      case '--bundle':
      case '--dir':
        args.bundleDirs.push(valueAfter(token, i));
        i += 1;
        break;
      case '--out-json':
        args.outJson = valueAfter(token, i);
        i += 1;
        break;
      case '--out-md':
        args.outMd = valueAfter(token, i);
        i += 1;
        break;
      case '--schemas-dir':
        args.schemasDir = valueAfter(token, i);
        i += 1;
        break;
      case '--json':
        args.json = true;
        break;
      case '--help':
      case '-h':
        args.help = true;
        break;
      default:
        throw new Error(`Unknown argument: ${token}`);
    }
  }
  return args;
}
140
+
141
/**
 * Help text for the leaderboard CLI.
 *
 * Lists every option the argument parser accepts together with its default.
 *
 * @returns {string} Multi-line usage text (no trailing newline).
 */
function usage() {
  return [
    'Usage: node benchmarks/build-leaderboard.mjs [--bundle <submission-bundle>] [--json]',
    '',
    'Builds ranked JSON and Markdown GuardBench leaderboard artifacts from verified',
    'submission bundles. Repeat --bundle for multiple systems.',
    '',
    'Options:',
    '  --bundle, --dir <path>  Submission bundle directory; repeatable',
    '                          (default: benchmarks/output/submission-bundle)',
    '  --out-json <path>       JSON output path',
    '                          (default: benchmarks/output/leaderboard/guardbench-leaderboard.json)',
    '  --out-md <path>         Markdown output path',
    '                          (default: benchmarks/output/leaderboard/guardbench-leaderboard.md)',
    '  --schemas-dir <path>    Directory containing guardbench-leaderboard.schema.json',
    '                          (default: benchmarks/schemas)',
    '  --json                  Print the leaderboard JSON instead of the output paths',
    '  -h, --help              Show this help text',
  ].join('\n');
}
149
+
150
/**
 * CLI entry point: parse arguments, write the leaderboard artifacts, and
 * report the result on stdout.
 *
 * With `--help` only the usage text is printed. With `--json` the leaderboard
 * JSON is printed; otherwise the two output paths are printed. When the
 * leaderboard contains verification failures the process exit code is set to 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs();
  if (args.help) {
    console.log(usage());
    return;
  }

  const result = writeGuardBenchLeaderboard(args);
  if (args.json) {
    console.log(JSON.stringify(result.leaderboard, null, 2));
  } else {
    console.log(`GuardBench leaderboard JSON: ${result.outJson}`);
    console.log(`GuardBench leaderboard Markdown: ${result.outMd}`);
  }

  // Set the exit code instead of calling process.exit(): a forced exit can
  // cut off stdout that has not been flushed yet (e.g. a large --json dump
  // written to a pipe).
  if (result.leaderboard.failures.length) process.exitCode = 1;
}
164
+
165
// Run the CLI only when this module is the script Node was started with
// (the resolved script path ends with this file's name); importing the
// module from elsewhere must not trigger a build.
if (process.argv[1]) {
  if (resolve(process.argv[1]).endsWith('build-leaderboard.mjs')) {
    main().catch((failure) => {
      // Prefer the stack trace; fall back to the bare message when there is none.
      console.error(failure.stack ?? failure.message);
      process.exit(1);
    });
  }
}