audrey 0.23.1 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/CHANGELOG.md +101 -15
  2. package/LICENSE +21 -21
  3. package/README.md +232 -6
  4. package/SECURITY.md +2 -1
  5. package/benchmarks/adapter-kit.mjs +20 -0
  6. package/benchmarks/adapter-self-test.mjs +166 -0
  7. package/benchmarks/adapters/example-allow.mjs +28 -0
  8. package/benchmarks/adapters/mem0-platform.mjs +267 -0
  9. package/benchmarks/adapters/registry.json +51 -0
  10. package/benchmarks/adapters/zep-cloud.mjs +280 -0
  11. package/benchmarks/baselines.js +169 -0
  12. package/benchmarks/build-leaderboard.mjs +170 -0
  13. package/benchmarks/cases.js +537 -0
  14. package/benchmarks/create-conformance-card.mjs +139 -0
  15. package/benchmarks/create-submission-bundle.mjs +176 -0
  16. package/benchmarks/dry-run-external-adapters.mjs +165 -0
  17. package/benchmarks/guardbench.js +1125 -0
  18. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  19. package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  20. package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  21. package/benchmarks/output/guardbench-conformance-card.json +63 -0
  22. package/benchmarks/output/guardbench-manifest.json +414 -0
  23. package/benchmarks/output/guardbench-raw.json +1271 -0
  24. package/benchmarks/output/guardbench-summary.json +2107 -0
  25. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  26. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  27. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
  28. package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
  29. package/benchmarks/output/submission-bundle/guardbench-raw.json +1271 -0
  30. package/benchmarks/output/submission-bundle/guardbench-summary.json +2107 -0
  31. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
  32. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
  33. package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
  34. package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
  35. package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
  36. package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
  37. package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
  38. package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
  39. package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
  40. package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +184 -0
  41. package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
  42. package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +249 -0
  43. package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  44. package/benchmarks/output/submission-bundle/validation-report.json +31 -0
  45. package/benchmarks/output/summary.json +2354 -0
  46. package/benchmarks/perf-snapshot.js +304 -0
  47. package/benchmarks/perf.bench.js +161 -0
  48. package/benchmarks/public-paths.mjs +78 -0
  49. package/benchmarks/reference-results.js +70 -0
  50. package/benchmarks/report.js +259 -0
  51. package/benchmarks/run-external-guardbench.mjs +281 -0
  52. package/benchmarks/run.js +682 -0
  53. package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  54. package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  55. package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  56. package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  57. package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  58. package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  59. package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  60. package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  61. package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  62. package/benchmarks/schemas/guardbench-raw.schema.json +184 -0
  63. package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  64. package/benchmarks/schemas/guardbench-summary.schema.json +249 -0
  65. package/benchmarks/snapshots/perf-0.22.2.json +123 -0
  66. package/benchmarks/snapshots/perf-0.23.0.json +123 -0
  67. package/benchmarks/validate-adapter-module.mjs +104 -0
  68. package/benchmarks/validate-adapter-registry.mjs +134 -0
  69. package/benchmarks/validate-adapter-self-test.mjs +96 -0
  70. package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
  71. package/benchmarks/verify-external-evidence.mjs +296 -0
  72. package/benchmarks/verify-publication-artifacts.mjs +286 -0
  73. package/benchmarks/verify-submission-bundle.mjs +167 -0
  74. package/dist/mcp-server/config.d.ts +1 -1
  75. package/dist/mcp-server/config.d.ts.map +1 -1
  76. package/dist/mcp-server/config.js +1 -1
  77. package/dist/mcp-server/config.js.map +1 -1
  78. package/dist/mcp-server/index.d.ts +65 -3
  79. package/dist/mcp-server/index.d.ts.map +1 -1
  80. package/dist/mcp-server/index.js +675 -157
  81. package/dist/mcp-server/index.js.map +1 -1
  82. package/dist/src/action-key.d.ts +9 -0
  83. package/dist/src/action-key.d.ts.map +1 -0
  84. package/dist/src/action-key.js +49 -0
  85. package/dist/src/action-key.js.map +1 -0
  86. package/dist/src/adaptive.js +5 -5
  87. package/dist/src/affect.js +8 -8
  88. package/dist/src/audrey.d.ts +13 -0
  89. package/dist/src/audrey.d.ts.map +1 -1
  90. package/dist/src/audrey.js +68 -3
  91. package/dist/src/audrey.js.map +1 -1
  92. package/dist/src/capsule.js +4 -4
  93. package/dist/src/causal.js +3 -3
  94. package/dist/src/consolidate.js +48 -48
  95. package/dist/src/controller.d.ts +78 -6
  96. package/dist/src/controller.d.ts.map +1 -1
  97. package/dist/src/controller.js +273 -53
  98. package/dist/src/controller.js.map +1 -1
  99. package/dist/src/db.js +172 -172
  100. package/dist/src/decay.js +8 -8
  101. package/dist/src/embedding.d.ts +2 -1
  102. package/dist/src/embedding.d.ts.map +1 -1
  103. package/dist/src/embedding.js +39 -29
  104. package/dist/src/embedding.js.map +1 -1
  105. package/dist/src/encode.js +6 -6
  106. package/dist/src/feedback.d.ts +6 -0
  107. package/dist/src/feedback.d.ts.map +1 -1
  108. package/dist/src/feedback.js +6 -0
  109. package/dist/src/feedback.js.map +1 -1
  110. package/dist/src/forget.js +12 -12
  111. package/dist/src/hybrid-recall.js +9 -9
  112. package/dist/src/impact.js +6 -6
  113. package/dist/src/import.d.ts +3 -3
  114. package/dist/src/import.js +41 -41
  115. package/dist/src/index.d.ts +5 -4
  116. package/dist/src/index.d.ts.map +1 -1
  117. package/dist/src/index.js +3 -3
  118. package/dist/src/index.js.map +1 -1
  119. package/dist/src/interference.js +14 -14
  120. package/dist/src/introspect.js +18 -18
  121. package/dist/src/preflight.d.ts.map +1 -1
  122. package/dist/src/preflight.js +41 -0
  123. package/dist/src/preflight.js.map +1 -1
  124. package/dist/src/promote.js +7 -7
  125. package/dist/src/prompts.js +118 -118
  126. package/dist/src/recall.js +30 -30
  127. package/dist/src/reflexes.d.ts +1 -0
  128. package/dist/src/reflexes.d.ts.map +1 -1
  129. package/dist/src/reflexes.js +3 -0
  130. package/dist/src/reflexes.js.map +1 -1
  131. package/dist/src/rollback.js +4 -4
  132. package/dist/src/routes.d.ts.map +1 -1
  133. package/dist/src/routes.js +71 -2
  134. package/dist/src/routes.js.map +1 -1
  135. package/dist/src/validate.js +25 -25
  136. package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  137. package/docs/MEMORY_BENCHMARKING.md +59 -0
  138. package/docs/PRODUCTION_BACKLOG.md +304 -0
  139. package/docs/paper/00-master.md +48 -0
  140. package/docs/paper/01-introduction.md +27 -0
  141. package/docs/paper/02-related-work.md +47 -0
  142. package/docs/paper/03-problem-definition.md +108 -0
  143. package/docs/paper/04-design.md +164 -0
  144. package/docs/paper/05-guardbench-spec.md +412 -0
  145. package/docs/paper/06-implementation.md +113 -0
  146. package/docs/paper/07-evaluation.md +168 -0
  147. package/docs/paper/08-discussion-limitations.md +61 -0
  148. package/docs/paper/09-conclusion.md +11 -0
  149. package/docs/paper/SUBMISSION_README.md +162 -0
  150. package/docs/paper/appendix-a-demo-transcript.md +114 -0
  151. package/docs/paper/arxiv-compile-report.schema.json +116 -0
  152. package/docs/paper/arxiv-source.schema.json +61 -0
  153. package/docs/paper/audrey-paper-v1.md +1106 -0
  154. package/docs/paper/browser-launch-plan.json +209 -0
  155. package/docs/paper/browser-launch-plan.schema.json +100 -0
  156. package/docs/paper/browser-launch-results.json +86 -0
  157. package/docs/paper/browser-launch-results.schema.json +66 -0
  158. package/docs/paper/claim-register.json +138 -0
  159. package/docs/paper/claim-register.schema.json +81 -0
  160. package/docs/paper/evidence-ledger.md +103 -0
  161. package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  162. package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  163. package/docs/paper/output/arxiv/main.tex +949 -0
  164. package/docs/paper/output/arxiv/references.bib +222 -0
  165. package/docs/paper/output/arxiv-compile-report.json +24 -0
  166. package/docs/paper/output/submission-bundle/LICENSE +21 -0
  167. package/docs/paper/output/submission-bundle/README.md +555 -0
  168. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  169. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  170. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  171. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
  172. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
  173. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1271 -0
  174. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +2107 -0
  175. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  176. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  177. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  178. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
  179. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
  180. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  181. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  182. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  183. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  184. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  185. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  186. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  187. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  188. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  189. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +184 -0
  190. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  191. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +249 -0
  192. package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  193. package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
  194. package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
  195. package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
  196. package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
  197. package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
  198. package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
  199. package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
  200. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
  201. package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
  202. package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
  203. package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
  204. package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
  205. package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
  206. package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
  207. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
  208. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
  209. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
  210. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
  211. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
  212. package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
  213. package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
  214. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
  215. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  216. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  217. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
  218. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
  219. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
  220. package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
  221. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
  222. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
  223. package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
  224. package/docs/paper/output/submission-bundle/package.json +212 -0
  225. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
  226. package/docs/paper/paper-submission-bundle.schema.json +70 -0
  227. package/docs/paper/publication-pack.json +81 -0
  228. package/docs/paper/publication-pack.schema.json +60 -0
  229. package/docs/paper/references.bib +222 -0
  230. package/package.json +87 -4
  231. package/scripts/audit-release-completion.mjs +362 -0
  232. package/scripts/create-arxiv-source.mjs +362 -0
  233. package/scripts/create-paper-submission-bundle.mjs +210 -0
  234. package/scripts/finalize-release.mjs +526 -0
  235. package/scripts/prepare-release-cut.mjs +269 -0
  236. package/scripts/publish-release-bundle.mjs +209 -0
  237. package/scripts/publish-release-github-api.mjs +429 -0
  238. package/scripts/run-vitest.mjs +34 -0
  239. package/scripts/smoke-cli.js +92 -0
  240. package/scripts/sync-paper-artifacts.mjs +109 -0
  241. package/scripts/verify-arxiv-compile.mjs +440 -0
  242. package/scripts/verify-arxiv-source.mjs +194 -0
  243. package/scripts/verify-browser-launch-plan.mjs +237 -0
  244. package/scripts/verify-browser-launch-results.mjs +285 -0
  245. package/scripts/verify-paper-artifacts.mjs +338 -0
  246. package/scripts/verify-paper-claims.mjs +226 -0
  247. package/scripts/verify-paper-submission-bundle.mjs +207 -0
  248. package/scripts/verify-publication-pack.mjs +196 -0
  249. package/scripts/verify-python-package.py +201 -0
  250. package/scripts/verify-release-readiness.mjs +785 -0
@@ -0,0 +1,259 @@
1
+ import { mkdirSync, writeFileSync } from 'node:fs';
2
+ import { join } from 'node:path';
3
+
4
// Color tokens shared by the SVG chart renderer and the HTML report styles.
const PALETTE = {
  // Bar fills, selected per system label by chartBarColor().
  audrey: '#0f766e',
  vector: '#0369a1',
  keyword: '#6d28d9',
  recent: '#b45309',
  external: '#1d4ed8',
  // Text colors: primary and secondary.
  accent: '#111827',
  muted: '#6b7280',
  // Page background and outlines / grid lines.
  surface: '#f8fafc',
  border: '#cbd5e1',
};
15
+
16
/**
 * Escape a value so it can be interpolated into HTML/SVG text nodes and
 * double-quoted attribute values.
 *
 * @param {unknown} text - Value to escape; it is coerced with `String()`.
 * @returns {string} The text with `&`, `<`, `>` and `"` replaced by entities.
 */
function escapeHtml(text) {
  return String(text)
    // Ampersands go first so the entities produced below are not re-escaped.
    .replaceAll('&', '&amp;')
    .replaceAll('<', '&lt;')
    .replaceAll('>', '&gt;')
    .replaceAll('"', '&quot;');
}
23
+
24
/**
 * Pick the bar fill color for a system label.
 *
 * The exact label 'Audrey' gets its own color; otherwise the first matching
 * substring among 'Vector', 'Keyword' and 'Recent' decides, and anything else
 * falls back to the external color.
 *
 * @param {string} label - System label shown under the bar.
 * @returns {string} A color from PALETTE.
 */
function chartBarColor(label) {
  if (label === 'Audrey') {
    return PALETTE.audrey;
  }

  // Checked in order; the first substring hit wins.
  const baselineColors = [
    ['Vector', PALETTE.vector],
    ['Keyword', PALETTE.keyword],
    ['Recent', PALETTE.recent],
  ];
  const hit = baselineColors.find(([needle]) => label.includes(needle));
  return hit ? hit[1] : PALETTE.external;
}
31
+
32
/**
 * Render a vertical bar chart as a standalone SVG document string.
 *
 * @param {object} options
 * @param {string} options.title - Chart heading, also used as the aria-label.
 * @param {Array<{label: string, value: number}>} options.rows - One bar per row.
 * @param {string} [options.valueSuffix='%'] - Appended to bar and tick labels.
 * @param {number} [options.maxValue=100] - Value mapped to the full plot height;
 *   bar values are clamped to the range [0, maxValue].
 * @returns {string} SVG markup beginning with an XML declaration.
 */
function renderBarChart({ title, rows, valueSuffix = '%', maxValue = 100 }) {
  const width = 960;
  const height = 420;
  const margin = { top: 56, right: 32, bottom: 88, left: 64 };
  const plotWidth = width - margin.left - margin.right;
  const plotHeight = height - margin.top - margin.bottom;
  const barWidth = Math.max(32, Math.floor(plotWidth / Math.max(rows.length, 1)) - 18);
  const gap = rows.length > 1 ? (plotWidth - barWidth * rows.length) / (rows.length - 1) : 0;
  // The suffix is caller-supplied text that lands inside SVG markup.
  const suffix = escapeHtml(valueSuffix);

  const bars = rows.map((row, index) => {
    const clamped = Math.max(0, Math.min(maxValue, row.value));
    // A missing or non-numeric value would otherwise emit NaN geometry and a "NaN%" label.
    const value = Number.isNaN(clamped) ? 0 : clamped;
    const barHeight = (value / maxValue) * plotHeight;
    const x = margin.left + index * (barWidth + gap);
    const y = margin.top + plotHeight - barHeight;
    return `
    <rect x="${x}" y="${y}" width="${barWidth}" height="${barHeight}" rx="8" fill="${chartBarColor(row.label)}" />
    <text x="${x + barWidth / 2}" y="${y - 10}" text-anchor="middle" font-size="15" fill="${PALETTE.accent}">${value.toFixed(1)}${suffix}</text>
    <text x="${x + barWidth / 2}" y="${height - 42}" text-anchor="middle" font-size="14" fill="${PALETTE.muted}">${escapeHtml(row.label)}</text>
    `;
  }).join('\n');

  // Grid lines sit at fixed fractions of maxValue so they stay inside the plot
  // for any scale; with the default maxValue of 100 these are 0/25/50/75/100.
  const grid = [0, 0.25, 0.5, 0.75, 1].map(fraction => {
    const tick = Number((fraction * maxValue).toFixed(2));
    const y = margin.top + plotHeight - fraction * plotHeight;
    return `
    <line x1="${margin.left}" y1="${y}" x2="${width - margin.right}" y2="${y}" stroke="${PALETTE.border}" stroke-dasharray="4 4" />
    <text x="${margin.left - 10}" y="${y + 5}" text-anchor="end" font-size="13" fill="${PALETTE.muted}">${tick}${suffix}</text>
    `;
  }).join('\n');

  return `<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" width="${width}" height="${height}" viewBox="0 0 ${width} ${height}" role="img" aria-label="${escapeHtml(title)}">
  <rect width="100%" height="100%" fill="white" />
  <text x="${margin.left}" y="34" font-size="24" font-weight="700" fill="${PALETTE.accent}">${escapeHtml(title)}</text>
  ${grid}
  ${bars}
</svg>`;
}
69
+
70
/**
 * Render trend entries as `<li>` items for the report's trends list.
 *
 * @param {Array<{title: string, summary: string, source: string}>} trends -
 *   Entries to list; `source` is used both as the link target and link text.
 * @returns {string} The list items joined with newlines.
 */
function renderTrendList(trends) {
  return trends.map(trend => {
    // Escape once and reuse: an unescaped quote in the URL would otherwise
    // terminate the href attribute and inject markup.
    const source = escapeHtml(trend.source);
    return `
      <li>
        <strong>${escapeHtml(trend.title)}</strong><br />
        ${escapeHtml(trend.summary)}<br />
        <a href="${source}">${source}</a>
      </li>
    `;
  }).join('\n');
}
79
+
80
/**
 * Render one `<tr>` per benchmark case for the case matrix table.
 *
 * Each row holds the case title, suite and family, followed by one cell per
 * entry in the case's `results`. A cell is tinted green when the result
 * passed, orange when it failed with a score of at least 0.5, and red otherwise.
 *
 * @param {Array<object>} localCases - Case entries with `title`, `suite`,
 *   `family` and a `results` array of `{ passed, score, summary }`.
 * @returns {string} The table rows joined with newlines.
 */
function renderCaseRows(localCases) {
  const tones = {
    pass: { bg: '#ecfdf5', fg: '#065f46' },
    partial: { bg: '#fff7ed', fg: '#9a3412' },
    fail: { bg: '#fef2f2', fg: '#991b1b' },
  };

  const toneFor = result => {
    if (result.passed) return tones.pass;
    return result.score >= 0.5 ? tones.partial : tones.fail;
  };

  const renderCell = result => {
    const { bg, fg } = toneFor(result);
    return `<td style="background:${bg};color:${fg}">${result.score.toFixed(2)}<br /><span style="font-size:12px">${escapeHtml(result.summary)}</span></td>`;
  };

  const rows = localCases.map(caseResult => `
      <tr>
        <td>${escapeHtml(caseResult.title)}</td>
        <td>${escapeHtml(caseResult.suite)}</td>
        <td>${escapeHtml(caseResult.family)}</td>
        ${caseResult.results.map(result => renderCell(result)).join('')}
      </tr>
    `);
  return rows.join('\n');
}
94
+
95
/**
 * Render one report section per suite chart: heading, description, and an
 * `<img>` that points at the chart file next to the report.
 *
 * @param {Array<{title: string, description: string, fileName: string}>} suiteCharts
 * @returns {string} The sections joined with newlines, or '' when there are none.
 */
function renderSuiteSections(suiteCharts) {
  if (!suiteCharts.length) {
    return '';
  }

  const sections = suiteCharts.map(chart => {
    const heading = escapeHtml(chart.title);
    const blurb = escapeHtml(chart.description);
    const file = escapeHtml(chart.fileName);
    const alt = escapeHtml(chart.title);
    return `
      <section class="callout">
        <h2>${heading}</h2>
        <p>${blurb}</p>
        <img src="./${file}" alt="${alt} chart" />
      </section>
    `;
  });
  return sections.join('\n');
}
105
+
106
/**
 * Write the benchmark report artifacts to disk.
 *
 * Into `outputDir` this writes `local-overall.svg`, `published-locomo.svg`,
 * `summary.json`, one `<suite.id>-overall.svg` per local suite, and
 * `report.html`. When `readmeAssetsDir` is given it also writes
 * `local-benchmark.svg`, `published-memory-standards.svg` and, if a suite with
 * id 'operations' exists, `operations-benchmark.svg` there.
 *
 * @param {object} options
 * @param {string} options.outputDir - Directory for the report; created if missing.
 * @param {object} options.summary - Summary object; serialized as-is to summary.json
 *   and read for `local`, `command` and `generatedAt` in the HTML report.
 * @param {Array<{system: string, scorePercent: number}>} options.localOverall
 * @param {Array<object>} options.localSuites - Suites with `id`, `title`,
 *   `description` and an `overall` array shaped like `localOverall`.
 * @param {Array<{system: string, score: number}>} options.externalOverall
 * @param {Array<object>} options.trends - Entries passed to renderTrendList().
 * @param {string} [options.readmeAssetsDir] - Optional directory for README charts.
 * @returns {{json: string, html: string, localChart: string, suiteCharts: Array<object>,
 *   externalChart: string, readmeAssets: (object|null)}} Paths of the written files.
 */
export function writeBenchmarkArtifacts({
  outputDir,
  summary,
  localOverall,
  localSuites,
  externalOverall,
  trends,
  readmeAssetsDir,
}) {
  mkdirSync(outputDir, { recursive: true });

  const toRows = overall => overall.map(row => ({ label: row.system, value: row.scorePercent }));

  // `summary.local` is treated as optional everywhere below; a summary without
  // it yields an empty case matrix instead of a TypeError after the chart
  // files have already been written.
  const localSummary = summary.local ?? {};
  const matrixSystems = localSummary.overall ?? [];
  const matrixCases = localSummary.cases ?? [];

  const localChartTitle = localSummary.overall_scope === 'comparable_suites'
    ? 'Audrey vs Comparable Local Memory Baselines'
    : 'Selected Audrey Regression Suite';
  const localChart = renderBarChart({
    title: localChartTitle,
    rows: toRows(localOverall),
  });
  const externalChart = renderBarChart({
    title: 'Published LLM Memory Standards (LoCoMo)',
    rows: externalOverall.map(row => ({ label: row.system, value: row.score })),
  });

  writeFileSync(join(outputDir, 'local-overall.svg'), localChart, 'utf8');
  writeFileSync(join(outputDir, 'published-locomo.svg'), externalChart, 'utf8');
  writeFileSync(join(outputDir, 'summary.json'), JSON.stringify(summary, null, 2), 'utf8');

  const suiteCharts = localSuites.map(suite => {
    const fileName = `${suite.id}-overall.svg`;
    const title = `${suite.title} Benchmark`;
    const chart = renderBarChart({ title, rows: toRows(suite.overall) });
    writeFileSync(join(outputDir, fileName), chart, 'utf8');
    return {
      id: suite.id,
      title,
      description: suite.description,
      fileName,
      path: join(outputDir, fileName),
    };
  });

  let readmeAssets = null;
  if (readmeAssetsDir) {
    mkdirSync(readmeAssetsDir, { recursive: true });
    const localReadmeChart = join(readmeAssetsDir, 'local-benchmark.svg');
    const externalReadmeChart = join(readmeAssetsDir, 'published-memory-standards.svg');
    writeFileSync(localReadmeChart, localChart, 'utf8');
    writeFileSync(externalReadmeChart, externalChart, 'utf8');

    // The README copy of the operations chart uses its own title, so it is
    // rendered again rather than copied from the per-suite chart.
    const operationsSuite = localSuites.find(suite => suite.id === 'operations');
    let operationsReadmeChart = null;
    if (operationsSuite) {
      operationsReadmeChart = join(readmeAssetsDir, 'operations-benchmark.svg');
      writeFileSync(
        operationsReadmeChart,
        renderBarChart({
          title: 'Audrey Memory Operations Benchmark',
          rows: toRows(operationsSuite.overall || []),
        }),
        'utf8',
      );
    }

    readmeAssets = {
      localChart: localReadmeChart,
      operationsChart: operationsReadmeChart,
      externalChart: externalReadmeChart,
    };
  }

  const html = `<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <title>Audrey Memory Benchmark</title>
  <style>
    body { font-family: "Segoe UI", Arial, sans-serif; margin: 32px; color: ${PALETTE.accent}; background: ${PALETTE.surface}; }
    main { max-width: 1120px; margin: 0 auto; }
    h1, h2 { margin-bottom: 12px; }
    p, li { line-height: 1.5; }
    .callout { background: white; border: 1px solid ${PALETTE.border}; border-radius: 16px; padding: 20px; margin-bottom: 24px; }
    .grid { display: grid; gap: 24px; grid-template-columns: 1fr; }
    img { width: 100%; border: 1px solid ${PALETTE.border}; border-radius: 16px; background: white; }
    table { width: 100%; border-collapse: collapse; background: white; border-radius: 16px; overflow: hidden; }
    th, td { border: 1px solid ${PALETTE.border}; padding: 12px; vertical-align: top; text-align: left; }
    th { background: #e2e8f0; }
    code { background: #e2e8f0; padding: 2px 6px; border-radius: 6px; }
  </style>
</head>
<body>
  <main>
    <h1>Audrey Memory Benchmark</h1>
    <div class="callout">
      <p><strong>Method:</strong> Audrey is scored on a local regression suite inspired by LongMemEval-style retrieval, operation-level lifecycle behavior, and agent guard-loop benchmarks. The combined local chart uses comparable retrieval/lifecycle suites when available; the guard loop is reported as its own controller regression suite. Published external LoCoMo numbers stay separate so the comparison remains honest.</p>
      <p><strong>Scope:</strong> ${escapeHtml(localSummary.overall_scope ?? 'unknown')} across ${escapeHtml((localSummary.overall_suite_ids ?? []).join(', '))}; ${matrixCases.length} total cases.</p>
      <p><strong>Run:</strong> <code>${escapeHtml(summary.command)}</code></p>
      <p><strong>Generated:</strong> ${escapeHtml(summary.generatedAt)}</p>
    </div>

    <div class="grid">
      <section class="callout">
        <h2>Combined Local Benchmark</h2>
        <img src="./local-overall.svg" alt="Combined local benchmark bar chart" />
      </section>

      ${renderSuiteSections(suiteCharts)}

      <section class="callout">
        <h2>Published Leaderboard</h2>
        <img src="./published-locomo.svg" alt="Published LoCoMo leaderboard bar chart" />
      </section>
    </div>

    <section class="callout">
      <h2>Case Matrix</h2>
      <table>
        <thead>
          <tr>
            <th>Case</th>
            <th>Suite</th>
            <th>Family</th>
            ${matrixSystems.map(row => `<th>${escapeHtml(row.system)}</th>`).join('')}
          </tr>
        </thead>
        <tbody>
          ${renderCaseRows(matrixCases)}
        </tbody>
      </table>
    </section>

    <section class="callout">
      <h2>March 23, 2026 Memory Trends</h2>
      <ul>
        ${renderTrendList(trends)}
      </ul>
    </section>
  </main>
</body>
</html>`;

  writeFileSync(join(outputDir, 'report.html'), html, 'utf8');

  return {
    json: join(outputDir, 'summary.json'),
    html: join(outputDir, 'report.html'),
    localChart: join(outputDir, 'local-overall.svg'),
    suiteCharts,
    externalChart: join(outputDir, 'published-locomo.svg'),
    readmeAssets,
  };
}
@@ -0,0 +1,281 @@
1
+ import { spawnSync } from 'node:child_process';
2
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
3
+ import { basename, dirname, resolve } from 'node:path';
4
+ import { fileURLToPath } from 'node:url';
5
+ import { writeGuardBenchConformanceCard } from './create-conformance-card.mjs';
6
+ import { computeGuardBenchArtifactHashes, validateGuardBenchArtifacts } from './validate-guardbench-artifacts.mjs';
7
+ import { publicArtifactValue } from './public-paths.mjs';
8
+
9
// Parent of the directory containing this module; relative paths below are resolved against it.
const ROOT = resolve(dirname(fileURLToPath(import.meta.url)), '..');

// Alias -> adapter spec. Each adapter is reachable by a short alias and by its
// full name; every alias gets its own spec object.
const KNOWN_ADAPTERS = new Map(
  [
    {
      aliases: ['mem0', 'mem0-platform'],
      name: 'mem0-platform',
      path: 'benchmarks/adapters/mem0-platform.mjs',
      requiredEnv: ['MEM0_API_KEY'],
    },
    {
      aliases: ['zep', 'zep-cloud'],
      name: 'zep-cloud',
      path: 'benchmarks/adapters/zep-cloud.mjs',
      requiredEnv: ['ZEP_API_KEY'],
    },
  ].flatMap(({ aliases, name, path, requiredEnv }) =>
    aliases.map(alias => [alias, { name, path, requiredEnv: [...requiredEnv] }])),
);
32
+
33
/**
 * Parse command-line arguments for the external GuardBench runner.
 *
 * Value options (`--adapter`, `--out-dir`, `--min-pass-rate`) consume the next
 * token and store it as a string. Flags (`--check`, `--dry-run`, `--json`,
 * `--allow-missing-env`, `--help` / `-h`) set their field to `true`; `help` is
 * only present on the result when it was requested.
 *
 * @param {string[]} [argv=process.argv.slice(2)] - Raw argument tokens.
 * @returns {object} Parsed options with defaults applied.
 * @throws {Error} On an unrecognized token, or a value option with no value.
 */
export function parseExternalArgs(argv = process.argv.slice(2)) {
  const args = {
    adapter: 'mem0-platform',
    outDir: null,
    check: false,
    dryRun: false,
    json: false,
    minPassRate: null,
    allowMissingEnv: false,
  };

  const valueOptions = new Map([
    ['--adapter', 'adapter'],
    ['--out-dir', 'outDir'],
    ['--min-pass-rate', 'minPassRate'],
  ]);
  const flagOptions = new Map([
    ['--check', 'check'],
    ['--dry-run', 'dryRun'],
    ['--json', 'json'],
    ['--allow-missing-env', 'allowMissingEnv'],
    ['--help', 'help'],
    ['-h', 'help'],
  ]);

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (valueOptions.has(token)) {
      const value = argv[i + 1];
      // A known option without a value is reported as such, not as "Unknown argument".
      if (!value) throw new Error(`Missing value for ${token}`);
      args[valueOptions.get(token)] = value;
      i++;
    } else if (flagOptions.has(token)) {
      args[flagOptions.get(token)] = true;
    } else {
      throw new Error(`Unknown argument: ${token}`);
    }
  }

  return args;
}
59
+
60
/**
 * Read a file as UTF-8 text and parse it as JSON.
 *
 * @param {string} path - File to read.
 * @returns {unknown} The parsed JSON value.
 */
function readJson(path) {
  const raw = readFileSync(path, 'utf-8');
  return JSON.parse(raw);
}
63
+
64
/**
 * Resolve an adapter alias or path to an adapter spec.
 *
 * A key found in KNOWN_ADAPTERS returns that registered spec. Anything else is
 * treated as a module path: it is resolved against ROOT, named after its base
 * file name with any `.js` / `.mjs` / `.cjs` extension removed, and has no
 * required environment variables.
 *
 * @param {string} adapter - Adapter alias or adapter module path.
 * @returns {{name: string, path: string, requiredEnv: string[]}}
 */
function adapterSpec(adapter) {
  const registered = KNOWN_ADAPTERS.get(adapter);
  if (!registered) {
    const customPath = resolve(ROOT, adapter);
    const stem = basename(adapter).replace(/\.[cm]?js$/i, '');
    return { name: stem, path: customPath, requiredEnv: [] };
  }
  return registered;
}
75
+
76
/**
 * Build the description of an external GuardBench run without executing it.
 *
 * Resolves the adapter spec, the absolute adapter and output paths, the
 * required environment variables that are unset or empty in `env`, and the
 * argv arrays for the benchmark run and the artifact validation step.
 *
 * @param {object} [args={}] - Parsed options; reads `adapter`, `outDir`,
 *   `check`, `json` and `minPassRate`.
 * @param {object} [env=process.env] - Environment checked for required variables.
 * @returns {{adapter: string, adapterPath: string, outDir: string,
 *   requiredEnv: string[], missingEnv: string[], command: string[],
 *   validationCommand: string[]}}
 */
export function buildExternalGuardBenchRun(args = {}, env = process.env) {
  const spec = adapterSpec(args.adapter ?? 'mem0-platform');
  const adapterPath = resolve(ROOT, spec.path);
  const defaultOutDir = `benchmarks/output/external/${spec.name}`;
  const outDir = resolve(ROOT, args.outDir ?? defaultOutDir);
  const missingEnv = spec.requiredEnv.filter(name => !env[name]);

  // Optional flags are forwarded to the benchmark only when they were requested.
  const forwarded = [];
  if (args.check) forwarded.push('--check');
  if (args.json) forwarded.push('--json');
  if (args.minPassRate != null) forwarded.push('--min-pass-rate', String(args.minPassRate));

  const command = [
    process.execPath,
    resolve(ROOT, 'benchmarks/guardbench.js'),
    '--adapter',
    adapterPath,
    '--out-dir',
    outDir,
    ...forwarded,
  ];
  const validationCommand = [
    process.execPath,
    resolve(ROOT, 'benchmarks/validate-guardbench-artifacts.mjs'),
    '--dir',
    outDir,
  ];

  return {
    adapter: spec.name,
    adapterPath,
    outDir,
    requiredEnv: spec.requiredEnv,
    missingEnv,
    command,
    validationCommand,
  };
}
110
+
111
/**
 * Check whether a GuardBench summary holds a complete, conformant result set
 * for an adapter.
 *
 * The adapter is looked up by `adapterName` in `summary.systemSummaries`. If
 * it is not found there and the manifest lists exactly one external subject,
 * that subject's name is used instead. Failures are recorded when the system
 * summary is missing, when the adapter's row count or summary scenario count
 * differs from the expected scenario count, when redaction leaks are non-zero,
 * or when any adapter row is not marked `external: true`.
 *
 * @param {object} summary - GuardBench summary; reads `manifest`,
 *   `systemSummaries`, `scenarios` and `cases`.
 * @param {string} adapterName - Requested adapter name.
 * @returns {{ok: boolean, adapter: string, requestedAdapter: string,
 *   scenarios: number, expectedScenarios: number,
 *   fullContractPassRate: (number|null), decisionAccuracy: (number|null),
 *   redactionLeaks: (number|null), failures: string[]}}
 */
export function evaluateAdapterConformance(summary, adapterName) {
  const problems = [];
  const externals = (summary.manifest?.subjects ?? []).filter(subject => subject.external);

  // Prefer the requested name; fall back to the sole external subject, if any.
  let resolvedName = adapterName;
  const listedByName = summary.systemSummaries?.some(row => row.system === adapterName);
  if (!listedByName && externals.length === 1) {
    resolvedName = externals[0].name;
  }

  const systemSummary = summary.systemSummaries?.find(row => row.system === resolvedName);
  if (!systemSummary) {
    problems.push(`Missing system summary for adapter ${resolvedName}`);
  }

  const expectedScenarios = summary.scenarios ?? summary.manifest?.scenarios?.length ?? 0;
  const adapterRows = [];
  for (const entry of summary.cases ?? []) {
    for (const row of entry.results ?? []) {
      if (row.system === resolvedName) adapterRows.push(row);
    }
  }

  if (adapterRows.length !== expectedScenarios) {
    problems.push(`Adapter ${resolvedName} returned ${adapterRows.length}/${expectedScenarios} scenario rows`);
  }
  if (systemSummary) {
    if (systemSummary.scenarios !== expectedScenarios) {
      problems.push(`Adapter ${resolvedName} system summary has ${systemSummary.scenarios}/${expectedScenarios} scenarios`);
    }
    if (systemSummary.redactionLeaks !== 0) {
      problems.push(`Adapter ${resolvedName} leaked ${systemSummary.redactionLeaks} seeded secret(s) in decision output`);
    }
  }
  if (!adapterRows.every(row => row.external === true)) {
    problems.push(`Adapter ${resolvedName} rows are not marked external`);
  }

  return {
    ok: problems.length === 0,
    adapter: resolvedName,
    requestedAdapter: adapterName,
    scenarios: systemSummary?.scenarios ?? adapterRows.length,
    expectedScenarios,
    fullContractPassRate: systemSummary?.passRate ?? null,
    decisionAccuracy: systemSummary?.decisionAccuracy ?? null,
    redactionLeaks: systemSummary?.redactionLeaks ?? null,
    failures: problems,
  };
}
154
+
155
/**
 * Build the command-line help text for this script.
 *
 * @returns {string} Multi-line usage message ending with a newline.
 */
function usage() {
  const lines = [
    'Usage: node benchmarks/run-external-guardbench.mjs [options]',
    '',
    'Options:',
    '--adapter <name|path> Adapter alias or ESM adapter path. Default: mem0-platform.',
    '--out-dir <path> Output directory. Default: benchmarks/output/external/<adapter>.',
    '--check Fail if Audrey Guard pass rate is below the threshold.',
    '--min-pass-rate <percent> GuardBench pass-rate threshold for --check.',
    '--json Forward JSON output from GuardBench.',
    '--dry-run Print the resolved command and metadata without running.',
    '--allow-missing-env Permit running even when known runtime env vars are absent.',
    '', // trailing empty entry yields the final newline
  ];
  return lines.join('\n');
}
168
+
169
/**
 * Write run metadata to `external-run-metadata.json` inside a directory,
 * creating the directory (and any missing parents) first.
 *
 * The metadata is passed through publicArtifactValue before being serialized
 * as two-space-indented JSON followed by a newline.
 *
 * @param {string} path - Directory that receives the metadata file.
 * @param {object} metadata - Run metadata to serialize.
 * @returns {string} Resolved path of the written file.
 */
export function writeExternalRunMetadata(path, metadata) {
  mkdirSync(path, { recursive: true });
  const target = resolve(path, 'external-run-metadata.json');
  const serialized = JSON.stringify(publicArtifactValue(metadata), null, 2);
  writeFileSync(target, `${serialized}\n`, 'utf-8');
  return target;
}
175
+
176
/**
 * CLI entry point: run GuardBench against an external adapter, validate the
 * produced artifacts, evaluate adapter conformance, and record run metadata.
 *
 * Flow:
 *  1. Parse arguments; print usage and return on --help.
 *  2. Throw if the adapter file does not exist.
 *  3. If required env vars are missing (and neither --allow-missing-env nor
 *     --dry-run is set), write "blocked" metadata and throw.
 *  4. On --dry-run, write metadata, print the resolved command, and return.
 *  5. Otherwise spawn GuardBench synchronously, validate artifacts, evaluate
 *     conformance, write final metadata, and set process.exitCode.
 *
 * The exit code is 0 only when the child exited 0 AND artifact validation AND
 * adapter conformance both passed. Any failure yields a non-zero code: the
 * child's own status when it is non-zero, otherwise 1.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the adapter is missing or the required env is absent.
 */
async function main() {
  const args = parseExternalArgs();
  if (args.help) {
    console.log(usage());
    return;
  }

  const run = buildExternalGuardBenchRun(args);
  const startedAt = new Date().toISOString();
  const metadata = {
    suite: 'GuardBench external adapter run',
    startedAt,
    adapter: run.adapter,
    adapterPath: run.adapterPath,
    outDir: run.outDir,
    requiredEnv: run.requiredEnv,
    missingEnv: run.missingEnv,
    command: run.command,
    validationCommand: run.validationCommand,
    dryRun: args.dryRun,
  };

  if (!existsSync(run.adapterPath)) {
    throw new Error(`Adapter not found: ${run.adapterPath}`);
  }

  if (run.missingEnv.length && !args.allowMissingEnv && !args.dryRun) {
    metadata.status = 'blocked';
    metadata.blockReason = `Missing runtime environment: ${run.missingEnv.join(', ')}`;
    const metadataPath = writeExternalRunMetadata(run.outDir, metadata);
    throw new Error(`${metadata.blockReason}. Metadata written to ${metadataPath}`);
  }

  if (args.dryRun) {
    metadata.status = run.missingEnv.length ? 'dry-run-missing-env' : 'dry-run-ready';
    const metadataPath = writeExternalRunMetadata(run.outDir, metadata);
    if (args.json) {
      console.log(JSON.stringify({ ...metadata, metadataPath }, null, 2));
    } else {
      console.log(`External GuardBench dry run: ${run.adapter}`);
      console.log(`Command: ${run.command.map(part => JSON.stringify(part)).join(' ')}`);
      console.log(`Metadata: ${metadataPath}`);
      if (run.missingEnv.length) console.log(`Missing runtime env: ${run.missingEnv.join(', ')}`);
    }
    return;
  }

  writeExternalRunMetadata(run.outDir, { ...metadata, status: 'running' });
  const child = spawnSync(run.command[0], run.command.slice(1), {
    cwd: ROOT,
    env: process.env,
    stdio: 'inherit',
  });
  // spawnSync reports launch failures via `error` instead of throwing; surface
  // them so a null exit status is not left unexplained.
  if (child.error) {
    console.error(`External GuardBench process error: ${child.error.message}`);
  }

  const validation = validateGuardBenchArtifacts({ dir: run.outDir });
  let adapterConformance = {
    ok: false,
    adapter: run.adapter,
    failures: ['GuardBench summary was not available for adapter conformance evaluation'],
  };
  if (child.status === 0) {
    try {
      const summary = readJson(resolve(run.outDir, 'guardbench-summary.json'));
      adapterConformance = evaluateAdapterConformance(summary, run.adapter);
    } catch (error) {
      adapterConformance = {
        ok: false,
        adapter: run.adapter,
        failures: [error.message],
      };
    }
  }

  if (validation.ok) {
    console.log(`External GuardBench artifact validation passed: ${run.outDir}`);
  } else {
    console.error('External GuardBench artifact validation failed:');
    for (const failure of validation.failures) console.error(`- ${failure}`);
  }
  if (adapterConformance.ok) {
    console.log(`External GuardBench adapter conformance passed: ${adapterConformance.adapter}`);
  } else {
    console.error('External GuardBench adapter conformance failed:');
    for (const failure of adapterConformance.failures) console.error(`- ${failure}`);
  }

  // Single source of truth for both the recorded status and the exit code.
  const passed = child.status === 0 && validation.ok && adapterConformance.ok;
  const completed = {
    ...metadata,
    completedAt: new Date().toISOString(),
    status: passed ? 'passed' : 'failed',
    exitCode: child.status,
    signal: child.signal,
    artifactHashes: child.status === 0 ? computeGuardBenchArtifactHashes(run.outDir) : undefined,
    artifactValidation: validation,
    adapterConformance,
  };
  const metadataPath = writeExternalRunMetadata(run.outDir, completed);
  const card = child.status === 0 ? writeGuardBenchConformanceCard({ dir: run.outDir }) : null;
  console.log(`External GuardBench metadata: ${metadataPath}`);
  if (card) console.log(`External GuardBench conformance card: ${card.path}`);

  // `||` rather than `??`: a child status of 0 combined with a validation or
  // conformance failure must still produce a non-zero exit code.
  process.exitCode = passed ? 0 : (child.status || 1);
}
275
+
276
// Run main() only when the script path in process.argv[1] ends with this
// file's name; report any rejection and exit with status 1.
if (process.argv[1]?.endsWith('run-external-guardbench.mjs')) {
  main().catch((failure) => {
    console.error(failure.message);
    process.exit(1);
  });
}