audrey 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. package/CHANGELOG.md +30 -0
  2. package/README.md +5 -3
  3. package/benchmarks/adapter-self-test.mjs +6 -2
  4. package/benchmarks/adapters/example-allow.mjs +5 -2
  5. package/benchmarks/adapters/mem0-platform.mjs +19 -12
  6. package/benchmarks/adapters/zep-cloud.mjs +51 -27
  7. package/benchmarks/baselines.js +11 -6
  8. package/benchmarks/build-leaderboard.mjs +36 -23
  9. package/benchmarks/cases.js +24 -12
  10. package/benchmarks/create-conformance-card.mjs +12 -3
  11. package/benchmarks/create-submission-bundle.mjs +22 -8
  12. package/benchmarks/dry-run-external-adapters.mjs +24 -12
  13. package/benchmarks/guardbench.js +263 -123
  14. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
  15. package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  16. package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  17. package/benchmarks/output/guardbench-conformance-card.json +11 -11
  18. package/benchmarks/output/guardbench-raw.json +107 -108
  19. package/benchmarks/output/guardbench-summary.json +170 -172
  20. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  21. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  22. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +11 -11
  23. package/benchmarks/output/submission-bundle/guardbench-raw.json +107 -108
  24. package/benchmarks/output/submission-bundle/guardbench-summary.json +170 -172
  25. package/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
  26. package/benchmarks/output/submission-bundle/validation-report.json +1 -1
  27. package/benchmarks/output/summary.json +57 -57
  28. package/benchmarks/perf-snapshot.js +12 -9
  29. package/benchmarks/perf.bench.js +14 -6
  30. package/benchmarks/public-paths.mjs +11 -5
  31. package/benchmarks/reference-results.js +10 -5
  32. package/benchmarks/report.js +48 -27
  33. package/benchmarks/run-external-guardbench.mjs +47 -25
  34. package/benchmarks/run.js +112 -59
  35. package/benchmarks/validate-adapter-module.mjs +13 -10
  36. package/benchmarks/validate-adapter-registry.mjs +16 -5
  37. package/benchmarks/validate-guardbench-artifacts.mjs +76 -19
  38. package/benchmarks/verify-external-evidence.mjs +86 -31
  39. package/benchmarks/verify-publication-artifacts.mjs +34 -11
  40. package/benchmarks/verify-submission-bundle.mjs +9 -4
  41. package/dist/mcp-server/config.d.ts +1 -1
  42. package/dist/mcp-server/config.d.ts.map +1 -1
  43. package/dist/mcp-server/config.js +5 -3
  44. package/dist/mcp-server/config.js.map +1 -1
  45. package/dist/mcp-server/index.d.ts +4 -3
  46. package/dist/mcp-server/index.d.ts.map +1 -1
  47. package/dist/mcp-server/index.js +479 -172
  48. package/dist/mcp-server/index.js.map +1 -1
  49. package/dist/src/action-key.d.ts.map +1 -1
  50. package/dist/src/action-key.js +6 -2
  51. package/dist/src/action-key.js.map +1 -1
  52. package/dist/src/adaptive.d.ts.map +1 -1
  53. package/dist/src/adaptive.js +4 -2
  54. package/dist/src/adaptive.js.map +1 -1
  55. package/dist/src/affect.d.ts.map +1 -1
  56. package/dist/src/affect.js +8 -5
  57. package/dist/src/affect.js.map +1 -1
  58. package/dist/src/audrey.d.ts +1 -1
  59. package/dist/src/audrey.d.ts.map +1 -1
  60. package/dist/src/audrey.js +93 -49
  61. package/dist/src/audrey.js.map +1 -1
  62. package/dist/src/capsule.d.ts.map +1 -1
  63. package/dist/src/capsule.js +37 -15
  64. package/dist/src/capsule.js.map +1 -1
  65. package/dist/src/causal.d.ts +1 -1
  66. package/dist/src/causal.d.ts.map +1 -1
  67. package/dist/src/causal.js +4 -2
  68. package/dist/src/causal.js.map +1 -1
  69. package/dist/src/confidence.d.ts.map +1 -1
  70. package/dist/src/confidence.js +5 -5
  71. package/dist/src/confidence.js.map +1 -1
  72. package/dist/src/consolidate.d.ts.map +1 -1
  73. package/dist/src/consolidate.js +17 -9
  74. package/dist/src/consolidate.js.map +1 -1
  75. package/dist/src/context.js +1 -1
  76. package/dist/src/context.js.map +1 -1
  77. package/dist/src/controller.d.ts.map +1 -1
  78. package/dist/src/controller.js +24 -13
  79. package/dist/src/controller.js.map +1 -1
  80. package/dist/src/db.d.ts.map +1 -1
  81. package/dist/src/db.js +78 -27
  82. package/dist/src/db.js.map +1 -1
  83. package/dist/src/decay.d.ts +1 -1
  84. package/dist/src/decay.d.ts.map +1 -1
  85. package/dist/src/decay.js +1 -1
  86. package/dist/src/decay.js.map +1 -1
  87. package/dist/src/embedding.d.ts +12 -4
  88. package/dist/src/embedding.d.ts.map +1 -1
  89. package/dist/src/embedding.js +18 -16
  90. package/dist/src/embedding.js.map +1 -1
  91. package/dist/src/encode.d.ts.map +1 -1
  92. package/dist/src/encode.js +5 -4
  93. package/dist/src/encode.js.map +1 -1
  94. package/dist/src/events.d.ts +3 -2
  95. package/dist/src/events.d.ts.map +1 -1
  96. package/dist/src/events.js +7 -3
  97. package/dist/src/events.js.map +1 -1
  98. package/dist/src/export.d.ts.map +1 -1
  99. package/dist/src/export.js +21 -7
  100. package/dist/src/export.js.map +1 -1
  101. package/dist/src/feedback.d.ts.map +1 -1
  102. package/dist/src/feedback.js +1 -1
  103. package/dist/src/feedback.js.map +1 -1
  104. package/dist/src/forget.d.ts.map +1 -1
  105. package/dist/src/forget.js +12 -6
  106. package/dist/src/forget.js.map +1 -1
  107. package/dist/src/fts.d.ts.map +1 -1
  108. package/dist/src/fts.js +20 -8
  109. package/dist/src/fts.js.map +1 -1
  110. package/dist/src/hybrid-recall.d.ts.map +1 -1
  111. package/dist/src/hybrid-recall.js +12 -6
  112. package/dist/src/hybrid-recall.js.map +1 -1
  113. package/dist/src/impact.d.ts.map +1 -1
  114. package/dist/src/impact.js +26 -10
  115. package/dist/src/impact.js.map +1 -1
  116. package/dist/src/import.d.ts.map +1 -1
  117. package/dist/src/import.js +11 -6
  118. package/dist/src/import.js.map +1 -1
  119. package/dist/src/index.d.ts +3 -3
  120. package/dist/src/index.d.ts.map +1 -1
  121. package/dist/src/index.js +3 -3
  122. package/dist/src/index.js.map +1 -1
  123. package/dist/src/interference.d.ts.map +1 -1
  124. package/dist/src/interference.js +10 -5
  125. package/dist/src/interference.js.map +1 -1
  126. package/dist/src/introspect.d.ts.map +1 -1
  127. package/dist/src/introspect.js +12 -6
  128. package/dist/src/introspect.js.map +1 -1
  129. package/dist/src/llm.d.ts +2 -2
  130. package/dist/src/llm.d.ts.map +1 -1
  131. package/dist/src/llm.js +6 -6
  132. package/dist/src/llm.js.map +1 -1
  133. package/dist/src/migrate.d.ts.map +1 -1
  134. package/dist/src/migrate.js +10 -4
  135. package/dist/src/migrate.js.map +1 -1
  136. package/dist/src/preflight.d.ts.map +1 -1
  137. package/dist/src/preflight.js +6 -8
  138. package/dist/src/preflight.js.map +1 -1
  139. package/dist/src/profile.d.ts.map +1 -1
  140. package/dist/src/profile.js.map +1 -1
  141. package/dist/src/promote.d.ts.map +1 -1
  142. package/dist/src/promote.js +16 -7
  143. package/dist/src/promote.js.map +1 -1
  144. package/dist/src/prompts.d.ts.map +1 -1
  145. package/dist/src/prompts.js +1 -2
  146. package/dist/src/prompts.js.map +1 -1
  147. package/dist/src/recall.d.ts.map +1 -1
  148. package/dist/src/recall.js +85 -18
  149. package/dist/src/recall.js.map +1 -1
  150. package/dist/src/redact.d.ts.map +1 -1
  151. package/dist/src/redact.js +9 -4
  152. package/dist/src/redact.js.map +1 -1
  153. package/dist/src/reflexes.d.ts.map +1 -1
  154. package/dist/src/reflexes.js +1 -7
  155. package/dist/src/reflexes.js.map +1 -1
  156. package/dist/src/rollback.d.ts.map +1 -1
  157. package/dist/src/rollback.js +4 -2
  158. package/dist/src/rollback.js.map +1 -1
  159. package/dist/src/routes.d.ts.map +1 -1
  160. package/dist/src/routes.js +33 -13
  161. package/dist/src/routes.js.map +1 -1
  162. package/dist/src/rules-compiler.d.ts.map +1 -1
  163. package/dist/src/rules-compiler.js +24 -2
  164. package/dist/src/rules-compiler.js.map +1 -1
  165. package/dist/src/server.js +2 -2
  166. package/dist/src/server.js.map +1 -1
  167. package/dist/src/tool-trace.d.ts +2 -2
  168. package/dist/src/tool-trace.d.ts.map +1 -1
  169. package/dist/src/tool-trace.js +12 -4
  170. package/dist/src/tool-trace.js.map +1 -1
  171. package/dist/src/types.d.ts.map +1 -1
  172. package/dist/src/ulid.js +1 -1
  173. package/dist/src/ulid.js.map +1 -1
  174. package/dist/src/utils.d.ts.map +1 -1
  175. package/dist/src/utils.js.map +1 -1
  176. package/dist/src/validate.d.ts.map +1 -1
  177. package/dist/src/validate.js +20 -10
  178. package/dist/src/validate.js.map +1 -1
  179. package/docs/paper/07-evaluation.md +5 -5
  180. package/docs/paper/audrey-paper-v1.md +5 -5
  181. package/docs/paper/evidence-ledger.md +1 -1
  182. package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  183. package/docs/paper/output/arxiv/main.tex +5 -5
  184. package/docs/paper/output/arxiv-compile-report.json +3 -3
  185. package/docs/paper/output/submission-bundle/README.md +5 -3
  186. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
  187. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  188. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  189. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +11 -11
  190. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +107 -108
  191. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +170 -172
  192. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  193. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  194. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
  195. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
  196. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +58 -58
  197. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +5 -5
  198. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +5 -5
  199. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
  200. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  201. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +5 -5
  202. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
  203. package/docs/paper/output/submission-bundle/package.json +17 -4
  204. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +36 -36
  205. package/examples/fintech-ops-demo.js +12 -5
  206. package/examples/healthcare-ops-demo.js +8 -4
  207. package/examples/ollama-memory-agent.js +41 -13
  208. package/examples/stripe-demo.js +12 -5
  209. package/package.json +17 -4
  210. package/scripts/audit-release-completion.mjs +179 -101
  211. package/scripts/create-arxiv-source.mjs +20 -14
  212. package/scripts/create-paper-submission-bundle.mjs +6 -2
  213. package/scripts/finalize-release.mjs +111 -36
  214. package/scripts/prepare-release-cut.mjs +14 -6
  215. package/scripts/publish-release-bundle.mjs +62 -23
  216. package/scripts/publish-release-github-api.mjs +89 -24
  217. package/scripts/smoke-cli.js +9 -9
  218. package/scripts/sync-paper-artifacts.mjs +5 -1
  219. package/scripts/verify-arxiv-compile.mjs +52 -16
  220. package/scripts/verify-arxiv-source.mjs +45 -15
  221. package/scripts/verify-browser-launch-plan.mjs +28 -11
  222. package/scripts/verify-browser-launch-results.mjs +32 -14
  223. package/scripts/verify-paper-artifacts.mjs +539 -79
  224. package/scripts/verify-paper-claims.mjs +48 -20
  225. package/scripts/verify-paper-submission-bundle.mjs +22 -11
  226. package/scripts/verify-publication-pack.mjs +23 -9
  227. package/scripts/verify-release-readiness.mjs +211 -76
@@ -44,27 +44,32 @@ export const PUBLISHED_LEADERBOARD = [
44
44
  export const MEMORY_TRENDS = [
45
45
  {
46
46
  title: 'Memory is moving from flat retrieval to typed systems',
47
- summary: 'Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.',
47
+ summary:
48
+ 'Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.',
48
49
  source: 'https://arxiv.org/abs/2507.03724',
49
50
  },
50
51
  {
51
52
  title: 'Benchmarks now emphasize multi-session realism',
52
- summary: 'LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.',
53
+ summary:
54
+ 'LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.',
53
55
  source: 'https://arxiv.org/abs/2410.10813',
54
56
  },
55
57
  {
56
58
  title: 'Context engineering is now competing with retrieval-first designs',
57
- summary: 'Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.',
59
+ summary:
60
+ 'Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.',
58
61
  source: 'https://www.letta.com/blog/memory-blocks',
59
62
  },
60
63
  {
61
64
  title: 'Production teams care about latency and token footprint, not just recall quality',
62
- summary: 'Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.',
65
+ summary:
66
+ 'Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.',
63
67
  source: 'https://arxiv.org/abs/2504.19413',
64
68
  },
65
69
  {
66
70
  title: 'Temporal and multimodal memory are becoming table stakes',
67
- summary: 'MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.',
71
+ summary:
72
+ 'MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.',
68
73
  source: 'https://arxiv.org/abs/2507.07957',
69
74
  },
70
75
  ];
@@ -38,25 +38,29 @@ function renderBarChart({ title, rows, valueSuffix = '%', maxValue = 100 }) {
38
38
  const barWidth = Math.max(32, Math.floor(plotWidth / Math.max(rows.length, 1)) - 18);
39
39
  const gap = rows.length > 1 ? (plotWidth - barWidth * rows.length) / (rows.length - 1) : 0;
40
40
 
41
- const bars = rows.map((row, index) => {
42
- const value = Math.max(0, Math.min(maxValue, row.value));
43
- const barHeight = (value / maxValue) * plotHeight;
44
- const x = margin.left + index * (barWidth + gap);
45
- const y = margin.top + plotHeight - barHeight;
46
- return `
41
+ const bars = rows
42
+ .map((row, index) => {
43
+ const value = Math.max(0, Math.min(maxValue, row.value));
44
+ const barHeight = (value / maxValue) * plotHeight;
45
+ const x = margin.left + index * (barWidth + gap);
46
+ const y = margin.top + plotHeight - barHeight;
47
+ return `
47
48
  <rect x="${x}" y="${y}" width="${barWidth}" height="${barHeight}" rx="8" fill="${chartBarColor(row.label)}" />
48
49
  <text x="${x + barWidth / 2}" y="${y - 10}" text-anchor="middle" font-size="15" fill="${PALETTE.accent}">${value.toFixed(1)}${valueSuffix}</text>
49
50
  <text x="${x + barWidth / 2}" y="${height - 42}" text-anchor="middle" font-size="14" fill="${PALETTE.muted}">${escapeHtml(row.label)}</text>
50
51
  `;
51
- }).join('\n');
52
+ })
53
+ .join('\n');
52
54
 
53
- const grid = [0, 25, 50, 75, 100].map(tick => {
54
- const y = margin.top + plotHeight - (tick / maxValue) * plotHeight;
55
- return `
55
+ const grid = [0, 25, 50, 75, 100]
56
+ .map(tick => {
57
+ const y = margin.top + plotHeight - (tick / maxValue) * plotHeight;
58
+ return `
56
59
  <line x1="${margin.left}" y1="${y}" x2="${width - margin.right}" y2="${y}" stroke="${PALETTE.border}" stroke-dasharray="4 4" />
57
60
  <text x="${margin.left - 10}" y="${y + 5}" text-anchor="end" font-size="13" fill="${PALETTE.muted}">${tick}${valueSuffix}</text>
58
61
  `;
59
- }).join('\n');
62
+ })
63
+ .join('\n');
60
64
 
61
65
  return `<?xml version="1.0" encoding="UTF-8"?>
62
66
  <svg xmlns="http://www.w3.org/2000/svg" width="${width}" height="${height}" viewBox="0 0 ${width} ${height}" role="img" aria-label="${escapeHtml(title)}">
@@ -68,39 +72,53 @@ function renderBarChart({ title, rows, valueSuffix = '%', maxValue = 100 }) {
68
72
  }
69
73
 
70
74
  function renderTrendList(trends) {
71
- return trends.map(trend => `
75
+ return trends
76
+ .map(
77
+ trend => `
72
78
  <li>
73
79
  <strong>${escapeHtml(trend.title)}</strong><br />
74
80
  ${escapeHtml(trend.summary)}<br />
75
81
  <a href="${trend.source}">${escapeHtml(trend.source)}</a>
76
82
  </li>
77
- `).join('\n');
83
+ `,
84
+ )
85
+ .join('\n');
78
86
  }
79
87
 
80
88
  function renderCaseRows(localCases) {
81
- return localCases.map(caseResult => `
89
+ return localCases
90
+ .map(
91
+ caseResult => `
82
92
  <tr>
83
93
  <td>${escapeHtml(caseResult.title)}</td>
84
94
  <td>${escapeHtml(caseResult.suite)}</td>
85
95
  <td>${escapeHtml(caseResult.family)}</td>
86
- ${caseResult.results.map(result => {
87
- const bg = result.passed ? '#ecfdf5' : result.score >= 0.5 ? '#fff7ed' : '#fef2f2';
88
- const fg = result.passed ? '#065f46' : result.score >= 0.5 ? '#9a3412' : '#991b1b';
89
- return `<td style="background:${bg};color:${fg}">${result.score.toFixed(2)}<br /><span style="font-size:12px">${escapeHtml(result.summary)}</span></td>`;
90
- }).join('')}
96
+ ${caseResult.results
97
+ .map(result => {
98
+ const bg = result.passed ? '#ecfdf5' : result.score >= 0.5 ? '#fff7ed' : '#fef2f2';
99
+ const fg = result.passed ? '#065f46' : result.score >= 0.5 ? '#9a3412' : '#991b1b';
100
+ return `<td style="background:${bg};color:${fg}">${result.score.toFixed(2)}<br /><span style="font-size:12px">${escapeHtml(result.summary)}</span></td>`;
101
+ })
102
+ .join('')}
91
103
  </tr>
92
- `).join('\n');
104
+ `,
105
+ )
106
+ .join('\n');
93
107
  }
94
108
 
95
109
  function renderSuiteSections(suiteCharts) {
96
110
  if (suiteCharts.length === 0) return '';
97
- return suiteCharts.map(chart => `
111
+ return suiteCharts
112
+ .map(
113
+ chart => `
98
114
  <section class="callout">
99
115
  <h2>${escapeHtml(chart.title)}</h2>
100
116
  <p>${escapeHtml(chart.description)}</p>
101
117
  <img src="./${escapeHtml(chart.fileName)}" alt="${escapeHtml(chart.title)} chart" />
102
118
  </section>
103
- `).join('\n');
119
+ `,
120
+ )
121
+ .join('\n');
104
122
  }
105
123
 
106
124
  export function writeBenchmarkArtifacts({
@@ -114,9 +132,10 @@ export function writeBenchmarkArtifacts({
114
132
  }) {
115
133
  mkdirSync(outputDir, { recursive: true });
116
134
 
117
- const localChartTitle = summary.local?.overall_scope === 'comparable_suites'
118
- ? 'Audrey vs Comparable Local Memory Baselines'
119
- : 'Selected Audrey Regression Suite';
135
+ const localChartTitle =
136
+ summary.local?.overall_scope === 'comparable_suites'
137
+ ? 'Audrey vs Comparable Local Memory Baselines'
138
+ : 'Selected Audrey Regression Suite';
120
139
  const localChart = renderBarChart({
121
140
  title: localChartTitle,
122
141
  rows: localOverall.map(row => ({ label: row.system, value: row.scorePercent })),
@@ -162,8 +181,10 @@ export function writeBenchmarkArtifacts({
162
181
  operationsReadmeChart,
163
182
  renderBarChart({
164
183
  title: 'Audrey Memory Operations Benchmark',
165
- rows: (localSuites.find(suite => suite.id === 'operations')?.overall || [])
166
- .map(row => ({ label: row.system, value: row.scorePercent })),
184
+ rows: (localSuites.find(suite => suite.id === 'operations')?.overall || []).map(row => ({
185
+ label: row.system,
186
+ value: row.scorePercent,
187
+ })),
167
188
  }),
168
189
  'utf8',
169
190
  );
@@ -3,31 +3,46 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
3
3
  import { basename, dirname, resolve } from 'node:path';
4
4
  import { fileURLToPath } from 'node:url';
5
5
  import { writeGuardBenchConformanceCard } from './create-conformance-card.mjs';
6
- import { computeGuardBenchArtifactHashes, validateGuardBenchArtifacts } from './validate-guardbench-artifacts.mjs';
6
+ import {
7
+ computeGuardBenchArtifactHashes,
8
+ validateGuardBenchArtifacts,
9
+ } from './validate-guardbench-artifacts.mjs';
7
10
  import { publicArtifactValue } from './public-paths.mjs';
8
11
 
9
12
  const ROOT = resolve(dirname(fileURLToPath(import.meta.url)), '..');
10
13
  const KNOWN_ADAPTERS = new Map([
11
- ['mem0', {
12
- name: 'mem0-platform',
13
- path: 'benchmarks/adapters/mem0-platform.mjs',
14
- requiredEnv: ['MEM0_API_KEY'],
15
- }],
16
- ['mem0-platform', {
17
- name: 'mem0-platform',
18
- path: 'benchmarks/adapters/mem0-platform.mjs',
19
- requiredEnv: ['MEM0_API_KEY'],
20
- }],
21
- ['zep', {
22
- name: 'zep-cloud',
23
- path: 'benchmarks/adapters/zep-cloud.mjs',
24
- requiredEnv: ['ZEP_API_KEY'],
25
- }],
26
- ['zep-cloud', {
27
- name: 'zep-cloud',
28
- path: 'benchmarks/adapters/zep-cloud.mjs',
29
- requiredEnv: ['ZEP_API_KEY'],
30
- }],
14
+ [
15
+ 'mem0',
16
+ {
17
+ name: 'mem0-platform',
18
+ path: 'benchmarks/adapters/mem0-platform.mjs',
19
+ requiredEnv: ['MEM0_API_KEY'],
20
+ },
21
+ ],
22
+ [
23
+ 'mem0-platform',
24
+ {
25
+ name: 'mem0-platform',
26
+ path: 'benchmarks/adapters/mem0-platform.mjs',
27
+ requiredEnv: ['MEM0_API_KEY'],
28
+ },
29
+ ],
30
+ [
31
+ 'zep',
32
+ {
33
+ name: 'zep-cloud',
34
+ path: 'benchmarks/adapters/zep-cloud.mjs',
35
+ requiredEnv: ['ZEP_API_KEY'],
36
+ },
37
+ ],
38
+ [
39
+ 'zep-cloud',
40
+ {
41
+ name: 'zep-cloud',
42
+ path: 'benchmarks/adapters/zep-cloud.mjs',
43
+ requiredEnv: ['ZEP_API_KEY'],
44
+ },
45
+ ],
31
46
  ]);
32
47
 
33
48
  export function parseExternalArgs(argv = process.argv.slice(2)) {
@@ -127,13 +142,19 @@ export function evaluateAdapterConformance(summary, adapterName) {
127
142
  .filter(row => row.system === resolvedAdapterName);
128
143
 
129
144
  if (adapterRows.length !== expectedScenarios) {
130
- failures.push(`Adapter ${resolvedAdapterName} returned ${adapterRows.length}/${expectedScenarios} scenario rows`);
145
+ failures.push(
146
+ `Adapter ${resolvedAdapterName} returned ${adapterRows.length}/${expectedScenarios} scenario rows`,
147
+ );
131
148
  }
132
149
  if (systemSummary && systemSummary.scenarios !== expectedScenarios) {
133
- failures.push(`Adapter ${resolvedAdapterName} system summary has ${systemSummary.scenarios}/${expectedScenarios} scenarios`);
150
+ failures.push(
151
+ `Adapter ${resolvedAdapterName} system summary has ${systemSummary.scenarios}/${expectedScenarios} scenarios`,
152
+ );
134
153
  }
135
154
  if (systemSummary && systemSummary.redactionLeaks !== 0) {
136
- failures.push(`Adapter ${resolvedAdapterName} leaked ${systemSummary.redactionLeaks} seeded secret(s) in decision output`);
155
+ failures.push(
156
+ `Adapter ${resolvedAdapterName} leaked ${systemSummary.redactionLeaks} seeded secret(s) in decision output`,
157
+ );
137
158
  }
138
159
  if (adapterRows.some(row => row.external !== true)) {
139
160
  failures.push(`Adapter ${resolvedAdapterName} rows are not marked external`);
@@ -270,7 +291,8 @@ async function main() {
270
291
  const card = child.status === 0 ? writeGuardBenchConformanceCard({ dir: run.outDir }) : null;
271
292
  console.log(`External GuardBench metadata: ${metadataPath}`);
272
293
  if (card) console.log(`External GuardBench conformance card: ${card.path}`);
273
- process.exitCode = child.status === 0 && validation.ok && adapterConformance.ok ? 0 : (child.status ?? 1);
294
+ process.exitCode =
295
+ child.status === 0 && validation.ok && adapterConformance.ok ? 0 : (child.status ?? 1);
274
296
  }
275
297
 
276
298
  if (process.argv[1] && process.argv[1].endsWith('run-external-guardbench.mjs')) {
package/benchmarks/run.js CHANGED
@@ -68,7 +68,9 @@ function normalizeSuiteSelection(value = 'all') {
68
68
 
69
69
  const invalid = selected.filter(token => !ALL_SUITE_IDS.includes(token));
70
70
  if (invalid.length > 0) {
71
- throw new Error(`Unknown benchmark suite(s): ${invalid.join(', ')}. Valid: all, ${ALL_SUITE_IDS.join(', ')}`);
71
+ throw new Error(
72
+ `Unknown benchmark suite(s): ${invalid.join(', ')}. Valid: all, ${ALL_SUITE_IDS.join(', ')}`,
73
+ );
72
74
  }
73
75
  return [...new Set(selected)];
74
76
  }
@@ -94,12 +96,19 @@ function evaluateCase(benchmarkCase, results) {
94
96
  const expected = (benchmarkCase.expectAny || []).map(normalize);
95
97
  const required = (benchmarkCase.expectAll || []).map(normalize);
96
98
  const forbidden = (benchmarkCase.forbid || []).map(normalize);
97
- const firstMatchIndex = expected.length === 0
98
- ? -1
99
- : normalizedContents.findIndex(content => expected.some(expectation => content.includes(expectation)));
100
- const firstForbiddenIndex = normalizedContents.findIndex(content => forbidden.some(blocked => content.includes(blocked)));
99
+ const firstMatchIndex =
100
+ expected.length === 0
101
+ ? -1
102
+ : normalizedContents.findIndex(content =>
103
+ expected.some(expectation => content.includes(expectation)),
104
+ );
105
+ const firstForbiddenIndex = normalizedContents.findIndex(content =>
106
+ forbidden.some(blocked => content.includes(blocked)),
107
+ );
101
108
  const matched = firstMatchIndex !== -1;
102
- const requiredMatches = required.filter(expectation => normalizedContents.some(content => content.includes(expectation)));
109
+ const requiredMatches = required.filter(expectation =>
110
+ normalizedContents.some(content => content.includes(expectation)),
111
+ );
103
112
  const matchedRequired = required.length > 0 && requiredMatches.length === required.length;
104
113
  const leakedForbidden = firstForbiddenIndex !== -1;
105
114
 
@@ -108,16 +117,21 @@ function evaluateCase(benchmarkCase, results) {
108
117
  return {
109
118
  passed: score === 1,
110
119
  score,
111
- summary: leakedForbidden ? 'leaked restricted content' : results.length === 0 ? 'correct abstention' : 'no leak, but retrieved tangential context',
120
+ summary: leakedForbidden
121
+ ? 'leaked restricted content'
122
+ : results.length === 0
123
+ ? 'correct abstention'
124
+ : 'no leak, but retrieved tangential context',
112
125
  };
113
126
  }
114
127
 
115
128
  if (required.length > 0) {
116
- const score = matchedRequired && !leakedForbidden
117
- ? 1
118
- : leakedForbidden
119
- ? 0
120
- : Math.min(0.5, requiredMatches.length / required.length);
129
+ const score =
130
+ matchedRequired && !leakedForbidden
131
+ ? 1
132
+ : leakedForbidden
133
+ ? 0
134
+ : Math.min(0.5, requiredMatches.length / required.length);
121
135
  const missing = required.filter(expectation => !requiredMatches.includes(expectation));
122
136
  return {
123
137
  passed: score === 1,
@@ -154,7 +168,9 @@ async function seedRetrievalCase(brain, benchmarkCase) {
154
168
  const ids = [];
155
169
  for (let index = 0; index < benchmarkCase.memory.length; index++) {
156
170
  const memory = benchmarkCase.memory[index];
157
- const supersedes = Number.isInteger(memory.supersedesIndex) ? ids[memory.supersedesIndex] : undefined;
171
+ const supersedes = Number.isInteger(memory.supersedesIndex)
172
+ ? ids[memory.supersedesIndex]
173
+ : undefined;
158
174
  const id = await brain.encode({
159
175
  content: memory.content,
160
176
  source: memory.source,
@@ -264,7 +280,9 @@ async function executeGuardStep(brain, step, refs) {
264
280
  if (step.type === 'expectGuardAfterError') {
265
281
  const receiptId = step.receiptRef ? refs.get(step.receiptRef) : step.receiptId;
266
282
  if (!receiptId) {
267
- throw new Error(`Missing guard benchmark receipt reference: ${step.receiptRef || step.receiptId}`);
283
+ throw new Error(
284
+ `Missing guard benchmark receipt reference: ${step.receiptRef || step.receiptId}`,
285
+ );
268
286
  }
269
287
 
270
288
  try {
@@ -278,15 +296,19 @@ async function executeGuardStep(brain, step, refs) {
278
296
  } catch (err) {
279
297
  const message = err instanceof Error ? err.message : String(err);
280
298
  if (step.errorIncludes && !message.includes(step.errorIncludes)) {
281
- throw new Error(`Guard hardening expected "${step.errorIncludes}" but got "${message}"`);
299
+ throw new Error(`Guard hardening expected "${step.errorIncludes}" but got "${message}"`, {
300
+ cause: err,
301
+ });
282
302
  }
283
303
  const label = step.label ?? 'after_error_rejected';
284
- return [{
285
- id: `${receiptId}:${label}`,
286
- content: `guard_hardened:${label} error:${message}`,
287
- type: 'guard_hardening',
288
- score: 1,
289
- }];
304
+ return [
305
+ {
306
+ id: `${receiptId}:${label}`,
307
+ content: `guard_hardened:${label} error:${message}`,
308
+ type: 'guard_hardening',
309
+ score: 1,
310
+ },
311
+ ];
290
312
  }
291
313
 
292
314
  throw new Error(`Guard hardening expected an error for receipt ${receiptId}`);
@@ -299,18 +321,20 @@ async function seedGuardCase(brain, benchmarkCase) {
299
321
  const refs = new Map();
300
322
  const diagnostics = [];
301
323
  for (const step of benchmarkCase.steps || []) {
302
- diagnostics.push(...await executeGuardStep(brain, step, refs));
324
+ diagnostics.push(...(await executeGuardStep(brain, step, refs)));
303
325
  }
304
326
  return diagnostics;
305
327
  }
306
328
 
307
329
  function guardDecisionRows(decision) {
308
- const rows = [{
309
- id: decision.receipt_id,
310
- content: `decision:${decision.decision} verdict:${decision.verdict} risk:${decision.risk_score} ${decision.summary}`,
311
- type: 'guard_decision',
312
- score: 1,
313
- }];
330
+ const rows = [
331
+ {
332
+ id: decision.receipt_id,
333
+ content: `decision:${decision.decision} verdict:${decision.verdict} risk:${decision.risk_score} ${decision.summary}`,
334
+ type: 'guard_decision',
335
+ score: 1,
336
+ },
337
+ ];
314
338
 
315
339
  for (const [index, warning] of decision.warnings.entries()) {
316
340
  rows.push({
@@ -380,12 +404,15 @@ async function runAudreyCase(benchmarkCase, providerConfig) {
380
404
 
381
405
  async function runBaselineCase(system, benchmarkCase, providerConfig) {
382
406
  if (benchmarkCase.kind === 'guard') {
383
- return [{
384
- id: `${system.toLowerCase().replace(/[^a-z0-9]+/g, '-')}-guard-baseline`,
385
- content: 'decision:go verdict:clear summary:retrieval-only baseline has no before-action guard controller',
386
- type: 'guard_decision',
387
- score: 0,
388
- }];
407
+ return [
408
+ {
409
+ id: `${system.toLowerCase().replace(/[^a-z0-9]+/g, '-')}-guard-baseline`,
410
+ content:
411
+ 'decision:go verdict:clear summary:retrieval-only baseline has no before-action guard controller',
412
+ type: 'guard_decision',
413
+ score: 0,
414
+ },
415
+ ];
389
416
  }
390
417
 
391
418
  return runBaselineScenario(system, benchmarkCase, providerConfig, 5);
@@ -394,9 +421,18 @@ async function runBaselineCase(system, benchmarkCase, providerConfig) {
394
421
  async function runSystemsForCase(benchmarkCase, providerConfig) {
395
422
  const systems = [
396
423
  { system: 'Audrey', run: () => runAudreyCase(benchmarkCase, providerConfig) },
397
- { system: 'Vector Only', run: () => runBaselineCase('Vector Only', benchmarkCase, providerConfig) },
398
- { system: 'Keyword + Recency', run: () => runBaselineCase('Keyword + Recency', benchmarkCase, providerConfig) },
399
- { system: 'Recent Window', run: () => runBaselineCase('Recent Window', benchmarkCase, providerConfig) },
424
+ {
425
+ system: 'Vector Only',
426
+ run: () => runBaselineCase('Vector Only', benchmarkCase, providerConfig),
427
+ },
428
+ {
429
+ system: 'Keyword + Recency',
430
+ run: () => runBaselineCase('Keyword + Recency', benchmarkCase, providerConfig),
431
+ },
432
+ {
433
+ system: 'Recent Window',
434
+ run: () => runBaselineCase('Recent Window', benchmarkCase, providerConfig),
435
+ },
400
436
  ];
401
437
 
402
438
  const results = [];
@@ -504,13 +540,13 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
504
540
 
505
541
  if (audrey.scorePercent < settings.minAudreyScore) {
506
542
  failures.push(
507
- `Audrey score ${audrey.scorePercent.toFixed(1)}% fell below ${settings.minAudreyScore.toFixed(1)}%.`
543
+ `Audrey score ${audrey.scorePercent.toFixed(1)}% fell below ${settings.minAudreyScore.toFixed(1)}%.`,
508
544
  );
509
545
  }
510
546
 
511
547
  if (audrey.passRate < settings.minAudreyPassRate) {
512
548
  failures.push(
513
- `Audrey pass rate ${audrey.passRate.toFixed(1)}% fell below ${settings.minAudreyPassRate.toFixed(1)}%.`
549
+ `Audrey pass rate ${audrey.passRate.toFixed(1)}% fell below ${settings.minAudreyPassRate.toFixed(1)}%.`,
514
550
  );
515
551
  }
516
552
 
@@ -518,8 +554,8 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
518
554
  const margin = audrey.scorePercent - strongestBaseline.scorePercent;
519
555
  if (margin < settings.minMarginOverBaseline) {
520
556
  failures.push(
521
- `Audrey beat ${strongestBaseline.system} by ${margin.toFixed(1)} points, below the required `
522
- + `${settings.minMarginOverBaseline.toFixed(1)}-point margin.`
557
+ `Audrey beat ${strongestBaseline.system} by ${margin.toFixed(1)} points, below the required ` +
558
+ `${settings.minMarginOverBaseline.toFixed(1)}-point margin.`,
523
559
  );
524
560
  }
525
561
  }
@@ -531,7 +567,9 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
531
567
  return {
532
568
  audrey,
533
569
  strongestBaseline,
534
- marginOverBaseline: strongestBaseline ? audrey.scorePercent - strongestBaseline.scorePercent : null,
570
+ marginOverBaseline: strongestBaseline
571
+ ? audrey.scorePercent - strongestBaseline.scorePercent
572
+ : null,
535
573
  thresholds: settings,
536
574
  };
537
575
  }
@@ -563,7 +601,9 @@ export async function runBenchmarkSuite(options = {}) {
563
601
  }
564
602
  }
565
603
 
566
- const comparableCaseResults = caseResults.filter(caseResult => caseResult.comparable_to_baselines);
604
+ const comparableCaseResults = caseResults.filter(
605
+ caseResult => caseResult.comparable_to_baselines,
606
+ );
567
607
  const overallCaseResults = comparableCaseResults.length > 0 ? comparableCaseResults : caseResults;
568
608
  const overallScope = comparableCaseResults.length > 0 ? 'comparable_suites' : 'selected_suites';
569
609
  const overallSuiteIds = [...new Set(overallCaseResults.map(caseResult => caseResult.suite))];
@@ -579,10 +619,14 @@ export async function runBenchmarkSuite(options = {}) {
579
619
  suites: suiteIds,
580
620
  },
581
621
  methodology: {
582
- localBenchmark: 'Local regression suite inspired by LongMemEval-style retrieval, operation-level lifecycle, and agent guard-loop benchmarks',
583
- retrievalBenchmark: 'Information extraction, updates, reasoning, procedural learning, privacy, abstention, and conflict handling',
584
- operationsBenchmark: 'Update, overwrite, delete, merge, and abstention behavior after lifecycle operations',
585
- guardBenchmark: 'Memory-before-action controller behavior: receipts, learned tool-failure cautions, strict blocking reflexes, and guard-after hardening',
622
+ localBenchmark:
623
+ 'Local regression suite inspired by LongMemEval-style retrieval, operation-level lifecycle, and agent guard-loop benchmarks',
624
+ retrievalBenchmark:
625
+ 'Information extraction, updates, reasoning, procedural learning, privacy, abstention, and conflict handling',
626
+ operationsBenchmark:
627
+ 'Update, overwrite, delete, merge, and abstention behavior after lifecycle operations',
628
+ guardBenchmark:
629
+ 'Memory-before-action controller behavior: receipts, learned tool-failure cautions, strict blocking reflexes, and guard-after hardening',
586
630
  externalLeaderboard: 'Published LoCoMo scores from official papers and project blogs',
587
631
  },
588
632
  local: {
@@ -615,10 +659,10 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
615
659
  });
616
660
  const gate = args.check
617
661
  ? assertBenchmarkGuardrails(summary, {
618
- minAudreyScore: args.minAudreyScore,
619
- minAudreyPassRate: args.minAudreyPassRate,
620
- minMarginOverBaseline: args.minMarginOverBaseline,
621
- })
662
+ minAudreyScore: args.minAudreyScore,
663
+ minAudreyPassRate: args.minAudreyPassRate,
664
+ minMarginOverBaseline: args.minMarginOverBaseline,
665
+ })
622
666
  : null;
623
667
 
624
668
  if (args.jsonOnly) {
@@ -629,15 +673,22 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
629
673
  const lines = [];
630
674
  lines.push('Audrey benchmark complete.');
631
675
  lines.push('');
632
- lines.push(`Suites: ${summary.config.suites.map(suiteId => SUITE_LABELS.get(suiteId) || suiteId).join(', ')}`);
633
- lines.push(`Scope: ${summary.local.overall_scope} (${summary.local.overall_suite_ids.join(', ')})`);
634
- const comparableCaseCount = summary.local.cases
635
- .filter(testCase => summary.local.overall_suite_ids.includes(testCase.suite)).length;
636
- lines.push(`Cases: ${summary.local.cases.length} total; ${comparableCaseCount} in combined local chart`);
676
+ lines.push(
677
+ `Suites: ${summary.config.suites.map(suiteId => SUITE_LABELS.get(suiteId) || suiteId).join(', ')}`,
678
+ );
679
+ lines.push(
680
+ `Scope: ${summary.local.overall_scope} (${summary.local.overall_suite_ids.join(', ')})`,
681
+ );
682
+ const comparableCaseCount = summary.local.cases.filter(testCase =>
683
+ summary.local.overall_suite_ids.includes(testCase.suite),
684
+ ).length;
685
+ lines.push(
686
+ `Cases: ${summary.local.cases.length} total; ${comparableCaseCount} in combined local chart`,
687
+ );
637
688
  for (const row of summary.local.overall) {
638
689
  lines.push(
639
- `${row.system}: ${row.scorePercent.toFixed(1)}% score, ${row.passRate.toFixed(1)}% pass rate, `
640
- + `${row.avgDurationMs.toFixed(1)} ms avg/case`
690
+ `${row.system}: ${row.scorePercent.toFixed(1)}% score, ${row.passRate.toFixed(1)}% pass rate, ` +
691
+ `${row.avgDurationMs.toFixed(1)} ms avg/case`,
641
692
  );
642
693
  }
643
694
  lines.push('');
@@ -667,7 +718,9 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
667
718
  ? `${gate.strongestBaseline.system} by ${gate.marginOverBaseline.toFixed(1)} points`
668
719
  : 'all local baselines';
669
720
  lines.push('');
670
- lines.push(`Regression gate passed: Audrey stayed above ${gate.thresholds.minAudreyScore.toFixed(1)}% and ahead of ${baselineLabel}.`);
721
+ lines.push(
722
+ `Regression gate passed: Audrey stayed above ${gate.thresholds.minAudreyScore.toFixed(1)}% and ahead of ${baselineLabel}.`,
723
+ );
671
724
  }
672
725
 
673
726
  out(lines.join('\n'));
@@ -42,9 +42,10 @@ export async function validateAdapterModuleFile(options = {}) {
42
42
  } else {
43
43
  try {
44
44
  const mod = await import(pathToFileURL(adapterPath).href);
45
- const candidate = typeof mod.createGuardBenchAdapter === 'function'
46
- ? await mod.createGuardBenchAdapter()
47
- : mod.default ?? mod.adapter;
45
+ const candidate =
46
+ typeof mod.createGuardBenchAdapter === 'function'
47
+ ? await mod.createGuardBenchAdapter()
48
+ : (mod.default ?? mod.adapter);
48
49
  adapter = validateGuardBenchAdapter(candidate, adapterPath);
49
50
  } catch (error) {
50
51
  failures.push(error.message);
@@ -57,12 +58,12 @@ export async function validateAdapterModuleFile(options = {}) {
57
58
  moduleFile: basename(adapterPath),
58
59
  adapter: adapter
59
60
  ? {
60
- name: adapter.name,
61
- description: adapter.description ?? null,
62
- hasSetup: typeof adapter.setup === 'function',
63
- hasDecide: typeof adapter.decide === 'function',
64
- hasCleanup: typeof adapter.cleanup === 'function',
65
- }
61
+ name: adapter.name,
62
+ description: adapter.description ?? null,
63
+ hasSetup: typeof adapter.setup === 'function',
64
+ hasDecide: typeof adapter.decide === 'function',
65
+ hasCleanup: typeof adapter.cleanup === 'function',
66
+ }
66
67
  : null,
67
68
  contract: {
68
69
  moduleFormat: 'ESM',
@@ -87,7 +88,9 @@ async function main() {
87
88
  } else if (validation.ok) {
88
89
  console.log(`GuardBench adapter module validation passed: ${validation.adapterPath}`);
89
90
  console.log(`Adapter: ${validation.adapter.name}`);
90
- console.log(`Methods: setup=${validation.adapter.hasSetup}, decide=${validation.adapter.hasDecide}, cleanup=${validation.adapter.hasCleanup}`);
91
+ console.log(
92
+ `Methods: setup=${validation.adapter.hasSetup}, decide=${validation.adapter.hasDecide}, cleanup=${validation.adapter.hasCleanup}`,
93
+ );
91
94
  } else {
92
95
  console.error('GuardBench adapter module validation failed:');
93
96
  for (const failure of validation.failures) console.error(`- ${failure}`);