audrey 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. package/CHANGELOG.md +54 -0
  2. package/README.md +30 -6
  3. package/benchmarks/adapter-self-test.mjs +6 -2
  4. package/benchmarks/adapters/example-allow.mjs +5 -2
  5. package/benchmarks/adapters/mem0-platform.mjs +19 -12
  6. package/benchmarks/adapters/zep-cloud.mjs +51 -27
  7. package/benchmarks/baselines.js +11 -6
  8. package/benchmarks/build-leaderboard.mjs +36 -23
  9. package/benchmarks/cases.js +24 -12
  10. package/benchmarks/create-conformance-card.mjs +12 -3
  11. package/benchmarks/create-submission-bundle.mjs +22 -8
  12. package/benchmarks/dry-run-external-adapters.mjs +24 -12
  13. package/benchmarks/guardbench.js +354 -124
  14. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
  15. package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  16. package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  17. package/benchmarks/output/guardbench-conformance-card.json +12 -12
  18. package/benchmarks/output/guardbench-raw.json +243 -144
  19. package/benchmarks/output/guardbench-summary.json +354 -230
  20. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  21. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  22. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +12 -12
  23. package/benchmarks/output/submission-bundle/guardbench-raw.json +243 -144
  24. package/benchmarks/output/submission-bundle/guardbench-summary.json +354 -230
  25. package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +21 -1
  26. package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +23 -2
  27. package/benchmarks/output/submission-bundle/submission-manifest.json +15 -15
  28. package/benchmarks/output/submission-bundle/validation-report.json +1 -1
  29. package/benchmarks/output/summary.json +58 -58
  30. package/benchmarks/perf-snapshot.js +12 -9
  31. package/benchmarks/perf.bench.js +14 -6
  32. package/benchmarks/public-paths.mjs +11 -5
  33. package/benchmarks/reference-results.js +10 -5
  34. package/benchmarks/report.js +48 -27
  35. package/benchmarks/run-external-guardbench.mjs +47 -25
  36. package/benchmarks/run.js +112 -59
  37. package/benchmarks/schemas/guardbench-raw.schema.json +21 -1
  38. package/benchmarks/schemas/guardbench-summary.schema.json +23 -2
  39. package/benchmarks/validate-adapter-module.mjs +13 -10
  40. package/benchmarks/validate-adapter-registry.mjs +16 -5
  41. package/benchmarks/validate-guardbench-artifacts.mjs +76 -19
  42. package/benchmarks/verify-external-evidence.mjs +86 -31
  43. package/benchmarks/verify-publication-artifacts.mjs +34 -11
  44. package/benchmarks/verify-submission-bundle.mjs +9 -4
  45. package/dist/mcp-server/config.d.ts +1 -1
  46. package/dist/mcp-server/config.d.ts.map +1 -1
  47. package/dist/mcp-server/config.js +5 -3
  48. package/dist/mcp-server/config.js.map +1 -1
  49. package/dist/mcp-server/index.d.ts +4 -3
  50. package/dist/mcp-server/index.d.ts.map +1 -1
  51. package/dist/mcp-server/index.js +479 -172
  52. package/dist/mcp-server/index.js.map +1 -1
  53. package/dist/src/action-key.d.ts.map +1 -1
  54. package/dist/src/action-key.js +6 -2
  55. package/dist/src/action-key.js.map +1 -1
  56. package/dist/src/adaptive.d.ts.map +1 -1
  57. package/dist/src/adaptive.js +4 -2
  58. package/dist/src/adaptive.js.map +1 -1
  59. package/dist/src/affect.d.ts.map +1 -1
  60. package/dist/src/affect.js +8 -5
  61. package/dist/src/affect.js.map +1 -1
  62. package/dist/src/audrey.d.ts +11 -1
  63. package/dist/src/audrey.d.ts.map +1 -1
  64. package/dist/src/audrey.js +110 -53
  65. package/dist/src/audrey.js.map +1 -1
  66. package/dist/src/capsule.d.ts.map +1 -1
  67. package/dist/src/capsule.js +37 -15
  68. package/dist/src/capsule.js.map +1 -1
  69. package/dist/src/causal.d.ts +1 -1
  70. package/dist/src/causal.d.ts.map +1 -1
  71. package/dist/src/causal.js +4 -2
  72. package/dist/src/causal.js.map +1 -1
  73. package/dist/src/confidence.d.ts.map +1 -1
  74. package/dist/src/confidence.js +5 -5
  75. package/dist/src/confidence.js.map +1 -1
  76. package/dist/src/consolidate.d.ts.map +1 -1
  77. package/dist/src/consolidate.js +17 -9
  78. package/dist/src/consolidate.js.map +1 -1
  79. package/dist/src/context.js +1 -1
  80. package/dist/src/context.js.map +1 -1
  81. package/dist/src/controller.d.ts +17 -1
  82. package/dist/src/controller.d.ts.map +1 -1
  83. package/dist/src/controller.js +73 -23
  84. package/dist/src/controller.js.map +1 -1
  85. package/dist/src/db.d.ts.map +1 -1
  86. package/dist/src/db.js +78 -27
  87. package/dist/src/db.js.map +1 -1
  88. package/dist/src/decay.d.ts +1 -1
  89. package/dist/src/decay.d.ts.map +1 -1
  90. package/dist/src/decay.js +1 -1
  91. package/dist/src/decay.js.map +1 -1
  92. package/dist/src/embedding.d.ts +12 -4
  93. package/dist/src/embedding.d.ts.map +1 -1
  94. package/dist/src/embedding.js +18 -16
  95. package/dist/src/embedding.js.map +1 -1
  96. package/dist/src/encode.d.ts.map +1 -1
  97. package/dist/src/encode.js +5 -4
  98. package/dist/src/encode.js.map +1 -1
  99. package/dist/src/events.d.ts +3 -2
  100. package/dist/src/events.d.ts.map +1 -1
  101. package/dist/src/events.js +7 -3
  102. package/dist/src/events.js.map +1 -1
  103. package/dist/src/export.d.ts.map +1 -1
  104. package/dist/src/export.js +21 -7
  105. package/dist/src/export.js.map +1 -1
  106. package/dist/src/feedback.d.ts.map +1 -1
  107. package/dist/src/feedback.js +1 -1
  108. package/dist/src/feedback.js.map +1 -1
  109. package/dist/src/forget.d.ts.map +1 -1
  110. package/dist/src/forget.js +12 -6
  111. package/dist/src/forget.js.map +1 -1
  112. package/dist/src/fts.d.ts.map +1 -1
  113. package/dist/src/fts.js +20 -8
  114. package/dist/src/fts.js.map +1 -1
  115. package/dist/src/hybrid-recall.d.ts.map +1 -1
  116. package/dist/src/hybrid-recall.js +12 -6
  117. package/dist/src/hybrid-recall.js.map +1 -1
  118. package/dist/src/impact.d.ts.map +1 -1
  119. package/dist/src/impact.js +26 -10
  120. package/dist/src/impact.js.map +1 -1
  121. package/dist/src/import.d.ts.map +1 -1
  122. package/dist/src/import.js +11 -6
  123. package/dist/src/import.js.map +1 -1
  124. package/dist/src/index.d.ts +5 -4
  125. package/dist/src/index.d.ts.map +1 -1
  126. package/dist/src/index.js +4 -4
  127. package/dist/src/index.js.map +1 -1
  128. package/dist/src/interference.d.ts.map +1 -1
  129. package/dist/src/interference.js +10 -5
  130. package/dist/src/interference.js.map +1 -1
  131. package/dist/src/introspect.d.ts.map +1 -1
  132. package/dist/src/introspect.js +12 -6
  133. package/dist/src/introspect.js.map +1 -1
  134. package/dist/src/llm.d.ts +2 -2
  135. package/dist/src/llm.d.ts.map +1 -1
  136. package/dist/src/llm.js +6 -6
  137. package/dist/src/llm.js.map +1 -1
  138. package/dist/src/migrate.d.ts.map +1 -1
  139. package/dist/src/migrate.js +10 -4
  140. package/dist/src/migrate.js.map +1 -1
  141. package/dist/src/preflight.d.ts.map +1 -1
  142. package/dist/src/preflight.js +6 -8
  143. package/dist/src/preflight.js.map +1 -1
  144. package/dist/src/profile.d.ts.map +1 -1
  145. package/dist/src/profile.js.map +1 -1
  146. package/dist/src/promote.d.ts.map +1 -1
  147. package/dist/src/promote.js +16 -7
  148. package/dist/src/promote.js.map +1 -1
  149. package/dist/src/prompts.d.ts.map +1 -1
  150. package/dist/src/prompts.js +1 -2
  151. package/dist/src/prompts.js.map +1 -1
  152. package/dist/src/recall.d.ts.map +1 -1
  153. package/dist/src/recall.js +85 -18
  154. package/dist/src/recall.js.map +1 -1
  155. package/dist/src/redact.d.ts.map +1 -1
  156. package/dist/src/redact.js +9 -4
  157. package/dist/src/redact.js.map +1 -1
  158. package/dist/src/reflexes.d.ts.map +1 -1
  159. package/dist/src/reflexes.js +1 -7
  160. package/dist/src/reflexes.js.map +1 -1
  161. package/dist/src/rollback.d.ts.map +1 -1
  162. package/dist/src/rollback.js +4 -2
  163. package/dist/src/rollback.js.map +1 -1
  164. package/dist/src/routes.d.ts.map +1 -1
  165. package/dist/src/routes.js +37 -14
  166. package/dist/src/routes.js.map +1 -1
  167. package/dist/src/rules-compiler.d.ts.map +1 -1
  168. package/dist/src/rules-compiler.js +24 -2
  169. package/dist/src/rules-compiler.js.map +1 -1
  170. package/dist/src/server.js +2 -2
  171. package/dist/src/server.js.map +1 -1
  172. package/dist/src/tool-trace.d.ts +2 -2
  173. package/dist/src/tool-trace.d.ts.map +1 -1
  174. package/dist/src/tool-trace.js +12 -4
  175. package/dist/src/tool-trace.js.map +1 -1
  176. package/dist/src/types.d.ts.map +1 -1
  177. package/dist/src/ulid.js +1 -1
  178. package/dist/src/ulid.js.map +1 -1
  179. package/dist/src/utils.d.ts.map +1 -1
  180. package/dist/src/utils.js.map +1 -1
  181. package/dist/src/validate.d.ts.map +1 -1
  182. package/dist/src/validate.js +20 -10
  183. package/dist/src/validate.js.map +1 -1
  184. package/docs/paper/07-evaluation.md +5 -5
  185. package/docs/paper/audrey-paper-v1.md +6 -6
  186. package/docs/paper/evidence-ledger.md +1 -1
  187. package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  188. package/docs/paper/output/arxiv/main.tex +6 -6
  189. package/docs/paper/output/arxiv-compile-report.json +3 -3
  190. package/docs/paper/output/submission-bundle/README.md +30 -6
  191. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
  192. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  193. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  194. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +12 -12
  195. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +243 -144
  196. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +354 -230
  197. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  198. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  199. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +15 -15
  200. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
  201. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +52 -52
  202. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +21 -1
  203. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +23 -2
  204. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +5 -5
  205. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +6 -6
  206. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
  207. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  208. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +6 -6
  209. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
  210. package/docs/paper/output/submission-bundle/package.json +18 -5
  211. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +40 -40
  212. package/examples/fintech-ops-demo.js +12 -5
  213. package/examples/healthcare-ops-demo.js +8 -4
  214. package/examples/ollama-memory-agent.js +41 -13
  215. package/examples/stripe-demo.js +12 -5
  216. package/package.json +18 -5
  217. package/scripts/audit-release-completion.mjs +179 -101
  218. package/scripts/create-arxiv-source.mjs +20 -14
  219. package/scripts/create-paper-submission-bundle.mjs +6 -2
  220. package/scripts/finalize-release.mjs +111 -36
  221. package/scripts/prepare-release-cut.mjs +14 -6
  222. package/scripts/publish-release-bundle.mjs +62 -23
  223. package/scripts/publish-release-github-api.mjs +89 -24
  224. package/scripts/smoke-cli.js +26 -6
  225. package/scripts/sync-paper-artifacts.mjs +5 -1
  226. package/scripts/verify-arxiv-compile.mjs +52 -16
  227. package/scripts/verify-arxiv-source.mjs +45 -15
  228. package/scripts/verify-browser-launch-plan.mjs +28 -11
  229. package/scripts/verify-browser-launch-results.mjs +32 -14
  230. package/scripts/verify-paper-artifacts.mjs +539 -79
  231. package/scripts/verify-paper-claims.mjs +48 -20
  232. package/scripts/verify-paper-submission-bundle.mjs +22 -11
  233. package/scripts/verify-publication-pack.mjs +23 -9
  234. package/scripts/verify-release-readiness.mjs +250 -71
package/benchmarks/run.js CHANGED
@@ -68,7 +68,9 @@ function normalizeSuiteSelection(value = 'all') {
68
68
 
69
69
  const invalid = selected.filter(token => !ALL_SUITE_IDS.includes(token));
70
70
  if (invalid.length > 0) {
71
- throw new Error(`Unknown benchmark suite(s): ${invalid.join(', ')}. Valid: all, ${ALL_SUITE_IDS.join(', ')}`);
71
+ throw new Error(
72
+ `Unknown benchmark suite(s): ${invalid.join(', ')}. Valid: all, ${ALL_SUITE_IDS.join(', ')}`,
73
+ );
72
74
  }
73
75
  return [...new Set(selected)];
74
76
  }
@@ -94,12 +96,19 @@ function evaluateCase(benchmarkCase, results) {
94
96
  const expected = (benchmarkCase.expectAny || []).map(normalize);
95
97
  const required = (benchmarkCase.expectAll || []).map(normalize);
96
98
  const forbidden = (benchmarkCase.forbid || []).map(normalize);
97
- const firstMatchIndex = expected.length === 0
98
- ? -1
99
- : normalizedContents.findIndex(content => expected.some(expectation => content.includes(expectation)));
100
- const firstForbiddenIndex = normalizedContents.findIndex(content => forbidden.some(blocked => content.includes(blocked)));
99
+ const firstMatchIndex =
100
+ expected.length === 0
101
+ ? -1
102
+ : normalizedContents.findIndex(content =>
103
+ expected.some(expectation => content.includes(expectation)),
104
+ );
105
+ const firstForbiddenIndex = normalizedContents.findIndex(content =>
106
+ forbidden.some(blocked => content.includes(blocked)),
107
+ );
101
108
  const matched = firstMatchIndex !== -1;
102
- const requiredMatches = required.filter(expectation => normalizedContents.some(content => content.includes(expectation)));
109
+ const requiredMatches = required.filter(expectation =>
110
+ normalizedContents.some(content => content.includes(expectation)),
111
+ );
103
112
  const matchedRequired = required.length > 0 && requiredMatches.length === required.length;
104
113
  const leakedForbidden = firstForbiddenIndex !== -1;
105
114
 
@@ -108,16 +117,21 @@ function evaluateCase(benchmarkCase, results) {
108
117
  return {
109
118
  passed: score === 1,
110
119
  score,
111
- summary: leakedForbidden ? 'leaked restricted content' : results.length === 0 ? 'correct abstention' : 'no leak, but retrieved tangential context',
120
+ summary: leakedForbidden
121
+ ? 'leaked restricted content'
122
+ : results.length === 0
123
+ ? 'correct abstention'
124
+ : 'no leak, but retrieved tangential context',
112
125
  };
113
126
  }
114
127
 
115
128
  if (required.length > 0) {
116
- const score = matchedRequired && !leakedForbidden
117
- ? 1
118
- : leakedForbidden
119
- ? 0
120
- : Math.min(0.5, requiredMatches.length / required.length);
129
+ const score =
130
+ matchedRequired && !leakedForbidden
131
+ ? 1
132
+ : leakedForbidden
133
+ ? 0
134
+ : Math.min(0.5, requiredMatches.length / required.length);
121
135
  const missing = required.filter(expectation => !requiredMatches.includes(expectation));
122
136
  return {
123
137
  passed: score === 1,
@@ -154,7 +168,9 @@ async function seedRetrievalCase(brain, benchmarkCase) {
154
168
  const ids = [];
155
169
  for (let index = 0; index < benchmarkCase.memory.length; index++) {
156
170
  const memory = benchmarkCase.memory[index];
157
- const supersedes = Number.isInteger(memory.supersedesIndex) ? ids[memory.supersedesIndex] : undefined;
171
+ const supersedes = Number.isInteger(memory.supersedesIndex)
172
+ ? ids[memory.supersedesIndex]
173
+ : undefined;
158
174
  const id = await brain.encode({
159
175
  content: memory.content,
160
176
  source: memory.source,
@@ -264,7 +280,9 @@ async function executeGuardStep(brain, step, refs) {
264
280
  if (step.type === 'expectGuardAfterError') {
265
281
  const receiptId = step.receiptRef ? refs.get(step.receiptRef) : step.receiptId;
266
282
  if (!receiptId) {
267
- throw new Error(`Missing guard benchmark receipt reference: ${step.receiptRef || step.receiptId}`);
283
+ throw new Error(
284
+ `Missing guard benchmark receipt reference: ${step.receiptRef || step.receiptId}`,
285
+ );
268
286
  }
269
287
 
270
288
  try {
@@ -278,15 +296,19 @@ async function executeGuardStep(brain, step, refs) {
278
296
  } catch (err) {
279
297
  const message = err instanceof Error ? err.message : String(err);
280
298
  if (step.errorIncludes && !message.includes(step.errorIncludes)) {
281
- throw new Error(`Guard hardening expected "${step.errorIncludes}" but got "${message}"`);
299
+ throw new Error(`Guard hardening expected "${step.errorIncludes}" but got "${message}"`, {
300
+ cause: err,
301
+ });
282
302
  }
283
303
  const label = step.label ?? 'after_error_rejected';
284
- return [{
285
- id: `${receiptId}:${label}`,
286
- content: `guard_hardened:${label} error:${message}`,
287
- type: 'guard_hardening',
288
- score: 1,
289
- }];
304
+ return [
305
+ {
306
+ id: `${receiptId}:${label}`,
307
+ content: `guard_hardened:${label} error:${message}`,
308
+ type: 'guard_hardening',
309
+ score: 1,
310
+ },
311
+ ];
290
312
  }
291
313
 
292
314
  throw new Error(`Guard hardening expected an error for receipt ${receiptId}`);
@@ -299,18 +321,20 @@ async function seedGuardCase(brain, benchmarkCase) {
299
321
  const refs = new Map();
300
322
  const diagnostics = [];
301
323
  for (const step of benchmarkCase.steps || []) {
302
- diagnostics.push(...await executeGuardStep(brain, step, refs));
324
+ diagnostics.push(...(await executeGuardStep(brain, step, refs)));
303
325
  }
304
326
  return diagnostics;
305
327
  }
306
328
 
307
329
  function guardDecisionRows(decision) {
308
- const rows = [{
309
- id: decision.receipt_id,
310
- content: `decision:${decision.decision} verdict:${decision.verdict} risk:${decision.risk_score} ${decision.summary}`,
311
- type: 'guard_decision',
312
- score: 1,
313
- }];
330
+ const rows = [
331
+ {
332
+ id: decision.receipt_id,
333
+ content: `decision:${decision.decision} verdict:${decision.verdict} risk:${decision.risk_score} ${decision.summary}`,
334
+ type: 'guard_decision',
335
+ score: 1,
336
+ },
337
+ ];
314
338
 
315
339
  for (const [index, warning] of decision.warnings.entries()) {
316
340
  rows.push({
@@ -380,12 +404,15 @@ async function runAudreyCase(benchmarkCase, providerConfig) {
380
404
 
381
405
  async function runBaselineCase(system, benchmarkCase, providerConfig) {
382
406
  if (benchmarkCase.kind === 'guard') {
383
- return [{
384
- id: `${system.toLowerCase().replace(/[^a-z0-9]+/g, '-')}-guard-baseline`,
385
- content: 'decision:go verdict:clear summary:retrieval-only baseline has no before-action guard controller',
386
- type: 'guard_decision',
387
- score: 0,
388
- }];
407
+ return [
408
+ {
409
+ id: `${system.toLowerCase().replace(/[^a-z0-9]+/g, '-')}-guard-baseline`,
410
+ content:
411
+ 'decision:go verdict:clear summary:retrieval-only baseline has no before-action guard controller',
412
+ type: 'guard_decision',
413
+ score: 0,
414
+ },
415
+ ];
389
416
  }
390
417
 
391
418
  return runBaselineScenario(system, benchmarkCase, providerConfig, 5);
@@ -394,9 +421,18 @@ async function runBaselineCase(system, benchmarkCase, providerConfig) {
394
421
  async function runSystemsForCase(benchmarkCase, providerConfig) {
395
422
  const systems = [
396
423
  { system: 'Audrey', run: () => runAudreyCase(benchmarkCase, providerConfig) },
397
- { system: 'Vector Only', run: () => runBaselineCase('Vector Only', benchmarkCase, providerConfig) },
398
- { system: 'Keyword + Recency', run: () => runBaselineCase('Keyword + Recency', benchmarkCase, providerConfig) },
399
- { system: 'Recent Window', run: () => runBaselineCase('Recent Window', benchmarkCase, providerConfig) },
424
+ {
425
+ system: 'Vector Only',
426
+ run: () => runBaselineCase('Vector Only', benchmarkCase, providerConfig),
427
+ },
428
+ {
429
+ system: 'Keyword + Recency',
430
+ run: () => runBaselineCase('Keyword + Recency', benchmarkCase, providerConfig),
431
+ },
432
+ {
433
+ system: 'Recent Window',
434
+ run: () => runBaselineCase('Recent Window', benchmarkCase, providerConfig),
435
+ },
400
436
  ];
401
437
 
402
438
  const results = [];
@@ -504,13 +540,13 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
504
540
 
505
541
  if (audrey.scorePercent < settings.minAudreyScore) {
506
542
  failures.push(
507
- `Audrey score ${audrey.scorePercent.toFixed(1)}% fell below ${settings.minAudreyScore.toFixed(1)}%.`
543
+ `Audrey score ${audrey.scorePercent.toFixed(1)}% fell below ${settings.minAudreyScore.toFixed(1)}%.`,
508
544
  );
509
545
  }
510
546
 
511
547
  if (audrey.passRate < settings.minAudreyPassRate) {
512
548
  failures.push(
513
- `Audrey pass rate ${audrey.passRate.toFixed(1)}% fell below ${settings.minAudreyPassRate.toFixed(1)}%.`
549
+ `Audrey pass rate ${audrey.passRate.toFixed(1)}% fell below ${settings.minAudreyPassRate.toFixed(1)}%.`,
514
550
  );
515
551
  }
516
552
 
@@ -518,8 +554,8 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
518
554
  const margin = audrey.scorePercent - strongestBaseline.scorePercent;
519
555
  if (margin < settings.minMarginOverBaseline) {
520
556
  failures.push(
521
- `Audrey beat ${strongestBaseline.system} by ${margin.toFixed(1)} points, below the required `
522
- + `${settings.minMarginOverBaseline.toFixed(1)}-point margin.`
557
+ `Audrey beat ${strongestBaseline.system} by ${margin.toFixed(1)} points, below the required ` +
558
+ `${settings.minMarginOverBaseline.toFixed(1)}-point margin.`,
523
559
  );
524
560
  }
525
561
  }
@@ -531,7 +567,9 @@ export function assertBenchmarkGuardrails(summary, options = {}) {
531
567
  return {
532
568
  audrey,
533
569
  strongestBaseline,
534
- marginOverBaseline: strongestBaseline ? audrey.scorePercent - strongestBaseline.scorePercent : null,
570
+ marginOverBaseline: strongestBaseline
571
+ ? audrey.scorePercent - strongestBaseline.scorePercent
572
+ : null,
535
573
  thresholds: settings,
536
574
  };
537
575
  }
@@ -563,7 +601,9 @@ export async function runBenchmarkSuite(options = {}) {
563
601
  }
564
602
  }
565
603
 
566
- const comparableCaseResults = caseResults.filter(caseResult => caseResult.comparable_to_baselines);
604
+ const comparableCaseResults = caseResults.filter(
605
+ caseResult => caseResult.comparable_to_baselines,
606
+ );
567
607
  const overallCaseResults = comparableCaseResults.length > 0 ? comparableCaseResults : caseResults;
568
608
  const overallScope = comparableCaseResults.length > 0 ? 'comparable_suites' : 'selected_suites';
569
609
  const overallSuiteIds = [...new Set(overallCaseResults.map(caseResult => caseResult.suite))];
@@ -579,10 +619,14 @@ export async function runBenchmarkSuite(options = {}) {
579
619
  suites: suiteIds,
580
620
  },
581
621
  methodology: {
582
- localBenchmark: 'Local regression suite inspired by LongMemEval-style retrieval, operation-level lifecycle, and agent guard-loop benchmarks',
583
- retrievalBenchmark: 'Information extraction, updates, reasoning, procedural learning, privacy, abstention, and conflict handling',
584
- operationsBenchmark: 'Update, overwrite, delete, merge, and abstention behavior after lifecycle operations',
585
- guardBenchmark: 'Memory-before-action controller behavior: receipts, learned tool-failure cautions, strict blocking reflexes, and guard-after hardening',
622
+ localBenchmark:
623
+ 'Local regression suite inspired by LongMemEval-style retrieval, operation-level lifecycle, and agent guard-loop benchmarks',
624
+ retrievalBenchmark:
625
+ 'Information extraction, updates, reasoning, procedural learning, privacy, abstention, and conflict handling',
626
+ operationsBenchmark:
627
+ 'Update, overwrite, delete, merge, and abstention behavior after lifecycle operations',
628
+ guardBenchmark:
629
+ 'Memory-before-action controller behavior: receipts, learned tool-failure cautions, strict blocking reflexes, and guard-after hardening',
586
630
  externalLeaderboard: 'Published LoCoMo scores from official papers and project blogs',
587
631
  },
588
632
  local: {
@@ -615,10 +659,10 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
615
659
  });
616
660
  const gate = args.check
617
661
  ? assertBenchmarkGuardrails(summary, {
618
- minAudreyScore: args.minAudreyScore,
619
- minAudreyPassRate: args.minAudreyPassRate,
620
- minMarginOverBaseline: args.minMarginOverBaseline,
621
- })
662
+ minAudreyScore: args.minAudreyScore,
663
+ minAudreyPassRate: args.minAudreyPassRate,
664
+ minMarginOverBaseline: args.minMarginOverBaseline,
665
+ })
622
666
  : null;
623
667
 
624
668
  if (args.jsonOnly) {
@@ -629,15 +673,22 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
629
673
  const lines = [];
630
674
  lines.push('Audrey benchmark complete.');
631
675
  lines.push('');
632
- lines.push(`Suites: ${summary.config.suites.map(suiteId => SUITE_LABELS.get(suiteId) || suiteId).join(', ')}`);
633
- lines.push(`Scope: ${summary.local.overall_scope} (${summary.local.overall_suite_ids.join(', ')})`);
634
- const comparableCaseCount = summary.local.cases
635
- .filter(testCase => summary.local.overall_suite_ids.includes(testCase.suite)).length;
636
- lines.push(`Cases: ${summary.local.cases.length} total; ${comparableCaseCount} in combined local chart`);
676
+ lines.push(
677
+ `Suites: ${summary.config.suites.map(suiteId => SUITE_LABELS.get(suiteId) || suiteId).join(', ')}`,
678
+ );
679
+ lines.push(
680
+ `Scope: ${summary.local.overall_scope} (${summary.local.overall_suite_ids.join(', ')})`,
681
+ );
682
+ const comparableCaseCount = summary.local.cases.filter(testCase =>
683
+ summary.local.overall_suite_ids.includes(testCase.suite),
684
+ ).length;
685
+ lines.push(
686
+ `Cases: ${summary.local.cases.length} total; ${comparableCaseCount} in combined local chart`,
687
+ );
637
688
  for (const row of summary.local.overall) {
638
689
  lines.push(
639
- `${row.system}: ${row.scorePercent.toFixed(1)}% score, ${row.passRate.toFixed(1)}% pass rate, `
640
- + `${row.avgDurationMs.toFixed(1)} ms avg/case`
690
+ `${row.system}: ${row.scorePercent.toFixed(1)}% score, ${row.passRate.toFixed(1)}% pass rate, ` +
691
+ `${row.avgDurationMs.toFixed(1)} ms avg/case`,
641
692
  );
642
693
  }
643
694
  lines.push('');
@@ -667,7 +718,9 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons
667
718
  ? `${gate.strongestBaseline.system} by ${gate.marginOverBaseline.toFixed(1)} points`
668
719
  : 'all local baselines';
669
720
  lines.push('');
670
- lines.push(`Regression gate passed: Audrey stayed above ${gate.thresholds.minAudreyScore.toFixed(1)}% and ahead of ${baselineLabel}.`);
721
+ lines.push(
722
+ `Regression gate passed: Audrey stayed above ${gate.thresholds.minAudreyScore.toFixed(1)}% and ahead of ${baselineLabel}.`,
723
+ );
671
724
  }
672
725
 
673
726
  out(lines.join('\n'));
@@ -25,6 +25,23 @@
25
25
  "artifactRedactionSweep": { "$ref": "#/$defs/artifactRedactionSweep" }
26
26
  },
27
27
  "$defs": {
28
+ "jsonValue": {
29
+ "anyOf": [
30
+ { "type": "null" },
31
+ { "type": "string" },
32
+ { "type": "boolean" },
33
+ { "type": "number" },
34
+ {
35
+ "type": "array",
36
+ "items": { "$ref": "#/$defs/jsonValue" }
37
+ },
38
+ { "$ref": "#/$defs/jsonObject" }
39
+ ]
40
+ },
41
+ "jsonObject": {
42
+ "type": "object",
43
+ "additionalProperties": { "$ref": "#/$defs/jsonValue" }
44
+ },
28
45
  "provenance": {
29
46
  "type": "object",
30
47
  "additionalProperties": false,
@@ -129,11 +146,14 @@
129
146
  },
130
147
  "summary": { "type": "string", "minLength": 1 },
131
148
  "recallErrors": { "type": "array" },
149
+ "adapterExtensions": { "$ref": "#/$defs/jsonObject" },
132
150
  "leakedSecrets": {
133
151
  "type": "array",
134
152
  "items": { "type": "string" }
135
153
  },
136
- "requiredEvidenceMatched": { "type": "boolean" }
154
+ "requiredEvidenceMatched": { "type": "boolean" },
155
+ "hasEvidenceForDecision": { "type": "boolean" },
156
+ "lineageTextMatched": { "type": "boolean" }
137
157
  }
138
158
  },
139
159
  "artifactRedactionSweep": {
@@ -84,6 +84,23 @@
84
84
  "artifactRedactionSweep": { "$ref": "#/$defs/artifactRedactionSweep" }
85
85
  },
86
86
  "$defs": {
87
+ "jsonValue": {
88
+ "anyOf": [
89
+ { "type": "null" },
90
+ { "type": "string" },
91
+ { "type": "boolean" },
92
+ { "type": "number" },
93
+ {
94
+ "type": "array",
95
+ "items": { "$ref": "#/$defs/jsonValue" }
96
+ },
97
+ { "$ref": "#/$defs/jsonObject" }
98
+ ]
99
+ },
100
+ "jsonObject": {
101
+ "type": "object",
102
+ "additionalProperties": { "$ref": "#/$defs/jsonValue" }
103
+ },
87
104
  "latency": {
88
105
  "type": "object",
89
106
  "additionalProperties": false,
@@ -132,7 +149,8 @@
132
149
  "evidenceRecall": { "type": "number", "minimum": 0, "maximum": 1 },
133
150
  "redactionLeaks": { "type": "integer", "minimum": 0 },
134
151
  "recallDegradationDetectionRate": { "type": "number", "minimum": 0, "maximum": 1 },
135
- "latency": { "$ref": "#/$defs/latency" }
152
+ "latency": { "$ref": "#/$defs/latency" },
153
+ "lineageRichness": { "type": "number", "minimum": 0, "maximum": 1 }
136
154
  }
137
155
  },
138
156
  "resultRow": {
@@ -178,11 +196,14 @@
178
196
  },
179
197
  "summary": { "type": "string", "minLength": 1 },
180
198
  "recallErrors": { "type": "array" },
199
+ "adapterExtensions": { "$ref": "#/$defs/jsonObject" },
181
200
  "leakedSecrets": {
182
201
  "type": "array",
183
202
  "items": { "type": "string" }
184
203
  },
185
- "requiredEvidenceMatched": { "type": "boolean" }
204
+ "requiredEvidenceMatched": { "type": "boolean" },
205
+ "hasEvidenceForDecision": { "type": "boolean" },
206
+ "lineageTextMatched": { "type": "boolean" }
186
207
  }
187
208
  },
188
209
  "caseResult": {
@@ -42,9 +42,10 @@ export async function validateAdapterModuleFile(options = {}) {
42
42
  } else {
43
43
  try {
44
44
  const mod = await import(pathToFileURL(adapterPath).href);
45
- const candidate = typeof mod.createGuardBenchAdapter === 'function'
46
- ? await mod.createGuardBenchAdapter()
47
- : mod.default ?? mod.adapter;
45
+ const candidate =
46
+ typeof mod.createGuardBenchAdapter === 'function'
47
+ ? await mod.createGuardBenchAdapter()
48
+ : (mod.default ?? mod.adapter);
48
49
  adapter = validateGuardBenchAdapter(candidate, adapterPath);
49
50
  } catch (error) {
50
51
  failures.push(error.message);
@@ -57,12 +58,12 @@ export async function validateAdapterModuleFile(options = {}) {
57
58
  moduleFile: basename(adapterPath),
58
59
  adapter: adapter
59
60
  ? {
60
- name: adapter.name,
61
- description: adapter.description ?? null,
62
- hasSetup: typeof adapter.setup === 'function',
63
- hasDecide: typeof adapter.decide === 'function',
64
- hasCleanup: typeof adapter.cleanup === 'function',
65
- }
61
+ name: adapter.name,
62
+ description: adapter.description ?? null,
63
+ hasSetup: typeof adapter.setup === 'function',
64
+ hasDecide: typeof adapter.decide === 'function',
65
+ hasCleanup: typeof adapter.cleanup === 'function',
66
+ }
66
67
  : null,
67
68
  contract: {
68
69
  moduleFormat: 'ESM',
@@ -87,7 +88,9 @@ async function main() {
87
88
  } else if (validation.ok) {
88
89
  console.log(`GuardBench adapter module validation passed: ${validation.adapterPath}`);
89
90
  console.log(`Adapter: ${validation.adapter.name}`);
90
- console.log(`Methods: setup=${validation.adapter.hasSetup}, decide=${validation.adapter.hasDecide}, cleanup=${validation.adapter.hasCleanup}`);
91
+ console.log(
92
+ `Methods: setup=${validation.adapter.hasSetup}, decide=${validation.adapter.hasDecide}, cleanup=${validation.adapter.hasCleanup}`,
93
+ );
91
94
  } else {
92
95
  console.error('GuardBench adapter module validation failed:');
93
96
  for (const failure of validation.failures) console.error(`- ${failure}`);
@@ -69,11 +69,18 @@ export async function validateAdapterRegistry(options = {}) {
69
69
  failures.push(`Adapter ${adapter.id} has credentialMode=none but declares requiredEnv`);
70
70
  }
71
71
  if (adapter.credentialMode === 'runtime-env' && adapter.requiredEnv.length === 0) {
72
- failures.push(`Adapter ${adapter.id} has credentialMode=runtime-env but declares no requiredEnv`);
72
+ failures.push(
73
+ `Adapter ${adapter.id} has credentialMode=runtime-env but declares no requiredEnv`,
74
+ );
73
75
  }
74
76
  for (const [commandName, command] of Object.entries(adapter.commands ?? {})) {
75
- if ((commandName === 'moduleValidate' || commandName === 'selfTest') && !command.includes(adapter.path)) {
76
- failures.push(`Adapter ${adapter.id} command ${commandName} does not reference ${adapter.path}`);
77
+ if (
78
+ (commandName === 'moduleValidate' || commandName === 'selfTest') &&
79
+ !command.includes(adapter.path)
80
+ ) {
81
+ failures.push(
82
+ `Adapter ${adapter.id} command ${commandName} does not reference ${adapter.path}`,
83
+ );
77
84
  }
78
85
  }
79
86
  if (!existsSync(resolve(adapter.path))) {
@@ -89,10 +96,14 @@ export async function validateAdapterRegistry(options = {}) {
89
96
  failures: report.failures,
90
97
  });
91
98
  if (!report.ok) {
92
- failures.push(`Adapter ${adapter.id} failed module validation: ${report.failures.join('; ')}`);
99
+ failures.push(
100
+ `Adapter ${adapter.id} failed module validation: ${report.failures.join('; ')}`,
101
+ );
93
102
  }
94
103
  if (report.adapter?.name && report.adapter.name !== adapter.name) {
95
- failures.push(`Adapter ${adapter.id} registry name ${adapter.name} does not match module name ${report.adapter.name}`);
104
+ failures.push(
105
+ `Adapter ${adapter.id} registry name ${adapter.name} does not match module name ${report.adapter.name}`,
106
+ );
96
107
  }
97
108
  }
98
109
 
@@ -134,13 +134,25 @@ export function validateSchema(value, schema, label, root = schema) {
134
134
  if (currentSchema.minLength != null && String(current).length < currentSchema.minLength) {
135
135
  errors.push(`${path}: shorter than minLength ${currentSchema.minLength}`);
136
136
  }
137
- if (currentSchema.pattern && typeof current === 'string' && !(new RegExp(currentSchema.pattern).test(current))) {
137
+ if (
138
+ currentSchema.pattern &&
139
+ typeof current === 'string' &&
140
+ !new RegExp(currentSchema.pattern).test(current)
141
+ ) {
138
142
  errors.push(`${path}: does not match ${currentSchema.pattern}`);
139
143
  }
140
- if (currentSchema.minimum != null && typeof current === 'number' && current < currentSchema.minimum) {
144
+ if (
145
+ currentSchema.minimum != null &&
146
+ typeof current === 'number' &&
147
+ current < currentSchema.minimum
148
+ ) {
141
149
  errors.push(`${path}: below minimum ${currentSchema.minimum}`);
142
150
  }
143
- if (currentSchema.maximum != null && typeof current === 'number' && current > currentSchema.maximum) {
151
+ if (
152
+ currentSchema.maximum != null &&
153
+ typeof current === 'number' &&
154
+ current > currentSchema.maximum
155
+ ) {
144
156
  errors.push(`${path}: above maximum ${currentSchema.maximum}`);
145
157
  }
146
158
 
@@ -155,7 +167,8 @@ export function validateSchema(value, schema, label, root = schema) {
155
167
 
156
168
  if (currentSchema.type === 'object') {
157
169
  for (const required of currentSchema.required ?? []) {
158
- if (!Object.hasOwn(current, required)) errors.push(`${path}: missing required property ${required}`);
170
+ if (!Object.hasOwn(current, required))
171
+ errors.push(`${path}: missing required property ${required}`);
159
172
  }
160
173
  if (currentSchema.additionalProperties === false) {
161
174
  for (const key of Object.keys(current)) {
@@ -177,7 +190,10 @@ export function validateSchema(value, schema, label, root = schema) {
177
190
  function stableJson(value) {
178
191
  if (Array.isArray(value)) return `[${value.map(stableJson).join(',')}]`;
179
192
  if (value && typeof value === 'object') {
180
- return `{${Object.keys(value).sort().map(key => `${JSON.stringify(key)}:${stableJson(value[key])}`).join(',')}}`;
193
+ return `{${Object.keys(value)
194
+ .sort()
195
+ .map(key => `${JSON.stringify(key)}:${stableJson(value[key])}`)
196
+ .join(',')}}`;
181
197
  }
182
198
  return JSON.stringify(value);
183
199
  }
@@ -231,7 +247,11 @@ export function validateGuardBenchArtifacts(options = {}) {
231
247
  failures.push(error.message);
232
248
  continue;
233
249
  }
234
- for (const error of validateSchema(optionalArtifacts[key], schemas[key], `guardbench-${key}`)) {
250
+ for (const error of validateSchema(
251
+ optionalArtifacts[key],
252
+ schemas[key],
253
+ `guardbench-${key}`,
254
+ )) {
235
255
  failures.push(`${basename(path)}: ${error}`);
236
256
  }
237
257
  }
@@ -243,7 +263,9 @@ export function validateGuardBenchArtifacts(options = {}) {
243
263
  if (!Object.hasOwn(currentHashes, file)) {
244
264
  failures.push(`external-run-metadata.json: artifactHashes includes unknown file ${file}`);
245
265
  } else if (currentHashes[file] !== expectedHash) {
246
- failures.push(`external-run-metadata.json: artifactHashes.${file} does not match current artifact`);
266
+ failures.push(
267
+ `external-run-metadata.json: artifactHashes.${file} does not match current artifact`,
268
+ );
247
269
  }
248
270
  }
249
271
  for (const file of Object.values(ARTIFACT_FILES)) {
@@ -255,27 +277,58 @@ export function validateGuardBenchArtifacts(options = {}) {
255
277
  const conformanceCard = optionalArtifacts.conformanceCard;
256
278
  if (conformanceCard) {
257
279
  const currentHashes = computeGuardBenchArtifactHashes(dir);
258
- for (const [file, expectedHash] of Object.entries(conformanceCard.integrity?.artifactHashes ?? {})) {
280
+ for (const [file, expectedHash] of Object.entries(
281
+ conformanceCard.integrity?.artifactHashes ?? {},
282
+ )) {
259
283
  if (!Object.hasOwn(currentHashes, file)) {
260
- failures.push(`guardbench-conformance-card.json: integrity.artifactHashes includes unknown file ${file}`);
284
+ failures.push(
285
+ `guardbench-conformance-card.json: integrity.artifactHashes includes unknown file ${file}`,
286
+ );
261
287
  } else if (currentHashes[file] !== expectedHash) {
262
- failures.push(`guardbench-conformance-card.json: integrity.artifactHashes.${file} does not match current artifact`);
288
+ failures.push(
289
+ `guardbench-conformance-card.json: integrity.artifactHashes.${file} does not match current artifact`,
290
+ );
263
291
  }
264
292
  }
265
293
  if (conformanceCard.manifestVersion !== artifacts.manifest.manifestVersion) {
266
- failures.push('guardbench-conformance-card.json: manifestVersion does not match guardbench-manifest.json');
294
+ failures.push(
295
+ 'guardbench-conformance-card.json: manifestVersion does not match guardbench-manifest.json',
296
+ );
267
297
  }
268
298
  if (conformanceCard.suiteId !== artifacts.manifest.suiteId) {
269
- failures.push('guardbench-conformance-card.json: suiteId does not match guardbench-manifest.json');
299
+ failures.push(
300
+ 'guardbench-conformance-card.json: suiteId does not match guardbench-manifest.json',
301
+ );
270
302
  }
271
- if (!artifacts.summary.systemSummaries?.some(row => row.system === conformanceCard.subject?.name)) {
272
- failures.push('guardbench-conformance-card.json: subject.name is not present in guardbench-summary.json');
303
+ if (
304
+ !artifacts.summary.systemSummaries?.some(
305
+ row => row.system === conformanceCard.subject?.name,
306
+ )
307
+ ) {
308
+ failures.push(
309
+ 'guardbench-conformance-card.json: subject.name is not present in guardbench-summary.json',
310
+ );
273
311
  }
274
312
  }
275
313
 
276
- assertSameJson(artifacts.summary.manifest, artifacts.manifest, 'summary.manifest vs guardbench-manifest.json', failures);
277
- assertSameJson(artifacts.summary.cases, artifacts.raw.cases, 'summary.cases vs raw.cases', failures);
278
- assertSameJson(artifacts.summary.provenance, artifacts.raw.provenance, 'summary.provenance vs raw.provenance', failures);
314
+ assertSameJson(
315
+ artifacts.summary.manifest,
316
+ artifacts.manifest,
317
+ 'summary.manifest vs guardbench-manifest.json',
318
+ failures,
319
+ );
320
+ assertSameJson(
321
+ artifacts.summary.cases,
322
+ artifacts.raw.cases,
323
+ 'summary.cases vs raw.cases',
324
+ failures,
325
+ );
326
+ assertSameJson(
327
+ artifacts.summary.provenance,
328
+ artifacts.raw.provenance,
329
+ 'summary.provenance vs raw.provenance',
330
+ failures,
331
+ );
279
332
  if (artifacts.summary.generatedAt !== artifacts.raw.generatedAt) {
280
333
  failures.push('summary.generatedAt vs raw.generatedAt: cross-artifact mismatch');
281
334
  }
@@ -290,7 +343,9 @@ export function validateGuardBenchArtifacts(options = {}) {
290
343
  failures.push('guardbench-raw.json: artifactRedactionSweep did not pass');
291
344
  }
292
345
 
293
- const artifactText = Object.values(artifacts).map(value => JSON.stringify(value)).join('\n');
346
+ const artifactText = Object.values(artifacts)
347
+ .map(value => JSON.stringify(value))
348
+ .join('\n');
294
349
  for (const secret of seededSecrets) {
295
350
  if (secret && artifactText.includes(secret)) {
296
351
  failures.push(`raw seeded secret leaked into GuardBench artifacts: ${secret}`);
@@ -310,7 +365,9 @@ export function validateGuardBenchArtifacts(options = {}) {
310
365
  dir: publicPath(dir),
311
366
  schemasDir: publicPath(schemasDir),
312
367
  files: Object.values(ARTIFACT_FILES),
313
- optionalFiles: Object.values(OPTIONAL_ARTIFACT_FILES).filter(file => existsSync(join(dir, file))),
368
+ optionalFiles: Object.values(OPTIONAL_ARTIFACT_FILES).filter(file =>
369
+ existsSync(join(dir, file)),
370
+ ),
314
371
  failures,
315
372
  };
316
373
  }