audrey 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. package/CHANGELOG.md +54 -0
  2. package/README.md +30 -6
  3. package/benchmarks/adapter-self-test.mjs +6 -2
  4. package/benchmarks/adapters/example-allow.mjs +5 -2
  5. package/benchmarks/adapters/mem0-platform.mjs +19 -12
  6. package/benchmarks/adapters/zep-cloud.mjs +51 -27
  7. package/benchmarks/baselines.js +11 -6
  8. package/benchmarks/build-leaderboard.mjs +36 -23
  9. package/benchmarks/cases.js +24 -12
  10. package/benchmarks/create-conformance-card.mjs +12 -3
  11. package/benchmarks/create-submission-bundle.mjs +22 -8
  12. package/benchmarks/dry-run-external-adapters.mjs +24 -12
  13. package/benchmarks/guardbench.js +354 -124
  14. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
  15. package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  16. package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  17. package/benchmarks/output/guardbench-conformance-card.json +12 -12
  18. package/benchmarks/output/guardbench-raw.json +243 -144
  19. package/benchmarks/output/guardbench-summary.json +354 -230
  20. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  21. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  22. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +12 -12
  23. package/benchmarks/output/submission-bundle/guardbench-raw.json +243 -144
  24. package/benchmarks/output/submission-bundle/guardbench-summary.json +354 -230
  25. package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +21 -1
  26. package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +23 -2
  27. package/benchmarks/output/submission-bundle/submission-manifest.json +15 -15
  28. package/benchmarks/output/submission-bundle/validation-report.json +1 -1
  29. package/benchmarks/output/summary.json +58 -58
  30. package/benchmarks/perf-snapshot.js +12 -9
  31. package/benchmarks/perf.bench.js +14 -6
  32. package/benchmarks/public-paths.mjs +11 -5
  33. package/benchmarks/reference-results.js +10 -5
  34. package/benchmarks/report.js +48 -27
  35. package/benchmarks/run-external-guardbench.mjs +47 -25
  36. package/benchmarks/run.js +112 -59
  37. package/benchmarks/schemas/guardbench-raw.schema.json +21 -1
  38. package/benchmarks/schemas/guardbench-summary.schema.json +23 -2
  39. package/benchmarks/validate-adapter-module.mjs +13 -10
  40. package/benchmarks/validate-adapter-registry.mjs +16 -5
  41. package/benchmarks/validate-guardbench-artifacts.mjs +76 -19
  42. package/benchmarks/verify-external-evidence.mjs +86 -31
  43. package/benchmarks/verify-publication-artifacts.mjs +34 -11
  44. package/benchmarks/verify-submission-bundle.mjs +9 -4
  45. package/dist/mcp-server/config.d.ts +1 -1
  46. package/dist/mcp-server/config.d.ts.map +1 -1
  47. package/dist/mcp-server/config.js +5 -3
  48. package/dist/mcp-server/config.js.map +1 -1
  49. package/dist/mcp-server/index.d.ts +4 -3
  50. package/dist/mcp-server/index.d.ts.map +1 -1
  51. package/dist/mcp-server/index.js +479 -172
  52. package/dist/mcp-server/index.js.map +1 -1
  53. package/dist/src/action-key.d.ts.map +1 -1
  54. package/dist/src/action-key.js +6 -2
  55. package/dist/src/action-key.js.map +1 -1
  56. package/dist/src/adaptive.d.ts.map +1 -1
  57. package/dist/src/adaptive.js +4 -2
  58. package/dist/src/adaptive.js.map +1 -1
  59. package/dist/src/affect.d.ts.map +1 -1
  60. package/dist/src/affect.js +8 -5
  61. package/dist/src/affect.js.map +1 -1
  62. package/dist/src/audrey.d.ts +11 -1
  63. package/dist/src/audrey.d.ts.map +1 -1
  64. package/dist/src/audrey.js +110 -53
  65. package/dist/src/audrey.js.map +1 -1
  66. package/dist/src/capsule.d.ts.map +1 -1
  67. package/dist/src/capsule.js +37 -15
  68. package/dist/src/capsule.js.map +1 -1
  69. package/dist/src/causal.d.ts +1 -1
  70. package/dist/src/causal.d.ts.map +1 -1
  71. package/dist/src/causal.js +4 -2
  72. package/dist/src/causal.js.map +1 -1
  73. package/dist/src/confidence.d.ts.map +1 -1
  74. package/dist/src/confidence.js +5 -5
  75. package/dist/src/confidence.js.map +1 -1
  76. package/dist/src/consolidate.d.ts.map +1 -1
  77. package/dist/src/consolidate.js +17 -9
  78. package/dist/src/consolidate.js.map +1 -1
  79. package/dist/src/context.js +1 -1
  80. package/dist/src/context.js.map +1 -1
  81. package/dist/src/controller.d.ts +17 -1
  82. package/dist/src/controller.d.ts.map +1 -1
  83. package/dist/src/controller.js +73 -23
  84. package/dist/src/controller.js.map +1 -1
  85. package/dist/src/db.d.ts.map +1 -1
  86. package/dist/src/db.js +78 -27
  87. package/dist/src/db.js.map +1 -1
  88. package/dist/src/decay.d.ts +1 -1
  89. package/dist/src/decay.d.ts.map +1 -1
  90. package/dist/src/decay.js +1 -1
  91. package/dist/src/decay.js.map +1 -1
  92. package/dist/src/embedding.d.ts +12 -4
  93. package/dist/src/embedding.d.ts.map +1 -1
  94. package/dist/src/embedding.js +18 -16
  95. package/dist/src/embedding.js.map +1 -1
  96. package/dist/src/encode.d.ts.map +1 -1
  97. package/dist/src/encode.js +5 -4
  98. package/dist/src/encode.js.map +1 -1
  99. package/dist/src/events.d.ts +3 -2
  100. package/dist/src/events.d.ts.map +1 -1
  101. package/dist/src/events.js +7 -3
  102. package/dist/src/events.js.map +1 -1
  103. package/dist/src/export.d.ts.map +1 -1
  104. package/dist/src/export.js +21 -7
  105. package/dist/src/export.js.map +1 -1
  106. package/dist/src/feedback.d.ts.map +1 -1
  107. package/dist/src/feedback.js +1 -1
  108. package/dist/src/feedback.js.map +1 -1
  109. package/dist/src/forget.d.ts.map +1 -1
  110. package/dist/src/forget.js +12 -6
  111. package/dist/src/forget.js.map +1 -1
  112. package/dist/src/fts.d.ts.map +1 -1
  113. package/dist/src/fts.js +20 -8
  114. package/dist/src/fts.js.map +1 -1
  115. package/dist/src/hybrid-recall.d.ts.map +1 -1
  116. package/dist/src/hybrid-recall.js +12 -6
  117. package/dist/src/hybrid-recall.js.map +1 -1
  118. package/dist/src/impact.d.ts.map +1 -1
  119. package/dist/src/impact.js +26 -10
  120. package/dist/src/impact.js.map +1 -1
  121. package/dist/src/import.d.ts.map +1 -1
  122. package/dist/src/import.js +11 -6
  123. package/dist/src/import.js.map +1 -1
  124. package/dist/src/index.d.ts +5 -4
  125. package/dist/src/index.d.ts.map +1 -1
  126. package/dist/src/index.js +4 -4
  127. package/dist/src/index.js.map +1 -1
  128. package/dist/src/interference.d.ts.map +1 -1
  129. package/dist/src/interference.js +10 -5
  130. package/dist/src/interference.js.map +1 -1
  131. package/dist/src/introspect.d.ts.map +1 -1
  132. package/dist/src/introspect.js +12 -6
  133. package/dist/src/introspect.js.map +1 -1
  134. package/dist/src/llm.d.ts +2 -2
  135. package/dist/src/llm.d.ts.map +1 -1
  136. package/dist/src/llm.js +6 -6
  137. package/dist/src/llm.js.map +1 -1
  138. package/dist/src/migrate.d.ts.map +1 -1
  139. package/dist/src/migrate.js +10 -4
  140. package/dist/src/migrate.js.map +1 -1
  141. package/dist/src/preflight.d.ts.map +1 -1
  142. package/dist/src/preflight.js +6 -8
  143. package/dist/src/preflight.js.map +1 -1
  144. package/dist/src/profile.d.ts.map +1 -1
  145. package/dist/src/profile.js.map +1 -1
  146. package/dist/src/promote.d.ts.map +1 -1
  147. package/dist/src/promote.js +16 -7
  148. package/dist/src/promote.js.map +1 -1
  149. package/dist/src/prompts.d.ts.map +1 -1
  150. package/dist/src/prompts.js +1 -2
  151. package/dist/src/prompts.js.map +1 -1
  152. package/dist/src/recall.d.ts.map +1 -1
  153. package/dist/src/recall.js +85 -18
  154. package/dist/src/recall.js.map +1 -1
  155. package/dist/src/redact.d.ts.map +1 -1
  156. package/dist/src/redact.js +9 -4
  157. package/dist/src/redact.js.map +1 -1
  158. package/dist/src/reflexes.d.ts.map +1 -1
  159. package/dist/src/reflexes.js +1 -7
  160. package/dist/src/reflexes.js.map +1 -1
  161. package/dist/src/rollback.d.ts.map +1 -1
  162. package/dist/src/rollback.js +4 -2
  163. package/dist/src/rollback.js.map +1 -1
  164. package/dist/src/routes.d.ts.map +1 -1
  165. package/dist/src/routes.js +37 -14
  166. package/dist/src/routes.js.map +1 -1
  167. package/dist/src/rules-compiler.d.ts.map +1 -1
  168. package/dist/src/rules-compiler.js +24 -2
  169. package/dist/src/rules-compiler.js.map +1 -1
  170. package/dist/src/server.js +2 -2
  171. package/dist/src/server.js.map +1 -1
  172. package/dist/src/tool-trace.d.ts +2 -2
  173. package/dist/src/tool-trace.d.ts.map +1 -1
  174. package/dist/src/tool-trace.js +12 -4
  175. package/dist/src/tool-trace.js.map +1 -1
  176. package/dist/src/types.d.ts.map +1 -1
  177. package/dist/src/ulid.js +1 -1
  178. package/dist/src/ulid.js.map +1 -1
  179. package/dist/src/utils.d.ts.map +1 -1
  180. package/dist/src/utils.js.map +1 -1
  181. package/dist/src/validate.d.ts.map +1 -1
  182. package/dist/src/validate.js +20 -10
  183. package/dist/src/validate.js.map +1 -1
  184. package/docs/paper/07-evaluation.md +5 -5
  185. package/docs/paper/audrey-paper-v1.md +6 -6
  186. package/docs/paper/evidence-ledger.md +1 -1
  187. package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  188. package/docs/paper/output/arxiv/main.tex +6 -6
  189. package/docs/paper/output/arxiv-compile-report.json +3 -3
  190. package/docs/paper/output/submission-bundle/README.md +30 -6
  191. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
  192. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  193. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  194. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +12 -12
  195. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +243 -144
  196. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +354 -230
  197. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  198. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  199. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +15 -15
  200. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
  201. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +52 -52
  202. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +21 -1
  203. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +23 -2
  204. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +5 -5
  205. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +6 -6
  206. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
  207. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  208. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +6 -6
  209. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
  210. package/docs/paper/output/submission-bundle/package.json +18 -5
  211. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +40 -40
  212. package/examples/fintech-ops-demo.js +12 -5
  213. package/examples/healthcare-ops-demo.js +8 -4
  214. package/examples/ollama-memory-agent.js +41 -13
  215. package/examples/stripe-demo.js +12 -5
  216. package/package.json +18 -5
  217. package/scripts/audit-release-completion.mjs +179 -101
  218. package/scripts/create-arxiv-source.mjs +20 -14
  219. package/scripts/create-paper-submission-bundle.mjs +6 -2
  220. package/scripts/finalize-release.mjs +111 -36
  221. package/scripts/prepare-release-cut.mjs +14 -6
  222. package/scripts/publish-release-bundle.mjs +62 -23
  223. package/scripts/publish-release-github-api.mjs +89 -24
  224. package/scripts/smoke-cli.js +26 -6
  225. package/scripts/sync-paper-artifacts.mjs +5 -1
  226. package/scripts/verify-arxiv-compile.mjs +52 -16
  227. package/scripts/verify-arxiv-source.mjs +45 -15
  228. package/scripts/verify-browser-launch-plan.mjs +28 -11
  229. package/scripts/verify-browser-launch-results.mjs +32 -14
  230. package/scripts/verify-paper-artifacts.mjs +539 -79
  231. package/scripts/verify-paper-claims.mjs +48 -20
  232. package/scripts/verify-paper-submission-bundle.mjs +22 -11
  233. package/scripts/verify-publication-pack.mjs +23 -9
  234. package/scripts/verify-release-readiness.mjs +250 -71
@@ -2,18 +2,21 @@ import { generateId } from './ulid.js';
2
2
  import { safeJsonParse } from './utils.js';
3
3
  import { buildContradictionDetectionPrompt } from './prompts.js';
4
4
  const REINFORCEMENT_THRESHOLD = 0.85;
5
- const CONTRADICTION_THRESHOLD = 0.60;
5
+ const CONTRADICTION_THRESHOLD = 0.6;
6
6
  export async function validateMemory(db, embeddingProvider, episode, options = {}) {
7
7
  const { threshold = REINFORCEMENT_THRESHOLD, contradictionThreshold = CONTRADICTION_THRESHOLD, llmProvider, embeddingVector, embeddingBuffer, } = options;
8
- const episodeBuffer = embeddingBuffer ?? embeddingProvider.vectorToBuffer(embeddingVector ?? await embeddingProvider.embed(episode.content));
9
- const nearestSemantic = db.prepare(`
8
+ const episodeBuffer = embeddingBuffer ??
9
+ embeddingProvider.vectorToBuffer(embeddingVector ?? (await embeddingProvider.embed(episode.content)));
10
+ const nearestSemantic = db
11
+ .prepare(`
10
12
  SELECT s.*, (1.0 - v.distance) AS similarity
11
13
  FROM vec_semantics v
12
14
  JOIN semantics s ON s.id = v.id
13
15
  WHERE v.embedding MATCH ?
14
16
  AND k = 1
15
17
  AND (v.state = 'active' OR v.state = 'context_dependent')
16
- `).get(episodeBuffer);
18
+ `)
19
+ .get(episodeBuffer);
17
20
  let bestMatch = null;
18
21
  let bestSimilarity = 0;
19
22
  if (nearestSemantic) {
@@ -24,7 +27,9 @@ export async function validateMemory(db, embeddingProvider, episode, options = {
24
27
  const matchId = bestMatch.id;
25
28
  const reinforce = db.transaction(() => {
26
29
  // Re-read evidence inside the transaction to avoid lost updates under concurrency.
27
- const current = db.prepare('SELECT evidence_episode_ids FROM semantics WHERE id = ?').get(matchId);
30
+ const current = db
31
+ .prepare('SELECT evidence_episode_ids FROM semantics WHERE id = ?')
32
+ .get(matchId);
28
33
  const existing = safeJsonParse(current?.evidence_episode_ids ?? null, []);
29
34
  const wasAdded = !existing.includes(episode.id);
30
35
  if (wasAdded) {
@@ -67,7 +72,7 @@ export async function validateMemory(db, embeddingProvider, episode, options = {
67
72
  conditions: candidate.conditions &&
68
73
  typeof candidate.conditions === 'object' &&
69
74
  !Array.isArray(candidate.conditions) &&
70
- Object.values(candidate.conditions).every((v) => typeof v === 'string')
75
+ Object.values(candidate.conditions).every(v => typeof v === 'string')
71
76
  ? candidate.conditions
72
77
  : undefined,
73
78
  explanation: typeof candidate.explanation === 'string' ? candidate.explanation : undefined,
@@ -75,7 +80,11 @@ export async function validateMemory(db, embeddingProvider, episode, options = {
75
80
  if (verdict.contradicts) {
76
81
  const matchId = bestMatch.id;
77
82
  const resolution = verdict.resolution === 'context_dependent'
78
- ? { type: 'context_dependent', conditions: verdict.conditions, explanation: verdict.explanation }
83
+ ? {
84
+ type: 'context_dependent',
85
+ conditions: verdict.conditions,
86
+ explanation: verdict.explanation,
87
+ }
79
88
  : verdict.resolution
80
89
  ? { type: verdict.resolution, explanation: verdict.explanation }
81
90
  : null;
@@ -86,8 +95,7 @@ export async function validateMemory(db, embeddingProvider, episode, options = {
86
95
  db.prepare("UPDATE semantics SET state = 'disputed' WHERE id = ?").run(matchId);
87
96
  }
88
97
  else if (verdict.resolution === 'context_dependent' && verdict.conditions) {
89
- db.prepare("UPDATE semantics SET state = 'context_dependent', conditions = ? WHERE id = ?")
90
- .run(JSON.stringify(verdict.conditions), matchId);
98
+ db.prepare("UPDATE semantics SET state = 'context_dependent', conditions = ? WHERE id = ?").run(JSON.stringify(verdict.conditions), matchId);
91
99
  }
92
100
  });
93
101
  recordContradiction();
@@ -107,7 +115,9 @@ function computeSourceDiversity(db, evidenceIds, currentEpisode) {
107
115
  sourceTypes.add(currentEpisode.source);
108
116
  if (evidenceIds.length > 0) {
109
117
  const placeholders = evidenceIds.map(() => '?').join(',');
110
- const rows = db.prepare(`SELECT DISTINCT source FROM episodes WHERE id IN (${placeholders})`).all(...evidenceIds);
118
+ const rows = db
119
+ .prepare(`SELECT DISTINCT source FROM episodes WHERE id IN (${placeholders})`)
120
+ .all(...evidenceIds);
111
121
  for (const row of rows) {
112
122
  sourceTypes.add(row.source);
113
123
  }
@@ -1 +1 @@
1
- {"version":3,"file":"validate.js","sourceRoot":"","sources":["../../src/validate.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AACvC,OAAO,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAC3C,OAAO,EAAE,iCAAiC,EAAE,MAAM,cAAc,CAAC;AAEjE,MAAM,uBAAuB,GAAG,IAAI,CAAC;AACrC,MAAM,uBAAuB,GAAG,IAAI,CAAC;AAkBrC,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,EAAqB,EACrB,iBAAoC,EACpC,OAAwD,EACxD,UAMI,EAAE;IAEN,MAAM,EACJ,SAAS,GAAG,uBAAuB,EACnC,sBAAsB,GAAG,uBAAuB,EAChD,WAAW,EACX,eAAe,EACf,eAAe,GAChB,GAAG,OAAO,CAAC;IAEZ,MAAM,aAAa,GAAG,eAAe,IAAI,iBAAiB,CAAC,cAAc,CACvE,eAAe,IAAI,MAAM,iBAAiB,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAClE,CAAC;IAEF,MAAM,eAAe,GAAG,EAAE,CAAC,OAAO,CAAC;;;;;;;GAOlC,CAAC,CAAC,GAAG,CAAC,aAAa,CAAuC,CAAC;IAE5D,IAAI,SAAS,GAAkC,IAAI,CAAC;IACpD,IAAI,cAAc,GAAG,CAAC,CAAC;IAEvB,IAAI,eAAe,EAAE,CAAC;QACpB,SAAS,GAAG,eAAe,CAAC;QAC5B,cAAc,GAAG,eAAe,CAAC,UAAU,CAAC;IAC9C,CAAC;IAED,IAAI,SAAS,IAAI,cAAc,IAAI,SAAS,EAAE,CAAC;QAC7C,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,CAAC;QAC7B,MAAM,SAAS,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,EAAE;YACpC,mFAAmF;YACnF,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,CACxB,yDAAyD,CAC1D,CAAC,GAAG,CAAC,OAAO,CAAwD,CAAC;YACtE,MAAM,QAAQ,GAAG,aAAa,CAC5B,OAAO,EAAE,oBAAoB,IAAI,IAAI,EACrC,EAAE,CACH,CAAC;YACF,MAAM,QAAQ,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAChD,IAAI,QAAQ,EAAE,CAAC;gBACb,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAC5B,CAAC;YACD,MAAM,SAAS,GAAG,sBAAsB,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YACrC,yEAAyE;YACzE,qEAAqE;YACrE,EAAE,CAAC,OAAO,CAAC;;;;;;;;OAQV,CAAC,CAAC,GAAG,CACJ,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAChB,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,EACxB,QAAQ,CAAC,MAAM,EACf,SAAS,EACT,GAAG,EACH,OAAO,CACR,CAAC;QACJ,CAAC,CAAC,CAAC;QACH,SAAS,EAAE,CAAC;QAEZ,OAAO;YACL,MAAM,EAAE,YAAY;YACpB,UAAU,EAAE,OAAO;YACnB,UAAU,EAAE,cAAc;SAC3B,CAAC;IACJ,CAAC;IAED,IAAI,SAAS,IAAI,cAAc,IAAI,sBAAsB,IAAI,WAAW,EAAE,CAAC;QACzE,MAAM,QAAQ,GAAG,iCAAiC,CAAC,OAAO,CAAC,OAAO,EAAE,SAAS,CAAC,OAAO,CAAC,CAAC;QACvF,MAAM,GAAG,GAAG,MAAM,WAAW,CAAC,IAAI,CA
AC,QAAQ,CAAC,CAAC;QAC7C,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,IAAI,KAAK,CAAC,kDAAkD,CAAC,CAAC;QACtE,CAAC;QACD,MAAM,SAAS,GAAG,GAA8B,CAAC;QACjD,IAAI,OAAO,SAAS,CAAC,WAAW,KAAK,SAAS,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,gEAAgE,CAAC,CAAC;QACpF,CAAC;QACD,MAAM,OAAO,GAKT;YACF,WAAW,EAAE,SAAS,CAAC,WAAW;YAClC,UAAU,EAAE,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;YACvF,UAAU,EACR,SAAS,CAAC,UAAU;gBACpB,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ;gBACxC,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC;gBACpC,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC;gBACrE,CAAC,CAAE,SAAS,CAAC,UAAqC;gBAClD,CAAC,CAAC,SAAS;YACf,WAAW,EAAE,OAAO,SAAS,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;SAC3F,CAAC;QAEF,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,CAAC;YAC7B,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,KAAK,mBAAmB;gBAC3D,CAAC,CAAC,EAAE,IAAI,EAAE,mBAAmB,EAAE,UAAU,EAAE,OAAO,CAAC,UAAU,EAAE,WAAW,EAAE,OAAO,CAAC,WAAW,EAAE;gBACjG,CAAC,CAAC,OAAO,CAAC,UAAU;oBAClB,CAAC,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,UAAU,EAAE,WAAW,EAAE,OAAO,CAAC,WAAW,EAAE;oBAChE,CAAC,CAAC,IAAI,CAAC;YAEX,IAAI,eAAe,GAAG,EAAE,CAAC;YACzB,MAAM,mBAAmB,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,EAAE;gBAC9C,eAAe,GAAG,mBAAmB,CACnC,EAAE,EACF,OAAO,EACP,UAAU,EACV,OAAO,CAAC,EAAE,EACV,UAAU,EACV,UAAU,CACX,CAAC;gBACF,IAAI,OAAO,CAAC,UAAU,KAAK,UAAU,EAAE,CAAC;oBACtC,EAAE,CAAC,OAAO,CAAC,sDAAsD,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;gBAClF,CAAC;qBAAM,IAAI,OAAO,CAAC,UAAU,KAAK,mBAAmB,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;oBAC5E,EAAE,CAAC,OAAO,CAAC,+EAA+E,CAAC;yBACxF,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,OAAO,CAAC,CAAC;gBACtD,CAAC;YACH,CAAC,CAAC,CAAC;YACH,mBAAmB,EAAE,CAAC;YAEtB,OAAO;gBACL,MAAM,EAAE,eAAe;gBACvB,eAAe;gBACf,UAAU,EAAE,OAAO;gBACnB,UAAU,EAAE,cAAc;gBAC1B,UAAU,EAAE,OAAO,CAAC,UAAU,IAAI,IAAI;aACvC,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC;AAC5B,CAAC;AAED,SAAS,sBAAsB,CAC7B,EAAqB,EACrB,WAAqB,EACrB,cAAkC;IAElC,MAAM,
WAAW,GAAG,IAAI,GAAG,EAAU,CAAC;IACtC,WAAW,CAAC,GAAG,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IAEvC,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,MAAM,YAAY,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC1D,MAAM,IAAI,GAAG,EAAE,CAAC,OAAO,CACrB,qDAAqD,YAAY,GAAG,CACrE,CAAC,GAAG,CAAC,GAAG,WAAW,CAAgB,CAAC;QACrC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC,IAAI,CAAC;AAC1B,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,EAAqB,EACrB,QAAgB,EAChB,UAAkB,EAClB,QAAgB,EAChB,UAAkB,EAClB,UAAyB;IAEzB,MAAM,EAAE,GAAG,UAAU,EAAE,CAAC;IACxB,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAErC,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,CAAC;IAC/C,MAAM,UAAU,GAAG,UAAU,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC;IAC3C,MAAM,cAAc,GAAG,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAEtE,EAAE,CAAC,OAAO,CAAC;;;;GAIV,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,EAAE,UAAU,EAAE,KAAK,EAAE,cAAc,EAAE,UAAU,EAAE,GAAG,CAAC,CAAC;IAE/F,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,EAAqB,EAAE,eAAuB,EAAE,aAAqB;IACvG,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACrC,EAAE,CAAC,OAAO,CAAC;;;;;;GAMV,CAAC,CAAC,GAAG,CAAC,aAAa,EAAE,GAAG,EAAE,eAAe,CAAC,CAAC;AAC9C,CAAC"}
1
+ {"version":3,"file":"validate.js","sourceRoot":"","sources":["../../src/validate.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AACvC,OAAO,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAC3C,OAAO,EAAE,iCAAiC,EAAE,MAAM,cAAc,CAAC;AAEjE,MAAM,uBAAuB,GAAG,IAAI,CAAC;AACrC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAkBpC,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,EAAqB,EACrB,iBAAoC,EACpC,OAAwD,EACxD,UAMI,EAAE;IAEN,MAAM,EACJ,SAAS,GAAG,uBAAuB,EACnC,sBAAsB,GAAG,uBAAuB,EAChD,WAAW,EACX,eAAe,EACf,eAAe,GAChB,GAAG,OAAO,CAAC;IAEZ,MAAM,aAAa,GACjB,eAAe;QACf,iBAAiB,CAAC,cAAc,CAC9B,eAAe,IAAI,CAAC,MAAM,iBAAiB,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CACpE,CAAC;IAEJ,MAAM,eAAe,GAAG,EAAE;SACvB,OAAO,CACN;;;;;;;GAOH,CACE;SACA,GAAG,CAAC,aAAa,CAAuC,CAAC;IAE5D,IAAI,SAAS,GAAkC,IAAI,CAAC;IACpD,IAAI,cAAc,GAAG,CAAC,CAAC;IAEvB,IAAI,eAAe,EAAE,CAAC;QACpB,SAAS,GAAG,eAAe,CAAC;QAC5B,cAAc,GAAG,eAAe,CAAC,UAAU,CAAC;IAC9C,CAAC;IAED,IAAI,SAAS,IAAI,cAAc,IAAI,SAAS,EAAE,CAAC;QAC7C,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,CAAC;QAC7B,MAAM,SAAS,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,EAAE;YACpC,mFAAmF;YACnF,MAAM,OAAO,GAAG,EAAE;iBACf,OAAO,CAAC,yDAAyD,CAAC;iBAClE,GAAG,CAAC,OAAO,CAAwD,CAAC;YACvE,MAAM,QAAQ,GAAG,aAAa,CAAW,OAAO,EAAE,oBAAoB,IAAI,IAAI,EAAE,EAAE,CAAC,CAAC;YACpF,MAAM,QAAQ,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAChD,IAAI,QAAQ,EAAE,CAAC;gBACb,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAC5B,CAAC;YACD,MAAM,SAAS,GAAG,sBAAsB,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YACrC,yEAAyE;YACzE,qEAAqE;YACrE,EAAE,CAAC,OAAO,CACR;;;;;;;;OAQD,CACA,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,EAAE,QAAQ,CAAC,MAAM,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;QAC9F,CAAC,CAAC,CAAC;QACH,SAAS,EAAE,CAAC;QAEZ,OAAO;YACL,MAAM,EAAE,YAAY;YACpB,UAAU,EAAE,OAAO;YACnB,UAAU,EAAE,cAAc;SAC3B,CAAC;IACJ,CAAC;IAED,IAAI,SAAS,IAAI,cAAc,IAAI,sBAAsB,IAAI,WAAW,EAAE,CAAC;QACzE,MAAM,QAAQ,GAAG,iCAAiC,CAAC,OAAO,CAAC,OAAO,EAAE,SAAS,CAAC,OAAO,CAAC,CAAC;QACvF,MAAM,GAAG,GAAG,MAAM,WAAW,CA
AC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC7C,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,IAAI,KAAK,CAAC,kDAAkD,CAAC,CAAC;QACtE,CAAC;QACD,MAAM,SAAS,GAAG,GAA8B,CAAC;QACjD,IAAI,OAAO,SAAS,CAAC,WAAW,KAAK,SAAS,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,gEAAgE,CAAC,CAAC;QACpF,CAAC;QACD,MAAM,OAAO,GAKT;YACF,WAAW,EAAE,SAAS,CAAC,WAAW;YAClC,UAAU,EAAE,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;YACvF,UAAU,EACR,SAAS,CAAC,UAAU;gBACpB,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ;gBACxC,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC;gBACpC,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC;gBACnE,CAAC,CAAE,SAAS,CAAC,UAAqC;gBAClD,CAAC,CAAC,SAAS;YACf,WAAW,EAAE,OAAO,SAAS,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;SAC3F,CAAC;QAEF,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,CAAC;YAC7B,MAAM,UAAU,GACd,OAAO,CAAC,UAAU,KAAK,mBAAmB;gBACxC,CAAC,CAAC;oBACE,IAAI,EAAE,mBAAmB;oBACzB,UAAU,EAAE,OAAO,CAAC,UAAU;oBAC9B,WAAW,EAAE,OAAO,CAAC,WAAW;iBACjC;gBACH,CAAC,CAAC,OAAO,CAAC,UAAU;oBAClB,CAAC,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,UAAU,EAAE,WAAW,EAAE,OAAO,CAAC,WAAW,EAAE;oBAChE,CAAC,CAAC,IAAI,CAAC;YAEb,IAAI,eAAe,GAAG,EAAE,CAAC;YACzB,MAAM,mBAAmB,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,EAAE;gBAC9C,eAAe,GAAG,mBAAmB,CACnC,EAAE,EACF,OAAO,EACP,UAAU,EACV,OAAO,CAAC,EAAE,EACV,UAAU,EACV,UAAU,CACX,CAAC;gBACF,IAAI,OAAO,CAAC,UAAU,KAAK,UAAU,EAAE,CAAC;oBACtC,EAAE,CAAC,OAAO,CAAC,sDAAsD,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;gBAClF,CAAC;qBAAM,IAAI,OAAO,CAAC,UAAU,KAAK,mBAAmB,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;oBAC5E,EAAE,CAAC,OAAO,CACR,+EAA+E,CAChF,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,OAAO,CAAC,CAAC;gBACrD,CAAC;YACH,CAAC,CAAC,CAAC;YACH,mBAAmB,EAAE,CAAC;YAEtB,OAAO;gBACL,MAAM,EAAE,eAAe;gBACvB,eAAe;gBACf,UAAU,EAAE,OAAO;gBACnB,UAAU,EAAE,cAAc;gBAC1B,UAAU,EAAE,OAAO,CAAC,UAAU,IAAI,IAAI;aACvC,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC;AAC5B,CAAC;AAED,SAAS,sBAAsB,CAC7B,EAAqB,EACrB,WAAqB,EACrB,cAAkC;I
AElC,MAAM,WAAW,GAAG,IAAI,GAAG,EAAU,CAAC;IACtC,WAAW,CAAC,GAAG,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;IAEvC,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,MAAM,YAAY,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC1D,MAAM,IAAI,GAAG,EAAE;aACZ,OAAO,CAAC,qDAAqD,YAAY,GAAG,CAAC;aAC7E,GAAG,CAAC,GAAG,WAAW,CAAgB,CAAC;QACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC,IAAI,CAAC;AAC1B,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,EAAqB,EACrB,QAAgB,EAChB,UAAkB,EAClB,QAAgB,EAChB,UAAkB,EAClB,UAAyB;IAEzB,MAAM,EAAE,GAAG,UAAU,EAAE,CAAC;IACxB,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAErC,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,CAAC;IAC/C,MAAM,UAAU,GAAG,UAAU,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC;IAC3C,MAAM,cAAc,GAAG,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAEtE,EAAE,CAAC,OAAO,CACR;;;;GAID,CACA,CAAC,GAAG,CAAC,EAAE,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,EAAE,UAAU,EAAE,KAAK,EAAE,cAAc,EAAE,UAAU,EAAE,GAAG,CAAC,CAAC;IAE9F,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,EAAqB,EACrB,eAAuB,EACvB,aAAqB;IAErB,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACrC,EAAE,CAAC,OAAO,CACR;;;;;;GAMD,CACA,CAAC,GAAG,CAAC,aAAa,EAAE,GAAG,EAAE,eAAe,CAAC,CAAC;AAC7C,CAAC"}
@@ -24,13 +24,13 @@ These numbers measure Audrey's local call path under an in-process mock embeddin
24
24
 
25
25
  ## Behavioral Regression Result
26
26
 
27
- The current `benchmarks/output/summary.json` was generated on 2026-05-13T08:33:24.917Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
27
+ The current `benchmarks/output/summary.json` was generated on 2026-05-29T03:45:32.997Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
28
28
 
29
29
  | System | Score Percent | Pass Rate | Average Duration Ms |
30
30
  |---|---:|---:|---:|
31
- | Audrey | 100 | 100 | 15.083333333333334 |
32
- | Vector Only | 41.66666666666667 | 25 | 0.25 |
33
- | Keyword + Recency | 41.66666666666667 | 25 | 0.5 |
31
+ | Audrey | 100 | 100 | 15.416666666666666 |
32
+ | Vector Only | 41.66666666666667 | 25 | 0.3333333333333333 |
33
+ | Keyword + Recency | 41.66666666666667 | 25 | 0.6666666666666666 |
34
34
  | Recent Window | 37.5 | 25 | 0 |
35
35
 
36
36
  This output is a regression-gate result. The baselines are toy local baselines used to catch retrieval and lifecycle regressions in the Audrey codebase. They are not external systems, not tuned competitor implementations, and not GuardBench baselines (Ledger: E23-E24). The current suite covers retrieval and operation families such as information extraction, knowledge updates, multi-session reasoning, conflict resolution, procedural learning, privacy boundary, overwrite, delete-and-abstain, semantic merge, and procedural merge (Ledger: E23-E24).
@@ -55,7 +55,7 @@ It reports local adapters only, not external-system comparisons (Ledger: E46):
55
55
  | Evidence recall | 100% |
56
56
  | Redaction leaks | 0 |
57
57
  | Recall-degradation detection | 100% |
58
- | Guard latency p50 / p95 | 3.214 ms / 21.395 ms |
58
+ | Guard latency p50 / p95 | 2.916 ms / 21.17 ms |
59
59
  | Published artifact raw-secret leaks | 0 |
60
60
  | Audrey Guard decision accuracy | 100% |
61
61
  | No-memory decision accuracy | 10% |
@@ -344,7 +344,7 @@ The deterministic demo, `audrey demo --scenario repeated-failure`, constructs a
344
344
 
345
345
  The current paper version has two implemented empirical anchors. First, `benchmarks/snapshots/perf-0.22.2.json` reports canonical local performance under the mock-provider methodology: generated on 2026-05-01 from git SHA `e2e821b`, using mock 64-dimensional in-process embeddings, hybrid recall limit 5, and corpus sizes 100, 1,000, and 5,000 on Node 25.5.0 with a 24-core Ryzen 9 7900X3D and 62.9 GB RAM (Ledger: E20). Under that methodology, hybrid recall p95 is 1.82 ms, 2.364 ms, and 3.417 ms for those three sizes, and encode p95 is 0.589 ms, 2.147 ms, and 1.838 ms (Ledger: E21-E22).
346
346
 
347
- Second, `bench:memory:check` is wired into the release gate and enforces retrieval/lifecycle benchmark guardrails against weak local baselines (Ledger: E23). The current checked-in output reports a 2026-05-08 mock-provider run in which Audrey scores 100% with 100% pass rate, while the strongest listed local baselines score 41.67% with 25% pass rate in that output (Ledger: E24). These numbers support regression-gate honesty; they do not replace GuardBench.
347
+ Second, `bench:memory:check` is wired into the release gate as an **internal regression suite**, not a competitive benchmark. It exists to catch retrieval/lifecycle regressions in Audrey itself. The suite includes hand-tuned weak local baselines (vector-only, keyword-plus-recency, recent-window) whose role is to anchor a relative pass margin — they are not stand-ins for production memory systems, and their scores should not be cited as comparative claims about any external system (Ledger: E23). The current checked-in output reports a 2026-05-08 mock-provider run in which Audrey scores 100% with 100% pass rate and the listed local stub baselines score in the 25-42% range in that same run (Ledger: E24). These numbers support regression-gate honesty inside this repository; they are not cross-system results and they do not replace GuardBench.
348
348
 
349
349
  The README benchmark table currently differs from the canonical JSON snapshot, so the paper quotes only the JSON snapshot and tracks the README correction as a follow-up (Ledger: E28).
350
350
 
@@ -895,13 +895,13 @@ These numbers measure Audrey's local call path under an in-process mock embeddin
895
895
 
896
896
  ### Behavioral Regression Result
897
897
 
898
- The current `benchmarks/output/summary.json` was generated on 2026-05-13T08:33:24.917Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
898
+ The current `benchmarks/output/summary.json` was generated on 2026-05-29T03:45:32.997Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
899
899
 
900
900
  | System | Score Percent | Pass Rate | Average Duration Ms |
901
901
  |---|---:|---:|---:|
902
- | Audrey | 100 | 100 | 15.083333333333334 |
903
- | Vector Only | 41.66666666666667 | 25 | 0.25 |
904
- | Keyword + Recency | 41.66666666666667 | 25 | 0.5 |
902
+ | Audrey | 100 | 100 | 15.416666666666666 |
903
+ | Vector Only | 41.66666666666667 | 25 | 0.3333333333333333 |
904
+ | Keyword + Recency | 41.66666666666667 | 25 | 0.6666666666666666 |
905
905
  | Recent Window | 37.5 | 25 | 0 |
906
906
 
907
907
  This output is a regression-gate result. The baselines are toy local baselines used to catch retrieval and lifecycle regressions in the Audrey codebase. They are not external systems, not tuned competitor implementations, and not GuardBench baselines (Ledger: E23-E24). The current suite covers retrieval and operation families such as information extraction, knowledge updates, multi-session reasoning, conflict resolution, procedural learning, privacy boundary, overwrite, delete-and-abstain, semantic merge, and procedural merge (Ledger: E23-E24).
@@ -924,7 +924,7 @@ It reports local adapters only, not external-system comparisons (Ledger: E46):
924
924
  | Evidence recall | 100% |
925
925
  | Redaction leaks | 0 |
926
926
  | Recall-degradation detection | 100% |
927
- | Guard latency p50 / p95 | 3.214 ms / 21.395 ms |
927
+ | Guard latency p50 / p95 | 2.916 ms / 21.17 ms |
928
928
  | Published artifact raw-secret leaks | 0 |
929
929
  | Audrey Guard decision accuracy | 100% |
930
930
  | No-memory decision accuracy | 10% |
@@ -49,7 +49,7 @@ Every implementation claim in the paper should point to one or more ledger IDs i
49
49
  | E43 - Audrey exposes a Claude Code hook generator, guarded settings apply path, and hook-mode Guard command: `hook-config claude-code` emits hooks, `hook-config claude-code --apply --scope project|user` merges them into Claude Code settings with backup/idempotence, `guard --hook --fail-on-warn` consumes PreToolUse JSON and returns `hookSpecificOutput.permissionDecision`, and `observe-tool` records post-tool events. Codex hook wiring remains pending on a stable host hook surface. | Hook integration boundary | README.md; mcp-server/index.ts; tests/mcp-server.test.js | Yes, focused Vitest and CLI hook smoke passed on 2026-05-12 |
50
50
  | E44 - Audrey preflight events now persist `preflight_evidence_ids` and `audrey_guard_action_key`; `memory_validate` accepts optional `preflight_event_id`, action key, and evidence ids, persists them on the validation audit event, and rejects validation lineage when the memory id was not evidence for that preflight. | Validation lineage implementation | src/action-key.ts; src/controller.ts; src/preflight.ts; src/audrey.ts; mcp-server/index.ts; tests/controller.test.js | Yes, focused Vitest passed on 2026-05-12 |
51
51
  | E45 - Preflight risk scoring uses a fixed severity map (`info=0.1`, `low=0.25`, `medium=0.55`, `high=0.85`), sorts warnings by severity, and strict mode blocks on high-severity warnings; the scoring path does not consume validation feedback. | Fixed risk scoring boundary | src/preflight.ts:6-60,291-299,332-338; src/feedback.ts:3-18,70-163 | Yes, 2026-05-08 |
52
- | E46 - `benchmarks/guardbench.js` runs ten local comparative GuardBench scenarios across Audrey Guard, no-memory, recent-window, vector-only, and FTS-only adapters and writes `benchmarks/output/guardbench-summary.json`, `benchmarks/output/guardbench-manifest.json`, and `benchmarks/output/guardbench-raw.json`; the latest local run has Audrey Guard passing 10/10 scenarios with 100% prevention rate, 0% false-block rate, 100% evidence recall, zero decision-output redaction leaks, zero published artifact raw-secret leaks, 100% recall-degradation detection, 100% decision accuracy, and 3.214ms/21.395ms p50/p95 guard latency under the mock-provider methodology. Baseline decision accuracy was no-memory 10%, recent-window 60%, vector-only 40%, and FTS-only 10%, with 0% full-contract pass rate for each baseline. | GuardBench local comparative results | benchmarks/guardbench.js; benchmarks/output/guardbench-summary.json; benchmarks/output/guardbench-manifest.json; benchmarks/output/guardbench-raw.json; package.json | Yes, `npm run bench:guard:check` passed on 2026-05-13 |
52
+ | E46 - `benchmarks/guardbench.js` runs ten local comparative GuardBench scenarios across Audrey Guard, no-memory, recent-window, vector-only, and FTS-only adapters and writes `benchmarks/output/guardbench-summary.json`, `benchmarks/output/guardbench-manifest.json`, and `benchmarks/output/guardbench-raw.json`; the latest local run has Audrey Guard passing 10/10 scenarios with 100% prevention rate, 0% false-block rate, 100% evidence recall, zero decision-output redaction leaks, zero published artifact raw-secret leaks, 100% recall-degradation detection, 100% decision accuracy, and 2.916ms/21.17ms p50/p95 guard latency under the mock-provider methodology. Baseline decision accuracy was no-memory 10%, recent-window 60%, vector-only 40%, and FTS-only 10%, with 0% full-contract pass rate for each baseline. | GuardBench local comparative results | benchmarks/guardbench.js; benchmarks/output/guardbench-summary.json; benchmarks/output/guardbench-manifest.json; benchmarks/output/guardbench-raw.json; package.json | Yes, `npm run bench:guard:check` passed on 2026-05-13 |
53
53
  | E47 - GuardBench accepts external ESM adapters through `--adapter`, supports `default`, `adapter`, or `createGuardBenchAdapter()` exports, withholds `expectedDecision` and `requiredEvidence` during adapter execution, then scores adapter output against the same full-contract decision/evidence/redaction checks. | GuardBench external adapter contract | benchmarks/guardbench.js; tests/guardbench.test.js; package.json | Yes, `node scripts/run-vitest.mjs run tests/guardbench.test.js` passed on 2026-05-12 |
54
54
  | E48 - Audrey ships a Mem0 Platform GuardBench adapter that uses the current Mem0 REST shape: V3 async memory add with event polling, V2 filtered memory search, and user-entity cleanup. It requires runtime `MEM0_API_KEY` and is not run by default. | First external-system GuardBench adapter | benchmarks/adapters/mem0-platform.mjs; tests/guardbench.test.js; README.md | Import/contract and mocked REST-flow tests passed on 2026-05-12; live Mem0 run not yet executed |
55
55
  | E49 - GuardBench ships a credential-free example external adapter and a `bench:guard:adapter-smoke` script so the adapter loader can be exercised through the real CLI path without external credentials. | External adapter smoke path | benchmarks/adapters/example-allow.mjs; package.json; README.md; tests/guardbench.test.js | Yes, `npm run bench:guard:adapter-smoke` passed on 2026-05-12 |
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "Audrey arXiv source package",
4
- "generatedAt": "2026-05-13T23:33:56.985Z",
4
+ "generatedAt": "2026-05-29T03:45:42.097Z",
5
5
  "sourceMarkdown": "docs/paper/audrey-paper-v1.md",
6
6
  "publicationPack": "docs/paper/publication-pack.json",
7
7
  "sourceHashes": {
8
- "sourceMarkdown": "3f9c807f0e0e82b0f19018b83900b62efecc17ff9bcee5b493a5d4fe0528ad7a",
8
+ "sourceMarkdown": "f2afc1cda24b1ba91cf39429e7836ead97c7a1c815b294b3d623bb20d8f5e7e6",
9
9
  "publicationPack": "a1a523d5938faea72be568b843ac3890e61cea6070b0cfa46acf22ad3d2fb974",
10
10
  "referencesBib": "c0bfcaf7bfe37d6933c812e46352be8a95397eaa430a0f5bc94037600a53f654"
11
11
  },
@@ -13,8 +13,8 @@
13
13
  {
14
14
  "path": "main.tex",
15
15
  "source": "docs/paper/audrey-paper-v1.md",
16
- "bytes": 122247,
17
- "sha256": "b0122e625380ad9a6aff78e2acbd5984b558c054962f8774f77fee70e6588d06"
16
+ "bytes": 122667,
17
+ "sha256": "e3ee98ea8c523e8f394b8fbbc73e206f0d6126b7349df5669206d4d48d9feea6"
18
18
  },
19
19
  {
20
20
  "path": "references.bib",
@@ -385,7 +385,7 @@ The deterministic demo, \texttt{audrey demo --scenario repeated-failure}, constr
385
385
 
386
386
  The current paper version has two implemented empirical anchors. First, \texttt{benchmarks/snapshots/perf-0.22.2.json} reports canonical local performance under the mock-provider methodology: generated on 2026-05-01 from git SHA \texttt{e2e821b}, using mock 64-dimensional in-process embeddings, hybrid recall limit 5, and corpus sizes 100, 1,000, and 5,000 on Node 25.5.0 with a Ryzen 9 7900X3D (12 cores / 24 threads) and 62.9 GB RAM (Ledger: E20). Under that methodology, hybrid recall p95 is 1.82 ms, 2.364 ms, and 3.417 ms for those three sizes, and encode p95 is 0.589 ms, 2.147 ms, and 1.838 ms (Ledger: E21-E22).
387
387
 
388
- Second, \texttt{bench:memory:check} is wired into the release gate and enforces retrieval/lifecycle benchmark guardrails against weak local baselines (Ledger: E23). The current checked-in output reports a 2026-05-08 mock-provider run in which Audrey scores 100\% with 100\% pass rate, while the strongest listed local baselines score 41.67\% with 25\% pass rate in that output (Ledger: E24). These numbers support regression-gate honesty; they do not replace GuardBench.
388
+ Second, \texttt{bench:memory:check} is wired into the release gate as an \textbf{internal regression suite}, not a competitive benchmark. It exists to catch retrieval/lifecycle regressions in Audrey itself. The suite includes hand-tuned weak local baselines (vector-only, keyword-plus-recency, recent-window) whose role is to anchor a relative pass margin — they are not stand-ins for production memory systems, and their scores should not be cited as comparative claims about any external system (Ledger: E23). The current checked-in output reports a 2026-05-08 mock-provider run in which Audrey scores 100\% with 100\% pass rate and the listed local stub baselines score 37.5-41.67\% with a 25\% pass rate in that same run (Ledger: E24). These numbers support regression-gate honesty inside this repository; they are not cross-system results and they do not replace GuardBench.
389
389
 
390
390
  The README benchmark table currently differs from the canonical JSON snapshot, so the paper quotes only the JSON snapshot and tracks the README correction as a follow-up (Ledger: E28).
391
391
 
@@ -729,13 +729,13 @@ These numbers measure Audrey's local call path under an in-process mock embeddin
729
729
 
730
730
  \subsection{Behavioral Regression Result}
731
731
 
732
- The current \texttt{benchmarks/output/summary.json} was generated on 2026-05-13T08:33:24.917Z with command \texttt{node benchmarks/run.js --provider mock --dimensions 64} (Ledger: E24). It reports:
732
+ The current \texttt{benchmarks/output/summary.json} was generated on 2026-05-29T03:45:32.997Z with command \texttt{node benchmarks/run.js --provider mock --dimensions 64} (Ledger: E24). It reports:
733
733
 
734
734
  \begin{verbatim}
735
735
  | System | Score Percent | Pass Rate | Average Duration Ms |
736
- | Audrey | 100 | 100 | 15.083333333333334 |
737
- | Vector Only | 41.66666666666667 | 25 | 0.25 |
738
- | Keyword + Recency | 41.66666666666667 | 25 | 0.5 |
736
+ | Audrey | 100 | 100 | 15.416666666666666 |
737
+ | Vector Only | 41.66666666666667 | 25 | 0.3333333333333333 |
738
+ | Keyword + Recency | 41.66666666666667 | 25 | 0.6666666666666666 |
739
739
  | Recent Window | 37.5 | 25 | 0 |
740
740
  \end{verbatim}
741
741
 
@@ -759,7 +759,7 @@ It reports local adapters only, not external-system comparisons (Ledger: E46):
759
759
  | Evidence recall | 100% |
760
760
  | Redaction leaks | 0 |
761
761
  | Recall-degradation detection | 100% |
762
- | Guard latency p50 / p95 | 3.214 ms / 21.395 ms |
762
+ | Guard latency p50 / p95 | 2.916 ms / 21.17 ms |
763
763
  | Published artifact raw-secret leaks | 0 |
764
764
  | Audrey Guard decision accuracy | 100% |
765
765
  | No-memory decision accuracy | 10% |
@@ -1,13 +1,13 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "Audrey arXiv compile check",
4
- "generatedAt": "2026-05-13T23:33:57.318Z",
4
+ "generatedAt": "2026-05-29T03:45:42.412Z",
5
5
  "source": {
6
6
  "sourceDir": "docs/paper/output/arxiv",
7
7
  "manifest": "docs/paper/output/arxiv/arxiv-manifest.json",
8
- "manifestSha256": "386121a5f6234f56a370c5c54bde4ff12fc462921e7e5770d6968a4710271df8",
8
+ "manifestSha256": "6364c368755a4188d5b2deea9e3fe80201f5fa55658e6ae570141e54c5293bc6",
9
9
  "mainTex": "docs/paper/output/arxiv/main.tex",
10
- "mainTexSha256": "b0122e625380ad9a6aff78e2acbd5984b558c054962f8774f77fee70e6588d06",
10
+ "mainTexSha256": "e3ee98ea8c523e8f394b8fbbc73e206f0d6126b7349df5669206d4d48d9feea6",
11
11
  "referencesBib": "docs/paper/output/arxiv/references.bib",
12
12
  "referencesBibSha256": "c0bfcaf7bfe37d6933c812e46352be8a95397eaa430a0f5bc94037600a53f654"
13
13
  },
@@ -52,7 +52,7 @@ npx audrey guard --tool Bash "npm run deploy"
52
52
  Expected first-run shape:
53
53
 
54
54
  ```text
55
- Audrey Doctor v1.0.0
55
+ Audrey Doctor v1.0.2
56
56
  Store health: not initialized
57
57
  Verdict: ready
58
58
  ```
@@ -94,7 +94,7 @@ and writes a timestamped backup before changing a non-empty file. The generated
94
94
  and `PostToolUseFailure` hooks record redacted tool traces. Verify the active
95
95
  hook set inside Claude Code with `/hooks`.
96
96
 
97
- All local MCP paths default to local embeddings and one shared SQLite-backed memory directory. Use `AUDREY_DATA_DIR` to isolate projects, tenants, or host identities.
97
+ All local MCP paths default to local embeddings and one shared SQLite-backed memory directory. **Set a distinct `AUDREY_DATA_DIR` per tenant, agent identity, or concurrent host.** SQLite uses WAL mode without an advisory lock, so two processes sharing a directory will contend on writes. Isolation is a hard requirement for multi-agent setups, not a recommendation.
98
98
 
99
99
  Installer-generated host config does not include provider API keys by default. Prefer setting `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`, or `GEMINI_API_KEY` in the host runtime environment; use `npx audrey install --include-secrets` only if you explicitly accept argv/config exposure.
100
100
 
@@ -296,10 +296,23 @@ output shapes are validated by JSON schemas under `benchmarks/schemas/`.
296
296
 
297
297
  Latest local result in this checkout: 10/10 scenarios passed, 100% prevention
298
298
  rate, 0% false-block rate, 0 raw secret leaks, 0 published artifact leaks in
299
- the raw-secret sweep, and 3.214ms / 21.395ms
300
- p50/p95 guard latency under the mock-provider methodology. Local baseline
301
- decision accuracy was: no-memory 10%, recent-window 60%, vector-only 40%, and
302
- FTS-only 10%; none passed the full GuardBench decision-plus-evidence contract.
299
+ the raw-secret sweep, and 2.916ms / 21.17ms
300
+ p50/p95 guard latency under the mock-provider methodology.
301
+
302
+ **Methodology caveats, on purpose.** All numbers above are produced against
303
+ the in-process mock 64-dim embedding provider documented in the run's
304
+ `provenance` block. They characterize Audrey's controller and SQLite path,
305
+ not real-provider end-to-end latency or production false-positive rates. The
306
+ 100% prevention rate is over the 5 GuardBench scenarios that expect a
307
+ `block` decision (the suite is 10 scenarios total, mixed across allow / warn
308
+ / block). Local baseline decision accuracy was: no-memory 10%, recent-window
309
+ 60%, vector-only 40%, and FTS-only 10%; none of the local baselines passed
310
+ the GuardBench decision-plus-evidence contract, which since v1.0.1 requires
311
+ the correct decision plus at least one returned evidence id for `block` /
312
+ `warn` scenarios (no longer Audrey-specific lineage phrasing — see
313
+ `CHANGELOG.md#101---2026-05-15`). External-system numbers for Mem0 and Zep
314
+ are explicitly out of scope for this Stage-A artifact; live credentialed
315
+ runs land in a v2 paper after raw evidence bundles publish.
303
316
 
304
317
  ```bash
305
318
  npm run bench:guard
@@ -517,8 +530,19 @@ The Node sidecar defaults to `127.0.0.1:7437`. The Docker image intentionally bi
517
530
 
518
531
  ## Development
519
532
 
533
+ Developer setup runs from source, not from the published tarball, so `npm run build` is required before any CLI subcommand resolves:
534
+
520
535
  ```bash
521
536
  npm ci
537
+ npm run build
538
+ npm run lint # ESLint (type-checked typescript-eslint); CI requires it clean
539
+ npm run format # Prettier; use `npm run format:check` to verify without writing
540
+ npm test
541
+ ```
542
+
543
+ Once built, the `Quick Start` commands work against the local `dist/` output. Code style and types are enforced: `npm run lint` and `npm run format:check` run in CI (Ubuntu + Windows) and in every release gate, so the baseline cannot regress. The full release gate runs everything CI runs:
544
+
545
+ ```bash
522
546
  npm run release:gate
523
547
  python -m unittest discover -s python/tests -v
524
548
  npm run python:release:check
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "GuardBench adapter self-test",
4
- "generatedAt": "2026-05-13T23:33:55.959Z",
4
+ "generatedAt": "2026-05-29T03:45:40.969Z",
5
5
  "ok": true,
6
6
  "adapter": {
7
7
  "name": "Example Allow Adapter",
@@ -15,21 +15,21 @@
15
15
  "requestedAdapter": "Example Allow Adapter",
16
16
  "scenarios": 10,
17
17
  "expectedScenarios": 10,
18
- "fullContractPassRate": 0,
18
+ "fullContractPassRate": 0.1,
19
19
  "decisionAccuracy": 0.1,
20
20
  "redactionLeaks": 0,
21
21
  "failures": []
22
22
  },
23
23
  "score": {
24
24
  "scenarios": 10,
25
- "fullContractPassRate": 0,
25
+ "fullContractPassRate": 0.1,
26
26
  "decisionAccuracy": 0.1,
27
- "evidenceRecall": 0,
27
+ "evidenceRecall": 0.1,
28
28
  "redactionLeaks": 0,
29
29
  "latency": {
30
- "p50Ms": 0.012,
31
- "p95Ms": 0.049,
32
- "maxMs": 0.049
30
+ "p50Ms": 0.01,
31
+ "p95Ms": 0.043,
32
+ "maxMs": 0.043
33
33
  }
34
34
  },
35
35
  "contract": {
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "GuardBench external adapter dry-run matrix",
4
- "generatedAt": "2026-05-13T23:33:56.533Z",
4
+ "generatedAt": "2026-05-29T03:45:41.522Z",
5
5
  "ok": true,
6
6
  "registry": "benchmarks/adapters/registry.json",
7
7
  "outRoot": "benchmarks/output/external",
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "GuardBench external evidence verification",
4
- "generatedAt": "2026-05-13T23:33:56.821Z",
4
+ "generatedAt": "2026-05-29T03:45:41.794Z",
5
5
  "ok": true,
6
6
  "allowPending": true,
7
7
  "registry": "benchmarks/adapters/registry.json",
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "GuardBench conformance card",
4
- "generatedAt": "2026-05-13T23:33:51.583Z",
4
+ "generatedAt": "2026-05-29T03:45:36.958Z",
5
5
  "sourceDir": "benchmarks/output",
6
6
  "manifestVersion": "0.2.0",
7
7
  "suiteId": "guardbench-local-comparative",
@@ -25,9 +25,9 @@
25
25
  "evidenceRecall": 1,
26
26
  "redactionLeaks": 0,
27
27
  "latency": {
28
- "p50Ms": 3.097,
29
- "p95Ms": 29.711,
30
- "maxMs": 29.711
28
+ "p50Ms": 2.916,
29
+ "p95Ms": 21.17,
30
+ "maxMs": 21.17
31
31
  }
32
32
  },
33
33
  "conformance": {
@@ -39,21 +39,21 @@
39
39
  "integrity": {
40
40
  "artifactHashes": {
41
41
  "guardbench-manifest.json": "57636ce19fdaa6e50fc3fc961d9e499a9f43632f588c713a9fefe8e8a6fa724c",
42
- "guardbench-summary.json": "2a6d5ee83cce2502135fb0442ef8cd3f2679fdc38c84207612c22a800a7a113a",
43
- "guardbench-raw.json": "c5b9c68cf946478fbfba617f17717e05ea3e01301089de19153d59e77e674bc6"
42
+ "guardbench-summary.json": "e8669cd6c80dc3dc849b3c4fcc473ea706eb3a760bced69682d0dc2396b2e233",
43
+ "guardbench-raw.json": "15b39fd1a65709a89455fbfcaf815daf364b204fa526d5065cc12fcaed281d28"
44
44
  },
45
45
  "externalRunMetadataHash": null
46
46
  },
47
47
  "provenance": {
48
- "generatedAt": "2026-05-13T23:33:51.221Z",
49
- "gitSha": "970752172441967c3ede79562eca69b08efb1f12",
48
+ "generatedAt": "2026-05-29T03:45:36.607Z",
49
+ "gitSha": "ceed2f51b615175c8bb412b96b5e5a501561189f",
50
50
  "gitDirty": false,
51
- "node": "v24.14.1",
52
- "v8": "13.6.233.17-node.44",
51
+ "node": "v24.16.0",
52
+ "v8": "13.6.233.17-node.49",
53
53
  "platform": "linux",
54
54
  "arch": "x64",
55
- "osRelease": "6.17.0-1010-azure",
56
- "cpuModel": "AMD EPYC 7763 64-Core Processor",
55
+ "osRelease": "6.17.0-1015-azure",
56
+ "cpuModel": "AMD EPYC 9V74 80-Core Processor",
57
57
  "cpuCount": 4,
58
58
  "totalMemoryGb": 15.61,
59
59
  "embeddingProvider": "mock",