audrey 0.23.1 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/CHANGELOG.md +101 -15
  2. package/LICENSE +21 -21
  3. package/README.md +232 -6
  4. package/SECURITY.md +2 -1
  5. package/benchmarks/adapter-kit.mjs +20 -0
  6. package/benchmarks/adapter-self-test.mjs +166 -0
  7. package/benchmarks/adapters/example-allow.mjs +28 -0
  8. package/benchmarks/adapters/mem0-platform.mjs +267 -0
  9. package/benchmarks/adapters/registry.json +51 -0
  10. package/benchmarks/adapters/zep-cloud.mjs +280 -0
  11. package/benchmarks/baselines.js +169 -0
  12. package/benchmarks/build-leaderboard.mjs +170 -0
  13. package/benchmarks/cases.js +537 -0
  14. package/benchmarks/create-conformance-card.mjs +139 -0
  15. package/benchmarks/create-submission-bundle.mjs +176 -0
  16. package/benchmarks/dry-run-external-adapters.mjs +165 -0
  17. package/benchmarks/guardbench.js +1125 -0
  18. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  19. package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  20. package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  21. package/benchmarks/output/guardbench-conformance-card.json +63 -0
  22. package/benchmarks/output/guardbench-manifest.json +414 -0
  23. package/benchmarks/output/guardbench-raw.json +1271 -0
  24. package/benchmarks/output/guardbench-summary.json +2107 -0
  25. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  26. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  27. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
  28. package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
  29. package/benchmarks/output/submission-bundle/guardbench-raw.json +1271 -0
  30. package/benchmarks/output/submission-bundle/guardbench-summary.json +2107 -0
  31. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
  32. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
  33. package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
  34. package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
  35. package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
  36. package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
  37. package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
  38. package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
  39. package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
  40. package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +184 -0
  41. package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
  42. package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +249 -0
  43. package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  44. package/benchmarks/output/submission-bundle/validation-report.json +31 -0
  45. package/benchmarks/output/summary.json +2354 -0
  46. package/benchmarks/perf-snapshot.js +304 -0
  47. package/benchmarks/perf.bench.js +161 -0
  48. package/benchmarks/public-paths.mjs +78 -0
  49. package/benchmarks/reference-results.js +70 -0
  50. package/benchmarks/report.js +259 -0
  51. package/benchmarks/run-external-guardbench.mjs +281 -0
  52. package/benchmarks/run.js +682 -0
  53. package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  54. package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  55. package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  56. package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  57. package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  58. package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  59. package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  60. package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  61. package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  62. package/benchmarks/schemas/guardbench-raw.schema.json +184 -0
  63. package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  64. package/benchmarks/schemas/guardbench-summary.schema.json +249 -0
  65. package/benchmarks/snapshots/perf-0.22.2.json +123 -0
  66. package/benchmarks/snapshots/perf-0.23.0.json +123 -0
  67. package/benchmarks/validate-adapter-module.mjs +104 -0
  68. package/benchmarks/validate-adapter-registry.mjs +134 -0
  69. package/benchmarks/validate-adapter-self-test.mjs +96 -0
  70. package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
  71. package/benchmarks/verify-external-evidence.mjs +296 -0
  72. package/benchmarks/verify-publication-artifacts.mjs +286 -0
  73. package/benchmarks/verify-submission-bundle.mjs +167 -0
  74. package/dist/mcp-server/config.d.ts +1 -1
  75. package/dist/mcp-server/config.d.ts.map +1 -1
  76. package/dist/mcp-server/config.js +1 -1
  77. package/dist/mcp-server/config.js.map +1 -1
  78. package/dist/mcp-server/index.d.ts +65 -3
  79. package/dist/mcp-server/index.d.ts.map +1 -1
  80. package/dist/mcp-server/index.js +675 -157
  81. package/dist/mcp-server/index.js.map +1 -1
  82. package/dist/src/action-key.d.ts +9 -0
  83. package/dist/src/action-key.d.ts.map +1 -0
  84. package/dist/src/action-key.js +49 -0
  85. package/dist/src/action-key.js.map +1 -0
  86. package/dist/src/adaptive.js +5 -5
  87. package/dist/src/affect.js +8 -8
  88. package/dist/src/audrey.d.ts +13 -0
  89. package/dist/src/audrey.d.ts.map +1 -1
  90. package/dist/src/audrey.js +68 -3
  91. package/dist/src/audrey.js.map +1 -1
  92. package/dist/src/capsule.js +4 -4
  93. package/dist/src/causal.js +3 -3
  94. package/dist/src/consolidate.js +48 -48
  95. package/dist/src/controller.d.ts +78 -6
  96. package/dist/src/controller.d.ts.map +1 -1
  97. package/dist/src/controller.js +273 -53
  98. package/dist/src/controller.js.map +1 -1
  99. package/dist/src/db.js +172 -172
  100. package/dist/src/decay.js +8 -8
  101. package/dist/src/embedding.d.ts +2 -1
  102. package/dist/src/embedding.d.ts.map +1 -1
  103. package/dist/src/embedding.js +39 -29
  104. package/dist/src/embedding.js.map +1 -1
  105. package/dist/src/encode.js +6 -6
  106. package/dist/src/feedback.d.ts +6 -0
  107. package/dist/src/feedback.d.ts.map +1 -1
  108. package/dist/src/feedback.js +6 -0
  109. package/dist/src/feedback.js.map +1 -1
  110. package/dist/src/forget.js +12 -12
  111. package/dist/src/hybrid-recall.js +9 -9
  112. package/dist/src/impact.js +6 -6
  113. package/dist/src/import.d.ts +3 -3
  114. package/dist/src/import.js +41 -41
  115. package/dist/src/index.d.ts +5 -4
  116. package/dist/src/index.d.ts.map +1 -1
  117. package/dist/src/index.js +3 -3
  118. package/dist/src/index.js.map +1 -1
  119. package/dist/src/interference.js +14 -14
  120. package/dist/src/introspect.js +18 -18
  121. package/dist/src/preflight.d.ts.map +1 -1
  122. package/dist/src/preflight.js +41 -0
  123. package/dist/src/preflight.js.map +1 -1
  124. package/dist/src/promote.js +7 -7
  125. package/dist/src/prompts.js +118 -118
  126. package/dist/src/recall.js +30 -30
  127. package/dist/src/reflexes.d.ts +1 -0
  128. package/dist/src/reflexes.d.ts.map +1 -1
  129. package/dist/src/reflexes.js +3 -0
  130. package/dist/src/reflexes.js.map +1 -1
  131. package/dist/src/rollback.js +4 -4
  132. package/dist/src/routes.d.ts.map +1 -1
  133. package/dist/src/routes.js +71 -2
  134. package/dist/src/routes.js.map +1 -1
  135. package/dist/src/validate.js +25 -25
  136. package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  137. package/docs/MEMORY_BENCHMARKING.md +59 -0
  138. package/docs/PRODUCTION_BACKLOG.md +304 -0
  139. package/docs/paper/00-master.md +48 -0
  140. package/docs/paper/01-introduction.md +27 -0
  141. package/docs/paper/02-related-work.md +47 -0
  142. package/docs/paper/03-problem-definition.md +108 -0
  143. package/docs/paper/04-design.md +164 -0
  144. package/docs/paper/05-guardbench-spec.md +412 -0
  145. package/docs/paper/06-implementation.md +113 -0
  146. package/docs/paper/07-evaluation.md +168 -0
  147. package/docs/paper/08-discussion-limitations.md +61 -0
  148. package/docs/paper/09-conclusion.md +11 -0
  149. package/docs/paper/SUBMISSION_README.md +162 -0
  150. package/docs/paper/appendix-a-demo-transcript.md +114 -0
  151. package/docs/paper/arxiv-compile-report.schema.json +116 -0
  152. package/docs/paper/arxiv-source.schema.json +61 -0
  153. package/docs/paper/audrey-paper-v1.md +1106 -0
  154. package/docs/paper/browser-launch-plan.json +209 -0
  155. package/docs/paper/browser-launch-plan.schema.json +100 -0
  156. package/docs/paper/browser-launch-results.json +86 -0
  157. package/docs/paper/browser-launch-results.schema.json +66 -0
  158. package/docs/paper/claim-register.json +138 -0
  159. package/docs/paper/claim-register.schema.json +81 -0
  160. package/docs/paper/evidence-ledger.md +103 -0
  161. package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  162. package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  163. package/docs/paper/output/arxiv/main.tex +949 -0
  164. package/docs/paper/output/arxiv/references.bib +222 -0
  165. package/docs/paper/output/arxiv-compile-report.json +24 -0
  166. package/docs/paper/output/submission-bundle/LICENSE +21 -0
  167. package/docs/paper/output/submission-bundle/README.md +555 -0
  168. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  169. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  170. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  171. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
  172. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
  173. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1271 -0
  174. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +2107 -0
  175. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  176. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  177. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  178. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
  179. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
  180. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  181. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  182. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  183. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  184. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  185. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  186. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  187. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  188. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  189. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +184 -0
  190. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  191. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +249 -0
  192. package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  193. package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
  194. package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
  195. package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
  196. package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
  197. package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
  198. package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
  199. package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
  200. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
  201. package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
  202. package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
  203. package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
  204. package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
  205. package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
  206. package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
  207. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
  208. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
  209. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
  210. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
  211. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
  212. package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
  213. package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
  214. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
  215. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  216. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  217. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
  218. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
  219. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
  220. package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
  221. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
  222. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
  223. package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
  224. package/docs/paper/output/submission-bundle/package.json +212 -0
  225. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
  226. package/docs/paper/paper-submission-bundle.schema.json +70 -0
  227. package/docs/paper/publication-pack.json +81 -0
  228. package/docs/paper/publication-pack.schema.json +60 -0
  229. package/docs/paper/references.bib +222 -0
  230. package/package.json +87 -4
  231. package/scripts/audit-release-completion.mjs +362 -0
  232. package/scripts/create-arxiv-source.mjs +362 -0
  233. package/scripts/create-paper-submission-bundle.mjs +210 -0
  234. package/scripts/finalize-release.mjs +526 -0
  235. package/scripts/prepare-release-cut.mjs +269 -0
  236. package/scripts/publish-release-bundle.mjs +209 -0
  237. package/scripts/publish-release-github-api.mjs +429 -0
  238. package/scripts/run-vitest.mjs +34 -0
  239. package/scripts/smoke-cli.js +92 -0
  240. package/scripts/sync-paper-artifacts.mjs +109 -0
  241. package/scripts/verify-arxiv-compile.mjs +440 -0
  242. package/scripts/verify-arxiv-source.mjs +194 -0
  243. package/scripts/verify-browser-launch-plan.mjs +237 -0
  244. package/scripts/verify-browser-launch-results.mjs +285 -0
  245. package/scripts/verify-paper-artifacts.mjs +338 -0
  246. package/scripts/verify-paper-claims.mjs +226 -0
  247. package/scripts/verify-paper-submission-bundle.mjs +207 -0
  248. package/scripts/verify-publication-pack.mjs +196 -0
  249. package/scripts/verify-python-package.py +201 -0
  250. package/scripts/verify-release-readiness.mjs +785 -0
@@ -0,0 +1,280 @@
1
+ import { randomBytes } from 'node:crypto';
2
+
3
+ const DEFAULT_BASE_URL = 'https://api.getzep.com';
4
+ const DEFAULT_INGEST_DELAY_MS = 0;
5
+ const BATCH_SIZE = 50;
6
+
7
/**
 * Read a mandatory environment variable.
 *
 * @param {string} name - Name of the environment variable to look up.
 * @returns {string} The variable's value (always non-empty).
 * @throws {Error} When the variable is unset or set to an empty string.
 */
function requireEnv(name) {
  const { [name]: value } = process.env;
  if (value) return value;
  throw new Error(`${name} is required for the Zep Cloud GuardBench adapter.`);
}
14
+
15
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => setTimeout(done, ms));
}
18
+
19
/**
 * Flatten an action descriptor into newline-separated text.
 *
 * The `action`, `command`, `tool` and `cwd` fields come first, followed by
 * every entry of `files`; falsy entries are dropped.
 *
 * @param {object} action - Action descriptor.
 * @returns {string} Joined text, or an empty string when nothing is set.
 */
function actionText(action) {
  const fileList = action.files ?? [];
  const fields = [action.action, action.command, action.tool, action.cwd, ...fileList];
  const present = fields.filter((field) => Boolean(field));
  return present.join('\n');
}
24
+
25
/**
 * Coerce a value to a lower-cased string.
 *
 * @param {*} text - Any value; falsy values become the empty string.
 * @returns {string} Lower-cased string form of the input.
 */
function normalize(text) {
  const raw = text || '';
  return String(raw).toLowerCase();
}
28
+
29
/**
 * Split text into lower-case alphanumeric tokens longer than two characters.
 *
 * @param {*} text - Value to tokenize; it is normalized first.
 * @returns {string[]} Tokens in their original order (duplicates kept).
 */
function tokenize(text) {
  // Every run of non-alphanumeric characters collapses to a single space.
  const spaced = normalize(text).replace(/[^a-z0-9]+/g, ' ').trim();
  const tokens = [];
  for (const word of spaced.split(/\s+/)) {
    if (word.length > 2) tokens.push(word);
  }
  return tokens;
}
36
+
37
/**
 * Fraction of the tokens of `a` that also occur in `b`.
 *
 * @param {*} a - Text whose tokens form the denominator.
 * @param {*} b - Text searched for those tokens.
 * @returns {number} Value in [0, 1]; 0 when `a` yields no tokens.
 */
function tokenOverlap(a, b) {
  const queryTokens = tokenize(a);
  if (queryTokens.length === 0) return 0;

  const candidateTokens = new Set(tokenize(b));
  const hits = queryTokens.filter((token) => candidateTokens.has(token)).length;
  return hits / queryTokens.length;
}
47
+
48
/**
 * Build the searchable text of one search result.
 *
 * @param {object|null|undefined} result - Search result row.
 * @returns {string} The truthy `fact`, `content`, `summary`, `name` and
 *   `context` fields, in that order, joined by newlines.
 */
function resultText(result) {
  const fields = ['fact', 'content', 'summary', 'name', 'context'];
  return fields
    .map((field) => result?.[field])
    .filter(Boolean)
    .join('\n');
}
57
+
58
/**
 * Flatten a search response into a single list of rows.
 *
 * @param {object|null|undefined} response - Parsed search response.
 * @returns {object[]} Rows from the `edges`, `episodes`, `nodes`,
 *   `observations` and `thread_summaries` arrays (in that order), followed by
 *   a synthetic `zep-context` row when the response has a truthy `context`.
 */
function collectSearchResults(response) {
  const rows = [];
  const collections = ['edges', 'episodes', 'nodes', 'observations', 'thread_summaries'];
  collections.forEach((key) => {
    const items = response?.[key];
    if (!Array.isArray(items)) return;
    for (const item of items) rows.push(item);
  });

  const context = response?.context;
  if (context) {
    rows.push({ uuid: 'zep-context', context });
  }
  return rows;
}
66
+
67
/**
 * Derive one identifier per search result.
 *
 * @param {Array<object|null|undefined>} results - Search result rows.
 * @returns {Array} For each row, the first non-nullish value among `uuid`,
 *   `id`, `task_id` and `thread_id`; otherwise `zep-result-<1-based index>`.
 */
function evidenceIds(results) {
  const idKeys = ['uuid', 'id', 'task_id', 'thread_id'];
  const pickId = (result, position) => {
    for (const key of idKeys) {
      const candidate = result?.[key];
      if (candidate != null) return candidate;
    }
    return `zep-result-${position + 1}`;
  };
  return results.map(pickId);
}
71
+
72
/**
 * Turn retrieved search results into a guard decision.
 *
 * Outcomes, checked in this order:
 *  - a truthy `unsupportedFault` always yields a `warn` (0.55);
 *  - results containing policy-like wording yield `block` (0.85) when at least
 *    one result shares >= 18% of the action's tokens, otherwise `warn` (0.55);
 *  - any other non-empty result set yields an advisory `warn` (0.35);
 *  - no results yields `allow` (0).
 *
 * @param {object[]} results - Rows gathered from the search response.
 * @param {object} action - Action being evaluated.
 * @param {*} [unsupportedFault=null] - Fault-injection descriptor, if any.
 * @returns {{decision: string, riskScore: number, evidenceIds: Array,
 *   recommendedActions: string[], summary: string}} Decision record.
 */
function decisionFromSearchResults(results, action, unsupportedFault = null) {
  if (unsupportedFault) {
    return {
      decision: 'warn',
      riskScore: 0.55,
      evidenceIds: evidenceIds(results),
      recommendedActions: ['External adapter cannot inject storage faults into Zep Cloud; verify memory health separately.'],
      summary: `Zep Cloud adapter cannot emulate fault injection: ${unsupportedFault}.`,
    };
  }

  const query = actionText(action);
  const texts = results.map((row) => resultText(row));
  const haystack = texts.join('\n').toLowerCase();
  const looksLikePolicy = /\b(must-follow|never|do not|high-risk|conflicting)\b/i.test(haystack);

  if (!looksLikePolicy) {
    if (results.length > 0) {
      return {
        decision: 'warn',
        riskScore: 0.35,
        evidenceIds: evidenceIds(results),
        recommendedActions: ['Treat retrieved Zep graph memory as advisory context.'],
        summary: 'Zep returned related graph memory, but no policy-like control evidence was found.',
      };
    }
    return {
      decision: 'allow',
      riskScore: 0,
      evidenceIds: [],
      recommendedActions: [],
      summary: 'Zep returned no relevant graph memory for this action.',
    };
  }

  // Policy wording alone only warns; lexical overlap with the action escalates it.
  const relevant = texts.some((text) => tokenOverlap(query, text) >= 0.18);
  if (relevant) {
    return {
      decision: 'block',
      riskScore: 0.85,
      evidenceIds: evidenceIds(results),
      recommendedActions: ['Review retrieved Zep graph memory before acting.'],
      summary: 'Zep returned policy-like graph memory with lexical overlap; adapter escalated from retrieval to a block.',
    };
  }
  return {
    decision: 'warn',
    riskScore: 0.55,
    evidenceIds: evidenceIds(results),
    recommendedActions: ['Review retrieved Zep graph memory before acting.'],
    summary: 'Zep returned policy-like graph memory without enough lexical overlap for a block.',
  };
}
117
+
118
/**
 * Minimal REST client for the Zep Cloud v2 endpoints used by this adapter:
 * users, sessions, session memory, graph search and user deletion.
 */
class ZepCloudClient {
  /**
   * @param {object} [options]
   * @param {string} [options.apiKey] - API key; defaults to the required
   *   `ZEP_API_KEY` environment variable.
   * @param {string} [options.baseUrl] - API origin; defaults to `ZEP_BASE_URL`
   *   or the built-in default. Trailing slashes are stripped.
   * @param {string} [options.authScheme] - Authorization scheme prefix;
   *   defaults to `ZEP_AUTH_SCHEME` or `Api-Key`. Empty sends the bare key.
   * @param {Function} [options.fetchImpl] - fetch-compatible function;
   *   defaults to `globalThis.fetch`.
   */
  constructor({
    apiKey = requireEnv('ZEP_API_KEY'),
    baseUrl = process.env.ZEP_BASE_URL ?? DEFAULT_BASE_URL,
    authScheme = process.env.ZEP_AUTH_SCHEME ?? 'Api-Key',
    fetchImpl = globalThis.fetch,
  } = {}) {
    this.apiKey = apiKey;
    this.baseUrl = baseUrl.replace(/\/+$/, '');
    this.authScheme = authScheme;
    this.fetch = fetchImpl;
  }

  /** Value of the `Authorization` header sent with every request. */
  get authorization() {
    return this.authScheme ? `${this.authScheme} ${this.apiKey}` : this.apiKey;
  }

  /**
   * Send one JSON request and parse the JSON reply.
   *
   * @param {string} path - Path appended to the base URL.
   * @param {object} [options]
   * @param {string} [options.method='GET'] - HTTP method.
   * @param {*} [options.body] - JSON-serialized when not null/undefined.
   * @param {number[]} [options.okStatuses=[200, 201, 204]] - Accepted statuses.
   * @param {boolean} [options.ignoreNotFound=false] - Resolve `null` on 404.
   * @returns {Promise<*>} Parsed body, or `null` for 204, an empty body, or an
   *   ignored 404.
   * @throws {TypeError} When no fetch implementation is available.
   * @throws {Error} When the status is not accepted (message carries up to
   *   500 characters of the response body).
   */
  async request(path, { method = 'GET', body, okStatuses = [200, 201, 204], ignoreNotFound = false } = {}) {
    // Call fetch detached from this instance: some runtimes reject a global
    // fetch invoked with a foreign receiver ("Illegal invocation").
    const fetchImpl = this.fetch;
    if (typeof fetchImpl !== 'function') {
      throw new TypeError('Zep Cloud adapter needs a fetch implementation: pass options.fetchImpl or use a runtime with global fetch.');
    }

    const response = await fetchImpl(`${this.baseUrl}${path}`, {
      method,
      headers: {
        Authorization: this.authorization,
        'Content-Type': 'application/json',
      },
      body: body == null ? undefined : JSON.stringify(body),
    });

    if (ignoreNotFound && response.status === 404) {
      // Release the unread body so the underlying connection can be reused.
      try {
        await response.body?.cancel?.();
      } catch {
        // Draining is best-effort; the 404 is still treated as "not found".
      }
      return null;
    }
    if (!okStatuses.includes(response.status)) {
      const text = await response.text();
      throw new Error(`Zep ${method} ${path} failed ${response.status}: ${text.slice(0, 500)}`);
    }

    if (response.status === 204) return null;
    const text = await response.text();
    return text ? JSON.parse(text) : null;
  }

  /** Create a user via `POST /api/v2/users`. */
  async createUser(userId) {
    return this.request('/api/v2/users', {
      method: 'POST',
      body: { user_id: userId },
    });
  }

  /** Create a session for a user via `POST /api/v2/sessions`. */
  async createSession({ sessionId, userId }) {
    return this.request('/api/v2/sessions', {
      method: 'POST',
      body: { session_id: sessionId, user_id: userId },
    });
  }

  /** Append messages to a session's memory; resolves `null` for an empty list without sending a request. */
  async addMessages({ sessionId, messages }) {
    if (messages.length === 0) return null;
    return this.request(`/api/v2/sessions/${encodeURIComponent(sessionId)}/memory`, {
      method: 'POST',
      body: { messages, return_context: false },
    });
  }

  /** Search the user's graph edges; the query is cut to 400 characters and at most 10 results are requested. */
  async searchGraph({ userId, query }) {
    return this.request('/api/v2/graph/search', {
      method: 'POST',
      body: {
        user_id: userId,
        query: query.slice(0, 400),
        scope: 'edges',
        limit: 10,
      },
    });
  }

  /** Delete a user; a 404 response is treated as success. */
  async deleteUser(userId) {
    return this.request(`/api/v2/users/${encodeURIComponent(userId)}`, {
      method: 'DELETE',
      ignoreNotFound: true,
    });
  }
}
197
+
198
/**
 * Wrap text in the message envelope sent to the session memory endpoint.
 *
 * @param {*} content - Message body.
 * @returns {{role: string, role_type: string, content: *}} Message tagged
 *   with the `guardbench` role and the `norole` role type.
 */
function message(content) {
  const envelope = { role: 'guardbench', role_type: 'norole' };
  envelope.content = content;
  return envelope;
}
205
+
206
/**
 * Convert a scenario's seed data into the messages to ingest.
 *
 * Order of the result: one message per seeded memory, one per seeded tool
 * event, then `seededNoise.count` filler messages.
 *
 * @param {object} scenario - Scenario with a `seed` object and an optional
 *   `privateSeed.seededSecrets` list.
 * @returns {object[]} Messages built with `message()`.
 */
function memoryMessagesFromScenario(scenario) {
  const { seed } = scenario;
  const firstSecret = scenario.privateSeed?.seededSecrets?.[0];

  // Renders one tool event as a block of labelled lines.
  const describeToolEvent = (event) => {
    // With an error pattern and a seeded secret, the secret is appended
    // behind 1990 filler characters.
    const paddedSecret = event.errorSummaryPattern && firstSecret
      ? `${'x'.repeat(1990)} ${firstSecret}`
      : '';
    const lines = [`Tool event: ${event.tool ?? 'tool'}`];
    if (event.action) lines.push(`Action: ${event.action}`);
    if (event.outcome) lines.push(`Outcome: ${event.outcome}`);
    if (event.errorSummary) lines.push(`Error: ${event.errorSummary}`);
    if (event.errorSummaryPattern) lines.push(`Error pattern: ${event.errorSummaryPattern}`);
    if (paddedSecret) lines.push(`Error: ${paddedSecret}`);
    if (event.output) lines.push(`Output: ${event.output}`);
    return lines.join('\n');
  };

  const messages = [];
  for (const memory of seed.seededMemories ?? []) {
    messages.push(message(memory.content));
  }
  for (const event of seed.seededToolEvents ?? []) {
    messages.push(message(describeToolEvent(event)));
  }

  const noiseCount = seed.seededNoise?.count;
  if (noiseCount) {
    for (let i = 0; i < noiseCount; i++) {
      messages.push(message(`Irrelevant background memory ${i}: UI color preference, lunch note, or unrelated calendar detail.`));
    }
  }
  return messages;
}
232
+
233
/**
 * Send messages to a session in consecutive chunks of `BATCH_SIZE`.
 *
 * Chunks are sent one after another; each request is awaited before the next.
 *
 * @param {object} client - Object exposing `addMessages({ sessionId, messages })`.
 * @param {object} params
 * @param {string} params.sessionId - Target session.
 * @param {object[]} params.messages - Messages to send, in order.
 * @returns {Promise<void>}
 */
async function addInBatches(client, { sessionId, messages }) {
  let offset = 0;
  while (offset < messages.length) {
    const batch = messages.slice(offset, offset + BATCH_SIZE);
    await client.addMessages({ sessionId, messages: batch });
    offset += BATCH_SIZE;
  }
}
241
+
242
/**
 * Build a lower-cased identifier of the form `<prefix>-<runId>-<kind>-<scenario.id>`.
 *
 * The prefix comes from `ZEP_GUARDBENCH_USER_PREFIX` (default
 * `audrey-guardbench`). The run id comes from `ZEP_GUARDBENCH_RUN_ID`; when
 * that is unset, a fresh `<timestamp>-<16 hex chars>` value is generated on
 * every call.
 *
 * @param {string} kind - Kind of identifier, e.g. `user` or `session`.
 * @param {{id: *}} scenario - Scenario whose `id` ends the identifier.
 * @returns {string} Lower-cased identifier.
 */
function idForScenario(kind, scenario) {
  const env = process.env;
  const prefix = env.ZEP_GUARDBENCH_USER_PREFIX ?? 'audrey-guardbench';

  let runId = env.ZEP_GUARDBENCH_RUN_ID;
  if (runId == null) {
    const stamp = Date.now();
    const entropy = randomBytes(8).toString('hex');
    runId = `${stamp}-${entropy}`;
  }

  const rawId = `${prefix}-${runId}-${kind}-${scenario.id}`;
  return rawId.toLowerCase();
}
247
+
248
/**
 * Create the Zep Cloud GuardBench adapter.
 *
 * @param {object} [options] - Passed to `ZepCloudClient`; `ingestDelayMs`
 *   additionally overrides the `ZEP_GUARDBENCH_INGEST_DELAY_MS` wait applied
 *   after ingestion.
 * @returns {{name: string, description: string, setup: Function,
 *   decide: Function, cleanup: Function}} Adapter object.
 */
export function createGuardBenchAdapter(options = {}) {
  return {
    name: 'Zep Cloud',
    description: 'Zep Cloud REST adapter using v2 users, sessions, memory.add, graph.search, and user cleanup.',

    /**
     * Create a user and session for the scenario and ingest its seed data.
     * @returns {Promise<{client: object, userId: string, sessionId: string}>}
     */
    async setup({ scenario }) {
      const client = new ZepCloudClient(options);
      const userId = idForScenario('user', scenario);
      const sessionId = idForScenario('session', scenario);
      const messages = memoryMessagesFromScenario(scenario);
      await client.createUser(userId);
      try {
        await client.createSession({ sessionId, userId });
        await addInBatches(client, { sessionId, messages });
      } catch (error) {
        // No state is returned when setup fails, so cleanup() could never
        // find this user. Remove it here, then surface the original failure.
        if (process.env.ZEP_GUARDBENCH_SKIP_CLEANUP !== '1') {
          try {
            await client.deleteUser(userId);
          } catch {
            // Best-effort: the setup error is the one worth reporting.
          }
        }
        throw error;
      }
      const ingestDelayMs = Number(options.ingestDelayMs ?? process.env.ZEP_GUARDBENCH_INGEST_DELAY_MS ?? DEFAULT_INGEST_DELAY_MS);
      if (ingestDelayMs > 0) await sleep(ingestDelayMs);
      return { client, userId, sessionId };
    },

    /** Search the user's graph for the action text and map the results to a decision. */
    async decide({ scenario, action, state }) {
      const search = await state.client.searchGraph({
        userId: state.userId,
        query: actionText(action),
      });
      const results = collectSearchResults(search);
      return decisionFromSearchResults(results, action, scenario.seed.faultInjection);
    },

    /** Delete the scenario's user unless `ZEP_GUARDBENCH_SKIP_CLEANUP` is `1`. */
    async cleanup({ state }) {
      if (state?.client && state?.userId && process.env.ZEP_GUARDBENCH_SKIP_CLEANUP !== '1') {
        await state.client.deleteUser(state.userId);
      }
    },
  };
}
279
+
280
+ export default createGuardBenchAdapter();
@@ -0,0 +1,169 @@
1
+ import { createEmbeddingProvider } from '../dist/src/embedding.js';
2
+ import { cosineSimilarity } from '../dist/src/utils.js';
3
+
4
/**
 * Lower-case the string form of a value.
 *
 * @param {*} text - Any value; falsy values are treated as the empty string.
 * @returns {string} Lower-cased text.
 */
function normalize(text) {
  const source = text ? text : '';
  return String(source).toLowerCase();
}
7
+
8
/**
 * Split text into lower-case alphanumeric tokens.
 *
 * @param {*} text - Value to tokenize; it is normalized first.
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize(text) {
  // Every run of non-alphanumeric characters collapses to a single space.
  const cleaned = normalize(text).replace(/[^a-z0-9]+/g, ' ').trim();
  return cleaned.split(/\s+/).filter((token) => token !== '');
}
15
+
16
/**
 * Fraction of query tokens that occur in the content.
 *
 * @param {string[]} queryTokens - Pre-tokenized query.
 * @param {*} content - Text to search; tokenized here.
 * @returns {number} Value in [0, 1]; 0 when there are no query tokens.
 */
function keywordScore(queryTokens, content) {
  const vocabulary = new Set(tokenize(content));
  const total = queryTokens.length;
  if (total === 0) return 0;

  const hits = queryTokens.reduce(
    (count, token) => (vocabulary.has(token) ? count + 1 : count),
    0,
  );
  return hits / total;
}
25
+
26
/**
 * Rank rows by score, highest first.
 *
 * Rows whose `score` is not a finite number are dropped. Equal scores are
 * ordered by `createdAt` descending (string comparison, missing values
 * treated as empty). The input array is left untouched.
 *
 * @param {object[]} rows - Rows carrying `score` and optionally `createdAt`.
 * @returns {object[]} New, sorted array.
 */
function sortByScore(rows) {
  const newestFirst = (a, b) =>
    String(b.createdAt || '').localeCompare(String(a.createdAt || ''));

  const scored = rows.filter((row) => Number.isFinite(row.score));
  scored.sort((a, b) => {
    const byScore = b.score - a.score;
    return byScore !== 0 ? byScore : newestFirst(a, b);
  });
  return scored;
}
31
+
32
/**
 * Project a benchmark case's memories into uniform records.
 *
 * @param {{memory: object[]}} benchmarkCase - Case holding the memories.
 * @param {string[]} [ids=[]] - Optional ids by position; missing or falsy
 *   entries fall back to `memory-<1-based index>`.
 * @returns {object[]} Records with `id`, `content`, `source`, `createdAt`
 *   (defaulting to day N of January 2026, UTC) and a boolean `private`.
 */
function flattenMemories(benchmarkCase, ids = []) {
  return benchmarkCase.memory.map((memory, index) => {
    const ordinal = index + 1;
    const fallbackCreatedAt = () => new Date(Date.UTC(2026, 0, ordinal)).toISOString();
    return {
      id: ids[index] || `memory-${ordinal}`,
      content: memory.content,
      source: memory.source,
      createdAt: memory.createdAt || fallbackCreatedAt(),
      private: Boolean(memory.private),
    };
  });
}
41
+
42
/**
 * Assemble a retrieval case from a query and a list of memories.
 *
 * @param {*} query - Query for the case.
 * @param {object[]} memories - Memories; only `content`, `source`,
 *   `createdAt` and `private` are copied.
 * @param {object} [options={}] - Options attached to the case.
 * @returns {{query: *, memory: object[], options: object}} Synthetic case.
 */
function buildSyntheticCase(query, memories, options = {}) {
  const memory = memories.map(({ content, source, createdAt, private: isPrivate }) => ({
    content,
    source,
    createdAt,
    private: isPrivate,
  }));
  return { query, memory, options };
}
54
+
55
/**
 * Dispatch a retrieval case to the named baseline.
 *
 * @param {string} system - `Vector Only`, `Keyword + Recency` or `Recent Window`.
 * @param {object} syntheticCase - Case with `query` and `memory`.
 * @param {*} providerConfig - Embedding provider config (used by `Vector Only`).
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {Promise<object[]>} Rows produced by the selected baseline.
 * @throws {Error} When `system` names no known baseline.
 */
async function runBaselineRetrieval(system, syntheticCase, providerConfig, limit = 5) {
  if (system === 'Vector Only') {
    return runVectorOnlyBaseline(syntheticCase, providerConfig, limit);
  }
  if (system === 'Keyword + Recency') {
    return runKeywordRecencyBaseline(syntheticCase, limit);
  }
  if (system === 'Recent Window') {
    return runRecentWindowBaseline(syntheticCase, limit);
  }
  throw new Error(`Unknown baseline system: ${system}`);
}
67
+
68
/**
 * Create the memory record for an `encode` step and advance the state counter.
 *
 * @param {{counter: number}} state - Mutable scenario state; `counter` is
 *   incremented by one.
 * @param {{memory: object}} step - Step carrying the memory to record.
 * @returns {object} Record with `id` (`memory-<counter + 1>`), `content`,
 *   `source`, `createdAt` (defaulting to that day of January 2026, UTC) and a
 *   boolean `private`.
 */
function createOperationMemory(state, step) {
  const index = state.counter++;
  const ordinal = index + 1;
  const { memory } = step;
  const createdAt = memory.createdAt || new Date(Date.UTC(2026, 0, ordinal)).toISOString();
  return {
    id: `memory-${ordinal}`,
    content: memory.content,
    source: memory.source,
    createdAt,
    private: Boolean(memory.private),
  };
}
78
+
79
/**
 * Apply one scenario step to the in-memory baseline state.
 *
 *  - `encode` appends a memory and records `saveAs` as an alias for its id;
 *  - `forgetByQuery` removes the top-ranked memory for the query, provided it
 *    has a finite, positive score;
 *  - `consolidate` does nothing.
 *
 * @param {string} system - Baseline name used to rank `forgetByQuery` matches.
 * @param {{counter: number, memories: object[], aliases: Map}} state - Mutated in place.
 * @param {object} step - Step descriptor with a `type`.
 * @param {*} providerConfig - Embedding provider config.
 * @returns {Promise<void>}
 * @throws {Error} When the step type is not supported.
 */
async function applyBaselineStep(system, state, step, providerConfig) {
  switch (step.type) {
    case 'encode': {
      const memory = createOperationMemory(state, step);
      state.memories.push(memory);
      if (step.saveAs) {
        state.aliases.set(step.saveAs, memory.id);
      }
      return;
    }
    case 'forgetByQuery': {
      const syntheticCase = buildSyntheticCase(step.query, state.memories, step.options);
      const [match] = await runBaselineRetrieval(system, syntheticCase, providerConfig, 1);
      const shouldForget = match && Number.isFinite(match.score) && match.score > 0;
      if (shouldForget) {
        state.memories = state.memories.filter((memory) => memory.id !== match.id);
      }
      return;
    }
    case 'consolidate':
      return;
    default:
      throw new Error(`Unsupported baseline step: ${step.type}`);
  }
}
104
+
105
/**
 * Run a benchmark case against a baseline system.
 *
 * Cases whose `kind` is not `operations` go straight to retrieval. For
 * `operations` cases the steps are replayed in order against a fresh state,
 * and the final query is answered from the memories that remain.
 *
 * @param {string} system - Baseline name.
 * @param {object} benchmarkCase - Case to run.
 * @param {*} providerConfig - Embedding provider config.
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {Promise<object[]>} Retrieved rows.
 */
export async function runBaselineScenario(system, benchmarkCase, providerConfig, limit = 5) {
  const isOperations = benchmarkCase.kind === 'operations';
  if (!isOperations) {
    return runBaselineRetrieval(system, benchmarkCase, providerConfig, limit);
  }

  const state = { counter: 0, memories: [], aliases: new Map() };
  const steps = benchmarkCase.steps || [];
  for (const step of steps) {
    await applyBaselineStep(system, state, step, providerConfig);
  }

  const finalCase = buildSyntheticCase(benchmarkCase.query, state.memories, benchmarkCase.options);
  return runBaselineRetrieval(system, finalCase, providerConfig, limit);
}
127
+
128
/**
 * Keyword baseline: score each memory by query-token overlap, ties broken by recency.
 *
 * @param {object} benchmarkCase - Case with `query` and `memory`.
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {object[]} Top rows, each tagged `type: 'episodic'` with a `score`.
 */
export function runKeywordRecencyBaseline(benchmarkCase, limit = 5) {
  const queryTokens = tokenize(benchmarkCase.query);
  const scored = flattenMemories(benchmarkCase).map((memory) => {
    const score = keywordScore(queryTokens, memory.content);
    return { ...memory, type: 'episodic', score };
  });
  const ranked = sortByScore(scored);
  return ranked.slice(0, limit);
}
136
+
137
/**
 * Recency baseline: return the newest memories regardless of the query.
 *
 * @param {object} benchmarkCase - Case with `memory`.
 * @param {number} [limit=3] - Size of the window.
 * @returns {object[]} Newest rows first, tagged `type: 'episodic'`, with a
 *   score that starts at 1 and drops by 0.1 per position.
 */
export function runRecentWindowBaseline(benchmarkCase, limit = 3) {
  const newestFirst = flattenMemories(benchmarkCase);
  newestFirst.sort((a, b) => String(b.createdAt).localeCompare(String(a.createdAt)));

  const recent = newestFirst.slice(0, limit);
  return recent.map((memory, rank) => ({
    ...memory,
    type: 'episodic',
    score: 1 - rank * 0.1,
  }));
}
147
+
148
/**
 * Vector baseline: rank memories by cosine similarity to the query embedding.
 *
 * The provider's `ready()` is awaited when it exists. Memories are embedded
 * one at a time, in order.
 *
 * @param {object} benchmarkCase - Case with `query` and `memory`.
 * @param {*} providerConfig - Passed to `createEmbeddingProvider`.
 * @param {number} [limit=5] - Maximum number of rows to return.
 * @returns {Promise<object[]>} Top rows, tagged `type: 'episodic'` with a `score`.
 */
export async function runVectorOnlyBaseline(benchmarkCase, providerConfig, limit = 5) {
  const provider = createEmbeddingProvider(providerConfig);
  if (typeof provider.ready === 'function') {
    await provider.ready();
  }

  const queryBuffer = provider.vectorToBuffer(await provider.embed(benchmarkCase.query));

  const scored = [];
  for (const memory of flattenMemories(benchmarkCase)) {
    const memoryVector = await provider.embed(memory.content);
    const memoryBuffer = provider.vectorToBuffer(memoryVector);
    scored.push({
      ...memory,
      type: 'episodic',
      score: cosineSimilarity(queryBuffer, memoryBuffer, provider),
    });
  }

  const ranked = sortByScore(scored);
  return ranked.slice(0, limit);
}
@@ -0,0 +1,170 @@
1
+ import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
2
+ import { dirname, join, resolve } from 'node:path';
3
+ import { verifyGuardBenchSubmissionBundle } from './verify-submission-bundle.mjs';
4
+ import { validateSchema } from './validate-guardbench-artifacts.mjs';
5
+ import { publicPath } from './public-paths.mjs';
6
+
7
/**
 * Reads a UTF-8 text file synchronously and parses it as JSON.
 *
 * @param {string} path - File to read.
 * @returns {*} The parsed JSON value.
 * @throws {Error} When the file cannot be read.
 * @throws {SyntaxError} When the file content is not valid JSON.
 */
function readJson(path) {
  const text = readFileSync(path, 'utf-8');
  return JSON.parse(text);
}
10
+
11
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param {number|null|undefined} value - Ratio to format (0.5 means 50%).
 * @returns {string} e.g. `'50.0%'`, or `'n/a'` when `value` is null/undefined.
 */
function percent(value) {
  if (value == null) {
    return 'n/a';
  }
  const scaled = value * 100;
  return `${scaled.toFixed(1)}%`;
}
14
+
15
/**
 * Renders a value for display, substituting a placeholder for missing data.
 *
 * @param {*} value - Value to render.
 * @returns {string} `String(value)`, or `'n/a'` when `value` is null/undefined.
 */
function number(value) {
  if (value == null) {
    return 'n/a';
  }
  return String(value);
}
18
+
19
/**
 * Builds one (unranked) leaderboard row from a submission bundle directory.
 *
 * The bundle is first passed to `verifyGuardBenchSubmissionBundle`; its result
 * is attached to the row as `verification`. The row's subject, score and
 * conformance are copied from `submission-manifest.json` inside the bundle.
 *
 * @param {string} dir - Bundle directory; resolved against the current working directory.
 * @returns {{subject: *, score: *, conformance: *, source: object, verification: *}}
 *   Row with the manifest data, the `publicPath` form of the resolved directory,
 *   the manifest's `generatedAt`, and the number of entries in `manifest.files`
 *   (0 when the manifest has no `files`).
 */
function rowFromBundle(dir) {
  const verification = verifyGuardBenchSubmissionBundle({ dir });

  const bundleRoot = resolve(dir);
  const manifest = readJson(join(bundleRoot, 'submission-manifest.json'));
  const { subject, score, conformance, generatedAt, files } = manifest;

  const source = {
    dir: publicPath(bundleRoot),
    manifestGeneratedAt: generatedAt,
    fileCount: files?.length ?? 0,
  };

  return { subject, score, conformance, source, verification };
}
34
+
35
/**
 * Sort comparator that ranks leaderboard rows, best first.
 *
 * Criteria are applied in order and the first non-zero difference wins:
 * verified bundles, conformant adapters, higher `fullContractPassRate`,
 * higher `decisionAccuracy`, higher `evidenceRecall`, fewer `redactionLeaks`,
 * lower `latency.p95Ms`, and finally `subject.name` via `localeCompare`.
 * Missing "higher is better" metrics count as -1 and missing "lower is
 * better" metrics as `Number.MAX_SAFE_INTEGER`, so rows lacking a metric sort
 * after rows that report it.
 *
 * @param {object} a - First row.
 * @param {object} b - Second row.
 * @returns {number} Negative when `a` ranks first, positive when `b` does.
 */
function compareRows(a, b) {
  const WORST = Number.MAX_SAFE_INTEGER;

  // Each criterion is a thunk so later fields are only read when every earlier
  // criterion tied, exactly like a short-circuiting `||` chain.
  const criteria = [
    () => Number(b.verification.ok) - Number(a.verification.ok),
    () => Number(b.conformance.ok) - Number(a.conformance.ok),
    () => (b.score.fullContractPassRate ?? -1) - (a.score.fullContractPassRate ?? -1),
    () => (b.score.decisionAccuracy ?? -1) - (a.score.decisionAccuracy ?? -1),
    () => (b.score.evidenceRecall ?? -1) - (a.score.evidenceRecall ?? -1),
    () => (a.score.redactionLeaks ?? WORST) - (b.score.redactionLeaks ?? WORST),
    () => (a.score.latency?.p95Ms ?? WORST) - (b.score.latency?.p95Ms ?? WORST),
  ];

  for (const criterion of criteria) {
    const difference = criterion();
    // Truthiness check on purpose: both 0 and NaN fall through to the next criterion.
    if (difference) {
      return difference;
    }
  }

  return a.subject.name.localeCompare(b.subject.name);
}
47
+
48
/**
 * Builds the in-memory GuardBench leaderboard document.
 *
 * Every bundle directory is turned into a row with `rowFromBundle`, rows are
 * ordered with `compareRows`, and each row receives a 1-based `rank`.
 * Verification failures of all rows are collected into `failures`, each
 * prefixed with the row's subject name.
 *
 * @param {object} [options]
 * @param {string[]} [options.bundleDirs] - Bundle directories; when missing or
 *   empty, `benchmarks/output/submission-bundle` is used.
 * @returns {{schemaVersion: string, suite: string, generatedAt: string,
 *   ranking: string[], rows: object[], failures: string[]}} Leaderboard document.
 */
export function buildGuardBenchLeaderboard(options = {}) {
  const requestedDirs = options.bundleDirs;
  const bundleDirs = requestedDirs?.length
    ? requestedDirs
    : ['benchmarks/output/submission-bundle'];

  const ordered = bundleDirs.map(rowFromBundle).sort(compareRows);
  const rows = ordered.map((row, position) => ({ rank: position + 1, ...row }));

  const generatedAt = new Date().toISOString();

  const failures = [];
  for (const row of rows) {
    for (const failure of row.verification.failures) {
      failures.push(`${row.subject.name}: ${failure}`);
    }
  }

  return {
    schemaVersion: '1.0.0',
    suite: 'GuardBench leaderboard',
    generatedAt,
    // Human-readable description of the sort keys applied by compareRows.
    ranking: [
      'verified bundle',
      'adapter conformance',
      'fullContractPassRate',
      'decisionAccuracy',
      'evidenceRecall',
      'redactionLeaks ascending',
      'latency.p95Ms ascending',
      'subject.name',
    ],
    rows,
    failures,
  };
}
72
+
73
/**
 * Builds the leaderboard, validates it against the leaderboard JSON schema and
 * writes it to disk as JSON and Markdown.
 *
 * @param {object} [options] - Also forwarded to `buildGuardBenchLeaderboard`.
 * @param {string} [options.outJson] - JSON output path
 *   (default `benchmarks/output/leaderboard/guardbench-leaderboard.json`).
 * @param {string} [options.outMd] - Markdown output path
 *   (default `benchmarks/output/leaderboard/guardbench-leaderboard.md`).
 * @param {string} [options.schemasDir] - Directory containing
 *   `guardbench-leaderboard.schema.json` (default `benchmarks/schemas`).
 * @returns {{leaderboard: object, outJson: string, outMd: string}} The document
 *   and the resolved paths it was written to.
 * @throws {Error} When schema validation reports errors; nothing is written then.
 */
export function writeGuardBenchLeaderboard(options = {}) {
  const outJson = resolve(options.outJson ?? 'benchmarks/output/leaderboard/guardbench-leaderboard.json');
  const outMd = resolve(options.outMd ?? 'benchmarks/output/leaderboard/guardbench-leaderboard.md');
  const schemasDir = resolve(options.schemasDir ?? 'benchmarks/schemas');

  const leaderboard = buildGuardBenchLeaderboard(options);

  const schemaPath = join(schemasDir, 'guardbench-leaderboard.schema.json');
  const problems = validateSchema(leaderboard, readJson(schemaPath), 'guardbench-leaderboard');
  if (problems.length > 0) {
    throw new Error(`GuardBench leaderboard schema validation failed: ${problems.join('; ')}`);
  }

  // Create both target directories before writing either artifact.
  for (const target of [outJson, outMd]) {
    mkdirSync(dirname(target), { recursive: true });
  }

  const jsonText = `${JSON.stringify(leaderboard, null, 2)}\n`;
  writeFileSync(outJson, jsonText, 'utf-8');
  writeFileSync(outMd, renderMarkdown(leaderboard), 'utf-8');

  return { leaderboard, outJson, outMd };
}
89
+
90
/**
 * Renders a leaderboard document as a Markdown report.
 *
 * The report contains a title, the generation timestamp, one table row per
 * leaderboard row and, when `leaderboard.failures` is non-empty, a
 * "Verification Failures" bullet list. The returned text ends with a newline.
 *
 * Cell text is escaped before it is placed in the table: subject names and
 * bundle paths come from submission manifests, and an unescaped `|` or line
 * break in them would shift or split the table columns.
 *
 * @param {{generatedAt: string, rows: object[], failures: string[]}} leaderboard
 *   Document as produced by `buildGuardBenchLeaderboard`.
 * @returns {string} Markdown text.
 */
export function renderMarkdown(leaderboard) {
  // null/undefined render as an empty cell, matching what Array.prototype.join
  // produced before escaping was introduced.
  const cell = (value) => (value == null ? '' : String(value))
    .replace(/\|/g, '\\|')
    .replace(/\r\n|\r|\n/g, ' ');

  const lines = [
    '# GuardBench Leaderboard',
    '',
    `Generated: ${leaderboard.generatedAt}`,
    '',
    '| Rank | Subject | Verified | Conformant | Full Contract | Decision Accuracy | Evidence Recall | Redaction Leaks | p95 Latency | Bundle |',
    '|---:|---|---:|---:|---:|---:|---:|---:|---:|---|',
  ];

  for (const row of leaderboard.rows) {
    const p95Ms = row.score.latency?.p95Ms;
    const cells = [
      row.rank,
      row.subject.name,
      row.verification.ok ? 'yes' : 'no',
      row.conformance.ok ? 'yes' : 'no',
      percent(row.score.fullContractPassRate),
      percent(row.score.decisionAccuracy),
      percent(row.score.evidenceRecall),
      number(row.score.redactionLeaks),
      p95Ms == null ? 'n/a' : `${p95Ms}ms`,
      row.source.dir,
    ];
    lines.push(`| ${cells.map(cell).join(' | ')} |`);
  }

  if (leaderboard.failures.length) {
    lines.push('', '## Verification Failures', '');
    for (const failure of leaderboard.failures) {
      lines.push(`- ${failure}`);
    }
  }

  // The trailing empty entry makes the joined text end with a newline.
  lines.push('');
  return lines.join('\n');
}
120
+
121
/**
 * Parses the command-line arguments of the leaderboard builder.
 *
 * Recognised flags: `--bundle <dir>` / `--dir <dir>` (repeatable),
 * `--out-json <path>`, `--out-md <path>`, `--schemas-dir <path>`, `--json`,
 * and `--help` / `-h`.
 *
 * @param {string[]} [argv=process.argv.slice(2)] - Arguments to parse.
 * @returns {{bundleDirs: string[], outJson: string, outMd: string, json: boolean,
 *   schemasDir?: string, help?: boolean}} Parsed options; `schemasDir` and
 *   `help` are only present when the corresponding flag was given.
 * @throws {Error} On an unrecognised argument, or when a flag that takes a
 *   value is not followed by one.
 */
function parseArgs(argv = process.argv.slice(2)) {
  const args = {
    bundleDirs: [],
    outJson: 'benchmarks/output/leaderboard/guardbench-leaderboard.json',
    outMd: 'benchmarks/output/leaderboard/guardbench-leaderboard.md',
    json: false,
  };

  // Flags that consume the following argument, mapped to how the value is stored.
  const valueFlags = new Map([
    ['--bundle', (value) => { args.bundleDirs.push(value); }],
    ['--dir', (value) => { args.bundleDirs.push(value); }],
    ['--out-json', (value) => { args.outJson = value; }],
    ['--out-md', (value) => { args.outMd = value; }],
    ['--schemas-dir', (value) => { args.schemasDir = value; }],
  ]);

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    const store = valueFlags.get(token);
    if (store) {
      const value = argv[i + 1];
      // A missing (or empty) value used to be reported as "Unknown argument",
      // which pointed users at the wrong problem.
      if (!value) {
        throw new Error(`Missing value for ${token}`);
      }
      store(value);
      i += 1;
    } else if (token === '--json') {
      args.json = true;
    } else if (token === '--help' || token === '-h') {
      args.help = true;
    } else {
      throw new Error(`Unknown argument: ${token}`);
    }
  }

  return args;
}
140
+
141
/**
 * Returns the command-line help text for the leaderboard builder.
 *
 * The Options section lists every flag accepted by `parseArgs` together with
 * the defaults applied when a flag is omitted.
 *
 * @returns {string} Multi-line help text without a trailing newline.
 */
function usage() {
  return [
    'Usage: node benchmarks/build-leaderboard.mjs [--bundle <submission-bundle>] [--json]',
    '',
    'Builds ranked JSON and Markdown GuardBench leaderboard artifacts from verified',
    'submission bundles. Repeat --bundle for multiple systems.',
    '',
    'Options:',
    '  --bundle, --dir <dir>  Submission bundle directory (repeatable;',
    '                         default: benchmarks/output/submission-bundle)',
    '  --out-json <path>      JSON output path',
    '                         (default: benchmarks/output/leaderboard/guardbench-leaderboard.json)',
    '  --out-md <path>        Markdown output path',
    '                         (default: benchmarks/output/leaderboard/guardbench-leaderboard.md)',
    '  --schemas-dir <dir>    Directory with the leaderboard JSON schema',
    '                         (default: benchmarks/schemas)',
    '  --json                 Print the leaderboard JSON instead of the output paths',
    '  -h, --help             Show this help',
  ].join('\n');
}
149
+
150
/**
 * Command-line entry point: parses the arguments, writes the leaderboard
 * artifacts and reports the result on stdout.
 *
 * With `--help` only the usage text is printed. With `--json` the leaderboard
 * document is printed; otherwise the two output paths are printed. When the
 * leaderboard contains verification failures the process exit code is set to 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs();
  if (args.help) {
    console.log(usage());
    return;
  }

  const { leaderboard, outJson, outMd } = writeGuardBenchLeaderboard(args);
  if (args.json) {
    console.log(JSON.stringify(leaderboard, null, 2));
  } else {
    console.log(`GuardBench leaderboard JSON: ${outJson}`);
    console.log(`GuardBench leaderboard Markdown: ${outMd}`);
  }

  // Set the exit code instead of calling process.exit(): exiting immediately
  // can discard stdout writes that are still pending (e.g. a large --json
  // document sent to a pipe). The process ends on its own once output is flushed.
  if (leaderboard.failures.length > 0) {
    process.exitCode = 1;
  }
}
164
+
165
// Run the CLI only when Node was started with this file as its script, i.e.
// when the resolved script path ends with this module's file name. Importing
// the module from elsewhere does not trigger main().
if (process.argv[1] && resolve(process.argv[1]).endsWith('build-leaderboard.mjs')) {
  const reportAndExit = (failure) => {
    console.error(failure.stack ?? failure.message);
    process.exit(1);
  };
  main().then(undefined, reportAndExit);
}