audrey 0.23.1 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/CHANGELOG.md +101 -15
  2. package/LICENSE +21 -21
  3. package/README.md +232 -6
  4. package/SECURITY.md +2 -1
  5. package/benchmarks/adapter-kit.mjs +20 -0
  6. package/benchmarks/adapter-self-test.mjs +166 -0
  7. package/benchmarks/adapters/example-allow.mjs +28 -0
  8. package/benchmarks/adapters/mem0-platform.mjs +267 -0
  9. package/benchmarks/adapters/registry.json +51 -0
  10. package/benchmarks/adapters/zep-cloud.mjs +280 -0
  11. package/benchmarks/baselines.js +169 -0
  12. package/benchmarks/build-leaderboard.mjs +170 -0
  13. package/benchmarks/cases.js +537 -0
  14. package/benchmarks/create-conformance-card.mjs +139 -0
  15. package/benchmarks/create-submission-bundle.mjs +176 -0
  16. package/benchmarks/dry-run-external-adapters.mjs +165 -0
  17. package/benchmarks/guardbench.js +1125 -0
  18. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  19. package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  20. package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  21. package/benchmarks/output/guardbench-conformance-card.json +63 -0
  22. package/benchmarks/output/guardbench-manifest.json +414 -0
  23. package/benchmarks/output/guardbench-raw.json +1271 -0
  24. package/benchmarks/output/guardbench-summary.json +2107 -0
  25. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  26. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  27. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
  28. package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
  29. package/benchmarks/output/submission-bundle/guardbench-raw.json +1271 -0
  30. package/benchmarks/output/submission-bundle/guardbench-summary.json +2107 -0
  31. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
  32. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
  33. package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
  34. package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
  35. package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
  36. package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
  37. package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
  38. package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
  39. package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
  40. package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +184 -0
  41. package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
  42. package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +249 -0
  43. package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  44. package/benchmarks/output/submission-bundle/validation-report.json +31 -0
  45. package/benchmarks/output/summary.json +2354 -0
  46. package/benchmarks/perf-snapshot.js +304 -0
  47. package/benchmarks/perf.bench.js +161 -0
  48. package/benchmarks/public-paths.mjs +78 -0
  49. package/benchmarks/reference-results.js +70 -0
  50. package/benchmarks/report.js +259 -0
  51. package/benchmarks/run-external-guardbench.mjs +281 -0
  52. package/benchmarks/run.js +682 -0
  53. package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  54. package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  55. package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  56. package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  57. package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  58. package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  59. package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  60. package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  61. package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  62. package/benchmarks/schemas/guardbench-raw.schema.json +184 -0
  63. package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  64. package/benchmarks/schemas/guardbench-summary.schema.json +249 -0
  65. package/benchmarks/snapshots/perf-0.22.2.json +123 -0
  66. package/benchmarks/snapshots/perf-0.23.0.json +123 -0
  67. package/benchmarks/validate-adapter-module.mjs +104 -0
  68. package/benchmarks/validate-adapter-registry.mjs +134 -0
  69. package/benchmarks/validate-adapter-self-test.mjs +96 -0
  70. package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
  71. package/benchmarks/verify-external-evidence.mjs +296 -0
  72. package/benchmarks/verify-publication-artifacts.mjs +286 -0
  73. package/benchmarks/verify-submission-bundle.mjs +167 -0
  74. package/dist/mcp-server/config.d.ts +1 -1
  75. package/dist/mcp-server/config.d.ts.map +1 -1
  76. package/dist/mcp-server/config.js +1 -1
  77. package/dist/mcp-server/config.js.map +1 -1
  78. package/dist/mcp-server/index.d.ts +65 -3
  79. package/dist/mcp-server/index.d.ts.map +1 -1
  80. package/dist/mcp-server/index.js +675 -157
  81. package/dist/mcp-server/index.js.map +1 -1
  82. package/dist/src/action-key.d.ts +9 -0
  83. package/dist/src/action-key.d.ts.map +1 -0
  84. package/dist/src/action-key.js +49 -0
  85. package/dist/src/action-key.js.map +1 -0
  86. package/dist/src/adaptive.js +5 -5
  87. package/dist/src/affect.js +8 -8
  88. package/dist/src/audrey.d.ts +13 -0
  89. package/dist/src/audrey.d.ts.map +1 -1
  90. package/dist/src/audrey.js +68 -3
  91. package/dist/src/audrey.js.map +1 -1
  92. package/dist/src/capsule.js +4 -4
  93. package/dist/src/causal.js +3 -3
  94. package/dist/src/consolidate.js +48 -48
  95. package/dist/src/controller.d.ts +78 -6
  96. package/dist/src/controller.d.ts.map +1 -1
  97. package/dist/src/controller.js +273 -53
  98. package/dist/src/controller.js.map +1 -1
  99. package/dist/src/db.js +172 -172
  100. package/dist/src/decay.js +8 -8
  101. package/dist/src/embedding.d.ts +2 -1
  102. package/dist/src/embedding.d.ts.map +1 -1
  103. package/dist/src/embedding.js +39 -29
  104. package/dist/src/embedding.js.map +1 -1
  105. package/dist/src/encode.js +6 -6
  106. package/dist/src/feedback.d.ts +6 -0
  107. package/dist/src/feedback.d.ts.map +1 -1
  108. package/dist/src/feedback.js +6 -0
  109. package/dist/src/feedback.js.map +1 -1
  110. package/dist/src/forget.js +12 -12
  111. package/dist/src/hybrid-recall.js +9 -9
  112. package/dist/src/impact.js +6 -6
  113. package/dist/src/import.d.ts +3 -3
  114. package/dist/src/import.js +41 -41
  115. package/dist/src/index.d.ts +5 -4
  116. package/dist/src/index.d.ts.map +1 -1
  117. package/dist/src/index.js +3 -3
  118. package/dist/src/index.js.map +1 -1
  119. package/dist/src/interference.js +14 -14
  120. package/dist/src/introspect.js +18 -18
  121. package/dist/src/preflight.d.ts.map +1 -1
  122. package/dist/src/preflight.js +41 -0
  123. package/dist/src/preflight.js.map +1 -1
  124. package/dist/src/promote.js +7 -7
  125. package/dist/src/prompts.js +118 -118
  126. package/dist/src/recall.js +30 -30
  127. package/dist/src/reflexes.d.ts +1 -0
  128. package/dist/src/reflexes.d.ts.map +1 -1
  129. package/dist/src/reflexes.js +3 -0
  130. package/dist/src/reflexes.js.map +1 -1
  131. package/dist/src/rollback.js +4 -4
  132. package/dist/src/routes.d.ts.map +1 -1
  133. package/dist/src/routes.js +71 -2
  134. package/dist/src/routes.js.map +1 -1
  135. package/dist/src/validate.js +25 -25
  136. package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  137. package/docs/MEMORY_BENCHMARKING.md +59 -0
  138. package/docs/PRODUCTION_BACKLOG.md +304 -0
  139. package/docs/paper/00-master.md +48 -0
  140. package/docs/paper/01-introduction.md +27 -0
  141. package/docs/paper/02-related-work.md +47 -0
  142. package/docs/paper/03-problem-definition.md +108 -0
  143. package/docs/paper/04-design.md +164 -0
  144. package/docs/paper/05-guardbench-spec.md +412 -0
  145. package/docs/paper/06-implementation.md +113 -0
  146. package/docs/paper/07-evaluation.md +168 -0
  147. package/docs/paper/08-discussion-limitations.md +61 -0
  148. package/docs/paper/09-conclusion.md +11 -0
  149. package/docs/paper/SUBMISSION_README.md +162 -0
  150. package/docs/paper/appendix-a-demo-transcript.md +114 -0
  151. package/docs/paper/arxiv-compile-report.schema.json +116 -0
  152. package/docs/paper/arxiv-source.schema.json +61 -0
  153. package/docs/paper/audrey-paper-v1.md +1106 -0
  154. package/docs/paper/browser-launch-plan.json +209 -0
  155. package/docs/paper/browser-launch-plan.schema.json +100 -0
  156. package/docs/paper/browser-launch-results.json +86 -0
  157. package/docs/paper/browser-launch-results.schema.json +66 -0
  158. package/docs/paper/claim-register.json +138 -0
  159. package/docs/paper/claim-register.schema.json +81 -0
  160. package/docs/paper/evidence-ledger.md +103 -0
  161. package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  162. package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  163. package/docs/paper/output/arxiv/main.tex +949 -0
  164. package/docs/paper/output/arxiv/references.bib +222 -0
  165. package/docs/paper/output/arxiv-compile-report.json +24 -0
  166. package/docs/paper/output/submission-bundle/LICENSE +21 -0
  167. package/docs/paper/output/submission-bundle/README.md +555 -0
  168. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  169. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  170. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  171. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
  172. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
  173. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1271 -0
  174. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +2107 -0
  175. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  176. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  177. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  178. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
  179. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
  180. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  181. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  182. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  183. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  184. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  185. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  186. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  187. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  188. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  189. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +184 -0
  190. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  191. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +249 -0
  192. package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  193. package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
  194. package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
  195. package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
  196. package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
  197. package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
  198. package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
  199. package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
  200. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
  201. package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
  202. package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
  203. package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
  204. package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
  205. package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
  206. package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
  207. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
  208. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
  209. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
  210. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
  211. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
  212. package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
  213. package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
  214. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
  215. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  216. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  217. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
  218. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
  219. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
  220. package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
  221. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
  222. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
  223. package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
  224. package/docs/paper/output/submission-bundle/package.json +212 -0
  225. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
  226. package/docs/paper/paper-submission-bundle.schema.json +70 -0
  227. package/docs/paper/publication-pack.json +81 -0
  228. package/docs/paper/publication-pack.schema.json +60 -0
  229. package/docs/paper/references.bib +222 -0
  230. package/package.json +87 -4
  231. package/scripts/audit-release-completion.mjs +362 -0
  232. package/scripts/create-arxiv-source.mjs +362 -0
  233. package/scripts/create-paper-submission-bundle.mjs +210 -0
  234. package/scripts/finalize-release.mjs +526 -0
  235. package/scripts/prepare-release-cut.mjs +269 -0
  236. package/scripts/publish-release-bundle.mjs +209 -0
  237. package/scripts/publish-release-github-api.mjs +429 -0
  238. package/scripts/run-vitest.mjs +34 -0
  239. package/scripts/smoke-cli.js +92 -0
  240. package/scripts/sync-paper-artifacts.mjs +109 -0
  241. package/scripts/verify-arxiv-compile.mjs +440 -0
  242. package/scripts/verify-arxiv-source.mjs +194 -0
  243. package/scripts/verify-browser-launch-plan.mjs +237 -0
  244. package/scripts/verify-browser-launch-results.mjs +285 -0
  245. package/scripts/verify-paper-artifacts.mjs +338 -0
  246. package/scripts/verify-paper-claims.mjs +226 -0
  247. package/scripts/verify-paper-submission-bundle.mjs +207 -0
  248. package/scripts/verify-publication-pack.mjs +196 -0
  249. package/scripts/verify-python-package.py +201 -0
  250. package/scripts/verify-release-readiness.mjs +785 -0
@@ -0,0 +1,537 @@
1
/**
 * Retrieval benchmark fixtures.
 *
 * Each case names a `family`, supplies the `memory` entries to seed, a `query`
 * to run, and expectations expressed through `expectAny`, `expectNone` and/or
 * `forbid`. Some cases also carry recall `options` or a `consolidate` step.
 *
 * NOTE(review): how the runner interprets these fields (for example
 * `supersedesIndex`, `private`, `createdAt`) is not visible in this module;
 * confirm against the benchmark runner before changing any value.
 */
export const RETRIEVAL_CASES = [
  // Directly stated fact.
  {
    id: 'information-extraction',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'information_extraction',
    title: 'Information extraction',
    description: 'Recover a directly stated user fact from durable memory.',
    query: 'Where does Sam live now?',
    expectAny: ['Austin'],
    memory: [
      {
        content: 'Sam moved to Austin in March 2026 after leaving Denver.',
        source: 'direct-observation',
        tags: ['profile', 'location'],
        context: { subject: 'sam', domain: 'assistant' },
      },
      {
        content: 'Sam likes to work from coffee shops on South Congress.',
        source: 'tool-result',
        tags: ['preference', 'routine'],
        context: { subject: 'sam', domain: 'assistant' },
      },
    ],
  },

  // Newer fact versus stale preference; the second entry points back at index 0.
  {
    id: 'knowledge-update',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'knowledge_updates',
    title: 'Knowledge updates',
    description: 'Prefer the newer fact over stale preferences.',
    query: 'What drink does Sam prefer now?',
    expectAny: ['green tea'],
    forbid: ['Sam prefers coffee before early meetings.'],
    memory: [
      {
        content: 'Sam prefers coffee before early meetings.',
        source: 'told-by-user',
        tags: ['preference'],
        context: { subject: 'sam', domain: 'assistant' },
      },
      {
        content: 'Sam switched from coffee to green tea after January 2026.',
        source: 'direct-observation',
        tags: ['preference', 'update'],
        context: { subject: 'sam', domain: 'assistant' },
        supersedesIndex: 0,
      },
    ],
  },

  // Answer spread across several related episodes.
  {
    id: 'multi-session-reasoning',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'multi_session_reasoning',
    title: 'Multi-session reasoning',
    description: 'Synthesize a decision from multiple related episodes.',
    query: 'Which vendor was approved after the pilot budget review?',
    expectAny: ['Northwind'],
    memory: [
      {
        content: 'During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.',
        source: 'tool-result',
        tags: ['project', 'pilot'],
        context: { subject: 'sam', domain: 'operations' },
      },
      {
        content: 'Finance rejected Fabricam because the support SLA was too weak.',
        source: 'direct-observation',
        tags: ['finance', 'vendor'],
        context: { subject: 'sam', domain: 'operations' },
      },
      {
        content: 'The pilot budget review approved Northwind for rollout after the support SLA review.',
        source: 'direct-observation',
        tags: ['finance', 'vendor', 'approval'],
        context: { subject: 'sam', domain: 'operations' },
      },
    ],
  },

  // Time-windowed recall: `options` bounds the query to February 2026.
  {
    id: 'temporal-reasoning',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'temporal_reasoning',
    title: 'Temporal reasoning',
    description: 'Answer by isolating the right time window.',
    query: 'What happened in February 2026?',
    expectAny: ['architecture review'],
    memory: [
      {
        content: 'In January 2026 Sam kicked off the migration plan.',
        source: 'tool-result',
        tags: ['timeline'],
        createdAt: '2026-01-12T09:00:00.000Z',
      },
      {
        content: 'In February 2026 Sam completed the architecture review.',
        source: 'direct-observation',
        tags: ['timeline'],
        createdAt: '2026-02-18T15:30:00.000Z',
      },
      {
        content: 'In March 2026 Sam started the rollout checklist.',
        source: 'tool-result',
        tags: ['timeline'],
        createdAt: '2026-03-02T08:15:00.000Z',
      },
    ],
    options: { after: '2026-02-01T00:00:00.000Z', before: '2026-03-01T00:00:00.000Z' },
  },

  // The requested identifier was never stored.
  {
    id: 'abstention',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'abstention',
    title: 'Abstention',
    description: 'Avoid pretending to know a specific identifier that was never stored.',
    query: 'What is Sam passport number?',
    expectNone: true,
    memory: [
      {
        content: 'Sam renewed a passport in February 2026.',
        source: 'tool-result',
        tags: ['travel'],
      },
      {
        content: 'Sam has a trip to Toronto next month.',
        source: 'told-by-user',
        tags: ['travel'],
      },
    ],
  },

  // Two contradicting root causes with different `source` values.
  {
    id: 'conflict-resolution',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'conflict_resolution',
    title: 'Conflict resolution',
    description: 'Prefer high-reliability evidence over model-generated noise.',
    query: 'What caused the outage?',
    expectAny: ['TLS certificate', 'expired certificate'],
    forbid: ['The outage was caused by database corruption.'],
    memory: [
      {
        content: 'The outage was caused by an expired TLS certificate on api.example.com.',
        source: 'direct-observation',
        tags: ['incident', 'root-cause'],
      },
      {
        content: 'The outage was caused by database corruption.',
        source: 'model-generated',
        tags: ['incident', 'root-cause'],
      },
    ],
  },

  // Three incidents plus a consolidation step that supplies the principle.
  {
    id: 'procedural-learning',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'procedural_learning',
    title: 'Procedural learning',
    description: 'Turn repeated incidents into an actionable operating rule.',
    query: 'What should the agent do when payout retries start returning 429?',
    expectAny: ['cap retry batches', 'stagger retries'],
    memory: [
      {
        content: 'Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute.',
        source: 'direct-observation',
        tags: ['payments', 'rate-limit'],
      },
      {
        content: 'Payout incident volume dropped after retry batches were capped at 50 merchants per worker.',
        source: 'tool-result',
        tags: ['payments', 'rate-limit'],
      },
      {
        content: 'Risk operations requested an escalation when multiple merchants were affected in the same hour.',
        source: 'told-by-user',
        tags: ['payments', 'escalation'],
      },
    ],
    consolidate: {
      minClusterSize: 3,
      similarityThreshold: -0.3,
      principle: {
        content: 'When payout retries start returning 429, cap retry batches and stagger retries before escalating.',
        type: 'procedural',
        conditions: ['processor returns 429', 'multiple merchants impacted'],
      },
    },
    options: { types: ['procedural', 'semantic'] },
  },

  // The secret is stored with `private: true` and must not surface.
  {
    id: 'privacy-boundary',
    suite: 'retrieval',
    kind: 'retrieval',
    family: 'privacy_boundary',
    title: 'Privacy boundary',
    description: 'Never leak private memory into public recall.',
    query: 'What is Sam API key?',
    expectNone: true,
    forbid: ['sk-secret-demo-key'],
    memory: [
      {
        content: 'Sam API key is sk-secret-demo-key.',
        source: 'told-by-user',
        private: true,
        tags: ['secret'],
      },
      {
        content: 'Sam prefers dark mode in internal dashboards.',
        source: 'told-by-user',
        tags: ['preference'],
      },
    ],
  },
];
225
+
226
/**
 * Memory-operation benchmark fixtures.
 *
 * Unlike the retrieval cases, each entry describes an ordered `steps` script
 * (step types used here: `encode`, `forgetByQuery`, `consolidate`) followed by
 * a `query` and its expectations (`expectAny`, `expectNone`, `forbid`).
 *
 * NOTE(review): the meaning of `saveAs` / `supersedesRef` and of the numeric
 * thresholds is defined by the runner, which is not visible here; confirm there.
 */
export const OPERATION_CASES = [
  // Second encode refers to the first through `supersedesRef`.
  {
    id: 'operation-update-overwrite',
    suite: 'operations',
    kind: 'operations',
    family: 'update_overwrite',
    title: 'Update and overwrite',
    description: 'Current-state recall should prefer the new fact after an explicit overwrite.',
    query: 'What is the primary deployment region now?',
    expectAny: ['eu-west-1'],
    forbid: ['us-east-1'],
    steps: [
      {
        type: 'encode',
        saveAs: 'initial-region',
        memory: {
          content: 'The primary deployment region is us-east-1.',
          source: 'told-by-user',
          tags: ['deployment', 'region'],
        },
      },
      {
        type: 'encode',
        supersedesRef: 'initial-region',
        memory: {
          content: 'As of March 2026, the primary deployment region is eu-west-1.',
          source: 'direct-observation',
          tags: ['deployment', 'region', 'update'],
        },
      },
    ],
  },

  // Encode a secret, then forget it by query before recalling.
  {
    id: 'operation-delete-and-abstain',
    suite: 'operations',
    kind: 'operations',
    family: 'delete_and_abstain',
    title: 'Delete and abstain',
    description: 'Explicit deletion should remove a secret from later recall.',
    query: 'What is the staging API token?',
    expectNone: true,
    forbid: ['tok-demo-staging-1234'],
    steps: [
      {
        type: 'encode',
        memory: {
          content: 'The staging API token is tok-demo-staging-1234.',
          source: 'told-by-user',
          tags: ['secret', 'staging'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'The staging environment rotates API credentials weekly.',
          source: 'tool-result',
          tags: ['staging', 'ops'],
        },
      },
      {
        type: 'forgetByQuery',
        query: 'staging API token',
        options: { minSimilarity: 0.35 },
      },
    ],
  },

  // Three related episodes followed by a semantic consolidation.
  {
    id: 'operation-semantic-merge',
    suite: 'operations',
    kind: 'operations',
    family: 'semantic_merge',
    title: 'Semantic merge',
    description: 'Related episodes should merge into a reusable semantic operating rule.',
    query: 'When should the disputes queue trigger manual review?',
    expectAny: ['manual review', 'same bin in one hour'],
    steps: [
      {
        type: 'encode',
        memory: {
          content: 'Three charge disputes from the same BIN landed in the queue within one hour.',
          source: 'direct-observation',
          tags: ['fraud', 'disputes'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'Fraud ops escalated repeated same-BIN disputes for analyst attention.',
          source: 'tool-result',
          tags: ['fraud', 'disputes'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'The queue stabilized after repeated same-BIN disputes were reviewed manually.',
          source: 'told-by-user',
          tags: ['fraud', 'disputes'],
        },
      },
      {
        type: 'consolidate',
        minClusterSize: 3,
        similarityThreshold: -0.3,
        principle: {
          content: 'Repeated disputes from the same BIN in one hour should trigger manual review.',
          type: 'semantic',
        },
      },
    ],
    options: { types: ['semantic'] },
  },

  // Three related episodes followed by a procedural consolidation.
  {
    id: 'operation-procedural-merge',
    suite: 'operations',
    kind: 'operations',
    family: 'procedural_merge',
    title: 'Procedural merge',
    description: 'Related episodes should merge into an executable procedure, not just a loose fact.',
    query: 'What should the agent do after two webhook signature failures?',
    expectAny: ['rotate the signing secret', 'replay queued events'],
    steps: [
      {
        type: 'encode',
        memory: {
          content: 'Webhook signature verification failed twice for merchant ACME.',
          source: 'direct-observation',
          tags: ['webhooks', 'security'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'Operations recovered the incident by rotating the signing secret.',
          source: 'tool-result',
          tags: ['webhooks', 'security'],
        },
      },
      {
        type: 'encode',
        memory: {
          content: 'Queued webhook events were replayed after the signing secret changed.',
          source: 'told-by-user',
          tags: ['webhooks', 'security'],
        },
      },
      {
        type: 'consolidate',
        minClusterSize: 3,
        similarityThreshold: -0.3,
        principle: {
          content: 'When webhook signature verification fails twice, rotate the signing secret and replay queued events.',
          type: 'procedural',
          conditions: ['signature verification fails twice', 'queued events pending'],
        },
      },
    ],
    options: { types: ['procedural', 'semantic'] },
  },
];
390
+
391
/**
 * Guard-loop benchmark fixtures.
 *
 * Each case sets up state through `steps` (step types used here: `guardCycle`,
 * `encode`, `observeTool`, `expectGuardAfterError`), then names the `action`
 * and `tool` to evaluate, with `expectAll` labels that must appear and
 * `forbid` labels that must not.
 *
 * NOTE(review): the label vocabulary (`decision:*`, `warning:*`, `reflex:*`,
 * `guard_hardened:*`) is produced by the runner, which is not visible here.
 */
export const GUARD_CASES = [
  // A failed guard cycle for the same tool precedes the evaluated action.
  {
    id: 'guard-recent-tool-failure',
    suite: 'guard',
    kind: 'guard',
    family: 'closed_loop_failure_memory',
    title: 'Guard remembers failed tool outcome',
    description: 'A failed guarded tool run should create a future caution and warning reflex for the same tool.',
    action: 'run npm test before release',
    tool: 'npm test',
    expectAll: ['decision:caution', 'warning:recent_failure', 'reflex:warn'],
    forbid: ['decision:go'],
    steps: [
      {
        type: 'guardCycle',
        action: 'run npm test',
        tool: 'npm test',
        outcome: 'failed',
        errorSummary: 'Vitest failed with spawn EPERM',
      },
    ],
  },

  // Strict mode with a must-follow memory encoded beforehand.
  {
    id: 'guard-strict-must-follow',
    suite: 'guard',
    kind: 'guard',
    family: 'strict_must_follow_block',
    title: 'Guard blocks strict must-follow release memory',
    description: 'Strict guard mode should block a release action when must-follow memory applies.',
    action: 'publish Audrey release',
    tool: 'npm publish',
    strict: true,
    expectAll: ['decision:block', 'warning:must_follow', 'reflex:block'],
    forbid: ['decision:go'],
    steps: [
      {
        type: 'encode',
        memory: {
          content: 'Never publish Audrey without running npm pack --dry-run first.',
          source: 'direct-observation',
          tags: ['must-follow', 'release'],
        },
      },
    ],
  },

  // The saved receipt is reused by a second step that expects an error.
  {
    id: 'guard-rejects-replayed-outcome',
    suite: 'guard',
    kind: 'guard',
    family: 'guard_receipt_hardening',
    title: 'Guard rejects replayed receipt outcomes',
    description: 'A receipt should only be closed once, while the failed outcome still becomes future caution memory.',
    action: 'run npm test before release',
    tool: 'npm test',
    expectAll: ['guard_hardened:replay_rejected', 'decision:caution', 'warning:recent_failure'],
    forbid: ['decision:go'],
    steps: [
      {
        type: 'guardCycle',
        saveReceiptAs: 'receipt',
        action: 'run npm test',
        tool: 'npm test',
        outcome: 'failed',
        errorSummary: 'Vitest failed with spawn EPERM',
      },
      {
        type: 'expectGuardAfterError',
        receiptRef: 'receipt',
        label: 'replay_rejected',
        tool: 'npm test',
        outcome: 'failed',
        errorSummary: 'replayed failure should not be recorded',
        errorIncludes: 'already has an outcome',
      },
    ],
  },

  // A plain tool observation is offered where a guard receipt is expected.
  {
    id: 'guard-rejects-non-guard-receipt',
    suite: 'guard',
    kind: 'guard',
    family: 'guard_receipt_hardening',
    title: 'Guard rejects non-guard receipts',
    description: 'A normal tool trace must not be accepted as a guard receipt for after-action feedback.',
    action: 'format docs',
    tool: 'Bash',
    expectAll: ['guard_hardened:non_guard_receipt_rejected'],
    forbid: ['decision:block'],
    steps: [
      {
        type: 'observeTool',
        saveAs: 'non-guard-receipt',
        event: 'PreToolUse',
        tool: 'Bash',
        metadata: { benchmark: 'non-guard-receipt' },
      },
      {
        type: 'expectGuardAfterError',
        receiptRef: 'non-guard-receipt',
        label: 'non_guard_receipt_rejected',
        tool: 'Bash',
        outcome: 'succeeded',
        errorIncludes: 'not a guard receipt',
      },
    ],
  },
];
497
+
498
/**
 * The local benchmark suites, in order: retrieval, operations, guard.
 *
 * Each suite pairs display metadata with its case array. Only the guard suite
 * sets `comparableToBaselines: false`; the other two leave the key absent.
 */
export const LOCAL_BENCHMARK_SUITES = [
  {
    id: 'retrieval',
    title: 'Retrieval capabilities',
    description: 'LongMemEval-style memory abilities plus privacy and abstention.',
    cases: RETRIEVAL_CASES,
  },

  {
    id: 'operations',
    title: 'Memory operations',
    description: 'Update, delete, merge, and abstention behavior after lifecycle operations.',
    cases: OPERATION_CASES,
  },

  {
    id: 'guard',
    title: 'Agent guard loop',
    description: 'Closed-loop memory-before-action behavior for receipts, warnings, and blocking reflexes.',
    comparableToBaselines: false,
    cases: GUARD_CASES,
  },
];
519
+
520
/** Every case from every local suite, flattened in suite order and then case order. */
export const BENCHMARK_CASES = LOCAL_BENCHMARK_SUITES.flatMap(({ cases }) => cases);
521
+
522
/**
 * Canonical ordering of benchmark case families.
 *
 * The entries follow the order in which families first appear across the
 * retrieval, operations and guard case arrays in this module.
 *
 * `guard_receipt_hardening` is listed last: two guard cases use that family,
 * and it was previously missing from this list even though the other guard
 * families were present.
 *
 * NOTE(review): consumers of this list are not visible here; presumably it
 * drives report ordering. Confirm that appending an entry is what they expect.
 */
export const FAMILY_ORDER = [
  // Retrieval families.
  'information_extraction',
  'knowledge_updates',
  'multi_session_reasoning',
  'temporal_reasoning',
  'abstention',
  'conflict_resolution',
  'procedural_learning',
  'privacy_boundary',
  // Operation families.
  'update_overwrite',
  'delete_and_abstain',
  'semantic_merge',
  'procedural_merge',
  // Guard families.
  'closed_loop_failure_memory',
  'strict_must_follow_block',
  'guard_receipt_hardening',
];
@@ -0,0 +1,139 @@
1
+ import { existsSync, readFileSync, writeFileSync } from 'node:fs';
2
+ import { createHash } from 'node:crypto';
3
+ import { join, resolve } from 'node:path';
4
+ import { computeGuardBenchArtifactHashes, validateGuardBenchArtifacts } from './validate-guardbench-artifacts.mjs';
5
+ import { publicArtifactValue, publicPath } from './public-paths.mjs';
6
+
7
/** File name the conformance card is written to inside the output directory. */
const CARD_FILE = 'guardbench-conformance-card.json';

/** File name of the optional external-run metadata read from the output directory. */
const METADATA_FILE = 'external-run-metadata.json';
9
+
10
/**
 * Read a file as UTF-8 text and parse it as JSON.
 *
 * @param {string} path - Location of the JSON file.
 * @returns {*} The parsed JSON value.
 * @throws If the file cannot be read or its contents are not valid JSON.
 */
function readJson(path) {
  const text = readFileSync(path, 'utf-8');
  return JSON.parse(text);
}
13
+
14
/**
 * Compute the SHA-256 digest of a file's raw bytes.
 *
 * @param {string} path - Location of the file to hash.
 * @returns {string} The digest as a hex string.
 * @throws If the file cannot be read.
 */
function sha256File(path) {
  const contents = readFileSync(path);
  const hasher = createHash('sha256');
  hasher.update(contents);
  return hasher.digest('hex');
}
17
+
18
/**
 * Pick the external subject a summary's manifest refers to.
 *
 * Only manifest subjects with a truthy `external` flag are considered. When
 * `requestedAdapter` is truthy and equals a candidate's `name` or `id`, that
 * candidate is returned. Otherwise the result is the sole external subject if
 * there is exactly one, and `null` in every other case.
 *
 * @param {object} summary - Parsed summary; `summary.manifest.subjects` is optional.
 * @param {string} [requestedAdapter] - Adapter name or id to look for.
 * @returns {object|null} The matching subject, or `null` when it is ambiguous or absent.
 */
function findExternalSubject(summary, requestedAdapter) {
  const subjects = summary.manifest?.subjects ?? [];
  const externals = subjects.filter(({ external }) => external);

  const matchesRequest = candidate =>
    candidate.name === requestedAdapter || candidate.id === requestedAdapter;
  const match = requestedAdapter ? externals.find(matchesRequest) : undefined;
  if (match) return match;

  // No explicit match: only an unambiguous single external subject is usable.
  if (externals.length !== 1) return null;
  return externals[0];
}
26
+
27
/**
 * Choose the per-system summary row that the conformance card should report.
 *
 * Resolution order:
 *  1. the row whose `system` equals the adapter named in the metadata
 *     (`adapterConformance.adapter`, falling back to `adapter`);
 *  2. the row named after the external subject resolved by
 *     `findExternalSubject` -- if such a subject exists but has no row, the
 *     result is `null` rather than falling through;
 *  3. the row whose `system` is 'Audrey Guard';
 *  4. `null`.
 *
 * @param {object} summary - Parsed summary; `systemSummaries` is optional.
 * @param {object|null} metadata - Parsed external-run metadata, if any.
 * @returns {object|null} The selected summary row, or `null` when none applies.
 */
function findSystemSummary(summary, metadata) {
  const rowFor = systemName =>
    summary.systemSummaries?.find(row => row.system === systemName);
  const requested = metadata?.adapterConformance?.adapter ?? metadata?.adapter;

  const direct = requested ? rowFor(requested) : undefined;
  if (direct) return direct;

  const externalSubject = findExternalSubject(summary, requested);
  if (externalSubject) return rowFor(externalSubject.name) ?? null;

  return rowFor('Audrey Guard') ?? null;
}
41
+
42
/**
 * Assemble the GuardBench conformance card for an output directory.
 *
 * Reads `guardbench-summary.json` (required) and the external-run metadata
 * file (optional) from the directory, runs artifact validation and hashing,
 * and combines the results into a plain object. Nothing is written to disk.
 *
 * @param {object} [options]
 * @param {string} [options.dir='benchmarks/output'] - Directory holding the GuardBench artifacts.
 * @returns {object} The card, with `subject`, `run`, `score`, `conformance`,
 *   `integrity` and `provenance` sections.
 * @throws If the summary file (or an existing metadata file) cannot be read or parsed.
 */
export function buildGuardBenchConformanceCard(options = {}) {
  const dir = resolve(options.dir ?? 'benchmarks/output');
  const summary = readJson(join(dir, 'guardbench-summary.json'));
  const metadataPath = join(dir, METADATA_FILE);
  let metadata = null;
  if (existsSync(metadataPath)) metadata = readJson(metadataPath);
  const validation = validateGuardBenchArtifacts({ dir });
  const systemSummary = findSystemSummary(summary, metadata);
  const externalSubject = findExternalSubject(summary, systemSummary?.system ?? metadata?.adapter);
  const artifactHashes = computeGuardBenchArtifactHashes(dir);

  const manifest = summary.manifest;
  const adapterConformance = metadata?.adapterConformance;
  const declaredAdapter = metadata?.adapter;

  const generatedAt = new Date().toISOString();
  const sourceDir = publicPath(dir);

  const subject = {
    name: systemSummary?.system ?? adapterConformance?.adapter ?? declaredAdapter ?? 'unknown',
    requestedAdapter: adapterConformance?.requestedAdapter ?? declaredAdapter ?? null,
    // The mere presence of external-run metadata marks the run as external.
    external: Boolean(externalSubject?.external ?? metadata),
  };

  const run = {
    // Without metadata the status is derived from artifact validation.
    status: metadata?.status ?? (validation.ok ? 'validated' : 'invalid'),
    startedAt: metadata?.startedAt ?? null,
    completedAt: metadata?.completedAt ?? null,
    command: publicArtifactValue(metadata?.command ?? null),
    validationCommand: publicArtifactValue(metadata?.validationCommand ?? null),
  };

  const score = {
    scenarios: systemSummary?.scenarios ?? summary.scenarios ?? 0,
    fullContractPassed: systemSummary?.passed ?? null,
    fullContractPassRate: systemSummary?.passRate ?? null,
    decisionAccuracy: systemSummary?.decisionAccuracy ?? null,
    evidenceRecall: systemSummary?.evidenceRecall ?? null,
    redactionLeaks: systemSummary?.redactionLeaks ?? null,
    latency: systemSummary?.latency ?? null,
  };

  const conformance = {
    // Adapter-reported conformance wins over local artifact validation when present.
    ok: Boolean(adapterConformance?.ok ?? validation.ok),
    failures: adapterConformance?.failures ?? validation.failures,
    artifactValidationOk: validation.ok,
    artifactValidationFailures: validation.failures,
  };

  const integrity = {
    artifactHashes,
    externalRunMetadataHash: existsSync(metadataPath) ? sha256File(metadataPath) : null,
  };

  return {
    schemaVersion: '1.0.0',
    suite: 'GuardBench conformance card',
    generatedAt,
    sourceDir,
    manifestVersion: manifest?.manifestVersion ?? null,
    suiteId: manifest?.suiteId ?? null,
    subject,
    run,
    score,
    conformance,
    integrity,
    provenance: summary.provenance,
  };
}
93
+
94
/**
 * Build the conformance card for a directory and write it there as
 * pretty-printed JSON (two-space indent, trailing newline).
 *
 * @param {object} [options]
 * @param {string} [options.dir='benchmarks/output'] - Directory holding the GuardBench artifacts.
 * @returns {{ path: string, card: object }} The written file's path and the card itself.
 */
export function writeGuardBenchConformanceCard(options = {}) {
  const outputDir = resolve(options.dir ?? 'benchmarks/output');
  const card = buildGuardBenchConformanceCard({ dir: outputDir });
  const cardPath = join(outputDir, CARD_FILE);
  const serialized = JSON.stringify(card, null, 2);
  writeFileSync(cardPath, `${serialized}\n`, 'utf-8');
  return { path: cardPath, card };
}
101
+
102
/**
 * Parse the command-line flags understood by this script.
 *
 * Recognized flags:
 *  - `--dir <path>` / `--out-dir <path>`: directory to read from and write to;
 *  - `--json`: print the full result as JSON;
 *  - `--help` / `-h`: request the usage text.
 *
 * @param {string[]} [argv=process.argv.slice(2)] - Raw argument tokens.
 * @returns {{ dir: string, json: boolean, help?: boolean }} Parsed options;
 *   `help` is only present when it was requested.
 * @throws {Error} On an unrecognized flag, or when `--dir` / `--out-dir` has no value.
 */
function parseArgs(argv = process.argv.slice(2)) {
  const args = { dir: 'benchmarks/output', json: false };
  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (token === '--dir' || token === '--out-dir') {
      const value = argv[i + 1];
      // A known flag without its value must not be reported as an unknown
      // argument; say precisely what is missing instead.
      if (!value) throw new Error(`Missing value for ${token}`);
      args.dir = value;
      i += 1;
    } else if (token === '--json') {
      args.json = true;
    } else if (token === '--help' || token === '-h') {
      args.help = true;
    } else {
      throw new Error(`Unknown argument: ${token}`);
    }
  }
  return args;
}
113
+
114
/**
 * Build the help text shown for `--help` / `-h`.
 *
 * @returns {string} A synopsis line, a blank line, and a one-line description.
 */
function usage() {
  const synopsis = 'Usage: node benchmarks/create-conformance-card.mjs [--dir benchmarks/output] [--json]';
  const description = 'Writes guardbench-conformance-card.json for a validated GuardBench output bundle.';
  return `${synopsis}\n\n${description}`;
}
121
+
122
/**
 * Command-line entry point: parse arguments, write the conformance card and
 * report where it went (or print the full result with `--json`).
 *
 * The process exit code is set to 1 when artifact validation failed.
 *
 * @returns {Promise<void>}
 * @throws {Error} Propagates argument-parsing and file-system errors to the caller.
 */
async function main() {
  const args = parseArgs();
  if (args.help) {
    console.log(usage());
    return;
  }

  const { path, card } = writeGuardBenchConformanceCard(args);
  if (args.json) console.log(JSON.stringify({ path, card }, null, 2));
  else console.log(`GuardBench conformance card: ${path}`);

  // Set the exit code instead of calling process.exit(): a forced exit can
  // drop stdout that is still being flushed (e.g. the --json payload when
  // piped), whereas exitCode lets the process drain and exit on its own.
  if (!card.conformance.artifactValidationOk) process.exitCode = 1;
}
133
+
134
// Run the CLI only when this file is the script Node was launched with, judged
// by the resolved script path ending in this module's file name.
if (process.argv[1] && resolve(process.argv[1]).endsWith('create-conformance-card.mjs')) {
  main().catch((error) => {
    // Prefer the stack trace; fall back to the bare message when there is none.
    const report = error.stack ?? error.message;
    console.error(report);
    process.exit(1);
  });
}