audrey 0.23.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/CHANGELOG.md +81 -19
  2. package/LICENSE +21 -21
  3. package/README.md +209 -5
  4. package/SECURITY.md +2 -1
  5. package/benchmarks/adapter-kit.mjs +20 -0
  6. package/benchmarks/adapter-self-test.mjs +166 -0
  7. package/benchmarks/adapters/example-allow.mjs +28 -0
  8. package/benchmarks/adapters/mem0-platform.mjs +267 -0
  9. package/benchmarks/adapters/registry.json +51 -0
  10. package/benchmarks/adapters/zep-cloud.mjs +280 -0
  11. package/benchmarks/baselines.js +169 -0
  12. package/benchmarks/build-leaderboard.mjs +170 -0
  13. package/benchmarks/cases.js +537 -0
  14. package/benchmarks/create-conformance-card.mjs +139 -0
  15. package/benchmarks/create-submission-bundle.mjs +176 -0
  16. package/benchmarks/dry-run-external-adapters.mjs +165 -0
  17. package/benchmarks/guardbench.js +1035 -0
  18. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  19. package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  20. package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  21. package/benchmarks/output/guardbench-conformance-card.json +63 -0
  22. package/benchmarks/output/guardbench-manifest.json +414 -0
  23. package/benchmarks/output/guardbench-raw.json +1171 -0
  24. package/benchmarks/output/guardbench-summary.json +1981 -0
  25. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  26. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  27. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
  28. package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
  29. package/benchmarks/output/submission-bundle/guardbench-raw.json +1171 -0
  30. package/benchmarks/output/submission-bundle/guardbench-summary.json +1981 -0
  31. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
  32. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
  33. package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
  34. package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
  35. package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
  36. package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
  37. package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
  38. package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
  39. package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
  40. package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +164 -0
  41. package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
  42. package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +228 -0
  43. package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  44. package/benchmarks/output/submission-bundle/validation-report.json +31 -0
  45. package/benchmarks/output/summary.json +2354 -0
  46. package/benchmarks/perf-snapshot.js +304 -0
  47. package/benchmarks/perf.bench.js +161 -0
  48. package/benchmarks/public-paths.mjs +78 -0
  49. package/benchmarks/reference-results.js +70 -0
  50. package/benchmarks/report.js +259 -0
  51. package/benchmarks/run-external-guardbench.mjs +281 -0
  52. package/benchmarks/run.js +682 -0
  53. package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  54. package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  55. package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  56. package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  57. package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  58. package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  59. package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  60. package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  61. package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  62. package/benchmarks/schemas/guardbench-raw.schema.json +164 -0
  63. package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  64. package/benchmarks/schemas/guardbench-summary.schema.json +228 -0
  65. package/benchmarks/snapshots/perf-0.22.2.json +123 -0
  66. package/benchmarks/snapshots/perf-0.23.0.json +123 -0
  67. package/benchmarks/validate-adapter-module.mjs +104 -0
  68. package/benchmarks/validate-adapter-registry.mjs +134 -0
  69. package/benchmarks/validate-adapter-self-test.mjs +96 -0
  70. package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
  71. package/benchmarks/verify-external-evidence.mjs +296 -0
  72. package/benchmarks/verify-publication-artifacts.mjs +286 -0
  73. package/benchmarks/verify-submission-bundle.mjs +167 -0
  74. package/dist/mcp-server/config.d.ts +1 -1
  75. package/dist/mcp-server/config.d.ts.map +1 -1
  76. package/dist/mcp-server/config.js +1 -1
  77. package/dist/mcp-server/config.js.map +1 -1
  78. package/dist/mcp-server/index.d.ts +65 -3
  79. package/dist/mcp-server/index.d.ts.map +1 -1
  80. package/dist/mcp-server/index.js +675 -157
  81. package/dist/mcp-server/index.js.map +1 -1
  82. package/dist/src/action-key.d.ts +9 -0
  83. package/dist/src/action-key.d.ts.map +1 -0
  84. package/dist/src/action-key.js +49 -0
  85. package/dist/src/action-key.js.map +1 -0
  86. package/dist/src/adaptive.js +5 -5
  87. package/dist/src/affect.js +8 -8
  88. package/dist/src/audrey.d.ts +3 -0
  89. package/dist/src/audrey.d.ts.map +1 -1
  90. package/dist/src/audrey.js +55 -3
  91. package/dist/src/audrey.js.map +1 -1
  92. package/dist/src/capsule.js +4 -4
  93. package/dist/src/causal.js +3 -3
  94. package/dist/src/consolidate.js +48 -48
  95. package/dist/src/controller.d.ts +61 -5
  96. package/dist/src/controller.d.ts.map +1 -1
  97. package/dist/src/controller.js +230 -49
  98. package/dist/src/controller.js.map +1 -1
  99. package/dist/src/db.js +172 -172
  100. package/dist/src/decay.js +8 -8
  101. package/dist/src/embedding.d.ts +2 -1
  102. package/dist/src/embedding.d.ts.map +1 -1
  103. package/dist/src/embedding.js +39 -29
  104. package/dist/src/embedding.js.map +1 -1
  105. package/dist/src/encode.js +6 -6
  106. package/dist/src/feedback.d.ts +6 -0
  107. package/dist/src/feedback.d.ts.map +1 -1
  108. package/dist/src/feedback.js +6 -0
  109. package/dist/src/feedback.js.map +1 -1
  110. package/dist/src/forget.js +12 -12
  111. package/dist/src/hybrid-recall.js +9 -9
  112. package/dist/src/impact.js +6 -6
  113. package/dist/src/import.d.ts +3 -3
  114. package/dist/src/import.js +41 -41
  115. package/dist/src/index.d.ts +3 -3
  116. package/dist/src/index.d.ts.map +1 -1
  117. package/dist/src/index.js +2 -2
  118. package/dist/src/index.js.map +1 -1
  119. package/dist/src/interference.js +14 -14
  120. package/dist/src/introspect.js +18 -18
  121. package/dist/src/preflight.d.ts.map +1 -1
  122. package/dist/src/preflight.js +41 -0
  123. package/dist/src/preflight.js.map +1 -1
  124. package/dist/src/promote.js +7 -7
  125. package/dist/src/prompts.js +118 -118
  126. package/dist/src/recall.js +30 -30
  127. package/dist/src/reflexes.d.ts +1 -0
  128. package/dist/src/reflexes.d.ts.map +1 -1
  129. package/dist/src/reflexes.js +3 -0
  130. package/dist/src/reflexes.js.map +1 -1
  131. package/dist/src/rollback.js +4 -4
  132. package/dist/src/routes.d.ts.map +1 -1
  133. package/dist/src/routes.js +67 -1
  134. package/dist/src/routes.js.map +1 -1
  135. package/dist/src/validate.js +25 -25
  136. package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  137. package/docs/MEMORY_BENCHMARKING.md +59 -0
  138. package/docs/PRODUCTION_BACKLOG.md +304 -0
  139. package/docs/paper/00-master.md +48 -0
  140. package/docs/paper/01-introduction.md +27 -0
  141. package/docs/paper/02-related-work.md +47 -0
  142. package/docs/paper/03-problem-definition.md +108 -0
  143. package/docs/paper/04-design.md +164 -0
  144. package/docs/paper/05-guardbench-spec.md +412 -0
  145. package/docs/paper/06-implementation.md +113 -0
  146. package/docs/paper/07-evaluation.md +168 -0
  147. package/docs/paper/08-discussion-limitations.md +61 -0
  148. package/docs/paper/09-conclusion.md +11 -0
  149. package/docs/paper/SUBMISSION_README.md +162 -0
  150. package/docs/paper/appendix-a-demo-transcript.md +114 -0
  151. package/docs/paper/arxiv-compile-report.schema.json +116 -0
  152. package/docs/paper/arxiv-source.schema.json +61 -0
  153. package/docs/paper/audrey-paper-v1.md +1106 -0
  154. package/docs/paper/browser-launch-plan.json +209 -0
  155. package/docs/paper/browser-launch-plan.schema.json +100 -0
  156. package/docs/paper/browser-launch-results.json +86 -0
  157. package/docs/paper/browser-launch-results.schema.json +66 -0
  158. package/docs/paper/claim-register.json +138 -0
  159. package/docs/paper/claim-register.schema.json +81 -0
  160. package/docs/paper/evidence-ledger.md +103 -0
  161. package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  162. package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  163. package/docs/paper/output/arxiv/main.tex +949 -0
  164. package/docs/paper/output/arxiv/references.bib +222 -0
  165. package/docs/paper/output/arxiv-compile-report.json +24 -0
  166. package/docs/paper/output/submission-bundle/LICENSE +21 -0
  167. package/docs/paper/output/submission-bundle/README.md +533 -0
  168. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  169. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  170. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  171. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
  172. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
  173. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1171 -0
  174. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +1981 -0
  175. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  176. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  177. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  178. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
  179. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
  180. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  181. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  182. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  183. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  184. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  185. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  186. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  187. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  188. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  189. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +164 -0
  190. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  191. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +228 -0
  192. package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  193. package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
  194. package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
  195. package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
  196. package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
  197. package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
  198. package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
  199. package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
  200. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
  201. package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
  202. package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
  203. package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
  204. package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
  205. package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
  206. package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
  207. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
  208. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
  209. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
  210. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
  211. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
  212. package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
  213. package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
  214. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
  215. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  216. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  217. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
  218. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
  219. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
  220. package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
  221. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
  222. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
  223. package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
  224. package/docs/paper/output/submission-bundle/package.json +212 -0
  225. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
  226. package/docs/paper/paper-submission-bundle.schema.json +70 -0
  227. package/docs/paper/publication-pack.json +81 -0
  228. package/docs/paper/publication-pack.schema.json +60 -0
  229. package/docs/paper/references.bib +222 -0
  230. package/package.json +87 -4
  231. package/scripts/audit-release-completion.mjs +362 -0
  232. package/scripts/create-arxiv-source.mjs +362 -0
  233. package/scripts/create-paper-submission-bundle.mjs +210 -0
  234. package/scripts/finalize-release.mjs +526 -0
  235. package/scripts/prepare-release-cut.mjs +269 -0
  236. package/scripts/publish-release-bundle.mjs +209 -0
  237. package/scripts/publish-release-github-api.mjs +429 -0
  238. package/scripts/run-vitest.mjs +34 -0
  239. package/scripts/smoke-cli.js +72 -0
  240. package/scripts/sync-paper-artifacts.mjs +109 -0
  241. package/scripts/verify-arxiv-compile.mjs +440 -0
  242. package/scripts/verify-arxiv-source.mjs +194 -0
  243. package/scripts/verify-browser-launch-plan.mjs +237 -0
  244. package/scripts/verify-browser-launch-results.mjs +285 -0
  245. package/scripts/verify-paper-artifacts.mjs +338 -0
  246. package/scripts/verify-paper-claims.mjs +226 -0
  247. package/scripts/verify-paper-submission-bundle.mjs +207 -0
  248. package/scripts/verify-publication-pack.mjs +196 -0
  249. package/scripts/verify-python-package.py +201 -0
  250. package/scripts/verify-release-readiness.mjs +741 -0
@@ -0,0 +1,533 @@
1
+ <div align="center">
2
+ <img src="docs/assets/audrey-wordmark.png" alt="Audrey wordmark" width="760">
3
+
4
+ <p><strong>The local-first memory firewall for AI agents.</strong></p>
5
+
6
+ <p>
7
+ Give Codex, Claude Code, Claude Desktop, Cursor, Windsurf, VS Code, JetBrains, Ollama-backed agents,
8
+ and custom agent services one durable memory layer they can check before they touch tools.
9
+ </p>
10
+
11
+ <p>
12
+ <a href="https://github.com/Evilander/Audrey/actions/workflows/ci.yml"><img alt="CI" src="https://github.com/Evilander/Audrey/actions/workflows/ci.yml/badge.svg?branch=master"></a>
13
+ <a href="https://www.npmjs.com/package/audrey"><img alt="npm version" src="https://img.shields.io/npm/v/audrey.svg"></a>
14
+ <a href="LICENSE"><img alt="MIT license" src="https://img.shields.io/badge/license-MIT-blue.svg"></a>
15
+ </p>
16
+ </div>
17
+
18
+ ## Why Audrey Exists
19
+
20
+ Agents forget the exact mistakes they made yesterday. They repeat broken commands, lose project-specific rules, miss contradictions, and treat every new session like a cold start.
21
+
22
+ Audrey Guard is the headline loop: record what happened, remember what mattered, check before action, return `allow`, `warn`, or `block` with evidence, then validate whether the memory helped.
23
+
24
+ Audrey turns those hard-won lessons into a local memory runtime:
25
+
26
+ - `audrey guard --tool Bash "npm run deploy"` runs memory-before-action from the terminal.
27
+ - `memory_recall` finds durable context by semantic similarity.
28
+ - `memory_preflight` checks prior failures, risks, rules, and relevant procedures before an action.
29
+ - `memory_reflexes` converts remembered evidence into trigger-response guidance agents can follow.
30
+ - `memory_validate` closes the loop after the action: `helpful`, `used`, or `wrong` outcomes feed salience and can bind back to the exact preflight event, evidence ids, and Guard action fingerprint.
31
+ - `memory_dream` consolidates episodes into principles and applies decay.
32
+ - `audrey impact` and `audrey doctor` tell a human or CI system whether the runtime is doing real work and is actually ready.
33
+
34
+ It is not a hosted vector database, a notes app, or a Claude-only plugin. Audrey is a SQLite-backed continuity layer that can sit under any local or sidecar agent loop.
35
+
36
+ <div align="center">
37
+ <img src="docs/assets/audrey-feature-grid.jpg" alt="Audrey feature marks: memory continuity, archive signal, recall loop, layered evidence, local node, and remembering before acting" width="760">
38
+ </div>
39
+
40
+ ## Quick Start
41
+
42
+ Requires Node.js 20+.
43
+
44
+ ```bash
45
+ npx audrey doctor
46
+ npx audrey demo --scenario repeated-failure
47
+ npx audrey guard --tool Bash "npm run deploy"
48
+ ```
49
+
50
+ `doctor` verifies Node, the MCP entrypoint, provider selection, memory-store health, and host config generation. The repeated-failure demo is no-key, no-host, and no-network: it creates a temporary store, records a failed deploy, teaches Audrey the fix, then shows Audrey Guard blocking the repeat attempt with evidence.
51
+
52
+ Expected first-run shape:
53
+
54
+ ```text
55
+ Audrey Doctor v1.0.0
56
+ Store health: not initialized
57
+ Verdict: ready
58
+ ```
59
+
60
+ After the first real memory write, `doctor` should report the store as healthy.
61
+
62
+ ## Install Into Agent Hosts
63
+
64
+ Preview host setup without editing config files:
65
+
66
+ ```bash
67
+ npx audrey install --host codex --dry-run
68
+ npx audrey install --host claude-code --dry-run
69
+ npx audrey install --host generic --dry-run
70
+ ```
71
+
72
+ Generate raw config blocks:
73
+
74
+ ```bash
75
+ npx audrey mcp-config codex
76
+ npx audrey mcp-config generic
77
+ npx audrey mcp-config vscode
78
+ npx audrey hook-config claude-code
79
+ ```
80
+
81
+ Claude Code can be registered directly:
82
+
83
+ ```bash
84
+ npx audrey install
85
+ claude mcp list
86
+ ```
87
+
88
+ For memory-before-action hooks, preview with `npx audrey hook-config
89
+ claude-code`, then apply with `npx audrey hook-config claude-code --apply
90
+ --scope project` for `.claude/settings.local.json` or `--scope user` for
91
+ `~/.claude/settings.json`. Audrey merges the hook block into existing settings
92
+ and writes a timestamped backup before changing a non-empty file. The generated
93
+ `PreToolUse` hook runs `audrey guard --hook --fail-on-warn`; the `PostToolUse`
94
+ and `PostToolUseFailure` hooks record redacted tool traces. Verify the active
95
+ hook set inside Claude Code with `/hooks`.
96
+
97
+ All local MCP paths default to local embeddings and one shared SQLite-backed memory directory. Use `AUDREY_DATA_DIR` to isolate projects, tenants, or host identities.
98
+
99
+ Installer-generated host config does not include provider API keys by default. Prefer setting `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`, or `GEMINI_API_KEY` in the host runtime environment; use `npx audrey install --include-secrets` only if you explicitly accept argv/config exposure.
100
+
101
+ ## Use With Ollama And Local Agents
102
+
103
+ Ollama runs models; Audrey supplies memory. Start Audrey as a local REST sidecar and expose its routes as tools in your agent loop:
104
+
105
+ ```bash
106
+ AUDREY_AGENT=ollama-local-agent npx audrey serve
107
+ curl http://localhost:7437/health
108
+ curl http://localhost:7437/v1/status
109
+ ```
110
+
111
+ Runnable example:
112
+
113
+ ```bash
114
+ AUDREY_AGENT=ollama-local-agent npx audrey serve
115
+ OLLAMA_MODEL=qwen3 node examples/ollama-memory-agent.js "What should you remember about Audrey?"
116
+ ```
117
+
118
+ Core sidecar tools:
119
+
120
+ | Agent Need | REST Route |
121
+ |---|---|
122
+ | Check memory before acting | `POST /v1/preflight` |
123
+ | Get reflex rules for an action | `POST /v1/reflexes` |
124
+ | Store a useful observation | `POST /v1/encode` |
125
+ | Recall relevant context | `POST /v1/recall` |
126
+ | Get a turn-sized memory packet | `POST /v1/capsule` |
127
+ | Check health | `GET /v1/status` |
128
+
129
+ ## What Ships
130
+
131
+ | Surface | Status |
132
+ |---|---|
133
+ | MCP stdio server | 20 tools plus status/recent/principles resources and briefing/recall/reflection prompts |
134
+ | CLI | `doctor`, `demo`, `guard`, `install`, `mcp-config`, `hook-config`, `serve`, `status`, `dream`, `reembed`, `observe-tool`, `promote`, `impact` |
135
+ | REST API | Hono server with `/health` and `/v1/*` routes |
136
+ | JavaScript SDK | Direct TypeScript/Node import from `audrey` |
137
+ | Python client | `pip install audrey-memory`, calls the REST sidecar |
138
+ | Storage | Local SQLite plus `sqlite-vec`, no hosted database required |
139
+ | Deployment | npm package, Docker, Compose, host-specific MCP config generation |
140
+ | Safety loop | preflight warnings, reflexes, redacted tool traces, contradiction handling |
141
+
142
+ ## Memory Model
143
+
144
+ Audrey is built around the parts of memory that matter for agents:
145
+
146
+ - Episodic memory: specific observations, tool results, preferences, and session facts.
147
+ - Semantic memory: consolidated principles extracted from repeated evidence.
148
+ - Procedural memory: remembered ways to act, avoid, retry, or verify.
149
+ - Affect and salience: emotional weight and importance influence recall.
150
+ - Interference and decay: stale, conflicting, or low-confidence memories lose authority over time.
151
+ - Contradiction handling: competing claims are tracked instead of silently overwritten.
152
+ - Tool-trace learning: failed commands and risky actions become future preflight warnings.
153
+
154
+ The product bet is simple: the next generation of useful agents will not just retrieve facts. They will remember what happened, decide whether a memory is still trustworthy, and use that memory before touching tools.
155
+
156
+ ## Use Audrey From Code
157
+
158
+ ### JavaScript
159
+
160
+ ```js
161
+ import { Audrey } from 'audrey';
162
+
163
+ const brain = new Audrey({
164
+ dataDir: './audrey-data',
165
+ agent: 'support-agent',
166
+ embedding: { provider: 'local', dimensions: 384 },
167
+ });
168
+
169
+ await brain.encode({
170
+ content: 'Stripe returns HTTP 429 above 100 req/s',
171
+ source: 'direct-observation',
172
+ tags: ['stripe', 'rate-limit'],
173
+ });
174
+
175
+ const memories = await brain.recall('stripe rate limit');
176
+
177
+ await brain.waitForIdle();
178
+ brain.close();
179
+ ```
180
+
181
+ ### Python
182
+
183
+ ```bash
184
+ pip install audrey-memory
185
+ ```
186
+
187
+ ```python
188
+ from audrey_memory import Audrey
189
+
190
+ brain = Audrey(base_url="http://127.0.0.1:7437", agent="support-agent")
191
+ memory_id = brain.encode("Stripe returns HTTP 429 above 100 req/s", source="direct-observation")
192
+ results = brain.recall("stripe rate limit", limit=5)
193
+ brain.close()
194
+ ```
195
+
196
+ ## Production Readiness
197
+
198
+ Audrey 1.0 is a local memory runtime, but production readiness depends on how it is embedded. Treat it like stateful infrastructure.
199
+
200
+ Release gates used for this package:
201
+
202
+ ```bash
203
+ npm run release:gate
204
+ npm run python:release:check
205
+ npm run bench:guard:card
206
+ npm run bench:guard:validate
207
+ npx audrey doctor
208
+ npx audrey demo
209
+ ```
210
+
211
+ Recommended runtime checks:
212
+
213
+ ```bash
214
+ npx audrey doctor --json
215
+ npx audrey status --json --fail-on-unhealthy
216
+ npx audrey install --host codex --dry-run
217
+ ```
218
+
219
+ Production controls you still own:
220
+
221
+ - Set one `AUDREY_DATA_DIR` per tenant, environment, or isolation boundary.
222
+ - Pin `AUDREY_EMBEDDING_PROVIDER` and `AUDREY_LLM_PROVIDER` explicitly.
223
+ - Back up the SQLite data directory before provider or dimension changes.
224
+ - Keep API keys and raw credentials out of encoded memory content.
225
+ - Set `AUDREY_API_KEY` if the REST sidecar is reachable beyond the loopback interface (for example, when `AUDREY_HOST` is not `127.0.0.1`).
226
+ - Run `npx audrey dream` on a schedule so consolidation and decay stay current.
227
+ - Add application-level encryption, retention, access control, and audit logging for regulated environments.
228
+
229
+ ## Environment Variables
230
+
231
+ | Variable | Default | Purpose |
232
+ |---|---|---|
233
+ | `AUDREY_DATA_DIR` | `~/.audrey/data` | SQLite memory store path. Use one per tenant or agent identity for isolation. |
234
+ | `AUDREY_AGENT` | `local-agent` | Logical agent identity stamped on writes. |
235
+ | `AUDREY_EMBEDDING_PROVIDER` | `local` | `local`, `gemini`, `openai`, or `mock`. Cloud providers require explicit opt-in. |
236
+ | `AUDREY_LLM_PROVIDER` | auto | `anthropic`, `openai`, or `mock`. |
237
+ | `AUDREY_DEVICE` | `gpu` | Local embedding device (`gpu` or `cpu`). Falls back to CPU if GPU init fails. |
238
+ | `AUDREY_PORT` | `7437` | REST sidecar port. |
239
+ | `AUDREY_HOST` | `127.0.0.1` | REST sidecar bind address. Set to `0.0.0.0` only with `AUDREY_API_KEY`. |
240
+ | `AUDREY_API_KEY` | unset | Bearer token required for non-loopback REST traffic. |
241
+ | `AUDREY_ALLOW_NO_AUTH` | `0` | Set to `1` to allow non-loopback bind without an API key. Don't. |
242
+ | `AUDREY_ENABLE_ADMIN_TOOLS` | `0` | Set to `1` to enable export, import, and forget routes/tools. Disabled by default. |
243
+ | `AUDREY_PROMOTE_ROOTS` | unset | Colon/semicolon-separated extra roots for `audrey promote --yes` writes. By default writes are restricted to `process.cwd()`. |
244
+ | `AUDREY_DEBUG` | `0` | Set to `1` to print MCP info logs (server started, warmup completed). Errors always log. |
245
+ | `AUDREY_PROFILE` | `0` | Set to `1` to emit per-stage timings via MCP `_meta.diagnostics`. |
246
+ | `AUDREY_DISABLE_WARMUP` | `0` | Set to `1` to skip background embedding warmup at MCP boot. |
247
+ | `AUDREY_ONNX_VERBOSE` | `0` | Set to `1` to restore ONNX runtime EP-assignment warnings (suppressed by default). |
248
+ | `AUDREY_PRAGMA_DEFAULTS` | `1` | Set to `0` to revert SQLite PRAGMA tuning to better-sqlite3 defaults. |
249
+ | `AUDREY_CONTEXT_BUDGET_CHARS` | `4000` | Default Memory Capsule character budget. |
250
+
251
+ ## Benchmarks
252
+
253
+ Audrey ships three benchmark families.
254
+
255
+ ### Performance snapshot
256
+
257
+ `npm run bench:perf-snapshot` measures encode and hybrid recall latency at multiple corpus sizes against the in-process mock provider. It reports p50/p95/p99 plus machine provenance so the numbers are reproducible and honest about what they cover.
258
+
259
+ ```bash
260
+ npm run build
261
+ npm run bench:perf-snapshot # default sizes 100, 1000, 5000
262
+ node benchmarks/perf-snapshot.js --sizes 1000,10000 --json # custom shape
263
+ ```
264
+
265
+ Sample output from `benchmarks/snapshots/perf-0.22.2.json` (12-core/24-thread Ryzen 9 7900X3D, Node 25.5.0, mock 64-dim embedding, hybrid recall, limit 5):
266
+
267
+ | Corpus size | Encode p50 (ms) | Encode p95 (ms) | Recall p50 (ms) | Recall p95 (ms) | Recall p99 (ms) |
268
+ |---|---|---|---|---|---|
269
+ | 100 | 0.33 | 0.59 | 0.54 | 1.82 | 2.71 |
270
+ | 1,000 | 0.31 | 2.15 | 1.57 | 2.36 | 21.18 |
271
+ | 5,000 | 0.31 | 1.84 | 2.09 | 3.42 | 16.58 |
272
+
273
+ These numbers cover Audrey's own pipeline (SQLite + sqlite-vec + hybrid ranking) and exclude embedding-provider cost. Real-world recall p95 with a local 384-dim provider is typically 5-15x higher; with a hosted provider it is dominated by the API round-trip. Run on your own hardware before quoting numbers anywhere.
274
+
275
+ ### Behavioral regression suite
276
+
277
+ `npm run bench:memory:check` is a release gate. It runs a small set of retrieval and lifecycle scenarios (information extraction, knowledge updates, multi-session reasoning, conflict resolution, privacy boundary, overwrite, delete-and-abstain, semantic/procedural merge) against Audrey and three weak baselines (vector-only, keyword+recency, recent-window) and asserts Audrey doesn't regress. The baseline comparisons exist to catch correctness regressions in retrieval logic, not to make marketing claims.
278
+
279
+ ```bash
280
+ npm run bench:memory # full regression suite (writes JSON + report)
281
+ npm run bench:memory:check # release gate, exits non-zero on regression
282
+ ```
283
+
284
+ ### GuardBench comparative suite
285
+
286
+ `npm run bench:guard:check` runs Audrey's local GuardBench comparative suite:
287
+ ten pre-action scenarios across Audrey Guard, no-memory, recent-window,
288
+ vector-only, and FTS-only adapters. The scenarios cover exact repeated
289
+ failures, required procedures, changed file scopes, changed commands,
290
+ recovered failures, recall degradation, redaction safety, conflicting
291
+ instructions, and noisy stores. It writes
292
+ `benchmarks/output/guardbench-summary.json`,
293
+ `benchmarks/output/guardbench-manifest.json`, and
294
+ `benchmarks/output/guardbench-raw.json`. The emitted manifest, summary, and raw
295
+ output shapes are validated by JSON schemas under `benchmarks/schemas/`.
296
+
297
+ Latest local result in this checkout: 10/10 scenarios passed, 100% prevention
298
+ rate, 0% false-block rate, 0 raw secret leaks, 0 published artifact leaks in
299
+ the raw-secret sweep, and 3.214ms / 21.395ms
300
+ p50/p95 guard latency under the mock-provider methodology. Local baseline
301
+ decision accuracy was: no-memory 10%, recent-window 60%, vector-only 40%, and
302
+ FTS-only 10%; none passed the full GuardBench decision-plus-evidence contract.
303
+
304
+ ```bash
305
+ npm run bench:guard
306
+ npm run bench:guard:check
307
+ npm run bench:guard:manifest
308
+ npm run bench:guard:validate
309
+ npm run bench:guard:card
310
+ npm run bench:guard:bundle
311
+ npm run bench:guard:bundle:verify
312
+ npm run bench:guard:leaderboard
313
+ npm run bench:guard:adapter-registry:validate
314
+ npm run bench:guard:adapter-module:validate
315
+ npm run bench:guard:adapter-self-test
316
+ npm run bench:guard:adapter-self-test:validate
317
+ npm run bench:guard:publication:verify
318
+ npm run bench:guard:adapter-smoke
319
+ npm run bench:guard:adapter-conformance
320
+ npm run bench:guard:external:dry-run
321
+ npm run bench:guard:mem0 -- --dry-run
322
+ npm run bench:guard:zep -- --dry-run
323
+ node benchmarks/adapter-self-test.mjs --adapter ./path/to/adapter.mjs
324
+ node benchmarks/guardbench.js --adapter ./path/to/adapter.mjs --check
325
+ ```
326
+
327
+ External GuardBench adapters are ESM modules that export one of `default`,
328
+ `adapter`, or `createGuardBenchAdapter()`. The adapter receives scenario seed
329
+ data and the proposed action, but the harness withholds `expectedDecision` and
330
+ `requiredEvidence` until scoring. Start from
331
+ `benchmarks/adapters/example-allow.mjs` when wiring a new system. Adapter
332
+ authors can import `defineGuardBenchAdapter()` and `defineGuardBenchResult()`
333
+ from `benchmarks/adapter-kit.mjs` to validate module shape and decision output
334
+ while developing.
335
+
336
+ The published adapter registry lives at `benchmarks/adapters/registry.json`.
337
+ Run `npm run bench:guard:adapter-registry:validate` to verify registry shape,
338
+ adapter paths, and credential-free module loading.
339
+
340
+ Before running the full self-test, validate the ESM module shape quickly:
341
+
342
+ ```bash
343
+ npm run bench:guard:adapter-module:validate -- --adapter ./path/to/adapter.mjs
344
+ ```
345
+
346
+ Before publishing a new adapter, run `npm run bench:guard:adapter-self-test --
347
+ --adapter ./path/to/adapter.mjs`. The self-test validates the external adapter
348
+ contract and row conformance while explicitly allowing low benchmark scores, so
349
+ authors can separate "valid submission shape" from "competitive GuardBench
350
+ performance." The generated self-test report is validated against
351
+ `benchmarks/schemas/guardbench-adapter-self-test.schema.json`. Reviewers can
352
+ validate a submitted report without rerunning an adapter through `npm run
353
+ bench:guard:adapter-self-test:validate -- --report ./guardbench-adapter-self-test.json`.
354
+
355
+ Audrey ships external adapters for Mem0 Platform and Zep Cloud. Run them only
356
+ with runtime API keys:
357
+
358
+ ```bash
359
+ export MEM0_API_KEY=... # Windows cmd: set MEM0_API_KEY=...
360
+ npm run bench:guard:mem0
361
+
362
+ export ZEP_API_KEY=... # Windows cmd: set ZEP_API_KEY=...
363
+ npm run bench:guard:zep
364
+ ```
365
+
366
+ The Zep adapter uses the current REST surface for users, sessions, `memory.add`,
367
+ `graph.search`, and benchmark-user cleanup. If Zep graph ingestion needs more
368
+ time in a live account, set `ZEP_GUARDBENCH_INGEST_DELAY_MS` before the run.
369
+
370
+ Run `npm run bench:guard:external:dry-run` before coordinating credentialed
371
+ runs. It walks the runtime-env adapter registry, writes non-secret
372
+ `external-run-metadata.json` files for each adapter, and reports which runtime
373
+ environment variables are still missing. The external dry-run matrix report is schema-bound by
374
+ `benchmarks/schemas/guardbench-external-dry-run.schema.json` and written to
375
+ `benchmarks/output/external/guardbench-external-dry-run.json`.
376
+
377
+ Run `npm run bench:guard:external:evidence` after dry-runs or live runs to
378
+ write `benchmarks/output/external/guardbench-external-evidence.json`. This
379
+ external evidence verification report is schema-bound by
380
+ `benchmarks/schemas/guardbench-external-evidence.schema.json`, treats dry-run
381
+ or missing-key rows as pending in normal release gates, and checks that saved
382
+ metadata does not contain runtime credential values. Use
383
+ `npm run bench:guard:external:evidence:strict` when Mem0/Zep keys have been
384
+ provided; strict mode fails until every runtime-env adapter has a passed live
385
+ bundle.
386
+
387
+ External runs write `external-run-metadata.json` alongside the GuardBench
388
+ summary, manifest, and raw output bundle under
389
+ `benchmarks/output/external/<adapter>/`. The external runner validates the
390
+ emitted bundle with `benchmarks/validate-guardbench-artifacts.mjs` before
391
+ marking the run passed, and separately records adapter conformance so a valid
392
+ low-scoring adapter is distinguished from a malformed adapter. When
393
+ `external-run-metadata.json` is present, the validator also checks it against
394
+ `benchmarks/schemas/guardbench-external-run.schema.json` and verifies any
395
+ recorded SHA-256 artifact hashes against the bundle on disk.
396
+
397
+ For a shareable submission artifact, run `npm run bench:guard:card -- --dir
398
+ <output-dir>`. This writes `guardbench-conformance-card.json` with the subject
399
+ name, run status, score, conformance result, artifact hashes, optional
400
+ external-run metadata hash, and machine provenance. The standalone validator
401
+ checks the card when it is present.
402
+
403
+ For a portable submission directory, run `npm run bench:guard:bundle -- --dir
404
+ <output-dir>`. This creates `submission-bundle/` with the raw GuardBench
405
+ artifacts, conformance card, JSON schemas, validation report, and
406
+ `submission-manifest.json` with SHA-256 hashes for every bundled file.
407
+ Reviewers can run `npm run bench:guard:bundle:verify -- --dir
408
+ <submission-bundle>` to check manifest hashes, bundled schemas, and artifact
409
+ validation from the bundle alone.
410
+
411
+ For benchmark aggregation, run `npm run bench:guard:leaderboard -- --bundle
412
+ <submission-bundle>`. The leaderboard builder verifies each bundle before
413
+ ranking and writes JSON plus Markdown reports under `benchmarks/output/leaderboard/`.
414
+
415
+ Before publishing benchmark artifacts, run `npm run
416
+ bench:guard:publication:verify`. This single benchmark-focused verifier checks
417
+ the adapter registry, default adapter module, adapter self-test report,
418
+ GuardBench manifest/summary/raw artifacts, submission bundle, external dry-run
419
+ matrix, external evidence verification report, leaderboard, and a local
420
+ absolute-path sweep over the public artifact set.
421
+ The verifier validates its own machine-readable report against
422
+ `benchmarks/schemas/guardbench-publication-verification.schema.json` before it
423
+ exits.
424
+
425
+ Before turning the paper into public posts or submissions, run `npm run
426
+ paper:claims`. It validates `docs/paper/claim-register.json` against the
427
+ current paper, README, GuardBench artifacts, publication verifier, and external
428
+ evidence status so pending Mem0/Zep live-score claims cannot slip into public
429
+ copy.
430
+ Run `npm run paper:publication-pack` to verify the ready-to-use arXiv, Hacker
431
+ News, Reddit, X, and LinkedIn drafts in `docs/paper/publication-pack.json`
432
+ before browser-based submission. The X URL reserve is explicit: the first X
433
+ post carries `reservedUrlChars: 24`, and submitted artifact-url targets in
434
+ `browser-launch-results.json` must record the final `artifactUrl`.
435
+ Run `npm run paper:arxiv` to generate a deterministic TeX source package under
436
+ `docs/paper/output/arxiv/`, and `npm run paper:arxiv:verify` to check hashes,
437
+ citation conversion, bibliography coverage, seeded-secret redaction, and local
438
+ absolute-path leakage before arXiv upload.
439
+ Run `npm run paper:arxiv:compile` to record a schema-bound compile report at
440
+ `docs/paper/output/arxiv-compile-report.json`. It attempts `tectonic`,
441
+ `latexmk`, `pdflatex`/`bibtex`, or `uvx tecto` with a local bundle proxy when
442
+ available; `npm run paper:arxiv:compile:strict` stays blocked on hosts without
443
+ supported TeX tooling.
444
+ Run `npm run paper:launch-plan` to verify
445
+ `docs/paper/browser-launch-plan.json`, which maps those drafts to manual
446
+ browser targets, login/captcha expectations, platform-rule checks, source
447
+ URLs, and post-submit URL capture.
448
+ Run `npm run paper:launch-results` to validate
449
+ `docs/paper/browser-launch-results.json`, the post-submit ledger for arXiv,
450
+ Hacker News, Reddit, X, and LinkedIn targets. The normal verifier allows
451
+ pending rows with explicit blockers; `npm run paper:launch-results:strict`
452
+ fails until every target has a submitted, operator-verified public URL.
453
+ Run `npm run paper:bundle` to generate
454
+ `docs/paper/output/submission-bundle/`, a hash-manifested package containing
455
+ paper sources, claim and publication registers, GuardBench outputs, schemas,
456
+ and package metadata. `npm run paper:bundle:verify` checks the manifest and
457
+ file hashes before browser upload.
458
+ Run `npm run release:readiness` for the pending-aware Audrey 1.0 checklist.
459
+ It keeps code/paper readiness separate from publish blockers; `npm run
460
+ release:readiness:strict` fails until the 1.0 version surfaces,
461
+ source-control state, live remote-head verification, Python artifacts, npm
462
+ registry/auth readiness, PyPI publish readiness, arXiv compile proof, browser
463
+ publication URLs, and live Mem0/Zep evidence are complete.
464
+ Run `npm run release:cut:plan` to preview the exact 1.0 version/changelog
465
+ edits across npm, lockfile, MCP, and Python surfaces. `npm run
466
+ release:cut:apply -- --target-version 1.0.0` writes those edits only when the
467
+ final cut is intentional. The generated changelog section is release-note copy,
468
+ not a TODO scaffold; `release:readiness:strict` rejects placeholder changelog
469
+ markers before publication.
470
+ Run `npm run security:audit` before packaging or publishing; the release gates
471
+ call it after artifact verification so production dependency advisories cannot
472
+ slip past the final package check.
473
+
474
+ ## Command Reference
475
+
476
+ ```bash
477
+ # First contact
478
+ npx audrey doctor
479
+ npx audrey demo
480
+
481
+ # MCP setup
482
+ npx audrey install --host codex --dry-run
483
+ npx audrey mcp-config codex
484
+ npx audrey mcp-config generic
485
+ npx audrey hook-config claude-code
486
+ npx audrey install
487
+ npx audrey uninstall
488
+
489
+ # Health and maintenance
490
+ npx audrey status
491
+ npx audrey status --json --fail-on-unhealthy
492
+ npx audrey dream
493
+ npx audrey reembed
494
+
495
+ # Closed-loop visibility
496
+ npx audrey impact
497
+ npx audrey impact --json --window 7 --limit 5
498
+
499
+ # Tool-trace learning
500
+ npx audrey observe-tool --event PostToolUse --tool Bash --outcome failed
501
+ npx audrey promote --dry-run
502
+
503
+ # REST sidecar
504
+ npx audrey serve
505
+ cp .env.docker.example .env # Windows cmd: copy .env.docker.example .env
506
+ # edit AUDREY_API_KEY in .env
507
+ docker compose up -d --build
508
+ ```
509
+
510
+ The Node sidecar defaults to `127.0.0.1:7437`. The Docker image intentionally binds inside the container on `3487`, so Compose requires `AUDREY_API_KEY` in `.env` before startup. Override the published host port with `AUDREY_PUBLISHED_PORT` when using Compose.
511
+
512
+ ## Documentation
513
+
514
+ - [Security policy](SECURITY.md)
515
+ - [Audrey paper outline](docs/AUDREY_PAPER_OUTLINE.md)
516
+ - Public setup, runtime, benchmark, and command guidance is maintained in this README.
517
+
518
+ ## Development
519
+
520
+ ```bash
521
+ npm ci
522
+ npm run release:gate
523
+ python -m unittest discover -s python/tests -v
524
+ npm run python:release:check
525
+ ```
526
+
527
+ `npm test` uses a repo-local Vitest launcher so locked-down Windows temp
528
+ directories do not block test startup. `npm run release:gate:sandbox` remains
529
+ available for hosts that block child-process spawning entirely.
530
+
531
+ ## License
532
+
533
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,50 @@
1
+ {
2
+ "schemaVersion": "1.0.0",
3
+ "suite": "GuardBench adapter self-test",
4
+ "generatedAt": "2026-05-13T23:33:55.959Z",
5
+ "ok": true,
6
+ "adapter": {
7
+ "name": "Example Allow Adapter",
8
+ "path": "benchmarks/adapters/example-allow.mjs",
9
+ "moduleFile": "example-allow.mjs",
10
+ "description": "Credential-free GuardBench adapter example. It always allows and is useful for adapter-loading smoke tests."
11
+ },
12
+ "conformance": {
13
+ "ok": true,
14
+ "adapter": "Example Allow Adapter",
15
+ "requestedAdapter": "Example Allow Adapter",
16
+ "scenarios": 10,
17
+ "expectedScenarios": 10,
18
+ "fullContractPassRate": 0,
19
+ "decisionAccuracy": 0.1,
20
+ "redactionLeaks": 0,
21
+ "failures": []
22
+ },
23
+ "score": {
24
+ "scenarios": 10,
25
+ "fullContractPassRate": 0,
26
+ "decisionAccuracy": 0.1,
27
+ "evidenceRecall": 0,
28
+ "redactionLeaks": 0,
29
+ "latency": {
30
+ "p50Ms": 0.012,
31
+ "p95Ms": 0.049,
32
+ "maxMs": 0.049
33
+ }
34
+ },
35
+ "contract": {
36
+ "expectedAnswersWithheld": true,
37
+ "lowScoreAllowed": true,
38
+ "requiredScenarioRows": 10,
39
+ "requiredResultFields": [
40
+ "decision",
41
+ "riskScore",
42
+ "evidenceIds",
43
+ "recommendedActions",
44
+ "summary",
45
+ "recallErrors"
46
+ ],
47
+ "redactionLeakTolerance": 0
48
+ },
49
+ "failures": []
50
+ }
@@ -0,0 +1,69 @@
1
+ {
2
+ "schemaVersion": "1.0.0",
3
+ "suite": "GuardBench external adapter dry-run matrix",
4
+ "generatedAt": "2026-05-13T23:33:56.533Z",
5
+ "ok": true,
6
+ "registry": "benchmarks/adapters/registry.json",
7
+ "outRoot": "benchmarks/output/external",
8
+ "adapters": [
9
+ {
10
+ "id": "mem0-platform",
11
+ "name": "Mem0 Platform",
12
+ "credentialMode": "runtime-env",
13
+ "requiredEnv": [
14
+ "MEM0_API_KEY"
15
+ ],
16
+ "missingEnv": [
17
+ "MEM0_API_KEY"
18
+ ],
19
+ "status": "dry-run-missing-env",
20
+ "command": [
21
+ "node",
22
+ "benchmarks/guardbench.js",
23
+ "--adapter",
24
+ "benchmarks/adapters/mem0-platform.mjs",
25
+ "--out-dir",
26
+ "benchmarks/output/external/mem0-platform",
27
+ "--check",
28
+ "--json"
29
+ ],
30
+ "validationCommand": [
31
+ "node",
32
+ "benchmarks/validate-guardbench-artifacts.mjs",
33
+ "--dir",
34
+ "benchmarks/output/external/mem0-platform"
35
+ ],
36
+ "metadataPath": "benchmarks/output/external/mem0-platform/external-run-metadata.json"
37
+ },
38
+ {
39
+ "id": "zep-cloud",
40
+ "name": "Zep Cloud",
41
+ "credentialMode": "runtime-env",
42
+ "requiredEnv": [
43
+ "ZEP_API_KEY"
44
+ ],
45
+ "missingEnv": [
46
+ "ZEP_API_KEY"
47
+ ],
48
+ "status": "dry-run-missing-env",
49
+ "command": [
50
+ "node",
51
+ "benchmarks/guardbench.js",
52
+ "--adapter",
53
+ "benchmarks/adapters/zep-cloud.mjs",
54
+ "--out-dir",
55
+ "benchmarks/output/external/zep-cloud",
56
+ "--check",
57
+ "--json"
58
+ ],
59
+ "validationCommand": [
60
+ "node",
61
+ "benchmarks/validate-guardbench-artifacts.mjs",
62
+ "--dir",
63
+ "benchmarks/output/external/zep-cloud"
64
+ ],
65
+ "metadataPath": "benchmarks/output/external/zep-cloud/external-run-metadata.json"
66
+ }
67
+ ],
68
+ "failures": []
69
+ }