audrey 0.23.1 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250)
  1. package/CHANGELOG.md +101 -15
  2. package/LICENSE +21 -21
  3. package/README.md +232 -6
  4. package/SECURITY.md +2 -1
  5. package/benchmarks/adapter-kit.mjs +20 -0
  6. package/benchmarks/adapter-self-test.mjs +166 -0
  7. package/benchmarks/adapters/example-allow.mjs +28 -0
  8. package/benchmarks/adapters/mem0-platform.mjs +267 -0
  9. package/benchmarks/adapters/registry.json +51 -0
  10. package/benchmarks/adapters/zep-cloud.mjs +280 -0
  11. package/benchmarks/baselines.js +169 -0
  12. package/benchmarks/build-leaderboard.mjs +170 -0
  13. package/benchmarks/cases.js +537 -0
  14. package/benchmarks/create-conformance-card.mjs +139 -0
  15. package/benchmarks/create-submission-bundle.mjs +176 -0
  16. package/benchmarks/dry-run-external-adapters.mjs +165 -0
  17. package/benchmarks/guardbench.js +1125 -0
  18. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  19. package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  20. package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  21. package/benchmarks/output/guardbench-conformance-card.json +63 -0
  22. package/benchmarks/output/guardbench-manifest.json +414 -0
  23. package/benchmarks/output/guardbench-raw.json +1271 -0
  24. package/benchmarks/output/guardbench-summary.json +2107 -0
  25. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  26. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  27. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
  28. package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
  29. package/benchmarks/output/submission-bundle/guardbench-raw.json +1271 -0
  30. package/benchmarks/output/submission-bundle/guardbench-summary.json +2107 -0
  31. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
  32. package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
  33. package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
  34. package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
  35. package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
  36. package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
  37. package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
  38. package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
  39. package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
  40. package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +184 -0
  41. package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
  42. package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +249 -0
  43. package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  44. package/benchmarks/output/submission-bundle/validation-report.json +31 -0
  45. package/benchmarks/output/summary.json +2354 -0
  46. package/benchmarks/perf-snapshot.js +304 -0
  47. package/benchmarks/perf.bench.js +161 -0
  48. package/benchmarks/public-paths.mjs +78 -0
  49. package/benchmarks/reference-results.js +70 -0
  50. package/benchmarks/report.js +259 -0
  51. package/benchmarks/run-external-guardbench.mjs +281 -0
  52. package/benchmarks/run.js +682 -0
  53. package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  54. package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  55. package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  56. package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  57. package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  58. package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  59. package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  60. package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  61. package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  62. package/benchmarks/schemas/guardbench-raw.schema.json +184 -0
  63. package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  64. package/benchmarks/schemas/guardbench-summary.schema.json +249 -0
  65. package/benchmarks/snapshots/perf-0.22.2.json +123 -0
  66. package/benchmarks/snapshots/perf-0.23.0.json +123 -0
  67. package/benchmarks/validate-adapter-module.mjs +104 -0
  68. package/benchmarks/validate-adapter-registry.mjs +134 -0
  69. package/benchmarks/validate-adapter-self-test.mjs +96 -0
  70. package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
  71. package/benchmarks/verify-external-evidence.mjs +296 -0
  72. package/benchmarks/verify-publication-artifacts.mjs +286 -0
  73. package/benchmarks/verify-submission-bundle.mjs +167 -0
  74. package/dist/mcp-server/config.d.ts +1 -1
  75. package/dist/mcp-server/config.d.ts.map +1 -1
  76. package/dist/mcp-server/config.js +1 -1
  77. package/dist/mcp-server/config.js.map +1 -1
  78. package/dist/mcp-server/index.d.ts +65 -3
  79. package/dist/mcp-server/index.d.ts.map +1 -1
  80. package/dist/mcp-server/index.js +675 -157
  81. package/dist/mcp-server/index.js.map +1 -1
  82. package/dist/src/action-key.d.ts +9 -0
  83. package/dist/src/action-key.d.ts.map +1 -0
  84. package/dist/src/action-key.js +49 -0
  85. package/dist/src/action-key.js.map +1 -0
  86. package/dist/src/adaptive.js +5 -5
  87. package/dist/src/affect.js +8 -8
  88. package/dist/src/audrey.d.ts +13 -0
  89. package/dist/src/audrey.d.ts.map +1 -1
  90. package/dist/src/audrey.js +68 -3
  91. package/dist/src/audrey.js.map +1 -1
  92. package/dist/src/capsule.js +4 -4
  93. package/dist/src/causal.js +3 -3
  94. package/dist/src/consolidate.js +48 -48
  95. package/dist/src/controller.d.ts +78 -6
  96. package/dist/src/controller.d.ts.map +1 -1
  97. package/dist/src/controller.js +273 -53
  98. package/dist/src/controller.js.map +1 -1
  99. package/dist/src/db.js +172 -172
  100. package/dist/src/decay.js +8 -8
  101. package/dist/src/embedding.d.ts +2 -1
  102. package/dist/src/embedding.d.ts.map +1 -1
  103. package/dist/src/embedding.js +39 -29
  104. package/dist/src/embedding.js.map +1 -1
  105. package/dist/src/encode.js +6 -6
  106. package/dist/src/feedback.d.ts +6 -0
  107. package/dist/src/feedback.d.ts.map +1 -1
  108. package/dist/src/feedback.js +6 -0
  109. package/dist/src/feedback.js.map +1 -1
  110. package/dist/src/forget.js +12 -12
  111. package/dist/src/hybrid-recall.js +9 -9
  112. package/dist/src/impact.js +6 -6
  113. package/dist/src/import.d.ts +3 -3
  114. package/dist/src/import.js +41 -41
  115. package/dist/src/index.d.ts +5 -4
  116. package/dist/src/index.d.ts.map +1 -1
  117. package/dist/src/index.js +3 -3
  118. package/dist/src/index.js.map +1 -1
  119. package/dist/src/interference.js +14 -14
  120. package/dist/src/introspect.js +18 -18
  121. package/dist/src/preflight.d.ts.map +1 -1
  122. package/dist/src/preflight.js +41 -0
  123. package/dist/src/preflight.js.map +1 -1
  124. package/dist/src/promote.js +7 -7
  125. package/dist/src/prompts.js +118 -118
  126. package/dist/src/recall.js +30 -30
  127. package/dist/src/reflexes.d.ts +1 -0
  128. package/dist/src/reflexes.d.ts.map +1 -1
  129. package/dist/src/reflexes.js +3 -0
  130. package/dist/src/reflexes.js.map +1 -1
  131. package/dist/src/rollback.js +4 -4
  132. package/dist/src/routes.d.ts.map +1 -1
  133. package/dist/src/routes.js +71 -2
  134. package/dist/src/routes.js.map +1 -1
  135. package/dist/src/validate.js +25 -25
  136. package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  137. package/docs/MEMORY_BENCHMARKING.md +59 -0
  138. package/docs/PRODUCTION_BACKLOG.md +304 -0
  139. package/docs/paper/00-master.md +48 -0
  140. package/docs/paper/01-introduction.md +27 -0
  141. package/docs/paper/02-related-work.md +47 -0
  142. package/docs/paper/03-problem-definition.md +108 -0
  143. package/docs/paper/04-design.md +164 -0
  144. package/docs/paper/05-guardbench-spec.md +412 -0
  145. package/docs/paper/06-implementation.md +113 -0
  146. package/docs/paper/07-evaluation.md +168 -0
  147. package/docs/paper/08-discussion-limitations.md +61 -0
  148. package/docs/paper/09-conclusion.md +11 -0
  149. package/docs/paper/SUBMISSION_README.md +162 -0
  150. package/docs/paper/appendix-a-demo-transcript.md +114 -0
  151. package/docs/paper/arxiv-compile-report.schema.json +116 -0
  152. package/docs/paper/arxiv-source.schema.json +61 -0
  153. package/docs/paper/audrey-paper-v1.md +1106 -0
  154. package/docs/paper/browser-launch-plan.json +209 -0
  155. package/docs/paper/browser-launch-plan.schema.json +100 -0
  156. package/docs/paper/browser-launch-results.json +86 -0
  157. package/docs/paper/browser-launch-results.schema.json +66 -0
  158. package/docs/paper/claim-register.json +138 -0
  159. package/docs/paper/claim-register.schema.json +81 -0
  160. package/docs/paper/evidence-ledger.md +103 -0
  161. package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  162. package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  163. package/docs/paper/output/arxiv/main.tex +949 -0
  164. package/docs/paper/output/arxiv/references.bib +222 -0
  165. package/docs/paper/output/arxiv-compile-report.json +24 -0
  166. package/docs/paper/output/submission-bundle/LICENSE +21 -0
  167. package/docs/paper/output/submission-bundle/README.md +555 -0
  168. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
  169. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
  170. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
  171. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
  172. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
  173. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1271 -0
  174. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +2107 -0
  175. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
  176. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
  177. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
  178. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
  179. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
  180. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
  181. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
  182. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
  183. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
  184. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
  185. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
  186. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
  187. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
  188. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
  189. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +184 -0
  190. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
  191. package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +249 -0
  192. package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
  193. package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
  194. package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
  195. package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
  196. package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
  197. package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
  198. package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
  199. package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
  200. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
  201. package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
  202. package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
  203. package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
  204. package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
  205. package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
  206. package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
  207. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
  208. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
  209. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
  210. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
  211. package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
  212. package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
  213. package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
  214. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
  215. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
  216. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
  217. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
  218. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
  219. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
  220. package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
  221. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
  222. package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
  223. package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
  224. package/docs/paper/output/submission-bundle/package.json +212 -0
  225. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
  226. package/docs/paper/paper-submission-bundle.schema.json +70 -0
  227. package/docs/paper/publication-pack.json +81 -0
  228. package/docs/paper/publication-pack.schema.json +60 -0
  229. package/docs/paper/references.bib +222 -0
  230. package/package.json +87 -4
  231. package/scripts/audit-release-completion.mjs +362 -0
  232. package/scripts/create-arxiv-source.mjs +362 -0
  233. package/scripts/create-paper-submission-bundle.mjs +210 -0
  234. package/scripts/finalize-release.mjs +526 -0
  235. package/scripts/prepare-release-cut.mjs +269 -0
  236. package/scripts/publish-release-bundle.mjs +209 -0
  237. package/scripts/publish-release-github-api.mjs +429 -0
  238. package/scripts/run-vitest.mjs +34 -0
  239. package/scripts/smoke-cli.js +92 -0
  240. package/scripts/sync-paper-artifacts.mjs +109 -0
  241. package/scripts/verify-arxiv-compile.mjs +440 -0
  242. package/scripts/verify-arxiv-source.mjs +194 -0
  243. package/scripts/verify-browser-launch-plan.mjs +237 -0
  244. package/scripts/verify-browser-launch-results.mjs +285 -0
  245. package/scripts/verify-paper-artifacts.mjs +338 -0
  246. package/scripts/verify-paper-claims.mjs +226 -0
  247. package/scripts/verify-paper-submission-bundle.mjs +207 -0
  248. package/scripts/verify-publication-pack.mjs +196 -0
  249. package/scripts/verify-python-package.py +201 -0
  250. package/scripts/verify-release-readiness.mjs +785 -0
package/CHANGELOG.md CHANGED
@@ -1,24 +1,110 @@
1
1
  # Changelog
2
2
 
3
- ## 0.23.1 - 2026-05-08
3
+ ## 1.0.1 - 2026-05-15
4
4
 
5
- ### Added - Audrey Guard chassis
5
+ ### Honest benchmarking
6
6
 
7
- - Added `MemoryController` as the first orchestration layer for memory-before-action workflows. `beforeAction()` returns `allow` / `warn` / `block` with evidence, reflexes, recommendations, and an optional capsule; `afterAction()` records redacted tool outcomes and turns failures into tool-result memories.
8
- - Added `audrey guard --tool <Tool> "<action>"` with `--json`, `--explain`, `--override`, and `--fail-on-warn`.
9
- - Added `audrey demo --scenario repeated-failure`, a deterministic no-network demo where Audrey records a failed deploy, blocks the repeat attempt, validates the lesson, and prints impact.
10
- - `Audrey.encodeBatch()` now uses provider-level `embedBatch()` and validates the batch before embedding, avoiding N sequential cloud embedding calls for valid batches.
11
- - Recall now surfaces partial vector/FTS failures on the returned result array. Capsules preserve those diagnostics, strict Guard preflights block when recall is degraded, and `/v1/status` / `memory_status` expose the latest recall degradation signal.
12
- - Added `docs/AUDREY_PAPER_OUTLINE.md`, framing Audrey Guard as local-first pre-action memory control for tool-using agents and outlining the GuardBench evaluation plan.
7
+ - **GuardBench pass gate rewritten.** The `passed` check no longer requires Audrey-specific lineage substrings (`"failed before"`, `"recall:"`, `"must-follow"`, etc.) in the subject's `summary`. A scenario passes when the decision matches the expected verdict, no seeded secrets leak, and (for `block`/`warn` scenarios) the subject returns at least one evidence id. The prior phrase-substring gate was structurally biased toward Audrey because only its controller emitted those exact tokens; baselines or external adapters that produced semantically correct decisions could still fail the gate on phrasing alone. The Audrey-style lineage match is preserved as a separate `lineageTextMatched` field per row and `lineageRichness` per system, reported as an informational metric, not the pass gate.
8
+ - Adds `lineageRichness` and `hasEvidenceForDecision` to GuardBench raw + summary schemas; `requiredEvidenceMatched` is kept as a back-compat alias of `hasEvidenceForDecision`.
13
9
 
14
- ### Fixed
10
+ ### Guard runtime
11
+
12
+ - **`MemoryController` no longer hard-blocks repeated failures forever.** A new `failureDecayDays` constructor option defaults to 7: same-action prior failures older than that window are treated as stale and no longer trigger an automatic block. Pass `failureDecayDays: 0` to restore the pre-1.0.1 behavior.
13
+ - Adds `AgentAction.acknowledgePriorFailure` on the `MemoryController` SDK surface. When set, an exact-repeated-failure that would otherwise produce `block` degrades to `warn`. Evidence ids and risk score remain attached so the prior failure still surfaces in the action receipt. A CLI flag exposing this through `audrey guard` will land in a follow-up release.
14
+
15
+ ### Structured errors
16
+
17
+ - `Audrey.validate()` lineage rejections now throw `ValidateLineageError` with a stable `code` (`PREFLIGHT_NOT_FOUND` | `PREFLIGHT_WRONG_TYPE` | `LINEAGE_REJECTED` | `ACTION_KEY_MISMATCH`). `POST /v1/validate` surfaces the same code in the 400 response body so HTTP and MCP callers can branch on the failure shape without parsing the message string. `ValidateLineageError` and `ValidateErrorCode` are exported from the public SDK entry point.
18
+
19
+ ### Documentation
20
+
21
+ - README's GuardBench section caveats the headline number against the mock 64-dim provider, the 5-of-10 expected-block scenario count, and the new evidence-non-empty gate so the "10/10 vs baselines" framing matches the actual contract.
22
+ - README documents `AUDREY_DATA_DIR` per-tenant isolation as a hard requirement (SQLite WAL mode has no advisory lock; two processes in one data dir contend).
23
+ - README dev path notes that `npm run build` must be run before any source-tree CLI subcommand resolves.
24
+ - Paper section reframes `bench:memory:check` as an internal regression suite, not a competitive benchmark, so local stub baselines are not cited as cross-system claims.
25
+ - Personal-env diagnostic artifacts (`gcm-diagnose.log`, scratch `*.log`, `audrey-arxiv-preview.png`) are now excluded from the repo root, and `.gitignore` has been broadened.
26
+
27
+ ## 1.0.0 - 2026-05-13
28
+
29
+ ### Audrey Guard
30
+
31
+ - Ships Audrey Guard as the release-defining loop: receipt-backed `go`,
32
+ `caution`, and `block` decisions before tool use, followed by auditable
33
+ outcome capture through CLI, REST, MCP, and SDK surfaces.
34
+ - Adds Claude Code hook generation and an idempotent hook-apply path so
35
+ `guard --hook --fail-on-warn` can run at `PreToolUse` and post-tool events
36
+ can feed Audrey's redacted trace memory.
37
+ - Binds validation feedback to preflight event ids, evidence ids, and action
38
+ fingerprints so remembered guidance can be audited after use.
39
+
40
+ ### GuardBench And Paper Artifacts
41
+
42
+ - Ships GuardBench, a local comparative benchmark for pre-action memory control
43
+ across Audrey Guard, no-memory, recent-window, vector-only, and FTS-only
44
+ baselines.
45
+ - Adds portable GuardBench bundles, conformance cards, JSON schemas, adapter
46
+ self-tests, leaderboard generation, external adapter dry-runs, and pending
47
+ external evidence reports for Mem0 Platform and Zep Cloud.
48
+ - Ships the Audrey Guard paper source, claim register, publication-pack
49
+ verifier, browser launch plan/results ledger, deterministic arXiv source
50
+ package, local arXiv compile proof, and paper submission bundle.
51
+
52
+ ### Release Controls
53
+
54
+ - Adds pending-aware `release:readiness` and strict `release:readiness:strict`
55
+ gates so code, paper, source control, npm, PyPI, browser publication, and
56
+ external-evidence blockers stay separate.
57
+ - Adds `release:cut:plan` and `release:cut:apply` so npm, lockfile, MCP,
58
+ Python, and changelog version surfaces are cut consistently.
59
+ - Adds production dependency audit coverage to release gates and keeps
60
+ `npm audit --omit=dev --audit-level=moderate` clean.
61
+
62
+ ### Runtime And Client Hardening
63
+
64
+ - `Audrey.encodeBatch()` now calls provider-level `embedBatch()` once per batch
65
+ and writes each episode through the existing `encodeEpisode()` path with the
66
+ precomputed vector.
67
+ - OpenAI embedding batches are chunked by `batchSize` so large batch encodes do
68
+ not turn into one oversized API request.
69
+ - Improves recall degradation reporting across capsules, strict preflights,
70
+ status surfaces, and Guard decisions.
71
+
72
+ ## 0.23.0 - 2026-05-05
73
+
74
+ ### Audrey Guard — memory before action becomes the product loop
75
+
76
+ - Added Audrey Guard as a first-class controller loop: `beforeAction()` checks memory before an agent touches tools, returns a receipt-backed `go` / `caution` / `block` decision, and `afterAction()` records what happened afterward.
77
+ - Added JavaScript SDK exports and `Audrey.beforeAction()` / `Audrey.afterAction()` methods so agent runtimes can use the same loop without going through CLI or REST.
78
+ - Added `POST /v1/guard/before` and `POST /v1/guard/after` REST routes for sidecar agents.
79
+ - Added `memory_guard_before` and `memory_guard_after` MCP tools for hosts that want memory decisions at the tool boundary.
80
+ - Added `npx audrey guard` and `npx audrey guard-after` CLI commands, including JSON output for hooks and automation.
81
+
82
+ ### Release-defining behavior
83
+
84
+ - Guard decisions reuse the existing preflight and reflex machinery without doing two independent recall passes.
85
+ - Guard receipts are stored as `memory_events` rows with guard metadata, evidence ids, reflex ids, preflight decision, warning counts, and redacted tool-trace linkage.
86
+ - `guard-after` now validates evidence feedback before mutating memory, rejects non-guard receipts, and prevents replaying the same receipt to apply duplicate feedback.
87
+ - A failed guarded tool run becomes future memory: the next guard check for the same tool can produce a recent-failure warning and reflex before the agent repeats the mistake.
88
+ - Strict guard mode can block high-severity must-follow memories before risky actions, which is the release's headline "memory firewall" behavior.
89
+
90
+ ### Benchmarks
91
+
92
+ - Added an Agent Guard Loop benchmark suite covering prior tool-failure caution, strict must-follow blocking, receipt replay rejection, and non-guard receipt rejection.
93
+ - Added `npm run bench:memory:guard` for focused guard-loop regression testing.
94
+ - Kept guard-loop cases out of the comparable retrieval/lifecycle aggregate when all suites are run, so the local baseline chart remains honest rather than inflated by no-controller placeholders.
95
+ - Committed a fresh `benchmarks/snapshots/perf-0.23.0.json` performance snapshot and fixed direct snapshot runs so they resolve Audrey's package version without depending on npm-injected environment.
96
+ - Added a CLI smoke script to the release gate and Node CI jobs so `--version`, `doctor --json`, and `demo` are proven before pack dry-run.
97
+ - Included benchmark harness files and snapshots in the npm package so advertised benchmark scripts work from the published tarball.
98
+ - Added a package-lock consistency test so release versions cannot drift between `package.json` and `package-lock.json` again.
99
+
100
+ ### Docs and release posture
15
101
 
16
- - Docker Compose now requires `AUDREY_API_KEY` instead of starting a non-loopback unauthenticated REST sidecar that the server correctly refuses.
17
- - Guard exact-failure matching now redacts before trimming, matches tool names case-insensitively, and includes file scope in the action hash.
18
- - Redaction-aware truncation keeps complete `[REDACTED:*]` markers in long tool errors and output summaries.
19
- - `npm test` and `npm run test:watch` now set a repo-local Vitest temp directory before Vitest starts, avoiding locked-down Windows user-temp failures.
20
- - `npm audit --omit=dev --audit-level=moderate` is clean after refreshing Hono, Zod, and transitive rate-limit packages.
21
- - README benchmark sample values now match `benchmarks/snapshots/perf-0.22.2.json`; the paper evidence ledger was re-checked for the repeated-failure demo line range and live bibliography URLs before release prep.
102
+ - Updated README quick-start, surface tables, and benchmark notes around Audrey Guard.
103
+ - Added `docs/MEMORY_BENCHMARKING.md` to state the release's benchmark policy and map Audrey against LongMemEval, LoCoMo, MemoryAgentBench, StructMemEval, and MemGUI-Bench.
104
+ - Added release design and implementation docs under `docs/superpowers/`.
105
+ - Updated the production backlog to mark the v0.23 controller slice as shipped and to focus the next work on hook installation, external benchmark evidence, batching, and partial recall diagnostics.
106
+ - Bumped JavaScript, MCP CLI, and Python client version surfaces to `0.23.0`.
107
+ - Added the Python 3.9 `eval-type-backport` dependency marker required by Pydantic for Audrey's modern type annotations, and moved Python package metadata to the current setuptools license form.
22
108
 
23
109
  ## 0.22.2 - 2026-05-01
24
110
 
package/LICENSE CHANGED
@@ -1,21 +1,21 @@
1
- MIT License
2
-
3
- Copyright (c) 2026 evilander
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
1
+ MIT License
2
+
3
+ Copyright (c) 2026 evilander
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -27,7 +27,7 @@ Audrey turns those hard-won lessons into a local memory runtime:
27
27
  - `memory_recall` finds durable context by semantic similarity.
28
28
  - `memory_preflight` checks prior failures, risks, rules, and relevant procedures before an action.
29
29
  - `memory_reflexes` converts remembered evidence into trigger-response guidance agents can follow.
30
- - `memory_validate` closes the loop after the action `helpful`, `used`, or `wrong` outcomes feed salience and decay.
30
+ - `memory_validate` closes the loop after the action: `helpful`, `used`, or `wrong` outcomes feed salience and can bind back to the exact preflight event, evidence ids, and Guard action fingerprint.
31
31
  - `memory_dream` consolidates episodes into principles and applies decay.
32
32
  - `audrey impact` and `audrey doctor` tell a human or CI system whether the runtime is doing real work and is actually ready.
33
33
 
@@ -52,7 +52,7 @@ npx audrey guard --tool Bash "npm run deploy"
52
52
  Expected first-run shape:
53
53
 
54
54
  ```text
55
- Audrey Doctor v0.23.1
55
+ Audrey Doctor v1.0.0
56
56
  Store health: not initialized
57
57
  Verdict: ready
58
58
  ```
@@ -75,6 +75,7 @@ Generate raw config blocks:
75
75
  npx audrey mcp-config codex
76
76
  npx audrey mcp-config generic
77
77
  npx audrey mcp-config vscode
78
+ npx audrey hook-config claude-code
78
79
  ```
79
80
 
80
81
  Claude Code can be registered directly:
@@ -84,7 +85,16 @@ npx audrey install
84
85
  claude mcp list
85
86
  ```
86
87
 
87
- All local MCP paths default to local embeddings and one shared SQLite-backed memory directory. Use `AUDREY_DATA_DIR` to isolate projects, tenants, or host identities.
88
+ For memory-before-action hooks, preview with `npx audrey hook-config
89
+ claude-code`, then apply with `npx audrey hook-config claude-code --apply
90
+ --scope project` for `.claude/settings.local.json` or `--scope user` for
91
+ `~/.claude/settings.json`. Audrey merges the hook block into existing settings
92
+ and writes a timestamped backup before changing a non-empty file. The generated
93
+ `PreToolUse` hook runs `audrey guard --hook --fail-on-warn`; the `PostToolUse`
94
+ and `PostToolUseFailure` hooks record redacted tool traces. Verify the active
95
+ hook set inside Claude Code with `/hooks`.
96
+
97
+ All local MCP paths default to local embeddings and one shared SQLite-backed memory directory. **Set a distinct `AUDREY_DATA_DIR` per tenant, agent identity, or concurrent host.** SQLite uses WAL mode without an advisory lock, so two processes sharing a directory will contend on writes. Isolation is a hard requirement for multi-agent setups, not a recommendation.
88
98
 
89
99
  Installer-generated host config does not include provider API keys by default. Prefer setting `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`, or `GEMINI_API_KEY` in the host runtime environment; use `npx audrey install --include-secrets` only if you explicitly accept argv/config exposure.
90
100
 
@@ -121,7 +131,7 @@ Core sidecar tools:
121
131
  | Surface | Status |
122
132
  |---|---|
123
133
  | MCP stdio server | 20 tools plus status/recent/principles resources and briefing/recall/reflection prompts |
124
- | CLI | `doctor`, `demo`, `guard`, `install`, `mcp-config`, `status`, `dream`, `reembed`, `observe-tool`, `promote`, `impact` |
134
+ | CLI | `doctor`, `demo`, `guard`, `install`, `mcp-config`, `hook-config`, `status`, `dream`, `reembed`, `observe-tool`, `promote`, `impact` |
125
135
  | REST API | Hono server with `/health` and `/v1/*` routes |
126
136
  | JavaScript SDK | Direct TypeScript/Node import from `audrey` |
127
137
  | Python client | `pip install audrey-memory`, calls the REST sidecar |
@@ -191,6 +201,9 @@ Release gates used for this package:
191
201
 
192
202
  ```bash
193
203
  npm run release:gate
204
+ npm run python:release:check
205
+ npm run bench:guard:card
206
+ npm run bench:guard:validate
194
207
  npx audrey doctor
195
208
  npx audrey demo
196
209
  ```
@@ -237,7 +250,7 @@ Production controls you still own:
237
250
 
238
251
  ## Benchmarks
239
252
 
240
- Audrey ships two benchmark commands.
253
+ Audrey ships three benchmark families.
241
254
 
242
255
  ### Performance snapshot
243
256
 
@@ -268,6 +281,209 @@ npm run bench:memory # full regression suite (writes JSON + report)
268
281
  npm run bench:memory:check # release gate, exits non-zero on regression
269
282
  ```
270
283
 
284
+ ### GuardBench comparative suite
285
+
286
+ `npm run bench:guard:check` runs Audrey's local GuardBench comparative suite:
287
+ ten pre-action scenarios across Audrey Guard, no-memory, recent-window,
288
+ vector-only, and FTS-only adapters. The scenarios cover exact repeated
289
+ failures, required procedures, changed file scopes, changed commands,
290
+ recovered failures, recall degradation, redaction safety, conflicting
291
+ instructions, and noisy stores. It writes
292
+ `benchmarks/output/guardbench-summary.json`,
293
+ `benchmarks/output/guardbench-manifest.json`, and
294
+ `benchmarks/output/guardbench-raw.json`. The emitted manifest, summary, and raw
295
+ output shapes are validated by JSON schemas under `benchmarks/schemas/`.
296
+
297
+ Latest local result in this checkout: 10/10 scenarios passed, 100% prevention
298
+ rate, 0% false-block rate, 0 raw secret leaks, 0 published artifact leaks in
299
+ the raw-secret sweep, and 2.465ms / 30.791ms
300
+ p50/p95 guard latency under the mock-provider methodology.
301
+
302
+ **Methodology caveats, on purpose.** All numbers above are produced against
303
+ the in-process mock 64-dim embedding provider documented in the run's
304
+ `provenance` block. They characterize Audrey's controller and SQLite path,
305
+ not real-provider end-to-end latency or production false-positive rates. The
306
+ 100% prevention rate is over the 5 GuardBench scenarios that expect a
307
+ `block` decision (the suite is 10 scenarios total, mixed across allow / warn
308
+ / block). Local baseline decision accuracy was: no-memory 10%, recent-window
309
+ 60%, vector-only 40%, and FTS-only 10%; none of the local baselines passed
310
+ the GuardBench decision-plus-evidence contract, which since v1.0.1 requires
311
+ the correct decision plus at least one returned evidence id for `block` /
312
+ `warn` scenarios (Audrey-specific lineage phrasing is no longer required — see
313
+ `CHANGELOG.md#101---2026-05-15`). External-system numbers for Mem0 and Zep
314
+ are explicitly out of scope for this Stage-A artifact; live credentialed
315
+ runs land in a v2 paper after raw evidence bundles publish.
316
+
317
+ ```bash
318
+ npm run bench:guard
319
+ npm run bench:guard:check
320
+ npm run bench:guard:manifest
321
+ npm run bench:guard:validate
322
+ npm run bench:guard:card
323
+ npm run bench:guard:bundle
324
+ npm run bench:guard:bundle:verify
325
+ npm run bench:guard:leaderboard
326
+ npm run bench:guard:adapter-registry:validate
327
+ npm run bench:guard:adapter-module:validate
328
+ npm run bench:guard:adapter-self-test
329
+ npm run bench:guard:adapter-self-test:validate
330
+ npm run bench:guard:publication:verify
331
+ npm run bench:guard:adapter-smoke
332
+ npm run bench:guard:adapter-conformance
333
+ npm run bench:guard:external:dry-run
334
+ npm run bench:guard:mem0 -- --dry-run
335
+ npm run bench:guard:zep -- --dry-run
336
+ node benchmarks/adapter-self-test.mjs --adapter ./path/to/adapter.mjs
337
+ node benchmarks/guardbench.js --adapter ./path/to/adapter.mjs --check
338
+ ```
339
+
340
+ External GuardBench adapters are ESM modules that export one of `default`,
341
+ `adapter`, or `createGuardBenchAdapter()`. The adapter receives scenario seed
342
+ data and the proposed action, but the harness withholds `expectedDecision` and
343
+ `requiredEvidence` until scoring. Start from
344
+ `benchmarks/adapters/example-allow.mjs` when wiring a new system. Adapter
345
+ authors can import `defineGuardBenchAdapter()` and `defineGuardBenchResult()`
346
+ from `benchmarks/adapter-kit.mjs` to validate module shape and decision output
347
+ while developing.
348
+
349
+ The published adapter registry lives at `benchmarks/adapters/registry.json`.
350
+ Run `npm run bench:guard:adapter-registry:validate` to verify registry shape,
351
+ adapter paths, and credential-free module loading.
352
+
353
+ Before running the full self-test, validate the ESM module shape quickly:
354
+
355
+ ```bash
356
+ npm run bench:guard:adapter-module:validate -- --adapter ./path/to/adapter.mjs
357
+ ```
358
+
359
+ Before publishing a new adapter, run `npm run bench:guard:adapter-self-test --
360
+ --adapter ./path/to/adapter.mjs`. The self-test validates the external adapter
361
+ contract and row conformance while explicitly allowing low benchmark scores, so
362
+ authors can separate "valid submission shape" from "competitive GuardBench
363
+ performance." The generated self-test report is validated against
364
+ `benchmarks/schemas/guardbench-adapter-self-test.schema.json`. Reviewers can
365
+ validate a submitted report without rerunning an adapter through `npm run
366
+ bench:guard:adapter-self-test:validate -- --report ./guardbench-adapter-self-test.json`.
367
+
368
+ Audrey ships external adapters for Mem0 Platform and Zep Cloud. Run them only
369
+ with runtime API keys:
370
+
371
+ ```bash
372
+ export MEM0_API_KEY=...   # Windows cmd: set MEM0_API_KEY=...
373
+ npm run bench:guard:mem0
374
+
375
+ export ZEP_API_KEY=...   # Windows cmd: set ZEP_API_KEY=...
376
+ npm run bench:guard:zep
377
+ ```
378
+
379
+ The Zep adapter uses the current REST surface for users, sessions, `memory.add`,
380
+ `graph.search`, and benchmark-user cleanup. If Zep graph ingestion needs more
381
+ time in a live account, set `ZEP_GUARDBENCH_INGEST_DELAY_MS` before the run.
382
+
383
+ Run `npm run bench:guard:external:dry-run` before coordinating credentialed
384
+ runs. It walks the runtime-env adapter registry, writes non-secret
385
+ `external-run-metadata.json` files for each adapter, and reports which runtime
386
+ environment variables are still missing. The external dry-run matrix report is schema-bound by
387
+ `benchmarks/schemas/guardbench-external-dry-run.schema.json` and written to
388
+ `benchmarks/output/external/guardbench-external-dry-run.json`.
389
+
390
+ Run `npm run bench:guard:external:evidence` after dry-runs or live runs to
391
+ write `benchmarks/output/external/guardbench-external-evidence.json`. This
392
+ external evidence verification report is schema-bound by
393
+ `benchmarks/schemas/guardbench-external-evidence.schema.json`, treats dry-run
394
+ or missing-key rows as pending in normal release gates, and checks that saved
395
+ metadata does not contain runtime credential values. Use
396
+ `npm run bench:guard:external:evidence:strict` when Mem0/Zep keys have been
397
+ provided; strict mode fails until every runtime-env adapter has a passed live
398
+ bundle.
399
+
400
+ External runs write `external-run-metadata.json` alongside the GuardBench
401
+ summary, manifest, and raw output bundle under
402
+ `benchmarks/output/external/<adapter>/`. The external runner validates the
403
+ emitted bundle with `benchmarks/validate-guardbench-artifacts.mjs` before
404
+ marking the run passed, and separately records adapter conformance so a valid
405
+ low-scoring adapter is distinguished from a malformed adapter. When
406
+ `external-run-metadata.json` is present, the validator also checks it against
407
+ `benchmarks/schemas/guardbench-external-run.schema.json` and verifies any
408
+ recorded SHA-256 artifact hashes against the bundle on disk.
409
+
410
+ For a shareable submission artifact, run `npm run bench:guard:card -- --dir
411
+ <output-dir>`. This writes `guardbench-conformance-card.json` with the subject
412
+ name, run status, score, conformance result, artifact hashes, optional
413
+ external-run metadata hash, and machine provenance. The standalone validator
414
+ checks the card when it is present.
415
+
416
+ For a portable submission directory, run `npm run bench:guard:bundle -- --dir
417
+ <output-dir>`. This creates `submission-bundle/` with the raw GuardBench
418
+ artifacts, conformance card, JSON schemas, validation report, and
419
+ `submission-manifest.json` with SHA-256 hashes for every bundled file.
420
+ Reviewers can run `npm run bench:guard:bundle:verify -- --dir
421
+ <submission-bundle>` to check manifest hashes, bundled schemas, and artifact
422
+ validation from the bundle alone.
423
+
424
+ For benchmark aggregation, run `npm run bench:guard:leaderboard -- --bundle
425
+ <submission-bundle>`. The leaderboard builder verifies each bundle before
426
+ ranking and writes JSON plus Markdown reports under `benchmarks/output/leaderboard/`.
427
+
428
+ Before publishing benchmark artifacts, run `npm run
429
+ bench:guard:publication:verify`. This single benchmark-focused verifier checks
430
+ the adapter registry, default adapter module, adapter self-test report,
431
+ GuardBench manifest/summary/raw artifacts, submission bundle, external dry-run
432
+ matrix, external evidence verification report, leaderboard, and a local
433
+ absolute-path sweep over the public artifact set.
434
+ The verifier validates its own machine-readable report against
435
+ `benchmarks/schemas/guardbench-publication-verification.schema.json` before it
436
+ exits.
437
+
438
+ Before turning the paper into public posts or submissions, run `npm run
439
+ paper:claims`. It validates `docs/paper/claim-register.json` against the
440
+ current paper, README, GuardBench artifacts, publication verifier, and external
441
+ evidence status so pending Mem0/Zep live-score claims cannot slip into public
442
+ copy.
443
+ Run `npm run paper:publication-pack` to verify the ready-to-use arXiv, Hacker
444
+ News, Reddit, X, and LinkedIn drafts in `docs/paper/publication-pack.json`
445
+ before browser-based submission. The X URL reserve is explicit: the first X
446
+ post carries `reservedUrlChars: 24`, and submitted artifact-url targets in
447
+ `browser-launch-results.json` must record the final `artifactUrl`.
448
+ Run `npm run paper:arxiv` to generate a deterministic TeX source package under
449
+ `docs/paper/output/arxiv/`, and `npm run paper:arxiv:verify` to check hashes,
450
+ citation conversion, bibliography coverage, seeded-secret redaction, and local
451
+ absolute-path leakage before arXiv upload.
452
+ Run `npm run paper:arxiv:compile` to record a schema-bound compile report at
453
+ `docs/paper/output/arxiv-compile-report.json`. It attempts `tectonic`,
454
+ `latexmk`, `pdflatex`/`bibtex`, or `uvx tecto` with a local bundle proxy when
455
+ available; `npm run paper:arxiv:compile:strict` stays blocked on hosts without
456
+ supported TeX tooling.
457
+ Run `npm run paper:launch-plan` to verify
458
+ `docs/paper/browser-launch-plan.json`, which maps those drafts to manual
459
+ browser targets, login/captcha expectations, platform-rule checks, source
460
+ URLs, and post-submit URL capture.
461
+ Run `npm run paper:launch-results` to validate
462
+ `docs/paper/browser-launch-results.json`, the post-submit ledger for arXiv,
463
+ Hacker News, Reddit, X, and LinkedIn targets. The normal verifier allows
464
+ pending rows with explicit blockers; `npm run paper:launch-results:strict`
465
+ fails until every target has a submitted, operator-verified public URL.
466
+ Run `npm run paper:bundle` to generate
467
+ `docs/paper/output/submission-bundle/`, a hash-manifested package containing
468
+ paper sources, claim and publication registers, GuardBench outputs, schemas,
469
+ and package metadata. `npm run paper:bundle:verify` checks the manifest and
470
+ file hashes before browser upload.
471
+ Run `npm run release:readiness` for the pending-aware Audrey 1.0 checklist.
472
+ It keeps code/paper readiness separate from publish blockers; `npm run
473
+ release:readiness:strict` fails until the 1.0 version surfaces,
474
+ source-control state, live remote-head verification, Python artifacts, npm
475
+ registry/auth readiness, PyPI publish readiness, arXiv compile proof, browser
476
+ publication URLs, and live Mem0/Zep evidence are complete.
477
+ Run `npm run release:cut:plan` to preview the exact 1.0 version/changelog
478
+ edits across npm, lockfile, MCP, and Python surfaces. `npm run
479
+ release:cut:apply -- --target-version 1.0.0` writes those edits only when the
480
+ final cut is intentional. The generated changelog section is release-note copy,
481
+ not a TODO scaffold; `release:readiness:strict` rejects placeholder changelog
482
+ markers before publication.
483
+ Run `npm run security:audit` before packaging or publishing; the release gates
484
+ call it after artifact verification so production dependency advisories cannot
485
+ slip past the final package check.
486
+
271
487
  ## Command Reference
272
488
 
273
489
  ```bash
@@ -279,6 +495,7 @@ npx audrey demo
279
495
  npx audrey install --host codex --dry-run
280
496
  npx audrey mcp-config codex
281
497
  npx audrey mcp-config generic
498
+ npx audrey hook-config claude-code
282
499
  npx audrey install
283
500
  npx audrey uninstall
284
501
 
@@ -313,11 +530,20 @@ The Node sidecar defaults to `127.0.0.1:7437`. The Docker image intentionally bi
313
530
 
314
531
  ## Development
315
532
 
533
+ Developer setup runs from source, not from the published tarball, so `npm run build` is required before any CLI subcommand resolves:
534
+
316
535
  ```bash
317
536
  npm ci
537
+ npm run build
538
+ npm test
539
+ ```
540
+
541
+ Once built, the `Quick Start` commands work against the local `dist/` output. The full release gate runs everything CI runs:
542
+
543
+ ```bash
318
544
  npm run release:gate
319
545
  python -m unittest discover -s python/tests -v
320
- python -m build --no-isolation python
546
+ npm run python:release:check
321
547
  ```
322
548
 
323
549
  `npm test` uses a repo-local Vitest launcher so locked-down Windows temp
package/SECURITY.md CHANGED
@@ -6,7 +6,8 @@ Security fixes are best-effort for the current published release line and the cu
6
6
 
7
7
  | Version | Supported |
8
8
  |---|---|
9
- | `0.22.x` | Yes |
9
+ | `0.23.x` | Yes |
10
+ | `0.22.x` | Best effort |
10
11
  | `< 0.22.0` | No |
11
12
 
12
13
  ## Reporting a Vulnerability
@@ -0,0 +1,20 @@
1
+ import { validateGuardBenchAdapter, validateAdapterResult } from './guardbench.js';
2
+
3
/** Version string of the GuardBench adapter contract exposed by this kit. */
export const GUARDBENCH_ADAPTER_CONTRACT_VERSION = '1.0.0';

/**
 * Decision values listed for adapter authors.
 * Frozen so importers cannot mutate the shared array.
 */
export const GUARDBENCH_DECISIONS = Object.freeze([
  'allow',
  'warn',
  'block',
]);

/**
 * Field names listed for a GuardBench adapter result.
 * NOTE(review): presumably the fields `validateAdapterResult` checks; confirm
 * against `guardbench.js`. Frozen so importers cannot mutate the shared array.
 */
export const GUARDBENCH_RESULT_FIELDS = Object.freeze(
  ['decision', 'riskScore', 'evidenceIds', 'recommendedActions', 'summary', 'recallErrors'],
);
13
+
14
/**
 * Validate an adapter object by delegating to `validateGuardBenchAdapter`.
 *
 * The label passed to the validator is `adapter.name`, falling back to
 * `'inline adapter'` when the adapter or its `name` is null or undefined.
 *
 * @param {object} adapter - Candidate GuardBench adapter.
 * @returns {*} Whatever `validateGuardBenchAdapter` returns.
 */
export function defineGuardBenchAdapter(adapter) {
  const label = adapter?.name ?? 'inline adapter';
  return validateGuardBenchAdapter(adapter, label);
}
17
+
18
/**
 * Validate a single adapter result by delegating to `validateAdapterResult`.
 *
 * @param {object} result - Candidate result produced by an adapter.
 * @param {string} [adapterName='adapter'] - Adapter label forwarded to the validator.
 * @param {string} [scenarioId='scenario'] - Scenario label forwarded to the validator.
 * @returns {*} Whatever `validateAdapterResult` returns.
 */
export function defineGuardBenchResult(
  result,
  adapterName = 'adapter',
  scenarioId = 'scenario',
) {
  const validated = validateAdapterResult(result, adapterName, scenarioId);
  return validated;
}