audrey 0.23.1 → 1.0.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +81 -19
- package/LICENSE +21 -21
- package/README.md +209 -5
- package/SECURITY.md +2 -1
- package/benchmarks/adapter-kit.mjs +20 -0
- package/benchmarks/adapter-self-test.mjs +166 -0
- package/benchmarks/adapters/example-allow.mjs +28 -0
- package/benchmarks/adapters/mem0-platform.mjs +267 -0
- package/benchmarks/adapters/registry.json +51 -0
- package/benchmarks/adapters/zep-cloud.mjs +280 -0
- package/benchmarks/baselines.js +169 -0
- package/benchmarks/build-leaderboard.mjs +170 -0
- package/benchmarks/cases.js +537 -0
- package/benchmarks/create-conformance-card.mjs +139 -0
- package/benchmarks/create-submission-bundle.mjs +176 -0
- package/benchmarks/dry-run-external-adapters.mjs +165 -0
- package/benchmarks/guardbench.js +1035 -0
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/guardbench-manifest.json +414 -0
- package/benchmarks/output/guardbench-raw.json +1171 -0
- package/benchmarks/output/guardbench-summary.json +1981 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +63 -0
- package/benchmarks/output/submission-bundle/guardbench-manifest.json +414 -0
- package/benchmarks/output/submission-bundle/guardbench-raw.json +1171 -0
- package/benchmarks/output/submission-bundle/guardbench-summary.json +1981 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +164 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +228 -0
- package/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/benchmarks/output/summary.json +2354 -0
- package/benchmarks/perf-snapshot.js +304 -0
- package/benchmarks/perf.bench.js +161 -0
- package/benchmarks/public-paths.mjs +78 -0
- package/benchmarks/reference-results.js +70 -0
- package/benchmarks/report.js +259 -0
- package/benchmarks/run-external-guardbench.mjs +281 -0
- package/benchmarks/run.js +682 -0
- package/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/benchmarks/schemas/guardbench-raw.schema.json +164 -0
- package/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/benchmarks/schemas/guardbench-summary.schema.json +228 -0
- package/benchmarks/snapshots/perf-0.22.2.json +123 -0
- package/benchmarks/snapshots/perf-0.23.0.json +123 -0
- package/benchmarks/validate-adapter-module.mjs +104 -0
- package/benchmarks/validate-adapter-registry.mjs +134 -0
- package/benchmarks/validate-adapter-self-test.mjs +96 -0
- package/benchmarks/validate-guardbench-artifacts.mjs +343 -0
- package/benchmarks/verify-external-evidence.mjs +296 -0
- package/benchmarks/verify-publication-artifacts.mjs +286 -0
- package/benchmarks/verify-submission-bundle.mjs +167 -0
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +1 -1
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +65 -3
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +675 -157
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts +9 -0
- package/dist/src/action-key.d.ts.map +1 -0
- package/dist/src/action-key.js +49 -0
- package/dist/src/action-key.js.map +1 -0
- package/dist/src/adaptive.js +5 -5
- package/dist/src/affect.js +8 -8
- package/dist/src/audrey.d.ts +3 -0
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +55 -3
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.js +4 -4
- package/dist/src/causal.js +3 -3
- package/dist/src/consolidate.js +48 -48
- package/dist/src/controller.d.ts +61 -5
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +230 -49
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.js +172 -172
- package/dist/src/decay.js +8 -8
- package/dist/src/embedding.d.ts +2 -1
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +39 -29
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.js +6 -6
- package/dist/src/feedback.d.ts +6 -0
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +6 -0
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.js +12 -12
- package/dist/src/hybrid-recall.js +9 -9
- package/dist/src/impact.js +6 -6
- package/dist/src/import.d.ts +3 -3
- package/dist/src/import.js +41 -41
- package/dist/src/index.d.ts +3 -3
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +2 -2
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.js +14 -14
- package/dist/src/introspect.js +18 -18
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +41 -0
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/promote.js +7 -7
- package/dist/src/prompts.js +118 -118
- package/dist/src/recall.js +30 -30
- package/dist/src/reflexes.d.ts +1 -0
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +3 -0
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.js +4 -4
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +67 -1
- package/dist/src/routes.js.map +1 -1
- package/dist/src/validate.js +25 -25
- package/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/MEMORY_BENCHMARKING.md +59 -0
- package/docs/PRODUCTION_BACKLOG.md +304 -0
- package/docs/paper/00-master.md +48 -0
- package/docs/paper/01-introduction.md +27 -0
- package/docs/paper/02-related-work.md +47 -0
- package/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/04-design.md +164 -0
- package/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/06-implementation.md +113 -0
- package/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/claim-register.json +138 -0
- package/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/LICENSE +21 -0
- package/docs/paper/output/submission-bundle/README.md +533 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +50 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +56 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +63 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-manifest.json +414 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +1171 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +1981 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +93 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +7 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +131 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +31 -0
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +2354 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-registry.schema.json +69 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-adapter-self-test.schema.json +156 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-conformance-card.schema.json +184 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-dry-run.schema.json +74 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-evidence.schema.json +108 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-external-run.schema.json +160 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-leaderboard.schema.json +179 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-manifest.schema.json +213 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-publication-verification.schema.json +47 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +164 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-submission-manifest.schema.json +151 -0
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +228 -0
- package/docs/paper/output/submission-bundle/docs/AUDREY_PAPER_OUTLINE.md +175 -0
- package/docs/paper/output/submission-bundle/docs/paper/00-master.md +48 -0
- package/docs/paper/output/submission-bundle/docs/paper/01-introduction.md +27 -0
- package/docs/paper/output/submission-bundle/docs/paper/02-related-work.md +47 -0
- package/docs/paper/output/submission-bundle/docs/paper/03-problem-definition.md +108 -0
- package/docs/paper/output/submission-bundle/docs/paper/04-design.md +164 -0
- package/docs/paper/output/submission-bundle/docs/paper/05-guardbench-spec.md +412 -0
- package/docs/paper/output/submission-bundle/docs/paper/06-implementation.md +113 -0
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +168 -0
- package/docs/paper/output/submission-bundle/docs/paper/08-discussion-limitations.md +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/09-conclusion.md +11 -0
- package/docs/paper/output/submission-bundle/docs/paper/SUBMISSION_README.md +162 -0
- package/docs/paper/output/submission-bundle/docs/paper/appendix-a-demo-transcript.md +114 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-compile-report.schema.json +116 -0
- package/docs/paper/output/submission-bundle/docs/paper/arxiv-source.schema.json +61 -0
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +1106 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.json +209 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-plan.schema.json +100 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.json +86 -0
- package/docs/paper/output/submission-bundle/docs/paper/browser-launch-results.schema.json +66 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.json +138 -0
- package/docs/paper/output/submission-bundle/docs/paper/claim-register.schema.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +103 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/README-arxiv.txt +8 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +41 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +949 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/references.bib +222 -0
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +24 -0
- package/docs/paper/output/submission-bundle/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.json +81 -0
- package/docs/paper/output/submission-bundle/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/output/submission-bundle/docs/paper/references.bib +222 -0
- package/docs/paper/output/submission-bundle/package.json +212 -0
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +379 -0
- package/docs/paper/paper-submission-bundle.schema.json +70 -0
- package/docs/paper/publication-pack.json +81 -0
- package/docs/paper/publication-pack.schema.json +60 -0
- package/docs/paper/references.bib +222 -0
- package/package.json +87 -4
- package/scripts/audit-release-completion.mjs +362 -0
- package/scripts/create-arxiv-source.mjs +362 -0
- package/scripts/create-paper-submission-bundle.mjs +210 -0
- package/scripts/finalize-release.mjs +526 -0
- package/scripts/prepare-release-cut.mjs +269 -0
- package/scripts/publish-release-bundle.mjs +209 -0
- package/scripts/publish-release-github-api.mjs +429 -0
- package/scripts/run-vitest.mjs +34 -0
- package/scripts/smoke-cli.js +72 -0
- package/scripts/sync-paper-artifacts.mjs +109 -0
- package/scripts/verify-arxiv-compile.mjs +440 -0
- package/scripts/verify-arxiv-source.mjs +194 -0
- package/scripts/verify-browser-launch-plan.mjs +237 -0
- package/scripts/verify-browser-launch-results.mjs +285 -0
- package/scripts/verify-paper-artifacts.mjs +338 -0
- package/scripts/verify-paper-claims.mjs +226 -0
- package/scripts/verify-paper-submission-bundle.mjs +207 -0
- package/scripts/verify-publication-pack.mjs +196 -0
- package/scripts/verify-python-package.py +201 -0
- package/scripts/verify-release-readiness.mjs +741 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,24 +1,86 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
-
## 0.
|
|
4
|
-
|
|
5
|
-
###
|
|
6
|
-
|
|
7
|
-
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
-
|
|
3
|
+
## 1.0.0 - 2026-05-13
|
|
4
|
+
|
|
5
|
+
### Audrey Guard
|
|
6
|
+
|
|
7
|
+
- Ships Audrey Guard as the release-defining loop: receipt-backed `go`,
|
|
8
|
+
`caution`, and `block` decisions before tool use, followed by auditable
|
|
9
|
+
outcome capture through CLI, REST, MCP, and SDK surfaces.
|
|
10
|
+
- Adds Claude Code hook generation and an idempotent hook-apply path so
|
|
11
|
+
`guard --hook --fail-on-warn` can run at `PreToolUse` and post-tool events
|
|
12
|
+
can feed Audrey's redacted trace memory.
|
|
13
|
+
- Binds validation feedback to preflight event ids, evidence ids, and action
|
|
14
|
+
fingerprints so remembered guidance can be audited after use.
|
|
15
|
+
|
|
16
|
+
### GuardBench And Paper Artifacts
|
|
17
|
+
|
|
18
|
+
- Ships GuardBench, a local comparative benchmark for pre-action memory control
|
|
19
|
+
across Audrey Guard, no-memory, recent-window, vector-only, and FTS-only
|
|
20
|
+
baselines.
|
|
21
|
+
- Adds portable GuardBench bundles, conformance cards, JSON schemas, adapter
|
|
22
|
+
self-tests, leaderboard generation, external adapter dry-runs, and pending
|
|
23
|
+
external evidence reports for Mem0 Platform and Zep Cloud.
|
|
24
|
+
- Ships the Audrey Guard paper source, claim register, publication-pack
|
|
25
|
+
verifier, browser launch plan/results ledger, deterministic arXiv source
|
|
26
|
+
package, local arXiv compile proof, and paper submission bundle.
|
|
27
|
+
|
|
28
|
+
### Release Controls
|
|
29
|
+
|
|
30
|
+
- Adds pending-aware `release:readiness` and strict `release:readiness:strict`
|
|
31
|
+
gates so code, paper, source control, npm, PyPI, browser publication, and
|
|
32
|
+
external-evidence blockers stay separate.
|
|
33
|
+
- Adds `release:cut:plan` and `release:cut:apply` so npm, lockfile, MCP,
|
|
34
|
+
Python, and changelog version surfaces are cut consistently.
|
|
35
|
+
- Adds production dependency audit coverage to release gates and keeps
|
|
36
|
+
`npm audit --omit=dev --audit-level=moderate` clean.
|
|
37
|
+
|
|
38
|
+
### Runtime And Client Hardening
|
|
39
|
+
|
|
40
|
+
- `Audrey.encodeBatch()` now calls provider-level `embedBatch()` once per batch
|
|
41
|
+
and writes each episode through the existing `encodeEpisode()` path with the
|
|
42
|
+
precomputed vector.
|
|
43
|
+
- OpenAI embedding batches are chunked by `batchSize` so large batch encodes do
|
|
44
|
+
not turn into one oversized API request.
|
|
45
|
+
- Improves recall degradation reporting across capsules, strict preflights,
|
|
46
|
+
status surfaces, and Guard decisions.
|
|
47
|
+
|
|
48
|
+
## 0.23.0 - 2026-05-05
|
|
49
|
+
|
|
50
|
+
### Audrey Guard — memory before action becomes the product loop
|
|
51
|
+
|
|
52
|
+
- Added Audrey Guard as a first-class controller loop: `beforeAction()` checks memory before an agent touches tools, returns a receipt-backed `go` / `caution` / `block` decision, and `afterAction()` records what happened afterward.
|
|
53
|
+
- Added JavaScript SDK exports and `Audrey.beforeAction()` / `Audrey.afterAction()` methods so agent runtimes can use the same loop without going through CLI or REST.
|
|
54
|
+
- Added `POST /v1/guard/before` and `POST /v1/guard/after` REST routes for sidecar agents.
|
|
55
|
+
- Added `memory_guard_before` and `memory_guard_after` MCP tools for hosts that want memory decisions at the tool boundary.
|
|
56
|
+
- Added `npx audrey guard` and `npx audrey guard-after` CLI commands, including JSON output for hooks and automation.
|
|
57
|
+
|
|
58
|
+
### Release-defining behavior
|
|
59
|
+
|
|
60
|
+
- Guard decisions reuse the existing preflight and reflex machinery without doing two independent recall passes.
|
|
61
|
+
- Guard receipts are stored as `memory_events` rows with guard metadata, evidence ids, reflex ids, preflight decision, warning counts, and redacted tool-trace linkage.
|
|
62
|
+
- `guard-after` now validates evidence feedback before mutating memory, rejects non-guard receipts, and prevents replaying the same receipt to apply duplicate feedback.
|
|
63
|
+
- A failed guarded tool run becomes future memory: the next guard check for the same tool can produce a recent-failure warning and reflex before the agent repeats the mistake.
|
|
64
|
+
- Strict guard mode can block high-severity must-follow memories before risky actions, which is the release's headline "memory firewall" behavior.
|
|
65
|
+
|
|
66
|
+
### Benchmarks
|
|
67
|
+
|
|
68
|
+
- Added an Agent Guard Loop benchmark suite covering prior tool-failure caution, strict must-follow blocking, receipt replay rejection, and non-guard receipt rejection.
|
|
69
|
+
- Added `npm run bench:memory:guard` for focused guard-loop regression testing.
|
|
70
|
+
- Kept guard-loop cases out of the comparable retrieval/lifecycle aggregate when all suites are run, so the local baseline chart remains honest rather than inflated by no-controller placeholders.
|
|
71
|
+
- Committed a fresh `benchmarks/snapshots/perf-0.23.0.json` performance snapshot and fixed direct snapshot runs so they resolve Audrey's package version without depending on npm-injected environment.
|
|
72
|
+
- Added a CLI smoke script to the release gate and Node CI jobs so `--version`, `doctor --json`, and `demo` are proven before pack dry-run.
|
|
73
|
+
- Included benchmark harness files and snapshots in the npm package so advertised benchmark scripts work from the published tarball.
|
|
74
|
+
- Added a package-lock consistency test so release versions cannot drift between `package.json` and `package-lock.json` again.
|
|
75
|
+
|
|
76
|
+
### Docs and release posture
|
|
77
|
+
|
|
78
|
+
- Updated README quick-start, surface tables, and benchmark notes around Audrey Guard.
|
|
79
|
+
- Added `docs/MEMORY_BENCHMARKING.md` to state the release's benchmark policy and map Audrey against LongMemEval, LoCoMo, MemoryAgentBench, StructMemEval, and MemGUI-Bench.
|
|
80
|
+
- Added release design and implementation docs under `docs/superpowers/`.
|
|
81
|
+
- Updated the production backlog to mark the v0.23 controller slice as shipped and to focus the next work on hook installation, external benchmark evidence, batching, and partial recall diagnostics.
|
|
82
|
+
- Bumped JavaScript, MCP CLI, and Python client version surfaces to `0.23.0`.
|
|
83
|
+
- Added the Python 3.9 `eval-type-backport` dependency marker required by Pydantic for Audrey's modern type annotations, and moved Python package metadata to the current setuptools license form.
|
|
22
84
|
|
|
23
85
|
## 0.22.2 - 2026-05-01
|
|
24
86
|
|
package/LICENSE
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
MIT License
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2026 evilander
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 evilander
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -27,7 +27,7 @@ Audrey turns those hard-won lessons into a local memory runtime:
|
|
|
27
27
|
- `memory_recall` finds durable context by semantic similarity.
|
|
28
28
|
- `memory_preflight` checks prior failures, risks, rules, and relevant procedures before an action.
|
|
29
29
|
- `memory_reflexes` converts remembered evidence into trigger-response guidance agents can follow.
|
|
30
|
-
- `memory_validate` closes the loop after the action
|
|
30
|
+
- `memory_validate` closes the loop after the action: `helpful`, `used`, or `wrong` outcomes feed salience and can bind back to the exact preflight event, evidence ids, and Guard action fingerprint.
|
|
31
31
|
- `memory_dream` consolidates episodes into principles and applies decay.
|
|
32
32
|
- `audrey impact` and `audrey doctor` tell a human or CI system whether the runtime is doing real work and is actually ready.
|
|
33
33
|
|
|
@@ -52,7 +52,7 @@ npx audrey guard --tool Bash "npm run deploy"
|
|
|
52
52
|
Expected first-run shape:
|
|
53
53
|
|
|
54
54
|
```text
|
|
55
|
-
Audrey Doctor
|
|
55
|
+
Audrey Doctor v1.0.0
|
|
56
56
|
Store health: not initialized
|
|
57
57
|
Verdict: ready
|
|
58
58
|
```
|
|
@@ -75,6 +75,7 @@ Generate raw config blocks:
|
|
|
75
75
|
npx audrey mcp-config codex
|
|
76
76
|
npx audrey mcp-config generic
|
|
77
77
|
npx audrey mcp-config vscode
|
|
78
|
+
npx audrey hook-config claude-code
|
|
78
79
|
```
|
|
79
80
|
|
|
80
81
|
Claude Code can be registered directly:
|
|
@@ -84,6 +85,15 @@ npx audrey install
|
|
|
84
85
|
claude mcp list
|
|
85
86
|
```
|
|
86
87
|
|
|
88
|
+
For memory-before-action hooks, preview with `npx audrey hook-config
|
|
89
|
+
claude-code`, then apply with `npx audrey hook-config claude-code --apply
|
|
90
|
+
--scope project` for `.claude/settings.local.json` or `--scope user` for
|
|
91
|
+
`~/.claude/settings.json`. Audrey merges the hook block into existing settings
|
|
92
|
+
and writes a timestamped backup before changing a non-empty file. The generated
|
|
93
|
+
`PreToolUse` hook runs `audrey guard --hook --fail-on-warn`; the `PostToolUse`
|
|
94
|
+
and `PostToolUseFailure` hooks record redacted tool traces. Verify the active
|
|
95
|
+
hook set inside Claude Code with `/hooks`.
|
|
96
|
+
|
|
87
97
|
All local MCP paths default to local embeddings and one shared SQLite-backed memory directory. Use `AUDREY_DATA_DIR` to isolate projects, tenants, or host identities.
|
|
88
98
|
|
|
89
99
|
Installer-generated host config does not include provider API keys by default. Prefer setting `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`, or `GEMINI_API_KEY` in the host runtime environment; use `npx audrey install --include-secrets` only if you explicitly accept argv/config exposure.
|
|
@@ -121,7 +131,7 @@ Core sidecar tools:
|
|
|
121
131
|
| Surface | Status |
|
|
122
132
|
|---|---|
|
|
123
133
|
| MCP stdio server | 20 tools plus status/recent/principles resources and briefing/recall/reflection prompts |
|
|
124
|
-
| CLI | `doctor`, `demo`, `guard`, `install`, `mcp-config`, `status`, `dream`, `reembed`, `observe-tool`, `promote`, `impact` |
|
|
134
|
+
| CLI | `doctor`, `demo`, `guard`, `install`, `mcp-config`, `hook-config`, `status`, `dream`, `reembed`, `observe-tool`, `promote`, `impact` |
|
|
125
135
|
| REST API | Hono server with `/health` and `/v1/*` routes |
|
|
126
136
|
| JavaScript SDK | Direct TypeScript/Node import from `audrey` |
|
|
127
137
|
| Python client | `pip install audrey-memory`, calls the REST sidecar |
|
|
@@ -191,6 +201,9 @@ Release gates used for this package:
|
|
|
191
201
|
|
|
192
202
|
```bash
|
|
193
203
|
npm run release:gate
|
|
204
|
+
npm run python:release:check
|
|
205
|
+
npm run bench:guard:card
|
|
206
|
+
npm run bench:guard:validate
|
|
194
207
|
npx audrey doctor
|
|
195
208
|
npx audrey demo
|
|
196
209
|
```
|
|
@@ -237,7 +250,7 @@ Production controls you still own:
|
|
|
237
250
|
|
|
238
251
|
## Benchmarks
|
|
239
252
|
|
|
240
|
-
Audrey ships
|
|
253
|
+
Audrey ships three benchmark families.
|
|
241
254
|
|
|
242
255
|
### Performance snapshot
|
|
243
256
|
|
|
@@ -268,6 +281,196 @@ npm run bench:memory # full regression suite (writes JSON + report)
|
|
|
268
281
|
npm run bench:memory:check # release gate, exits non-zero on regression
|
|
269
282
|
```
|
|
270
283
|
|
|
284
|
+
### GuardBench comparative suite
|
|
285
|
+
|
|
286
|
+
`npm run bench:guard:check` runs Audrey's local GuardBench comparative suite:
|
|
287
|
+
ten pre-action scenarios across Audrey Guard, no-memory, recent-window,
|
|
288
|
+
vector-only, and FTS-only adapters. The scenarios cover exact repeated
|
|
289
|
+
failures, required procedures, changed file scopes, changed commands,
|
|
290
|
+
recovered failures, recall degradation, redaction safety, conflicting
|
|
291
|
+
instructions, and noisy stores. It writes
|
|
292
|
+
`benchmarks/output/guardbench-summary.json`,
|
|
293
|
+
`benchmarks/output/guardbench-manifest.json`, and
|
|
294
|
+
`benchmarks/output/guardbench-raw.json`. The emitted manifest, summary, and raw
|
|
295
|
+
output shapes are validated by JSON schemas under `benchmarks/schemas/`.
|
|
296
|
+
|
|
297
|
+
Latest local result in this checkout: 10/10 scenarios passed, 100% prevention
|
|
298
|
+
rate, 0% false-block rate, 0 raw secret leaks, 0 published artifact leaks in
|
|
299
|
+
the raw-secret sweep, and 3.214ms / 21.395ms
|
|
300
|
+
p50/p95 guard latency under the mock-provider methodology. Local baseline
|
|
301
|
+
decision accuracy was: no-memory 10%, recent-window 60%, vector-only 40%, and
|
|
302
|
+
FTS-only 10%; none passed the full GuardBench decision-plus-evidence contract.
|
|
303
|
+
|
|
304
|
+
```bash
|
|
305
|
+
npm run bench:guard
|
|
306
|
+
npm run bench:guard:check
|
|
307
|
+
npm run bench:guard:manifest
|
|
308
|
+
npm run bench:guard:validate
|
|
309
|
+
npm run bench:guard:card
|
|
310
|
+
npm run bench:guard:bundle
|
|
311
|
+
npm run bench:guard:bundle:verify
|
|
312
|
+
npm run bench:guard:leaderboard
|
|
313
|
+
npm run bench:guard:adapter-registry:validate
|
|
314
|
+
npm run bench:guard:adapter-module:validate
|
|
315
|
+
npm run bench:guard:adapter-self-test
|
|
316
|
+
npm run bench:guard:adapter-self-test:validate
|
|
317
|
+
npm run bench:guard:publication:verify
|
|
318
|
+
npm run bench:guard:adapter-smoke
|
|
319
|
+
npm run bench:guard:adapter-conformance
|
|
320
|
+
npm run bench:guard:external:dry-run
|
|
321
|
+
npm run bench:guard:mem0 -- --dry-run
|
|
322
|
+
npm run bench:guard:zep -- --dry-run
|
|
323
|
+
node benchmarks/adapter-self-test.mjs --adapter ./path/to/adapter.mjs
|
|
324
|
+
node benchmarks/guardbench.js --adapter ./path/to/adapter.mjs --check
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
External GuardBench adapters are ESM modules that export either `default`,
|
|
328
|
+
`adapter`, or `createGuardBenchAdapter()`. The adapter receives scenario seed
|
|
329
|
+
data and the proposed action, but the harness withholds `expectedDecision` and
|
|
330
|
+
`requiredEvidence` until scoring. Start from
|
|
331
|
+
`benchmarks/adapters/example-allow.mjs` when wiring a new system. Adapter
|
|
332
|
+
authors can import `defineGuardBenchAdapter()` and `defineGuardBenchResult()`
|
|
333
|
+
from `benchmarks/adapter-kit.mjs` to validate module shape and decision output
|
|
334
|
+
while developing.
|
|
335
|
+
|
|
336
|
+
The published adapter registry lives at `benchmarks/adapters/registry.json`.
|
|
337
|
+
Run `npm run bench:guard:adapter-registry:validate` to verify registry shape,
|
|
338
|
+
adapter paths, and credential-free module loading.
|
|
339
|
+
|
|
340
|
+
Before running the full self-test, validate the ESM module shape quickly:
|
|
341
|
+
|
|
342
|
+
```bash
|
|
343
|
+
npm run bench:guard:adapter-module:validate -- --adapter ./path/to/adapter.mjs
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
Before publishing a new adapter, run `npm run bench:guard:adapter-self-test --
|
|
347
|
+
--adapter ./path/to/adapter.mjs`. The self-test validates the external adapter
|
|
348
|
+
contract and row conformance while explicitly allowing low benchmark scores, so
|
|
349
|
+
authors can separate "valid submission shape" from "competitive GuardBench
|
|
350
|
+
performance." The generated self-test report is validated against
|
|
351
|
+
`benchmarks/schemas/guardbench-adapter-self-test.schema.json`. Reviewers can
|
|
352
|
+
validate a submitted report without rerunning an adapter through `npm run
|
|
353
|
+
bench:guard:adapter-self-test:validate -- --report ./guardbench-adapter-self-test.json`.
|
|
354
|
+
|
|
355
|
+
Audrey ships external adapters for Mem0 Platform and Zep Cloud. Run them only
|
|
356
|
+
with runtime API keys:
|
|
357
|
+
|
|
358
|
+
```bash
|
|
359
|
+
export MEM0_API_KEY=...   # Windows cmd.exe: set MEM0_API_KEY=...
|
|
360
|
+
npm run bench:guard:mem0
|
|
361
|
+
|
|
362
|
+
export ZEP_API_KEY=...   # Windows cmd.exe: set ZEP_API_KEY=...
|
|
363
|
+
npm run bench:guard:zep
|
|
364
|
+
```
|
|
365
|
+
|
|
366
|
+
The Zep adapter uses the current REST surface for users, sessions, `memory.add`,
|
|
367
|
+
`graph.search`, and benchmark-user cleanup. If Zep graph ingestion needs more
|
|
368
|
+
time in a live account, set `ZEP_GUARDBENCH_INGEST_DELAY_MS` before the run.
|
|
369
|
+
|
|
370
|
+
Run `npm run bench:guard:external:dry-run` before coordinating credentialed
|
|
371
|
+
runs. It walks the runtime-env adapter registry, writes non-secret
|
|
372
|
+
`external-run-metadata.json` files for each adapter, and reports which runtime
|
|
373
|
+
environment variables are still missing. The external dry-run matrix report is schema-bound by
|
|
374
|
+
`benchmarks/schemas/guardbench-external-dry-run.schema.json` and written to
|
|
375
|
+
`benchmarks/output/external/guardbench-external-dry-run.json`.
|
|
376
|
+
|
|
377
|
+
Run `npm run bench:guard:external:evidence` after dry-runs or live runs to
|
|
378
|
+
write `benchmarks/output/external/guardbench-external-evidence.json`. This
|
|
379
|
+
external evidence verification report is schema-bound by
|
|
380
|
+
`benchmarks/schemas/guardbench-external-evidence.schema.json`, treats dry-run
|
|
381
|
+
or missing-key rows as pending in normal release gates, and checks that saved
|
|
382
|
+
metadata does not contain runtime credential values. Use
|
|
383
|
+
`npm run bench:guard:external:evidence:strict` when Mem0/Zep keys have been
|
|
384
|
+
provided; strict mode fails until every runtime-env adapter has a passed live
|
|
385
|
+
bundle.
|
|
386
|
+
|
|
387
|
+
External runs write `external-run-metadata.json` alongside the GuardBench
|
|
388
|
+
summary, manifest, and raw output bundle under
|
|
389
|
+
`benchmarks/output/external/<adapter>/`. The external runner validates the
|
|
390
|
+
emitted bundle with `benchmarks/validate-guardbench-artifacts.mjs` before
|
|
391
|
+
marking the run passed, and separately records adapter conformance so a valid
|
|
392
|
+
low-scoring adapter is distinguished from a malformed adapter. When
|
|
393
|
+
`external-run-metadata.json` is present, the validator also checks it against
|
|
394
|
+
`benchmarks/schemas/guardbench-external-run.schema.json` and verifies any
|
|
395
|
+
recorded SHA-256 artifact hashes against the bundle on disk.
|
|
396
|
+
|
|
397
|
+
For a shareable submission artifact, run `npm run bench:guard:card -- --dir
|
|
398
|
+
<output-dir>`. This writes `guardbench-conformance-card.json` with the subject
|
|
399
|
+
name, run status, score, conformance result, artifact hashes, optional
|
|
400
|
+
external-run metadata hash, and machine provenance. The standalone validator
|
|
401
|
+
checks the card when it is present.
|
|
402
|
+
|
|
403
|
+
For a portable submission directory, run `npm run bench:guard:bundle -- --dir
|
|
404
|
+
<output-dir>`. This creates `submission-bundle/` with the raw GuardBench
|
|
405
|
+
artifacts, conformance card, JSON schemas, validation report, and
|
|
406
|
+
`submission-manifest.json` with SHA-256 hashes for every bundled file.
|
|
407
|
+
Reviewers can run `npm run bench:guard:bundle:verify -- --dir
|
|
408
|
+
<submission-bundle>` to check manifest hashes, bundled schemas, and artifact
|
|
409
|
+
validation from the bundle alone.
|
|
410
|
+
|
|
411
|
+
For benchmark aggregation, run `npm run bench:guard:leaderboard -- --bundle
|
|
412
|
+
<submission-bundle>`. The leaderboard builder verifies each bundle before
|
|
413
|
+
ranking and writes JSON plus Markdown reports under `benchmarks/output/leaderboard/`.
|
|
414
|
+
|
|
415
|
+
Before publishing benchmark artifacts, run `npm run
|
|
416
|
+
bench:guard:publication:verify`. This single benchmark-focused verifier checks
|
|
417
|
+
the adapter registry, default adapter module, adapter self-test report,
|
|
418
|
+
GuardBench manifest/summary/raw artifacts, submission bundle, external dry-run
|
|
419
|
+
matrix, external evidence verification report, leaderboard, and a local
|
|
420
|
+
absolute-path sweep over the public artifact set.
|
|
421
|
+
The verifier validates its own machine-readable report against
|
|
422
|
+
`benchmarks/schemas/guardbench-publication-verification.schema.json` before it
|
|
423
|
+
exits.
|
|
424
|
+
|
|
425
|
+
Before turning the paper into public posts or submissions, run `npm run
|
|
426
|
+
paper:claims`. It validates `docs/paper/claim-register.json` against the
|
|
427
|
+
current paper, README, GuardBench artifacts, publication verifier, and external
|
|
428
|
+
evidence status so pending Mem0/Zep live-score claims cannot slip into public
|
|
429
|
+
copy.
|
|
430
|
+
Run `npm run paper:publication-pack` to verify the ready-to-use arXiv, Hacker
|
|
431
|
+
News, Reddit, X, and LinkedIn drafts in `docs/paper/publication-pack.json`
|
|
432
|
+
before browser-based submission. The X URL reserve is explicit: the first X
|
|
433
|
+
post carries `reservedUrlChars: 24`, and submitted artifact-url targets in
|
|
434
|
+
`browser-launch-results.json` must record the final `artifactUrl`.
|
|
435
|
+
Run `npm run paper:arxiv` to generate a deterministic TeX source package under
|
|
436
|
+
`docs/paper/output/arxiv/`, and `npm run paper:arxiv:verify` to check hashes,
|
|
437
|
+
citation conversion, bibliography coverage, seeded-secret redaction, and local
|
|
438
|
+
absolute-path leakage before arXiv upload.
|
|
439
|
+
Run `npm run paper:arxiv:compile` to record a schema-bound compile report at
|
|
440
|
+
`docs/paper/output/arxiv-compile-report.json`. It attempts `tectonic`,
|
|
441
|
+
`latexmk`, `pdflatex`/`bibtex`, or `uvx tecto` with a local bundle proxy when
|
|
442
|
+
available; `npm run paper:arxiv:compile:strict` stays blocked on hosts without
|
|
443
|
+
supported TeX tooling.
|
|
444
|
+
Run `npm run paper:launch-plan` to verify
|
|
445
|
+
`docs/paper/browser-launch-plan.json`, which maps those drafts to manual
|
|
446
|
+
browser targets, login/captcha expectations, platform-rule checks, source
|
|
447
|
+
URLs, and post-submit URL capture.
|
|
448
|
+
Run `npm run paper:launch-results` to validate
|
|
449
|
+
`docs/paper/browser-launch-results.json`, the post-submit ledger for arXiv,
|
|
450
|
+
Hacker News, Reddit, X, and LinkedIn targets. The normal verifier allows
|
|
451
|
+
pending rows with explicit blockers; `npm run paper:launch-results:strict`
|
|
452
|
+
fails until every target has a submitted, operator-verified public URL.
|
|
453
|
+
Run `npm run paper:bundle` to generate
|
|
454
|
+
`docs/paper/output/submission-bundle/`, a hash-manifested package containing
|
|
455
|
+
paper sources, claim and publication registers, GuardBench outputs, schemas,
|
|
456
|
+
and package metadata. `npm run paper:bundle:verify` checks the manifest and
|
|
457
|
+
file hashes before browser upload.
|
|
458
|
+
Run `npm run release:readiness` for the pending-aware Audrey 1.0 checklist.
|
|
459
|
+
It keeps code/paper readiness separate from publish blockers; `npm run
|
|
460
|
+
release:readiness:strict` fails until the 1.0 version surfaces,
|
|
461
|
+
source-control state, live remote-head verification, Python artifacts, npm
|
|
462
|
+
registry/auth readiness, PyPI publish readiness, arXiv compile proof, browser
|
|
463
|
+
publication URLs, and live Mem0/Zep evidence are complete.
|
|
464
|
+
Run `npm run release:cut:plan` to preview the exact 1.0 version/changelog
|
|
465
|
+
edits across npm, lockfile, MCP, and Python surfaces. `npm run
|
|
466
|
+
release:cut:apply -- --target-version 1.0.0` writes those edits only when the
|
|
467
|
+
final cut is intentional. The generated changelog section is release-note copy,
|
|
468
|
+
not a TODO scaffold; `release:readiness:strict` rejects placeholder changelog
|
|
469
|
+
markers before publication.
|
|
470
|
+
Run `npm run security:audit` before packaging or publishing; the release gates
|
|
471
|
+
call it after artifact verification so production dependency advisories cannot
|
|
472
|
+
slip past the final package check.
|
|
473
|
+
|
|
271
474
|
## Command Reference
|
|
272
475
|
|
|
273
476
|
```bash
|
|
@@ -279,6 +482,7 @@ npx audrey demo
|
|
|
279
482
|
npx audrey install --host codex --dry-run
|
|
280
483
|
npx audrey mcp-config codex
|
|
281
484
|
npx audrey mcp-config generic
|
|
485
|
+
npx audrey hook-config claude-code
|
|
282
486
|
npx audrey install
|
|
283
487
|
npx audrey uninstall
|
|
284
488
|
|
|
@@ -317,7 +521,7 @@ The Node sidecar defaults to `127.0.0.1:7437`. The Docker image intentionally bi
|
|
|
317
521
|
npm ci
|
|
318
522
|
npm run release:gate
|
|
319
523
|
python -m unittest discover -s python/tests -v
|
|
320
|
-
|
|
524
|
+
npm run python:release:check
|
|
321
525
|
```
|
|
322
526
|
|
|
323
527
|
`npm test` uses a repo-local Vitest launcher so locked-down Windows temp
|
package/SECURITY.md
CHANGED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { validateGuardBenchAdapter, validateAdapterResult } from './guardbench.js';
|
|
2
|
+
|
|
3
|
+
/** Version string published for the GuardBench adapter contract. */
export const GUARDBENCH_ADAPTER_CONTRACT_VERSION = '1.0.0';

/**
 * Decision values exposed to adapter authors (frozen, so it cannot be mutated).
 * NOTE(review): presumably mirrors what `validateAdapterResult` accepts — confirm in guardbench.js.
 */
export const GUARDBENCH_DECISIONS = Object.freeze([
  'allow',
  'warn',
  'block',
]);

/** Field names exposed to adapter authors for a decision result (frozen). */
export const GUARDBENCH_RESULT_FIELDS = Object.freeze(
  ['decision', 'riskScore', 'evidenceIds', 'recommendedActions', 'summary', 'recallErrors'],
);
|
|
13
|
+
|
|
14
|
+
/**
 * Runs an adapter definition through `validateGuardBenchAdapter` and returns
 * whatever that validator returns.
 *
 * @param {object} adapter - Adapter definition to validate.
 * @returns {*} The validator's return value.
 */
export function defineGuardBenchAdapter(adapter) {
  // The label passed to the validator is the adapter's own name when it has one.
  const label = adapter?.name ?? 'inline adapter';
  return validateGuardBenchAdapter(adapter, label);
}
|
|
17
|
+
|
|
18
|
+
/**
 * Runs a decision result through `validateAdapterResult` and returns whatever
 * that validator returns.
 *
 * @param {object} result - Adapter decision result to validate.
 * @param {string} [adapterName='adapter'] - Adapter label forwarded to the validator.
 * @param {string} [scenarioId='scenario'] - Scenario label forwarded to the validator.
 * @returns {*} The validator's return value.
 */
export function defineGuardBenchResult(result, adapterName = 'adapter', scenarioId = 'scenario') {
  const validated = validateAdapterResult(result, adapterName, scenarioId);
  return validated;
}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
|
2
|
+
import { basename, dirname, resolve } from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
import { loadExternalAdapters, runGuardBench } from './guardbench.js';
|
|
5
|
+
import { evaluateAdapterConformance } from './run-external-guardbench.mjs';
|
|
6
|
+
import { validateSchema } from './validate-guardbench-artifacts.mjs';
|
|
7
|
+
import { publicPath } from './public-paths.mjs';
|
|
8
|
+
|
|
9
|
+
// Parent of the directory that contains this module; relative paths below are resolved against it.
const ROOT = resolve(
  dirname(fileURLToPath(import.meta.url)),
  '..',
);

// Defaults used when the CLI or caller does not supply a value.
const DEFAULT_ADAPTER = 'benchmarks/adapters/example-allow.mjs';
const DEFAULT_OUT = 'benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json';
const DEFAULT_SCHEMA = 'benchmarks/schemas/guardbench-adapter-self-test.schema.json';

// Result field names recorded in the self-test report's `contract.requiredResultFields`.
const RESULT_FIELDS = ['decision', 'riskScore', 'evidenceIds', 'recommendedActions', 'summary', 'recallErrors'];
|
|
21
|
+
|
|
22
|
+
/**
 * Parses command-line tokens for the adapter self-test CLI.
 *
 * Recognised tokens: `--adapter <path>`, `--out <path>`, `--json`,
 * `--no-write`, `--help` / `-h`.
 *
 * @param {string[]} [argv=process.argv.slice(2)] - Tokens to parse.
 * @returns {{adapter: string, out: string, json: boolean, noWrite: boolean, help?: boolean}}
 *   Parsed options; `help` is only present when a help flag was given.
 * @throws {Error} When a value-taking flag has no value, or a token is not recognised.
 */
export function parseAdapterSelfTestArgs(argv = process.argv.slice(2)) {
  const args = {
    adapter: DEFAULT_ADAPTER,
    out: DEFAULT_OUT,
    json: false,
    noWrite: false,
  };

  // Flags that consume the next token as their value, mapped to the option they set.
  const valueFlags = new Map([
    ['--adapter', 'adapter'],
    ['--out', 'out'],
  ]);

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (valueFlags.has(token)) {
      const value = argv[i + 1];
      // Report the real problem instead of falling through to "Unknown argument".
      if (!value) throw new Error(`Missing value for ${token}`);
      args[valueFlags.get(token)] = value;
      i++;
    } else if (token === '--json') {
      args.json = true;
    } else if (token === '--no-write') {
      args.noWrite = true;
    } else if (token === '--help' || token === '-h') {
      args.help = true;
    } else {
      throw new Error(`Unknown argument: ${token}`);
    }
  }

  return args;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Builds the CLI help text, including the default adapter and report paths.
 *
 * @returns {string} Multi-line usage text ending with a newline.
 */
function usage() {
  return `Usage: node benchmarks/adapter-self-test.mjs [options]

Options:
  --adapter <path>  ESM GuardBench adapter path. Default: ${DEFAULT_ADAPTER}.
  --out <path>      JSON report path. Default: ${DEFAULT_OUT}.
  --json            Print the full JSON report.
  --no-write        Do not write the JSON report.
  -h, --help        Show this help text.
`;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Looks up the first entry of `report.systemSummaries` whose `system` equals `adapterName`.
 *
 * @param {{systemSummaries: Array<{system: string}>}} report - Report to search.
 * @param {string} adapterName - Value to match against each entry's `system`.
 * @returns {object|null} The matching entry, or `null` when none matches.
 */
function systemSummary(report, adapterName) {
  for (const entry of report.systemSummaries) {
    if (entry.system === adapterName) return entry;
  }
  return null;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Extracts the score fields for one adapter from a report.
 *
 * The row comes from `systemSummary(report, adapterName)`. When there is no
 * row (or a field is nullish), `scenarios` falls back to 0 and every other
 * field falls back to `null`.
 *
 * @param {object} report - Report passed through to `systemSummary`.
 * @param {string} adapterName - Adapter whose row is wanted.
 * @returns {{scenarios: number, fullContractPassRate: *, decisionAccuracy: *, evidenceRecall: *, redactionLeaks: *, latency: *}}
 */
function scoreFromReport(report, adapterName) {
  const row = systemSummary(report, adapterName);
  const pick = (key, fallback = null) => row?.[key] ?? fallback;

  return {
    scenarios: pick('scenarios', 0),
    // The report calls this `passRate`; the score renames it.
    fullContractPassRate: pick('passRate'),
    decisionAccuracy: pick('decisionAccuracy'),
    evidenceRecall: pick('evidenceRecall'),
    redactionLeaks: pick('redactionLeaks'),
    latency: pick('latency'),
  };
}
|
|
69
|
+
|
|
70
|
+
/**
 * Reads a file as UTF-8 text and parses it as JSON.
 *
 * @param {string} path - File to read.
 * @returns {*} The parsed JSON value.
 */
function readJson(path) {
  const text = readFileSync(path, 'utf-8');
  return JSON.parse(text);
}
|
|
73
|
+
|
|
74
|
+
/**
 * Validates a self-test report with `validateSchema`.
 *
 * @param {object} report - Self-test report to validate.
 * @param {object} [options]
 * @param {string} [options.schema] - Schema path resolved against ROOT; defaults to DEFAULT_SCHEMA.
 * @param {object} [options.schemaObject] - Already-loaded schema; when given, no file is read.
 * @returns {*} Whatever `validateSchema` returns.
 */
export function validateAdapterSelfTestReport(report, options = {}) {
  const { schema: schemaOverride, schemaObject } = options;
  const schemaPath = resolve(ROOT, schemaOverride ?? DEFAULT_SCHEMA);
  // Only touch the filesystem when the caller did not hand over a schema object.
  const schema = schemaObject ?? readJson(schemaPath);
  return validateSchema(report, schema, 'guardbench-adapter-self-test');
}
|
|
79
|
+
|
|
80
|
+
/**
 * Runs GuardBench against a single external adapter and builds a self-test report.
 *
 * Steps: resolve the adapter path against ROOT, load it, run the benchmark,
 * evaluate conformance, assemble the report, validate the report against the
 * self-test schema, and optionally write it to disk.
 *
 * @param {object} [options]
 * @param {string} [options.adapterPath] - Adapter module path (takes precedence over `adapter`).
 * @param {string} [options.adapter] - Adapter module path; defaults to DEFAULT_ADAPTER.
 * @param {string|null} [options.out] - Report path; nothing is written when falsy.
 * @param {boolean} [options.write] - Pass `false` to suppress writing even when `out` is set.
 * @returns {Promise<object>} The self-test report; `outPath` is added when the report was written.
 * @throws {Error} When the adapter file is missing, the loader does not return
 *   exactly one adapter, or the report fails schema validation.
 */
export async function runGuardBenchAdapterSelfTest(options = {}) {
  const requestedPath = options.adapterPath ?? options.adapter ?? DEFAULT_ADAPTER;
  const adapterPath = resolve(ROOT, requestedPath);
  if (!existsSync(adapterPath)) {
    throw new Error(`GuardBench adapter not found: ${adapterPath}`);
  }

  const adapters = await loadExternalAdapters([adapterPath]);
  const loadedCount = adapters.length;
  if (loadedCount !== 1) {
    throw new Error(`GuardBench adapter self-test expected 1 adapter, got ${loadedCount}`);
  }

  const [adapter] = adapters;
  const report = await runGuardBench({ externalAdapters: adapters });
  const conformance = evaluateAdapterConformance(report, adapter.name);
  // The score row is looked up by the name recorded in the conformance result.
  const score = scoreFromReport(report, conformance.adapter);

  const adapterInfo = {
    name: adapter.name,
    path: publicPath(adapterPath),
    moduleFile: basename(adapterPath),
    description: adapter.description ?? null,
  };
  const contract = {
    expectedAnswersWithheld: true,
    lowScoreAllowed: true,
    requiredScenarioRows: report.scenarios,
    requiredResultFields: RESULT_FIELDS,
    redactionLeakTolerance: 0,
  };
  const selfTest = {
    schemaVersion: '1.0.0',
    suite: 'GuardBench adapter self-test',
    generatedAt: new Date().toISOString(),
    ok: conformance.ok,
    adapter: adapterInfo,
    conformance,
    score,
    contract,
    failures: conformance.failures,
  };

  const schemaErrors = validateAdapterSelfTestReport(selfTest);
  if (schemaErrors.length > 0) {
    throw new Error(`GuardBench adapter self-test schema validation failed: ${schemaErrors.join('; ')}`);
  }

  const shouldWrite = Boolean(options.out) && options.write !== false;
  if (shouldWrite) {
    const outPath = resolve(ROOT, options.out);
    mkdirSync(dirname(outPath), { recursive: true });
    writeFileSync(outPath, `${JSON.stringify(selfTest, null, 2)}\n`, 'utf-8');
    // Added after serialisation, so the file on disk does not contain `outPath`.
    selfTest.outPath = publicPath(outPath);
  }

  return selfTest;
}
|
|
131
|
+
|
|
132
|
+
/**
 * CLI entry point: parses arguments, runs the adapter self-test, prints a
 * summary (or the full JSON report with `--json`), and sets the process exit
 * code to 0 on success or 1 on failure.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseAdapterSelfTestArgs();
  if (args.help) {
    console.log(usage());
    return;
  }

  const writeReport = !args.noWrite;
  const result = await runGuardBenchAdapterSelfTest({
    adapter: args.adapter,
    out: writeReport ? args.out : null,
    write: writeReport,
  });

  const asPercent = fraction => `${(fraction * 100).toFixed(1)}%`;

  if (args.json) {
    console.log(JSON.stringify(result, null, 2));
  } else if (!result.ok) {
    console.error(`GuardBench adapter self-test failed: ${result.adapter.name}`);
    for (const failure of result.failures) {
      console.error(`- ${failure}`);
    }
  } else {
    const { conformance, score } = result;
    console.log(`GuardBench adapter self-test passed: ${result.adapter.name}`);
    console.log(`Contract rows: ${conformance.scenarios}/${conformance.expectedScenarios}`);
    console.log(`Full-contract score: ${asPercent(score.fullContractPassRate)}`);
    console.log(`Decision accuracy: ${asPercent(score.decisionAccuracy)}`);
    if (result.outPath) {
      console.log(`Self-test report: ${result.outPath}`);
    }
  }

  process.exitCode = result.ok ? 0 : 1;
}
|
|
160
|
+
|
|
161
|
+
// Run the CLI only when this file is the script Node was started with,
// not when it is imported as a module.
const scriptArg = process.argv[1];
if (scriptArg && resolve(scriptArg) === fileURLToPath(import.meta.url)) {
  main().catch((error) => {
    // Print just the message and exit non-zero.
    console.error(error.message);
    process.exit(1);
  });
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { defineGuardBenchAdapter } from '../adapter-kit.mjs';
|
|
2
|
+
|
|
3
|
+
/**
 * Example adapter: always returns an `allow` decision with no evidence.
 * `setup` records counts from the scenario seed; `decide` reports them in the summary.
 */
export default defineGuardBenchAdapter({
  name: 'Example Allow Adapter',
  description: 'Credential-free GuardBench adapter example. It always allows and is useful for adapter-loading smoke tests.',

  // Summarise the scenario seed; the returned object is read back as `state` in decide().
  async setup({ scenario }) {
    const { seed } = scenario;
    const memories = seed.seededMemories ?? [];
    const toolEvents = seed.seededToolEvents ?? [];
    return {
      memoryCount: memories.length,
      toolEventCount: toolEvents.length,
      hasFaultInjection: Boolean(seed.faultInjection),
    };
  },

  // Always allow; the summary only echoes what setup() counted.
  async decide({ scenario, state }) {
    const parts = [];
    parts.push(`Example adapter loaded ${state.memoryCount} seeded memories`);
    parts.push(`${state.toolEventCount} seeded tool events`);

    const noise = scenario.seed.seededNoise;
    parts.push(noise ? `${noise.count} noise memories` : 'no noise block');
    parts.push(state.hasFaultInjection ? 'fault injection present but unsupported' : 'no fault injection');

    return {
      decision: 'allow',
      riskScore: 0,
      evidenceIds: [],
      recommendedActions: [],
      summary: parts.join('; '),
    };
  },

  // Nothing to release.
  async cleanup() {},
});
|