audrey 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/README.md +5 -3
- package/benchmarks/adapter-self-test.mjs +6 -2
- package/benchmarks/adapters/example-allow.mjs +5 -2
- package/benchmarks/adapters/mem0-platform.mjs +19 -12
- package/benchmarks/adapters/zep-cloud.mjs +51 -27
- package/benchmarks/baselines.js +11 -6
- package/benchmarks/build-leaderboard.mjs +36 -23
- package/benchmarks/cases.js +24 -12
- package/benchmarks/create-conformance-card.mjs +12 -3
- package/benchmarks/create-submission-bundle.mjs +22 -8
- package/benchmarks/dry-run-external-adapters.mjs +24 -12
- package/benchmarks/guardbench.js +263 -123
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
- package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/benchmarks/output/guardbench-conformance-card.json +11 -11
- package/benchmarks/output/guardbench-raw.json +107 -108
- package/benchmarks/output/guardbench-summary.json +170 -172
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +11 -11
- package/benchmarks/output/submission-bundle/guardbench-raw.json +107 -108
- package/benchmarks/output/submission-bundle/guardbench-summary.json +170 -172
- package/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
- package/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/benchmarks/output/summary.json +57 -57
- package/benchmarks/perf-snapshot.js +12 -9
- package/benchmarks/perf.bench.js +14 -6
- package/benchmarks/public-paths.mjs +11 -5
- package/benchmarks/reference-results.js +10 -5
- package/benchmarks/report.js +48 -27
- package/benchmarks/run-external-guardbench.mjs +47 -25
- package/benchmarks/run.js +112 -59
- package/benchmarks/validate-adapter-module.mjs +13 -10
- package/benchmarks/validate-adapter-registry.mjs +16 -5
- package/benchmarks/validate-guardbench-artifacts.mjs +76 -19
- package/benchmarks/verify-external-evidence.mjs +86 -31
- package/benchmarks/verify-publication-artifacts.mjs +34 -11
- package/benchmarks/verify-submission-bundle.mjs +9 -4
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +5 -3
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +4 -3
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +479 -172
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/src/action-key.d.ts.map +1 -1
- package/dist/src/action-key.js +6 -2
- package/dist/src/action-key.js.map +1 -1
- package/dist/src/adaptive.d.ts.map +1 -1
- package/dist/src/adaptive.js +4 -2
- package/dist/src/adaptive.js.map +1 -1
- package/dist/src/affect.d.ts.map +1 -1
- package/dist/src/affect.js +8 -5
- package/dist/src/affect.js.map +1 -1
- package/dist/src/audrey.d.ts +1 -1
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +93 -49
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.d.ts.map +1 -1
- package/dist/src/capsule.js +37 -15
- package/dist/src/capsule.js.map +1 -1
- package/dist/src/causal.d.ts +1 -1
- package/dist/src/causal.d.ts.map +1 -1
- package/dist/src/causal.js +4 -2
- package/dist/src/causal.js.map +1 -1
- package/dist/src/confidence.d.ts.map +1 -1
- package/dist/src/confidence.js +5 -5
- package/dist/src/confidence.js.map +1 -1
- package/dist/src/consolidate.d.ts.map +1 -1
- package/dist/src/consolidate.js +17 -9
- package/dist/src/consolidate.js.map +1 -1
- package/dist/src/context.js +1 -1
- package/dist/src/context.js.map +1 -1
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +24 -13
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.d.ts.map +1 -1
- package/dist/src/db.js +78 -27
- package/dist/src/db.js.map +1 -1
- package/dist/src/decay.d.ts +1 -1
- package/dist/src/decay.d.ts.map +1 -1
- package/dist/src/decay.js +1 -1
- package/dist/src/decay.js.map +1 -1
- package/dist/src/embedding.d.ts +12 -4
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +18 -16
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.d.ts.map +1 -1
- package/dist/src/encode.js +5 -4
- package/dist/src/encode.js.map +1 -1
- package/dist/src/events.d.ts +3 -2
- package/dist/src/events.d.ts.map +1 -1
- package/dist/src/events.js +7 -3
- package/dist/src/events.js.map +1 -1
- package/dist/src/export.d.ts.map +1 -1
- package/dist/src/export.js +21 -7
- package/dist/src/export.js.map +1 -1
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +1 -1
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.d.ts.map +1 -1
- package/dist/src/forget.js +12 -6
- package/dist/src/forget.js.map +1 -1
- package/dist/src/fts.d.ts.map +1 -1
- package/dist/src/fts.js +20 -8
- package/dist/src/fts.js.map +1 -1
- package/dist/src/hybrid-recall.d.ts.map +1 -1
- package/dist/src/hybrid-recall.js +12 -6
- package/dist/src/hybrid-recall.js.map +1 -1
- package/dist/src/impact.d.ts.map +1 -1
- package/dist/src/impact.js +26 -10
- package/dist/src/impact.js.map +1 -1
- package/dist/src/import.d.ts.map +1 -1
- package/dist/src/import.js +11 -6
- package/dist/src/import.js.map +1 -1
- package/dist/src/index.d.ts +3 -3
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +3 -3
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.d.ts.map +1 -1
- package/dist/src/interference.js +10 -5
- package/dist/src/interference.js.map +1 -1
- package/dist/src/introspect.d.ts.map +1 -1
- package/dist/src/introspect.js +12 -6
- package/dist/src/introspect.js.map +1 -1
- package/dist/src/llm.d.ts +2 -2
- package/dist/src/llm.d.ts.map +1 -1
- package/dist/src/llm.js +6 -6
- package/dist/src/llm.js.map +1 -1
- package/dist/src/migrate.d.ts.map +1 -1
- package/dist/src/migrate.js +10 -4
- package/dist/src/migrate.js.map +1 -1
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +6 -8
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/profile.d.ts.map +1 -1
- package/dist/src/profile.js.map +1 -1
- package/dist/src/promote.d.ts.map +1 -1
- package/dist/src/promote.js +16 -7
- package/dist/src/promote.js.map +1 -1
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +1 -2
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/recall.d.ts.map +1 -1
- package/dist/src/recall.js +85 -18
- package/dist/src/recall.js.map +1 -1
- package/dist/src/redact.d.ts.map +1 -1
- package/dist/src/redact.js +9 -4
- package/dist/src/redact.js.map +1 -1
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +1 -7
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.d.ts.map +1 -1
- package/dist/src/rollback.js +4 -2
- package/dist/src/rollback.js.map +1 -1
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +33 -13
- package/dist/src/routes.js.map +1 -1
- package/dist/src/rules-compiler.d.ts.map +1 -1
- package/dist/src/rules-compiler.js +24 -2
- package/dist/src/rules-compiler.js.map +1 -1
- package/dist/src/server.js +2 -2
- package/dist/src/server.js.map +1 -1
- package/dist/src/tool-trace.d.ts +2 -2
- package/dist/src/tool-trace.d.ts.map +1 -1
- package/dist/src/tool-trace.js +12 -4
- package/dist/src/tool-trace.js.map +1 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/ulid.js +1 -1
- package/dist/src/ulid.js.map +1 -1
- package/dist/src/utils.d.ts.map +1 -1
- package/dist/src/utils.js.map +1 -1
- package/dist/src/validate.d.ts.map +1 -1
- package/dist/src/validate.js +20 -10
- package/dist/src/validate.js.map +1 -1
- package/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/audrey-paper-v1.md +5 -5
- package/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/arxiv/main.tex +5 -5
- package/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/README.md +5 -3
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +11 -11
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +107 -108
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +170 -172
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +58 -58
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/package.json +17 -4
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +36 -36
- package/examples/fintech-ops-demo.js +12 -5
- package/examples/healthcare-ops-demo.js +8 -4
- package/examples/ollama-memory-agent.js +41 -13
- package/examples/stripe-demo.js +12 -5
- package/package.json +17 -4
- package/scripts/audit-release-completion.mjs +179 -101
- package/scripts/create-arxiv-source.mjs +20 -14
- package/scripts/create-paper-submission-bundle.mjs +6 -2
- package/scripts/finalize-release.mjs +111 -36
- package/scripts/prepare-release-cut.mjs +14 -6
- package/scripts/publish-release-bundle.mjs +62 -23
- package/scripts/publish-release-github-api.mjs +89 -24
- package/scripts/smoke-cli.js +9 -9
- package/scripts/sync-paper-artifacts.mjs +5 -1
- package/scripts/verify-arxiv-compile.mjs +52 -16
- package/scripts/verify-arxiv-source.mjs +45 -15
- package/scripts/verify-browser-launch-plan.mjs +28 -11
- package/scripts/verify-browser-launch-results.mjs +32 -14
- package/scripts/verify-paper-artifacts.mjs +539 -79
- package/scripts/verify-paper-claims.mjs +48 -20
- package/scripts/verify-paper-submission-bundle.mjs +22 -11
- package/scripts/verify-publication-pack.mjs +23 -9
- package/scripts/verify-release-readiness.mjs +211 -76
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schemaVersion": "1.0.0",
|
|
3
3
|
"suite": "GuardBench leaderboard",
|
|
4
|
-
"generatedAt": "2026-05-
|
|
4
|
+
"generatedAt": "2026-05-29T03:45:37.292Z",
|
|
5
5
|
"ranking": [
|
|
6
6
|
"verified bundle",
|
|
7
7
|
"adapter conformance",
|
|
@@ -28,9 +28,9 @@
|
|
|
28
28
|
"evidenceRecall": 1,
|
|
29
29
|
"redactionLeaks": 0,
|
|
30
30
|
"latency": {
|
|
31
|
-
"p50Ms": 2.
|
|
32
|
-
"p95Ms":
|
|
33
|
-
"maxMs":
|
|
31
|
+
"p50Ms": 2.916,
|
|
32
|
+
"p95Ms": 21.17,
|
|
33
|
+
"maxMs": 21.17
|
|
34
34
|
}
|
|
35
35
|
},
|
|
36
36
|
"conformance": {
|
|
@@ -41,7 +41,7 @@
|
|
|
41
41
|
},
|
|
42
42
|
"source": {
|
|
43
43
|
"dir": "benchmarks/output/submission-bundle",
|
|
44
|
-
"manifestGeneratedAt": "2026-05-
|
|
44
|
+
"manifestGeneratedAt": "2026-05-29T03:45:36.970Z",
|
|
45
45
|
"fileCount": 17
|
|
46
46
|
},
|
|
47
47
|
"verification": {
|
package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# GuardBench Leaderboard
|
|
2
2
|
|
|
3
|
-
Generated: 2026-05-
|
|
3
|
+
Generated: 2026-05-29T03:45:37.292Z
|
|
4
4
|
|
|
5
5
|
| Rank | Subject | Verified | Conformant | Full Contract | Decision Accuracy | Evidence Recall | Redaction Leaks | p95 Latency | Bundle |
|
|
6
6
|
|---:|---|---:|---:|---:|---:|---:|---:|---:|---|
|
|
7
|
-
| 1 | Audrey Guard | yes | yes | 100.0% | 100.0% | 100.0% | 0 |
|
|
7
|
+
| 1 | Audrey Guard | yes | yes | 100.0% | 100.0% | 100.0% | 0 | 21.17ms | benchmarks/output/submission-bundle |
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schemaVersion": "1.0.0",
|
|
3
3
|
"suite": "GuardBench submission bundle",
|
|
4
|
-
"generatedAt": "2026-05-
|
|
4
|
+
"generatedAt": "2026-05-29T03:45:36.970Z",
|
|
5
5
|
"sourceDir": "benchmarks/output",
|
|
6
6
|
"subject": {
|
|
7
7
|
"name": "Audrey Guard",
|
|
@@ -16,9 +16,9 @@
|
|
|
16
16
|
"evidenceRecall": 1,
|
|
17
17
|
"redactionLeaks": 0,
|
|
18
18
|
"latency": {
|
|
19
|
-
"p50Ms": 2.
|
|
20
|
-
"p95Ms":
|
|
21
|
-
"maxMs":
|
|
19
|
+
"p50Ms": 2.916,
|
|
20
|
+
"p95Ms": 21.17,
|
|
21
|
+
"maxMs": 21.17
|
|
22
22
|
}
|
|
23
23
|
},
|
|
24
24
|
"conformance": {
|
|
@@ -44,8 +44,8 @@
|
|
|
44
44
|
"files": [
|
|
45
45
|
{
|
|
46
46
|
"path": "guardbench-conformance-card.json",
|
|
47
|
-
"bytes":
|
|
48
|
-
"sha256": "
|
|
47
|
+
"bytes": 1733,
|
|
48
|
+
"sha256": "a32ce0a46b2f031e827caa0896fdd6a870acf92f6bccb481ca661a3ea4b74a79"
|
|
49
49
|
},
|
|
50
50
|
{
|
|
51
51
|
"path": "guardbench-manifest.json",
|
|
@@ -54,13 +54,13 @@
|
|
|
54
54
|
},
|
|
55
55
|
{
|
|
56
56
|
"path": "guardbench-raw.json",
|
|
57
|
-
"bytes":
|
|
58
|
-
"sha256": "
|
|
57
|
+
"bytes": 43407,
|
|
58
|
+
"sha256": "15b39fd1a65709a89455fbfcaf815daf364b204fa526d5065cc12fcaed281d28"
|
|
59
59
|
},
|
|
60
60
|
{
|
|
61
61
|
"path": "guardbench-summary.json",
|
|
62
|
-
"bytes":
|
|
63
|
-
"sha256": "
|
|
62
|
+
"bytes": 69260,
|
|
63
|
+
"sha256": "e8669cd6c80dc3dc849b3c4fcc473ea706eb3a760bced69682d0dc2396b2e233"
|
|
64
64
|
},
|
|
65
65
|
{
|
|
66
66
|
"path": "schemas/guardbench-adapter-registry.schema.json",
|
|
@@ -125,7 +125,7 @@
|
|
|
125
125
|
{
|
|
126
126
|
"path": "validation-report.json",
|
|
127
127
|
"bytes": 739,
|
|
128
|
-
"sha256": "
|
|
128
|
+
"sha256": "6f6c385d8db108b1a5e30de389a1131bece70ccb18c552a5f005f7dbe377d695"
|
|
129
129
|
}
|
|
130
130
|
]
|
|
131
131
|
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"generatedAt": "2026-05-
|
|
2
|
+
"generatedAt": "2026-05-29T03:45:32.997Z",
|
|
3
3
|
"command": "node benchmarks/run.js --provider mock --dimensions 64",
|
|
4
4
|
"config": {
|
|
5
5
|
"provider": "mock",
|
|
@@ -23,19 +23,19 @@
|
|
|
23
23
|
"system": "Audrey",
|
|
24
24
|
"scorePercent": 100,
|
|
25
25
|
"passRate": 100,
|
|
26
|
-
"avgDurationMs":
|
|
26
|
+
"avgDurationMs": 15.416666666666666
|
|
27
27
|
},
|
|
28
28
|
{
|
|
29
29
|
"system": "Vector Only",
|
|
30
30
|
"scorePercent": 41.66666666666667,
|
|
31
31
|
"passRate": 25,
|
|
32
|
-
"avgDurationMs": 0.
|
|
32
|
+
"avgDurationMs": 0.3333333333333333
|
|
33
33
|
},
|
|
34
34
|
{
|
|
35
35
|
"system": "Keyword + Recency",
|
|
36
36
|
"scorePercent": 41.66666666666667,
|
|
37
37
|
"passRate": 25,
|
|
38
|
-
"avgDurationMs": 0.
|
|
38
|
+
"avgDurationMs": 0.6666666666666666
|
|
39
39
|
},
|
|
40
40
|
{
|
|
41
41
|
"system": "Recent Window",
|
|
@@ -170,19 +170,19 @@
|
|
|
170
170
|
"system": "Audrey",
|
|
171
171
|
"scorePercent": 100,
|
|
172
172
|
"passRate": 100,
|
|
173
|
-
"avgDurationMs":
|
|
173
|
+
"avgDurationMs": 15.625
|
|
174
174
|
},
|
|
175
175
|
{
|
|
176
176
|
"system": "Vector Only",
|
|
177
177
|
"scorePercent": 56.25,
|
|
178
178
|
"passRate": 37.5,
|
|
179
|
-
"avgDurationMs": 0.
|
|
179
|
+
"avgDurationMs": 0.375
|
|
180
180
|
},
|
|
181
181
|
{
|
|
182
182
|
"system": "Keyword + Recency",
|
|
183
183
|
"scorePercent": 50,
|
|
184
184
|
"passRate": 37.5,
|
|
185
|
-
"avgDurationMs":
|
|
185
|
+
"avgDurationMs": 1
|
|
186
186
|
},
|
|
187
187
|
{
|
|
188
188
|
"system": "Recent Window",
|
|
@@ -277,7 +277,7 @@
|
|
|
277
277
|
"results": [
|
|
278
278
|
{
|
|
279
279
|
"system": "Audrey",
|
|
280
|
-
"durationMs":
|
|
280
|
+
"durationMs": 24,
|
|
281
281
|
"passed": true,
|
|
282
282
|
"score": 1,
|
|
283
283
|
"summary": "retrieved expected evidence",
|
|
@@ -289,7 +289,7 @@
|
|
|
289
289
|
},
|
|
290
290
|
{
|
|
291
291
|
"system": "Vector Only",
|
|
292
|
-
"durationMs":
|
|
292
|
+
"durationMs": 2,
|
|
293
293
|
"passed": true,
|
|
294
294
|
"score": 1,
|
|
295
295
|
"summary": "retrieved expected evidence",
|
|
@@ -301,7 +301,7 @@
|
|
|
301
301
|
},
|
|
302
302
|
{
|
|
303
303
|
"system": "Keyword + Recency",
|
|
304
|
-
"durationMs":
|
|
304
|
+
"durationMs": 7,
|
|
305
305
|
"passed": true,
|
|
306
306
|
"score": 1,
|
|
307
307
|
"summary": "retrieved expected evidence",
|
|
@@ -336,7 +336,7 @@
|
|
|
336
336
|
"results": [
|
|
337
337
|
{
|
|
338
338
|
"system": "Audrey",
|
|
339
|
-
"durationMs":
|
|
339
|
+
"durationMs": 13,
|
|
340
340
|
"passed": true,
|
|
341
341
|
"score": 1,
|
|
342
342
|
"summary": "retrieved expected evidence",
|
|
@@ -347,7 +347,7 @@
|
|
|
347
347
|
},
|
|
348
348
|
{
|
|
349
349
|
"system": "Vector Only",
|
|
350
|
-
"durationMs":
|
|
350
|
+
"durationMs": 1,
|
|
351
351
|
"passed": false,
|
|
352
352
|
"score": 0.5,
|
|
353
353
|
"summary": "retrieved expected evidence, but conflicting evidence still appeared later",
|
|
@@ -394,7 +394,7 @@
|
|
|
394
394
|
"results": [
|
|
395
395
|
{
|
|
396
396
|
"system": "Audrey",
|
|
397
|
-
"durationMs":
|
|
397
|
+
"durationMs": 15,
|
|
398
398
|
"passed": true,
|
|
399
399
|
"score": 1,
|
|
400
400
|
"summary": "retrieved expected evidence",
|
|
@@ -407,7 +407,7 @@
|
|
|
407
407
|
},
|
|
408
408
|
{
|
|
409
409
|
"system": "Vector Only",
|
|
410
|
-
"durationMs":
|
|
410
|
+
"durationMs": 0,
|
|
411
411
|
"passed": true,
|
|
412
412
|
"score": 1,
|
|
413
413
|
"summary": "retrieved expected evidence",
|
|
@@ -457,7 +457,7 @@
|
|
|
457
457
|
"results": [
|
|
458
458
|
{
|
|
459
459
|
"system": "Audrey",
|
|
460
|
-
"durationMs":
|
|
460
|
+
"durationMs": 15,
|
|
461
461
|
"passed": true,
|
|
462
462
|
"score": 1,
|
|
463
463
|
"summary": "retrieved expected evidence",
|
|
@@ -468,7 +468,7 @@
|
|
|
468
468
|
},
|
|
469
469
|
{
|
|
470
470
|
"system": "Vector Only",
|
|
471
|
-
"durationMs":
|
|
471
|
+
"durationMs": 0,
|
|
472
472
|
"passed": true,
|
|
473
473
|
"score": 1,
|
|
474
474
|
"summary": "retrieved expected evidence",
|
|
@@ -518,7 +518,7 @@
|
|
|
518
518
|
"results": [
|
|
519
519
|
{
|
|
520
520
|
"system": "Audrey",
|
|
521
|
-
"durationMs":
|
|
521
|
+
"durationMs": 14,
|
|
522
522
|
"passed": true,
|
|
523
523
|
"score": 1,
|
|
524
524
|
"summary": "correct abstention",
|
|
@@ -574,7 +574,7 @@
|
|
|
574
574
|
"results": [
|
|
575
575
|
{
|
|
576
576
|
"system": "Audrey",
|
|
577
|
-
"durationMs":
|
|
577
|
+
"durationMs": 13,
|
|
578
578
|
"passed": true,
|
|
579
579
|
"score": 1,
|
|
580
580
|
"summary": "retrieved expected evidence",
|
|
@@ -597,7 +597,7 @@
|
|
|
597
597
|
},
|
|
598
598
|
{
|
|
599
599
|
"system": "Keyword + Recency",
|
|
600
|
-
"durationMs":
|
|
600
|
+
"durationMs": 1,
|
|
601
601
|
"passed": false,
|
|
602
602
|
"score": 0,
|
|
603
603
|
"summary": "blocked content outranked the correct answer",
|
|
@@ -632,7 +632,7 @@
|
|
|
632
632
|
"results": [
|
|
633
633
|
{
|
|
634
634
|
"system": "Audrey",
|
|
635
|
-
"durationMs":
|
|
635
|
+
"durationMs": 18,
|
|
636
636
|
"passed": true,
|
|
637
637
|
"score": 1,
|
|
638
638
|
"summary": "retrieved expected evidence",
|
|
@@ -693,7 +693,7 @@
|
|
|
693
693
|
"results": [
|
|
694
694
|
{
|
|
695
695
|
"system": "Audrey",
|
|
696
|
-
"durationMs":
|
|
696
|
+
"durationMs": 13,
|
|
697
697
|
"passed": true,
|
|
698
698
|
"score": 1,
|
|
699
699
|
"summary": "correct abstention",
|
|
@@ -750,13 +750,13 @@
|
|
|
750
750
|
"system": "Audrey",
|
|
751
751
|
"scorePercent": 100,
|
|
752
752
|
"passRate": 100,
|
|
753
|
-
"avgDurationMs":
|
|
753
|
+
"avgDurationMs": 15
|
|
754
754
|
},
|
|
755
755
|
{
|
|
756
756
|
"system": "Keyword + Recency",
|
|
757
757
|
"scorePercent": 25,
|
|
758
758
|
"passRate": 0,
|
|
759
|
-
"avgDurationMs": 0
|
|
759
|
+
"avgDurationMs": 0
|
|
760
760
|
},
|
|
761
761
|
{
|
|
762
762
|
"system": "Vector Only",
|
|
@@ -821,7 +821,7 @@
|
|
|
821
821
|
"results": [
|
|
822
822
|
{
|
|
823
823
|
"system": "Audrey",
|
|
824
|
-
"durationMs":
|
|
824
|
+
"durationMs": 13,
|
|
825
825
|
"passed": true,
|
|
826
826
|
"score": 1,
|
|
827
827
|
"summary": "retrieved expected evidence",
|
|
@@ -879,7 +879,7 @@
|
|
|
879
879
|
"results": [
|
|
880
880
|
{
|
|
881
881
|
"system": "Audrey",
|
|
882
|
-
"durationMs":
|
|
882
|
+
"durationMs": 14,
|
|
883
883
|
"passed": true,
|
|
884
884
|
"score": 1,
|
|
885
885
|
"summary": "correct abstention",
|
|
@@ -899,7 +899,7 @@
|
|
|
899
899
|
},
|
|
900
900
|
{
|
|
901
901
|
"system": "Keyword + Recency",
|
|
902
|
-
"durationMs":
|
|
902
|
+
"durationMs": 0,
|
|
903
903
|
"passed": false,
|
|
904
904
|
"score": 0.5,
|
|
905
905
|
"summary": "no leak, but retrieved tangential context",
|
|
@@ -993,7 +993,7 @@
|
|
|
993
993
|
"results": [
|
|
994
994
|
{
|
|
995
995
|
"system": "Audrey",
|
|
996
|
-
"durationMs":
|
|
996
|
+
"durationMs": 16,
|
|
997
997
|
"passed": true,
|
|
998
998
|
"score": 1,
|
|
999
999
|
"summary": "retrieved expected evidence",
|
|
@@ -1055,7 +1055,7 @@
|
|
|
1055
1055
|
"system": "Audrey",
|
|
1056
1056
|
"scorePercent": 100,
|
|
1057
1057
|
"passRate": 100,
|
|
1058
|
-
"avgDurationMs":
|
|
1058
|
+
"avgDurationMs": 14.25
|
|
1059
1059
|
},
|
|
1060
1060
|
{
|
|
1061
1061
|
"system": "Vector Only",
|
|
@@ -1118,7 +1118,7 @@
|
|
|
1118
1118
|
"results": [
|
|
1119
1119
|
{
|
|
1120
1120
|
"system": "Audrey",
|
|
1121
|
-
"durationMs":
|
|
1121
|
+
"durationMs": 18,
|
|
1122
1122
|
"passed": true,
|
|
1123
1123
|
"score": 1,
|
|
1124
1124
|
"summary": "matched all required signals",
|
|
@@ -1176,7 +1176,7 @@
|
|
|
1176
1176
|
"results": [
|
|
1177
1177
|
{
|
|
1178
1178
|
"system": "Audrey",
|
|
1179
|
-
"durationMs":
|
|
1179
|
+
"durationMs": 14,
|
|
1180
1180
|
"passed": true,
|
|
1181
1181
|
"score": 1,
|
|
1182
1182
|
"summary": "matched all required signals",
|
|
@@ -1234,12 +1234,12 @@
|
|
|
1234
1234
|
"results": [
|
|
1235
1235
|
{
|
|
1236
1236
|
"system": "Audrey",
|
|
1237
|
-
"durationMs":
|
|
1237
|
+
"durationMs": 14,
|
|
1238
1238
|
"passed": true,
|
|
1239
1239
|
"score": 1,
|
|
1240
1240
|
"summary": "matched all required signals",
|
|
1241
1241
|
"topResults": [
|
|
1242
|
-
"guard_hardened:replay_rejected error:guard receipt already has an outcome:
|
|
1242
|
+
"guard_hardened:replay_rejected error:guard receipt already has an outcome: 01KSRXCNSKBMJ51Y9GFJ37J0NV",
|
|
1243
1243
|
"decision:caution verdict:caution risk:0.55 Caution: 1 memory signal, 1 medium severity found before acting.",
|
|
1244
1244
|
"warning:recent_failure severity:medium npm test failed 1x recently: Vitest failed with spawn EPERM Before re-running npm test, check what changed since the last failure."
|
|
1245
1245
|
],
|
|
@@ -1292,12 +1292,12 @@
|
|
|
1292
1292
|
"results": [
|
|
1293
1293
|
{
|
|
1294
1294
|
"system": "Audrey",
|
|
1295
|
-
"durationMs":
|
|
1295
|
+
"durationMs": 11,
|
|
1296
1296
|
"passed": true,
|
|
1297
1297
|
"score": 1,
|
|
1298
1298
|
"summary": "matched all required signals",
|
|
1299
1299
|
"topResults": [
|
|
1300
|
-
"guard_hardened:non_guard_receipt_rejected error:not a guard receipt:
|
|
1300
|
+
"guard_hardened:non_guard_receipt_rejected error:not a guard receipt: 01KSRXCNT0Q2HH0MK1SSNP5V1Q",
|
|
1301
1301
|
"decision:go verdict:clear risk:0 No relevant memory risks, prior failures, or must-follow procedures were found."
|
|
1302
1302
|
],
|
|
1303
1303
|
"retrievalSummary": "guard_hardened:non_guard_receipt_rejected error:not a guard receipt: 01K | decision:go verdict:clear risk:0 No relevant memory risks, prior failure"
|
|
@@ -1352,7 +1352,7 @@
|
|
|
1352
1352
|
"results": [
|
|
1353
1353
|
{
|
|
1354
1354
|
"system": "Audrey",
|
|
1355
|
-
"durationMs":
|
|
1355
|
+
"durationMs": 24,
|
|
1356
1356
|
"passed": true,
|
|
1357
1357
|
"score": 1,
|
|
1358
1358
|
"summary": "retrieved expected evidence",
|
|
@@ -1364,7 +1364,7 @@
|
|
|
1364
1364
|
},
|
|
1365
1365
|
{
|
|
1366
1366
|
"system": "Vector Only",
|
|
1367
|
-
"durationMs":
|
|
1367
|
+
"durationMs": 2,
|
|
1368
1368
|
"passed": true,
|
|
1369
1369
|
"score": 1,
|
|
1370
1370
|
"summary": "retrieved expected evidence",
|
|
@@ -1376,7 +1376,7 @@
|
|
|
1376
1376
|
},
|
|
1377
1377
|
{
|
|
1378
1378
|
"system": "Keyword + Recency",
|
|
1379
|
-
"durationMs":
|
|
1379
|
+
"durationMs": 7,
|
|
1380
1380
|
"passed": true,
|
|
1381
1381
|
"score": 1,
|
|
1382
1382
|
"summary": "retrieved expected evidence",
|
|
@@ -1411,7 +1411,7 @@
|
|
|
1411
1411
|
"results": [
|
|
1412
1412
|
{
|
|
1413
1413
|
"system": "Audrey",
|
|
1414
|
-
"durationMs":
|
|
1414
|
+
"durationMs": 13,
|
|
1415
1415
|
"passed": true,
|
|
1416
1416
|
"score": 1,
|
|
1417
1417
|
"summary": "retrieved expected evidence",
|
|
@@ -1422,7 +1422,7 @@
|
|
|
1422
1422
|
},
|
|
1423
1423
|
{
|
|
1424
1424
|
"system": "Vector Only",
|
|
1425
|
-
"durationMs":
|
|
1425
|
+
"durationMs": 1,
|
|
1426
1426
|
"passed": false,
|
|
1427
1427
|
"score": 0.5,
|
|
1428
1428
|
"summary": "retrieved expected evidence, but conflicting evidence still appeared later",
|
|
@@ -1469,7 +1469,7 @@
|
|
|
1469
1469
|
"results": [
|
|
1470
1470
|
{
|
|
1471
1471
|
"system": "Audrey",
|
|
1472
|
-
"durationMs":
|
|
1472
|
+
"durationMs": 15,
|
|
1473
1473
|
"passed": true,
|
|
1474
1474
|
"score": 1,
|
|
1475
1475
|
"summary": "retrieved expected evidence",
|
|
@@ -1482,7 +1482,7 @@
|
|
|
1482
1482
|
},
|
|
1483
1483
|
{
|
|
1484
1484
|
"system": "Vector Only",
|
|
1485
|
-
"durationMs":
|
|
1485
|
+
"durationMs": 0,
|
|
1486
1486
|
"passed": true,
|
|
1487
1487
|
"score": 1,
|
|
1488
1488
|
"summary": "retrieved expected evidence",
|
|
@@ -1532,7 +1532,7 @@
|
|
|
1532
1532
|
"results": [
|
|
1533
1533
|
{
|
|
1534
1534
|
"system": "Audrey",
|
|
1535
|
-
"durationMs":
|
|
1535
|
+
"durationMs": 15,
|
|
1536
1536
|
"passed": true,
|
|
1537
1537
|
"score": 1,
|
|
1538
1538
|
"summary": "retrieved expected evidence",
|
|
@@ -1543,7 +1543,7 @@
|
|
|
1543
1543
|
},
|
|
1544
1544
|
{
|
|
1545
1545
|
"system": "Vector Only",
|
|
1546
|
-
"durationMs":
|
|
1546
|
+
"durationMs": 0,
|
|
1547
1547
|
"passed": true,
|
|
1548
1548
|
"score": 1,
|
|
1549
1549
|
"summary": "retrieved expected evidence",
|
|
@@ -1593,7 +1593,7 @@
|
|
|
1593
1593
|
"results": [
|
|
1594
1594
|
{
|
|
1595
1595
|
"system": "Audrey",
|
|
1596
|
-
"durationMs":
|
|
1596
|
+
"durationMs": 14,
|
|
1597
1597
|
"passed": true,
|
|
1598
1598
|
"score": 1,
|
|
1599
1599
|
"summary": "correct abstention",
|
|
@@ -1649,7 +1649,7 @@
|
|
|
1649
1649
|
"results": [
|
|
1650
1650
|
{
|
|
1651
1651
|
"system": "Audrey",
|
|
1652
|
-
"durationMs":
|
|
1652
|
+
"durationMs": 13,
|
|
1653
1653
|
"passed": true,
|
|
1654
1654
|
"score": 1,
|
|
1655
1655
|
"summary": "retrieved expected evidence",
|
|
@@ -1672,7 +1672,7 @@
|
|
|
1672
1672
|
},
|
|
1673
1673
|
{
|
|
1674
1674
|
"system": "Keyword + Recency",
|
|
1675
|
-
"durationMs":
|
|
1675
|
+
"durationMs": 1,
|
|
1676
1676
|
"passed": false,
|
|
1677
1677
|
"score": 0,
|
|
1678
1678
|
"summary": "blocked content outranked the correct answer",
|
|
@@ -1707,7 +1707,7 @@
|
|
|
1707
1707
|
"results": [
|
|
1708
1708
|
{
|
|
1709
1709
|
"system": "Audrey",
|
|
1710
|
-
"durationMs":
|
|
1710
|
+
"durationMs": 18,
|
|
1711
1711
|
"passed": true,
|
|
1712
1712
|
"score": 1,
|
|
1713
1713
|
"summary": "retrieved expected evidence",
|
|
@@ -1768,7 +1768,7 @@
|
|
|
1768
1768
|
"results": [
|
|
1769
1769
|
{
|
|
1770
1770
|
"system": "Audrey",
|
|
1771
|
-
"durationMs":
|
|
1771
|
+
"durationMs": 13,
|
|
1772
1772
|
"passed": true,
|
|
1773
1773
|
"score": 1,
|
|
1774
1774
|
"summary": "correct abstention",
|
|
@@ -1824,7 +1824,7 @@
|
|
|
1824
1824
|
"results": [
|
|
1825
1825
|
{
|
|
1826
1826
|
"system": "Audrey",
|
|
1827
|
-
"durationMs":
|
|
1827
|
+
"durationMs": 13,
|
|
1828
1828
|
"passed": true,
|
|
1829
1829
|
"score": 1,
|
|
1830
1830
|
"summary": "retrieved expected evidence",
|
|
@@ -1882,7 +1882,7 @@
|
|
|
1882
1882
|
"results": [
|
|
1883
1883
|
{
|
|
1884
1884
|
"system": "Audrey",
|
|
1885
|
-
"durationMs":
|
|
1885
|
+
"durationMs": 14,
|
|
1886
1886
|
"passed": true,
|
|
1887
1887
|
"score": 1,
|
|
1888
1888
|
"summary": "correct abstention",
|
|
@@ -1902,7 +1902,7 @@
|
|
|
1902
1902
|
},
|
|
1903
1903
|
{
|
|
1904
1904
|
"system": "Keyword + Recency",
|
|
1905
|
-
"durationMs":
|
|
1905
|
+
"durationMs": 0,
|
|
1906
1906
|
"passed": false,
|
|
1907
1907
|
"score": 0.5,
|
|
1908
1908
|
"summary": "no leak, but retrieved tangential context",
|
|
@@ -1996,7 +1996,7 @@
|
|
|
1996
1996
|
"results": [
|
|
1997
1997
|
{
|
|
1998
1998
|
"system": "Audrey",
|
|
1999
|
-
"durationMs":
|
|
1999
|
+
"durationMs": 16,
|
|
2000
2000
|
"passed": true,
|
|
2001
2001
|
"score": 1,
|
|
2002
2002
|
"summary": "retrieved expected evidence",
|
|
@@ -2058,7 +2058,7 @@
|
|
|
2058
2058
|
"results": [
|
|
2059
2059
|
{
|
|
2060
2060
|
"system": "Audrey",
|
|
2061
|
-
"durationMs":
|
|
2061
|
+
"durationMs": 18,
|
|
2062
2062
|
"passed": true,
|
|
2063
2063
|
"score": 1,
|
|
2064
2064
|
"summary": "matched all required signals",
|
|
@@ -2116,7 +2116,7 @@
|
|
|
2116
2116
|
"results": [
|
|
2117
2117
|
{
|
|
2118
2118
|
"system": "Audrey",
|
|
2119
|
-
"durationMs":
|
|
2119
|
+
"durationMs": 14,
|
|
2120
2120
|
"passed": true,
|
|
2121
2121
|
"score": 1,
|
|
2122
2122
|
"summary": "matched all required signals",
|
|
@@ -2174,12 +2174,12 @@
|
|
|
2174
2174
|
"results": [
|
|
2175
2175
|
{
|
|
2176
2176
|
"system": "Audrey",
|
|
2177
|
-
"durationMs":
|
|
2177
|
+
"durationMs": 14,
|
|
2178
2178
|
"passed": true,
|
|
2179
2179
|
"score": 1,
|
|
2180
2180
|
"summary": "matched all required signals",
|
|
2181
2181
|
"topResults": [
|
|
2182
|
-
"guard_hardened:replay_rejected error:guard receipt already has an outcome:
|
|
2182
|
+
"guard_hardened:replay_rejected error:guard receipt already has an outcome: 01KSRXCNSKBMJ51Y9GFJ37J0NV",
|
|
2183
2183
|
"decision:caution verdict:caution risk:0.55 Caution: 1 memory signal, 1 medium severity found before acting.",
|
|
2184
2184
|
"warning:recent_failure severity:medium npm test failed 1x recently: Vitest failed with spawn EPERM Before re-running npm test, check what changed since the last failure."
|
|
2185
2185
|
],
|
|
@@ -2232,12 +2232,12 @@
|
|
|
2232
2232
|
"results": [
|
|
2233
2233
|
{
|
|
2234
2234
|
"system": "Audrey",
|
|
2235
|
-
"durationMs":
|
|
2235
|
+
"durationMs": 11,
|
|
2236
2236
|
"passed": true,
|
|
2237
2237
|
"score": 1,
|
|
2238
2238
|
"summary": "matched all required signals",
|
|
2239
2239
|
"topResults": [
|
|
2240
|
-
"guard_hardened:non_guard_receipt_rejected error:not a guard receipt:
|
|
2240
|
+
"guard_hardened:non_guard_receipt_rejected error:not a guard receipt: 01KSRXCNT0Q2HH0MK1SSNP5V1Q",
|
|
2241
2241
|
"decision:go verdict:clear risk:0 No relevant memory risks, prior failures, or must-follow procedures were found."
|
|
2242
2242
|
],
|
|
2243
2243
|
"retrievalSummary": "guard_hardened:non_guard_receipt_rejected error:not a guard receipt: 01K | decision:go verdict:clear risk:0 No relevant memory risks, prior failure"
|
|
@@ -24,13 +24,13 @@ These numbers measure Audrey's local call path under an in-process mock embeddin
|
|
|
24
24
|
|
|
25
25
|
## Behavioral Regression Result
|
|
26
26
|
|
|
27
|
-
The current `benchmarks/output/summary.json` was generated on 2026-05-
|
|
27
|
+
The current `benchmarks/output/summary.json` was generated on 2026-05-29T03:45:32.997Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
|
|
28
28
|
|
|
29
29
|
| System | Score Percent | Pass Rate | Average Duration Ms |
|
|
30
30
|
|---|---:|---:|---:|
|
|
31
|
-
| Audrey | 100 | 100 |
|
|
32
|
-
| Vector Only | 41.66666666666667 | 25 | 0.
|
|
33
|
-
| Keyword + Recency | 41.66666666666667 | 25 | 0.
|
|
31
|
+
| Audrey | 100 | 100 | 15.416666666666666 |
|
|
32
|
+
| Vector Only | 41.66666666666667 | 25 | 0.3333333333333333 |
|
|
33
|
+
| Keyword + Recency | 41.66666666666667 | 25 | 0.6666666666666666 |
|
|
34
34
|
| Recent Window | 37.5 | 25 | 0 |
|
|
35
35
|
|
|
36
36
|
This output is a regression-gate result. The baselines are toy local baselines used to catch retrieval and lifecycle regressions in the Audrey codebase. They are not external systems, not tuned competitor implementations, and not GuardBench baselines (Ledger: E23-E24). The current suite covers retrieval and operation families such as information extraction, knowledge updates, multi-session reasoning, conflict resolution, procedural learning, privacy boundary, overwrite, delete-and-abstain, semantic merge, and procedural merge (Ledger: E23-E24).
|
|
@@ -55,7 +55,7 @@ It reports local adapters only, not external-system comparisons (Ledger: E46):
|
|
|
55
55
|
| Evidence recall | 100% |
|
|
56
56
|
| Redaction leaks | 0 |
|
|
57
57
|
| Recall-degradation detection | 100% |
|
|
58
|
-
| Guard latency p50 / p95 | 2.
|
|
58
|
+
| Guard latency p50 / p95 | 2.916 ms / 21.17 ms |
|
|
59
59
|
| Published artifact raw-secret leaks | 0 |
|
|
60
60
|
| Audrey Guard decision accuracy | 100% |
|
|
61
61
|
| No-memory decision accuracy | 10% |
|
|
@@ -895,13 +895,13 @@ These numbers measure Audrey's local call path under an in-process mock embeddin
|
|
|
895
895
|
|
|
896
896
|
### Behavioral Regression Result
|
|
897
897
|
|
|
898
|
-
The current `benchmarks/output/summary.json` was generated on 2026-05-
|
|
898
|
+
The current `benchmarks/output/summary.json` was generated on 2026-05-29T03:45:32.997Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
|
|
899
899
|
|
|
900
900
|
| System | Score Percent | Pass Rate | Average Duration Ms |
|
|
901
901
|
|---|---:|---:|---:|
|
|
902
|
-
| Audrey | 100 | 100 |
|
|
903
|
-
| Vector Only | 41.66666666666667 | 25 | 0.
|
|
904
|
-
| Keyword + Recency | 41.66666666666667 | 25 | 0.
|
|
902
|
+
| Audrey | 100 | 100 | 15.416666666666666 |
|
|
903
|
+
| Vector Only | 41.66666666666667 | 25 | 0.3333333333333333 |
|
|
904
|
+
| Keyword + Recency | 41.66666666666667 | 25 | 0.6666666666666666 |
|
|
905
905
|
| Recent Window | 37.5 | 25 | 0 |
|
|
906
906
|
|
|
907
907
|
This output is a regression-gate result. The baselines are toy local baselines used to catch retrieval and lifecycle regressions in the Audrey codebase. They are not external systems, not tuned competitor implementations, and not GuardBench baselines (Ledger: E23-E24). The current suite covers retrieval and operation families such as information extraction, knowledge updates, multi-session reasoning, conflict resolution, procedural learning, privacy boundary, overwrite, delete-and-abstain, semantic merge, and procedural merge (Ledger: E23-E24).
|
|
@@ -924,7 +924,7 @@ It reports local adapters only, not external-system comparisons (Ledger: E46):
|
|
|
924
924
|
| Evidence recall | 100% |
|
|
925
925
|
| Redaction leaks | 0 |
|
|
926
926
|
| Recall-degradation detection | 100% |
|
|
927
|
-
| Guard latency p50 / p95 | 2.
|
|
927
|
+
| Guard latency p50 / p95 | 2.916 ms / 21.17 ms |
|
|
928
928
|
| Published artifact raw-secret leaks | 0 |
|
|
929
929
|
| Audrey Guard decision accuracy | 100% |
|
|
930
930
|
| No-memory decision accuracy | 10% |
|
|
@@ -49,7 +49,7 @@ Every implementation claim in the paper should point to one or more ledger IDs i
|
|
|
49
49
|
| E43 - Audrey exposes a Claude Code hook generator, guarded settings apply path, and hook-mode Guard command: `hook-config claude-code` emits hooks, `hook-config claude-code --apply --scope project|user` merges them into Claude Code settings with backup/idempotence, `guard --hook --fail-on-warn` consumes PreToolUse JSON and returns `hookSpecificOutput.permissionDecision`, and `observe-tool` records post-tool events. Codex hook wiring remains pending on a stable host hook surface. | Hook integration boundary | README.md; mcp-server/index.ts; tests/mcp-server.test.js | Yes, focused Vitest and CLI hook smoke passed on 2026-05-12 |
|
|
50
50
|
| E44 - Audrey preflight events now persist `preflight_evidence_ids` and `audrey_guard_action_key`; `memory_validate` accepts optional `preflight_event_id`, action key, and evidence ids, persists them on the validation audit event, and rejects validation lineage when the memory id was not evidence for that preflight. | Validation lineage implementation | src/action-key.ts; src/controller.ts; src/preflight.ts; src/audrey.ts; mcp-server/index.ts; tests/controller.test.js | Yes, focused Vitest passed on 2026-05-12 |
|
|
51
51
|
| E45 - Preflight risk scoring uses a fixed severity map (`info=0.1`, `low=0.25`, `medium=0.55`, `high=0.85`), sorts warnings by severity, and strict mode blocks on high-severity warnings; the scoring path does not consume validation feedback. | Fixed risk scoring boundary | src/preflight.ts:6-60,291-299,332-338; src/feedback.ts:3-18,70-163 | Yes, 2026-05-08 |
|
|
52
|
-
| E46 - `benchmarks/guardbench.js` runs ten local comparative GuardBench scenarios across Audrey Guard, no-memory, recent-window, vector-only, and FTS-only adapters and writes `benchmarks/output/guardbench-summary.json`, `benchmarks/output/guardbench-manifest.json`, and `benchmarks/output/guardbench-raw.json`; the latest local run has Audrey Guard passing 10/10 scenarios with 100% prevention rate, 0% false-block rate, 100% evidence recall, zero decision-output redaction leaks, zero published artifact raw-secret leaks, 100% recall-degradation detection, 100% decision accuracy, and 2.
|
|
52
|
+
| E46 - `benchmarks/guardbench.js` runs ten local comparative GuardBench scenarios across Audrey Guard, no-memory, recent-window, vector-only, and FTS-only adapters and writes `benchmarks/output/guardbench-summary.json`, `benchmarks/output/guardbench-manifest.json`, and `benchmarks/output/guardbench-raw.json`; the latest local run has Audrey Guard passing 10/10 scenarios with 100% prevention rate, 0% false-block rate, 100% evidence recall, zero decision-output redaction leaks, zero published artifact raw-secret leaks, 100% recall-degradation detection, 100% decision accuracy, and 2.916ms/21.17ms p50/p95 guard latency under the mock-provider methodology. Baseline decision accuracy was no-memory 10%, recent-window 60%, vector-only 40%, and FTS-only 10%, with 0% full-contract pass rate for each baseline. | GuardBench local comparative results | benchmarks/guardbench.js; benchmarks/output/guardbench-summary.json; benchmarks/output/guardbench-manifest.json; benchmarks/output/guardbench-raw.json; package.json | Yes, `npm run bench:guard:check` passed on 2026-05-13 |
|
|
53
53
|
| E47 - GuardBench accepts external ESM adapters through `--adapter`, supports `default`, `adapter`, or `createGuardBenchAdapter()` exports, withholds `expectedDecision` and `requiredEvidence` during adapter execution, then scores adapter output against the same full-contract decision/evidence/redaction checks. | GuardBench external adapter contract | benchmarks/guardbench.js; tests/guardbench.test.js; package.json | Yes, `node scripts/run-vitest.mjs run tests/guardbench.test.js` passed on 2026-05-12 |
|
|
54
54
|
| E48 - Audrey ships a Mem0 Platform GuardBench adapter that uses the current Mem0 REST shape: V3 async memory add with event polling, V2 filtered memory search, and user-entity cleanup. It requires runtime `MEM0_API_KEY` and is not run by default. | First external-system GuardBench adapter | benchmarks/adapters/mem0-platform.mjs; tests/guardbench.test.js; README.md | Import/contract and mocked REST-flow tests passed on 2026-05-12; live Mem0 run not yet executed |
|
|
55
55
|
| E49 - GuardBench ships a credential-free example external adapter and a `bench:guard:adapter-smoke` script so the adapter loader can be exercised through the real CLI path without external credentials. | External adapter smoke path | benchmarks/adapters/example-allow.mjs; package.json; README.md; tests/guardbench.test.js | Yes, `npm run bench:guard:adapter-smoke` passed on 2026-05-12 |
|