audrey 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/CHANGELOG.md +27 -0
  2. package/README.md +9 -1
  3. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
  4. package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  5. package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  6. package/benchmarks/output/guardbench-conformance-card.json +9 -9
  7. package/benchmarks/output/guardbench-raw.json +104 -103
  8. package/benchmarks/output/guardbench-summary.json +167 -165
  9. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  10. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  11. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +9 -9
  12. package/benchmarks/output/submission-bundle/guardbench-raw.json +104 -103
  13. package/benchmarks/output/submission-bundle/guardbench-summary.json +167 -165
  14. package/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
  15. package/benchmarks/output/submission-bundle/validation-report.json +1 -1
  16. package/benchmarks/output/summary.json +48 -48
  17. package/dist/mcp-server/config.d.ts +1 -1
  18. package/dist/mcp-server/config.js +1 -1
  19. package/dist/mcp-server/index.d.ts +3 -344
  20. package/dist/mcp-server/index.d.ts.map +1 -1
  21. package/dist/mcp-server/index.js +6 -280
  22. package/dist/mcp-server/index.js.map +1 -1
  23. package/dist/mcp-server/tool-schemas.d.ts +341 -0
  24. package/dist/mcp-server/tool-schemas.d.ts.map +1 -0
  25. package/dist/mcp-server/tool-schemas.js +248 -0
  26. package/dist/mcp-server/tool-schemas.js.map +1 -0
  27. package/dist/mcp-server/tool-validation.d.ts +17 -0
  28. package/dist/mcp-server/tool-validation.d.ts.map +1 -0
  29. package/dist/mcp-server/tool-validation.js +41 -0
  30. package/dist/mcp-server/tool-validation.js.map +1 -0
  31. package/docs/paper/07-evaluation.md +6 -6
  32. package/docs/paper/audrey-paper-v1.md +6 -6
  33. package/docs/paper/evidence-ledger.md +1 -1
  34. package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  35. package/docs/paper/output/arxiv/main.tex +6 -6
  36. package/docs/paper/output/arxiv-compile-report.json +3 -3
  37. package/docs/paper/output/submission-bundle/README.md +9 -1
  38. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
  39. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  40. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  41. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +9 -9
  42. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +104 -103
  43. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +167 -165
  44. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  45. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  46. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
  47. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
  48. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +60 -60
  49. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +6 -6
  50. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +6 -6
  51. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
  52. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  53. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +6 -6
  54. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
  55. package/docs/paper/output/submission-bundle/package.json +1 -1
  56. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +34 -34
  57. package/package.json +1 -1
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tool-validation.js","sourceRoot":"","sources":["../../mcp-server/tool-validation.ts"],"names":[],"mappings":"AAOA,MAAM,CAAC,MAAM,aAAa,GAAG;IAC3B,oBAAoB;IACpB,cAAc;IACd,aAAa;IACb,WAAW;IACX,iBAAiB;CACT,CAAC;AAEX,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,UAAU,EAAE,UAAU,EAAE,YAAY,CAAU,CAAC;AAE3E,MAAM,CAAC,MAAM,yBAAyB,GAAG,MAAM,CAAC;AAChD,MAAM,CAAC,MAAM,eAAe,GAAG,2BAA2B,CAAC;AAE3D,MAAM,UAAU,cAAc,CAAC,KAAc;IAC3C,OAAO,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC;AAC9D,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,OAAe;IACnD,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,EAAE,CAAC;QAC7B,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;IACxD,CAAC;IACD,IAAI,OAAO,CAAC,MAAM,GAAG,yBAAyB,EAAE,CAAC;QAC/C,MAAM,IAAI,KAAK,CAAC,qCAAqC,yBAAyB,aAAa,CAAC,CAAC;IAC/F,CAAC;AACH,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,EAAW,EAAE,KAAc;IACjE,IAAI,CAAC,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACrC,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;IACxD,CAAC;AACH,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,MAA0C,OAAO,CAAC,GAAG;IAErD,MAAM,KAAK,GAAG,GAAG,CAAC,eAAe,CAAC,EAAE,WAAW,EAAE,CAAC;IAClD,OAAO,KAAK,KAAK,GAAG,IAAI,KAAK,KAAK,MAAM,IAAI,KAAK,KAAK,KAAK,CAAC;AAC9D,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,MAA0C,OAAO,CAAC,GAAG;IACrF,IAAI,CAAC,mBAAmB,CAAC,GAAG,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CACb,wCAAwC,eAAe,qDAAqD,CAC7G,CAAC;IACJ,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,2BAA2B,CAAC,QAA2B;IAC3E,IAAI,QAAQ,IAAI,OAAO,QAAQ,CAAC,KAAK,KAAK,UAAU,EAAE,CAAC;QACrD,MAAM,QAAQ,CAAC,KAAK,EAAE,CAAC;IACzB,CAAC;AACH,CAAC"}
@@ -24,14 +24,14 @@ These numbers measure Audrey's local call path under an in-process mock embeddin
24
24
 
25
25
  ## Behavioral Regression Result
26
26
 
27
- The current `benchmarks/output/summary.json` was generated on 2026-05-29T03:45:32.997Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
27
+ The current `benchmarks/output/summary.json` was generated on 2026-05-29T13:33:19.790Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
28
28
 
29
29
  | System | Score Percent | Pass Rate | Average Duration Ms |
30
30
  |---|---:|---:|---:|
31
- | Audrey | 100 | 100 | 15.416666666666666 |
32
- | Vector Only | 41.66666666666667 | 25 | 0.3333333333333333 |
33
- | Keyword + Recency | 41.66666666666667 | 25 | 0.6666666666666666 |
34
- | Recent Window | 37.5 | 25 | 0 |
31
+ | Audrey | 100 | 100 | 14.75 |
32
+ | Vector Only | 41.66666666666667 | 25 | 0.4166666666666667 |
33
+ | Keyword + Recency | 41.66666666666667 | 25 | 0.5833333333333334 |
34
+ | Recent Window | 37.5 | 25 | 0.08333333333333333 |
35
35
 
36
36
  This output is a regression-gate result. The baselines are toy local baselines used to catch retrieval and lifecycle regressions in the Audrey codebase. They are not external systems, not tuned competitor implementations, and not GuardBench baselines (Ledger: E23-E24). The current suite covers retrieval and operation families such as information extraction, knowledge updates, multi-session reasoning, conflict resolution, procedural learning, privacy boundary, overwrite, delete-and-abstain, semantic merge, and procedural merge (Ledger: E23-E24).
37
37
 
@@ -55,7 +55,7 @@ It reports local adapters only, not external-system comparisons (Ledger: E46):
55
55
  | Evidence recall | 100% |
56
56
  | Redaction leaks | 0 |
57
57
  | Recall-degradation detection | 100% |
58
- | Guard latency p50 / p95 | 2.916 ms / 21.17 ms |
58
+ | Guard latency p50 / p95 | 3.09 ms / 28.181 ms |
59
59
  | Published artifact raw-secret leaks | 0 |
60
60
  | Audrey Guard decision accuracy | 100% |
61
61
  | No-memory decision accuracy | 10% |
@@ -895,14 +895,14 @@ These numbers measure Audrey's local call path under an in-process mock embeddin
895
895
 
896
896
  ### Behavioral Regression Result
897
897
 
898
- The current `benchmarks/output/summary.json` was generated on 2026-05-29T03:45:32.997Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
898
+ The current `benchmarks/output/summary.json` was generated on 2026-05-29T13:33:19.790Z with command `node benchmarks/run.js --provider mock --dimensions 64` (Ledger: E24). It reports:
899
899
 
900
900
  | System | Score Percent | Pass Rate | Average Duration Ms |
901
901
  |---|---:|---:|---:|
902
- | Audrey | 100 | 100 | 15.416666666666666 |
903
- | Vector Only | 41.66666666666667 | 25 | 0.3333333333333333 |
904
- | Keyword + Recency | 41.66666666666667 | 25 | 0.6666666666666666 |
905
- | Recent Window | 37.5 | 25 | 0 |
902
+ | Audrey | 100 | 100 | 14.75 |
903
+ | Vector Only | 41.66666666666667 | 25 | 0.4166666666666667 |
904
+ | Keyword + Recency | 41.66666666666667 | 25 | 0.5833333333333334 |
905
+ | Recent Window | 37.5 | 25 | 0.08333333333333333 |
906
906
 
907
907
  This output is a regression-gate result. The baselines are toy local baselines used to catch retrieval and lifecycle regressions in the Audrey codebase. They are not external systems, not tuned competitor implementations, and not GuardBench baselines (Ledger: E23-E24). The current suite covers retrieval and operation families such as information extraction, knowledge updates, multi-session reasoning, conflict resolution, procedural learning, privacy boundary, overwrite, delete-and-abstain, semantic merge, and procedural merge (Ledger: E23-E24).
908
908
 
@@ -924,7 +924,7 @@ It reports local adapters only, not external-system comparisons (Ledger: E46):
924
924
  | Evidence recall | 100% |
925
925
  | Redaction leaks | 0 |
926
926
  | Recall-degradation detection | 100% |
927
- | Guard latency p50 / p95 | 2.916 ms / 21.17 ms |
927
+ | Guard latency p50 / p95 | 3.09 ms / 28.181 ms |
928
928
  | Published artifact raw-secret leaks | 0 |
929
929
  | Audrey Guard decision accuracy | 100% |
930
930
  | No-memory decision accuracy | 10% |
@@ -49,7 +49,7 @@ Every implementation claim in the paper should point to one or more ledger IDs i
49
49
  | E43 - Audrey exposes a Claude Code hook generator, guarded settings apply path, and hook-mode Guard command: `hook-config claude-code` emits hooks, `hook-config claude-code --apply --scope project|user` merges them into Claude Code settings with backup/idempotence, `guard --hook --fail-on-warn` consumes PreToolUse JSON and returns `hookSpecificOutput.permissionDecision`, and `observe-tool` records post-tool events. Codex hook wiring remains pending on a stable host hook surface. | Hook integration boundary | README.md; mcp-server/index.ts; tests/mcp-server.test.js | Yes, focused Vitest and CLI hook smoke passed on 2026-05-12 |
50
50
  | E44 - Audrey preflight events now persist `preflight_evidence_ids` and `audrey_guard_action_key`; `memory_validate` accepts optional `preflight_event_id`, action key, and evidence ids, persists them on the validation audit event, and rejects validation lineage when the memory id was not evidence for that preflight. | Validation lineage implementation | src/action-key.ts; src/controller.ts; src/preflight.ts; src/audrey.ts; mcp-server/index.ts; tests/controller.test.js | Yes, focused Vitest passed on 2026-05-12 |
51
51
  | E45 - Preflight risk scoring uses a fixed severity map (`info=0.1`, `low=0.25`, `medium=0.55`, `high=0.85`), sorts warnings by severity, and strict mode blocks on high-severity warnings; the scoring path does not consume validation feedback. | Fixed risk scoring boundary | src/preflight.ts:6-60,291-299,332-338; src/feedback.ts:3-18,70-163 | Yes, 2026-05-08 |
52
- | E46 - `benchmarks/guardbench.js` runs ten local comparative GuardBench scenarios across Audrey Guard, no-memory, recent-window, vector-only, and FTS-only adapters and writes `benchmarks/output/guardbench-summary.json`, `benchmarks/output/guardbench-manifest.json`, and `benchmarks/output/guardbench-raw.json`; the latest local run has Audrey Guard passing 10/10 scenarios with 100% prevention rate, 0% false-block rate, 100% evidence recall, zero decision-output redaction leaks, zero published artifact raw-secret leaks, 100% recall-degradation detection, 100% decision accuracy, and 2.916ms/21.17ms p50/p95 guard latency under the mock-provider methodology. Baseline decision accuracy was no-memory 10%, recent-window 60%, vector-only 40%, and FTS-only 10%, with 0% full-contract pass rate for each baseline. | GuardBench local comparative results | benchmarks/guardbench.js; benchmarks/output/guardbench-summary.json; benchmarks/output/guardbench-manifest.json; benchmarks/output/guardbench-raw.json; package.json | Yes, `npm run bench:guard:check` passed on 2026-05-13 |
52
+ | E46 - `benchmarks/guardbench.js` runs ten local comparative GuardBench scenarios across Audrey Guard, no-memory, recent-window, vector-only, and FTS-only adapters and writes `benchmarks/output/guardbench-summary.json`, `benchmarks/output/guardbench-manifest.json`, and `benchmarks/output/guardbench-raw.json`; the latest local run has Audrey Guard passing 10/10 scenarios with 100% prevention rate, 0% false-block rate, 100% evidence recall, zero decision-output redaction leaks, zero published artifact raw-secret leaks, 100% recall-degradation detection, 100% decision accuracy, and 3.09ms/28.181ms p50/p95 guard latency under the mock-provider methodology. Baseline decision accuracy was no-memory 10%, recent-window 60%, vector-only 40%, and FTS-only 10%, with 0% full-contract pass rate for each baseline. | GuardBench local comparative results | benchmarks/guardbench.js; benchmarks/output/guardbench-summary.json; benchmarks/output/guardbench-manifest.json; benchmarks/output/guardbench-raw.json; package.json | Yes, `npm run bench:guard:check` passed on 2026-05-13 |
53
53
  | E47 - GuardBench accepts external ESM adapters through `--adapter`, supports `default`, `adapter`, or `createGuardBenchAdapter()` exports, withholds `expectedDecision` and `requiredEvidence` during adapter execution, then scores adapter output against the same full-contract decision/evidence/redaction checks. | GuardBench external adapter contract | benchmarks/guardbench.js; tests/guardbench.test.js; package.json | Yes, `node scripts/run-vitest.mjs run tests/guardbench.test.js` passed on 2026-05-12 |
54
54
  | E48 - Audrey ships a Mem0 Platform GuardBench adapter that uses the current Mem0 REST shape: V3 async memory add with event polling, V2 filtered memory search, and user-entity cleanup. It requires runtime `MEM0_API_KEY` and is not run by default. | First external-system GuardBench adapter | benchmarks/adapters/mem0-platform.mjs; tests/guardbench.test.js; README.md | Import/contract and mocked REST-flow tests passed on 2026-05-12; live Mem0 run not yet executed |
55
55
  | E49 - GuardBench ships a credential-free example external adapter and a `bench:guard:adapter-smoke` script so the adapter loader can be exercised through the real CLI path without external credentials. | External adapter smoke path | benchmarks/adapters/example-allow.mjs; package.json; README.md; tests/guardbench.test.js | Yes, `npm run bench:guard:adapter-smoke` passed on 2026-05-12 |
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "Audrey arXiv source package",
4
- "generatedAt": "2026-05-29T03:45:42.097Z",
4
+ "generatedAt": "2026-05-29T13:33:28.366Z",
5
5
  "sourceMarkdown": "docs/paper/audrey-paper-v1.md",
6
6
  "publicationPack": "docs/paper/publication-pack.json",
7
7
  "sourceHashes": {
8
- "sourceMarkdown": "f2afc1cda24b1ba91cf39429e7836ead97c7a1c815b294b3d623bb20d8f5e7e6",
8
+ "sourceMarkdown": "69410d651699e903d475b8dd7f2a23bf824143ca16061f66028c817bab673789",
9
9
  "publicationPack": "a1a523d5938faea72be568b843ac3890e61cea6070b0cfa46acf22ad3d2fb974",
10
10
  "referencesBib": "c0bfcaf7bfe37d6933c812e46352be8a95397eaa430a0f5bc94037600a53f654"
11
11
  },
@@ -13,8 +13,8 @@
13
13
  {
14
14
  "path": "main.tex",
15
15
  "source": "docs/paper/audrey-paper-v1.md",
16
- "bytes": 122667,
17
- "sha256": "e3ee98ea8c523e8f394b8fbbc73e206f0d6126b7349df5669206d4d48d9feea6"
16
+ "bytes": 122672,
17
+ "sha256": "d428d54f4b8de3bf0bba45b5e8c5b612dcf595e13770895300fa2581e3d8ff39"
18
18
  },
19
19
  {
20
20
  "path": "references.bib",
@@ -729,14 +729,14 @@ These numbers measure Audrey's local call path under an in-process mock embeddin
729
729
 
730
730
  \subsection{Behavioral Regression Result}
731
731
 
732
- The current \texttt{benchmarks/output/summary.json} was generated on 2026-05-29T03:45:32.997Z with command \texttt{node benchmarks/run.js --provider mock --dimensions 64} (Ledger: E24). It reports:
732
+ The current \texttt{benchmarks/output/summary.json} was generated on 2026-05-29T13:33:19.790Z with command \texttt{node benchmarks/run.js --provider mock --dimensions 64} (Ledger: E24). It reports:
733
733
 
734
734
  \begin{verbatim}
735
735
  | System | Score Percent | Pass Rate | Average Duration Ms |
736
- | Audrey | 100 | 100 | 15.416666666666666 |
737
- | Vector Only | 41.66666666666667 | 25 | 0.3333333333333333 |
738
- | Keyword + Recency | 41.66666666666667 | 25 | 0.6666666666666666 |
739
- | Recent Window | 37.5 | 25 | 0 |
736
+ | Audrey | 100 | 100 | 14.75 |
737
+ | Vector Only | 41.66666666666667 | 25 | 0.4166666666666667 |
738
+ | Keyword + Recency | 41.66666666666667 | 25 | 0.5833333333333334 |
739
+ | Recent Window | 37.5 | 25 | 0.08333333333333333 |
740
740
  \end{verbatim}
741
741
 
742
742
  This output is a regression-gate result. The baselines are toy local baselines used to catch retrieval and lifecycle regressions in the Audrey codebase. They are not external systems, not tuned competitor implementations, and not GuardBench baselines (Ledger: E23-E24). The current suite covers retrieval and operation families such as information extraction, knowledge updates, multi-session reasoning, conflict resolution, procedural learning, privacy boundary, overwrite, delete-and-abstain, semantic merge, and procedural merge (Ledger: E23-E24).
@@ -759,7 +759,7 @@ It reports local adapters only, not external-system comparisons (Ledger: E46):
759
759
  | Evidence recall | 100% |
760
760
  | Redaction leaks | 0 |
761
761
  | Recall-degradation detection | 100% |
762
- | Guard latency p50 / p95 | 2.916 ms / 21.17 ms |
762
+ | Guard latency p50 / p95 | 3.09 ms / 28.181 ms |
763
763
  | Published artifact raw-secret leaks | 0 |
764
764
  | Audrey Guard decision accuracy | 100% |
765
765
  | No-memory decision accuracy | 10% |
@@ -1,13 +1,13 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "Audrey arXiv compile check",
4
- "generatedAt": "2026-05-29T03:45:42.412Z",
4
+ "generatedAt": "2026-05-29T13:33:29.968Z",
5
5
  "source": {
6
6
  "sourceDir": "docs/paper/output/arxiv",
7
7
  "manifest": "docs/paper/output/arxiv/arxiv-manifest.json",
8
- "manifestSha256": "6364c368755a4188d5b2deea9e3fe80201f5fa55658e6ae570141e54c5293bc6",
8
+ "manifestSha256": "cecbb545033a04a0dcc30e4de9cfaa016f27c95c724db61e3825bd3765fbd2c5",
9
9
  "mainTex": "docs/paper/output/arxiv/main.tex",
10
- "mainTexSha256": "e3ee98ea8c523e8f394b8fbbc73e206f0d6126b7349df5669206d4d48d9feea6",
10
+ "mainTexSha256": "d428d54f4b8de3bf0bba45b5e8c5b612dcf595e13770895300fa2581e3d8ff39",
11
11
  "referencesBib": "docs/paper/output/arxiv/references.bib",
12
12
  "referencesBibSha256": "c0bfcaf7bfe37d6933c812e46352be8a95397eaa430a0f5bc94037600a53f654"
13
13
  },
@@ -15,6 +15,14 @@
15
15
  </p>
16
16
  </div>
17
17
 
18
+ ## In Plain English
19
+
20
+ AI coding assistants are brilliant but forgetful. They'll happily rerun the same broken command they ran yesterday, forget the rules your team agreed on last week, and treat every new session like it's day one.
21
+
22
+ Audrey is the memory they're missing. It quietly keeps track of what worked, what failed, and what you told it — then checks that memory **before** the agent does something, so it can say "hold on, this exact command failed last time, and here's what fixed it" instead of repeating the mistake. Everything lives in one local file on your machine: no cloud, no account, and nothing about your code ever leaves your computer.
23
+
24
+ That's the whole idea. The rest of this README is the detail.
25
+
18
26
  ## Why Audrey Exists
19
27
 
20
28
  Agents forget the exact mistakes they made yesterday. They repeat broken commands, lose project-specific rules, miss contradictions, and treat every new session like a cold start.
@@ -296,7 +304,7 @@ output shapes are validated by JSON schemas under `benchmarks/schemas/`.
296
304
 
297
305
  Latest local result in this checkout: 10/10 scenarios passed, 100% prevention
298
306
  rate, 0% false-block rate, 0 raw secret leaks, 0 published artifact leaks in
299
- the raw-secret sweep, and 2.916ms / 21.17ms
307
+ the raw-secret sweep, and 3.09ms / 28.181ms
300
308
  p50/p95 guard latency under the mock-provider methodology.
301
309
 
302
310
  **Methodology caveats, on purpose.** All numbers above are produced against
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "GuardBench adapter self-test",
4
- "generatedAt": "2026-05-29T03:45:40.969Z",
4
+ "generatedAt": "2026-05-29T13:33:27.293Z",
5
5
  "ok": true,
6
6
  "adapter": {
7
7
  "name": "Example Allow Adapter",
@@ -27,9 +27,9 @@
27
27
  "evidenceRecall": 0.1,
28
28
  "redactionLeaks": 0,
29
29
  "latency": {
30
- "p50Ms": 0.01,
31
- "p95Ms": 0.043,
32
- "maxMs": 0.043
30
+ "p50Ms": 0.012,
31
+ "p95Ms": 0.042,
32
+ "maxMs": 0.042
33
33
  }
34
34
  },
35
35
  "contract": {
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "GuardBench external adapter dry-run matrix",
4
- "generatedAt": "2026-05-29T03:45:41.522Z",
4
+ "generatedAt": "2026-05-29T13:33:27.818Z",
5
5
  "ok": true,
6
6
  "registry": "benchmarks/adapters/registry.json",
7
7
  "outRoot": "benchmarks/output/external",
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "GuardBench external evidence verification",
4
- "generatedAt": "2026-05-29T03:45:41.794Z",
4
+ "generatedAt": "2026-05-29T13:33:28.076Z",
5
5
  "ok": true,
6
6
  "allowPending": true,
7
7
  "registry": "benchmarks/adapters/registry.json",
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "schemaVersion": "1.0.0",
3
3
  "suite": "GuardBench conformance card",
4
- "generatedAt": "2026-05-29T03:45:36.958Z",
4
+ "generatedAt": "2026-05-29T13:33:23.522Z",
5
5
  "sourceDir": "benchmarks/output",
6
6
  "manifestVersion": "0.2.0",
7
7
  "suiteId": "guardbench-local-comparative",
@@ -25,9 +25,9 @@
25
25
  "evidenceRecall": 1,
26
26
  "redactionLeaks": 0,
27
27
  "latency": {
28
- "p50Ms": 2.916,
29
- "p95Ms": 21.17,
30
- "maxMs": 21.17
28
+ "p50Ms": 3.09,
29
+ "p95Ms": 28.181,
30
+ "maxMs": 28.181
31
31
  }
32
32
  },
33
33
  "conformance": {
@@ -39,21 +39,21 @@
39
39
  "integrity": {
40
40
  "artifactHashes": {
41
41
  "guardbench-manifest.json": "57636ce19fdaa6e50fc3fc961d9e499a9f43632f588c713a9fefe8e8a6fa724c",
42
- "guardbench-summary.json": "e8669cd6c80dc3dc849b3c4fcc473ea706eb3a760bced69682d0dc2396b2e233",
43
- "guardbench-raw.json": "15b39fd1a65709a89455fbfcaf815daf364b204fa526d5065cc12fcaed281d28"
42
+ "guardbench-summary.json": "91f264dd889e2c639a6fc6d1b867bc228b94c84ed5120345e23dddb79c11ee74",
43
+ "guardbench-raw.json": "66d4b69087258638f3572a40e1fd59bb84067034f899eaa2c27eed2dde554b2b"
44
44
  },
45
45
  "externalRunMetadataHash": null
46
46
  },
47
47
  "provenance": {
48
- "generatedAt": "2026-05-29T03:45:36.607Z",
49
- "gitSha": "ceed2f51b615175c8bb412b96b5e5a501561189f",
48
+ "generatedAt": "2026-05-29T13:33:23.189Z",
49
+ "gitSha": "9f771bae94f5ce4cfd5d5425e300a6a440c833d2",
50
50
  "gitDirty": false,
51
51
  "node": "v24.16.0",
52
52
  "v8": "13.6.233.17-node.49",
53
53
  "platform": "linux",
54
54
  "arch": "x64",
55
55
  "osRelease": "6.17.0-1015-azure",
56
- "cpuModel": "AMD EPYC 9V74 80-Core Processor",
56
+ "cpuModel": "Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz",
57
57
  "cpuCount": 4,
58
58
  "totalMemoryGb": 15.61,
59
59
  "embeddingProvider": "mock",