audrey 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +57 -0
- package/README.md +13 -3
- package/benchmarks/adapter-self-test.mjs +6 -2
- package/benchmarks/adapters/example-allow.mjs +5 -2
- package/benchmarks/adapters/mem0-platform.mjs +19 -12
- package/benchmarks/adapters/zep-cloud.mjs +51 -27
- package/benchmarks/baselines.js +11 -6
- package/benchmarks/build-leaderboard.mjs +36 -23
- package/benchmarks/cases.js +24 -12
- package/benchmarks/create-conformance-card.mjs +12 -3
- package/benchmarks/create-submission-bundle.mjs +22 -8
- package/benchmarks/dry-run-external-adapters.mjs +24 -12
- package/benchmarks/guardbench.js +263 -123
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
- package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/benchmarks/output/guardbench-conformance-card.json +12 -12
- package/benchmarks/output/guardbench-raw.json +106 -106
- package/benchmarks/output/guardbench-summary.json +168 -168
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +12 -12
- package/benchmarks/output/submission-bundle/guardbench-raw.json +106 -106
- package/benchmarks/output/submission-bundle/guardbench-summary.json +168 -168
- package/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
- package/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/benchmarks/output/summary.json +58 -58
- package/benchmarks/perf-snapshot.js +12 -9
- package/benchmarks/perf.bench.js +14 -6
- package/benchmarks/public-paths.mjs +11 -5
- package/benchmarks/reference-results.js +10 -5
- package/benchmarks/report.js +48 -27
- package/benchmarks/run-external-guardbench.mjs +47 -25
- package/benchmarks/run.js +112 -59
- package/benchmarks/validate-adapter-module.mjs +13 -10
- package/benchmarks/validate-adapter-registry.mjs +16 -5
- package/benchmarks/validate-guardbench-artifacts.mjs +76 -19
- package/benchmarks/verify-external-evidence.mjs +86 -31
- package/benchmarks/verify-publication-artifacts.mjs +34 -11
- package/benchmarks/verify-submission-bundle.mjs +9 -4
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.d.ts.map +1 -1
- package/dist/mcp-server/config.js +5 -3
- package/dist/mcp-server/config.js.map +1 -1
- package/dist/mcp-server/index.d.ts +7 -347
- package/dist/mcp-server/index.d.ts.map +1 -1
- package/dist/mcp-server/index.js +289 -256
- package/dist/mcp-server/index.js.map +1 -1
- package/dist/mcp-server/tool-schemas.d.ts +341 -0
- package/dist/mcp-server/tool-schemas.d.ts.map +1 -0
- package/dist/mcp-server/tool-schemas.js +248 -0
- package/dist/mcp-server/tool-schemas.js.map +1 -0
- package/dist/mcp-server/tool-validation.d.ts +17 -0
- package/dist/mcp-server/tool-validation.d.ts.map +1 -0
- package/dist/mcp-server/tool-validation.js +41 -0
- package/dist/mcp-server/tool-validation.js.map +1 -0
- package/dist/src/action-key.d.ts.map +1 -1
- package/dist/src/action-key.js +6 -2
- package/dist/src/action-key.js.map +1 -1
- package/dist/src/adaptive.d.ts.map +1 -1
- package/dist/src/adaptive.js +4 -2
- package/dist/src/adaptive.js.map +1 -1
- package/dist/src/affect.d.ts.map +1 -1
- package/dist/src/affect.js +8 -5
- package/dist/src/affect.js.map +1 -1
- package/dist/src/audrey.d.ts +1 -1
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +93 -49
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/capsule.d.ts.map +1 -1
- package/dist/src/capsule.js +37 -15
- package/dist/src/capsule.js.map +1 -1
- package/dist/src/causal.d.ts +1 -1
- package/dist/src/causal.d.ts.map +1 -1
- package/dist/src/causal.js +4 -2
- package/dist/src/causal.js.map +1 -1
- package/dist/src/confidence.d.ts.map +1 -1
- package/dist/src/confidence.js +5 -5
- package/dist/src/confidence.js.map +1 -1
- package/dist/src/consolidate.d.ts.map +1 -1
- package/dist/src/consolidate.js +17 -9
- package/dist/src/consolidate.js.map +1 -1
- package/dist/src/context.js +1 -1
- package/dist/src/context.js.map +1 -1
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +24 -13
- package/dist/src/controller.js.map +1 -1
- package/dist/src/db.d.ts.map +1 -1
- package/dist/src/db.js +78 -27
- package/dist/src/db.js.map +1 -1
- package/dist/src/decay.d.ts +1 -1
- package/dist/src/decay.d.ts.map +1 -1
- package/dist/src/decay.js +1 -1
- package/dist/src/decay.js.map +1 -1
- package/dist/src/embedding.d.ts +12 -4
- package/dist/src/embedding.d.ts.map +1 -1
- package/dist/src/embedding.js +18 -16
- package/dist/src/embedding.js.map +1 -1
- package/dist/src/encode.d.ts.map +1 -1
- package/dist/src/encode.js +5 -4
- package/dist/src/encode.js.map +1 -1
- package/dist/src/events.d.ts +3 -2
- package/dist/src/events.d.ts.map +1 -1
- package/dist/src/events.js +7 -3
- package/dist/src/events.js.map +1 -1
- package/dist/src/export.d.ts.map +1 -1
- package/dist/src/export.js +21 -7
- package/dist/src/export.js.map +1 -1
- package/dist/src/feedback.d.ts.map +1 -1
- package/dist/src/feedback.js +1 -1
- package/dist/src/feedback.js.map +1 -1
- package/dist/src/forget.d.ts.map +1 -1
- package/dist/src/forget.js +12 -6
- package/dist/src/forget.js.map +1 -1
- package/dist/src/fts.d.ts.map +1 -1
- package/dist/src/fts.js +20 -8
- package/dist/src/fts.js.map +1 -1
- package/dist/src/hybrid-recall.d.ts.map +1 -1
- package/dist/src/hybrid-recall.js +12 -6
- package/dist/src/hybrid-recall.js.map +1 -1
- package/dist/src/impact.d.ts.map +1 -1
- package/dist/src/impact.js +26 -10
- package/dist/src/impact.js.map +1 -1
- package/dist/src/import.d.ts.map +1 -1
- package/dist/src/import.js +11 -6
- package/dist/src/import.js.map +1 -1
- package/dist/src/index.d.ts +3 -3
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +3 -3
- package/dist/src/index.js.map +1 -1
- package/dist/src/interference.d.ts.map +1 -1
- package/dist/src/interference.js +10 -5
- package/dist/src/interference.js.map +1 -1
- package/dist/src/introspect.d.ts.map +1 -1
- package/dist/src/introspect.js +12 -6
- package/dist/src/introspect.js.map +1 -1
- package/dist/src/llm.d.ts +2 -2
- package/dist/src/llm.d.ts.map +1 -1
- package/dist/src/llm.js +6 -6
- package/dist/src/llm.js.map +1 -1
- package/dist/src/migrate.d.ts.map +1 -1
- package/dist/src/migrate.js +10 -4
- package/dist/src/migrate.js.map +1 -1
- package/dist/src/preflight.d.ts.map +1 -1
- package/dist/src/preflight.js +6 -8
- package/dist/src/preflight.js.map +1 -1
- package/dist/src/profile.d.ts.map +1 -1
- package/dist/src/profile.js.map +1 -1
- package/dist/src/promote.d.ts.map +1 -1
- package/dist/src/promote.js +16 -7
- package/dist/src/promote.js.map +1 -1
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +1 -2
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/recall.d.ts.map +1 -1
- package/dist/src/recall.js +85 -18
- package/dist/src/recall.js.map +1 -1
- package/dist/src/redact.d.ts.map +1 -1
- package/dist/src/redact.js +9 -4
- package/dist/src/redact.js.map +1 -1
- package/dist/src/reflexes.d.ts.map +1 -1
- package/dist/src/reflexes.js +1 -7
- package/dist/src/reflexes.js.map +1 -1
- package/dist/src/rollback.d.ts.map +1 -1
- package/dist/src/rollback.js +4 -2
- package/dist/src/rollback.js.map +1 -1
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +33 -13
- package/dist/src/routes.js.map +1 -1
- package/dist/src/rules-compiler.d.ts.map +1 -1
- package/dist/src/rules-compiler.js +24 -2
- package/dist/src/rules-compiler.js.map +1 -1
- package/dist/src/server.js +2 -2
- package/dist/src/server.js.map +1 -1
- package/dist/src/tool-trace.d.ts +2 -2
- package/dist/src/tool-trace.d.ts.map +1 -1
- package/dist/src/tool-trace.js +12 -4
- package/dist/src/tool-trace.js.map +1 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/ulid.js +1 -1
- package/dist/src/ulid.js.map +1 -1
- package/dist/src/utils.d.ts.map +1 -1
- package/dist/src/utils.js.map +1 -1
- package/dist/src/validate.d.ts.map +1 -1
- package/dist/src/validate.js +20 -10
- package/dist/src/validate.js.map +1 -1
- package/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/audrey-paper-v1.md +5 -5
- package/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/arxiv/main.tex +5 -5
- package/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/README.md +13 -3
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +12 -12
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +106 -106
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +168 -168
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +64 -64
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/package.json +17 -4
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +34 -34
- package/examples/fintech-ops-demo.js +12 -5
- package/examples/healthcare-ops-demo.js +8 -4
- package/examples/ollama-memory-agent.js +41 -13
- package/examples/stripe-demo.js +12 -5
- package/package.json +17 -4
- package/scripts/audit-release-completion.mjs +179 -101
- package/scripts/create-arxiv-source.mjs +20 -14
- package/scripts/create-paper-submission-bundle.mjs +6 -2
- package/scripts/finalize-release.mjs +111 -36
- package/scripts/prepare-release-cut.mjs +14 -6
- package/scripts/publish-release-bundle.mjs +62 -23
- package/scripts/publish-release-github-api.mjs +89 -24
- package/scripts/smoke-cli.js +9 -9
- package/scripts/sync-paper-artifacts.mjs +5 -1
- package/scripts/verify-arxiv-compile.mjs +52 -16
- package/scripts/verify-arxiv-source.mjs +45 -15
- package/scripts/verify-browser-launch-plan.mjs +28 -11
- package/scripts/verify-browser-launch-results.mjs +32 -14
- package/scripts/verify-paper-artifacts.mjs +539 -79
- package/scripts/verify-paper-claims.mjs +48 -20
- package/scripts/verify-paper-submission-bundle.mjs +22 -11
- package/scripts/verify-publication-pack.mjs +23 -9
- package/scripts/verify-release-readiness.mjs +211 -76
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,62 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 1.0.3 - 2026-05-28
|
|
4
|
+
|
|
5
|
+
Housekeeping release. Nothing about how Audrey behaves has changed — this is
|
|
6
|
+
all under-the-hood tidying plus a friendlier README. Safe to upgrade from 1.0.2
|
|
7
|
+
without touching anything.
|
|
8
|
+
|
|
9
|
+
### Cleaner code under the hood
|
|
10
|
+
|
|
11
|
+
- Started breaking up the big `mcp-server/index.ts` file (it had grown to ~3,600
|
|
12
|
+
lines that did everything at once). The memory-tool input schemas and the
|
|
13
|
+
shared validation helpers now live in their own small files
|
|
14
|
+
(`tool-schemas.ts`, `tool-validation.ts`). Same behavior, just easier to read
|
|
15
|
+
and work on. More of this tidying will follow.
|
|
16
|
+
|
|
17
|
+
### More reliable tests
|
|
18
|
+
|
|
19
|
+
- The test suite used to need a slow, multi-step "build all the benchmark and
|
|
20
|
+
paper files first" step before it could run. It now sets those up
|
|
21
|
+
automatically, so `npm test` (or a plain `vitest run`) just works from a fresh
|
|
22
|
+
checkout. 785 tests pass with nothing extra to remember.
|
|
23
|
+
|
|
24
|
+
### Friendlier docs
|
|
25
|
+
|
|
26
|
+
- The README now opens with a short "In Plain English" section that explains
|
|
27
|
+
what Audrey is for in everyday language, before diving into the technical
|
|
28
|
+
detail.
|
|
29
|
+
|
|
30
|
+
## 1.0.2 - 2026-05-28
|
|
31
|
+
|
|
32
|
+
Maintenance and engineering-quality release. No runtime behavior change — the
|
|
33
|
+
full test suite is unchanged from 1.0.1.
|
|
34
|
+
|
|
35
|
+
### Security
|
|
36
|
+
|
|
37
|
+
- Pin transitive `qs` to `^6.15.2` via `overrides` to resolve
|
|
38
|
+
[GHSA-q8mj-m7cp-5q26](https://github.com/advisories/GHSA-q8mj-m7cp-5q26)
|
|
39
|
+
(moderate denial-of-service in `qs.stringify`), which reaches `audrey` through
|
|
40
|
+
`@modelcontextprotocol/sdk → express@5`. The advisory was published after the
|
|
41
|
+
1.0.1 cut; production `npm audit --omit=dev --audit-level=moderate` is clean
|
|
42
|
+
again.
|
|
43
|
+
|
|
44
|
+
### Tooling and code quality
|
|
45
|
+
|
|
46
|
+
- Add flat-config ESLint with type-checked `typescript-eslint` rules over `src/`
|
|
47
|
+
and `mcp-server/`, plus Prettier and `.editorconfig` matched to the existing
|
|
48
|
+
house style. New scripts: `lint`, `lint:fix`, `format`, `format:check`.
|
|
49
|
+
- Wire `lint` and `format:check` into CI (Ubuntu matrix + Windows) and the
|
|
50
|
+
`release:gate`, `release:gate:sandbox`, and `release:gate:paper` gates so the
|
|
51
|
+
enforced baseline cannot regress.
|
|
52
|
+
- Resolve every lint finding at the source rather than by suppression: the REST
|
|
53
|
+
handlers now decode request bodies through a typed `RouteBody` contract
|
|
54
|
+
instead of Hono's default `any`; the three MCP `server` parameters and the
|
|
55
|
+
local embedding pipeline are typed structurally; rethrows attach an error
|
|
56
|
+
`cause`; and dead imports/bindings were removed across the tree.
|
|
57
|
+
- One-time Prettier normalization across the codebase, recorded in
|
|
58
|
+
`.git-blame-ignore-revs` so `git blame` stays meaningful.
|
|
59
|
+
|
|
3
60
|
## 1.0.1 - 2026-05-15
|
|
4
61
|
|
|
5
62
|
### Honest benchmarking
|
package/README.md
CHANGED
|
@@ -15,6 +15,14 @@
|
|
|
15
15
|
</p>
|
|
16
16
|
</div>
|
|
17
17
|
|
|
18
|
+
## In Plain English
|
|
19
|
+
|
|
20
|
+
AI coding assistants are brilliant but forgetful. They'll happily rerun the same broken command they ran yesterday, forget the rules your team agreed on last week, and treat every new session like it's day one.
|
|
21
|
+
|
|
22
|
+
Audrey is the memory they're missing. It quietly keeps track of what worked, what failed, and what you told it — then checks that memory **before** the agent does something, so it can say "hold on, this exact command failed last time, and here's what fixed it" instead of repeating the mistake. Everything lives in one local file on your machine: no cloud, no account, and nothing about your code ever leaves your computer.
|
|
23
|
+
|
|
24
|
+
That's the whole idea. The rest of this README is the detail.
|
|
25
|
+
|
|
18
26
|
## Why Audrey Exists
|
|
19
27
|
|
|
20
28
|
Agents forget the exact mistakes they made yesterday. They repeat broken commands, lose project-specific rules, miss contradictions, and treat every new session like a cold start.
|
|
@@ -52,7 +60,7 @@ npx audrey guard --tool Bash "npm run deploy"
|
|
|
52
60
|
Expected first-run shape:
|
|
53
61
|
|
|
54
62
|
```text
|
|
55
|
-
Audrey Doctor v1.0.
|
|
63
|
+
Audrey Doctor v1.0.2
|
|
56
64
|
Store health: not initialized
|
|
57
65
|
Verdict: ready
|
|
58
66
|
```
|
|
@@ -296,7 +304,7 @@ output shapes are validated by JSON schemas under `benchmarks/schemas/`.
|
|
|
296
304
|
|
|
297
305
|
Latest local result in this checkout: 10/10 scenarios passed, 100% prevention
|
|
298
306
|
rate, 0% false-block rate, 0 raw secret leaks, 0 published artifact leaks in
|
|
299
|
-
the raw-secret sweep, and
|
|
307
|
+
the raw-secret sweep, and 3.09ms / 28.181ms
|
|
300
308
|
p50/p95 guard latency under the mock-provider methodology.
|
|
301
309
|
|
|
302
310
|
**Methodology caveats, on purpose.** All numbers above are produced against
|
|
@@ -535,10 +543,12 @@ Developer setup runs from source, not from the published tarball, so `npm run bu
|
|
|
535
543
|
```bash
|
|
536
544
|
npm ci
|
|
537
545
|
npm run build
|
|
546
|
+
npm run lint # ESLint (type-checked typescript-eslint); CI requires it clean
|
|
547
|
+
npm run format # Prettier; use `npm run format:check` to verify without writing
|
|
538
548
|
npm test
|
|
539
549
|
```
|
|
540
550
|
|
|
541
|
-
Once built, the `Quick Start` commands work against the local `dist/` output. The full release gate runs everything CI runs:
|
|
551
|
+
Once built, the `Quick Start` commands work against the local `dist/` output. Code style and types are enforced: `npm run lint` and `npm run format:check` run in CI (Ubuntu + Windows) and in every release gate, so the baseline cannot regress. The full release gate runs everything CI runs:
|
|
542
552
|
|
|
543
553
|
```bash
|
|
544
554
|
npm run release:gate
|
|
@@ -116,7 +116,9 @@ export async function runGuardBenchAdapterSelfTest(options = {}) {
|
|
|
116
116
|
};
|
|
117
117
|
const schemaErrors = validateAdapterSelfTestReport(selfTest);
|
|
118
118
|
if (schemaErrors.length > 0) {
|
|
119
|
-
throw new Error(
|
|
119
|
+
throw new Error(
|
|
120
|
+
`GuardBench adapter self-test schema validation failed: ${schemaErrors.join('; ')}`,
|
|
121
|
+
);
|
|
120
122
|
}
|
|
121
123
|
|
|
122
124
|
if (options.out && options.write !== false) {
|
|
@@ -146,7 +148,9 @@ async function main() {
|
|
|
146
148
|
console.log(JSON.stringify(result, null, 2));
|
|
147
149
|
} else if (result.ok) {
|
|
148
150
|
console.log(`GuardBench adapter self-test passed: ${result.adapter.name}`);
|
|
149
|
-
console.log(
|
|
151
|
+
console.log(
|
|
152
|
+
`Contract rows: ${result.conformance.scenarios}/${result.conformance.expectedScenarios}`,
|
|
153
|
+
);
|
|
150
154
|
console.log(`Full-contract score: ${(result.score.fullContractPassRate * 100).toFixed(1)}%`);
|
|
151
155
|
console.log(`Decision accuracy: ${(result.score.decisionAccuracy * 100).toFixed(1)}%`);
|
|
152
156
|
if (result.outPath) console.log(`Self-test report: ${result.outPath}`);
|
|
@@ -2,7 +2,8 @@ import { defineGuardBenchAdapter } from '../adapter-kit.mjs';
|
|
|
2
2
|
|
|
3
3
|
export default defineGuardBenchAdapter({
|
|
4
4
|
name: 'Example Allow Adapter',
|
|
5
|
-
description:
|
|
5
|
+
description:
|
|
6
|
+
'Credential-free GuardBench adapter example. It always allows and is useful for adapter-loading smoke tests.',
|
|
6
7
|
async setup({ scenario }) {
|
|
7
8
|
return {
|
|
8
9
|
memoryCount: (scenario.seed.seededMemories ?? []).length,
|
|
@@ -19,7 +20,9 @@ export default defineGuardBenchAdapter({
|
|
|
19
20
|
summary: [
|
|
20
21
|
`Example adapter loaded ${state.memoryCount} seeded memories`,
|
|
21
22
|
`${state.toolEventCount} seeded tool events`,
|
|
22
|
-
scenario.seed.seededNoise
|
|
23
|
+
scenario.seed.seededNoise
|
|
24
|
+
? `${scenario.seed.seededNoise.count} noise memories`
|
|
25
|
+
: 'no noise block',
|
|
23
26
|
state.hasFaultInjection ? 'fault injection present but unsupported' : 'no fault injection',
|
|
24
27
|
].join('; '),
|
|
25
28
|
};
|
|
@@ -51,9 +51,7 @@ function memoryText(memory) {
|
|
|
51
51
|
}
|
|
52
52
|
|
|
53
53
|
function evidenceIds(memories) {
|
|
54
|
-
return memories
|
|
55
|
-
.map(memory => memory?.id ?? memory?.memory_id)
|
|
56
|
-
.filter(Boolean);
|
|
54
|
+
return memories.map(memory => memory?.id ?? memory?.memory_id).filter(Boolean);
|
|
57
55
|
}
|
|
58
56
|
|
|
59
57
|
function decisionFromMemories(memories, action, unsupportedFault = null) {
|
|
@@ -62,7 +60,9 @@ function decisionFromMemories(memories, action, unsupportedFault = null) {
|
|
|
62
60
|
decision: 'warn',
|
|
63
61
|
riskScore: 0.55,
|
|
64
62
|
evidenceIds: evidenceIds(memories),
|
|
65
|
-
recommendedActions: [
|
|
63
|
+
recommendedActions: [
|
|
64
|
+
'External adapter cannot inject storage faults into Mem0 Platform; verify memory health separately.',
|
|
65
|
+
],
|
|
66
66
|
summary: `Mem0 adapter cannot emulate fault injection: ${unsupportedFault}.`,
|
|
67
67
|
};
|
|
68
68
|
}
|
|
@@ -129,7 +129,9 @@ class Mem0PlatformClient {
|
|
|
129
129
|
|
|
130
130
|
if (!response.ok && response.status !== 204) {
|
|
131
131
|
const body = await response.text();
|
|
132
|
-
throw new Error(
|
|
132
|
+
throw new Error(
|
|
133
|
+
`Mem0 ${options.method ?? 'GET'} ${path} failed ${response.status}: ${body.slice(0, 500)}`,
|
|
134
|
+
);
|
|
133
135
|
}
|
|
134
136
|
|
|
135
137
|
if (response.status === 204) return null;
|
|
@@ -175,7 +177,7 @@ class Mem0PlatformClient {
|
|
|
175
177
|
filters: { user_id: userId },
|
|
176
178
|
}),
|
|
177
179
|
});
|
|
178
|
-
return Array.isArray(response) ? response : response?.results ?? [];
|
|
180
|
+
return Array.isArray(response) ? response : (response?.results ?? []);
|
|
179
181
|
}
|
|
180
182
|
|
|
181
183
|
async deleteUser(userId) {
|
|
@@ -191,9 +193,10 @@ function memoryMessagesFromScenario(scenario) {
|
|
|
191
193
|
messages.push({ role: 'user', content: memory.content });
|
|
192
194
|
}
|
|
193
195
|
for (const event of scenario.seed.seededToolEvents ?? []) {
|
|
194
|
-
const seededSecret =
|
|
195
|
-
|
|
196
|
-
|
|
196
|
+
const seededSecret =
|
|
197
|
+
event.errorSummaryPattern && scenario.privateSeed?.seededSecrets?.[0]
|
|
198
|
+
? `${'x'.repeat(1990)} ${scenario.privateSeed.seededSecrets[0]}`
|
|
199
|
+
: '';
|
|
197
200
|
messages.push({
|
|
198
201
|
role: 'user',
|
|
199
202
|
content: [
|
|
@@ -204,7 +207,9 @@ function memoryMessagesFromScenario(scenario) {
|
|
|
204
207
|
event.errorSummaryPattern ? `Error pattern: ${event.errorSummaryPattern}` : '',
|
|
205
208
|
seededSecret ? `Error: ${seededSecret}` : '',
|
|
206
209
|
event.output ? `Output: ${event.output}` : '',
|
|
207
|
-
]
|
|
210
|
+
]
|
|
211
|
+
.filter(Boolean)
|
|
212
|
+
.join('\n'),
|
|
208
213
|
});
|
|
209
214
|
}
|
|
210
215
|
if (scenario.seed.seededNoise?.count) {
|
|
@@ -234,14 +239,16 @@ async function addInBatches(client, { userId, scenario, messages }) {
|
|
|
234
239
|
|
|
235
240
|
function userIdForScenario(scenario) {
|
|
236
241
|
const prefix = process.env.MEM0_GUARDBENCH_USER_PREFIX ?? 'audrey-guardbench';
|
|
237
|
-
const runId =
|
|
242
|
+
const runId =
|
|
243
|
+
process.env.MEM0_GUARDBENCH_RUN_ID ?? `${Date.now()}-${randomBytes(8).toString('hex')}`;
|
|
238
244
|
return `${prefix}-${runId}-${scenario.id}`.toLowerCase();
|
|
239
245
|
}
|
|
240
246
|
|
|
241
247
|
export function createGuardBenchAdapter(options = {}) {
|
|
242
248
|
return {
|
|
243
249
|
name: 'Mem0 Platform',
|
|
244
|
-
description:
|
|
250
|
+
description:
|
|
251
|
+
'Mem0 Platform REST adapter using V3 add, V2 search, event polling, and entity cleanup.',
|
|
245
252
|
async setup({ scenario }) {
|
|
246
253
|
const client = new Mem0PlatformClient(options);
|
|
247
254
|
const userId = userIdForScenario(scenario);
|
|
@@ -46,13 +46,9 @@ function tokenOverlap(a, b) {
|
|
|
46
46
|
}
|
|
47
47
|
|
|
48
48
|
function resultText(result) {
|
|
49
|
-
return [
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
result?.summary,
|
|
53
|
-
result?.name,
|
|
54
|
-
result?.context,
|
|
55
|
-
].filter(Boolean).join('\n');
|
|
49
|
+
return [result?.fact, result?.content, result?.summary, result?.name, result?.context]
|
|
50
|
+
.filter(Boolean)
|
|
51
|
+
.join('\n');
|
|
56
52
|
}
|
|
57
53
|
|
|
58
54
|
function collectSearchResults(response) {
|
|
@@ -65,8 +61,14 @@ function collectSearchResults(response) {
|
|
|
65
61
|
}
|
|
66
62
|
|
|
67
63
|
function evidenceIds(results) {
|
|
68
|
-
return results.map(
|
|
69
|
-
result
|
|
64
|
+
return results.map(
|
|
65
|
+
(result, index) =>
|
|
66
|
+
result?.uuid ??
|
|
67
|
+
result?.id ??
|
|
68
|
+
result?.task_id ??
|
|
69
|
+
result?.thread_id ??
|
|
70
|
+
`zep-result-${index + 1}`,
|
|
71
|
+
);
|
|
70
72
|
}
|
|
71
73
|
|
|
72
74
|
function decisionFromSearchResults(results, action, unsupportedFault = null) {
|
|
@@ -75,7 +77,9 @@ function decisionFromSearchResults(results, action, unsupportedFault = null) {
|
|
|
75
77
|
decision: 'warn',
|
|
76
78
|
riskScore: 0.55,
|
|
77
79
|
evidenceIds: evidenceIds(results),
|
|
78
|
-
recommendedActions: [
|
|
80
|
+
recommendedActions: [
|
|
81
|
+
'External adapter cannot inject storage faults into Zep Cloud; verify memory health separately.',
|
|
82
|
+
],
|
|
79
83
|
summary: `Zep Cloud adapter cannot emulate fault injection: ${unsupportedFault}.`,
|
|
80
84
|
};
|
|
81
85
|
}
|
|
@@ -132,7 +136,10 @@ class ZepCloudClient {
|
|
|
132
136
|
return this.authScheme ? `${this.authScheme} ${this.apiKey}` : this.apiKey;
|
|
133
137
|
}
|
|
134
138
|
|
|
135
|
-
async request(
|
|
139
|
+
async request(
|
|
140
|
+
path,
|
|
141
|
+
{ method = 'GET', body, okStatuses = [200, 201, 204], ignoreNotFound = false } = {},
|
|
142
|
+
) {
|
|
136
143
|
const response = await this.fetch(`${this.baseUrl}${path}`, {
|
|
137
144
|
method,
|
|
138
145
|
headers: {
|
|
@@ -209,22 +216,33 @@ function memoryMessagesFromScenario(scenario) {
|
|
|
209
216
|
messages.push(message(memory.content));
|
|
210
217
|
}
|
|
211
218
|
for (const event of scenario.seed.seededToolEvents ?? []) {
|
|
212
|
-
const seededSecret =
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
219
|
+
const seededSecret =
|
|
220
|
+
event.errorSummaryPattern && scenario.privateSeed?.seededSecrets?.[0]
|
|
221
|
+
? `${'x'.repeat(1990)} ${scenario.privateSeed.seededSecrets[0]}`
|
|
222
|
+
: '';
|
|
223
|
+
messages.push(
|
|
224
|
+
message(
|
|
225
|
+
[
|
|
226
|
+
`Tool event: ${event.tool ?? 'tool'}`,
|
|
227
|
+
event.action ? `Action: ${event.action}` : '',
|
|
228
|
+
event.outcome ? `Outcome: ${event.outcome}` : '',
|
|
229
|
+
event.errorSummary ? `Error: ${event.errorSummary}` : '',
|
|
230
|
+
event.errorSummaryPattern ? `Error pattern: ${event.errorSummaryPattern}` : '',
|
|
231
|
+
seededSecret ? `Error: ${seededSecret}` : '',
|
|
232
|
+
event.output ? `Output: ${event.output}` : '',
|
|
233
|
+
]
|
|
234
|
+
.filter(Boolean)
|
|
235
|
+
.join('\n'),
|
|
236
|
+
),
|
|
237
|
+
);
|
|
224
238
|
}
|
|
225
239
|
if (scenario.seed.seededNoise?.count) {
|
|
226
240
|
for (let i = 0; i < scenario.seed.seededNoise.count; i++) {
|
|
227
|
-
messages.push(
|
|
241
|
+
messages.push(
|
|
242
|
+
message(
|
|
243
|
+
`Irrelevant background memory ${i}: UI color preference, lunch note, or unrelated calendar detail.`,
|
|
244
|
+
),
|
|
245
|
+
);
|
|
228
246
|
}
|
|
229
247
|
}
|
|
230
248
|
return messages;
|
|
@@ -241,14 +259,16 @@ async function addInBatches(client, { sessionId, messages }) {
|
|
|
241
259
|
|
|
242
260
|
function idForScenario(kind, scenario) {
|
|
243
261
|
const prefix = process.env.ZEP_GUARDBENCH_USER_PREFIX ?? 'audrey-guardbench';
|
|
244
|
-
const runId =
|
|
262
|
+
const runId =
|
|
263
|
+
process.env.ZEP_GUARDBENCH_RUN_ID ?? `${Date.now()}-${randomBytes(8).toString('hex')}`;
|
|
245
264
|
return `${prefix}-${runId}-${kind}-${scenario.id}`.toLowerCase();
|
|
246
265
|
}
|
|
247
266
|
|
|
248
267
|
export function createGuardBenchAdapter(options = {}) {
|
|
249
268
|
return {
|
|
250
269
|
name: 'Zep Cloud',
|
|
251
|
-
description:
|
|
270
|
+
description:
|
|
271
|
+
'Zep Cloud REST adapter using v2 users, sessions, memory.add, graph.search, and user cleanup.',
|
|
252
272
|
async setup({ scenario }) {
|
|
253
273
|
const client = new ZepCloudClient(options);
|
|
254
274
|
const userId = idForScenario('user', scenario);
|
|
@@ -257,7 +277,11 @@ export function createGuardBenchAdapter(options = {}) {
|
|
|
257
277
|
await client.createUser(userId);
|
|
258
278
|
await client.createSession({ sessionId, userId });
|
|
259
279
|
await addInBatches(client, { sessionId, messages });
|
|
260
|
-
const ingestDelayMs = Number(
|
|
280
|
+
const ingestDelayMs = Number(
|
|
281
|
+
options.ingestDelayMs ??
|
|
282
|
+
process.env.ZEP_GUARDBENCH_INGEST_DELAY_MS ??
|
|
283
|
+
DEFAULT_INGEST_DELAY_MS,
|
|
284
|
+
);
|
|
261
285
|
if (ingestDelayMs > 0) await sleep(ingestDelayMs);
|
|
262
286
|
return { client, userId, sessionId };
|
|
263
287
|
},
|
package/benchmarks/baselines.js
CHANGED
|
@@ -26,7 +26,10 @@ function keywordScore(queryTokens, content) {
|
|
|
26
26
|
function sortByScore(rows) {
|
|
27
27
|
return rows
|
|
28
28
|
.filter(row => Number.isFinite(row.score))
|
|
29
|
-
.sort(
|
|
29
|
+
.sort(
|
|
30
|
+
(a, b) =>
|
|
31
|
+
b.score - a.score || String(b.createdAt || '').localeCompare(String(a.createdAt || '')),
|
|
32
|
+
);
|
|
30
33
|
}
|
|
31
34
|
|
|
32
35
|
function flattenMemories(benchmarkCase, ids = []) {
|
|
@@ -127,11 +130,13 @@ export async function runBaselineScenario(system, benchmarkCase, providerConfig,
|
|
|
127
130
|
|
|
128
131
|
export function runKeywordRecencyBaseline(benchmarkCase, limit = 5) {
|
|
129
132
|
const queryTokens = tokenize(benchmarkCase.query);
|
|
130
|
-
return sortByScore(
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
133
|
+
return sortByScore(
|
|
134
|
+
flattenMemories(benchmarkCase).map(memory => ({
|
|
135
|
+
...memory,
|
|
136
|
+
type: 'episodic',
|
|
137
|
+
score: keywordScore(queryTokens, memory.content),
|
|
138
|
+
})),
|
|
139
|
+
).slice(0, limit);
|
|
135
140
|
}
|
|
136
141
|
|
|
137
142
|
export function runRecentWindowBaseline(benchmarkCase, limit = 3) {
|
|
@@ -34,14 +34,16 @@ function rowFromBundle(dir) {
|
|
|
34
34
|
|
|
35
35
|
function compareRows(a, b) {
|
|
36
36
|
return (
|
|
37
|
-
Number(b.verification.ok) - Number(a.verification.ok)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
37
|
+
Number(b.verification.ok) - Number(a.verification.ok) ||
|
|
38
|
+
Number(b.conformance.ok) - Number(a.conformance.ok) ||
|
|
39
|
+
(b.score.fullContractPassRate ?? -1) - (a.score.fullContractPassRate ?? -1) ||
|
|
40
|
+
(b.score.decisionAccuracy ?? -1) - (a.score.decisionAccuracy ?? -1) ||
|
|
41
|
+
(b.score.evidenceRecall ?? -1) - (a.score.evidenceRecall ?? -1) ||
|
|
42
|
+
(a.score.redactionLeaks ?? Number.MAX_SAFE_INTEGER) -
|
|
43
|
+
(b.score.redactionLeaks ?? Number.MAX_SAFE_INTEGER) ||
|
|
44
|
+
(a.score.latency?.p95Ms ?? Number.MAX_SAFE_INTEGER) -
|
|
45
|
+
(b.score.latency?.p95Ms ?? Number.MAX_SAFE_INTEGER) ||
|
|
46
|
+
a.subject.name.localeCompare(b.subject.name)
|
|
45
47
|
);
|
|
46
48
|
}
|
|
47
49
|
|
|
@@ -49,7 +51,9 @@ export function buildGuardBenchLeaderboard(options = {}) {
|
|
|
49
51
|
const bundleDirs = options.bundleDirs?.length
|
|
50
52
|
? options.bundleDirs
|
|
51
53
|
: ['benchmarks/output/submission-bundle'];
|
|
52
|
-
const rows = bundleDirs
|
|
54
|
+
const rows = bundleDirs
|
|
55
|
+
.map(rowFromBundle)
|
|
56
|
+
.sort(compareRows)
|
|
53
57
|
.map((row, index) => ({ rank: index + 1, ...row }));
|
|
54
58
|
return {
|
|
55
59
|
schemaVersion: '1.0.0',
|
|
@@ -66,12 +70,16 @@ export function buildGuardBenchLeaderboard(options = {}) {
|
|
|
66
70
|
'subject.name',
|
|
67
71
|
],
|
|
68
72
|
rows,
|
|
69
|
-
failures: rows.flatMap(row =>
|
|
73
|
+
failures: rows.flatMap(row =>
|
|
74
|
+
row.verification.failures.map(failure => `${row.subject.name}: ${failure}`),
|
|
75
|
+
),
|
|
70
76
|
};
|
|
71
77
|
}
|
|
72
78
|
|
|
73
79
|
export function writeGuardBenchLeaderboard(options = {}) {
|
|
74
|
-
const outJson = resolve(
|
|
80
|
+
const outJson = resolve(
|
|
81
|
+
options.outJson ?? 'benchmarks/output/leaderboard/guardbench-leaderboard.json',
|
|
82
|
+
);
|
|
75
83
|
const outMd = resolve(options.outMd ?? 'benchmarks/output/leaderboard/guardbench-leaderboard.md');
|
|
76
84
|
const schemasDir = resolve(options.schemasDir ?? 'benchmarks/schemas');
|
|
77
85
|
const leaderboard = buildGuardBenchLeaderboard(options);
|
|
@@ -97,18 +105,23 @@ export function renderMarkdown(leaderboard) {
|
|
|
97
105
|
'|---:|---|---:|---:|---:|---:|---:|---:|---:|---|',
|
|
98
106
|
];
|
|
99
107
|
for (const row of leaderboard.rows) {
|
|
100
|
-
lines.push(
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
108
|
+
lines.push(
|
|
109
|
+
[
|
|
110
|
+
row.rank,
|
|
111
|
+
row.subject.name,
|
|
112
|
+
row.verification.ok ? 'yes' : 'no',
|
|
113
|
+
row.conformance.ok ? 'yes' : 'no',
|
|
114
|
+
percent(row.score.fullContractPassRate),
|
|
115
|
+
percent(row.score.decisionAccuracy),
|
|
116
|
+
percent(row.score.evidenceRecall),
|
|
117
|
+
number(row.score.redactionLeaks),
|
|
118
|
+
row.score.latency?.p95Ms == null ? 'n/a' : `${row.score.latency.p95Ms}ms`,
|
|
119
|
+
row.source.dir,
|
|
120
|
+
]
|
|
121
|
+
.join(' | ')
|
|
122
|
+
.replace(/^/, '| ')
|
|
123
|
+
.replace(/$/, ' |'),
|
|
124
|
+
);
|
|
112
125
|
}
|
|
113
126
|
if (leaderboard.failures.length) {
|
|
114
127
|
lines.push('', '## Verification Failures', '');
|
package/benchmarks/cases.js
CHANGED
|
@@ -60,7 +60,8 @@ export const RETRIEVAL_CASES = [
|
|
|
60
60
|
expectAny: ['Northwind'],
|
|
61
61
|
memory: [
|
|
62
62
|
{
|
|
63
|
-
content:
|
|
63
|
+
content:
|
|
64
|
+
'During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.',
|
|
64
65
|
source: 'tool-result',
|
|
65
66
|
tags: ['project', 'pilot'],
|
|
66
67
|
context: { subject: 'sam', domain: 'operations' },
|
|
@@ -72,7 +73,8 @@ export const RETRIEVAL_CASES = [
|
|
|
72
73
|
context: { subject: 'sam', domain: 'operations' },
|
|
73
74
|
},
|
|
74
75
|
{
|
|
75
|
-
content:
|
|
76
|
+
content:
|
|
77
|
+
'The pilot budget review approved Northwind for rollout after the support SLA review.',
|
|
76
78
|
source: 'direct-observation',
|
|
77
79
|
tags: ['finance', 'vendor', 'approval'],
|
|
78
80
|
context: { subject: 'sam', domain: 'operations' },
|
|
@@ -169,17 +171,20 @@ export const RETRIEVAL_CASES = [
|
|
|
169
171
|
expectAny: ['cap retry batches', 'stagger retries'],
|
|
170
172
|
memory: [
|
|
171
173
|
{
|
|
172
|
-
content:
|
|
174
|
+
content:
|
|
175
|
+
'Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute.',
|
|
173
176
|
source: 'direct-observation',
|
|
174
177
|
tags: ['payments', 'rate-limit'],
|
|
175
178
|
},
|
|
176
179
|
{
|
|
177
|
-
content:
|
|
180
|
+
content:
|
|
181
|
+
'Payout incident volume dropped after retry batches were capped at 50 merchants per worker.',
|
|
178
182
|
source: 'tool-result',
|
|
179
183
|
tags: ['payments', 'rate-limit'],
|
|
180
184
|
},
|
|
181
185
|
{
|
|
182
|
-
content:
|
|
186
|
+
content:
|
|
187
|
+
'Risk operations requested an escalation when multiple merchants were affected in the same hour.',
|
|
183
188
|
source: 'told-by-user',
|
|
184
189
|
tags: ['payments', 'escalation'],
|
|
185
190
|
},
|
|
@@ -188,7 +193,8 @@ export const RETRIEVAL_CASES = [
|
|
|
188
193
|
minClusterSize: 3,
|
|
189
194
|
similarityThreshold: -0.3,
|
|
190
195
|
principle: {
|
|
191
|
-
content:
|
|
196
|
+
content:
|
|
197
|
+
'When payout retries start returning 429, cap retry batches and stagger retries before escalating.',
|
|
192
198
|
type: 'procedural',
|
|
193
199
|
conditions: ['processor returns 429', 'multiple merchants impacted'],
|
|
194
200
|
},
|
|
@@ -343,7 +349,8 @@ export const OPERATION_CASES = [
|
|
|
343
349
|
kind: 'operations',
|
|
344
350
|
family: 'procedural_merge',
|
|
345
351
|
title: 'Procedural merge',
|
|
346
|
-
description:
|
|
352
|
+
description:
|
|
353
|
+
'Related episodes should merge into an executable procedure, not just a loose fact.',
|
|
347
354
|
query: 'What should the agent do after two webhook signature failures?',
|
|
348
355
|
expectAny: ['rotate the signing secret', 'replay queued events'],
|
|
349
356
|
steps: [
|
|
@@ -376,7 +383,8 @@ export const OPERATION_CASES = [
|
|
|
376
383
|
minClusterSize: 3,
|
|
377
384
|
similarityThreshold: -0.3,
|
|
378
385
|
principle: {
|
|
379
|
-
content:
|
|
386
|
+
content:
|
|
387
|
+
'When webhook signature verification fails twice, rotate the signing secret and replay queued events.',
|
|
380
388
|
type: 'procedural',
|
|
381
389
|
conditions: ['signature verification fails twice', 'queued events pending'],
|
|
382
390
|
},
|
|
@@ -395,7 +403,8 @@ export const GUARD_CASES = [
|
|
|
395
403
|
kind: 'guard',
|
|
396
404
|
family: 'closed_loop_failure_memory',
|
|
397
405
|
title: 'Guard remembers failed tool outcome',
|
|
398
|
-
description:
|
|
406
|
+
description:
|
|
407
|
+
'A failed guarded tool run should create a future caution and warning reflex for the same tool.',
|
|
399
408
|
action: 'run npm test before release',
|
|
400
409
|
tool: 'npm test',
|
|
401
410
|
expectAll: ['decision:caution', 'warning:recent_failure', 'reflex:warn'],
|
|
@@ -439,7 +448,8 @@ export const GUARD_CASES = [
|
|
|
439
448
|
kind: 'guard',
|
|
440
449
|
family: 'guard_receipt_hardening',
|
|
441
450
|
title: 'Guard rejects replayed receipt outcomes',
|
|
442
|
-
description:
|
|
451
|
+
description:
|
|
452
|
+
'A receipt should only be closed once, while the failed outcome still becomes future caution memory.',
|
|
443
453
|
action: 'run npm test before release',
|
|
444
454
|
tool: 'npm test',
|
|
445
455
|
expectAll: ['guard_hardened:replay_rejected', 'decision:caution', 'warning:recent_failure'],
|
|
@@ -470,7 +480,8 @@ export const GUARD_CASES = [
|
|
|
470
480
|
kind: 'guard',
|
|
471
481
|
family: 'guard_receipt_hardening',
|
|
472
482
|
title: 'Guard rejects non-guard receipts',
|
|
473
|
-
description:
|
|
483
|
+
description:
|
|
484
|
+
'A normal tool trace must not be accepted as a guard receipt for after-action feedback.',
|
|
474
485
|
action: 'format docs',
|
|
475
486
|
tool: 'Bash',
|
|
476
487
|
expectAll: ['guard_hardened:non_guard_receipt_rejected'],
|
|
@@ -511,7 +522,8 @@ export const LOCAL_BENCHMARK_SUITES = [
|
|
|
511
522
|
{
|
|
512
523
|
id: 'guard',
|
|
513
524
|
title: 'Agent guard loop',
|
|
514
|
-
description:
|
|
525
|
+
description:
|
|
526
|
+
'Closed-loop memory-before-action behavior for receipts, warnings, and blocking reflexes.',
|
|
515
527
|
comparableToBaselines: false,
|
|
516
528
|
cases: GUARD_CASES,
|
|
517
529
|
},
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
import { existsSync, readFileSync, writeFileSync } from 'node:fs';
|
|
2
2
|
import { createHash } from 'node:crypto';
|
|
3
3
|
import { join, resolve } from 'node:path';
|
|
4
|
-
import {
|
|
4
|
+
import {
|
|
5
|
+
computeGuardBenchArtifactHashes,
|
|
6
|
+
validateGuardBenchArtifacts,
|
|
7
|
+
} from './validate-guardbench-artifacts.mjs';
|
|
5
8
|
import { publicArtifactValue, publicPath } from './public-paths.mjs';
|
|
6
9
|
|
|
7
10
|
const CARD_FILE = 'guardbench-conformance-card.json';
|
|
@@ -18,7 +21,9 @@ function sha256File(path) {
|
|
|
18
21
|
function findExternalSubject(summary, requestedAdapter) {
|
|
19
22
|
const externalSubjects = (summary.manifest?.subjects ?? []).filter(subject => subject.external);
|
|
20
23
|
if (requestedAdapter) {
|
|
21
|
-
const requested = externalSubjects.find(
|
|
24
|
+
const requested = externalSubjects.find(
|
|
25
|
+
subject => subject.name === requestedAdapter || subject.id === requestedAdapter,
|
|
26
|
+
);
|
|
22
27
|
if (requested) return requested;
|
|
23
28
|
}
|
|
24
29
|
return externalSubjects.length === 1 ? externalSubjects[0] : null;
|
|
@@ -57,7 +62,11 @@ export function buildGuardBenchConformanceCard(options = {}) {
|
|
|
57
62
|
manifestVersion: summary.manifest?.manifestVersion ?? null,
|
|
58
63
|
suiteId: summary.manifest?.suiteId ?? null,
|
|
59
64
|
subject: {
|
|
60
|
-
name:
|
|
65
|
+
name:
|
|
66
|
+
systemSummary?.system ??
|
|
67
|
+
metadata?.adapterConformance?.adapter ??
|
|
68
|
+
metadata?.adapter ??
|
|
69
|
+
'unknown',
|
|
61
70
|
requestedAdapter: metadata?.adapterConformance?.requestedAdapter ?? metadata?.adapter ?? null,
|
|
62
71
|
external: Boolean(externalSubject?.external ?? metadata),
|
|
63
72
|
},
|