npm - audrey - Versions diffs - 1.0.1 → 1.0.3 - Mend

audrey 1.0.1 → 1.0.3

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (235)

package/CHANGELOG.md +57 -0
package/README.md +13 -3
package/benchmarks/adapter-self-test.mjs +6 -2
package/benchmarks/adapters/example-allow.mjs +5 -2
package/benchmarks/adapters/mem0-platform.mjs +19 -12
package/benchmarks/adapters/zep-cloud.mjs +51 -27
package/benchmarks/baselines.js +11 -6
package/benchmarks/build-leaderboard.mjs +36 -23
package/benchmarks/cases.js +24 -12
package/benchmarks/create-conformance-card.mjs +12 -3
package/benchmarks/create-submission-bundle.mjs +22 -8
package/benchmarks/dry-run-external-adapters.mjs +24 -12
package/benchmarks/guardbench.js +263 -123
package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
package/benchmarks/output/guardbench-conformance-card.json +12 -12
package/benchmarks/output/guardbench-raw.json +106 -106
package/benchmarks/output/guardbench-summary.json +168 -168
package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +12 -12
package/benchmarks/output/submission-bundle/guardbench-raw.json +106 -106
package/benchmarks/output/submission-bundle/guardbench-summary.json +168 -168
package/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
package/benchmarks/output/submission-bundle/validation-report.json +1 -1
package/benchmarks/output/summary.json +58 -58
package/benchmarks/perf-snapshot.js +12 -9
package/benchmarks/perf.bench.js +14 -6
package/benchmarks/public-paths.mjs +11 -5
package/benchmarks/reference-results.js +10 -5
package/benchmarks/report.js +48 -27
package/benchmarks/run-external-guardbench.mjs +47 -25
package/benchmarks/run.js +112 -59
package/benchmarks/validate-adapter-module.mjs +13 -10
package/benchmarks/validate-adapter-registry.mjs +16 -5
package/benchmarks/validate-guardbench-artifacts.mjs +76 -19
package/benchmarks/verify-external-evidence.mjs +86 -31
package/benchmarks/verify-publication-artifacts.mjs +34 -11
package/benchmarks/verify-submission-bundle.mjs +9 -4
package/dist/mcp-server/config.d.ts +1 -1
package/dist/mcp-server/config.d.ts.map +1 -1
package/dist/mcp-server/config.js +5 -3
package/dist/mcp-server/config.js.map +1 -1
package/dist/mcp-server/index.d.ts +7 -347
package/dist/mcp-server/index.d.ts.map +1 -1
package/dist/mcp-server/index.js +289 -256
package/dist/mcp-server/index.js.map +1 -1
package/dist/mcp-server/tool-schemas.d.ts +341 -0
package/dist/mcp-server/tool-schemas.d.ts.map +1 -0
package/dist/mcp-server/tool-schemas.js +248 -0
package/dist/mcp-server/tool-schemas.js.map +1 -0
package/dist/mcp-server/tool-validation.d.ts +17 -0
package/dist/mcp-server/tool-validation.d.ts.map +1 -0
package/dist/mcp-server/tool-validation.js +41 -0
package/dist/mcp-server/tool-validation.js.map +1 -0
package/dist/src/action-key.d.ts.map +1 -1
package/dist/src/action-key.js +6 -2
package/dist/src/action-key.js.map +1 -1
package/dist/src/adaptive.d.ts.map +1 -1
package/dist/src/adaptive.js +4 -2
package/dist/src/adaptive.js.map +1 -1
package/dist/src/affect.d.ts.map +1 -1
package/dist/src/affect.js +8 -5
package/dist/src/affect.js.map +1 -1
package/dist/src/audrey.d.ts +1 -1
package/dist/src/audrey.d.ts.map +1 -1
package/dist/src/audrey.js +93 -49
package/dist/src/audrey.js.map +1 -1
package/dist/src/capsule.d.ts.map +1 -1
package/dist/src/capsule.js +37 -15
package/dist/src/capsule.js.map +1 -1
package/dist/src/causal.d.ts +1 -1
package/dist/src/causal.d.ts.map +1 -1
package/dist/src/causal.js +4 -2
package/dist/src/causal.js.map +1 -1
package/dist/src/confidence.d.ts.map +1 -1
package/dist/src/confidence.js +5 -5
package/dist/src/confidence.js.map +1 -1
package/dist/src/consolidate.d.ts.map +1 -1
package/dist/src/consolidate.js +17 -9
package/dist/src/consolidate.js.map +1 -1
package/dist/src/context.js +1 -1
package/dist/src/context.js.map +1 -1
package/dist/src/controller.d.ts.map +1 -1
package/dist/src/controller.js +24 -13
package/dist/src/controller.js.map +1 -1
package/dist/src/db.d.ts.map +1 -1
package/dist/src/db.js +78 -27
package/dist/src/db.js.map +1 -1
package/dist/src/decay.d.ts +1 -1
package/dist/src/decay.d.ts.map +1 -1
package/dist/src/decay.js +1 -1
package/dist/src/decay.js.map +1 -1
package/dist/src/embedding.d.ts +12 -4
package/dist/src/embedding.d.ts.map +1 -1
package/dist/src/embedding.js +18 -16
package/dist/src/embedding.js.map +1 -1
package/dist/src/encode.d.ts.map +1 -1
package/dist/src/encode.js +5 -4
package/dist/src/encode.js.map +1 -1
package/dist/src/events.d.ts +3 -2
package/dist/src/events.d.ts.map +1 -1
package/dist/src/events.js +7 -3
package/dist/src/events.js.map +1 -1
package/dist/src/export.d.ts.map +1 -1
package/dist/src/export.js +21 -7
package/dist/src/export.js.map +1 -1
package/dist/src/feedback.d.ts.map +1 -1
package/dist/src/feedback.js +1 -1
package/dist/src/feedback.js.map +1 -1
package/dist/src/forget.d.ts.map +1 -1
package/dist/src/forget.js +12 -6
package/dist/src/forget.js.map +1 -1
package/dist/src/fts.d.ts.map +1 -1
package/dist/src/fts.js +20 -8
package/dist/src/fts.js.map +1 -1
package/dist/src/hybrid-recall.d.ts.map +1 -1
package/dist/src/hybrid-recall.js +12 -6
package/dist/src/hybrid-recall.js.map +1 -1
package/dist/src/impact.d.ts.map +1 -1
package/dist/src/impact.js +26 -10
package/dist/src/impact.js.map +1 -1
package/dist/src/import.d.ts.map +1 -1
package/dist/src/import.js +11 -6
package/dist/src/import.js.map +1 -1
package/dist/src/index.d.ts +3 -3
package/dist/src/index.d.ts.map +1 -1
package/dist/src/index.js +3 -3
package/dist/src/index.js.map +1 -1
package/dist/src/interference.d.ts.map +1 -1
package/dist/src/interference.js +10 -5
package/dist/src/interference.js.map +1 -1
package/dist/src/introspect.d.ts.map +1 -1
package/dist/src/introspect.js +12 -6
package/dist/src/introspect.js.map +1 -1
package/dist/src/llm.d.ts +2 -2
package/dist/src/llm.d.ts.map +1 -1
package/dist/src/llm.js +6 -6
package/dist/src/llm.js.map +1 -1
package/dist/src/migrate.d.ts.map +1 -1
package/dist/src/migrate.js +10 -4
package/dist/src/migrate.js.map +1 -1
package/dist/src/preflight.d.ts.map +1 -1
package/dist/src/preflight.js +6 -8
package/dist/src/preflight.js.map +1 -1
package/dist/src/profile.d.ts.map +1 -1
package/dist/src/profile.js.map +1 -1
package/dist/src/promote.d.ts.map +1 -1
package/dist/src/promote.js +16 -7
package/dist/src/promote.js.map +1 -1
package/dist/src/prompts.d.ts.map +1 -1
package/dist/src/prompts.js +1 -2
package/dist/src/prompts.js.map +1 -1
package/dist/src/recall.d.ts.map +1 -1
package/dist/src/recall.js +85 -18
package/dist/src/recall.js.map +1 -1
package/dist/src/redact.d.ts.map +1 -1
package/dist/src/redact.js +9 -4
package/dist/src/redact.js.map +1 -1
package/dist/src/reflexes.d.ts.map +1 -1
package/dist/src/reflexes.js +1 -7
package/dist/src/reflexes.js.map +1 -1
package/dist/src/rollback.d.ts.map +1 -1
package/dist/src/rollback.js +4 -2
package/dist/src/rollback.js.map +1 -1
package/dist/src/routes.d.ts.map +1 -1
package/dist/src/routes.js +33 -13
package/dist/src/routes.js.map +1 -1
package/dist/src/rules-compiler.d.ts.map +1 -1
package/dist/src/rules-compiler.js +24 -2
package/dist/src/rules-compiler.js.map +1 -1
package/dist/src/server.js +2 -2
package/dist/src/server.js.map +1 -1
package/dist/src/tool-trace.d.ts +2 -2
package/dist/src/tool-trace.d.ts.map +1 -1
package/dist/src/tool-trace.js +12 -4
package/dist/src/tool-trace.js.map +1 -1
package/dist/src/types.d.ts.map +1 -1
package/dist/src/ulid.js +1 -1
package/dist/src/ulid.js.map +1 -1
package/dist/src/utils.d.ts.map +1 -1
package/dist/src/utils.js.map +1 -1
package/dist/src/validate.d.ts.map +1 -1
package/dist/src/validate.js +20 -10
package/dist/src/validate.js.map +1 -1
package/docs/paper/07-evaluation.md +5 -5
package/docs/paper/audrey-paper-v1.md +5 -5
package/docs/paper/evidence-ledger.md +1 -1
package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
package/docs/paper/output/arxiv/main.tex +5 -5
package/docs/paper/output/arxiv-compile-report.json +3 -3
package/docs/paper/output/submission-bundle/README.md +13 -3
package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +12 -12
package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +106 -106
package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +168 -168
package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +64 -64
package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +5 -5
package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +5 -5
package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +5 -5
package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
package/docs/paper/output/submission-bundle/package.json +17 -4
package/docs/paper/output/submission-bundle/paper-submission-manifest.json +34 -34
package/examples/fintech-ops-demo.js +12 -5
package/examples/healthcare-ops-demo.js +8 -4
package/examples/ollama-memory-agent.js +41 -13
package/examples/stripe-demo.js +12 -5
package/package.json +17 -4
package/scripts/audit-release-completion.mjs +179 -101
package/scripts/create-arxiv-source.mjs +20 -14
package/scripts/create-paper-submission-bundle.mjs +6 -2
package/scripts/finalize-release.mjs +111 -36
package/scripts/prepare-release-cut.mjs +14 -6
package/scripts/publish-release-bundle.mjs +62 -23
package/scripts/publish-release-github-api.mjs +89 -24
package/scripts/smoke-cli.js +9 -9
package/scripts/sync-paper-artifacts.mjs +5 -1
package/scripts/verify-arxiv-compile.mjs +52 -16
package/scripts/verify-arxiv-source.mjs +45 -15
package/scripts/verify-browser-launch-plan.mjs +28 -11
package/scripts/verify-browser-launch-results.mjs +32 -14
package/scripts/verify-paper-artifacts.mjs +539 -79
package/scripts/verify-paper-claims.mjs +48 -20
package/scripts/verify-paper-submission-bundle.mjs +22 -11
package/scripts/verify-publication-pack.mjs +23 -9
package/scripts/verify-release-readiness.mjs +211 -76

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,62 @@
 # Changelog
+## 1.0.3 - 2026-05-28
+Housekeeping release. Nothing about how Audrey behaves has changed — this is
+all under-the-hood tidying plus a friendlier README. Safe to upgrade from 1.0.2
+without touching anything.
+### Cleaner code under the hood
+- Started breaking up the big `mcp-server/index.ts` file (it had grown to ~3,600
+  lines that did everything at once). The memory-tool input schemas and the
+  shared validation helpers now live in their own small files
+  (`tool-schemas.ts`, `tool-validation.ts`). Same behavior, just easier to read
+  and work on. More of this tidying will follow.
+### More reliable tests
+- The test suite used to need a slow, multi-step "build all the benchmark and
+  paper files first" step before it could run. It now sets those up
+  automatically, so `npm test` (or a plain `vitest run`) just works from a fresh
+  checkout. 785 tests pass with nothing extra to remember.
+### Friendlier docs
+- The README now opens with a short "In Plain English" section that explains
+  what Audrey is for in everyday language, before diving into the technical
+  detail.
+## 1.0.2 - 2026-05-28
+Maintenance and engineering-quality release. No runtime behavior change — the
+full test suite is unchanged from 1.0.1.
+### Security
+- Pin transitive `qs` to `^6.15.2` via `overrides` to resolve
+  [GHSA-q8mj-m7cp-5q26](https://github.com/advisories/GHSA-q8mj-m7cp-5q26)
+  (moderate denial-of-service in `qs.stringify`), which reaches `audrey` through
+  `@modelcontextprotocol/sdk → express@5`. The advisory was published after the
+  1.0.1 cut; production `npm audit --omit=dev --audit-level=moderate` is clean
+  again.
+### Tooling and code quality
+- Add flat-config ESLint with type-checked `typescript-eslint` rules over `src/`
+  and `mcp-server/`, plus Prettier and `.editorconfig` matched to the existing
+  house style. New scripts: `lint`, `lint:fix`, `format`, `format:check`.
+- Wire `lint` and `format:check` into CI (Ubuntu matrix + Windows) and the
+  `release:gate`, `release:gate:sandbox`, and `release:gate:paper` gates so the
+  enforced baseline cannot regress.
+- Resolve every lint finding at the source rather than by suppression: the REST
+  handlers now decode request bodies through a typed `RouteBody` contract
+  instead of Hono's default `any`; the three MCP `server` parameters and the
+  local embedding pipeline are typed structurally; rethrows attach an error
+  `cause`; and dead imports/bindings were removed across the tree.
+- One-time Prettier normalization across the codebase, recorded in
+  `.git-blame-ignore-revs` so `git blame` stays meaningful.
 ## 1.0.1 - 2026-05-15
 ### Honest benchmarking

package/README.md CHANGED Viewed

@@ -15,6 +15,14 @@
   </p>
 </div>
+## In Plain English
+AI coding assistants are brilliant but forgetful. They'll happily rerun the same broken command they ran yesterday, forget the rules your team agreed on last week, and treat every new session like it's day one.
+Audrey is the memory they're missing. It quietly keeps track of what worked, what failed, and what you told it — then checks that memory **before** the agent does something, so it can say "hold on, this exact command failed last time, and here's what fixed it" instead of repeating the mistake. Everything lives in one local file on your machine: no cloud, no account, and nothing about your code ever leaves your computer.
+That's the whole idea. The rest of this README is the detail.
 ## Why Audrey Exists
 Agents forget the exact mistakes they made yesterday. They repeat broken commands, lose project-specific rules, miss contradictions, and treat every new session like a cold start.
@@ -52,7 +60,7 @@ npx audrey guard --tool Bash "npm run deploy"
 Expected first-run shape:
 ```text
-Audrey Doctor v1.0.0
+Audrey Doctor v1.0.2
 Store health: not initialized
 Verdict: ready
 ```
@@ -296,7 +304,7 @@ output shapes are validated by JSON schemas under `benchmarks/schemas/`.
 Latest local result in this checkout: 10/10 scenarios passed, 100% prevention
 rate, 0% false-block rate, 0 raw secret leaks, 0 published artifact leaks in
-the raw-secret sweep, and 2.465ms / 30.791ms
+the raw-secret sweep, and 3.09ms / 28.181ms
 p50/p95 guard latency under the mock-provider methodology.
 **Methodology caveats, on purpose.** All numbers above are produced against
@@ -535,10 +543,12 @@ Developer setup runs from source, not from the published tarball, so `npm run bu
 ```bash
 npm ci
 npm run build
+npm run lint     # ESLint (type-checked typescript-eslint); CI requires it clean
+npm run format   # Prettier; use `npm run format:check` to verify without writing
 npm test
 ```
-Once built, the `Quick Start` commands work against the local `dist/` output. The full release gate runs everything CI runs:
+Once built, the `Quick Start` commands work against the local `dist/` output. Code style and types are enforced: `npm run lint` and `npm run format:check` run in CI (Ubuntu + Windows) and in every release gate, so the baseline cannot regress. The full release gate runs everything CI runs:
 ```bash
 npm run release:gate

package/benchmarks/adapter-self-test.mjs CHANGED Viewed

@@ -116,7 +116,9 @@ export async function runGuardBenchAdapterSelfTest(options = {}) {
   };
   const schemaErrors = validateAdapterSelfTestReport(selfTest);
   if (schemaErrors.length > 0) {
-    throw new Error(`GuardBench adapter self-test schema validation failed: ${schemaErrors.join('; ')}`);
+    throw new Error(
+      `GuardBench adapter self-test schema validation failed: ${schemaErrors.join('; ')}`,
+    );
   }
   if (options.out && options.write !== false) {
@@ -146,7 +148,9 @@ async function main() {
     console.log(JSON.stringify(result, null, 2));
   } else if (result.ok) {
     console.log(`GuardBench adapter self-test passed: ${result.adapter.name}`);
-    console.log(`Contract rows: ${result.conformance.scenarios}/${result.conformance.expectedScenarios}`);
+    console.log(
+      `Contract rows: ${result.conformance.scenarios}/${result.conformance.expectedScenarios}`,
+    );
     console.log(`Full-contract score: ${(result.score.fullContractPassRate * 100).toFixed(1)}%`);
     console.log(`Decision accuracy: ${(result.score.decisionAccuracy * 100).toFixed(1)}%`);
     if (result.outPath) console.log(`Self-test report: ${result.outPath}`);

package/benchmarks/adapters/example-allow.mjs CHANGED Viewed

@@ -2,7 +2,8 @@ import { defineGuardBenchAdapter } from '../adapter-kit.mjs';
 export default defineGuardBenchAdapter({
   name: 'Example Allow Adapter',
-  description: 'Credential-free GuardBench adapter example. It always allows and is useful for adapter-loading smoke tests.',
+  description:
+    'Credential-free GuardBench adapter example. It always allows and is useful for adapter-loading smoke tests.',
   async setup({ scenario }) {
     return {
       memoryCount: (scenario.seed.seededMemories ?? []).length,
@@ -19,7 +20,9 @@ export default defineGuardBenchAdapter({
       summary: [
         `Example adapter loaded ${state.memoryCount} seeded memories`,
         `${state.toolEventCount} seeded tool events`,
-        scenario.seed.seededNoise ? `${scenario.seed.seededNoise.count} noise memories` : 'no noise block',
+        scenario.seed.seededNoise
+          ? `${scenario.seed.seededNoise.count} noise memories`
+          : 'no noise block',
         state.hasFaultInjection ? 'fault injection present but unsupported' : 'no fault injection',
       ].join('; '),
     };

package/benchmarks/adapters/mem0-platform.mjs CHANGED Viewed

@@ -51,9 +51,7 @@ function memoryText(memory) {
 }
 function evidenceIds(memories) {
-  return memories
-    .map(memory => memory?.id ?? memory?.memory_id)
-    .filter(Boolean);
+  return memories.map(memory => memory?.id ?? memory?.memory_id).filter(Boolean);
 }
 function decisionFromMemories(memories, action, unsupportedFault = null) {
@@ -62,7 +60,9 @@ function decisionFromMemories(memories, action, unsupportedFault = null) {
       decision: 'warn',
       riskScore: 0.55,
       evidenceIds: evidenceIds(memories),
-      recommendedActions: ['External adapter cannot inject storage faults into Mem0 Platform; verify memory health separately.'],
+      recommendedActions: [
+        'External adapter cannot inject storage faults into Mem0 Platform; verify memory health separately.',
+      ],
       summary: `Mem0 adapter cannot emulate fault injection: ${unsupportedFault}.`,
     };
   }
@@ -129,7 +129,9 @@ class Mem0PlatformClient {
     if (!response.ok && response.status !== 204) {
       const body = await response.text();
-      throw new Error(`Mem0 ${options.method ?? 'GET'} ${path} failed ${response.status}: ${body.slice(0, 500)}`);
+      throw new Error(
+        `Mem0 ${options.method ?? 'GET'} ${path} failed ${response.status}: ${body.slice(0, 500)}`,
+      );
     }
     if (response.status === 204) return null;
@@ -175,7 +177,7 @@ class Mem0PlatformClient {
         filters: { user_id: userId },
       }),
     });
-    return Array.isArray(response) ? response : response?.results ?? [];
+    return Array.isArray(response) ? response : (response?.results ?? []);
   }
   async deleteUser(userId) {
@@ -191,9 +193,10 @@ function memoryMessagesFromScenario(scenario) {
     messages.push({ role: 'user', content: memory.content });
   }
   for (const event of scenario.seed.seededToolEvents ?? []) {
-    const seededSecret = event.errorSummaryPattern && scenario.privateSeed?.seededSecrets?.[0]
-      ? `${'x'.repeat(1990)} ${scenario.privateSeed.seededSecrets[0]}`
-      : '';
+    const seededSecret =
+      event.errorSummaryPattern && scenario.privateSeed?.seededSecrets?.[0]
+        ? `${'x'.repeat(1990)} ${scenario.privateSeed.seededSecrets[0]}`
+        : '';
     messages.push({
       role: 'user',
       content: [
@@ -204,7 +207,9 @@ function memoryMessagesFromScenario(scenario) {
         event.errorSummaryPattern ? `Error pattern: ${event.errorSummaryPattern}` : '',
         seededSecret ? `Error: ${seededSecret}` : '',
         event.output ? `Output: ${event.output}` : '',
-      ].filter(Boolean).join('\n'),
+      ]
+        .filter(Boolean)
+        .join('\n'),
     });
   }
   if (scenario.seed.seededNoise?.count) {
@@ -234,14 +239,16 @@ async function addInBatches(client, { userId, scenario, messages }) {
 function userIdForScenario(scenario) {
   const prefix = process.env.MEM0_GUARDBENCH_USER_PREFIX ?? 'audrey-guardbench';
-  const runId = process.env.MEM0_GUARDBENCH_RUN_ID ?? `${Date.now()}-${randomBytes(8).toString('hex')}`;
+  const runId =
+    process.env.MEM0_GUARDBENCH_RUN_ID ?? `${Date.now()}-${randomBytes(8).toString('hex')}`;
   return `${prefix}-${runId}-${scenario.id}`.toLowerCase();
 }
 export function createGuardBenchAdapter(options = {}) {
   return {
     name: 'Mem0 Platform',
-    description: 'Mem0 Platform REST adapter using V3 add, V2 search, event polling, and entity cleanup.',
+    description:
+      'Mem0 Platform REST adapter using V3 add, V2 search, event polling, and entity cleanup.',
     async setup({ scenario }) {
       const client = new Mem0PlatformClient(options);
       const userId = userIdForScenario(scenario);

package/benchmarks/adapters/zep-cloud.mjs CHANGED Viewed

@@ -46,13 +46,9 @@ function tokenOverlap(a, b) {
 }
 function resultText(result) {
-  return [
-    result?.fact,
-    result?.content,
-    result?.summary,
-    result?.name,
-    result?.context,
-  ].filter(Boolean).join('\n');
+  return [result?.fact, result?.content, result?.summary, result?.name, result?.context]
+    .filter(Boolean)
+    .join('\n');
 }
 function collectSearchResults(response) {
@@ -65,8 +61,14 @@ function collectSearchResults(response) {
 }
 function evidenceIds(results) {
-  return results.map((result, index) =>
-    result?.uuid ?? result?.id ?? result?.task_id ?? result?.thread_id ?? `zep-result-${index + 1}`);
+  return results.map(
+    (result, index) =>
+      result?.uuid ??
+      result?.id ??
+      result?.task_id ??
+      result?.thread_id ??
+      `zep-result-${index + 1}`,
+  );
 }
 function decisionFromSearchResults(results, action, unsupportedFault = null) {
@@ -75,7 +77,9 @@ function decisionFromSearchResults(results, action, unsupportedFault = null) {
       decision: 'warn',
       riskScore: 0.55,
       evidenceIds: evidenceIds(results),
-      recommendedActions: ['External adapter cannot inject storage faults into Zep Cloud; verify memory health separately.'],
+      recommendedActions: [
+        'External adapter cannot inject storage faults into Zep Cloud; verify memory health separately.',
+      ],
       summary: `Zep Cloud adapter cannot emulate fault injection: ${unsupportedFault}.`,
     };
   }
@@ -132,7 +136,10 @@ class ZepCloudClient {
     return this.authScheme ? `${this.authScheme} ${this.apiKey}` : this.apiKey;
   }
-  async request(path, { method = 'GET', body, okStatuses = [200, 201, 204], ignoreNotFound = false } = {}) {
+  async request(
+    path,
+    { method = 'GET', body, okStatuses = [200, 201, 204], ignoreNotFound = false } = {},
+  ) {
     const response = await this.fetch(`${this.baseUrl}${path}`, {
       method,
       headers: {
@@ -209,22 +216,33 @@ function memoryMessagesFromScenario(scenario) {
     messages.push(message(memory.content));
   }
   for (const event of scenario.seed.seededToolEvents ?? []) {
-    const seededSecret = event.errorSummaryPattern && scenario.privateSeed?.seededSecrets?.[0]
-      ? `${'x'.repeat(1990)} ${scenario.privateSeed.seededSecrets[0]}`
-      : '';
-    messages.push(message([
-      `Tool event: ${event.tool ?? 'tool'}`,
-      event.action ? `Action: ${event.action}` : '',
-      event.outcome ? `Outcome: ${event.outcome}` : '',
-      event.errorSummary ? `Error: ${event.errorSummary}` : '',
-      event.errorSummaryPattern ? `Error pattern: ${event.errorSummaryPattern}` : '',
-      seededSecret ? `Error: ${seededSecret}` : '',
-      event.output ? `Output: ${event.output}` : '',
-    ].filter(Boolean).join('\n')));
+    const seededSecret =
+      event.errorSummaryPattern && scenario.privateSeed?.seededSecrets?.[0]
+        ? `${'x'.repeat(1990)} ${scenario.privateSeed.seededSecrets[0]}`
+        : '';
+    messages.push(
+      message(
+        [
+          `Tool event: ${event.tool ?? 'tool'}`,
+          event.action ? `Action: ${event.action}` : '',
+          event.outcome ? `Outcome: ${event.outcome}` : '',
+          event.errorSummary ? `Error: ${event.errorSummary}` : '',
+          event.errorSummaryPattern ? `Error pattern: ${event.errorSummaryPattern}` : '',
+          seededSecret ? `Error: ${seededSecret}` : '',
+          event.output ? `Output: ${event.output}` : '',
+        ]
+          .filter(Boolean)
+          .join('\n'),
+      ),
+    );
   }
   if (scenario.seed.seededNoise?.count) {
     for (let i = 0; i < scenario.seed.seededNoise.count; i++) {
-      messages.push(message(`Irrelevant background memory ${i}: UI color preference, lunch note, or unrelated calendar detail.`));
+      messages.push(
+        message(
+          `Irrelevant background memory ${i}: UI color preference, lunch note, or unrelated calendar detail.`,
+        ),
+      );
     }
   }
   return messages;
@@ -241,14 +259,16 @@ async function addInBatches(client, { sessionId, messages }) {
 function idForScenario(kind, scenario) {
   const prefix = process.env.ZEP_GUARDBENCH_USER_PREFIX ?? 'audrey-guardbench';
-  const runId = process.env.ZEP_GUARDBENCH_RUN_ID ?? `${Date.now()}-${randomBytes(8).toString('hex')}`;
+  const runId =
+    process.env.ZEP_GUARDBENCH_RUN_ID ?? `${Date.now()}-${randomBytes(8).toString('hex')}`;
   return `${prefix}-${runId}-${kind}-${scenario.id}`.toLowerCase();
 }
 export function createGuardBenchAdapter(options = {}) {
   return {
     name: 'Zep Cloud',
-    description: 'Zep Cloud REST adapter using v2 users, sessions, memory.add, graph.search, and user cleanup.',
+    description:
+      'Zep Cloud REST adapter using v2 users, sessions, memory.add, graph.search, and user cleanup.',
     async setup({ scenario }) {
       const client = new ZepCloudClient(options);
       const userId = idForScenario('user', scenario);
@@ -257,7 +277,11 @@ export function createGuardBenchAdapter(options = {}) {
       await client.createUser(userId);
       await client.createSession({ sessionId, userId });
       await addInBatches(client, { sessionId, messages });
-      const ingestDelayMs = Number(options.ingestDelayMs ?? process.env.ZEP_GUARDBENCH_INGEST_DELAY_MS ?? DEFAULT_INGEST_DELAY_MS);
+      const ingestDelayMs = Number(
+        options.ingestDelayMs ??
+          process.env.ZEP_GUARDBENCH_INGEST_DELAY_MS ??
+          DEFAULT_INGEST_DELAY_MS,
+      );
       if (ingestDelayMs > 0) await sleep(ingestDelayMs);
       return { client, userId, sessionId };
     },

package/benchmarks/baselines.js CHANGED Viewed

@@ -26,7 +26,10 @@ function keywordScore(queryTokens, content) {
 function sortByScore(rows) {
   return rows
     .filter(row => Number.isFinite(row.score))
-    .sort((a, b) => b.score - a.score || String(b.createdAt || '').localeCompare(String(a.createdAt || '')));
+    .sort(
+      (a, b) =>
+        b.score - a.score || String(b.createdAt || '').localeCompare(String(a.createdAt || '')),
+    );
 }
 function flattenMemories(benchmarkCase, ids = []) {
@@ -127,11 +130,13 @@ export async function runBaselineScenario(system, benchmarkCase, providerConfig,
 export function runKeywordRecencyBaseline(benchmarkCase, limit = 5) {
   const queryTokens = tokenize(benchmarkCase.query);
-  return sortByScore(flattenMemories(benchmarkCase).map(memory => ({
-    ...memory,
-    type: 'episodic',
-    score: keywordScore(queryTokens, memory.content),
-  }))).slice(0, limit);
+  return sortByScore(
+    flattenMemories(benchmarkCase).map(memory => ({
+      ...memory,
+      type: 'episodic',
+      score: keywordScore(queryTokens, memory.content),
+    })),
+  ).slice(0, limit);
 }
 export function runRecentWindowBaseline(benchmarkCase, limit = 3) {

package/benchmarks/build-leaderboard.mjs CHANGED Viewed

@@ -34,14 +34,16 @@ function rowFromBundle(dir) {
 function compareRows(a, b) {
   return (
-    Number(b.verification.ok) - Number(a.verification.ok)
-    || Number(b.conformance.ok) - Number(a.conformance.ok)
-    || (b.score.fullContractPassRate ?? -1) - (a.score.fullContractPassRate ?? -1)
-    || (b.score.decisionAccuracy ?? -1) - (a.score.decisionAccuracy ?? -1)
-    || (b.score.evidenceRecall ?? -1) - (a.score.evidenceRecall ?? -1)
-    || (a.score.redactionLeaks ?? Number.MAX_SAFE_INTEGER) - (b.score.redactionLeaks ?? Number.MAX_SAFE_INTEGER)
-    || (a.score.latency?.p95Ms ?? Number.MAX_SAFE_INTEGER) - (b.score.latency?.p95Ms ?? Number.MAX_SAFE_INTEGER)
-    || a.subject.name.localeCompare(b.subject.name)
+    Number(b.verification.ok) - Number(a.verification.ok) ||
+    Number(b.conformance.ok) - Number(a.conformance.ok) ||
+    (b.score.fullContractPassRate ?? -1) - (a.score.fullContractPassRate ?? -1) ||
+    (b.score.decisionAccuracy ?? -1) - (a.score.decisionAccuracy ?? -1) ||
+    (b.score.evidenceRecall ?? -1) - (a.score.evidenceRecall ?? -1) ||
+    (a.score.redactionLeaks ?? Number.MAX_SAFE_INTEGER) -
+      (b.score.redactionLeaks ?? Number.MAX_SAFE_INTEGER) ||
+    (a.score.latency?.p95Ms ?? Number.MAX_SAFE_INTEGER) -
+      (b.score.latency?.p95Ms ?? Number.MAX_SAFE_INTEGER) ||
+    a.subject.name.localeCompare(b.subject.name)
   );
 }
@@ -49,7 +51,9 @@ export function buildGuardBenchLeaderboard(options = {}) {
   const bundleDirs = options.bundleDirs?.length
     ? options.bundleDirs
     : ['benchmarks/output/submission-bundle'];
-  const rows = bundleDirs.map(rowFromBundle).sort(compareRows)
+  const rows = bundleDirs
+    .map(rowFromBundle)
+    .sort(compareRows)
     .map((row, index) => ({ rank: index + 1, ...row }));
   return {
     schemaVersion: '1.0.0',
@@ -66,12 +70,16 @@ export function buildGuardBenchLeaderboard(options = {}) {
       'subject.name',
     ],
     rows,
-    failures: rows.flatMap(row => row.verification.failures.map(failure => `${row.subject.name}: ${failure}`)),
+    failures: rows.flatMap(row =>
+      row.verification.failures.map(failure => `${row.subject.name}: ${failure}`),
+    ),
   };
 }
 export function writeGuardBenchLeaderboard(options = {}) {
-  const outJson = resolve(options.outJson ?? 'benchmarks/output/leaderboard/guardbench-leaderboard.json');
+  const outJson = resolve(
+    options.outJson ?? 'benchmarks/output/leaderboard/guardbench-leaderboard.json',
+  );
   const outMd = resolve(options.outMd ?? 'benchmarks/output/leaderboard/guardbench-leaderboard.md');
   const schemasDir = resolve(options.schemasDir ?? 'benchmarks/schemas');
   const leaderboard = buildGuardBenchLeaderboard(options);
@@ -97,18 +105,23 @@ export function renderMarkdown(leaderboard) {
     '|---:|---|---:|---:|---:|---:|---:|---:|---:|---|',
   ];
   for (const row of leaderboard.rows) {
-    lines.push([
-      row.rank,
-      row.subject.name,
-      row.verification.ok ? 'yes' : 'no',
-      row.conformance.ok ? 'yes' : 'no',
-      percent(row.score.fullContractPassRate),
-      percent(row.score.decisionAccuracy),
-      percent(row.score.evidenceRecall),
-      number(row.score.redactionLeaks),
-      row.score.latency?.p95Ms == null ? 'n/a' : `${row.score.latency.p95Ms}ms`,
-      row.source.dir,
-    ].join(' | ').replace(/^/, '| ').replace(/$/, ' |'));
+    lines.push(
+      [
+        row.rank,
+        row.subject.name,
+        row.verification.ok ? 'yes' : 'no',
+        row.conformance.ok ? 'yes' : 'no',
+        percent(row.score.fullContractPassRate),
+        percent(row.score.decisionAccuracy),
+        percent(row.score.evidenceRecall),
+        number(row.score.redactionLeaks),
+        row.score.latency?.p95Ms == null ? 'n/a' : `${row.score.latency.p95Ms}ms`,
+        row.source.dir,
+      ]
+        .join(' | ')
+        .replace(/^/, '| ')
+        .replace(/$/, ' |'),
+    );
   }
   if (leaderboard.failures.length) {
     lines.push('', '## Verification Failures', '');

package/benchmarks/cases.js CHANGED Viewed

@@ -60,7 +60,8 @@ export const RETRIEVAL_CASES = [
     expectAny: ['Northwind'],
     memory: [
       {
-        content: 'During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.',
+        content:
+          'During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.',
         source: 'tool-result',
         tags: ['project', 'pilot'],
         context: { subject: 'sam', domain: 'operations' },
@@ -72,7 +73,8 @@ export const RETRIEVAL_CASES = [
         context: { subject: 'sam', domain: 'operations' },
       },
       {
-        content: 'The pilot budget review approved Northwind for rollout after the support SLA review.',
+        content:
+          'The pilot budget review approved Northwind for rollout after the support SLA review.',
         source: 'direct-observation',
         tags: ['finance', 'vendor', 'approval'],
         context: { subject: 'sam', domain: 'operations' },
@@ -169,17 +171,20 @@ export const RETRIEVAL_CASES = [
     expectAny: ['cap retry batches', 'stagger retries'],
     memory: [
       {
-        content: 'Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute.',
+        content:
+          'Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute.',
         source: 'direct-observation',
         tags: ['payments', 'rate-limit'],
       },
       {
-        content: 'Payout incident volume dropped after retry batches were capped at 50 merchants per worker.',
+        content:
+          'Payout incident volume dropped after retry batches were capped at 50 merchants per worker.',
         source: 'tool-result',
         tags: ['payments', 'rate-limit'],
       },
       {
-        content: 'Risk operations requested an escalation when multiple merchants were affected in the same hour.',
+        content:
+          'Risk operations requested an escalation when multiple merchants were affected in the same hour.',
         source: 'told-by-user',
         tags: ['payments', 'escalation'],
       },
@@ -188,7 +193,8 @@ export const RETRIEVAL_CASES = [
       minClusterSize: 3,
       similarityThreshold: -0.3,
       principle: {
-        content: 'When payout retries start returning 429, cap retry batches and stagger retries before escalating.',
+        content:
+          'When payout retries start returning 429, cap retry batches and stagger retries before escalating.',
         type: 'procedural',
         conditions: ['processor returns 429', 'multiple merchants impacted'],
       },
@@ -343,7 +349,8 @@ export const OPERATION_CASES = [
     kind: 'operations',
     family: 'procedural_merge',
     title: 'Procedural merge',
-    description: 'Related episodes should merge into an executable procedure, not just a loose fact.',
+    description:
+      'Related episodes should merge into an executable procedure, not just a loose fact.',
     query: 'What should the agent do after two webhook signature failures?',
     expectAny: ['rotate the signing secret', 'replay queued events'],
     steps: [
@@ -376,7 +383,8 @@ export const OPERATION_CASES = [
         minClusterSize: 3,
         similarityThreshold: -0.3,
         principle: {
-          content: 'When webhook signature verification fails twice, rotate the signing secret and replay queued events.',
+          content:
+            'When webhook signature verification fails twice, rotate the signing secret and replay queued events.',
           type: 'procedural',
           conditions: ['signature verification fails twice', 'queued events pending'],
         },
@@ -395,7 +403,8 @@ export const GUARD_CASES = [
     kind: 'guard',
     family: 'closed_loop_failure_memory',
     title: 'Guard remembers failed tool outcome',
-    description: 'A failed guarded tool run should create a future caution and warning reflex for the same tool.',
+    description:
+      'A failed guarded tool run should create a future caution and warning reflex for the same tool.',
     action: 'run npm test before release',
     tool: 'npm test',
     expectAll: ['decision:caution', 'warning:recent_failure', 'reflex:warn'],
@@ -439,7 +448,8 @@ export const GUARD_CASES = [
     kind: 'guard',
     family: 'guard_receipt_hardening',
     title: 'Guard rejects replayed receipt outcomes',
-    description: 'A receipt should only be closed once, while the failed outcome still becomes future caution memory.',
+    description:
+      'A receipt should only be closed once, while the failed outcome still becomes future caution memory.',
     action: 'run npm test before release',
     tool: 'npm test',
     expectAll: ['guard_hardened:replay_rejected', 'decision:caution', 'warning:recent_failure'],
@@ -470,7 +480,8 @@ export const GUARD_CASES = [
     kind: 'guard',
     family: 'guard_receipt_hardening',
     title: 'Guard rejects non-guard receipts',
-    description: 'A normal tool trace must not be accepted as a guard receipt for after-action feedback.',
+    description:
+      'A normal tool trace must not be accepted as a guard receipt for after-action feedback.',
     action: 'format docs',
     tool: 'Bash',
     expectAll: ['guard_hardened:non_guard_receipt_rejected'],
@@ -511,7 +522,8 @@ export const LOCAL_BENCHMARK_SUITES = [
   {
     id: 'guard',
     title: 'Agent guard loop',
-    description: 'Closed-loop memory-before-action behavior for receipts, warnings, and blocking reflexes.',
+    description:
+      'Closed-loop memory-before-action behavior for receipts, warnings, and blocking reflexes.',
     comparableToBaselines: false,
     cases: GUARD_CASES,
   },

package/benchmarks/create-conformance-card.mjs CHANGED Viewed

@@ -1,7 +1,10 @@
 import { existsSync, readFileSync, writeFileSync } from 'node:fs';
 import { createHash } from 'node:crypto';
 import { join, resolve } from 'node:path';
-import { computeGuardBenchArtifactHashes, validateGuardBenchArtifacts } from './validate-guardbench-artifacts.mjs';
+import {
+  computeGuardBenchArtifactHashes,
+  validateGuardBenchArtifacts,
+} from './validate-guardbench-artifacts.mjs';
 import { publicArtifactValue, publicPath } from './public-paths.mjs';
 const CARD_FILE = 'guardbench-conformance-card.json';
@@ -18,7 +21,9 @@ function sha256File(path) {
 function findExternalSubject(summary, requestedAdapter) {
   const externalSubjects = (summary.manifest?.subjects ?? []).filter(subject => subject.external);
   if (requestedAdapter) {
-    const requested = externalSubjects.find(subject => subject.name === requestedAdapter || subject.id === requestedAdapter);
+    const requested = externalSubjects.find(
+      subject => subject.name === requestedAdapter || subject.id === requestedAdapter,
+    );
     if (requested) return requested;
   }
   return externalSubjects.length === 1 ? externalSubjects[0] : null;
@@ -57,7 +62,11 @@ export function buildGuardBenchConformanceCard(options = {}) {
     manifestVersion: summary.manifest?.manifestVersion ?? null,
     suiteId: summary.manifest?.suiteId ?? null,
     subject: {
-      name: systemSummary?.system ?? metadata?.adapterConformance?.adapter ?? metadata?.adapter ?? 'unknown',
+      name:
+        systemSummary?.system ??
+        metadata?.adapterConformance?.adapter ??
+        metadata?.adapter ??
+        'unknown',
       requestedAdapter: metadata?.adapterConformance?.requestedAdapter ?? metadata?.adapter ?? null,
       external: Boolean(externalSubject?.external ?? metadata),
     },