npm - @joshuaswarren/openclaw-engram - Versions diffs - 9.0.17 → 9.0.18 - Mend

@joshuaswarren/openclaw-engram 9.0.17 → 9.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED

@@ -30,7 +30,7 @@ AI agents forget everything between conversations. Engram fixes that.
 - **Local-first** — All memory data stays on your filesystem as plain markdown files. No cloud dependency, no vendor lock-in, fully portable.
 - **Pluggable search** — Choose from six search backends: QMD (hybrid BM25+vector+reranking), LanceDB, Meilisearch, Orama, remote HTTP, or bring your own.
 - **Memory OS features** — Graph recall, temporal memory tree, lifecycle policy, compounding, shared context, memory boxes, and identity continuity can be enabled progressively as your install grows.
-- **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording, so memory improvements can be measured on real agent trajectories instead of subjective recall demos.
+- **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording and a CI benchmark delta gate, so memory improvements can be measured and regression-checked instead of argued from anecdotes.
 - **Zero-config start** — Install, add an API key, restart. Engram works out of the box with sensible defaults and progressively unlocks advanced features as you enable them.
 ## Quick Start
@@ -139,7 +139,7 @@ Engram's capabilities are organized into feature families that you can enable pr
 | **Compounding** | Weekly synthesis that surfaces patterns and recurring mistakes |
 | **Hot/Cold Tiering** | Automatic migration of aging memories to cold storage |
 | **Behavior Loop Tuning** | Runtime self-tuning of extraction and recall parameters |
-| **Evaluation Harness** | Tracks benchmark packs, run summaries, and live shadow recall records so future PRs can be gated on memory quality instead of anecdotes |
+| **Evaluation Harness** | Tracks benchmark packs, run summaries, live shadow recall records, and CI delta comparisons so future PRs can be gated on memory quality instead of anecdotes |
 Start with defaults, then enable features as needed. See [Enable All Features](docs/enable-all-v8.md) for a full-feature config profile.
@@ -152,6 +152,7 @@ openclaw engram compat --strict              # Compatibility check
 openclaw engram benchmark-status             # Benchmark/eval harness packs, runs, shadow recalls, latest summaries
 openclaw engram benchmark-validate <path>    # Validate a benchmark manifest or pack directory
 openclaw engram benchmark-import <path>      # Import a validated benchmark pack into the eval store
+openclaw engram benchmark-ci-gate            # Compare base vs candidate eval stores and fail on regressions
 openclaw engram conversation-index-health    # Conversation index status
 openclaw engram graph-health                 # Entity graph status
 openclaw engram tier-status                  # Hot/cold tier metrics
@@ -183,7 +184,7 @@ Full reference: [Config Reference](docs/config-reference.md)
 - [Search Backends](docs/search-backends.md) — Choosing and configuring search engines
 - [Writing a Search Backend](docs/writing-a-search-backend.md) — Build your own adapter
 - [Config Reference](docs/config-reference.md) — Every setting with defaults
-- [Evaluation Harness](docs/evaluation-harness.md) — Benchmark pack and run-summary format
+- [Evaluation Harness](docs/evaluation-harness.md) — Benchmark pack, shadow recall, and CI delta gate format
 - [Architecture Overview](docs/architecture/overview.md) — System design and storage layout
 - [Retrieval Pipeline](docs/architecture/retrieval-pipeline.md) — How recall works
 - [Memory Lifecycle](docs/architecture/memory-lifecycle.md) — Write, consolidation, expiry

package/dist/index.js CHANGED

@@ -11717,6 +11717,151 @@ async function listNamedFiles(dir, fileName) {
 async function readJsonFile(filePath) {
   return JSON.parse(await readFile12(filePath, "utf-8"));
 }
+var LOWER_IS_BETTER_METRICS = /* @__PURE__ */ new Set(["trustViolationRate"]);
+function computePassRate(run) {
+  return run.totalCases > 0 ? run.passedCases / run.totalCases : 0;
+}
+function latestCompletedRunsByBenchmark(runs) {
+  const sorted = [...runs].filter((run) => run.status === "completed").sort((a, b) => {
+    const aTime = Date.parse(a.completedAt ?? a.startedAt);
+    const bTime = Date.parse(b.completedAt ?? b.startedAt);
+    return (Number.isNaN(bTime) ? 0 : bTime) - (Number.isNaN(aTime) ? 0 : aTime);
+  });
+  const out = /* @__PURE__ */ new Map();
+  for (const run of sorted) {
+    if (!out.has(run.benchmarkId)) {
+      out.set(run.benchmarkId, run);
+    }
+  }
+  return out;
+}
+function compareMetricDeltas(baseMetrics, candidateMetrics) {
+  const deltas = {};
+  const regressions = [];
+  const improvements = [];
+  if (!baseMetrics || !candidateMetrics) {
+    return { deltas, regressions, improvements };
+  }
+  for (const metric of Object.keys(baseMetrics)) {
+    const baseValue = baseMetrics[metric];
+    const candidateValue = candidateMetrics[metric];
+    if (typeof baseValue !== "number" || typeof candidateValue !== "number") continue;
+    const delta = candidateValue - baseValue;
+    deltas[metric] = delta;
+    if (delta === 0) continue;
+    const lowerIsBetter = LOWER_IS_BETTER_METRICS.has(metric);
+    const improved = lowerIsBetter ? delta < 0 : delta > 0;
+    const summary = `${metric} ${baseValue} -> ${candidateValue}`;
+    if (improved) {
+      improvements.push(summary);
+    } else {
+      regressions.push(summary);
+    }
+  }
+  return { deltas, regressions, improvements };
+}
+async function collectEvalStoreSnapshot(options) {
+  const rootDir = options.rootDir;
+  const benchmarkDir = path14.join(rootDir, "benchmarks");
+  const runsDir = path14.join(rootDir, "runs");
+  const shadowDir = path14.join(rootDir, "shadow");
+  const benchmarkFiles = await listNamedFiles(benchmarkDir, "manifest.json");
+  const runFiles = await listJsonFiles(runsDir);
+  const shadowFiles = await listJsonFiles(shadowDir);
+  const invalidBenchmarks = [];
+  const invalidRuns = [];
+  const invalidShadows = [];
+  const manifests = [];
+  for (const filePath of benchmarkFiles) {
+    try {
+      manifests.push(validateEvalBenchmarkManifest(await readJsonFile(filePath)));
+    } catch (error) {
+      invalidBenchmarks.push({
+        path: filePath,
+        error: error instanceof Error ? error.message : String(error)
+      });
+    }
+  }
+  const runs = [];
+  for (const filePath of runFiles) {
+    try {
+      runs.push(validateEvalRunSummary(await readJsonFile(filePath)));
+    } catch (error) {
+      invalidRuns.push({
+        path: filePath,
+        error: error instanceof Error ? error.message : String(error)
+      });
+    }
+  }
+  const shadows = [];
+  for (const filePath of shadowFiles) {
+    try {
+      shadows.push(validateEvalShadowRecallRecord(await readJsonFile(filePath)));
+    } catch (error) {
+      invalidShadows.push({
+        path: filePath,
+        error: error instanceof Error ? error.message : String(error)
+      });
+    }
+  }
+  runs.sort((a, b) => {
+    const aTime = Date.parse(a.completedAt ?? a.startedAt);
+    const bTime = Date.parse(b.completedAt ?? b.startedAt);
+    return (Number.isNaN(bTime) ? 0 : bTime) - (Number.isNaN(aTime) ? 0 : aTime);
+  });
+  shadows.sort((a, b) => b.recordedAt.localeCompare(a.recordedAt));
+  const tags = /* @__PURE__ */ new Set();
+  const sourceLinks = /* @__PURE__ */ new Set();
+  let totalCases = 0;
+  for (const manifest of manifests) {
+    totalCases += manifest.cases.length;
+    for (const tag of manifest.tags ?? []) tags.add(tag);
+    for (const link of manifest.sourceLinks ?? []) sourceLinks.add(link);
+  }
+  return {
+    status: {
+      enabled: options.enabled,
+      shadowModeEnabled: options.shadowModeEnabled,
+      rootDir,
+      benchmarkDir,
+      runsDir,
+      benchmarks: {
+        total: benchmarkFiles.length,
+        valid: manifests.length,
+        invalid: invalidBenchmarks.length,
+        totalCases,
+        tags: [...tags].sort(),
+        sourceLinks: [...sourceLinks].sort()
+      },
+      runs: {
+        total: runFiles.length,
+        invalid: invalidRuns.length,
+        completed: runs.filter((run) => run.status === "completed").length,
+        failed: runs.filter((run) => run.status === "failed").length,
+        partial: runs.filter((run) => run.status === "partial").length,
+        running: runs.filter((run) => run.status === "running").length,
+        latestRunId: runs[0]?.runId,
+        latestBenchmarkId: runs[0]?.benchmarkId,
+        latestCompletedAt: runs[0]?.completedAt
+      },
+      shadows: {
+        total: shadowFiles.length,
+        invalid: invalidShadows.length,
+        latestTraceId: shadows[0]?.traceId,
+        latestRecordedAt: shadows[0]?.recordedAt,
+        latestSessionKey: shadows[0]?.sessionKey
+      },
+      latestRun: runs[0],
+      latestShadow: shadows[0],
+      invalidBenchmarks,
+      invalidRuns,
+      invalidShadows
+    },
+    manifests,
+    runs,
+    shadows
+  };
+}
 async function resolveBenchmarkManifestPath(sourcePath) {
   const info = await stat3(sourcePath);
   if (info.isDirectory()) {
@@ -11793,102 +11938,125 @@ async function recordEvalShadowRecall(options) {
   return targetPath;
 }
 async function getEvalHarnessStatus(options) {
-  const rootDir = resolveEvalStoreDir(options.memoryDir, options.evalStoreDir);
-  const benchmarkDir = path14.join(rootDir, "benchmarks");
-  const runsDir = path14.join(rootDir, "runs");
-  const shadowDir = path14.join(rootDir, "shadow");
-  const benchmarkFiles = await listNamedFiles(benchmarkDir, "manifest.json");
-  const runFiles = await listJsonFiles(runsDir);
-  const shadowFiles = await listJsonFiles(shadowDir);
-  const invalidBenchmarks = [];
-  const invalidRuns = [];
-  const invalidShadows = [];
-  const manifests = [];
-  for (const filePath of benchmarkFiles) {
-    try {
-      manifests.push(validateEvalBenchmarkManifest(await readJsonFile(filePath)));
-    } catch (error) {
-      invalidBenchmarks.push({
-        path: filePath,
-        error: error instanceof Error ? error.message : String(error)
-      });
-    }
-  }
-  const runs = [];
-  for (const filePath of runFiles) {
-    try {
-      runs.push(validateEvalRunSummary(await readJsonFile(filePath)));
-    } catch (error) {
-      invalidRuns.push({
-        path: filePath,
-        error: error instanceof Error ? error.message : String(error)
-      });
-    }
+  return (await collectEvalStoreSnapshot({
+    rootDir: resolveEvalStoreDir(options.memoryDir, options.evalStoreDir),
+    enabled: options.enabled,
+    shadowModeEnabled: options.shadowModeEnabled
+  })).status;
+}
+function resolveRequiredEvalStoreRoot(options, label) {
+  if (typeof options.evalStoreDir === "string" && options.evalStoreDir.trim().length > 0) {
+    return options.evalStoreDir.trim();
   }
-  const shadows = [];
-  for (const filePath of shadowFiles) {
-    try {
-      shadows.push(validateEvalShadowRecallRecord(await readJsonFile(filePath)));
-    } catch (error) {
-      invalidShadows.push({
-        path: filePath,
-        error: error instanceof Error ? error.message : String(error)
-      });
-    }
+  if (typeof options.memoryDir === "string" && options.memoryDir.trim().length > 0) {
+    return resolveEvalStoreDir(options.memoryDir.trim());
   }
-  runs.sort((a, b) => {
-    const aTime = Date.parse(a.completedAt ?? a.startedAt);
-    const bTime = Date.parse(b.completedAt ?? b.startedAt);
-    return (Number.isNaN(bTime) ? 0 : bTime) - (Number.isNaN(aTime) ? 0 : aTime);
+  throw new Error(`${label} requires memoryDir or evalStoreDir`);
+}
+async function runEvalBenchmarkCiGate(options) {
+  const baseRootDir = resolveRequiredEvalStoreRoot(
+    { memoryDir: options.baseMemoryDir, evalStoreDir: options.baseEvalStoreDir },
+    "base"
+  );
+  const candidateRootDir = resolveRequiredEvalStoreRoot(
+    { memoryDir: options.candidateMemoryDir, evalStoreDir: options.candidateEvalStoreDir },
+    "candidate"
+  );
+  const baseSnapshot = await collectEvalStoreSnapshot({
+    rootDir: baseRootDir,
+    enabled: true,
+    shadowModeEnabled: true
   });
-  const latestRun = runs[0];
-  shadows.sort((a, b) => b.recordedAt.localeCompare(a.recordedAt));
-  const latestShadow = shadows[0];
-  const tags = /* @__PURE__ */ new Set();
-  const sourceLinks = /* @__PURE__ */ new Set();
-  let totalCases = 0;
-  for (const manifest of manifests) {
-    totalCases += manifest.cases.length;
-    for (const tag of manifest.tags ?? []) tags.add(tag);
-    for (const link of manifest.sourceLinks ?? []) sourceLinks.add(link);
+  const candidateSnapshot = await collectEvalStoreSnapshot({
+    rootDir: candidateRootDir,
+    enabled: true,
+    shadowModeEnabled: true
+  });
+  const regressions = [];
+  const improvements = [];
+  if (baseSnapshot.status.invalidBenchmarks.length > 0) {
+    regressions.push(`base store has ${baseSnapshot.status.invalidBenchmarks.length} invalid benchmark manifest(s)`);
+  }
+  if (baseSnapshot.status.invalidRuns.length > 0) {
+    regressions.push(`base store has ${baseSnapshot.status.invalidRuns.length} invalid run summary file(s)`);
+  }
+  if (baseSnapshot.status.invalidShadows.length > 0) {
+    regressions.push(`base store has ${baseSnapshot.status.invalidShadows.length} invalid shadow record(s)`);
+  }
+  if (candidateSnapshot.status.invalidBenchmarks.length > 0) {
+    regressions.push(`candidate store has ${candidateSnapshot.status.invalidBenchmarks.length} invalid benchmark manifest(s)`);
+  }
+  if (candidateSnapshot.status.invalidRuns.length > 0) {
+    regressions.push(`candidate store has ${candidateSnapshot.status.invalidRuns.length} invalid run summary file(s)`);
+  }
+  if (candidateSnapshot.status.invalidShadows.length > 0) {
+    regressions.push(`candidate store has ${candidateSnapshot.status.invalidShadows.length} invalid shadow record(s)`);
+  }
+  const baseRuns = latestCompletedRunsByBenchmark(baseSnapshot.runs);
+  const candidateRuns = latestCompletedRunsByBenchmark(candidateSnapshot.runs);
+  const missingCandidateBenchmarks = [...baseRuns.keys()].filter((benchmarkId) => !candidateRuns.has(benchmarkId)).sort();
+  for (const benchmarkId of missingCandidateBenchmarks) {
+    regressions.push(`candidate is missing latest completed benchmark run for ${benchmarkId}`);
+  }
+  const deltas = [];
+  for (const benchmarkId of [...baseRuns.keys()].sort()) {
+    const baseRun = baseRuns.get(benchmarkId);
+    const candidateRun = candidateRuns.get(benchmarkId);
+    if (!baseRun || !candidateRun) continue;
+    const basePassRate = computePassRate(baseRun);
+    const candidatePassRate = computePassRate(candidateRun);
+    const passRateDelta = candidatePassRate - basePassRate;
+    const delta = {
+      benchmarkId,
+      baseRunId: baseRun.runId,
+      candidateRunId: candidateRun.runId,
+      basePassRate,
+      candidatePassRate,
+      passRateDelta,
+      metricDeltas: {},
+      regressions: [],
+      improvements: []
+    };
+    if (passRateDelta < 0) {
+      delta.regressions.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
+      regressions.push(`${benchmarkId} pass rate regressed (${basePassRate} -> ${candidatePassRate})`);
+    } else if (passRateDelta > 0) {
+      delta.improvements.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
+      improvements.push(`${benchmarkId} pass rate improved (${basePassRate} -> ${candidatePassRate})`);
+    }
+    const metricDelta = compareMetricDeltas(baseRun.metrics, candidateRun.metrics);
+    delta.metricDeltas = metricDelta.deltas;
+    for (const regression of metricDelta.regressions) {
+      delta.regressions.push(regression);
+      regressions.push(`${benchmarkId} ${regression}`);
+    }
+    for (const improvement of metricDelta.improvements) {
+      delta.improvements.push(improvement);
+      improvements.push(`${benchmarkId} ${improvement}`);
+    }
+    deltas.push(delta);
   }
   return {
-    enabled: options.enabled,
-    shadowModeEnabled: options.shadowModeEnabled,
-    rootDir,
-    benchmarkDir,
-    runsDir,
-    benchmarks: {
-      total: benchmarkFiles.length,
-      valid: manifests.length,
-      invalid: invalidBenchmarks.length,
-      totalCases,
-      tags: [...tags].sort(),
-      sourceLinks: [...sourceLinks].sort()
-    },
-    runs: {
-      total: runFiles.length,
-      invalid: invalidRuns.length,
-      completed: runs.filter((run) => run.status === "completed").length,
-      failed: runs.filter((run) => run.status === "failed").length,
-      partial: runs.filter((run) => run.status === "partial").length,
-      running: runs.filter((run) => run.status === "running").length,
-      latestRunId: latestRun?.runId,
-      latestBenchmarkId: latestRun?.benchmarkId,
-      latestCompletedAt: latestRun?.completedAt
-    },
-    shadows: {
-      total: shadowFiles.length,
-      invalid: invalidShadows.length,
-      latestTraceId: latestShadow?.traceId,
-      latestRecordedAt: latestShadow?.recordedAt,
-      latestSessionKey: latestShadow?.sessionKey
+    passed: regressions.length === 0,
+    baseRootDir: baseSnapshot.status.rootDir,
+    candidateRootDir: candidateSnapshot.status.rootDir,
+    comparedBenchmarks: deltas.length,
+    missingCandidateBenchmarks,
+    invalidArtifacts: {
+      base: {
+        benchmarks: baseSnapshot.status.invalidBenchmarks.length,
+        runs: baseSnapshot.status.invalidRuns.length,
+        shadows: baseSnapshot.status.invalidShadows.length
+      },
+      candidate: {
+        benchmarks: candidateSnapshot.status.invalidBenchmarks.length,
+        runs: candidateSnapshot.status.invalidRuns.length,
+        shadows: candidateSnapshot.status.invalidShadows.length
+      }
     },
-    latestRun,
-    latestShadow,
-    invalidBenchmarks,
-    invalidRuns,
-    invalidShadows
+    regressions,
+    improvements,
+    deltas
   };
 }
@@ -26026,6 +26194,12 @@ async function runBenchmarkImportCliCommand(options) {
     force: options.force === true
   });
 }
+async function runBenchmarkCiGateCliCommand(options) {
+  return runEvalBenchmarkCiGate({
+    baseEvalStoreDir: options.baseEvalStoreDir,
+    candidateEvalStoreDir: options.candidateEvalStoreDir
+  });
+}
 async function runSessionCheckCliCommand(options) {
   return analyzeSessionIntegrity({ memoryDir: options.memoryDir });
 }
@@ -27153,6 +27327,18 @@ function registerCli(api, orchestrator) {
         console.log(JSON.stringify(summary, null, 2));
         console.log("OK");
       });
+      cmd.command("benchmark-ci-gate").description("Compare two eval stores and fail when the candidate regresses benchmark outcomes").requiredOption("--base <path>", "Path to the base eval store directory").requiredOption("--candidate <path>", "Path to the candidate eval store directory").action(async (...args) => {
+        const options = args[0] ?? {};
+        const summary = await runBenchmarkCiGateCliCommand({
+          baseEvalStoreDir: typeof options.base === "string" ? options.base : "",
+          candidateEvalStoreDir: typeof options.candidate === "string" ? options.candidate : ""
+        });
+        console.log(JSON.stringify(summary, null, 2));
+        if (!summary.passed) {
+          throw new Error("benchmark CI gate detected regressions");
+        }
+        console.log("OK");
+      });
       cmd.command("conversation-index-health").description("Show conversation index backend health and index stats").action(async () => {
         const health = await runConversationIndexHealthCliCommand(orchestrator);
         console.log(JSON.stringify(health, null, 2));