@joshuaswarren/openclaw-engram 9.0.17 → 9.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -30,7 +30,7 @@ AI agents forget everything between conversations. Engram fixes that.
30
30
  - **Local-first** — All memory data stays on your filesystem as plain markdown files. No cloud dependency, no vendor lock-in, fully portable.
31
31
  - **Pluggable search** — Choose from six search backends: QMD (hybrid BM25+vector+reranking), LanceDB, Meilisearch, Orama, remote HTTP, or bring your own.
32
32
  - **Memory OS features** — Graph recall, temporal memory tree, lifecycle policy, compounding, shared context, memory boxes, and identity continuity can be enabled progressively as your install grows.
33
- - **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording, so memory improvements can be measured on real agent trajectories instead of subjective recall demos.
33
+ - **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording and a CI benchmark delta gate, so memory improvements can be measured and regression-checked instead of argued from anecdotes.
34
34
  - **Zero-config start** — Install, add an API key, restart. Engram works out of the box with sensible defaults and progressively unlocks advanced features as you enable them.
35
35
 
36
36
  ## Quick Start
@@ -139,7 +139,7 @@ Engram's capabilities are organized into feature families that you can enable pr
139
139
  | **Compounding** | Weekly synthesis that surfaces patterns and recurring mistakes |
140
140
  | **Hot/Cold Tiering** | Automatic migration of aging memories to cold storage |
141
141
  | **Behavior Loop Tuning** | Runtime self-tuning of extraction and recall parameters |
142
- | **Evaluation Harness** | Tracks benchmark packs, run summaries, and live shadow recall records so future PRs can be gated on memory quality instead of anecdotes |
142
+ | **Evaluation Harness** | Tracks benchmark packs, run summaries, live shadow recall records, and CI delta comparisons so future PRs can be gated on memory quality instead of anecdotes |
143
143
 
144
144
  Start with defaults, then enable features as needed. See [Enable All Features](docs/enable-all-v8.md) for a full-feature config profile.
145
145
 
@@ -152,6 +152,7 @@ openclaw engram compat --strict # Compatibility check
152
152
  openclaw engram benchmark-status # Benchmark/eval harness packs, runs, shadow recalls, latest summaries
153
153
  openclaw engram benchmark-validate <path> # Validate a benchmark manifest or pack directory
154
154
  openclaw engram benchmark-import <path> # Import a validated benchmark pack into the eval store
155
+ openclaw engram benchmark-ci-gate # Compare base vs candidate eval stores and fail on regressions
155
156
  openclaw engram conversation-index-health # Conversation index status
156
157
  openclaw engram graph-health # Entity graph status
157
158
  openclaw engram tier-status # Hot/cold tier metrics
@@ -183,7 +184,7 @@ Full reference: [Config Reference](docs/config-reference.md)
183
184
  - [Search Backends](docs/search-backends.md) — Choosing and configuring search engines
184
185
  - [Writing a Search Backend](docs/writing-a-search-backend.md) — Build your own adapter
185
186
  - [Config Reference](docs/config-reference.md) — Every setting with defaults
186
- - [Evaluation Harness](docs/evaluation-harness.md) — Benchmark pack and run-summary format
187
+ - [Evaluation Harness](docs/evaluation-harness.md) — Benchmark pack, shadow recall, and CI delta gate format
187
188
  - [Architecture Overview](docs/architecture/overview.md) — System design and storage layout
188
189
  - [Retrieval Pipeline](docs/architecture/retrieval-pipeline.md) — How recall works
189
190
  - [Memory Lifecycle](docs/architecture/memory-lifecycle.md) — Write, consolidation, expiry
package/dist/index.js CHANGED
@@ -11717,6 +11717,151 @@ async function listNamedFiles(dir, fileName) {
11717
11717
  async function readJsonFile(filePath) {
11718
11718
  return JSON.parse(await readFile12(filePath, "utf-8"));
11719
11719
  }
11720
// Names of metrics where a smaller value is the better outcome.
// compareMetricDeltas treats every metric not listed here as higher-is-better.
var LOWER_IS_BETTER_METRICS = /* @__PURE__ */ new Set([
  "trustViolationRate"
]);
11721
/**
 * Fraction of a run's cases that passed.
 *
 * @param {{ totalCases: number, passedCases: number }} run - Run summary.
 * @returns {number} passedCases / totalCases, or 0 whenever totalCases is not
 *   greater than zero (this also avoids dividing by zero).
 */
function computePassRate(run) {
  if (run.totalCases > 0) {
    return run.passedCases / run.totalCases;
  }
  return 0;
}
11724
/**
 * Picks, for every benchmarkId, the most recent run whose status is "completed".
 *
 * Recency is Date.parse(completedAt ?? startedAt); a timestamp that does not
 * parse counts as 0, so such runs sort last.
 *
 * @param {Iterable<object>} runs - Run summaries; the input is copied, not mutated.
 * @returns {Map<string, object>} benchmarkId -> newest completed run. Entries are
 *   inserted in newest-first order.
 */
function latestCompletedRunsByBenchmark(runs) {
  const timestampOf = (run) => {
    const parsed = Date.parse(run.completedAt ?? run.startedAt);
    return Number.isNaN(parsed) ? 0 : parsed;
  };
  const newestFirst = Array.from(runs).filter((run) => run.status === "completed");
  newestFirst.sort((left, right) => timestampOf(right) - timestampOf(left));
  const latestById = /* @__PURE__ */ new Map();
  for (const run of newestFirst) {
    // The first run seen for an id is the newest one; later ones are older.
    if (latestById.has(run.benchmarkId)) continue;
    latestById.set(run.benchmarkId, run);
  }
  return latestById;
}
11738
/**
 * Diffs the numeric metrics of a base run against a candidate run.
 *
 * Only metrics that are keys of baseMetrics and are numbers on both sides are
 * compared; anything else is skipped. A metric listed in
 * LOWER_IS_BETTER_METRICS improves when it goes down, every other metric
 * improves when it goes up.
 *
 * @param {object|undefined|null} baseMetrics - Metrics of the base run.
 * @param {object|undefined|null} candidateMetrics - Metrics of the candidate run.
 * @returns {{ deltas: Object<string, number>, regressions: string[], improvements: string[] }}
 *   deltas holds candidate - base for each compared metric (including 0);
 *   regressions/improvements hold "<metric> <base> -> <candidate>" summaries.
 *   All three are empty when either argument is falsy.
 */
function compareMetricDeltas(baseMetrics, candidateMetrics) {
  const result = { deltas: {}, regressions: [], improvements: [] };
  if (!baseMetrics || !candidateMetrics) {
    return result;
  }
  for (const [metric, baseValue] of Object.entries(baseMetrics)) {
    const candidateValue = candidateMetrics[metric];
    const bothNumeric = typeof baseValue === "number" && typeof candidateValue === "number";
    if (!bothNumeric) continue;
    const change = candidateValue - baseValue;
    result.deltas[metric] = change;
    if (change === 0) continue;
    const improved = LOWER_IS_BETTER_METRICS.has(metric) ? change < 0 : change > 0;
    const bucket = improved ? result.improvements : result.regressions;
    bucket.push(`${metric} ${baseValue} -> ${candidateValue}`);
  }
  return result;
}
11763
/**
 * Loads and validates the contents of one eval store directory.
 *
 * Reads "manifest.json" files under <rootDir>/benchmarks plus the JSON files
 * under <rootDir>/runs and <rootDir>/shadow. A file that fails to read, parse
 * or validate is recorded as invalid ({ path, error }) instead of aborting.
 *
 * @param {{ rootDir: string, enabled: any, shadowModeEnabled: any }} options -
 *   rootDir is the store root; enabled and shadowModeEnabled are copied into
 *   the returned status unchanged.
 * @returns {Promise<{ status: object, manifests: object[], runs: object[], shadows: object[] }>}
 *   status is the summary; manifests, runs and shadows are the validated
 *   records. runs is ordered newest first by Date.parse(completedAt ?? startedAt),
 *   with unparseable timestamps counted as 0. shadows is ordered by recordedAt
 *   descending using string comparison.
 */
async function collectEvalStoreSnapshot(options) {
  const { rootDir } = options;
  const benchmarkDir = path14.join(rootDir, "benchmarks");
  const runsDir = path14.join(rootDir, "runs");
  const shadowDir = path14.join(rootDir, "shadow");
  const benchmarkFiles = await listNamedFiles(benchmarkDir, "manifest.json");
  const runFiles = await listJsonFiles(runsDir);
  const shadowFiles = await listJsonFiles(shadowDir);

  // Reads the files one at a time, in order; failures are collected, never thrown.
  const loadValidated = async (files, validate) => {
    const valid = [];
    const invalid = [];
    for (const filePath of files) {
      try {
        valid.push(validate(await readJsonFile(filePath)));
      } catch (error) {
        invalid.push({
          path: filePath,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return { valid, invalid };
  };

  const benchmarkLoad = await loadValidated(benchmarkFiles, (raw) => validateEvalBenchmarkManifest(raw));
  const runLoad = await loadValidated(runFiles, (raw) => validateEvalRunSummary(raw));
  const shadowLoad = await loadValidated(shadowFiles, (raw) => validateEvalShadowRecallRecord(raw));
  const manifests = benchmarkLoad.valid;
  const invalidBenchmarks = benchmarkLoad.invalid;
  const runs = runLoad.valid;
  const invalidRuns = runLoad.invalid;
  const shadows = shadowLoad.valid;
  const invalidShadows = shadowLoad.invalid;

  const runTimestamp = (run) => {
    const parsed = Date.parse(run.completedAt ?? run.startedAt);
    return Number.isNaN(parsed) ? 0 : parsed;
  };
  runs.sort((left, right) => runTimestamp(right) - runTimestamp(left));
  shadows.sort((left, right) => right.recordedAt.localeCompare(left.recordedAt));
  const newestRun = runs[0];
  const newestShadow = shadows[0];

  const tags = /* @__PURE__ */ new Set();
  const sourceLinks = /* @__PURE__ */ new Set();
  let totalCases = 0;
  for (const manifest of manifests) {
    totalCases += manifest.cases.length;
    (manifest.tags ?? []).forEach((tag) => tags.add(tag));
    (manifest.sourceLinks ?? []).forEach((link) => sourceLinks.add(link));
  }

  const countRunsWithStatus = (status) => runs.filter((run) => run.status === status).length;

  const status = {
    enabled: options.enabled,
    shadowModeEnabled: options.shadowModeEnabled,
    rootDir,
    benchmarkDir,
    runsDir,
    benchmarks: {
      total: benchmarkFiles.length,
      valid: manifests.length,
      invalid: invalidBenchmarks.length,
      totalCases,
      tags: [...tags].sort(),
      sourceLinks: [...sourceLinks].sort()
    },
    runs: {
      total: runFiles.length,
      invalid: invalidRuns.length,
      completed: countRunsWithStatus("completed"),
      failed: countRunsWithStatus("failed"),
      partial: countRunsWithStatus("partial"),
      running: countRunsWithStatus("running"),
      // "latest" is the newest run of any status, not only completed ones.
      latestRunId: newestRun?.runId,
      latestBenchmarkId: newestRun?.benchmarkId,
      latestCompletedAt: newestRun?.completedAt
    },
    shadows: {
      total: shadowFiles.length,
      invalid: invalidShadows.length,
      latestTraceId: newestShadow?.traceId,
      latestRecordedAt: newestShadow?.recordedAt,
      latestSessionKey: newestShadow?.sessionKey
    },
    latestRun: newestRun,
    latestShadow: newestShadow,
    invalidBenchmarks,
    invalidRuns,
    invalidShadows
  };
  return { status, manifests, runs, shadows };
}
11720
11865
  async function resolveBenchmarkManifestPath(sourcePath) {
11721
11866
  const info = await stat3(sourcePath);
11722
11867
  if (info.isDirectory()) {
@@ -11793,102 +11938,125 @@ async function recordEvalShadowRecall(options) {
11793
11938
  return targetPath;
11794
11939
  }
11795
11940
  async function getEvalHarnessStatus(options) {
11796
- const rootDir = resolveEvalStoreDir(options.memoryDir, options.evalStoreDir);
11797
- const benchmarkDir = path14.join(rootDir, "benchmarks");
11798
- const runsDir = path14.join(rootDir, "runs");
11799
- const shadowDir = path14.join(rootDir, "shadow");
11800
- const benchmarkFiles = await listNamedFiles(benchmarkDir, "manifest.json");
11801
- const runFiles = await listJsonFiles(runsDir);
11802
- const shadowFiles = await listJsonFiles(shadowDir);
11803
- const invalidBenchmarks = [];
11804
- const invalidRuns = [];
11805
- const invalidShadows = [];
11806
- const manifests = [];
11807
- for (const filePath of benchmarkFiles) {
11808
- try {
11809
- manifests.push(validateEvalBenchmarkManifest(await readJsonFile(filePath)));
11810
- } catch (error) {
11811
- invalidBenchmarks.push({
11812
- path: filePath,
11813
- error: error instanceof Error ? error.message : String(error)
11814
- });
11815
- }
11816
- }
11817
- const runs = [];
11818
- for (const filePath of runFiles) {
11819
- try {
11820
- runs.push(validateEvalRunSummary(await readJsonFile(filePath)));
11821
- } catch (error) {
11822
- invalidRuns.push({
11823
- path: filePath,
11824
- error: error instanceof Error ? error.message : String(error)
11825
- });
11826
- }
11941
+ return (await collectEvalStoreSnapshot({
11942
+ rootDir: resolveEvalStoreDir(options.memoryDir, options.evalStoreDir),
11943
+ enabled: options.enabled,
11944
+ shadowModeEnabled: options.shadowModeEnabled
11945
+ })).status;
11946
+ }
11947
/**
 * Resolves an eval store root that must be configured.
 *
 * A non-blank evalStoreDir wins and is returned trimmed, as given. Otherwise a
 * non-blank memoryDir is trimmed and passed to resolveEvalStoreDir.
 *
 * @param {{ memoryDir?: any, evalStoreDir?: any }} options - Candidate locations;
 *   non-string and whitespace-only values are treated as absent.
 * @param {string} label - Prefix for the error message (e.g. "base").
 * @returns {string} The eval store root.
 * @throws {Error} "<label> requires memoryDir or evalStoreDir" when neither is usable.
 */
function resolveRequiredEvalStoreRoot(options, label) {
  const trimmedOrEmpty = (value) => (typeof value === "string" ? value.trim() : "");
  const explicitStoreDir = trimmedOrEmpty(options.evalStoreDir);
  if (explicitStoreDir.length > 0) {
    return explicitStoreDir;
  }
  const memoryDir = trimmedOrEmpty(options.memoryDir);
  if (memoryDir.length > 0) {
    return resolveEvalStoreDir(memoryDir);
  }
  throw new Error(`${label} requires memoryDir or evalStoreDir`);
}
11956
/**
 * Compares a candidate eval store against a base eval store and reports
 * whether the candidate regressed.
 *
 * For every benchmark that has a latest completed run in the base store, the
 * candidate's latest completed run of the same benchmark is compared on pass
 * rate and on metrics (via compareMetricDeltas). Benchmarks that exist only in
 * the candidate are not compared. The gate fails when any of these holds:
 *   - either store contains invalid benchmark, run or shadow files,
 *   - the candidate lacks a completed run for a benchmark the base has,
 *   - a pass rate or metric regressed.
 *
 * @param {{ baseMemoryDir?: string, baseEvalStoreDir?: string,
 *           candidateMemoryDir?: string, candidateEvalStoreDir?: string }} options
 *   Location of each store; each side needs an evalStoreDir or a memoryDir.
 * @returns {Promise<{ passed: boolean, baseRootDir: string, candidateRootDir: string,
 *   comparedBenchmarks: number, missingCandidateBenchmarks: string[],
 *   invalidArtifacts: object, regressions: string[], improvements: string[],
 *   deltas: object[] }>} passed is true only when regressions is empty.
 * @throws {Error} When a store location cannot be resolved
 *   (from resolveRequiredEvalStoreRoot).
 */
async function runEvalBenchmarkCiGate(options) {
  const baseRootDir = resolveRequiredEvalStoreRoot(
    { memoryDir: options.baseMemoryDir, evalStoreDir: options.baseEvalStoreDir },
    "base"
  );
  const candidateRootDir = resolveRequiredEvalStoreRoot(
    { memoryDir: options.candidateMemoryDir, evalStoreDir: options.candidateEvalStoreDir },
    "candidate"
  );
  // The two stores are independent, so they are read concurrently.
  const loadSnapshot = (rootDir) => collectEvalStoreSnapshot({
    rootDir,
    enabled: true,
    shadowModeEnabled: true
  });
  const [baseSnapshot, candidateSnapshot] = await Promise.all([
    loadSnapshot(baseRootDir),
    loadSnapshot(candidateRootDir)
  ]);
  const regressions = [];
  const improvements = [];

  // The same counts feed both the regression messages and the returned summary.
  const countInvalid = (snapshot) => ({
    benchmarks: snapshot.status.invalidBenchmarks.length,
    runs: snapshot.status.invalidRuns.length,
    shadows: snapshot.status.invalidShadows.length
  });
  const invalidArtifacts = {
    base: countInvalid(baseSnapshot),
    candidate: countInvalid(candidateSnapshot)
  };
  const invalidKinds = [
    ["benchmarks", "benchmark manifest(s)"],
    ["runs", "run summary file(s)"],
    ["shadows", "shadow record(s)"]
  ];
  for (const side of ["base", "candidate"]) {
    for (const [kind, noun] of invalidKinds) {
      const count = invalidArtifacts[side][kind];
      if (count > 0) {
        regressions.push(`${side} store has ${count} invalid ${noun}`);
      }
    }
  }

  const baseRuns = latestCompletedRunsByBenchmark(baseSnapshot.runs);
  const candidateRuns = latestCompletedRunsByBenchmark(candidateSnapshot.runs);
  const baseBenchmarkIds = [...baseRuns.keys()].sort();
  const missingCandidateBenchmarks = baseBenchmarkIds.filter((benchmarkId) => !candidateRuns.has(benchmarkId));
  for (const benchmarkId of missingCandidateBenchmarks) {
    regressions.push(`candidate is missing latest completed benchmark run for ${benchmarkId}`);
  }

  const deltas = [];
  for (const benchmarkId of baseBenchmarkIds) {
    const baseRun = baseRuns.get(benchmarkId);
    const candidateRun = candidateRuns.get(benchmarkId);
    // Missing candidate runs were already reported above.
    if (!baseRun || !candidateRun) continue;
    const basePassRate = computePassRate(baseRun);
    const candidatePassRate = computePassRate(candidateRun);
    const passRateDelta = candidatePassRate - basePassRate;
    const delta = {
      benchmarkId,
      baseRunId: baseRun.runId,
      candidateRunId: candidateRun.runId,
      basePassRate,
      candidatePassRate,
      passRateDelta,
      metricDeltas: {},
      regressions: [],
      improvements: []
    };
    if (passRateDelta < 0) {
      delta.regressions.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      regressions.push(`${benchmarkId} pass rate regressed (${basePassRate} -> ${candidatePassRate})`);
    } else if (passRateDelta > 0) {
      delta.improvements.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      improvements.push(`${benchmarkId} pass rate improved (${basePassRate} -> ${candidatePassRate})`);
    }
    const metricDelta = compareMetricDeltas(baseRun.metrics, candidateRun.metrics);
    delta.metricDeltas = metricDelta.deltas;
    for (const regression of metricDelta.regressions) {
      delta.regressions.push(regression);
      regressions.push(`${benchmarkId} ${regression}`);
    }
    for (const improvement of metricDelta.improvements) {
      delta.improvements.push(improvement);
      improvements.push(`${benchmarkId} ${improvement}`);
    }
    deltas.push(delta);
  }

  return {
    passed: regressions.length === 0,
    baseRootDir: baseSnapshot.status.rootDir,
    candidateRootDir: candidateSnapshot.status.rootDir,
    comparedBenchmarks: deltas.length,
    missingCandidateBenchmarks,
    invalidArtifacts,
    regressions,
    improvements,
    deltas
  };
}
11894
12062
 
@@ -26026,6 +26194,12 @@ async function runBenchmarkImportCliCommand(options) {
26026
26194
  force: options.force === true
26027
26195
  });
26028
26196
  }
26197
/**
 * CLI adapter for the benchmark CI gate: forwards only the two eval store
 * directories to runEvalBenchmarkCiGate and returns its result.
 *
 * @param {{ baseEvalStoreDir: string, candidateEvalStoreDir: string }} options
 * @returns {Promise<object>} The gate summary produced by runEvalBenchmarkCiGate.
 */
async function runBenchmarkCiGateCliCommand(options) {
  const { baseEvalStoreDir, candidateEvalStoreDir } = options;
  return runEvalBenchmarkCiGate({ baseEvalStoreDir, candidateEvalStoreDir });
}
26029
26203
  async function runSessionCheckCliCommand(options) {
26030
26204
  return analyzeSessionIntegrity({ memoryDir: options.memoryDir });
26031
26205
  }
@@ -27153,6 +27327,18 @@ function registerCli(api, orchestrator) {
27153
27327
  console.log(JSON.stringify(summary, null, 2));
27154
27328
  console.log("OK");
27155
27329
  });
27330
+ cmd.command("benchmark-ci-gate").description("Compare two eval stores and fail when the candidate regresses benchmark outcomes").requiredOption("--base <path>", "Path to the base eval store directory").requiredOption("--candidate <path>", "Path to the candidate eval store directory").action(async (...args) => {
27331
+ const options = args[0] ?? {};
27332
+ const summary = await runBenchmarkCiGateCliCommand({
27333
+ baseEvalStoreDir: typeof options.base === "string" ? options.base : "",
27334
+ candidateEvalStoreDir: typeof options.candidate === "string" ? options.candidate : ""
27335
+ });
27336
+ console.log(JSON.stringify(summary, null, 2));
27337
+ if (!summary.passed) {
27338
+ throw new Error("benchmark CI gate detected regressions");
27339
+ }
27340
+ console.log("OK");
27341
+ });
27156
27342
  cmd.command("conversation-index-health").description("Show conversation index backend health and index stats").action(async () => {
27157
27343
  const health = await runConversationIndexHealthCliCommand(orchestrator);
27158
27344
  console.log(JSON.stringify(health, null, 2));