@joshuaswarren/openclaw-engram 9.0.51 → 9.0.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -32,6 +32,7 @@ AI agents forget everything between conversations. Engram fixes that.
32
32
  - **Memory OS features** — Graph recall, temporal memory tree, lifecycle policy, compounding, shared context, memory boxes, and identity continuity can be enabled progressively as your install grows.
33
33
  - **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording and a CI benchmark delta gate, so memory improvements can be measured and regression-checked instead of argued from anecdotes.
34
34
  - **Baseline snapshot discipline** — Engram can now, when `benchmarkBaselineSnapshotsEnabled` is enabled, capture typed baseline snapshots of the latest completed benchmark runs so later PR delta reporting can compare candidates against a stable stored reference instead of an ad hoc branch state.
35
+ - **Named baseline delta reporting** — Engram can now, when `benchmarkDeltaReporterEnabled` is enabled, compare the current eval store against a stored baseline snapshot, emit a machine-readable delta report plus markdown summary, and fail fast when a candidate regresses a benchmark that previously passed.
35
36
  - **Objective-state recall** — Engram can now store normalized file, process, and tool outcomes and, when `objectiveStateRecallEnabled` is enabled, inject the most relevant objective-state snapshots back into recall context as a separate `Objective State` section.
36
37
  - **Causal trajectory graph foundation** — Engram can now persist typed `goal -> action -> observation -> outcome -> follow-up` chains when `causalTrajectoryMemoryEnabled` is enabled and, with `actionGraphRecallEnabled`, emit deterministic action-conditioned edges into the causal graph for later trajectory-aware retrieval.
37
38
  - **Causal trajectory recall** — Engram can now, when `causalTrajectoryRecallEnabled` is enabled, inject prompt-relevant causal chains back into recall context as a separate `Causal Trajectories` section with lightweight match explainability.
@@ -172,6 +173,7 @@ openclaw engram benchmark-status # Benchmark/eval harness packs, run
172
173
  openclaw engram benchmark-validate <path> # Validate a benchmark manifest or pack directory
173
174
  openclaw engram benchmark-import <path> # Import a validated benchmark pack into the eval store
174
175
  openclaw engram benchmark-baseline-snapshot # Capture a typed baseline snapshot of the latest completed benchmark runs
176
+ openclaw engram benchmark-baseline-report # Compare the current eval store against a stored baseline snapshot
175
177
  openclaw engram benchmark-ci-gate # Compare base vs candidate eval stores and fail on regressions
176
178
  openclaw engram objective-state-status # Objective-state snapshot counts and latest stored snapshot
177
179
  openclaw engram causal-trajectory-status # Causal-trajectory record counts and latest stored chain
@@ -212,6 +214,7 @@ Key settings:
212
214
  | `evalHarnessEnabled` | `false` | Enable the evaluation harness for benchmark packs, run summaries, and shadow recall bookkeeping |
213
215
  | `evalShadowModeEnabled` | `false` | Record live recall decisions to the eval store without changing injected output |
214
216
  | `benchmarkBaselineSnapshotsEnabled` | `false` | Enable versioned baseline snapshot artifacts for the latest completed benchmark runs |
217
+ | `benchmarkDeltaReporterEnabled` | `false` | Enable named-baseline delta reports against the current eval store |
215
218
  | `evalStoreDir` | `{memoryDir}/state/evals` | Root directory for benchmark packs, run summaries, and shadow recall records |
216
219
  | `objectiveStateMemoryEnabled` | `false` | Enable the objective-state memory foundation for normalized world/tool state snapshots |
217
220
  | `objectiveStateSnapshotWritesEnabled` | `false` | Permit objective-state snapshot writers to persist typed state records |
package/dist/index.js CHANGED
@@ -288,6 +288,7 @@ function parseConfig(raw) {
288
288
  evalHarnessEnabled: cfg.evalHarnessEnabled === true,
289
289
  evalShadowModeEnabled: cfg.evalShadowModeEnabled === true,
290
290
  benchmarkBaselineSnapshotsEnabled: cfg.benchmarkBaselineSnapshotsEnabled === true,
291
+ benchmarkDeltaReporterEnabled: cfg.benchmarkDeltaReporterEnabled === true,
291
292
  evalStoreDir: typeof cfg.evalStoreDir === "string" && cfg.evalStoreDir.trim().length > 0 ? cfg.evalStoreDir.trim() : path.join(memoryDir, "state", "evals"),
292
293
  objectiveStateMemoryEnabled: cfg.objectiveStateMemoryEnabled === true,
293
294
  objectiveStateSnapshotWritesEnabled: cfg.objectiveStateSnapshotWritesEnabled === true,
@@ -11908,6 +11909,48 @@ function compareMetricDeltas(baseMetrics, candidateMetrics) {
11908
11909
  }
11909
11910
  return { deltas, regressions, improvements };
11910
11911
  }
/**
 * Render a baseline delta report as a human-readable markdown summary.
 *
 * Reads the header fields (`passed`, `baselineSnapshotId`, `baselineCreatedAt`,
 * `baselineSourceRootDir`, `candidateRootDir`, `comparedBenchmarks`), the
 * `missingCandidateBenchmarks` list, the `invalidArtifacts.candidate` counts,
 * and the `regressions`, `improvements`, and `deltas` arrays from `report`.
 *
 * @param {object} report - Delta report object; it is only read, never mutated.
 * @returns {string} Markdown text with lines joined by "\n" (no trailing newline).
 */
function formatEvalBaselineDeltaMarkdown(report) {
  // Pass rates are ratios and deltas are differences of ratios, so raw
  // interpolation can surface float noise such as 0.09999999999999998.
  // Round to at most 4 decimals for display only; anything that is not a
  // finite number is stringified exactly as template interpolation would.
  const formatRate = (value) =>
    typeof value === "number" && Number.isFinite(value)
      ? String(Number(value.toFixed(4)))
      : String(value);

  // Emit one bullet per entry, or a single "- none" placeholder when empty.
  const bulletList = (entries) =>
    entries.length === 0 ? ["- none"] : entries.map((entry) => `- ${entry}`);

  const invalid = report.invalidArtifacts.candidate;
  const header = [
    "# Eval Baseline Delta Report",
    "",
    `- Passed: ${report.passed ? "yes" : "no"}`,
    `- Baseline snapshot: ${report.baselineSnapshotId}`,
    `- Baseline created: ${report.baselineCreatedAt}`,
    `- Baseline source root: ${report.baselineSourceRootDir}`,
    `- Candidate root: ${report.candidateRootDir}`,
    `- Benchmarks compared: ${report.comparedBenchmarks}`
  ];
  // The "missing" bullet is omitted entirely when nothing is missing.
  if (report.missingCandidateBenchmarks.length > 0) {
    header.push(`- Missing candidate benchmarks: ${report.missingCandidateBenchmarks.join(", ")}`);
  }
  header.push(
    `- Invalid candidate artifacts: benchmarks=${invalid.benchmarks}, runs=${invalid.runs}, shadows=${invalid.shadows}, baselines=${invalid.baselines}`
  );

  const deltaLines = report.deltas.map(
    (delta) =>
      `${delta.benchmarkId}: passRate ${formatRate(delta.basePassRate)} -> ${formatRate(delta.candidatePassRate)} (delta ${formatRate(delta.passRateDelta)})`
  );

  return [
    ...header,
    "",
    "## Regressions",
    ...bulletList(report.regressions),
    "",
    "## Improvements",
    ...bulletList(report.improvements),
    "",
    "## Benchmark Deltas",
    ...bulletList(deltaLines)
  ].join("\n");
}
11911
11954
  async function collectEvalStoreSnapshot(options) {
11912
11955
  const rootDir = options.rootDir;
11913
11956
  const benchmarkDir = path15.join(rootDir, "benchmarks");
@@ -12179,6 +12222,105 @@ async function createEvalBaselineSnapshot(options) {
12179
12222
  await writeFile11(targetPath, JSON.stringify(snapshot, null, 2), "utf-8");
12180
12223
  return { targetPath, snapshot };
12181
12224
  }
/**
 * Compare the current eval store against a named baseline snapshot stored in
 * that same store and build a delta report.
 *
 * A report "passes" only when no regression line was recorded. Regression
 * lines are recorded, in this order, for: invalid candidate artifacts
 * (benchmark manifests, run summaries, shadow records, baseline snapshots),
 * baseline benchmarks with no latest completed candidate run, and then per
 * benchmark (sorted by id) a pass-rate drop followed by any metric regressions
 * returned by `compareMetricDeltas`.
 *
 * @param {object} options
 * @param {boolean} options.benchmarkDeltaReporterEnabled - Must be exactly `true`.
 * @param {string} options.snapshotId - Baseline snapshot id; validated through
 *   `assertString` and `assertSafePathSegment` before use.
 * @param {string} [options.memoryDir] - Forwarded to `resolveEvalStoreDir`.
 * @param {string} [options.evalStoreDir] - Forwarded to `resolveEvalStoreDir`.
 * @returns {Promise<object>} Report with `passed`, baseline/candidate identity
 *   fields, `comparedBenchmarks`, `missingCandidateBenchmarks`,
 *   `invalidArtifacts`, `regressions`, `improvements`, `deltas`, and a
 *   `markdownReport` rendered by `formatEvalBaselineDeltaMarkdown`.
 * @throws {Error} When the reporter is disabled or the snapshot id is not
 *   present among the store's baselines.
 */
async function runEvalBaselineDeltaReport(options) {
  if (options.benchmarkDeltaReporterEnabled !== true) {
    throw new Error("benchmark delta reporter is disabled");
  }
  const snapshotId = assertSafePathSegment(assertString(options.snapshotId, "snapshotId"), "snapshotId");
  const candidateRootDir = resolveEvalStoreDir(options.memoryDir, options.evalStoreDir);
  // Every collection flag is forced on here regardless of caller config.
  const candidateSnapshot = await collectEvalStoreSnapshot({
    rootDir: candidateRootDir,
    enabled: true,
    shadowModeEnabled: true,
    baselineSnapshotsEnabled: true,
    memoryRedTeamBenchEnabled: true
  });
  const baselineSnapshot = candidateSnapshot.baselines.find((snapshot) => snapshot.snapshotId === snapshotId);
  if (!baselineSnapshot) {
    throw new Error(`benchmark baseline snapshot not found: ${snapshotId}`);
  }

  // Read each invalid-artifact count once; the same numbers feed both the
  // regression messages and the `invalidArtifacts` section of the report.
  const { status } = candidateSnapshot;
  const invalidCounts = {
    benchmarks: status.invalidBenchmarks.length,
    runs: status.invalidRuns.length,
    shadows: status.invalidShadows.length,
    baselines: status.invalidBaselines.length
  };

  const regressions = [];
  const improvements = [];
  if (invalidCounts.benchmarks > 0) {
    regressions.push(`candidate store has ${invalidCounts.benchmarks} invalid benchmark manifest(s)`);
  }
  if (invalidCounts.runs > 0) {
    regressions.push(`candidate store has ${invalidCounts.runs} invalid run summary file(s)`);
  }
  if (invalidCounts.shadows > 0) {
    regressions.push(`candidate store has ${invalidCounts.shadows} invalid shadow record(s)`);
  }
  if (invalidCounts.baselines > 0) {
    regressions.push(`candidate store has ${invalidCounts.baselines} invalid baseline snapshot file(s)`);
  }

  const candidateRuns = latestCompletedRunsByBenchmark(candidateSnapshot.runs);
  const baselineBenchmarks = new Map(
    baselineSnapshot.benchmarks.map((benchmark) => [benchmark.benchmarkId, benchmark])
  );
  // Sort the baseline ids once; both the missing list and the delta list
  // follow this order.
  const baselineBenchmarkIds = [...baselineBenchmarks.keys()].sort();
  const missingCandidateBenchmarks = baselineBenchmarkIds.filter((benchmarkId) => !candidateRuns.has(benchmarkId));
  for (const benchmarkId of missingCandidateBenchmarks) {
    regressions.push(`candidate is missing latest completed benchmark run for ${benchmarkId}`);
  }

  const deltas = [];
  for (const benchmarkId of baselineBenchmarkIds) {
    const baseBenchmark = baselineBenchmarks.get(benchmarkId);
    const candidateRun = candidateRuns.get(benchmarkId);
    if (!baseBenchmark || !candidateRun) continue;

    // Compute the candidate pass rate exactly once so the reported rate and
    // the reported delta are always derived from the same value.
    const basePassRate = baseBenchmark.passRate;
    const candidatePassRate = computePassRate(candidateRun);
    const passRateDelta = candidatePassRate - basePassRate;
    const delta = {
      benchmarkId,
      baseRunId: baseBenchmark.runId,
      candidateRunId: candidateRun.runId,
      basePassRate,
      candidatePassRate,
      passRateDelta,
      metricDeltas: {},
      regressions: [],
      improvements: []
    };
    if (passRateDelta < 0) {
      delta.regressions.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      regressions.push(`${benchmarkId} pass rate regressed (${basePassRate} -> ${candidatePassRate})`);
    } else if (passRateDelta > 0) {
      delta.improvements.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      improvements.push(`${benchmarkId} pass rate improved (${basePassRate} -> ${candidatePassRate})`);
    }

    const metricDelta = compareMetricDeltas(baseBenchmark.metrics, candidateRun.metrics);
    delta.metricDeltas = metricDelta.deltas;
    for (const regression of metricDelta.regressions) {
      delta.regressions.push(regression);
      regressions.push(`${benchmarkId} ${regression}`);
    }
    for (const improvement of metricDelta.improvements) {
      delta.improvements.push(improvement);
      improvements.push(`${benchmarkId} ${improvement}`);
    }
    deltas.push(delta);
  }

  const report = {
    passed: regressions.length === 0,
    baselineSnapshotId: baselineSnapshot.snapshotId,
    baselineCreatedAt: baselineSnapshot.createdAt,
    baselineSourceRootDir: baselineSnapshot.sourceRootDir,
    candidateRootDir: status.rootDir,
    comparedBenchmarks: deltas.length,
    missingCandidateBenchmarks,
    invalidArtifacts: { candidate: invalidCounts },
    regressions,
    improvements,
    deltas,
    markdownReport: ""
  };
  // The markdown is rendered from the finished report, so it is filled in last.
  report.markdownReport = formatEvalBaselineDeltaMarkdown(report);
  return report;
}
12182
12324
  function resolveRequiredEvalStoreRoot(options, label) {
12183
12325
  if (typeof options.evalStoreDir === "string" && options.evalStoreDir.trim().length > 0) {
12184
12326
  return options.evalStoreDir.trim();
@@ -28964,6 +29106,14 @@ async function runBenchmarkCiGateCliCommand(options) {
28964
29106
  candidateEvalStoreDir: options.candidateEvalStoreDir
28965
29107
  });
28966
29108
  }
/**
 * CLI adapter for the `benchmark-baseline-report` command.
 *
 * Forwards exactly four fields from `options` to `runEvalBaselineDeltaReport`
 * and returns its result; any other properties on `options` are dropped.
 *
 * @param {object} options
 * @param {string} [options.memoryDir]
 * @param {string} [options.evalStoreDir]
 * @param {boolean} [options.benchmarkDeltaReporterEnabled]
 * @param {string} [options.snapshotId]
 * @returns {Promise<object>} Whatever `runEvalBaselineDeltaReport` resolves to.
 */
async function runBenchmarkBaselineReportCliCommand(options) {
  const { memoryDir, evalStoreDir, benchmarkDeltaReporterEnabled, snapshotId } = options;
  return runEvalBaselineDeltaReport({
    memoryDir,
    evalStoreDir,
    benchmarkDeltaReporterEnabled,
    snapshotId
  });
}
28967
29117
  async function runObjectiveStateStatusCliCommand(options) {
28968
29118
  return getObjectiveStateStoreStatus({
28969
29119
  memoryDir: options.memoryDir,
@@ -30354,6 +30504,22 @@ function registerCli(api, orchestrator) {
30354
30504
  }
30355
30505
  console.log("OK");
30356
30506
  });
30507
+ cmd.command("benchmark-baseline-report").description("Compare the current eval store against a named stored benchmark baseline snapshot").requiredOption("--snapshot-id <id>", "Stable baseline snapshot identifier").action(async (...args) => {
30508
+ const options = args[0] ?? {};
30509
+ const summary = await runBenchmarkBaselineReportCliCommand({
30510
+ memoryDir: orchestrator.config.memoryDir,
30511
+ evalStoreDir: orchestrator.config.evalStoreDir,
30512
+ benchmarkDeltaReporterEnabled: orchestrator.config.benchmarkDeltaReporterEnabled,
30513
+ snapshotId: typeof options.snapshotId === "string" ? options.snapshotId : ""
30514
+ });
30515
+ const { markdownReport, ...jsonSummary } = summary;
30516
+ console.log(JSON.stringify(jsonSummary, null, 2));
30517
+ console.log(markdownReport);
30518
+ if (!summary.passed) {
30519
+ throw new Error("benchmark baseline report detected regressions");
30520
+ }
30521
+ console.log("OK");
30522
+ });
30357
30523
  cmd.command("objective-state-status").description("Show objective-state store status, snapshot counts, and latest stored snapshot").action(async () => {
30358
30524
  const status = await runObjectiveStateStatusCliCommand({
30359
30525
  memoryDir: orchestrator.config.memoryDir,