@joshuaswarren/openclaw-engram 9.0.51 → 9.0.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/dist/index.js +166 -0
- package/dist/index.js.map +1 -1
- package/openclaw.plugin.json +10 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -32,6 +32,7 @@ AI agents forget everything between conversations. Engram fixes that.
|
|
|
32
32
|
- **Memory OS features** — Graph recall, temporal memory tree, lifecycle policy, compounding, shared context, memory boxes, and identity continuity can be enabled progressively as your install grows.
|
|
33
33
|
- **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording and a CI benchmark delta gate, so memory improvements can be measured and regression-checked instead of argued from anecdotes.
|
|
34
34
|
- **Baseline snapshot discipline** — Engram can now, when `benchmarkBaselineSnapshotsEnabled` is enabled, capture typed baseline snapshots of the latest completed benchmark runs so later PR delta reporting can compare candidates against a stable stored reference instead of an ad hoc branch state.
|
|
35
|
+
- **Named baseline delta reporting** — Engram can now, when `benchmarkDeltaReporterEnabled` is enabled, compare the current eval store against a stored baseline snapshot, emit a machine-readable delta report plus markdown summary, and exit with an error when the candidate regresses against that baseline (lower pass rate, regressed metrics, missing benchmark runs, or invalid store artifacts).
|
|
35
36
|
- **Objective-state recall** — Engram can now store normalized file, process, and tool outcomes and, when `objectiveStateRecallEnabled` is enabled, inject the most relevant objective-state snapshots back into recall context as a separate `Objective State` section.
|
|
36
37
|
- **Causal trajectory graph foundation** — Engram can now persist typed `goal -> action -> observation -> outcome -> follow-up` chains when `causalTrajectoryMemoryEnabled` is enabled and, with `actionGraphRecallEnabled`, emit deterministic action-conditioned edges into the causal graph for later trajectory-aware retrieval.
|
|
37
38
|
- **Causal trajectory recall** — Engram can now, when `causalTrajectoryRecallEnabled` is enabled, inject prompt-relevant causal chains back into recall context as a separate `Causal Trajectories` section with lightweight match explainability.
|
|
@@ -172,6 +173,7 @@ openclaw engram benchmark-status # Benchmark/eval harness packs, run
|
|
|
172
173
|
openclaw engram benchmark-validate <path> # Validate a benchmark manifest or pack directory
|
|
173
174
|
openclaw engram benchmark-import <path> # Import a validated benchmark pack into the eval store
|
|
174
175
|
openclaw engram benchmark-baseline-snapshot # Capture a typed baseline snapshot of the latest completed benchmark runs
|
|
176
|
+
openclaw engram benchmark-baseline-report # Compare the current eval store against a stored baseline snapshot
|
|
175
177
|
openclaw engram benchmark-ci-gate # Compare base vs candidate eval stores and fail on regressions
|
|
176
178
|
openclaw engram objective-state-status # Objective-state snapshot counts and latest stored snapshot
|
|
177
179
|
openclaw engram causal-trajectory-status # Causal-trajectory record counts and latest stored chain
|
|
@@ -212,6 +214,7 @@ Key settings:
|
|
|
212
214
|
| `evalHarnessEnabled` | `false` | Enable the evaluation harness for benchmark packs, run summaries, and shadow recall bookkeeping |
|
|
213
215
|
| `evalShadowModeEnabled` | `false` | Record live recall decisions to the eval store without changing injected output |
|
|
214
216
|
| `benchmarkBaselineSnapshotsEnabled` | `false` | Enable versioned baseline snapshot artifacts for the latest completed benchmark runs |
|
|
217
|
+
| `benchmarkDeltaReporterEnabled` | `false` | Enable named-baseline delta reports against the current eval store |
|
|
215
218
|
| `evalStoreDir` | `{memoryDir}/state/evals` | Root directory for benchmark packs, run summaries, and shadow recall records |
|
|
216
219
|
| `objectiveStateMemoryEnabled` | `false` | Enable the objective-state memory foundation for normalized world/tool state snapshots |
|
|
217
220
|
| `objectiveStateSnapshotWritesEnabled` | `false` | Permit objective-state snapshot writers to persist typed state records |
|
package/dist/index.js
CHANGED
|
@@ -288,6 +288,7 @@ function parseConfig(raw) {
|
|
|
288
288
|
evalHarnessEnabled: cfg.evalHarnessEnabled === true,
|
|
289
289
|
evalShadowModeEnabled: cfg.evalShadowModeEnabled === true,
|
|
290
290
|
benchmarkBaselineSnapshotsEnabled: cfg.benchmarkBaselineSnapshotsEnabled === true,
|
|
291
|
+
benchmarkDeltaReporterEnabled: cfg.benchmarkDeltaReporterEnabled === true,
|
|
291
292
|
evalStoreDir: typeof cfg.evalStoreDir === "string" && cfg.evalStoreDir.trim().length > 0 ? cfg.evalStoreDir.trim() : path.join(memoryDir, "state", "evals"),
|
|
292
293
|
objectiveStateMemoryEnabled: cfg.objectiveStateMemoryEnabled === true,
|
|
293
294
|
objectiveStateSnapshotWritesEnabled: cfg.objectiveStateSnapshotWritesEnabled === true,
|
|
@@ -11908,6 +11909,48 @@ function compareMetricDeltas(baseMetrics, candidateMetrics) {
|
|
|
11908
11909
|
}
|
|
11909
11910
|
return { deltas, regressions, improvements };
|
|
11910
11911
|
}
|
|
11912
|
+
/**
 * Render an eval baseline delta report as a markdown summary.
 *
 * The output starts with a title and a bullet list of header fields
 * (pass flag, baseline snapshot id / creation time / source root, candidate
 * root, compared-benchmark count, optional missing-benchmark list, and the
 * invalid candidate artifact counts), followed by "Regressions",
 * "Improvements" and "Benchmark Deltas" sections. An empty section is
 * rendered as a single "- none" bullet.
 *
 * @param {object} report - Delta report. Reads `passed`, `baselineSnapshotId`,
 *   `baselineCreatedAt`, `baselineSourceRootDir`, `candidateRootDir`,
 *   `comparedBenchmarks`, `missingCandidateBenchmarks`,
 *   `invalidArtifacts.candidate.{benchmarks,runs,shadows,baselines}`,
 *   `regressions`, `improvements`, and `deltas` (each delta providing
 *   `benchmarkId`, `basePassRate`, `candidatePassRate`, `passRateDelta`).
 * @returns {string} Markdown text joined with "\n" (no trailing newline).
 */
function formatEvalBaselineDeltaMarkdown(report) {
  // The pass-rate delta is produced by a floating-point subtraction, so raw
  // interpolation can print noise such as 0.10000000000000009. Round finite
  // numbers to 4 decimals for display only; values that are already short
  // (0.25, 1, 0) print exactly as before, and non-numeric values are passed
  // through unchanged.
  const formatRate = (value) =>
    typeof value === "number" && Number.isFinite(value) ? String(Number(value.toFixed(4))) : String(value);
  // Shared rendering for every section: "- none" when empty, else one bullet per entry.
  const bulletList = (items) => (items.length === 0 ? ["- none"] : items.map((item) => `- ${item}`));
  const invalid = report.invalidArtifacts.candidate;
  const lines = [
    "# Eval Baseline Delta Report",
    "",
    `- Passed: ${report.passed ? "yes" : "no"}`,
    `- Baseline snapshot: ${report.baselineSnapshotId}`,
    `- Baseline created: ${report.baselineCreatedAt}`,
    `- Baseline source root: ${report.baselineSourceRootDir}`,
    `- Candidate root: ${report.candidateRootDir}`,
    `- Benchmarks compared: ${report.comparedBenchmarks}`
  ];
  // Only mention missing benchmarks when there are any.
  if (report.missingCandidateBenchmarks.length > 0) {
    lines.push(`- Missing candidate benchmarks: ${report.missingCandidateBenchmarks.join(", ")}`);
  }
  const deltaSummaries = report.deltas.map(
    (delta) => `${delta.benchmarkId}: passRate ${formatRate(delta.basePassRate)} -> ${formatRate(delta.candidatePassRate)} (delta ${formatRate(delta.passRateDelta)})`
  );
  lines.push(
    `- Invalid candidate artifacts: benchmarks=${invalid.benchmarks}, runs=${invalid.runs}, shadows=${invalid.shadows}, baselines=${invalid.baselines}`,
    "",
    "## Regressions",
    ...bulletList(report.regressions),
    "",
    "## Improvements",
    ...bulletList(report.improvements),
    "",
    "## Benchmark Deltas",
    ...bulletList(deltaSummaries)
  );
  return lines.join("\n");
}
|
|
11911
11954
|
async function collectEvalStoreSnapshot(options) {
|
|
11912
11955
|
const rootDir = options.rootDir;
|
|
11913
11956
|
const benchmarkDir = path15.join(rootDir, "benchmarks");
|
|
@@ -12179,6 +12222,105 @@ async function createEvalBaselineSnapshot(options) {
|
|
|
12179
12222
|
await writeFile11(targetPath, JSON.stringify(snapshot, null, 2), "utf-8");
|
|
12180
12223
|
return { targetPath, snapshot };
|
|
12181
12224
|
}
|
|
12225
|
+
/**
 * Compare the current eval store against a named, stored baseline snapshot.
 *
 * Collects a snapshot of the eval store resolved from `options.memoryDir` /
 * `options.evalStoreDir`, looks up the baseline whose `snapshotId` matches
 * `options.snapshotId`, and builds a report that lists regressions,
 * improvements and per-benchmark deltas. Regressions are recorded for invalid
 * store artifacts, for baseline benchmarks with no entry in the latest
 * completed candidate runs, for negative pass-rate deltas, and for whatever
 * `compareMetricDeltas` reports as regressed.
 *
 * @param {object} options
 * @param {boolean} options.benchmarkDeltaReporterEnabled - Must be exactly `true`.
 * @param {string} options.snapshotId - Identifier of the stored baseline snapshot.
 * @param {string} options.memoryDir - Passed to `resolveEvalStoreDir`.
 * @param {string} [options.evalStoreDir] - Passed to `resolveEvalStoreDir`.
 * @returns {Promise<object>} Report whose `passed` flag is true only when no
 *   regressions were recorded; `markdownReport` holds the rendered summary.
 * @throws {Error} When the reporter is disabled or the snapshot id is not
 *   among the collected baselines. Validation helpers may throw as well.
 */
async function runEvalBaselineDeltaReport(options) {
  if (options.benchmarkDeltaReporterEnabled !== true) {
    throw new Error("benchmark delta reporter is disabled");
  }
  const requestedSnapshotId = assertSafePathSegment(assertString(options.snapshotId, "snapshotId"), "snapshotId");
  const storeRootDir = resolveEvalStoreDir(options.memoryDir, options.evalStoreDir);
  // All collection features are switched on regardless of the caller's config.
  const candidate = await collectEvalStoreSnapshot({
    rootDir: storeRootDir,
    enabled: true,
    shadowModeEnabled: true,
    baselineSnapshotsEnabled: true,
    memoryRedTeamBenchEnabled: true
  });
  const baseline = candidate.baselines.find((entry) => entry.snapshotId === requestedSnapshotId);
  if (!baseline) {
    throw new Error(`benchmark baseline snapshot not found: ${requestedSnapshotId}`);
  }
  const { status } = candidate;
  const regressions = [];
  const improvements = [];

  // Any invalid artifact in the candidate store counts as a regression.
  const invalidArtifactChecks = [
    ["invalidBenchmarks", "invalid benchmark manifest(s)"],
    ["invalidRuns", "invalid run summary file(s)"],
    ["invalidShadows", "invalid shadow record(s)"],
    ["invalidBaselines", "invalid baseline snapshot file(s)"]
  ];
  for (const [field, label] of invalidArtifactChecks) {
    const invalidCount = status[field].length;
    if (invalidCount > 0) {
      regressions.push(`candidate store has ${invalidCount} ${label}`);
    }
  }

  const latestRuns = latestCompletedRunsByBenchmark(candidate.runs);
  const baselineById = new Map();
  for (const benchmark of baseline.benchmarks) {
    baselineById.set(benchmark.benchmarkId, benchmark);
  }
  const sortedBenchmarkIds = [...baselineById.keys()].sort();

  // Baseline benchmarks absent from the latest completed candidate runs.
  const missingCandidateBenchmarks = sortedBenchmarkIds.filter((benchmarkId) => !latestRuns.has(benchmarkId));
  for (const benchmarkId of missingCandidateBenchmarks) {
    regressions.push(`candidate is missing latest completed benchmark run for ${benchmarkId}`);
  }

  const deltas = [];
  for (const benchmarkId of sortedBenchmarkIds) {
    const baseEntry = baselineById.get(benchmarkId);
    const candidateRun = latestRuns.get(benchmarkId);
    if (!baseEntry || !candidateRun) continue;

    const passRateDelta = computePassRate(candidateRun) - baseEntry.passRate;
    const entry = {
      benchmarkId,
      baseRunId: baseEntry.runId,
      candidateRunId: candidateRun.runId,
      basePassRate: baseEntry.passRate,
      candidatePassRate: computePassRate(candidateRun),
      passRateDelta,
      metricDeltas: {},
      regressions: [],
      improvements: []
    };

    // Records the pass-rate change on both the per-benchmark and the overall list.
    const notePassRateChange = (perBenchmark, overall, verb) => {
      perBenchmark.push(`passRate ${baseEntry.passRate} -> ${entry.candidatePassRate}`);
      overall.push(`${benchmarkId} pass rate ${verb} (${baseEntry.passRate} -> ${entry.candidatePassRate})`);
    };
    if (passRateDelta < 0) {
      notePassRateChange(entry.regressions, regressions, "regressed");
    } else if (passRateDelta > 0) {
      notePassRateChange(entry.improvements, improvements, "improved");
    }

    const metricComparison = compareMetricDeltas(baseEntry.metrics, candidateRun.metrics);
    entry.metricDeltas = metricComparison.deltas;
    for (const note of metricComparison.regressions) {
      entry.regressions.push(note);
      regressions.push(`${benchmarkId} ${note}`);
    }
    for (const note of metricComparison.improvements) {
      entry.improvements.push(note);
      improvements.push(`${benchmarkId} ${note}`);
    }
    deltas.push(entry);
  }

  const report = {
    passed: regressions.length === 0,
    baselineSnapshotId: baseline.snapshotId,
    baselineCreatedAt: baseline.createdAt,
    baselineSourceRootDir: baseline.sourceRootDir,
    candidateRootDir: status.rootDir,
    comparedBenchmarks: deltas.length,
    missingCandidateBenchmarks,
    invalidArtifacts: {
      candidate: {
        benchmarks: status.invalidBenchmarks.length,
        runs: status.invalidRuns.length,
        shadows: status.invalidShadows.length,
        baselines: status.invalidBaselines.length
      }
    },
    regressions,
    improvements,
    deltas,
    markdownReport: ""
  };
  // Rendered last so the markdown reflects the fully populated report.
  report.markdownReport = formatEvalBaselineDeltaMarkdown(report);
  return report;
}
|
|
12182
12324
|
function resolveRequiredEvalStoreRoot(options, label) {
|
|
12183
12325
|
if (typeof options.evalStoreDir === "string" && options.evalStoreDir.trim().length > 0) {
|
|
12184
12326
|
return options.evalStoreDir.trim();
|
|
@@ -28964,6 +29106,14 @@ async function runBenchmarkCiGateCliCommand(options) {
|
|
|
28964
29106
|
candidateEvalStoreDir: options.candidateEvalStoreDir
|
|
28965
29107
|
});
|
|
28966
29108
|
}
|
|
29109
|
+
/**
 * CLI adapter for the named-baseline delta report.
 *
 * Forwards only the four fields the reporter consumes to
 * `runEvalBaselineDeltaReport` and returns its result unchanged.
 *
 * @param {object} options
 * @param {string} options.memoryDir
 * @param {string} [options.evalStoreDir]
 * @param {boolean} options.benchmarkDeltaReporterEnabled
 * @param {string} options.snapshotId
 * @returns {Promise<object>} The report produced by `runEvalBaselineDeltaReport`.
 */
async function runBenchmarkBaselineReportCliCommand(options) {
  const { memoryDir, evalStoreDir, benchmarkDeltaReporterEnabled, snapshotId } = options;
  return runEvalBaselineDeltaReport({ memoryDir, evalStoreDir, benchmarkDeltaReporterEnabled, snapshotId });
}
|
|
28967
29117
|
async function runObjectiveStateStatusCliCommand(options) {
|
|
28968
29118
|
return getObjectiveStateStoreStatus({
|
|
28969
29119
|
memoryDir: options.memoryDir,
|
|
@@ -30354,6 +30504,22 @@ function registerCli(api, orchestrator) {
|
|
|
30354
30504
|
}
|
|
30355
30505
|
console.log("OK");
|
|
30356
30506
|
});
|
|
30507
|
+
cmd.command("benchmark-baseline-report").description("Compare the current eval store against a named stored benchmark baseline snapshot").requiredOption("--snapshot-id <id>", "Stable baseline snapshot identifier").action(async (...args) => {
|
|
30508
|
+
const options = args[0] ?? {};
|
|
30509
|
+
const summary = await runBenchmarkBaselineReportCliCommand({
|
|
30510
|
+
memoryDir: orchestrator.config.memoryDir,
|
|
30511
|
+
evalStoreDir: orchestrator.config.evalStoreDir,
|
|
30512
|
+
benchmarkDeltaReporterEnabled: orchestrator.config.benchmarkDeltaReporterEnabled,
|
|
30513
|
+
snapshotId: typeof options.snapshotId === "string" ? options.snapshotId : ""
|
|
30514
|
+
});
|
|
30515
|
+
const { markdownReport, ...jsonSummary } = summary;
|
|
30516
|
+
console.log(JSON.stringify(jsonSummary, null, 2));
|
|
30517
|
+
console.log(markdownReport);
|
|
30518
|
+
if (!summary.passed) {
|
|
30519
|
+
throw new Error("benchmark baseline report detected regressions");
|
|
30520
|
+
}
|
|
30521
|
+
console.log("OK");
|
|
30522
|
+
});
|
|
30357
30523
|
cmd.command("objective-state-status").description("Show objective-state store status, snapshot counts, and latest stored snapshot").action(async () => {
|
|
30358
30524
|
const status = await runObjectiveStateStatusCliCommand({
|
|
30359
30525
|
memoryDir: orchestrator.config.memoryDir,
|