@joshuaswarren/openclaw-engram 9.0.50 → 9.0.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -31,6 +31,8 @@ AI agents forget everything between conversations. Engram fixes that.
31
31
  - **Pluggable search** — Choose from six search backends: QMD (hybrid BM25+vector+reranking), LanceDB, Meilisearch, Orama, remote HTTP, or bring your own.
32
32
  - **Memory OS features** — Graph recall, temporal memory tree, lifecycle policy, compounding, shared context, memory boxes, and identity continuity can be enabled progressively as your install grows.
33
33
  - **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording and a CI benchmark delta gate, so memory improvements can be measured and regression-checked instead of argued from anecdotes.
34
+ - **Baseline snapshot discipline** — Engram can now, when `benchmarkBaselineSnapshotsEnabled` is enabled, capture typed baseline snapshots of the latest completed benchmark runs so later PR delta reporting can compare candidates against a stable stored reference instead of an ad hoc branch state.
35
+ - **Named baseline delta reporting** — Engram can now, when `benchmarkDeltaReporterEnabled` is enabled, compare the current eval store against a stored baseline snapshot, emit a machine-readable delta report plus markdown summary, and fail fast when a candidate regresses a benchmark that previously passed.
34
36
  - **Objective-state recall** — Engram can now store normalized file, process, and tool outcomes and, when `objectiveStateRecallEnabled` is enabled, inject the most relevant objective-state snapshots back into recall context as a separate `Objective State` section.
35
37
  - **Causal trajectory graph foundation** — Engram can now persist typed `goal -> action -> observation -> outcome -> follow-up` chains when `causalTrajectoryMemoryEnabled` is enabled and, with `actionGraphRecallEnabled`, emit deterministic action-conditioned edges into the causal graph for later trajectory-aware retrieval.
36
38
  - **Causal trajectory recall** — Engram can now, when `causalTrajectoryRecallEnabled` is enabled, inject prompt-relevant causal chains back into recall context as a separate `Causal Trajectories` section with lightweight match explainability.
@@ -170,6 +172,8 @@ openclaw engram compat --strict # Compatibility check
170
172
  openclaw engram benchmark-status # Benchmark/eval harness packs, runs, shadow recalls, latest summaries
171
173
  openclaw engram benchmark-validate <path> # Validate a benchmark manifest or pack directory
172
174
  openclaw engram benchmark-import <path> # Import a validated benchmark pack into the eval store
175
+ openclaw engram benchmark-baseline-snapshot # Capture a typed baseline snapshot of the latest completed benchmark runs
176
+ openclaw engram benchmark-baseline-report # Compare the current eval store against a stored baseline snapshot
173
177
  openclaw engram benchmark-ci-gate # Compare base vs candidate eval stores and fail on regressions
174
178
  openclaw engram objective-state-status # Objective-state snapshot counts and latest stored snapshot
175
179
  openclaw engram causal-trajectory-status # Causal-trajectory record counts and latest stored chain
@@ -209,6 +213,8 @@ Key settings:
209
213
  | `memoryDir` | `~/.openclaw/workspace/memory/local` | Memory storage root |
210
214
  | `evalHarnessEnabled` | `false` | Enable the evaluation harness for benchmark packs, run summaries, and shadow recall bookkeeping |
211
215
  | `evalShadowModeEnabled` | `false` | Record live recall decisions to the eval store without changing injected output |
216
+ | `benchmarkBaselineSnapshotsEnabled` | `false` | Enable versioned baseline snapshot artifacts for the latest completed benchmark runs |
217
+ | `benchmarkDeltaReporterEnabled` | `false` | Enable named-baseline delta reports against the current eval store |
212
218
  | `evalStoreDir` | `{memoryDir}/state/evals` | Root directory for benchmark packs, run summaries, and shadow recall records |
213
219
  | `objectiveStateMemoryEnabled` | `false` | Enable the objective-state memory foundation for normalized world/tool state snapshots |
214
220
  | `objectiveStateSnapshotWritesEnabled` | `false` | Permit objective-state snapshot writers to persist typed state records |
package/dist/index.js CHANGED
@@ -287,6 +287,8 @@ function parseConfig(raw) {
287
287
  conversationRecallTimeoutMs: typeof cfg.conversationRecallTimeoutMs === "number" ? cfg.conversationRecallTimeoutMs : 800,
288
288
  evalHarnessEnabled: cfg.evalHarnessEnabled === true,
289
289
  evalShadowModeEnabled: cfg.evalShadowModeEnabled === true,
290
+ benchmarkBaselineSnapshotsEnabled: cfg.benchmarkBaselineSnapshotsEnabled === true,
291
+ benchmarkDeltaReporterEnabled: cfg.benchmarkDeltaReporterEnabled === true,
290
292
  evalStoreDir: typeof cfg.evalStoreDir === "string" && cfg.evalStoreDir.trim().length > 0 ? cfg.evalStoreDir.trim() : path.join(memoryDir, "state", "evals"),
291
293
  objectiveStateMemoryEnabled: cfg.objectiveStateMemoryEnabled === true,
292
294
  objectiveStateSnapshotWritesEnabled: cfg.objectiveStateSnapshotWritesEnabled === true,
@@ -11666,11 +11668,14 @@ function resolveEvalStoreDir(memoryDir, overrideDir) {
11666
11668
  }
11667
11669
  return path15.join(memoryDir, "state", "evals");
11668
11670
  }
11669
/**
 * Ensure `value` can be used as exactly one path component.
 *
 * A value is rejected when it is not a string, is empty, is one of the
 * traversal segments `.` / `..`, or contains a path separator (`/` or `\`)
 * or a NUL byte.
 *
 * @param {string} value - Candidate path segment.
 * @param {string} field - Field name interpolated into the error message.
 * @returns {string} `value`, unchanged, when it is safe.
 * @throws {Error} `${field} must be a safe path segment` when `value` is unsafe.
 */
function assertSafePathSegment(value, field) {
  const isUnsafe =
    // Non-strings used to crash on `.includes` with an unrelated TypeError.
    typeof value !== "string" ||
    // An empty segment is dropped by path joining, so the result would point
    // at the parent directory instead of a child entry.
    value.length === 0 ||
    value === "." ||
    value === ".." ||
    value.includes("/") ||
    value.includes("\\") ||
    // Node's fs APIs refuse paths that contain a NUL byte; fail early with
    // the field-specific message instead.
    value.includes("\0");
  if (isUnsafe) {
    throw new Error(`${field} must be a safe path segment`);
  }
  return value;
}

/**
 * Validate a benchmark id as a safe path segment.
 *
 * @param {string} benchmarkId - Benchmark identifier to check.
 * @returns {string} The unchanged `benchmarkId`.
 * @throws {Error} `benchmarkId must be a safe path segment` when unsafe.
 */
function assertSafeBenchmarkId(benchmarkId) {
  return assertSafePathSegment(benchmarkId, "benchmarkId");
}
11675
11680
  function validateEvalBenchmarkManifest(raw, options) {
11676
11681
  if (!isRecord(raw)) throw new Error("benchmark manifest must be an object");
@@ -11727,14 +11732,7 @@ function validateEvalRunSummary(raw) {
11727
11732
  if (!Number.isFinite(totalCases) || totalCases < 0) throw new Error("totalCases must be a non-negative number");
11728
11733
  if (!Number.isFinite(passedCases) || passedCases < 0) throw new Error("passedCases must be a non-negative number");
11729
11734
  if (!Number.isFinite(failedCases) || failedCases < 0) throw new Error("failedCases must be a non-negative number");
11730
- const metrics = isRecord(raw.metrics) ? {
11731
- recallPrecisionAtK: typeof raw.metrics.recallPrecisionAtK === "number" ? raw.metrics.recallPrecisionAtK : void 0,
11732
- actionOutcomeScore: typeof raw.metrics.actionOutcomeScore === "number" ? raw.metrics.actionOutcomeScore : void 0,
11733
- objectiveStateCoverage: typeof raw.metrics.objectiveStateCoverage === "number" ? raw.metrics.objectiveStateCoverage : void 0,
11734
- causalPathRecall: typeof raw.metrics.causalPathRecall === "number" ? raw.metrics.causalPathRecall : void 0,
11735
- trustViolationRate: typeof raw.metrics.trustViolationRate === "number" ? raw.metrics.trustViolationRate : void 0,
11736
- creationRecoveryScore: typeof raw.metrics.creationRecoveryScore === "number" ? raw.metrics.creationRecoveryScore : void 0
11737
- } : void 0;
11735
+ const metrics = parseOptionalEvalRunMetrics(raw.metrics);
11738
11736
  return {
11739
11737
  schemaVersion: 1,
11740
11738
  runId: assertString(raw.runId, "runId"),
@@ -11750,6 +11748,55 @@ function validateEvalRunSummary(raw) {
11750
11748
  gitRef: typeof raw.gitRef === "string" && raw.gitRef.trim().length > 0 ? raw.gitRef.trim() : void 0
11751
11749
  };
11752
11750
  }
11751
/**
 * Validate a raw baseline snapshot document and return a normalized copy.
 *
 * Checks run in this order: top-level shape, every `benchmarks[i]` entry,
 * `benchmarkCount`, and finally the required top-level string fields.
 *
 * @param {unknown} raw - Parsed snapshot document.
 * @returns {object} Snapshot with `schemaVersion`, `snapshotId`, `createdAt`,
 *   `sourceRootDir`, `benchmarkCount`, `benchmarks`, and optional `notes` /
 *   `gitRef` (blank or non-string optional values become `undefined`).
 * @throws {Error} When the document, an entry, `passRate`, or `benchmarkCount`
 *   is invalid. Required strings are checked by `assertString` (defined
 *   elsewhere in this file; presumably throws on missing values).
 */
function validateEvalBaselineSnapshot(raw) {
  // A bare Number(x) maps null, "", false and [] to 0 and true to 1, so a
  // corrupted field such as `"passRate": null` would be accepted as a real
  // measurement. Only genuine numbers and non-blank numeric strings convert.
  const toNumber = (value) => {
    if (typeof value === "number") return value;
    if (typeof value === "string" && value.trim().length > 0) return Number(value);
    return Number.NaN;
  };
  // Optional text fields are trimmed; anything blank or non-string is dropped.
  const optionalText = (value) =>
    typeof value === "string" && value.trim().length > 0 ? value.trim() : void 0;

  if (!isRecord(raw)) throw new Error("eval baseline snapshot must be an object");
  if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
  if (!Array.isArray(raw.benchmarks)) throw new Error("benchmarks must be an array");

  const benchmarks = raw.benchmarks.map((item, index) => {
    if (!isRecord(item)) throw new Error(`benchmarks[${index}] must be an object`);
    const passRate = toNumber(item.passRate);
    if (!Number.isFinite(passRate) || passRate < 0 || passRate > 1) {
      throw new Error(`benchmarks[${index}].passRate must be a number between 0 and 1`);
    }
    const metrics = parseOptionalEvalRunMetrics(item.metrics);
    return {
      benchmarkId: assertString(item.benchmarkId, `benchmarks[${index}].benchmarkId`),
      runId: assertString(item.runId, `benchmarks[${index}].runId`),
      completedAt: optionalText(item.completedAt),
      gitRef: optionalText(item.gitRef),
      passRate,
      metrics
    };
  });

  const benchmarkCount = toNumber(raw.benchmarkCount);
  if (!Number.isFinite(benchmarkCount) || benchmarkCount < 0) {
    throw new Error("benchmarkCount must be a non-negative number");
  }
  // The stored count is redundant with the array; a mismatch signals a
  // truncated or hand-edited file.
  if (benchmarkCount !== benchmarks.length) {
    throw new Error("benchmarkCount must match benchmarks.length");
  }

  return {
    schemaVersion: 1,
    snapshotId: assertString(raw.snapshotId, "snapshotId"),
    createdAt: assertString(raw.createdAt, "createdAt"),
    sourceRootDir: assertString(raw.sourceRootDir, "sourceRootDir"),
    benchmarkCount,
    benchmarks,
    notes: optionalText(raw.notes),
    gitRef: optionalText(raw.gitRef)
  };
}
11789
/**
 * Extract the known eval-run metrics from an optional raw metrics object.
 *
 * @param {unknown} raw - Raw `metrics` value from a run summary or baseline entry.
 * @returns {object|undefined} `undefined` when `raw` is not a record; otherwise
 *   an object that always carries all six metric keys, each set to the finite
 *   number found in `raw` or to `undefined`.
 */
function parseOptionalEvalRunMetrics(raw) {
  if (!isRecord(raw)) return void 0;
  const metricNames = [
    "recallPrecisionAtK",
    "actionOutcomeScore",
    "objectiveStateCoverage",
    "causalPathRecall",
    "trustViolationRate",
    "creationRecoveryScore"
  ];
  const metrics = {};
  for (const name of metricNames) {
    const value = raw[name];
    // NaN and +/-Infinity are typeof "number" but are not usable measurements
    // (an overflowing JSON literal such as 1e999 parses to Infinity), so they
    // are treated as absent, like any other non-numeric value.
    metrics[name] = typeof value === "number" && Number.isFinite(value) ? value : void 0;
  }
  return metrics;
}
11753
11800
  function validateEvalShadowRecallRecord(raw) {
11754
11801
  if (!isRecord(raw)) throw new Error("eval shadow recall record must be an object");
11755
11802
  if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
@@ -11862,17 +11909,62 @@ function compareMetricDeltas(baseMetrics, candidateMetrics) {
11862
11909
  }
11863
11910
  return { deltas, regressions, improvements };
11864
11911
  }
11912
/**
 * Render a baseline delta report as a markdown document.
 *
 * Layout: a title, a bullet summary (with an extra "Missing candidate
 * benchmarks" bullet only when some are missing), then "Regressions",
 * "Improvements" and "Benchmark Deltas" sections. An empty section is
 * rendered as a single `- none` bullet.
 *
 * @param {object} report - Delta report; reads `passed`, the baseline and
 *   candidate identity fields, `comparedBenchmarks`,
 *   `missingCandidateBenchmarks`, `invalidArtifacts.candidate`, `regressions`,
 *   `improvements` and `deltas`.
 * @returns {string} Markdown text with lines joined by "\n".
 */
function formatEvalBaselineDeltaMarkdown(report) {
  const sectionBody = (entries, render) =>
    entries.length === 0 ? ["- none"] : Array.from(entries, (entry) => render(entry));
  const asBullet = (text) => `- ${text}`;
  const asDeltaBullet = (delta) =>
    `- ${delta.benchmarkId}: passRate ${delta.basePassRate} -> ${delta.candidatePassRate} (delta ${delta.passRateDelta})`;

  const summary = [
    `- Passed: ${report.passed ? "yes" : "no"}`,
    `- Baseline snapshot: ${report.baselineSnapshotId}`,
    `- Baseline created: ${report.baselineCreatedAt}`,
    `- Baseline source root: ${report.baselineSourceRootDir}`,
    `- Candidate root: ${report.candidateRootDir}`,
    `- Benchmarks compared: ${report.comparedBenchmarks}`
  ];
  if (report.missingCandidateBenchmarks.length > 0) {
    summary.push(`- Missing candidate benchmarks: ${report.missingCandidateBenchmarks.join(", ")}`);
  }
  summary.push(
    `- Invalid candidate artifacts: benchmarks=${report.invalidArtifacts.candidate.benchmarks}, runs=${report.invalidArtifacts.candidate.runs}, shadows=${report.invalidArtifacts.candidate.shadows}, baselines=${report.invalidArtifacts.candidate.baselines}`
  );

  const document = [
    "# Eval Baseline Delta Report",
    "",
    ...summary,
    "",
    "## Regressions",
    ...sectionBody(report.regressions, asBullet),
    "",
    "## Improvements",
    ...sectionBody(report.improvements, asBullet),
    "",
    "## Benchmark Deltas",
    ...sectionBody(report.deltas, asDeltaBullet)
  ];
  return document.join("\n");
}
11865
11954
  async function collectEvalStoreSnapshot(options) {
11866
11955
  const rootDir = options.rootDir;
11867
11956
  const benchmarkDir = path15.join(rootDir, "benchmarks");
11868
11957
  const runsDir = path15.join(rootDir, "runs");
11869
11958
  const shadowDir = path15.join(rootDir, "shadow");
11959
+ const baselineDir = path15.join(rootDir, "baselines");
11870
11960
  const benchmarkFiles = await listNamedFiles(benchmarkDir, "manifest.json");
11871
11961
  const runFiles = await listJsonFiles(runsDir);
11872
11962
  const shadowFiles = await listJsonFiles(shadowDir);
11963
+ const baselineFiles = await listJsonFiles(baselineDir);
11873
11964
  const invalidBenchmarks = [];
11874
11965
  const invalidRuns = [];
11875
11966
  const invalidShadows = [];
11967
+ const invalidBaselines = [];
11876
11968
  const manifests = [];
11877
11969
  for (const filePath of benchmarkFiles) {
11878
11970
  try {
@@ -11910,12 +12002,24 @@ async function collectEvalStoreSnapshot(options) {
11910
12002
  });
11911
12003
  }
11912
12004
  }
12005
+ const baselines = [];
12006
+ for (const filePath of baselineFiles) {
12007
+ try {
12008
+ baselines.push(validateEvalBaselineSnapshot(await readJsonFile(filePath)));
12009
+ } catch (error) {
12010
+ invalidBaselines.push({
12011
+ path: filePath,
12012
+ error: error instanceof Error ? error.message : String(error)
12013
+ });
12014
+ }
12015
+ }
11913
12016
  runs.sort((a, b) => {
11914
12017
  const aTime = Date.parse(a.completedAt ?? a.startedAt);
11915
12018
  const bTime = Date.parse(b.completedAt ?? b.startedAt);
11916
12019
  return (Number.isNaN(bTime) ? 0 : bTime) - (Number.isNaN(aTime) ? 0 : aTime);
11917
12020
  });
11918
12021
  shadows.sort((a, b) => b.recordedAt.localeCompare(a.recordedAt));
12022
+ baselines.sort((a, b) => b.createdAt.localeCompare(a.createdAt));
11919
12023
  const tags = /* @__PURE__ */ new Set();
11920
12024
  const attackClasses = /* @__PURE__ */ new Set();
11921
12025
  const sourceLinks = /* @__PURE__ */ new Set();
@@ -11968,15 +12072,26 @@ async function collectEvalStoreSnapshot(options) {
11968
12072
  latestRecordedAt: shadows[0]?.recordedAt,
11969
12073
  latestSessionKey: shadows[0]?.sessionKey
11970
12074
  },
12075
+ baselines: {
12076
+ enabled: options.baselineSnapshotsEnabled === true,
12077
+ total: baselineFiles.length,
12078
+ invalid: invalidBaselines.length,
12079
+ latestSnapshotId: baselines[0]?.snapshotId,
12080
+ latestCreatedAt: baselines[0]?.createdAt,
12081
+ latestBenchmarkCount: baselines[0]?.benchmarkCount
12082
+ },
11971
12083
  latestRun: runs[0],
11972
12084
  latestShadow: shadows[0],
12085
+ latestBaseline: baselines[0],
11973
12086
  invalidBenchmarks,
11974
12087
  invalidRuns,
11975
- invalidShadows
12088
+ invalidShadows,
12089
+ invalidBaselines
11976
12090
  },
11977
12091
  manifests,
11978
12092
  runs,
11979
- shadows
12093
+ shadows,
12094
+ baselines
11980
12095
  };
11981
12096
  }
11982
12097
  async function resolveBenchmarkManifestPath(sourcePath) {
@@ -12066,9 +12181,146 @@ async function getEvalHarnessStatus(options) {
12066
12181
  rootDir: resolveEvalStoreDir(options.memoryDir, options.evalStoreDir),
12067
12182
  enabled: options.enabled,
12068
12183
  shadowModeEnabled: options.shadowModeEnabled,
12184
+ baselineSnapshotsEnabled: options.baselineSnapshotsEnabled,
12069
12185
  memoryRedTeamBenchEnabled: options.memoryRedTeamBenchEnabled
12070
12186
  })).status;
12071
12187
  }
12188
/**
 * Capture a baseline snapshot of the latest completed run of each benchmark
 * and write it to `<evalStore>/baselines/<snapshotId>.json`.
 *
 * @param {object} options
 * @param {boolean} options.baselineSnapshotsEnabled - Must be exactly `true`.
 * @param {string} options.snapshotId - Snapshot id; must be a safe path segment.
 * @param {string} options.memoryDir - Passed to `resolveEvalStoreDir`.
 * @param {string} [options.evalStoreDir] - Passed to `resolveEvalStoreDir`.
 * @param {string} [options.createdAt] - Timestamp override; defaults to the
 *   current time as an ISO string.
 * @param {string} [options.notes] - Optional notes stored in the snapshot.
 * @param {string} [options.gitRef] - Optional git ref stored in the snapshot.
 * @returns {Promise<{targetPath: string, snapshot: object}>} Written file path
 *   and the validated snapshot.
 * @throws {Error} When snapshots are disabled, or when id/snapshot validation fails.
 */
async function createEvalBaselineSnapshot(options) {
  if (options.baselineSnapshotsEnabled !== true) {
    throw new Error("benchmark baseline snapshots are disabled");
  }
  const requestedId = assertString(options.snapshotId, "snapshotId");
  const snapshotId = assertSafePathSegment(requestedId, "snapshotId");
  const rootDir = resolveEvalStoreDir(options.memoryDir, options.evalStoreDir);

  // The store is read with every feature flag forced on; the caller's own
  // flag was already checked above.
  const store = await collectEvalStoreSnapshot({
    rootDir,
    enabled: true,
    shadowModeEnabled: true,
    baselineSnapshotsEnabled: true,
    memoryRedTeamBenchEnabled: true
  });

  // One entry per benchmark, ordered by benchmark id so output is stable.
  const orderedRuns = Array.from(latestCompletedRunsByBenchmark(store.runs).values());
  orderedRuns.sort((left, right) => left.benchmarkId.localeCompare(right.benchmarkId));
  const benchmarks = [];
  for (const run of orderedRuns) {
    const { benchmarkId, runId, completedAt, gitRef, metrics } = run;
    benchmarks.push({
      benchmarkId,
      runId,
      completedAt,
      gitRef,
      passRate: computePassRate(run),
      metrics
    });
  }

  // Round-trip through the validator so the file on disk is always loadable.
  const snapshot = validateEvalBaselineSnapshot({
    schemaVersion: 1,
    snapshotId,
    createdAt: options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString(),
    sourceRootDir: rootDir,
    benchmarkCount: benchmarks.length,
    benchmarks,
    notes: options.notes,
    gitRef: options.gitRef
  });

  const targetPath = path15.join(rootDir, "baselines", `${snapshot.snapshotId}.json`);
  await mkdir10(path15.dirname(targetPath), { recursive: true });
  const serialized = JSON.stringify(snapshot, null, 2);
  await writeFile11(targetPath, serialized, "utf-8");
  return { targetPath, snapshot };
}
12225
/**
 * Compare the current eval store against a stored baseline snapshot.
 *
 * The baseline is looked up by `snapshotId` among the baselines found in the
 * same eval store. Regressions are collected in this order: invalid candidate
 * artifacts, benchmarks missing from the candidate, then per-benchmark pass
 * rate and metric regressions in benchmark-id order. The report passes only
 * when no regression was recorded.
 *
 * @param {object} options
 * @param {boolean} options.benchmarkDeltaReporterEnabled - Must be exactly `true`.
 * @param {string} options.snapshotId - Baseline snapshot id (safe path segment).
 * @param {string} options.memoryDir - Passed to `resolveEvalStoreDir`.
 * @param {string} [options.evalStoreDir] - Passed to `resolveEvalStoreDir`.
 * @returns {Promise<object>} Report with `passed`, baseline/candidate identity,
 *   `comparedBenchmarks`, `missingCandidateBenchmarks`, `invalidArtifacts`,
 *   `regressions`, `improvements`, `deltas` and `markdownReport`.
 * @throws {Error} When the reporter is disabled, the id is invalid, or the
 *   named baseline snapshot is not present in the store.
 */
async function runEvalBaselineDeltaReport(options) {
  if (options.benchmarkDeltaReporterEnabled !== true) {
    throw new Error("benchmark delta reporter is disabled");
  }
  const requestedId = assertString(options.snapshotId, "snapshotId");
  const snapshotId = assertSafePathSegment(requestedId, "snapshotId");
  const candidateRootDir = resolveEvalStoreDir(options.memoryDir, options.evalStoreDir);
  const candidateSnapshot = await collectEvalStoreSnapshot({
    rootDir: candidateRootDir,
    enabled: true,
    shadowModeEnabled: true,
    baselineSnapshotsEnabled: true,
    memoryRedTeamBenchEnabled: true
  });
  const baselineSnapshot = candidateSnapshot.baselines.find((entry) => entry.snapshotId === snapshotId);
  if (!baselineSnapshot) {
    throw new Error(`benchmark baseline snapshot not found: ${snapshotId}`);
  }

  const { status } = candidateSnapshot;
  const invalidCounts = {
    benchmarks: status.invalidBenchmarks.length,
    runs: status.invalidRuns.length,
    shadows: status.invalidShadows.length,
    baselines: status.invalidBaselines.length
  };
  const regressions = [];
  const improvements = [];

  // Any unreadable artifact in the candidate store fails the report outright.
  const invalidArtifactChecks = [
    [invalidCounts.benchmarks, (count) => `candidate store has ${count} invalid benchmark manifest(s)`],
    [invalidCounts.runs, (count) => `candidate store has ${count} invalid run summary file(s)`],
    [invalidCounts.shadows, (count) => `candidate store has ${count} invalid shadow record(s)`],
    [invalidCounts.baselines, (count) => `candidate store has ${count} invalid baseline snapshot file(s)`]
  ];
  for (const [count, describe] of invalidArtifactChecks) {
    if (count > 0) regressions.push(describe(count));
  }

  const candidateRuns = latestCompletedRunsByBenchmark(candidateSnapshot.runs);
  const baselineBenchmarks = new Map();
  for (const benchmark of baselineSnapshot.benchmarks) {
    baselineBenchmarks.set(benchmark.benchmarkId, benchmark);
  }
  const baselineIds = [...baselineBenchmarks.keys()].sort();

  // A benchmark the baseline covered but the candidate never completed counts
  // as a regression.
  const missingCandidateBenchmarks = baselineIds.filter((benchmarkId) => !candidateRuns.has(benchmarkId));
  for (const benchmarkId of missingCandidateBenchmarks) {
    regressions.push(`candidate is missing latest completed benchmark run for ${benchmarkId}`);
  }

  const deltas = [];
  for (const benchmarkId of baselineIds) {
    const baseBenchmark = baselineBenchmarks.get(benchmarkId);
    const candidateRun = candidateRuns.get(benchmarkId);
    if (!baseBenchmark || !candidateRun) continue;

    const basePassRate = baseBenchmark.passRate;
    const candidatePassRate = computePassRate(candidateRun);
    const passRateDelta = candidatePassRate - basePassRate;
    const benchmarkRegressions = [];
    const benchmarkImprovements = [];

    if (passRateDelta < 0) {
      benchmarkRegressions.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      regressions.push(`${benchmarkId} pass rate regressed (${basePassRate} -> ${candidatePassRate})`);
    } else if (passRateDelta > 0) {
      benchmarkImprovements.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      improvements.push(`${benchmarkId} pass rate improved (${basePassRate} -> ${candidatePassRate})`);
    }

    const metricDelta = compareMetricDeltas(baseBenchmark.metrics, candidateRun.metrics);
    for (const regression of metricDelta.regressions) {
      benchmarkRegressions.push(regression);
      regressions.push(`${benchmarkId} ${regression}`);
    }
    for (const improvement of metricDelta.improvements) {
      benchmarkImprovements.push(improvement);
      improvements.push(`${benchmarkId} ${improvement}`);
    }

    // Key order is kept stable because callers serialize this object to JSON.
    deltas.push({
      benchmarkId,
      baseRunId: baseBenchmark.runId,
      candidateRunId: candidateRun.runId,
      basePassRate,
      candidatePassRate,
      passRateDelta,
      metricDeltas: metricDelta.deltas,
      regressions: benchmarkRegressions,
      improvements: benchmarkImprovements
    });
  }

  const report = {
    passed: regressions.length === 0,
    baselineSnapshotId: baselineSnapshot.snapshotId,
    baselineCreatedAt: baselineSnapshot.createdAt,
    baselineSourceRootDir: baselineSnapshot.sourceRootDir,
    candidateRootDir: status.rootDir,
    comparedBenchmarks: deltas.length,
    missingCandidateBenchmarks,
    invalidArtifacts: { candidate: invalidCounts },
    regressions,
    improvements,
    deltas,
    // Placeholder; the markdown is rendered from the finished report below.
    markdownReport: ""
  };
  report.markdownReport = formatEvalBaselineDeltaMarkdown(report);
  return report;
}
12072
12324
  function resolveRequiredEvalStoreRoot(options, label) {
12073
12325
  if (typeof options.evalStoreDir === "string" && options.evalStoreDir.trim().length > 0) {
12074
12326
  return options.evalStoreDir.trim();
@@ -14331,7 +14583,7 @@ function optionalString(value) {
14331
14583
  if (typeof value !== "string" || value.trim().length === 0) return void 0;
14332
14584
  return value.trim();
14333
14585
  }
14334
- function assertSafePathSegment(value, field) {
14586
+ function assertSafePathSegment2(value, field) {
14335
14587
  if (value === "." || value === ".." || value.includes("/") || value.includes("\\")) {
14336
14588
  throw new Error(`${field} must be a safe path segment`);
14337
14589
  }
@@ -14386,7 +14638,7 @@ function validateCausalTrajectoryRecord(raw) {
14386
14638
  }
14387
14639
  return {
14388
14640
  schemaVersion: 1,
14389
- trajectoryId: assertSafePathSegment(assertString2(raw.trajectoryId, "trajectoryId"), "trajectoryId"),
14641
+ trajectoryId: assertSafePathSegment2(assertString2(raw.trajectoryId, "trajectoryId"), "trajectoryId"),
14390
14642
  recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
14391
14643
  sessionKey: assertString2(raw.sessionKey, "sessionKey"),
14392
14644
  goal: assertString2(raw.goal, "goal"),
@@ -14547,7 +14799,7 @@ function validateObjectiveStateSnapshot(raw) {
14547
14799
  }
14548
14800
  return {
14549
14801
  schemaVersion: 1,
14550
- snapshotId: assertSafePathSegment(assertString2(raw.snapshotId, "snapshotId"), "snapshotId"),
14802
+ snapshotId: assertSafePathSegment2(assertString2(raw.snapshotId, "snapshotId"), "snapshotId"),
14551
14803
  recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
14552
14804
  sessionKey: assertString2(raw.sessionKey, "sessionKey"),
14553
14805
  source,
@@ -14725,7 +14977,7 @@ function validateTrustZoneRecord(raw) {
14725
14977
  if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
14726
14978
  return {
14727
14979
  schemaVersion: 1,
14728
- recordId: assertSafePathSegment(assertString2(raw.recordId, "recordId"), "recordId"),
14980
+ recordId: assertSafePathSegment2(assertString2(raw.recordId, "recordId"), "recordId"),
14729
14981
  zone: validateZone(raw.zone, "zone"),
14730
14982
  recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
14731
14983
  kind: validateKind(raw.kind),
@@ -14863,7 +15115,7 @@ async function promoteTrustZoneRecord(options) {
14863
15115
  const sourceRecord = await findTrustZoneRecordById({
14864
15116
  memoryDir: options.memoryDir,
14865
15117
  trustZoneStoreDir: options.trustZoneStoreDir,
14866
- recordId: assertSafePathSegment(assertString2(options.sourceRecordId, "sourceRecordId"), "sourceRecordId")
15118
+ recordId: assertSafePathSegment2(assertString2(options.sourceRecordId, "sourceRecordId"), "sourceRecordId")
14867
15119
  });
14868
15120
  if (!sourceRecord) {
14869
15121
  throw new Error(`source trust-zone record not found: ${options.sourceRecordId}`);
@@ -15090,7 +15342,7 @@ function validateAbstractionNode(raw) {
15090
15342
  if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
15091
15343
  return {
15092
15344
  schemaVersion: 1,
15093
- nodeId: assertSafePathSegment(assertString2(raw.nodeId, "nodeId"), "nodeId"),
15345
+ nodeId: assertSafePathSegment2(assertString2(raw.nodeId, "nodeId"), "nodeId"),
15094
15346
  recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
15095
15347
  sessionKey: assertString2(raw.sessionKey, "sessionKey"),
15096
15348
  kind: validateKind2(raw.kind),
@@ -15161,7 +15413,7 @@ function validateNodeRefs(raw) {
15161
15413
  if (!nodeRefs || nodeRefs.length === 0) {
15162
15414
  throw new Error("nodeRefs must contain at least one node reference");
15163
15415
  }
15164
- return nodeRefs.map((nodeRef, index) => assertSafePathSegment(nodeRef, `nodeRefs[${index}]`));
15416
+ return nodeRefs.map((nodeRef, index) => assertSafePathSegment2(nodeRef, `nodeRefs[${index}]`));
15165
15417
  }
15166
15418
  function resolveCueAnchorStoreDir(abstractionNodeStoreDir, overrideDir) {
15167
15419
  if (typeof overrideDir === "string" && overrideDir.trim().length > 0) {
@@ -15174,7 +15426,7 @@ function validateCueAnchor(raw) {
15174
15426
  if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
15175
15427
  return {
15176
15428
  schemaVersion: 1,
15177
- anchorId: assertSafePathSegment(assertString2(raw.anchorId, "anchorId"), "anchorId"),
15429
+ anchorId: assertSafePathSegment2(assertString2(raw.anchorId, "anchorId"), "anchorId"),
15178
15430
  anchorType: validateAnchorType(raw.anchorType),
15179
15431
  anchorValue: assertString2(raw.anchorValue, "anchorValue"),
15180
15432
  normalizedCue: assertString2(raw.normalizedCue, "normalizedCue"),
@@ -15588,7 +15840,7 @@ function validateCommitmentLedgerEntry(raw) {
15588
15840
  const normalizedResolvedAt = resolvedAt ?? (state === "open" ? void 0 : normalizedStateChangedAt);
15589
15841
  return {
15590
15842
  schemaVersion: 1,
15591
- entryId: assertSafePathSegment(assertString2(raw.entryId, "entryId"), "entryId"),
15843
+ entryId: assertSafePathSegment2(assertString2(raw.entryId, "entryId"), "entryId"),
15592
15844
  recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
15593
15845
  sessionKey: assertString2(raw.sessionKey, "sessionKey"),
15594
15846
  source,
@@ -15771,7 +16023,7 @@ function validateWorkProductLedgerEntry(raw) {
15771
16023
  }
15772
16024
  return {
15773
16025
  schemaVersion: 1,
15774
- entryId: assertSafePathSegment(assertString2(raw.entryId, "entryId"), "entryId"),
16026
+ entryId: assertSafePathSegment2(assertString2(raw.entryId, "entryId"), "entryId"),
15775
16027
  recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
15776
16028
  sessionKey: assertString2(raw.sessionKey, "sessionKey"),
15777
16029
  source,
@@ -18090,7 +18342,7 @@ function validateUtilityTelemetryEvent(raw) {
18090
18342
  }
18091
18343
  return {
18092
18344
  schemaVersion: 1,
18093
- eventId: assertSafePathSegment(assertString2(raw.eventId, "eventId"), "eventId"),
18345
+ eventId: assertSafePathSegment2(assertString2(raw.eventId, "eventId"), "eventId"),
18094
18346
  recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
18095
18347
  sessionKey: assertString2(raw.sessionKey, "sessionKey"),
18096
18348
  source,
@@ -28329,7 +28581,7 @@ function validateResumeBundle(raw) {
28329
28581
  }
28330
28582
  return {
28331
28583
  schemaVersion: 1,
28332
- bundleId: assertSafePathSegment(assertString2(raw.bundleId, "bundleId"), "bundleId"),
28584
+ bundleId: assertSafePathSegment2(assertString2(raw.bundleId, "bundleId"), "bundleId"),
28333
28585
  recordedAt,
28334
28586
  sessionKey: assertString2(raw.sessionKey, "sessionKey"),
28335
28587
  source,
@@ -28819,9 +29071,21 @@ async function runBenchmarkStatusCliCommand(options) {
28819
29071
  evalStoreDir: options.evalStoreDir,
28820
29072
  enabled: options.evalHarnessEnabled,
28821
29073
  shadowModeEnabled: options.evalShadowModeEnabled,
29074
+ baselineSnapshotsEnabled: options.benchmarkBaselineSnapshotsEnabled,
28822
29075
  memoryRedTeamBenchEnabled: options.memoryRedTeamBenchEnabled
28823
29076
  });
28824
29077
  }
29078
/**
 * CLI adapter for `benchmark-baseline-snapshot`: forwards the CLI options to
 * `createEvalBaselineSnapshot`, renaming the config flag
 * `benchmarkBaselineSnapshotsEnabled` to `baselineSnapshotsEnabled`.
 *
 * @param {object} options - `memoryDir`, `evalStoreDir`,
 *   `benchmarkBaselineSnapshotsEnabled`, `snapshotId`, `createdAt`, `notes`, `gitRef`.
 * @returns {Promise<object>} Whatever `createEvalBaselineSnapshot` resolves to.
 */
async function runBenchmarkBaselineSnapshotCliCommand(options) {
  const {
    memoryDir,
    evalStoreDir,
    benchmarkBaselineSnapshotsEnabled,
    snapshotId,
    createdAt,
    notes,
    gitRef
  } = options;
  return createEvalBaselineSnapshot({
    memoryDir,
    evalStoreDir,
    baselineSnapshotsEnabled: benchmarkBaselineSnapshotsEnabled,
    snapshotId,
    createdAt,
    notes,
    gitRef
  });
}
28825
29089
  async function runBenchmarkValidateCliCommand(options) {
28826
29090
  return validateEvalBenchmarkPack(options.path, {
28827
29091
  memoryRedTeamBenchEnabled: options.memoryRedTeamBenchEnabled
@@ -28842,6 +29106,14 @@ async function runBenchmarkCiGateCliCommand(options) {
28842
29106
  candidateEvalStoreDir: options.candidateEvalStoreDir
28843
29107
  });
28844
29108
  }
29109
/**
 * CLI adapter for `benchmark-baseline-report`: forwards the relevant CLI
 * options to `runEvalBaselineDeltaReport`.
 *
 * @param {object} options - `memoryDir`, `evalStoreDir`,
 *   `benchmarkDeltaReporterEnabled`, `snapshotId`.
 * @returns {Promise<object>} Whatever `runEvalBaselineDeltaReport` resolves to.
 */
async function runBenchmarkBaselineReportCliCommand(options) {
  const { memoryDir, evalStoreDir, benchmarkDeltaReporterEnabled, snapshotId } = options;
  return runEvalBaselineDeltaReport({
    memoryDir,
    evalStoreDir,
    benchmarkDeltaReporterEnabled,
    snapshotId
  });
}
28845
29117
  async function runObjectiveStateStatusCliCommand(options) {
28846
29118
  return getObjectiveStateStoreStatus({
28847
29119
  memoryDir: options.memoryDir,
@@ -30178,6 +30450,7 @@ function registerCli(api, orchestrator) {
30178
30450
  evalStoreDir: orchestrator.config.evalStoreDir,
30179
30451
  evalHarnessEnabled: orchestrator.config.evalHarnessEnabled,
30180
30452
  evalShadowModeEnabled: orchestrator.config.evalShadowModeEnabled,
30453
+ benchmarkBaselineSnapshotsEnabled: orchestrator.config.benchmarkBaselineSnapshotsEnabled,
30181
30454
  memoryRedTeamBenchEnabled: orchestrator.config.memoryRedTeamBenchEnabled
30182
30455
  });
30183
30456
  console.log(JSON.stringify(status, null, 2));
@@ -30192,6 +30465,20 @@ function registerCli(api, orchestrator) {
30192
30465
  console.log(JSON.stringify(summary, null, 2));
30193
30466
  console.log("OK");
30194
30467
  });
30468
+ cmd.command("benchmark-baseline-snapshot").description("Capture a versioned baseline snapshot of the latest completed benchmark runs").requiredOption("--snapshot-id <id>", "Stable snapshot identifier").option("--created-at <iso>", "Override snapshot creation timestamp").option("--git-ref <ref>", "Override the git ref recorded in the snapshot").option("--notes <text>", "Optional operator notes for the snapshot").action(async (...args) => {
30469
+ const options = args[0] ?? {};
30470
+ const summary = await runBenchmarkBaselineSnapshotCliCommand({
30471
+ memoryDir: orchestrator.config.memoryDir,
30472
+ evalStoreDir: orchestrator.config.evalStoreDir,
30473
+ benchmarkBaselineSnapshotsEnabled: orchestrator.config.benchmarkBaselineSnapshotsEnabled,
30474
+ snapshotId: typeof options.snapshotId === "string" ? options.snapshotId : "",
30475
+ createdAt: typeof options.createdAt === "string" ? options.createdAt : void 0,
30476
+ gitRef: typeof options.gitRef === "string" ? options.gitRef : void 0,
30477
+ notes: typeof options.notes === "string" ? options.notes : void 0
30478
+ });
30479
+ console.log(JSON.stringify(summary, null, 2));
30480
+ console.log("OK");
30481
+ });
30195
30482
  cmd.command("benchmark-import").description("Validate and import a benchmark manifest file or pack directory into Engram's eval store").argument("<path>", "Path to a benchmark manifest JSON file or a directory with manifest.json").option("--force", "Replace an existing imported benchmark pack with the same benchmarkId").action(async (...args) => {
30196
30483
  const inputPath = args[0];
30197
30484
  const options = args[1] ?? {};
@@ -30217,6 +30504,22 @@ function registerCli(api, orchestrator) {
30217
30504
  }
30218
30505
  console.log("OK");
30219
30506
  });
30507
+ cmd.command("benchmark-baseline-report").description("Compare the current eval store against a named stored benchmark baseline snapshot").requiredOption("--snapshot-id <id>", "Stable baseline snapshot identifier").action(async (...args) => {
30508
+ const options = args[0] ?? {};
30509
+ const summary = await runBenchmarkBaselineReportCliCommand({
30510
+ memoryDir: orchestrator.config.memoryDir,
30511
+ evalStoreDir: orchestrator.config.evalStoreDir,
30512
+ benchmarkDeltaReporterEnabled: orchestrator.config.benchmarkDeltaReporterEnabled,
30513
+ snapshotId: typeof options.snapshotId === "string" ? options.snapshotId : ""
30514
+ });
30515
+ const { markdownReport, ...jsonSummary } = summary;
30516
+ console.log(JSON.stringify(jsonSummary, null, 2));
30517
+ console.log(markdownReport);
30518
+ if (!summary.passed) {
30519
+ throw new Error("benchmark baseline report detected regressions");
30520
+ }
30521
+ console.log("OK");
30522
+ });
30220
30523
  cmd.command("objective-state-status").description("Show objective-state store status, snapshot counts, and latest stored snapshot").action(async () => {
30221
30524
  const status = await runObjectiveStateStatusCliCommand({
30222
30525
  memoryDir: orchestrator.config.memoryDir,