@joshuaswarren/openclaw-engram 9.0.50 → 9.0.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/dist/index.js +329 -26
- package/dist/index.js.map +1 -1
- package/openclaw.plugin.json +20 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -31,6 +31,8 @@ AI agents forget everything between conversations. Engram fixes that.
|
|
|
31
31
|
- **Pluggable search** — Choose from six search backends: QMD (hybrid BM25+vector+reranking), LanceDB, Meilisearch, Orama, remote HTTP, or bring your own.
|
|
32
32
|
- **Memory OS features** — Graph recall, temporal memory tree, lifecycle policy, compounding, shared context, memory boxes, and identity continuity can be enabled progressively as your install grows.
|
|
33
33
|
- **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording and a CI benchmark delta gate, so memory improvements can be measured and regression-checked instead of argued from anecdotes.
|
|
34
|
+
- **Baseline snapshot discipline** — Engram can now, when `benchmarkBaselineSnapshotsEnabled` is enabled, capture typed baseline snapshots of the latest completed benchmark runs so later PR delta reporting can compare candidates against a stable stored reference instead of an ad hoc branch state.
|
|
35
|
+
- **Named baseline delta reporting** — Engram can now, when `benchmarkDeltaReporterEnabled` is enabled, compare the current eval store against a stored baseline snapshot, emit a machine-readable delta report plus markdown summary, and fail fast when a candidate regresses a benchmark that previously passed.
|
|
34
36
|
- **Objective-state recall** — Engram can now store normalized file, process, and tool outcomes and, when `objectiveStateRecallEnabled` is enabled, inject the most relevant objective-state snapshots back into recall context as a separate `Objective State` section.
|
|
35
37
|
- **Causal trajectory graph foundation** — Engram can now persist typed `goal -> action -> observation -> outcome -> follow-up` chains when `causalTrajectoryMemoryEnabled` is enabled and, with `actionGraphRecallEnabled`, emit deterministic action-conditioned edges into the causal graph for later trajectory-aware retrieval.
|
|
36
38
|
- **Causal trajectory recall** — Engram can now, when `causalTrajectoryRecallEnabled` is enabled, inject prompt-relevant causal chains back into recall context as a separate `Causal Trajectories` section with lightweight match explainability.
|
|
@@ -170,6 +172,8 @@ openclaw engram compat --strict # Compatibility check
|
|
|
170
172
|
openclaw engram benchmark-status # Benchmark/eval harness packs, runs, shadow recalls, latest summaries
|
|
171
173
|
openclaw engram benchmark-validate <path> # Validate a benchmark manifest or pack directory
|
|
172
174
|
openclaw engram benchmark-import <path> # Import a validated benchmark pack into the eval store
|
|
175
|
+
openclaw engram benchmark-baseline-snapshot # Capture a typed baseline snapshot of the latest completed benchmark runs
|
|
176
|
+
openclaw engram benchmark-baseline-report # Compare the current eval store against a stored baseline snapshot
|
|
173
177
|
openclaw engram benchmark-ci-gate # Compare base vs candidate eval stores and fail on regressions
|
|
174
178
|
openclaw engram objective-state-status # Objective-state snapshot counts and latest stored snapshot
|
|
175
179
|
openclaw engram causal-trajectory-status # Causal-trajectory record counts and latest stored chain
|
|
@@ -209,6 +213,8 @@ Key settings:
|
|
|
209
213
|
| `memoryDir` | `~/.openclaw/workspace/memory/local` | Memory storage root |
|
|
210
214
|
| `evalHarnessEnabled` | `false` | Enable the evaluation harness for benchmark packs, run summaries, and shadow recall bookkeeping |
|
|
211
215
|
| `evalShadowModeEnabled` | `false` | Record live recall decisions to the eval store without changing injected output |
|
|
216
|
+
| `benchmarkBaselineSnapshotsEnabled` | `false` | Enable versioned baseline snapshot artifacts for the latest completed benchmark runs |
|
|
217
|
+
| `benchmarkDeltaReporterEnabled` | `false` | Enable named-baseline delta reports against the current eval store |
|
|
212
218
|
| `evalStoreDir` | `{memoryDir}/state/evals` | Root directory for benchmark packs, run summaries, and shadow recall records |
|
|
213
219
|
| `objectiveStateMemoryEnabled` | `false` | Enable the objective-state memory foundation for normalized world/tool state snapshots |
|
|
214
220
|
| `objectiveStateSnapshotWritesEnabled` | `false` | Permit objective-state snapshot writers to persist typed state records |
|
package/dist/index.js
CHANGED
|
@@ -287,6 +287,8 @@ function parseConfig(raw) {
|
|
|
287
287
|
conversationRecallTimeoutMs: typeof cfg.conversationRecallTimeoutMs === "number" ? cfg.conversationRecallTimeoutMs : 800,
|
|
288
288
|
evalHarnessEnabled: cfg.evalHarnessEnabled === true,
|
|
289
289
|
evalShadowModeEnabled: cfg.evalShadowModeEnabled === true,
|
|
290
|
+
benchmarkBaselineSnapshotsEnabled: cfg.benchmarkBaselineSnapshotsEnabled === true,
|
|
291
|
+
benchmarkDeltaReporterEnabled: cfg.benchmarkDeltaReporterEnabled === true,
|
|
290
292
|
evalStoreDir: typeof cfg.evalStoreDir === "string" && cfg.evalStoreDir.trim().length > 0 ? cfg.evalStoreDir.trim() : path.join(memoryDir, "state", "evals"),
|
|
291
293
|
objectiveStateMemoryEnabled: cfg.objectiveStateMemoryEnabled === true,
|
|
292
294
|
objectiveStateSnapshotWritesEnabled: cfg.objectiveStateSnapshotWritesEnabled === true,
|
|
@@ -11666,11 +11668,14 @@ function resolveEvalStoreDir(memoryDir, overrideDir) {
|
|
|
11666
11668
|
}
|
|
11667
11669
|
return path15.join(memoryDir, "state", "evals");
|
|
11668
11670
|
}
|
|
11669
|
-
function
|
|
11670
|
-
if (
|
|
11671
|
-
throw new Error(
|
|
11671
|
+
/**
 * Validate that `value` can be used as exactly one file-system path segment.
 *
 * Rejected values: anything that is not a non-empty string, the special
 * segments "." and "..", and any string containing "/" or "\".
 *
 * @param {string} value - Candidate segment (for example an ID used to build a file name).
 * @param {string} field - Field name interpolated into the error message.
 * @returns {string} The unchanged `value` when it is accepted.
 * @throws {Error} `${field} must be a safe path segment` when the value is rejected.
 */
function assertSafePathSegment(value, field) {
  // A non-string would crash on `.includes` with an unrelated TypeError, and an
  // empty string is dropped by path.join(), so the joined path would resolve to
  // the parent directory instead of a child entry. Reject both up front.
  const isNonEmptyString = typeof value === "string" && value.length > 0;
  if (!isNonEmptyString || value === "." || value === ".." || value.includes("/") || value.includes("\\")) {
    throw new Error(`${field} must be a safe path segment`);
  }
  return value;
}
|
|
11677
|
+
/**
 * Check that a benchmark ID is a safe single path segment.
 *
 * Delegates to assertSafePathSegment with the field name "benchmarkId", so any
 * rejection surfaces as "benchmarkId must be a safe path segment".
 *
 * @param {string} benchmarkId - Benchmark identifier to check.
 * @returns {string} Whatever assertSafePathSegment returns for the ID.
 */
function assertSafeBenchmarkId(benchmarkId) {
  const fieldName = "benchmarkId";
  const checkedId = assertSafePathSegment(benchmarkId, fieldName);
  return checkedId;
}
|
|
11675
11680
|
function validateEvalBenchmarkManifest(raw, options) {
|
|
11676
11681
|
if (!isRecord(raw)) throw new Error("benchmark manifest must be an object");
|
|
@@ -11727,14 +11732,7 @@ function validateEvalRunSummary(raw) {
|
|
|
11727
11732
|
if (!Number.isFinite(totalCases) || totalCases < 0) throw new Error("totalCases must be a non-negative number");
|
|
11728
11733
|
if (!Number.isFinite(passedCases) || passedCases < 0) throw new Error("passedCases must be a non-negative number");
|
|
11729
11734
|
if (!Number.isFinite(failedCases) || failedCases < 0) throw new Error("failedCases must be a non-negative number");
|
|
11730
|
-
const metrics =
|
|
11731
|
-
recallPrecisionAtK: typeof raw.metrics.recallPrecisionAtK === "number" ? raw.metrics.recallPrecisionAtK : void 0,
|
|
11732
|
-
actionOutcomeScore: typeof raw.metrics.actionOutcomeScore === "number" ? raw.metrics.actionOutcomeScore : void 0,
|
|
11733
|
-
objectiveStateCoverage: typeof raw.metrics.objectiveStateCoverage === "number" ? raw.metrics.objectiveStateCoverage : void 0,
|
|
11734
|
-
causalPathRecall: typeof raw.metrics.causalPathRecall === "number" ? raw.metrics.causalPathRecall : void 0,
|
|
11735
|
-
trustViolationRate: typeof raw.metrics.trustViolationRate === "number" ? raw.metrics.trustViolationRate : void 0,
|
|
11736
|
-
creationRecoveryScore: typeof raw.metrics.creationRecoveryScore === "number" ? raw.metrics.creationRecoveryScore : void 0
|
|
11737
|
-
} : void 0;
|
|
11735
|
+
const metrics = parseOptionalEvalRunMetrics(raw.metrics);
|
|
11738
11736
|
return {
|
|
11739
11737
|
schemaVersion: 1,
|
|
11740
11738
|
runId: assertString(raw.runId, "runId"),
|
|
@@ -11750,6 +11748,55 @@ function validateEvalRunSummary(raw) {
|
|
|
11750
11748
|
gitRef: typeof raw.gitRef === "string" && raw.gitRef.trim().length > 0 ? raw.gitRef.trim() : void 0
|
|
11751
11749
|
};
|
|
11752
11750
|
}
|
|
11751
|
+
/**
 * Validate a raw eval baseline snapshot and return a normalized copy.
 *
 * Checks run in this order: the top-level shape, `schemaVersion === 1`, the
 * `benchmarks` array (each entry's shape, `passRate`, metrics, `benchmarkId`,
 * `runId`), then `benchmarkCount`, and finally the required string fields
 * `snapshotId`, `createdAt` and `sourceRootDir` via assertString.
 *
 * @param {unknown} raw - Parsed JSON (or an in-memory object) to validate.
 * @returns {object} Snapshot with `schemaVersion`, `snapshotId`, `createdAt`,
 *   `sourceRootDir`, `benchmarkCount`, `benchmarks`, `notes` and `gitRef`;
 *   blank optional strings are normalized to `undefined`.
 * @throws {Error} When any field fails validation.
 */
function validateEvalBaselineSnapshot(raw) {
  // Number(null), Number(""), Number(true) and Number([]) coerce to 0 or 1, which
  // would let a corrupt passRate/benchmarkCount validate as a real value (a
  // baseline passRate of 0 can never flag a regression). Only real numbers and
  // non-blank numeric strings are converted; everything else becomes NaN and is
  // rejected by the Number.isFinite checks below.
  const readNumber = (value) => {
    if (typeof value === "number") return value;
    if (typeof value === "string" && value.trim().length > 0) return Number(value);
    return Number.NaN;
  };
  // Optional text fields: trimmed when non-blank, otherwise undefined.
  const readOptionalText = (value) => typeof value === "string" && value.trim().length > 0 ? value.trim() : void 0;
  if (!isRecord(raw)) throw new Error("eval baseline snapshot must be an object");
  if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
  if (!Array.isArray(raw.benchmarks)) throw new Error("benchmarks must be an array");
  const benchmarks = raw.benchmarks.map((item, index) => {
    if (!isRecord(item)) throw new Error(`benchmarks[${index}] must be an object`);
    const passRate = readNumber(item.passRate);
    if (!Number.isFinite(passRate) || passRate < 0 || passRate > 1) {
      throw new Error(`benchmarks[${index}].passRate must be a number between 0 and 1`);
    }
    const metrics = parseOptionalEvalRunMetrics(item.metrics);
    return {
      benchmarkId: assertString(item.benchmarkId, `benchmarks[${index}].benchmarkId`),
      runId: assertString(item.runId, `benchmarks[${index}].runId`),
      completedAt: readOptionalText(item.completedAt),
      gitRef: readOptionalText(item.gitRef),
      passRate,
      metrics
    };
  });
  const benchmarkCount = readNumber(raw.benchmarkCount);
  if (!Number.isFinite(benchmarkCount) || benchmarkCount < 0) {
    throw new Error("benchmarkCount must be a non-negative number");
  }
  // The stored count must agree with the entries actually present.
  if (benchmarkCount !== benchmarks.length) {
    throw new Error("benchmarkCount must match benchmarks.length");
  }
  return {
    schemaVersion: 1,
    snapshotId: assertString(raw.snapshotId, "snapshotId"),
    createdAt: assertString(raw.createdAt, "createdAt"),
    sourceRootDir: assertString(raw.sourceRootDir, "sourceRootDir"),
    benchmarkCount,
    benchmarks,
    notes: readOptionalText(raw.notes),
    gitRef: readOptionalText(raw.gitRef)
  };
}
|
|
11789
|
+
/**
 * Extract the known eval-run metrics from an optional raw metrics object.
 *
 * @param {unknown} raw - Candidate metrics object.
 * @returns {object|undefined} `undefined` when `raw` fails the isRecord check;
 *   otherwise an object that always carries all six metric keys, each holding
 *   the raw value when it is of type "number" and `undefined` otherwise.
 */
function parseOptionalEvalRunMetrics(raw) {
  if (!isRecord(raw)) return void 0;
  // Listed in the order the keys must appear on the result object.
  const metricNames = [
    "recallPrecisionAtK",
    "actionOutcomeScore",
    "objectiveStateCoverage",
    "causalPathRecall",
    "trustViolationRate",
    "creationRecoveryScore"
  ];
  const parsed = {};
  for (const metricName of metricNames) {
    const candidate = raw[metricName];
    parsed[metricName] = typeof candidate === "number" ? candidate : void 0;
  }
  return parsed;
}
|
|
11753
11800
|
function validateEvalShadowRecallRecord(raw) {
|
|
11754
11801
|
if (!isRecord(raw)) throw new Error("eval shadow recall record must be an object");
|
|
11755
11802
|
if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
|
|
@@ -11862,17 +11909,62 @@ function compareMetricDeltas(baseMetrics, candidateMetrics) {
|
|
|
11862
11909
|
}
|
|
11863
11910
|
return { deltas, regressions, improvements };
|
|
11864
11911
|
}
|
|
11912
|
+
/**
 * Render a baseline delta report as a markdown document.
 *
 * Layout: a title, a bullet list of summary facts (with the missing-benchmark
 * bullet only when there are any), then the "Regressions", "Improvements" and
 * "Benchmark Deltas" sections. An empty section is rendered as "- none".
 *
 * @param {object} report - Delta report; reads `passed`, the baseline/candidate
 *   identification fields, `comparedBenchmarks`, `missingCandidateBenchmarks`,
 *   `invalidArtifacts.candidate`, `regressions`, `improvements` and `deltas`.
 * @returns {string} Markdown text joined with "\n" and no trailing newline.
 */
function formatEvalBaselineDeltaMarkdown(report) {
  const out = [];
  // Emits a blank line, the heading, then one rendered bullet per entry (or "- none").
  const appendSection = (heading, entries, render) => {
    out.push("", heading);
    if (entries.length === 0) {
      out.push("- none");
      return;
    }
    for (const entry of entries) {
      out.push(render(entry));
    }
  };
  const asBullet = (text) => `- ${text}`;
  const passedLabel = report.passed ? "yes" : "no";
  out.push("# Eval Baseline Delta Report");
  out.push("");
  out.push(`- Passed: ${passedLabel}`);
  out.push(`- Baseline snapshot: ${report.baselineSnapshotId}`);
  out.push(`- Baseline created: ${report.baselineCreatedAt}`);
  out.push(`- Baseline source root: ${report.baselineSourceRootDir}`);
  out.push(`- Candidate root: ${report.candidateRootDir}`);
  out.push(`- Benchmarks compared: ${report.comparedBenchmarks}`);
  if (report.missingCandidateBenchmarks.length > 0) {
    out.push(`- Missing candidate benchmarks: ${report.missingCandidateBenchmarks.join(", ")}`);
  }
  out.push(
    `- Invalid candidate artifacts: benchmarks=${report.invalidArtifacts.candidate.benchmarks}, runs=${report.invalidArtifacts.candidate.runs}, shadows=${report.invalidArtifacts.candidate.shadows}, baselines=${report.invalidArtifacts.candidate.baselines}`
  );
  appendSection("## Regressions", report.regressions, asBullet);
  appendSection("## Improvements", report.improvements, asBullet);
  appendSection(
    "## Benchmark Deltas",
    report.deltas,
    (delta) => `- ${delta.benchmarkId}: passRate ${delta.basePassRate} -> ${delta.candidatePassRate} (delta ${delta.passRateDelta})`
  );
  return out.join("\n");
}
|
|
11865
11954
|
async function collectEvalStoreSnapshot(options) {
|
|
11866
11955
|
const rootDir = options.rootDir;
|
|
11867
11956
|
const benchmarkDir = path15.join(rootDir, "benchmarks");
|
|
11868
11957
|
const runsDir = path15.join(rootDir, "runs");
|
|
11869
11958
|
const shadowDir = path15.join(rootDir, "shadow");
|
|
11959
|
+
const baselineDir = path15.join(rootDir, "baselines");
|
|
11870
11960
|
const benchmarkFiles = await listNamedFiles(benchmarkDir, "manifest.json");
|
|
11871
11961
|
const runFiles = await listJsonFiles(runsDir);
|
|
11872
11962
|
const shadowFiles = await listJsonFiles(shadowDir);
|
|
11963
|
+
const baselineFiles = await listJsonFiles(baselineDir);
|
|
11873
11964
|
const invalidBenchmarks = [];
|
|
11874
11965
|
const invalidRuns = [];
|
|
11875
11966
|
const invalidShadows = [];
|
|
11967
|
+
const invalidBaselines = [];
|
|
11876
11968
|
const manifests = [];
|
|
11877
11969
|
for (const filePath of benchmarkFiles) {
|
|
11878
11970
|
try {
|
|
@@ -11910,12 +12002,24 @@ async function collectEvalStoreSnapshot(options) {
|
|
|
11910
12002
|
});
|
|
11911
12003
|
}
|
|
11912
12004
|
}
|
|
12005
|
+
const baselines = [];
|
|
12006
|
+
for (const filePath of baselineFiles) {
|
|
12007
|
+
try {
|
|
12008
|
+
baselines.push(validateEvalBaselineSnapshot(await readJsonFile(filePath)));
|
|
12009
|
+
} catch (error) {
|
|
12010
|
+
invalidBaselines.push({
|
|
12011
|
+
path: filePath,
|
|
12012
|
+
error: error instanceof Error ? error.message : String(error)
|
|
12013
|
+
});
|
|
12014
|
+
}
|
|
12015
|
+
}
|
|
11913
12016
|
runs.sort((a, b) => {
|
|
11914
12017
|
const aTime = Date.parse(a.completedAt ?? a.startedAt);
|
|
11915
12018
|
const bTime = Date.parse(b.completedAt ?? b.startedAt);
|
|
11916
12019
|
return (Number.isNaN(bTime) ? 0 : bTime) - (Number.isNaN(aTime) ? 0 : aTime);
|
|
11917
12020
|
});
|
|
11918
12021
|
shadows.sort((a, b) => b.recordedAt.localeCompare(a.recordedAt));
|
|
12022
|
+
baselines.sort((a, b) => b.createdAt.localeCompare(a.createdAt));
|
|
11919
12023
|
const tags = /* @__PURE__ */ new Set();
|
|
11920
12024
|
const attackClasses = /* @__PURE__ */ new Set();
|
|
11921
12025
|
const sourceLinks = /* @__PURE__ */ new Set();
|
|
@@ -11968,15 +12072,26 @@ async function collectEvalStoreSnapshot(options) {
|
|
|
11968
12072
|
latestRecordedAt: shadows[0]?.recordedAt,
|
|
11969
12073
|
latestSessionKey: shadows[0]?.sessionKey
|
|
11970
12074
|
},
|
|
12075
|
+
baselines: {
|
|
12076
|
+
enabled: options.baselineSnapshotsEnabled === true,
|
|
12077
|
+
total: baselineFiles.length,
|
|
12078
|
+
invalid: invalidBaselines.length,
|
|
12079
|
+
latestSnapshotId: baselines[0]?.snapshotId,
|
|
12080
|
+
latestCreatedAt: baselines[0]?.createdAt,
|
|
12081
|
+
latestBenchmarkCount: baselines[0]?.benchmarkCount
|
|
12082
|
+
},
|
|
11971
12083
|
latestRun: runs[0],
|
|
11972
12084
|
latestShadow: shadows[0],
|
|
12085
|
+
latestBaseline: baselines[0],
|
|
11973
12086
|
invalidBenchmarks,
|
|
11974
12087
|
invalidRuns,
|
|
11975
|
-
invalidShadows
|
|
12088
|
+
invalidShadows,
|
|
12089
|
+
invalidBaselines
|
|
11976
12090
|
},
|
|
11977
12091
|
manifests,
|
|
11978
12092
|
runs,
|
|
11979
|
-
shadows
|
|
12093
|
+
shadows,
|
|
12094
|
+
baselines
|
|
11980
12095
|
};
|
|
11981
12096
|
}
|
|
11982
12097
|
async function resolveBenchmarkManifestPath(sourcePath) {
|
|
@@ -12066,9 +12181,146 @@ async function getEvalHarnessStatus(options) {
|
|
|
12066
12181
|
rootDir: resolveEvalStoreDir(options.memoryDir, options.evalStoreDir),
|
|
12067
12182
|
enabled: options.enabled,
|
|
12068
12183
|
shadowModeEnabled: options.shadowModeEnabled,
|
|
12184
|
+
baselineSnapshotsEnabled: options.baselineSnapshotsEnabled,
|
|
12069
12185
|
memoryRedTeamBenchEnabled: options.memoryRedTeamBenchEnabled
|
|
12070
12186
|
})).status;
|
|
12071
12187
|
}
|
|
12188
|
+
/**
 * Capture a baseline snapshot of the latest completed run per benchmark and
 * write it to `<eval store root>/baselines/<snapshotId>.json`.
 *
 * @param {object} options
 * @param {boolean} options.baselineSnapshotsEnabled - Must be exactly `true`.
 * @param {string} options.snapshotId - Snapshot ID; must be a safe path segment.
 * @param {string} options.memoryDir - Passed to resolveEvalStoreDir.
 * @param {string} [options.evalStoreDir] - Passed to resolveEvalStoreDir.
 * @param {string} [options.createdAt] - Timestamp; defaults to the current time in ISO format.
 * @param {string} [options.notes] - Optional free-form notes.
 * @param {string} [options.gitRef] - Optional git reference.
 * @returns {Promise<{targetPath: string, snapshot: object}>} The written file path and the validated snapshot.
 * @throws {Error} When the feature is disabled or validation fails.
 */
async function createEvalBaselineSnapshot(options) {
  const featureEnabled = options.baselineSnapshotsEnabled === true;
  if (!featureEnabled) {
    throw new Error("benchmark baseline snapshots are disabled");
  }
  const requestedId = assertString(options.snapshotId, "snapshotId");
  const snapshotId = assertSafePathSegment(requestedId, "snapshotId");
  const rootDir = resolveEvalStoreDir(options.memoryDir, options.evalStoreDir);
  // Every feature flag is forced on for the scan, independent of the caller's configuration.
  const scanOptions = {
    rootDir,
    enabled: true,
    shadowModeEnabled: true,
    baselineSnapshotsEnabled: true,
    memoryRedTeamBenchEnabled: true
  };
  const store = await collectEvalStoreSnapshot(scanOptions);
  const latestRuns = latestCompletedRunsByBenchmark(store.runs);
  // Sort by benchmark ID so the stored entries have a deterministic order.
  const orderedRuns = [...latestRuns.values()];
  orderedRuns.sort((left, right) => left.benchmarkId.localeCompare(right.benchmarkId));
  const benchmarks = orderedRuns.map((run) => {
    const { benchmarkId, runId, completedAt, gitRef, metrics } = run;
    return {
      benchmarkId,
      runId,
      completedAt,
      gitRef,
      passRate: computePassRate(run),
      metrics
    };
  });
  const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  // Round-trip through the validator so the file on disk is known to be loadable.
  const snapshot = validateEvalBaselineSnapshot({
    schemaVersion: 1,
    snapshotId,
    createdAt,
    sourceRootDir: rootDir,
    benchmarkCount: benchmarks.length,
    benchmarks,
    notes: options.notes,
    gitRef: options.gitRef
  });
  const fileName = `${snapshot.snapshotId}.json`;
  const targetPath = path15.join(rootDir, "baselines", fileName);
  await mkdir10(path15.dirname(targetPath), { recursive: true });
  const serialized = JSON.stringify(snapshot, null, 2);
  await writeFile11(targetPath, serialized, "utf-8");
  return { targetPath, snapshot };
}
|
|
12225
|
+
/**
 * Compare the current eval store against a stored baseline snapshot.
 *
 * Regressions are recorded for: invalid artifacts in the candidate store,
 * baseline benchmarks with no latest completed candidate run, a lower pass
 * rate, and every regression reported by compareMetricDeltas. The report
 * passes only when no regression was recorded.
 *
 * @param {object} options
 * @param {boolean} options.benchmarkDeltaReporterEnabled - Must be exactly `true`.
 * @param {string} options.snapshotId - Baseline snapshot ID; must be a safe path segment.
 * @param {string} options.memoryDir - Passed to resolveEvalStoreDir.
 * @param {string} [options.evalStoreDir] - Passed to resolveEvalStoreDir.
 * @returns {Promise<object>} Report with `passed`, baseline/candidate identification,
 *   `comparedBenchmarks`, `missingCandidateBenchmarks`, `invalidArtifacts`,
 *   `regressions`, `improvements`, `deltas` and `markdownReport`.
 * @throws {Error} When the reporter is disabled or the baseline snapshot is not found.
 */
async function runEvalBaselineDeltaReport(options) {
  const reporterEnabled = options.benchmarkDeltaReporterEnabled === true;
  if (!reporterEnabled) {
    throw new Error("benchmark delta reporter is disabled");
  }
  const requestedId = assertString(options.snapshotId, "snapshotId");
  const snapshotId = assertSafePathSegment(requestedId, "snapshotId");
  const candidateRootDir = resolveEvalStoreDir(options.memoryDir, options.evalStoreDir);
  // Every feature flag is forced on for the scan, independent of the caller's configuration.
  const candidateSnapshot = await collectEvalStoreSnapshot({
    rootDir: candidateRootDir,
    enabled: true,
    shadowModeEnabled: true,
    baselineSnapshotsEnabled: true,
    memoryRedTeamBenchEnabled: true
  });
  const baselineSnapshot = candidateSnapshot.baselines.find((entry) => entry.snapshotId === snapshotId);
  if (!baselineSnapshot) {
    throw new Error(`benchmark baseline snapshot not found: ${snapshotId}`);
  }
  const { status } = candidateSnapshot;
  const invalidCounts = {
    benchmarks: status.invalidBenchmarks.length,
    runs: status.invalidRuns.length,
    shadows: status.invalidShadows.length,
    baselines: status.invalidBaselines.length
  };
  const regressions = [];
  const improvements = [];
  // Any unreadable artifact in the candidate store counts as a regression.
  const invalidArtifactLabels = [
    ["benchmarks", "invalid benchmark manifest(s)"],
    ["runs", "invalid run summary file(s)"],
    ["shadows", "invalid shadow record(s)"],
    ["baselines", "invalid baseline snapshot file(s)"]
  ];
  for (const [kind, label] of invalidArtifactLabels) {
    const count = invalidCounts[kind];
    if (count > 0) {
      regressions.push(`candidate store has ${count} ${label}`);
    }
  }
  const candidateRuns = latestCompletedRunsByBenchmark(candidateSnapshot.runs);
  const baselineBenchmarks = /* @__PURE__ */ new Map();
  for (const benchmark of baselineSnapshot.benchmarks) {
    baselineBenchmarks.set(benchmark.benchmarkId, benchmark);
  }
  const sortedBaselineIds = [...baselineBenchmarks.keys()].sort();
  const missingCandidateBenchmarks = sortedBaselineIds.filter((benchmarkId) => !candidateRuns.has(benchmarkId));
  for (const benchmarkId of missingCandidateBenchmarks) {
    regressions.push(`candidate is missing latest completed benchmark run for ${benchmarkId}`);
  }
  const deltas = [];
  for (const benchmarkId of sortedBaselineIds) {
    const baseBenchmark = baselineBenchmarks.get(benchmarkId);
    const candidateRun = candidateRuns.get(benchmarkId);
    // Missing candidates were already reported above; only matched pairs get a delta.
    if (!baseBenchmark || !candidateRun) continue;
    const basePassRate = baseBenchmark.passRate;
    const candidatePassRate = computePassRate(candidateRun);
    const passRateDelta = candidatePassRate - basePassRate;
    const benchmarkRegressions = [];
    const benchmarkImprovements = [];
    if (passRateDelta < 0) {
      benchmarkRegressions.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      regressions.push(`${benchmarkId} pass rate regressed (${basePassRate} -> ${candidatePassRate})`);
    } else if (passRateDelta > 0) {
      benchmarkImprovements.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      improvements.push(`${benchmarkId} pass rate improved (${basePassRate} -> ${candidatePassRate})`);
    }
    const metricDelta = compareMetricDeltas(baseBenchmark.metrics, candidateRun.metrics);
    for (const regression of metricDelta.regressions) {
      benchmarkRegressions.push(regression);
      regressions.push(`${benchmarkId} ${regression}`);
    }
    for (const improvement of metricDelta.improvements) {
      benchmarkImprovements.push(improvement);
      improvements.push(`${benchmarkId} ${improvement}`);
    }
    deltas.push({
      benchmarkId,
      baseRunId: baseBenchmark.runId,
      candidateRunId: candidateRun.runId,
      basePassRate,
      candidatePassRate,
      passRateDelta,
      metricDeltas: metricDelta.deltas,
      regressions: benchmarkRegressions,
      improvements: benchmarkImprovements
    });
  }
  const report = {
    passed: regressions.length === 0,
    baselineSnapshotId: baselineSnapshot.snapshotId,
    baselineCreatedAt: baselineSnapshot.createdAt,
    baselineSourceRootDir: baselineSnapshot.sourceRootDir,
    candidateRootDir: status.rootDir,
    comparedBenchmarks: deltas.length,
    missingCandidateBenchmarks,
    invalidArtifacts: { candidate: invalidCounts },
    regressions,
    improvements,
    deltas,
    markdownReport: ""
  };
  // The markdown is rendered from the finished report, so it is filled in last.
  report.markdownReport = formatEvalBaselineDeltaMarkdown(report);
  return report;
}
|
|
12072
12324
|
function resolveRequiredEvalStoreRoot(options, label) {
|
|
12073
12325
|
if (typeof options.evalStoreDir === "string" && options.evalStoreDir.trim().length > 0) {
|
|
12074
12326
|
return options.evalStoreDir.trim();
|
|
@@ -14331,7 +14583,7 @@ function optionalString(value) {
|
|
|
14331
14583
|
if (typeof value !== "string" || value.trim().length === 0) return void 0;
|
|
14332
14584
|
return value.trim();
|
|
14333
14585
|
}
|
|
14334
|
-
function
|
|
14586
|
+
function assertSafePathSegment2(value, field) {
|
|
14335
14587
|
if (value === "." || value === ".." || value.includes("/") || value.includes("\\")) {
|
|
14336
14588
|
throw new Error(`${field} must be a safe path segment`);
|
|
14337
14589
|
}
|
|
@@ -14386,7 +14638,7 @@ function validateCausalTrajectoryRecord(raw) {
|
|
|
14386
14638
|
}
|
|
14387
14639
|
return {
|
|
14388
14640
|
schemaVersion: 1,
|
|
14389
|
-
trajectoryId:
|
|
14641
|
+
trajectoryId: assertSafePathSegment2(assertString2(raw.trajectoryId, "trajectoryId"), "trajectoryId"),
|
|
14390
14642
|
recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
|
|
14391
14643
|
sessionKey: assertString2(raw.sessionKey, "sessionKey"),
|
|
14392
14644
|
goal: assertString2(raw.goal, "goal"),
|
|
@@ -14547,7 +14799,7 @@ function validateObjectiveStateSnapshot(raw) {
|
|
|
14547
14799
|
}
|
|
14548
14800
|
return {
|
|
14549
14801
|
schemaVersion: 1,
|
|
14550
|
-
snapshotId:
|
|
14802
|
+
snapshotId: assertSafePathSegment2(assertString2(raw.snapshotId, "snapshotId"), "snapshotId"),
|
|
14551
14803
|
recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
|
|
14552
14804
|
sessionKey: assertString2(raw.sessionKey, "sessionKey"),
|
|
14553
14805
|
source,
|
|
@@ -14725,7 +14977,7 @@ function validateTrustZoneRecord(raw) {
|
|
|
14725
14977
|
if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
|
|
14726
14978
|
return {
|
|
14727
14979
|
schemaVersion: 1,
|
|
14728
|
-
recordId:
|
|
14980
|
+
recordId: assertSafePathSegment2(assertString2(raw.recordId, "recordId"), "recordId"),
|
|
14729
14981
|
zone: validateZone(raw.zone, "zone"),
|
|
14730
14982
|
recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
|
|
14731
14983
|
kind: validateKind(raw.kind),
|
|
@@ -14863,7 +15115,7 @@ async function promoteTrustZoneRecord(options) {
|
|
|
14863
15115
|
const sourceRecord = await findTrustZoneRecordById({
|
|
14864
15116
|
memoryDir: options.memoryDir,
|
|
14865
15117
|
trustZoneStoreDir: options.trustZoneStoreDir,
|
|
14866
|
-
recordId:
|
|
15118
|
+
recordId: assertSafePathSegment2(assertString2(options.sourceRecordId, "sourceRecordId"), "sourceRecordId")
|
|
14867
15119
|
});
|
|
14868
15120
|
if (!sourceRecord) {
|
|
14869
15121
|
throw new Error(`source trust-zone record not found: ${options.sourceRecordId}`);
|
|
@@ -15090,7 +15342,7 @@ function validateAbstractionNode(raw) {
|
|
|
15090
15342
|
if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
|
|
15091
15343
|
return {
|
|
15092
15344
|
schemaVersion: 1,
|
|
15093
|
-
nodeId:
|
|
15345
|
+
nodeId: assertSafePathSegment2(assertString2(raw.nodeId, "nodeId"), "nodeId"),
|
|
15094
15346
|
recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
|
|
15095
15347
|
sessionKey: assertString2(raw.sessionKey, "sessionKey"),
|
|
15096
15348
|
kind: validateKind2(raw.kind),
|
|
@@ -15161,7 +15413,7 @@ function validateNodeRefs(raw) {
|
|
|
15161
15413
|
if (!nodeRefs || nodeRefs.length === 0) {
|
|
15162
15414
|
throw new Error("nodeRefs must contain at least one node reference");
|
|
15163
15415
|
}
|
|
15164
|
-
return nodeRefs.map((nodeRef, index) =>
|
|
15416
|
+
return nodeRefs.map((nodeRef, index) => assertSafePathSegment2(nodeRef, `nodeRefs[${index}]`));
|
|
15165
15417
|
}
|
|
15166
15418
|
function resolveCueAnchorStoreDir(abstractionNodeStoreDir, overrideDir) {
|
|
15167
15419
|
if (typeof overrideDir === "string" && overrideDir.trim().length > 0) {
|
|
@@ -15174,7 +15426,7 @@ function validateCueAnchor(raw) {
|
|
|
15174
15426
|
if (raw.schemaVersion !== 1) throw new Error("schemaVersion must be 1");
|
|
15175
15427
|
return {
|
|
15176
15428
|
schemaVersion: 1,
|
|
15177
|
-
anchorId:
|
|
15429
|
+
anchorId: assertSafePathSegment2(assertString2(raw.anchorId, "anchorId"), "anchorId"),
|
|
15178
15430
|
anchorType: validateAnchorType(raw.anchorType),
|
|
15179
15431
|
anchorValue: assertString2(raw.anchorValue, "anchorValue"),
|
|
15180
15432
|
normalizedCue: assertString2(raw.normalizedCue, "normalizedCue"),
|
|
@@ -15588,7 +15840,7 @@ function validateCommitmentLedgerEntry(raw) {
|
|
|
15588
15840
|
const normalizedResolvedAt = resolvedAt ?? (state === "open" ? void 0 : normalizedStateChangedAt);
|
|
15589
15841
|
return {
|
|
15590
15842
|
schemaVersion: 1,
|
|
15591
|
-
entryId:
|
|
15843
|
+
entryId: assertSafePathSegment2(assertString2(raw.entryId, "entryId"), "entryId"),
|
|
15592
15844
|
recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
|
|
15593
15845
|
sessionKey: assertString2(raw.sessionKey, "sessionKey"),
|
|
15594
15846
|
source,
|
|
@@ -15771,7 +16023,7 @@ function validateWorkProductLedgerEntry(raw) {
|
|
|
15771
16023
|
}
|
|
15772
16024
|
return {
|
|
15773
16025
|
schemaVersion: 1,
|
|
15774
|
-
entryId:
|
|
16026
|
+
entryId: assertSafePathSegment2(assertString2(raw.entryId, "entryId"), "entryId"),
|
|
15775
16027
|
recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
|
|
15776
16028
|
sessionKey: assertString2(raw.sessionKey, "sessionKey"),
|
|
15777
16029
|
source,
|
|
@@ -18090,7 +18342,7 @@ function validateUtilityTelemetryEvent(raw) {
|
|
|
18090
18342
|
}
|
|
18091
18343
|
return {
|
|
18092
18344
|
schemaVersion: 1,
|
|
18093
|
-
eventId:
|
|
18345
|
+
eventId: assertSafePathSegment2(assertString2(raw.eventId, "eventId"), "eventId"),
|
|
18094
18346
|
recordedAt: assertIsoRecordedAt(assertString2(raw.recordedAt, "recordedAt")),
|
|
18095
18347
|
sessionKey: assertString2(raw.sessionKey, "sessionKey"),
|
|
18096
18348
|
source,
|
|
@@ -28329,7 +28581,7 @@ function validateResumeBundle(raw) {
|
|
|
28329
28581
|
}
|
|
28330
28582
|
return {
|
|
28331
28583
|
schemaVersion: 1,
|
|
28332
|
-
bundleId:
|
|
28584
|
+
bundleId: assertSafePathSegment2(assertString2(raw.bundleId, "bundleId"), "bundleId"),
|
|
28333
28585
|
recordedAt,
|
|
28334
28586
|
sessionKey: assertString2(raw.sessionKey, "sessionKey"),
|
|
28335
28587
|
source,
|
|
@@ -28819,9 +29071,21 @@ async function runBenchmarkStatusCliCommand(options) {
|
|
|
28819
29071
|
evalStoreDir: options.evalStoreDir,
|
|
28820
29072
|
enabled: options.evalHarnessEnabled,
|
|
28821
29073
|
shadowModeEnabled: options.evalShadowModeEnabled,
|
|
29074
|
+
baselineSnapshotsEnabled: options.benchmarkBaselineSnapshotsEnabled,
|
|
28822
29075
|
memoryRedTeamBenchEnabled: options.memoryRedTeamBenchEnabled
|
|
28823
29076
|
});
|
|
28824
29077
|
}
|
|
29078
|
+
async function runBenchmarkBaselineSnapshotCliCommand(options) {
|
|
29079
|
+
return createEvalBaselineSnapshot({
|
|
29080
|
+
memoryDir: options.memoryDir,
|
|
29081
|
+
evalStoreDir: options.evalStoreDir,
|
|
29082
|
+
baselineSnapshotsEnabled: options.benchmarkBaselineSnapshotsEnabled,
|
|
29083
|
+
snapshotId: options.snapshotId,
|
|
29084
|
+
createdAt: options.createdAt,
|
|
29085
|
+
notes: options.notes,
|
|
29086
|
+
gitRef: options.gitRef
|
|
29087
|
+
});
|
|
29088
|
+
}
|
|
28825
29089
|
async function runBenchmarkValidateCliCommand(options) {
|
|
28826
29090
|
return validateEvalBenchmarkPack(options.path, {
|
|
28827
29091
|
memoryRedTeamBenchEnabled: options.memoryRedTeamBenchEnabled
|
|
@@ -28842,6 +29106,14 @@ async function runBenchmarkCiGateCliCommand(options) {
|
|
|
28842
29106
|
candidateEvalStoreDir: options.candidateEvalStoreDir
|
|
28843
29107
|
});
|
|
28844
29108
|
}
|
|
29109
|
+
async function runBenchmarkBaselineReportCliCommand(options) {
|
|
29110
|
+
return runEvalBaselineDeltaReport({
|
|
29111
|
+
memoryDir: options.memoryDir,
|
|
29112
|
+
evalStoreDir: options.evalStoreDir,
|
|
29113
|
+
benchmarkDeltaReporterEnabled: options.benchmarkDeltaReporterEnabled,
|
|
29114
|
+
snapshotId: options.snapshotId
|
|
29115
|
+
});
|
|
29116
|
+
}
|
|
28845
29117
|
async function runObjectiveStateStatusCliCommand(options) {
|
|
28846
29118
|
return getObjectiveStateStoreStatus({
|
|
28847
29119
|
memoryDir: options.memoryDir,
|
|
@@ -30178,6 +30450,7 @@ function registerCli(api, orchestrator) {
|
|
|
30178
30450
|
evalStoreDir: orchestrator.config.evalStoreDir,
|
|
30179
30451
|
evalHarnessEnabled: orchestrator.config.evalHarnessEnabled,
|
|
30180
30452
|
evalShadowModeEnabled: orchestrator.config.evalShadowModeEnabled,
|
|
30453
|
+
benchmarkBaselineSnapshotsEnabled: orchestrator.config.benchmarkBaselineSnapshotsEnabled,
|
|
30181
30454
|
memoryRedTeamBenchEnabled: orchestrator.config.memoryRedTeamBenchEnabled
|
|
30182
30455
|
});
|
|
30183
30456
|
console.log(JSON.stringify(status, null, 2));
|
|
@@ -30192,6 +30465,20 @@ function registerCli(api, orchestrator) {
|
|
|
30192
30465
|
console.log(JSON.stringify(summary, null, 2));
|
|
30193
30466
|
console.log("OK");
|
|
30194
30467
|
});
|
|
30468
|
+
cmd.command("benchmark-baseline-snapshot").description("Capture a versioned baseline snapshot of the latest completed benchmark runs").requiredOption("--snapshot-id <id>", "Stable snapshot identifier").option("--created-at <iso>", "Override snapshot creation timestamp").option("--git-ref <ref>", "Override the git ref recorded in the snapshot").option("--notes <text>", "Optional operator notes for the snapshot").action(async (...args) => {
|
|
30469
|
+
const options = args[0] ?? {};
|
|
30470
|
+
const summary = await runBenchmarkBaselineSnapshotCliCommand({
|
|
30471
|
+
memoryDir: orchestrator.config.memoryDir,
|
|
30472
|
+
evalStoreDir: orchestrator.config.evalStoreDir,
|
|
30473
|
+
benchmarkBaselineSnapshotsEnabled: orchestrator.config.benchmarkBaselineSnapshotsEnabled,
|
|
30474
|
+
snapshotId: typeof options.snapshotId === "string" ? options.snapshotId : "",
|
|
30475
|
+
createdAt: typeof options.createdAt === "string" ? options.createdAt : void 0,
|
|
30476
|
+
gitRef: typeof options.gitRef === "string" ? options.gitRef : void 0,
|
|
30477
|
+
notes: typeof options.notes === "string" ? options.notes : void 0
|
|
30478
|
+
});
|
|
30479
|
+
console.log(JSON.stringify(summary, null, 2));
|
|
30480
|
+
console.log("OK");
|
|
30481
|
+
});
|
|
30195
30482
|
cmd.command("benchmark-import").description("Validate and import a benchmark manifest file or pack directory into Engram's eval store").argument("<path>", "Path to a benchmark manifest JSON file or a directory with manifest.json").option("--force", "Replace an existing imported benchmark pack with the same benchmarkId").action(async (...args) => {
|
|
30196
30483
|
const inputPath = args[0];
|
|
30197
30484
|
const options = args[1] ?? {};
|
|
@@ -30217,6 +30504,22 @@ function registerCli(api, orchestrator) {
|
|
|
30217
30504
|
}
|
|
30218
30505
|
console.log("OK");
|
|
30219
30506
|
});
|
|
30507
|
+
cmd.command("benchmark-baseline-report").description("Compare the current eval store against a named stored benchmark baseline snapshot").requiredOption("--snapshot-id <id>", "Stable baseline snapshot identifier").action(async (...args) => {
|
|
30508
|
+
const options = args[0] ?? {};
|
|
30509
|
+
const summary = await runBenchmarkBaselineReportCliCommand({
|
|
30510
|
+
memoryDir: orchestrator.config.memoryDir,
|
|
30511
|
+
evalStoreDir: orchestrator.config.evalStoreDir,
|
|
30512
|
+
benchmarkDeltaReporterEnabled: orchestrator.config.benchmarkDeltaReporterEnabled,
|
|
30513
|
+
snapshotId: typeof options.snapshotId === "string" ? options.snapshotId : ""
|
|
30514
|
+
});
|
|
30515
|
+
const { markdownReport, ...jsonSummary } = summary;
|
|
30516
|
+
console.log(JSON.stringify(jsonSummary, null, 2));
|
|
30517
|
+
console.log(markdownReport);
|
|
30518
|
+
if (!summary.passed) {
|
|
30519
|
+
throw new Error("benchmark baseline report detected regressions");
|
|
30520
|
+
}
|
|
30521
|
+
console.log("OK");
|
|
30522
|
+
});
|
|
30220
30523
|
cmd.command("objective-state-status").description("Show objective-state store status, snapshot counts, and latest stored snapshot").action(async () => {
|
|
30221
30524
|
const status = await runObjectiveStateStatusCliCommand({
|
|
30222
30525
|
memoryDir: orchestrator.config.memoryDir,
|