@joshuaswarren/openclaw-engram 9.0.17 → 9.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -3
- package/dist/index.js +276 -90
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -30,7 +30,7 @@ AI agents forget everything between conversations. Engram fixes that.
|
|
|
30
30
|
- **Local-first** — All memory data stays on your filesystem as plain markdown files. No cloud dependency, no vendor lock-in, fully portable.
|
|
31
31
|
- **Pluggable search** — Choose from six search backends: QMD (hybrid BM25+vector+reranking), LanceDB, Meilisearch, Orama, remote HTTP, or bring your own.
|
|
32
32
|
- **Memory OS features** — Graph recall, temporal memory tree, lifecycle policy, compounding, shared context, memory boxes, and identity continuity can be enabled progressively as your install grows.
|
|
33
|
-
- **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording, so memory improvements can be measured
|
|
33
|
+
- **Benchmark-first roadmap** — Engram now has an evaluation harness with live shadow recall recording and a CI benchmark delta gate, so memory improvements can be measured and regression-checked instead of argued from anecdotes.
|
|
34
34
|
- **Zero-config start** — Install, add an API key, restart. Engram works out of the box with sensible defaults and progressively unlocks advanced features as you enable them.
|
|
35
35
|
|
|
36
36
|
## Quick Start
|
|
@@ -139,7 +139,7 @@ Engram's capabilities are organized into feature families that you can enable pr
|
|
|
139
139
|
| **Compounding** | Weekly synthesis that surfaces patterns and recurring mistakes |
|
|
140
140
|
| **Hot/Cold Tiering** | Automatic migration of aging memories to cold storage |
|
|
141
141
|
| **Behavior Loop Tuning** | Runtime self-tuning of extraction and recall parameters |
|
|
142
|
-
| **Evaluation Harness** | Tracks benchmark packs, run summaries,
|
|
142
|
+
| **Evaluation Harness** | Tracks benchmark packs, run summaries, live shadow recall records, and CI delta comparisons so future PRs can be gated on memory quality instead of anecdotes |
|
|
143
143
|
|
|
144
144
|
Start with defaults, then enable features as needed. See [Enable All Features](docs/enable-all-v8.md) for a full-feature config profile.
|
|
145
145
|
|
|
@@ -152,6 +152,7 @@ openclaw engram compat --strict # Compatibility check
|
|
|
152
152
|
openclaw engram benchmark-status # Benchmark/eval harness packs, runs, shadow recalls, latest summaries
|
|
153
153
|
openclaw engram benchmark-validate <path> # Validate a benchmark manifest or pack directory
|
|
154
154
|
openclaw engram benchmark-import <path> # Import a validated benchmark pack into the eval store
|
|
155
|
+
openclaw engram benchmark-ci-gate # Compare base vs candidate eval stores and fail on regressions
|
|
155
156
|
openclaw engram conversation-index-health # Conversation index status
|
|
156
157
|
openclaw engram graph-health # Entity graph status
|
|
157
158
|
openclaw engram tier-status # Hot/cold tier metrics
|
|
@@ -183,7 +184,7 @@ Full reference: [Config Reference](docs/config-reference.md)
|
|
|
183
184
|
- [Search Backends](docs/search-backends.md) — Choosing and configuring search engines
|
|
184
185
|
- [Writing a Search Backend](docs/writing-a-search-backend.md) — Build your own adapter
|
|
185
186
|
- [Config Reference](docs/config-reference.md) — Every setting with defaults
|
|
186
|
-
- [Evaluation Harness](docs/evaluation-harness.md) — Benchmark pack and
|
|
187
|
+
- [Evaluation Harness](docs/evaluation-harness.md) — Benchmark pack, shadow recall, and CI delta gate format
|
|
187
188
|
- [Architecture Overview](docs/architecture/overview.md) — System design and storage layout
|
|
188
189
|
- [Retrieval Pipeline](docs/architecture/retrieval-pipeline.md) — How recall works
|
|
189
190
|
- [Memory Lifecycle](docs/architecture/memory-lifecycle.md) — Write, consolidation, expiry
|
package/dist/index.js
CHANGED
|
@@ -11717,6 +11717,151 @@ async function listNamedFiles(dir, fileName) {
|
|
|
11717
11717
|
/**
 * Loads a file as UTF-8 text and parses it as JSON.
 *
 * @param {string} filePath - Location of the JSON document to read.
 * @returns {Promise<any>} The parsed JSON value.
 * @throws Propagates whatever `readFile12` rejects with, and the
 *   `SyntaxError` raised by `JSON.parse` on malformed content.
 */
async function readJsonFile(filePath) {
  const rawText = await readFile12(filePath, "utf-8");
  return JSON.parse(rawText);
}
|
|
11720
|
+
// Metric names for which a smaller value is the better outcome.
// compareMetricDeltas treats every metric not listed here as higher-is-better.
var LOWER_IS_BETTER_METRICS = /* @__PURE__ */ new Set(["trustViolationRate"]);
|
|
11721
|
+
/**
 * Computes the fraction of benchmark cases that passed in a run summary.
 *
 * @param {{ totalCases: number, passedCases: number }} run - Run summary.
 * @returns {number} `passedCases / totalCases`, or 0 when the run has no
 *   cases or the ratio is not a finite number.
 */
function computePassRate(run) {
  const { totalCases, passedCases } = run;
  if (!(totalCases > 0)) return 0;
  const rate = passedCases / totalCases;
  // A missing or non-numeric passedCases produces NaN. NaN compares false
  // against both `< 0` and `> 0`, so a caller comparing two rates would
  // record neither a regression nor an improvement. Report 0 instead.
  return Number.isFinite(rate) ? rate : 0;
}
|
|
11724
|
+
/**
 * Picks, for every benchmark id, the most recent run whose status is
 * "completed".
 *
 * Recency is taken from `completedAt`; when that is absent or cannot be
 * parsed as a date, `startedAt` is used instead, and a run with no parseable
 * timestamp sorts as time 0 (oldest). Runs with equal timestamps keep their
 * input order, so the earlier one in `runs` wins.
 *
 * @param {Iterable<object>} runs - Run summaries; the input is not mutated.
 * @returns {Map<string, object>} Benchmark id -> latest completed run,
 *   inserted newest first.
 */
function latestCompletedRunsByBenchmark(runs) {
  // Falling back to startedAt keeps a run with a malformed completedAt from
  // being ranked at epoch 0 and shadowed by genuinely older runs.
  const resolveTimestamp = (run) => {
    for (const value of [run.completedAt, run.startedAt]) {
      if (value == null) continue;
      const parsed = Date.parse(value);
      if (!Number.isNaN(parsed)) return parsed;
    }
    return 0;
  };
  // Parse each timestamp once up front rather than on every comparison.
  const ranked = [...runs]
    .filter((run) => run.status === "completed")
    .map((run) => ({ run, time: resolveTimestamp(run) }))
    .sort((a, b) => b.time - a.time);
  const latestByBenchmark = /* @__PURE__ */ new Map();
  for (const { run } of ranked) {
    if (!latestByBenchmark.has(run.benchmarkId)) {
      latestByBenchmark.set(run.benchmarkId, run);
    }
  }
  return latestByBenchmark;
}
|
|
11738
|
+
/**
 * Compares the numeric metrics of a base run against a candidate run.
 *
 * Only metric names present on `baseMetrics` are considered, and only when
 * both sides hold a value of type "number". A metric is treated as
 * higher-is-better unless its name is in `LOWER_IS_BETTER_METRICS`.
 *
 * NOTE(review): a metric that exists on the base but is missing from the
 * candidate is skipped silently rather than reported — confirm that is the
 * intended behavior for a gate.
 *
 * @param {Record<string, unknown> | null | undefined} baseMetrics
 * @param {Record<string, unknown> | null | undefined} candidateMetrics
 * @param {number} [tolerance=0] - Absolute difference at or below which a
 *   change is ignored (useful for floating-point noise). Non-finite or
 *   negative values are treated as 0, which reproduces exact comparison.
 * @returns {{ deltas: Record<string, number>, regressions: string[], improvements: string[] }}
 *   `deltas` holds `candidate - base` for every compared metric; the two
 *   lists hold `"<metric> <base> -> <candidate>"` summaries.
 */
function compareMetricDeltas(baseMetrics, candidateMetrics, tolerance = 0) {
  const deltas = {};
  const regressions = [];
  const improvements = [];
  if (!baseMetrics || !candidateMetrics) {
    return { deltas, regressions, improvements };
  }
  const slack = Number.isFinite(tolerance) && tolerance > 0 ? tolerance : 0;
  for (const [metric, baseValue] of Object.entries(baseMetrics)) {
    const candidateValue = candidateMetrics[metric];
    if (typeof baseValue !== "number" || typeof candidateValue !== "number") continue;
    const delta = candidateValue - baseValue;
    deltas[metric] = delta;
    // With slack === 0 this is equivalent to `delta === 0`; a NaN delta is
    // never "within tolerance" and falls through to the regression branch.
    if (Math.abs(delta) <= slack) continue;
    const improved = LOWER_IS_BETTER_METRICS.has(metric) ? delta < 0 : delta > 0;
    const summary = `${metric} ${baseValue} -> ${candidateValue}`;
    (improved ? improvements : regressions).push(summary);
  }
  return { deltas, regressions, improvements };
}
|
|
11763
|
+
/**
 * Reads an eval store directory and summarises its contents.
 *
 * The store is expected to contain three sub-directories under `rootDir`:
 * `benchmarks` (searched for `manifest.json` files), `runs` and `shadow`
 * (searched for JSON files). Every file is parsed and passed through its
 * validator; files that fail to read, parse or validate are collected in the
 * matching `invalid*` list instead of aborting the scan.
 *
 * @param {object} options
 * @param {string} options.rootDir - Root directory of the eval store.
 * @param {boolean} options.enabled - Echoed into `status.enabled`.
 * @param {boolean} options.shadowModeEnabled - Echoed into
 *   `status.shadowModeEnabled`.
 * @returns {Promise<{ status: object, manifests: object[], runs: object[], shadows: object[] }>}
 *   `runs` is sorted newest first by `completedAt ?? startedAt` (unparseable
 *   timestamps sort as 0); `shadows` is sorted by `recordedAt` descending.
 */
async function collectEvalStoreSnapshot(options) {
  const { rootDir } = options;
  const benchmarkDir = path14.join(rootDir, "benchmarks");
  const runsDir = path14.join(rootDir, "runs");
  const shadowDir = path14.join(rootDir, "shadow");

  // The three listings are independent, so issue them together.
  const [benchmarkFiles, runFiles, shadowFiles] = await Promise.all([
    listNamedFiles(benchmarkDir, "manifest.json"),
    listJsonFiles(runsDir),
    listJsonFiles(shadowDir)
  ]);

  // Reads and validates files one at a time, preserving listing order, and
  // splits them into validated records and { path, error } failure entries.
  const loadValidated = async (filePaths, validate) => {
    const valid = [];
    const invalid = [];
    for (const filePath of filePaths) {
      try {
        valid.push(validate(await readJsonFile(filePath)));
      } catch (error) {
        invalid.push({
          path: filePath,
          error: error instanceof Error ? error.message : String(error)
        });
      }
    }
    return { valid, invalid };
  };

  const { valid: manifests, invalid: invalidBenchmarks } = await loadValidated(
    benchmarkFiles,
    validateEvalBenchmarkManifest
  );
  const { valid: runs, invalid: invalidRuns } = await loadValidated(runFiles, validateEvalRunSummary);
  const { valid: shadows, invalid: invalidShadows } = await loadValidated(
    shadowFiles,
    validateEvalShadowRecallRecord
  );

  const runTimestamp = (run) => {
    const parsed = Date.parse(run.completedAt ?? run.startedAt);
    return Number.isNaN(parsed) ? 0 : parsed;
  };
  runs.sort((a, b) => runTimestamp(b) - runTimestamp(a));
  shadows.sort((a, b) => b.recordedAt.localeCompare(a.recordedAt));

  const tags = /* @__PURE__ */ new Set();
  const sourceLinks = /* @__PURE__ */ new Set();
  let totalCases = 0;
  for (const manifest of manifests) {
    totalCases += manifest.cases.length;
    for (const tag of manifest.tags ?? []) tags.add(tag);
    for (const link of manifest.sourceLinks ?? []) sourceLinks.add(link);
  }

  const countRunsWithStatus = (status) => runs.filter((run) => run.status === status).length;
  const latestRun = runs[0];
  const latestShadow = shadows[0];

  return {
    status: {
      enabled: options.enabled,
      shadowModeEnabled: options.shadowModeEnabled,
      rootDir,
      benchmarkDir,
      runsDir,
      benchmarks: {
        total: benchmarkFiles.length,
        valid: manifests.length,
        invalid: invalidBenchmarks.length,
        totalCases,
        tags: [...tags].sort(),
        sourceLinks: [...sourceLinks].sort()
      },
      runs: {
        total: runFiles.length,
        invalid: invalidRuns.length,
        completed: countRunsWithStatus("completed"),
        failed: countRunsWithStatus("failed"),
        partial: countRunsWithStatus("partial"),
        running: countRunsWithStatus("running"),
        latestRunId: latestRun?.runId,
        latestBenchmarkId: latestRun?.benchmarkId,
        latestCompletedAt: latestRun?.completedAt
      },
      shadows: {
        total: shadowFiles.length,
        invalid: invalidShadows.length,
        latestTraceId: latestShadow?.traceId,
        latestRecordedAt: latestShadow?.recordedAt,
        latestSessionKey: latestShadow?.sessionKey
      },
      latestRun,
      latestShadow,
      invalidBenchmarks,
      invalidRuns,
      invalidShadows
    },
    manifests,
    runs,
    shadows
  };
}
|
|
11720
11865
|
async function resolveBenchmarkManifestPath(sourcePath) {
|
|
11721
11866
|
const info = await stat3(sourcePath);
|
|
11722
11867
|
if (info.isDirectory()) {
|
|
@@ -11793,102 +11938,125 @@ async function recordEvalShadowRecall(options) {
|
|
|
11793
11938
|
return targetPath;
|
|
11794
11939
|
}
|
|
11795
11940
|
/**
 * Reports the status section of the eval store snapshot.
 *
 * The store root is obtained from `resolveEvalStoreDir(memoryDir,
 * evalStoreDir)`; the `enabled` and `shadowModeEnabled` flags are passed
 * through to the snapshot unchanged.
 *
 * @param {object} options
 * @param {string} options.memoryDir
 * @param {string} [options.evalStoreDir]
 * @param {boolean} options.enabled
 * @param {boolean} options.shadowModeEnabled
 * @returns {Promise<object>} The `status` object produced by
 *   `collectEvalStoreSnapshot`.
 */
async function getEvalHarnessStatus(options) {
  const rootDir = resolveEvalStoreDir(options.memoryDir, options.evalStoreDir);
  const snapshot = await collectEvalStoreSnapshot({
    rootDir,
    enabled: options.enabled,
    shadowModeEnabled: options.shadowModeEnabled
  });
  return snapshot.status;
}
|
|
11947
|
+
/**
 * Determines the eval store root from explicit options, failing when neither
 * source is usable.
 *
 * A non-blank `evalStoreDir` wins and is returned trimmed. Otherwise a
 * non-blank `memoryDir` is trimmed and handed to `resolveEvalStoreDir`.
 *
 * @param {{ memoryDir?: unknown, evalStoreDir?: unknown }} options
 * @param {string} label - Prefix for the error message (e.g. "base").
 * @returns {string} The eval store root directory.
 * @throws {Error} `"<label> requires memoryDir or evalStoreDir"` when both
 *   values are missing, non-string, or blank.
 */
function resolveRequiredEvalStoreRoot(options, label) {
  // Non-strings collapse to "" so they are handled like blank input.
  const trimmedOrEmpty = (value) => (typeof value === "string" ? value.trim() : "");

  const explicitStoreDir = trimmedOrEmpty(options.evalStoreDir);
  if (explicitStoreDir.length > 0) {
    return explicitStoreDir;
  }

  const memoryDir = trimmedOrEmpty(options.memoryDir);
  if (memoryDir.length > 0) {
    return resolveEvalStoreDir(memoryDir);
  }

  throw new Error(`${label} requires memoryDir or evalStoreDir`);
}
|
|
11956
|
+
/**
 * Compares a candidate eval store against a base eval store and reports
 * whether the candidate regressed.
 *
 * A regression is recorded when:
 *  - either store contains invalid benchmark manifests, run summaries or
 *    shadow records;
 *  - a benchmark with a latest completed run in the base store has no
 *    completed run in the candidate store;
 *  - the candidate's pass rate for a benchmark is lower than the base's;
 *  - `compareMetricDeltas` reports a metric regression for a benchmark.
 *
 * Benchmarks that exist only in the candidate store are not compared.
 *
 * @param {object} options
 * @param {string} [options.baseMemoryDir]
 * @param {string} [options.baseEvalStoreDir]
 * @param {string} [options.candidateMemoryDir]
 * @param {string} [options.candidateEvalStoreDir]
 * @returns {Promise<object>} Summary with `passed` (true when no regression
 *   was recorded), both root dirs, `comparedBenchmarks`,
 *   `missingCandidateBenchmarks`, `invalidArtifacts` counts, the flat
 *   `regressions` / `improvements` message lists, and per-benchmark `deltas`.
 * @throws {Error} When either store root cannot be resolved (raised by
 *   `resolveRequiredEvalStoreRoot`).
 */
async function runEvalBenchmarkCiGate(options) {
  const baseRootDir = resolveRequiredEvalStoreRoot(
    { memoryDir: options.baseMemoryDir, evalStoreDir: options.baseEvalStoreDir },
    "base"
  );
  const candidateRootDir = resolveRequiredEvalStoreRoot(
    { memoryDir: options.candidateMemoryDir, evalStoreDir: options.candidateEvalStoreDir },
    "candidate"
  );

  // The two stores are independent, so scan them concurrently.
  const [baseSnapshot, candidateSnapshot] = await Promise.all(
    [baseRootDir, candidateRootDir].map(
      (rootDir) => collectEvalStoreSnapshot({ rootDir, enabled: true, shadowModeEnabled: true })
    )
  );

  const regressions = [];
  const improvements = [];

  // Invalid artifacts in either store fail the gate. Messages are emitted
  // base-first, then in benchmarks / runs / shadows order.
  const stores = [
    ["base", baseSnapshot.status],
    ["candidate", candidateSnapshot.status]
  ];
  for (const [label, status] of stores) {
    const invalidCounts = [
      [status.invalidBenchmarks.length, "invalid benchmark manifest(s)"],
      [status.invalidRuns.length, "invalid run summary file(s)"],
      [status.invalidShadows.length, "invalid shadow record(s)"]
    ];
    for (const [count, description] of invalidCounts) {
      if (count > 0) {
        regressions.push(`${label} store has ${count} ${description}`);
      }
    }
  }

  const baseRuns = latestCompletedRunsByBenchmark(baseSnapshot.runs);
  const candidateRuns = latestCompletedRunsByBenchmark(candidateSnapshot.runs);
  const baseBenchmarkIds = [...baseRuns.keys()].sort();

  const missingCandidateBenchmarks = baseBenchmarkIds.filter(
    (benchmarkId) => !candidateRuns.has(benchmarkId)
  );
  for (const benchmarkId of missingCandidateBenchmarks) {
    regressions.push(`candidate is missing latest completed benchmark run for ${benchmarkId}`);
  }

  const deltas = [];
  for (const benchmarkId of baseBenchmarkIds) {
    const baseRun = baseRuns.get(benchmarkId);
    const candidateRun = candidateRuns.get(benchmarkId);
    // Missing candidates were already reported above.
    if (!baseRun || !candidateRun) continue;

    const basePassRate = computePassRate(baseRun);
    const candidatePassRate = computePassRate(candidateRun);
    const passRateDelta = candidatePassRate - basePassRate;
    const delta = {
      benchmarkId,
      baseRunId: baseRun.runId,
      candidateRunId: candidateRun.runId,
      basePassRate,
      candidatePassRate,
      passRateDelta,
      metricDeltas: {},
      regressions: [],
      improvements: []
    };

    if (passRateDelta < 0) {
      delta.regressions.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      regressions.push(`${benchmarkId} pass rate regressed (${basePassRate} -> ${candidatePassRate})`);
    } else if (passRateDelta > 0) {
      delta.improvements.push(`passRate ${basePassRate} -> ${candidatePassRate}`);
      improvements.push(`${benchmarkId} pass rate improved (${basePassRate} -> ${candidatePassRate})`);
    }

    const metricDelta = compareMetricDeltas(baseRun.metrics, candidateRun.metrics);
    delta.metricDeltas = metricDelta.deltas;
    for (const regression of metricDelta.regressions) {
      delta.regressions.push(regression);
      regressions.push(`${benchmarkId} ${regression}`);
    }
    for (const improvement of metricDelta.improvements) {
      delta.improvements.push(improvement);
      improvements.push(`${benchmarkId} ${improvement}`);
    }
    deltas.push(delta);
  }

  return {
    passed: regressions.length === 0,
    baseRootDir: baseSnapshot.status.rootDir,
    candidateRootDir: candidateSnapshot.status.rootDir,
    comparedBenchmarks: deltas.length,
    missingCandidateBenchmarks,
    invalidArtifacts: {
      base: {
        benchmarks: baseSnapshot.status.invalidBenchmarks.length,
        runs: baseSnapshot.status.invalidRuns.length,
        shadows: baseSnapshot.status.invalidShadows.length
      },
      candidate: {
        benchmarks: candidateSnapshot.status.invalidBenchmarks.length,
        runs: candidateSnapshot.status.invalidRuns.length,
        shadows: candidateSnapshot.status.invalidShadows.length
      }
    },
    regressions,
    improvements,
    deltas
  };
}
|
|
11894
12062
|
|
|
@@ -26026,6 +26194,12 @@ async function runBenchmarkImportCliCommand(options) {
|
|
|
26026
26194
|
force: options.force === true
|
|
26027
26195
|
});
|
|
26028
26196
|
}
|
|
26197
|
+
/**
 * CLI adapter for the benchmark CI gate: forwards the two eval store
 * directories to `runEvalBenchmarkCiGate` and returns its summary.
 *
 * @param {object} options
 * @param {string} options.baseEvalStoreDir - Base eval store directory.
 * @param {string} options.candidateEvalStoreDir - Candidate eval store
 *   directory.
 * @returns {Promise<object>} The summary produced by `runEvalBenchmarkCiGate`.
 */
async function runBenchmarkCiGateCliCommand(options) {
  const { baseEvalStoreDir, candidateEvalStoreDir } = options;
  return runEvalBenchmarkCiGate({ baseEvalStoreDir, candidateEvalStoreDir });
}
|
|
26029
26203
|
async function runSessionCheckCliCommand(options) {
|
|
26030
26204
|
return analyzeSessionIntegrity({ memoryDir: options.memoryDir });
|
|
26031
26205
|
}
|
|
@@ -27153,6 +27327,18 @@ function registerCli(api, orchestrator) {
|
|
|
27153
27327
|
console.log(JSON.stringify(summary, null, 2));
|
|
27154
27328
|
console.log("OK");
|
|
27155
27329
|
});
|
|
27330
|
+
cmd.command("benchmark-ci-gate").description("Compare two eval stores and fail when the candidate regresses benchmark outcomes").requiredOption("--base <path>", "Path to the base eval store directory").requiredOption("--candidate <path>", "Path to the candidate eval store directory").action(async (...args) => {
|
|
27331
|
+
const options = args[0] ?? {};
|
|
27332
|
+
const summary = await runBenchmarkCiGateCliCommand({
|
|
27333
|
+
baseEvalStoreDir: typeof options.base === "string" ? options.base : "",
|
|
27334
|
+
candidateEvalStoreDir: typeof options.candidate === "string" ? options.candidate : ""
|
|
27335
|
+
});
|
|
27336
|
+
console.log(JSON.stringify(summary, null, 2));
|
|
27337
|
+
if (!summary.passed) {
|
|
27338
|
+
throw new Error("benchmark CI gate detected regressions");
|
|
27339
|
+
}
|
|
27340
|
+
console.log("OK");
|
|
27341
|
+
});
|
|
27156
27342
|
cmd.command("conversation-index-health").description("Show conversation index backend health and index stats").action(async () => {
|
|
27157
27343
|
const health = await runConversationIndexHealthCliCommand(orchestrator);
|
|
27158
27344
|
console.log(JSON.stringify(health, null, 2));
|