@ls-stack/agent-eval 0.55.1 → 0.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BD0D9-7k.mjs → app-Bpe6Monh.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-CfSiAVmi.js +377 -0
- package/dist/apps/web/dist/assets/{index-2I-eWzVL.css → index-Xa_7PteQ.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-BR3wMZMx.mjs → cli-DQO2Fpt2.mjs} +60 -13
- package/dist/index.d.mts +610 -62
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-Sw38bCaq.mjs → runExecution-6lrtj48K.mjs} +458 -121
- package/dist/{runOrchestration-DJsdLYeZ.mjs → runOrchestration-BYaN2mzS.mjs} +1 -1
- package/dist/{runner-dB69WsnM.mjs → runner-C3CiS2o7.mjs} +1 -1
- package/dist/{runner-72rsqJRq.mjs → runner-DYlwuAT3.mjs} +2 -2
- package/dist/{src-hBGtzWuA.mjs → src-DCGrFAmO.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +29 -10
- package/dist/apps/web/dist/assets/index-CvsPmlHl.js +0 -377
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Ct as buildEvalKey, Nt as getEvalRegistry, St as resolveLlmCallsConfig, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as normalizeScoreDef, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as createFsCacheStore, wt as getCaseRowCaseKey, xt as resolveApiCallsConfig } from "./runExecution-Sw38bCaq.mjs";
|
|
2
|
-
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-DJsdLYeZ.mjs";
|
|
1
|
+
import { Ct as buildEvalKey, Nt as getEvalRegistry, St as resolveLlmCallsConfig, bt as runSummarySchema, c as resolveArtifactPath, ct as applyDerivedCallAttributes, dt as getEvalDisplayStatus, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, gt as matchesTagsFilter, h as normalizeScoreDef, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, s as resolveTracePresentation, ut as getEvalTitle, v as createFsCacheStore, wt as getCaseRowCaseKey, xt as resolveApiCallsConfig } from "./runExecution-6lrtj48K.mjs";
|
|
2
|
+
import { C as validateCharts, S as parseEvalDiscovery, _ as runTouchesEval, a as validateTagsFilters, b as deriveEvalFreshness, c as getLatestRunInfos, d as nextShortIdFromSnapshots, f as persistCaseDetail, g as recomputePersistedCaseStatus, h as recomputeEvalStatusesInRuns, i as resolveEvalTags, l as loadPersistedRunSnapshot, m as persistRunState, n as getTargetEvalKeys, o as generateRunId, p as deleteTemporaryRuns, s as getLastRunStatuses, u as loadPersistedRunSnapshots, v as buildManualInputDescriptor, x as loadIsolatedEvalRegistry, y as parseManualInputValues } from "./runOrchestration-BYaN2mzS.mjs";
|
|
3
3
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
5
5
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -794,6 +794,7 @@ async function markRunErrored(runState, message, managerContext) {
|
|
|
794
794
|
payload: { message }
|
|
795
795
|
});
|
|
796
796
|
managerContext.emitDiscoveryEvent();
|
|
797
|
+
managerContext.onRunTerminal();
|
|
797
798
|
}
|
|
798
799
|
async function markRunTerminalFromChild(runState, event, managerContext) {
|
|
799
800
|
const snapshot = await loadPersistedRunSnapshot(runState.runDir);
|
|
@@ -815,6 +816,7 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
|
|
|
815
816
|
}
|
|
816
817
|
managerContext.emitEvent(runState, event);
|
|
817
818
|
managerContext.emitDiscoveryEvent();
|
|
819
|
+
managerContext.onRunTerminal();
|
|
818
820
|
}
|
|
819
821
|
//#endregion
|
|
820
822
|
//#region ../runner/src/runnerStateHydration.ts
|
|
@@ -883,6 +885,7 @@ function getWatchRootsForIncludePatterns(params) {
|
|
|
883
885
|
}
|
|
884
886
|
//#endregion
|
|
885
887
|
//#region ../runner/src/runner.ts
|
|
888
|
+
const defaultCachePruneIdleDelayMs = 5e3;
|
|
886
889
|
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
887
890
|
function createRunner({ watchForChanges = true } = {}) {
|
|
888
891
|
let config;
|
|
@@ -903,6 +906,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
903
906
|
let runHistoryWatcher;
|
|
904
907
|
let discoveryRefreshTimer;
|
|
905
908
|
let runHistoryRefreshTimer;
|
|
909
|
+
let cachePruneIdleTimer;
|
|
906
910
|
let registryLoadCounter = 0;
|
|
907
911
|
const configReload = createConfigReloadController({
|
|
908
912
|
getActiveRunCount,
|
|
@@ -942,6 +946,9 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
942
946
|
async clearCache(filter) {
|
|
943
947
|
await cacheStore.clear(filter);
|
|
944
948
|
},
|
|
949
|
+
async repairCache() {
|
|
950
|
+
return cacheStore.repair();
|
|
951
|
+
},
|
|
945
952
|
async recomputeStatusesForEval(evalKey) {
|
|
946
953
|
const evalMeta = resolveEvalMeta(evalKey);
|
|
947
954
|
if (!evalMeta) return { updatedRuns: 0 };
|
|
@@ -1252,6 +1259,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1252
1259
|
emitDiscoveryEvent();
|
|
1253
1260
|
},
|
|
1254
1261
|
async startRun(request) {
|
|
1262
|
+
cancelCacheRetentionPrune();
|
|
1255
1263
|
const tagsFilterError = validateTagsFilters(request.target.tagsFilter);
|
|
1256
1264
|
if (tagsFilterError !== null) throw new Error(tagsFilterError);
|
|
1257
1265
|
const deletedTemporaryRuns = await deleteTemporaryRuns({
|
|
@@ -1349,7 +1357,8 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1349
1357
|
workspaceRoot,
|
|
1350
1358
|
evals,
|
|
1351
1359
|
emitEvent,
|
|
1352
|
-
emitDiscoveryEvent
|
|
1360
|
+
emitDiscoveryEvent,
|
|
1361
|
+
onRunTerminal: scheduleCacheRetentionPrune
|
|
1353
1362
|
}
|
|
1354
1363
|
});
|
|
1355
1364
|
if (deletedTemporaryRuns > 0) emitDiscoveryEvent();
|
|
@@ -1398,6 +1407,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1398
1407
|
payload: run.summary
|
|
1399
1408
|
});
|
|
1400
1409
|
emitDiscoveryEvent();
|
|
1410
|
+
scheduleCacheRetentionPrune();
|
|
1401
1411
|
},
|
|
1402
1412
|
getCaseDetail(runId, caseId) {
|
|
1403
1413
|
const run = runs.get(runId);
|
|
@@ -1419,6 +1429,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1419
1429
|
};
|
|
1420
1430
|
},
|
|
1421
1431
|
async close() {
|
|
1432
|
+
cancelCacheRetentionPrune();
|
|
1422
1433
|
await Promise.all([closeWatchers(), configReload.close()]);
|
|
1423
1434
|
},
|
|
1424
1435
|
getWorkspaceRoot() {
|
|
@@ -1522,6 +1533,28 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1522
1533
|
function getActiveRunCount() {
|
|
1523
1534
|
return [...runs.values()].filter((run) => run.manifest.status === "running").length;
|
|
1524
1535
|
}
|
|
1536
|
+
function getCachePruneIdleDelayMs() {
|
|
1537
|
+
const configured = config.cache?.pruneIdleDelayMs;
|
|
1538
|
+
if (configured === void 0 || !Number.isFinite(configured) || configured <= 0) return defaultCachePruneIdleDelayMs;
|
|
1539
|
+
return Math.floor(configured);
|
|
1540
|
+
}
|
|
1541
|
+
function cancelCacheRetentionPrune() {
|
|
1542
|
+
if (cachePruneIdleTimer === void 0) return;
|
|
1543
|
+
clearTimeout(cachePruneIdleTimer);
|
|
1544
|
+
cachePruneIdleTimer = void 0;
|
|
1545
|
+
}
|
|
1546
|
+
function scheduleCacheRetentionPrune() {
|
|
1547
|
+
cancelCacheRetentionPrune();
|
|
1548
|
+
cachePruneIdleTimer = setTimeout(() => {
|
|
1549
|
+
cachePruneIdleTimer = void 0;
|
|
1550
|
+
pruneCacheRetentionIfIdle();
|
|
1551
|
+
}, getCachePruneIdleDelayMs());
|
|
1552
|
+
cachePruneIdleTimer.unref();
|
|
1553
|
+
}
|
|
1554
|
+
async function pruneCacheRetentionIfIdle() {
|
|
1555
|
+
if (getActiveRunCount() > 0) return;
|
|
1556
|
+
await cacheStore.pruneRetention();
|
|
1557
|
+
}
|
|
1525
1558
|
function emitDiscoveryEvent() {
|
|
1526
1559
|
const lastRunStatuses = getLastRunStatuses({
|
|
1527
1560
|
runs: runs.values(),
|
|
@@ -1657,19 +1690,20 @@ Flags:
|
|
|
1657
1690
|
`);
|
|
1658
1691
|
return;
|
|
1659
1692
|
}
|
|
1660
|
-
if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
|
|
1693
|
+
if (topic === "cache" || topic === "cache list" || topic === "cache clear" || topic === "cache repair") {
|
|
1661
1694
|
console.info(`
|
|
1662
|
-
agent-evals cache - Manage cached
|
|
1695
|
+
agent-evals cache - Manage cached namespace/key entries
|
|
1663
1696
|
|
|
1664
1697
|
Usage:
|
|
1665
1698
|
agent-evals cache list [flags]
|
|
1666
1699
|
agent-evals cache clear --eval <id>
|
|
1667
1700
|
agent-evals cache clear --all
|
|
1701
|
+
agent-evals cache repair [flags]
|
|
1668
1702
|
|
|
1669
1703
|
Flags:
|
|
1670
1704
|
--eval <id> Clear entries for specific eval(s) (comma-separated)
|
|
1671
1705
|
--all Confirm clearing every cached entry
|
|
1672
|
-
--json Output cache listing as JSON
|
|
1706
|
+
--json Output cache listing or repair summary as JSON
|
|
1673
1707
|
--no-env Disable automatic .env loading
|
|
1674
1708
|
--help, -h Show this help
|
|
1675
1709
|
`);
|
|
@@ -1683,9 +1717,10 @@ Commands:
|
|
|
1683
1717
|
list List discovered evals
|
|
1684
1718
|
run Run evals
|
|
1685
1719
|
show-runs [id|latest] Show saved run artifact file paths
|
|
1686
|
-
cache list List cached
|
|
1720
|
+
cache list List cached namespace/key entries
|
|
1687
1721
|
cache clear --eval <id> Clear cache entries for one eval
|
|
1688
1722
|
cache clear --all Clear every cached entry
|
|
1723
|
+
cache repair Remove unindexed/orphaned cache files
|
|
1689
1724
|
help Show this help
|
|
1690
1725
|
|
|
1691
1726
|
Options:
|
|
@@ -1961,7 +1996,7 @@ function parseArgs(argv) {
|
|
|
1961
1996
|
let cursor = 1;
|
|
1962
1997
|
if (args.command === "cache") {
|
|
1963
1998
|
const sub = normalizedArgv[cursor];
|
|
1964
|
-
if (sub === "list" || sub === "clear") {
|
|
1999
|
+
if (sub === "list" || sub === "clear" || sub === "repair") {
|
|
1965
2000
|
args.subcommand = sub;
|
|
1966
2001
|
args.helpTopic = `cache ${sub}`;
|
|
1967
2002
|
cursor++;
|
|
@@ -2136,8 +2171,8 @@ async function commandApp(args) {
|
|
|
2136
2171
|
const { serve } = await import("@hono/node-server");
|
|
2137
2172
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2138
2173
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2139
|
-
const appModule = await import("./app-BD0D9-7k.mjs");
|
|
2140
|
-
const runnerModule = await import("./runner-dB69WsnM.mjs");
|
|
2174
|
+
const appModule = await import("./app-Bpe6Monh.mjs");
|
|
2175
|
+
const runnerModule = await import("./runner-C3CiS2o7.mjs");
|
|
2141
2176
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2142
2177
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2143
2178
|
await runnerModule.initRunner();
|
|
@@ -2311,10 +2346,8 @@ async function commandCache(args) {
|
|
|
2311
2346
|
for (const entry of entries) {
|
|
2312
2347
|
console.info(` ${entry.namespace}`);
|
|
2313
2348
|
console.info(` key: ${entry.key}`);
|
|
2314
|
-
const operationLabel = entry.operationType === "span" ? `${entry.operationName} (span ${entry.spanKind ?? "unknown"})` : `${entry.operationName} (value)`;
|
|
2315
|
-
console.info(` operation: ${operationLabel}`);
|
|
2316
2349
|
console.info(` stored: ${entry.storedAt}`);
|
|
2317
|
-
console.info(`
|
|
2350
|
+
console.info(` last accessed: ${entry.lastAccessedAt}`);
|
|
2318
2351
|
console.info("");
|
|
2319
2352
|
}
|
|
2320
2353
|
return;
|
|
@@ -2342,6 +2375,20 @@ async function commandCache(args) {
|
|
|
2342
2375
|
console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
|
|
2343
2376
|
process.exit(1);
|
|
2344
2377
|
}
|
|
2378
|
+
if (args.subcommand === "repair") {
|
|
2379
|
+
const summary = await runner.repairCache();
|
|
2380
|
+
if (args.json) {
|
|
2381
|
+
console.info(JSON.stringify(summary, null, 2));
|
|
2382
|
+
return;
|
|
2383
|
+
}
|
|
2384
|
+
console.info("Cache repair complete.");
|
|
2385
|
+
console.info(`Removed cache files: ${String(summary.removedCacheFiles)}`);
|
|
2386
|
+
console.info(`Removed debug files: ${String(summary.removedDebugFiles)}`);
|
|
2387
|
+
console.info(`Removed blob files: ${String(summary.removedBlobFiles)}`);
|
|
2388
|
+
console.info(`Removed index rows: ${String(summary.removedIndexRows)}`);
|
|
2389
|
+
console.info(`Rewritten indexes: ${String(summary.rewrittenIndexes)}`);
|
|
2390
|
+
return;
|
|
2391
|
+
}
|
|
2345
2392
|
printHelp(args.helpTopic);
|
|
2346
2393
|
}
|
|
2347
2394
|
function getSortedRunSnapshots(runner) {
|