@ls-stack/agent-eval 0.42.1 → 0.42.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-mOYjX9zq.mjs → app-DPamBr5R.mjs} +4 -4
- package/dist/apps/web/dist/assets/{index-CANDLTsq.js → index-DH0qYD76.js} +1 -1
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-DbVfkr9T.mjs → cli-BeJCJMQo.mjs} +49 -23
- package/dist/index.d.mts +110 -111
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-SPaHx-SC.mjs → runOrchestration-OVUFw1fL.mjs} +12 -23
- package/dist/{runner-DiCQ57JQ.mjs → runner-B1KygirW.mjs} +2 -2
- package/dist/{runner-BYOdLBle.mjs → runner-BJQq7cpd.mjs} +1 -1
- package/dist/{src-CANi3gpd.mjs → src-D7_xKo7h.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +15 -8
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-CANDLTsq.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-DH0qYD76.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-eFM9VIsz.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig,
|
|
1
|
+
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-OVUFw1fL.mjs";
|
|
2
2
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
4
4
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -476,6 +476,11 @@ function validateManualInputsForRequest(params) {
|
|
|
476
476
|
};
|
|
477
477
|
}
|
|
478
478
|
//#endregion
|
|
479
|
+
//#region ../runner/src/objectUtils.ts
|
|
480
|
+
function isRecord(value) {
|
|
481
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
482
|
+
}
|
|
483
|
+
//#endregion
|
|
479
484
|
//#region ../runner/src/recalculateDerivedAttributes.ts
|
|
480
485
|
function getCaseArtifactFileIdForExistingRun(runState, caseRow) {
|
|
481
486
|
const caseKey = getCaseRowCaseKey(caseRow);
|
|
@@ -505,7 +510,7 @@ async function recalculateDerivedAttributesForCase(params) {
|
|
|
505
510
|
});
|
|
506
511
|
let nextTrace = spansWithDerivedAttributes;
|
|
507
512
|
let nextTraceDisplay = caseDetail.traceDisplay;
|
|
508
|
-
const evalMeta = params.evals.get(
|
|
513
|
+
const evalMeta = caseRow.evalKey === void 0 ? void 0 : params.evals.get(caseRow.evalKey);
|
|
509
514
|
const entry = evalMeta === void 0 ? void 0 : getEvalRegistry().get(evalMeta.id);
|
|
510
515
|
if (entry !== void 0) entry.use((evalDef) => {
|
|
511
516
|
const resolved = resolveTracePresentation(spansWithDerivedAttributes, params.traceDisplayConfig, evalDef.traceDisplay);
|
|
@@ -787,6 +792,36 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
|
|
|
787
792
|
managerContext.emitDiscoveryEvent();
|
|
788
793
|
}
|
|
789
794
|
//#endregion
|
|
795
|
+
//#region ../runner/src/runnerStateHydration.ts
|
|
796
|
+
/** Rehydrate a persisted run while preserving live listeners/process handles. */
|
|
797
|
+
function toRunnerRunState(snapshot, existing) {
|
|
798
|
+
return {
|
|
799
|
+
...snapshot,
|
|
800
|
+
listeners: existing?.listeners ?? /* @__PURE__ */ new Set(),
|
|
801
|
+
childProcess: existing?.childProcess,
|
|
802
|
+
childTerminalReceived: existing?.childTerminalReceived ?? false
|
|
803
|
+
};
|
|
804
|
+
}
|
|
805
|
+
//#endregion
|
|
806
|
+
//#region ../runner/src/runTargetPersistence.ts
|
|
807
|
+
/** Build the exact-key run target persisted in run history. */
|
|
808
|
+
function buildPersistedRunTarget(params) {
|
|
809
|
+
const { target, evalKeys } = params;
|
|
810
|
+
if (target.mode === "all") return { mode: "all" };
|
|
811
|
+
const persistEvalKeys = (target.evalKeys?.length ?? 0) > 0 || (target.evalIds?.length ?? 0) > 0 || (target.files?.length ?? 0) > 0;
|
|
812
|
+
const keyedTarget = {
|
|
813
|
+
mode: target.mode,
|
|
814
|
+
evalKeys: persistEvalKeys && evalKeys.length > 0 ? evalKeys : void 0,
|
|
815
|
+
files: target.files,
|
|
816
|
+
tagsFilter: target.tagsFilter
|
|
817
|
+
};
|
|
818
|
+
if (target.mode === "caseIds") return {
|
|
819
|
+
...keyedTarget,
|
|
820
|
+
caseIds: target.caseIds
|
|
821
|
+
};
|
|
822
|
+
return keyedTarget;
|
|
823
|
+
}
|
|
824
|
+
//#endregion
|
|
790
825
|
//#region ../runner/src/watchRoots.ts
|
|
791
826
|
const globMagicCharacters = new Set([
|
|
792
827
|
"*",
|
|
@@ -823,9 +858,6 @@ function getWatchRootsForIncludePatterns(params) {
|
|
|
823
858
|
}
|
|
824
859
|
//#endregion
|
|
825
860
|
//#region ../runner/src/runner.ts
|
|
826
|
-
function isRecord(value) {
|
|
827
|
-
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
828
|
-
}
|
|
829
861
|
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
830
862
|
function createRunner({ watchForChanges = true } = {}) {
|
|
831
863
|
let config;
|
|
@@ -898,7 +930,6 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
898
930
|
const updatedRuns = await recomputeEvalStatusesInRuns({
|
|
899
931
|
runs: runs.values(),
|
|
900
932
|
evalKey: evalMeta.key,
|
|
901
|
-
evalId: evalMeta.id,
|
|
902
933
|
evalExists: evals.has(evalMeta.key),
|
|
903
934
|
scoreThresholds,
|
|
904
935
|
persistCaseDetail
|
|
@@ -930,7 +961,6 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
930
961
|
target: run.manifest.target,
|
|
931
962
|
caseRows: run.cases,
|
|
932
963
|
evalKey: evalMeta?.key ?? evalKey,
|
|
933
|
-
evalId: evalMeta?.id,
|
|
934
964
|
evalExists: evalMeta !== void 0
|
|
935
965
|
})) continue;
|
|
936
966
|
if (run.manifest.status === "running") continue;
|
|
@@ -959,7 +989,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
959
989
|
updated: false,
|
|
960
990
|
reason: "Case not found"
|
|
961
991
|
};
|
|
962
|
-
const evalMeta = evals.get(
|
|
992
|
+
const evalMeta = caseRow.evalKey === void 0 ? void 0 : evals.get(caseRow.evalKey);
|
|
963
993
|
if (!evalMeta) return {
|
|
964
994
|
updated: false,
|
|
965
995
|
reason: "Eval not found"
|
|
@@ -1170,6 +1200,10 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1170
1200
|
const cacheMode = request.cache?.mode ?? "use";
|
|
1171
1201
|
const runDir = join(localStateDir, "runs", runId);
|
|
1172
1202
|
const gitState = readGitWorktreeState(workspaceRoot);
|
|
1203
|
+
const targetEvalKeys = getTargetEvalKeys({
|
|
1204
|
+
request,
|
|
1205
|
+
sortedEvals: getSortedEvalMetas()
|
|
1206
|
+
});
|
|
1173
1207
|
const manifest = {
|
|
1174
1208
|
id: runId,
|
|
1175
1209
|
shortId,
|
|
@@ -1179,7 +1213,10 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1179
1213
|
endedAt: null,
|
|
1180
1214
|
commitSha: gitState.commitSha,
|
|
1181
1215
|
evalSourceFingerprints: {},
|
|
1182
|
-
target:
|
|
1216
|
+
target: buildPersistedRunTarget({
|
|
1217
|
+
target: request.target,
|
|
1218
|
+
evalKeys: targetEvalKeys
|
|
1219
|
+
}),
|
|
1183
1220
|
trials: request.trials,
|
|
1184
1221
|
trialSelection: config.trialSelection ?? "lowestScore",
|
|
1185
1222
|
cacheMode
|
|
@@ -1224,10 +1261,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1224
1261
|
runs.set(runId, runState);
|
|
1225
1262
|
setLatestRunInfoMap({
|
|
1226
1263
|
latestRunInfoMap,
|
|
1227
|
-
evalIds:
|
|
1228
|
-
request: materializedRequest,
|
|
1229
|
-
sortedEvals: getSortedEvalMetas()
|
|
1230
|
-
}),
|
|
1264
|
+
evalIds: targetEvalKeys,
|
|
1231
1265
|
info: {
|
|
1232
1266
|
status: "running",
|
|
1233
1267
|
startedAt: now,
|
|
@@ -1478,14 +1512,6 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
1478
1512
|
nextShortIdNum = Math.max(nextShortIdNum, nextShortIdFromSnapshots(persistedRuns));
|
|
1479
1513
|
if (changed) emitDiscoveryEvent();
|
|
1480
1514
|
}
|
|
1481
|
-
function toRunnerRunState(snapshot, existing) {
|
|
1482
|
-
return {
|
|
1483
|
-
...snapshot,
|
|
1484
|
-
listeners: existing?.listeners ?? /* @__PURE__ */ new Set(),
|
|
1485
|
-
childProcess: existing?.childProcess,
|
|
1486
|
-
childTerminalReceived: existing?.childTerminalReceived ?? false
|
|
1487
|
-
};
|
|
1488
|
-
}
|
|
1489
1515
|
return runner;
|
|
1490
1516
|
}
|
|
1491
1517
|
//#endregion
|
|
@@ -2046,8 +2072,8 @@ async function commandApp(args) {
|
|
|
2046
2072
|
const { serve } = await import("@hono/node-server");
|
|
2047
2073
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2048
2074
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2049
|
-
const appModule = await import("./app-mOYjX9zq.mjs");
|
|
2050
|
-
const runnerModule = await import("./runner-BYOdLBle.mjs");
|
|
2075
|
+
const appModule = await import("./app-DPamBr5R.mjs");
|
|
2076
|
+
const runnerModule = await import("./runner-BJQq7cpd.mjs");
|
|
2051
2077
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2052
2078
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2053
2079
|
await runnerModule.initRunner();
|