@ls-stack/agent-eval 0.60.1 → 0.60.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CxKEVlng.mjs → app-l3ynaNsb.mjs} +4 -4
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-CVBSlTD8.mjs → cli-BSVUCUxr.mjs} +34 -8
- package/dist/index.d.mts +110 -110
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-CjWJUUZ5.mjs → runExecution-Bq0Y3y_1.mjs} +2 -2
- package/dist/{runOrchestration-DE2TFAS6.mjs → runOrchestration-C7qQISz2.mjs} +41 -8
- package/dist/{runner-Cu1CQPTB.mjs → runner-C9xNJHt3.mjs} +1 -1
- package/dist/{runner-DzDRasWV.mjs → runner-DmkSq-QG.mjs} +2 -2
- package/dist/{src-DjOTPnDz.mjs → src-D5vGo2iv.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +2 -0
|
package/dist/{app-CxKEVlng.mjs → app-l3ynaNsb.mjs}
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { Et as getCaseRowCaseKey, Ot as caseRowSchema, dt as getEvalTitle, nt as updateManualScoreRequestSchema, rt as extractCacheEntries, tt as createRunRequestSchema } from "./runExecution-CjWJUUZ5.mjs";
|
|
2
|
-
import { o as stageManualInputFile } from "./cli-CVBSlTD8.mjs";
|
|
3
|
-
import "./src-DjOTPnDz.mjs";
|
|
4
|
-
import { t as getRunnerInstance } from "./runner-DzDRasWV.mjs";
|
|
1
|
+
import { Et as getCaseRowCaseKey, Ot as caseRowSchema, dt as getEvalTitle, nt as updateManualScoreRequestSchema, rt as extractCacheEntries, tt as createRunRequestSchema } from "./runExecution-Bq0Y3y_1.mjs";
|
|
2
|
+
import { o as stageManualInputFile } from "./cli-BSVUCUxr.mjs";
|
|
3
|
+
import "./src-D5vGo2iv.mjs";
|
|
4
|
+
import { t as getRunnerInstance } from "./runner-DmkSq-QG.mjs";
|
|
5
5
|
import { z } from "zod";
|
|
6
6
|
import { resultify } from "t-result";
|
|
7
7
|
import { readFile } from "node:fs/promises";
|
package/dist/bin.mjs
CHANGED
package/dist/caseChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Ct as resolveLlmCallsConfig, It as runWithEvalRegistry, J as runInEvalRuntimeScope, L as configureEvalRunLogs, St as resolveApiCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore, y as getCacheRetentionOptions } from "./runExecution-CjWJUUZ5.mjs";
|
|
1
|
+
import { Ct as resolveLlmCallsConfig, It as runWithEvalRegistry, J as runInEvalRuntimeScope, L as configureEvalRunLogs, St as resolveApiCallsConfig, _ as createBufferedCacheStore, a as isCaseChildParentMessage, d as loadEvalModule, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, r as runCase, v as createFsCacheStore, y as getCacheRetentionOptions } from "./runExecution-Bq0Y3y_1.mjs";
|
|
2
2
|
//#region ../runner/src/caseChild.ts
|
|
3
3
|
let fatalErrorReported = false;
|
|
4
4
|
let disconnectExpected = false;
|
|
package/dist/{cli-CVBSlTD8.mjs → cli-BSVUCUxr.mjs}
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Ct as resolveLlmCallsConfig, Et as getCaseRowCaseKey, Ft as getEvalRegistry, Ot as caseRowSchema, St as resolveApiCallsConfig, Tt as buildEvalKey, _t as matchesTagsFilter, c as resolveArtifactPath, dt as getEvalTitle, f as resolveEvalDefaultConfig, ft as getEvalDisplayStatus, h as normalizeScoreDef, lt as applyDerivedCallAttributes, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, pt as deriveScopedSummaryFromCases, s as resolveTracePresentation, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-CjWJUUZ5.mjs";
|
|
2
|
-
import { C as
|
|
1
|
+
import { Ct as resolveLlmCallsConfig, Et as getCaseRowCaseKey, Ft as getEvalRegistry, Ot as caseRowSchema, St as resolveApiCallsConfig, Tt as buildEvalKey, _t as matchesTagsFilter, c as resolveArtifactPath, dt as getEvalTitle, f as resolveEvalDefaultConfig, ft as getEvalDisplayStatus, h as normalizeScoreDef, lt as applyDerivedCallAttributes, m as buildDeclaredColumnDefs, o as stripTerminalControlCodes, p as loadConfig, pt as deriveScopedSummaryFromCases, s as resolveTracePresentation, v as createFsCacheStore, xt as runSummarySchema, y as getCacheRetentionOptions } from "./runExecution-Bq0Y3y_1.mjs";
|
|
2
|
+
import { C as parseEvalDiscovery, S as loadIsolatedEvalRegistry, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as parseManualInputValues, c as getLatestRunInfos, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, l as loadPersistedCaseDetail, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as generateRunId, p as persistCaseDetail, s as getLastRunStatuses, u as loadPersistedRunSnapshot, v as runTouchesEval, w as validateCharts, x as deriveEvalFreshness, y as buildManualInputDescriptor } from "./runOrchestration-C7qQISz2.mjs";
|
|
3
3
|
import { parseEnv } from "node:util";
|
|
4
4
|
import { resultify } from "t-result";
|
|
5
5
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
@@ -28,10 +28,14 @@ function resolveCaseDetailLookup(run, caseId) {
|
|
|
28
28
|
const caseDetail = run.caseDetails.get(lookupId);
|
|
29
29
|
if (caseDetail) return caseDetail;
|
|
30
30
|
}
|
|
31
|
-
const matchingCaseRow = run
|
|
31
|
+
const matchingCaseRow = resolveCaseRowForCaseDetailLookup(run, caseId);
|
|
32
32
|
if (matchingCaseRow === void 0) return void 0;
|
|
33
33
|
return run.caseDetails.get(getCaseRowCaseKey(matchingCaseRow));
|
|
34
34
|
}
|
|
35
|
+
function resolveCaseRowForCaseDetailLookup(run, caseId) {
|
|
36
|
+
const lookupIds = new Set(getCaseLookupIds(caseId));
|
|
37
|
+
return run.cases.find((caseRow) => lookupIds.has(getCaseRowCaseKey(caseRow)) || lookupIds.has(caseRow.caseId));
|
|
38
|
+
}
|
|
35
39
|
//#endregion
|
|
36
40
|
//#region ../runner/src/configReload.ts
|
|
37
41
|
/** Coordinates idle-only reloads for workspace config and `.env` in app mode. */
|
|
@@ -823,7 +827,7 @@ async function markRunTerminalFromChild(runState, event, managerContext) {
|
|
|
823
827
|
runState.manifest = snapshot.manifest;
|
|
824
828
|
runState.summary = snapshot.summary;
|
|
825
829
|
runState.cases = snapshot.cases;
|
|
826
|
-
runState.caseDetails =
|
|
830
|
+
runState.caseDetails = /* @__PURE__ */ new Map();
|
|
827
831
|
} else if (event.type === "run.finished") {
|
|
828
832
|
runState.manifest.status = "completed";
|
|
829
833
|
runState.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -995,6 +999,26 @@ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
|
|
|
995
999
|
if (typeof config.concurrency !== "number" || !Number.isFinite(config.concurrency)) return 1;
|
|
996
1000
|
return Math.max(1, Math.floor(config.concurrency));
|
|
997
1001
|
}
|
|
1002
|
+
function getCaseDetailFileId(run, caseRow) {
|
|
1003
|
+
const caseKey = getCaseRowCaseKey(caseRow);
|
|
1004
|
+
return run.cases.some((existing) => existing.caseId === caseRow.caseId && getCaseRowCaseKey(existing) !== caseKey) ? caseKey : caseRow.caseId;
|
|
1005
|
+
}
|
|
1006
|
+
function hydrateCaseDetailForRow(run, caseRow) {
|
|
1007
|
+
const caseKey = getCaseRowCaseKey(caseRow);
|
|
1008
|
+
const cached = run.caseDetails.get(caseKey);
|
|
1009
|
+
if (cached !== void 0) return cached;
|
|
1010
|
+
const detail = loadPersistedCaseDetail(run.runDir, getCaseDetailFileId(run, caseRow));
|
|
1011
|
+
if (detail === null) return void 0;
|
|
1012
|
+
run.caseDetails.set(detail.caseKey ?? detail.caseId, detail);
|
|
1013
|
+
return detail;
|
|
1014
|
+
}
|
|
1015
|
+
function hydrateCaseDetailForLookup(run, caseId) {
|
|
1016
|
+
const cached = resolveCaseDetailLookup(run, caseId);
|
|
1017
|
+
if (cached !== void 0) return cached;
|
|
1018
|
+
const caseRow = resolveCaseRowForCaseDetailLookup(run, caseId);
|
|
1019
|
+
if (caseRow === void 0) return void 0;
|
|
1020
|
+
return hydrateCaseDetailForRow(run, caseRow);
|
|
1021
|
+
}
|
|
998
1022
|
function nextRegistryLoadIsolationKey(prefix, filePath) {
|
|
999
1023
|
registryLoadCounter++;
|
|
1000
1024
|
return `${prefix}:${String(registryLoadCounter)}:${filePath}`;
|
|
@@ -1041,6 +1065,7 @@ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
|
|
|
1041
1065
|
evalKey: evalMeta.key,
|
|
1042
1066
|
evalExists: evals.has(evalMeta.key),
|
|
1043
1067
|
scoreThresholds,
|
|
1068
|
+
getCaseDetail: hydrateCaseDetailForRow,
|
|
1044
1069
|
persistCaseDetail
|
|
1045
1070
|
});
|
|
1046
1071
|
emitDiscoveryEvent();
|
|
@@ -1052,6 +1077,7 @@ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
|
|
|
1052
1077
|
updated: false,
|
|
1053
1078
|
reason: "Run not found"
|
|
1054
1079
|
};
|
|
1080
|
+
hydrateCaseDetailForLookup(run, caseId);
|
|
1055
1081
|
return recalculateDerivedAttributesForCase({
|
|
1056
1082
|
run,
|
|
1057
1083
|
caseId,
|
|
@@ -1107,7 +1133,7 @@ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
|
|
|
1107
1133
|
updated: false,
|
|
1108
1134
|
reason: "Manual score not found"
|
|
1109
1135
|
};
|
|
1110
|
-
const caseDetail = run
|
|
1136
|
+
const caseDetail = hydrateCaseDetailForRow(run, caseRow);
|
|
1111
1137
|
if (!caseDetail) return {
|
|
1112
1138
|
updated: false,
|
|
1113
1139
|
reason: "Case detail not found"
|
|
@@ -1480,7 +1506,7 @@ function createRunner({ watchForChanges = true, loadEnv = true } = {}) {
|
|
|
1480
1506
|
getCaseDetail(runId, caseId) {
|
|
1481
1507
|
const run = runs.get(runId);
|
|
1482
1508
|
if (!run) return void 0;
|
|
1483
|
-
return
|
|
1509
|
+
return hydrateCaseDetailForLookup(run, caseId);
|
|
1484
1510
|
},
|
|
1485
1511
|
subscribe(runId, listener) {
|
|
1486
1512
|
const run = runs.get(runId);
|
|
@@ -2232,8 +2258,8 @@ async function commandApp(args) {
|
|
|
2232
2258
|
const { serve } = await import("@hono/node-server");
|
|
2233
2259
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2234
2260
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2235
|
-
const appModule = await import("./app-CxKEVlng.mjs");
|
|
2236
|
-
const runnerModule = await import("./runner-Cu1CQPTB.mjs");
|
|
2261
|
+
const appModule = await import("./app-l3ynaNsb.mjs");
|
|
2262
|
+
const runnerModule = await import("./runner-C9xNJHt3.mjs");
|
|
2237
2263
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2238
2264
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2239
2265
|
await runnerModule.initRunner({ loadEnv: args.loadEnv });
|