@ls-stack/agent-eval 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DKWm1oxc.mjs → app-B8e-oWYc.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-9hO8NpgZ.js +117 -0
- package/dist/apps/web/dist/assets/{index-BVnLr79e.css → index-MARPw1bH.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-CMiCEQ-3.mjs → cli-BmrtjQj_.mjs} +155 -99
- package/dist/index.d.mts +40 -6
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +41 -3
- package/dist/{runOrchestration-H0pSUl3I.mjs → runOrchestration-BDyNrRQT.mjs} +271 -124
- package/dist/{runner-DLnj18MO.mjs → runner-CsZqhbiA.mjs} +2 -2
- package/dist/{runner-Dx1sMCbh.mjs → runner-DABFPXkx.mjs} +1 -1
- package/dist/src-CEAJYN_X.mjs +3 -0
- package/package.json +5 -4
- package/skills/agent-eval/SKILL.md +408 -0
- package/dist/apps/web/dist/assets/index-Cx2CvM6a.js +0 -117
- package/dist/src-BgGL7DDp.mjs +0 -3
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as
|
|
1
|
+
import { A as getEvalTitle, I as runSummarySchema, J as resolveApiCallsConfig, M as deriveScopedSummaryFromCases, S as createFsCacheStore, Y as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as buildDeclaredColumnDefs, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, hn as getEvalRegistry, i as getLatestRunInfos, j as getEvalDisplayStatus, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, x as normalizeScoreDef, y as loadConfig } from "./runOrchestration-BDyNrRQT.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -82,98 +82,6 @@ function validateCharts(params) {
|
|
|
82
82
|
};
|
|
83
83
|
}
|
|
84
84
|
//#endregion
|
|
85
|
-
//#region ../runner/src/discovery.ts
|
|
86
|
-
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
87
|
-
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
88
|
-
function parseEvalMetas(filePath, content) {
|
|
89
|
-
const metas = [];
|
|
90
|
-
let searchIndex = 0;
|
|
91
|
-
while (searchIndex < content.length) {
|
|
92
|
-
const defineEvalIndex = content.indexOf("defineEval", searchIndex);
|
|
93
|
-
if (defineEvalIndex === -1) break;
|
|
94
|
-
const extracted = extractDefineEvalObject(content, defineEvalIndex);
|
|
95
|
-
if (!extracted) {
|
|
96
|
-
searchIndex = defineEvalIndex + 10;
|
|
97
|
-
continue;
|
|
98
|
-
}
|
|
99
|
-
const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
|
|
100
|
-
if (id !== void 0) {
|
|
101
|
-
const result = {
|
|
102
|
-
filePath,
|
|
103
|
-
id
|
|
104
|
-
};
|
|
105
|
-
const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
|
|
106
|
-
if (title !== void 0) result.title = title;
|
|
107
|
-
metas.push(result);
|
|
108
|
-
}
|
|
109
|
-
searchIndex = extracted.nextIndex;
|
|
110
|
-
}
|
|
111
|
-
return metas;
|
|
112
|
-
}
|
|
113
|
-
function extractDefineEvalObject(content, defineEvalIndex) {
|
|
114
|
-
const openParenIndex = content.indexOf("(", defineEvalIndex);
|
|
115
|
-
if (openParenIndex === -1) return void 0;
|
|
116
|
-
const objectStartIndex = content.indexOf("{", openParenIndex);
|
|
117
|
-
if (objectStartIndex === -1) return void 0;
|
|
118
|
-
let depth = 0;
|
|
119
|
-
let quote;
|
|
120
|
-
let inBlockComment = false;
|
|
121
|
-
let inLineComment = false;
|
|
122
|
-
let isEscaped = false;
|
|
123
|
-
for (let index = objectStartIndex; index < content.length; index++) {
|
|
124
|
-
const currentChar = content[index];
|
|
125
|
-
const nextChar = content[index + 1];
|
|
126
|
-
if (inLineComment) {
|
|
127
|
-
if (currentChar === "\n") inLineComment = false;
|
|
128
|
-
continue;
|
|
129
|
-
}
|
|
130
|
-
if (inBlockComment) {
|
|
131
|
-
if (currentChar === "*" && nextChar === "/") {
|
|
132
|
-
inBlockComment = false;
|
|
133
|
-
index++;
|
|
134
|
-
}
|
|
135
|
-
continue;
|
|
136
|
-
}
|
|
137
|
-
if (quote) {
|
|
138
|
-
if (isEscaped) {
|
|
139
|
-
isEscaped = false;
|
|
140
|
-
continue;
|
|
141
|
-
}
|
|
142
|
-
if (currentChar === "\\") {
|
|
143
|
-
isEscaped = true;
|
|
144
|
-
continue;
|
|
145
|
-
}
|
|
146
|
-
if (currentChar === quote) quote = void 0;
|
|
147
|
-
continue;
|
|
148
|
-
}
|
|
149
|
-
if (currentChar === "/" && nextChar === "/") {
|
|
150
|
-
inLineComment = true;
|
|
151
|
-
index++;
|
|
152
|
-
continue;
|
|
153
|
-
}
|
|
154
|
-
if (currentChar === "/" && nextChar === "*") {
|
|
155
|
-
inBlockComment = true;
|
|
156
|
-
index++;
|
|
157
|
-
continue;
|
|
158
|
-
}
|
|
159
|
-
if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
|
|
160
|
-
quote = currentChar;
|
|
161
|
-
continue;
|
|
162
|
-
}
|
|
163
|
-
if (currentChar === "{") {
|
|
164
|
-
depth++;
|
|
165
|
-
continue;
|
|
166
|
-
}
|
|
167
|
-
if (currentChar === "}") {
|
|
168
|
-
depth--;
|
|
169
|
-
if (depth === 0) return {
|
|
170
|
-
nextIndex: index + 1,
|
|
171
|
-
objectText: content.slice(objectStartIndex, index + 1)
|
|
172
|
-
};
|
|
173
|
-
}
|
|
174
|
-
}
|
|
175
|
-
}
|
|
176
|
-
//#endregion
|
|
177
85
|
//#region ../runner/src/gitState.ts
|
|
178
86
|
function runGitCommand(workspaceRoot, args) {
|
|
179
87
|
const result = spawnSync("git", args, {
|
|
@@ -316,7 +224,10 @@ function upsertFinishedCase(runState, caseDetail, caseRow) {
|
|
|
316
224
|
function applyChildEvalMetas(evals, childMetas) {
|
|
317
225
|
for (const childMeta of childMetas) {
|
|
318
226
|
const evalMeta = evals.get(childMeta.id);
|
|
319
|
-
if (evalMeta === void 0)
|
|
227
|
+
if (evalMeta === void 0) {
|
|
228
|
+
evals.set(childMeta.id, childMeta);
|
|
229
|
+
continue;
|
|
230
|
+
}
|
|
320
231
|
evalMeta.columnDefs = childMeta.columnDefs;
|
|
321
232
|
evalMeta.caseCount = childMeta.caseCount;
|
|
322
233
|
evalMeta.stats = childMeta.stats;
|
|
@@ -728,8 +639,7 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
728
639
|
workspaceRoot,
|
|
729
640
|
runDir,
|
|
730
641
|
manifest,
|
|
731
|
-
summary,
|
|
732
|
-
evals: getSortedEvalMetas()
|
|
642
|
+
summary
|
|
733
643
|
};
|
|
734
644
|
await writeFile(join(runDir, "run-child-context.json"), JSON.stringify(childContext, null, 2));
|
|
735
645
|
startRunChild({
|
|
@@ -820,6 +730,9 @@ function createRunner({ watchForChanges = true } = {}) {
|
|
|
820
730
|
getWorkspaceRoot() {
|
|
821
731
|
return workspaceRoot;
|
|
822
732
|
},
|
|
733
|
+
getAllowCliRunAll() {
|
|
734
|
+
return config.allowCliRunAll === true;
|
|
735
|
+
},
|
|
823
736
|
getLlmCallsConfig() {
|
|
824
737
|
return llmCallsConfig;
|
|
825
738
|
},
|
|
@@ -900,6 +813,7 @@ function parseArgs(argv) {
|
|
|
900
813
|
const args = {
|
|
901
814
|
command: "help",
|
|
902
815
|
subcommand: void 0,
|
|
816
|
+
positionals: [],
|
|
903
817
|
showHelp: false,
|
|
904
818
|
helpTopic: "global",
|
|
905
819
|
unknownHelpTarget: void 0,
|
|
@@ -933,6 +847,7 @@ function parseArgs(argv) {
|
|
|
933
847
|
}
|
|
934
848
|
for (let i = cursor; i < normalizedArgv.length; i++) {
|
|
935
849
|
const arg = normalizedArgv[i];
|
|
850
|
+
if (arg === void 0) continue;
|
|
936
851
|
const next = normalizedArgv[i + 1];
|
|
937
852
|
if (arg === "--help" || arg === "-h") args.showHelp = true;
|
|
938
853
|
else if (arg === "--eval" && next) {
|
|
@@ -955,6 +870,7 @@ function parseArgs(argv) {
|
|
|
955
870
|
else if (arg === "--refresh-cache") args.cacheMode = "refresh";
|
|
956
871
|
else if (arg === "--clear-cache") args.clearCache = true;
|
|
957
872
|
else if (arg === "--all") args.all = true;
|
|
873
|
+
else if (!arg.startsWith("-")) args.positionals.push(arg);
|
|
958
874
|
}
|
|
959
875
|
return args;
|
|
960
876
|
}
|
|
@@ -988,6 +904,9 @@ async function runCli(argv) {
|
|
|
988
904
|
case "run":
|
|
989
905
|
await commandRun(args);
|
|
990
906
|
break;
|
|
907
|
+
case "show-runs":
|
|
908
|
+
await commandShowRuns(args);
|
|
909
|
+
break;
|
|
991
910
|
case "cache":
|
|
992
911
|
await commandCache(args);
|
|
993
912
|
break;
|
|
@@ -997,7 +916,7 @@ async function runCli(argv) {
|
|
|
997
916
|
}
|
|
998
917
|
}
|
|
999
918
|
function isCliCommand(command) {
|
|
1000
|
-
return command === "app" || command === "list" || command === "run" || command === "cache" || command === "help";
|
|
919
|
+
return command === "app" || command === "list" || command === "run" || command === "show-runs" || command === "cache" || command === "help";
|
|
1001
920
|
}
|
|
1002
921
|
function loadWorkspaceEnv() {
|
|
1003
922
|
const envPath = resolve(process.cwd(), ".env");
|
|
@@ -1059,8 +978,8 @@ async function commandApp(args) {
|
|
|
1059
978
|
const { serve } = await import("@hono/node-server");
|
|
1060
979
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1061
980
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1062
|
-
const appModule = await import("./app-DKWm1oxc.mjs");
|
|
1063
|
-
const runnerModule = await import("./runner-Dx1sMCbh.mjs");
|
|
981
|
+
const appModule = await import("./app-B8e-oWYc.mjs");
|
|
982
|
+
const runnerModule = await import("./runner-DABFPXkx.mjs");
|
|
1064
983
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1065
984
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1066
985
|
await runnerModule.initRunner();
|
|
@@ -1098,6 +1017,11 @@ async function commandList(args_) {
|
|
|
1098
1017
|
async function commandRun(args) {
|
|
1099
1018
|
const runner = createRunner({ watchForChanges: false });
|
|
1100
1019
|
await runner.init();
|
|
1020
|
+
if (args.evalIds.length === 0 && args.caseIds.length === 0 && !runner.getAllowCliRunAll()) {
|
|
1021
|
+
console.error("This workspace disables running all evals from the CLI. Pass --eval <id> or --case <id> to run a targeted subset.");
|
|
1022
|
+
process.exit(1);
|
|
1023
|
+
return;
|
|
1024
|
+
}
|
|
1101
1025
|
if (args.clearCache) {
|
|
1102
1026
|
await runner.clearCache();
|
|
1103
1027
|
if (!args.json) {
|
|
@@ -1144,6 +1068,32 @@ async function commandRun(args) {
|
|
|
1144
1068
|
}
|
|
1145
1069
|
if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
|
|
1146
1070
|
}
|
|
1071
|
+
async function commandShowRuns(args) {
|
|
1072
|
+
const runner = createRunner({ watchForChanges: false });
|
|
1073
|
+
await runner.init();
|
|
1074
|
+
const runRef = args.positionals[0];
|
|
1075
|
+
if (runRef !== void 0) {
|
|
1076
|
+
const run = resolveRunSnapshot(runner, runRef);
|
|
1077
|
+
if (!run) {
|
|
1078
|
+
printMissingRun(runRef);
|
|
1079
|
+
process.exit(1);
|
|
1080
|
+
return;
|
|
1081
|
+
}
|
|
1082
|
+
const index = buildRunFileIndex(runner.getWorkspaceRoot(), run);
|
|
1083
|
+
if (args.json) {
|
|
1084
|
+
printJson(index);
|
|
1085
|
+
return;
|
|
1086
|
+
}
|
|
1087
|
+
printRunFileIndex(index);
|
|
1088
|
+
return;
|
|
1089
|
+
}
|
|
1090
|
+
const indexes = getSortedRunSnapshots(runner).map((run) => buildRunFileIndex(runner.getWorkspaceRoot(), run));
|
|
1091
|
+
if (args.json) {
|
|
1092
|
+
printJson(indexes);
|
|
1093
|
+
return;
|
|
1094
|
+
}
|
|
1095
|
+
printRunFileIndexes(indexes);
|
|
1096
|
+
}
|
|
1147
1097
|
async function commandCache(args) {
|
|
1148
1098
|
const runner = createRunner({ watchForChanges: false });
|
|
1149
1099
|
await runner.init();
|
|
@@ -1194,6 +1144,93 @@ async function commandCache(args) {
|
|
|
1194
1144
|
}
|
|
1195
1145
|
printHelp(args.helpTopic);
|
|
1196
1146
|
}
|
|
1147
|
+
function getSortedRunSnapshots(runner) {
|
|
1148
|
+
return runner.getRuns().toSorted((a, b) => getRunStartTime(a) - getRunStartTime(b)).map((manifest) => runner.getRun(manifest.id)).filter((run) => run !== void 0);
|
|
1149
|
+
}
|
|
1150
|
+
function buildRunFileIndex(workspaceRoot, run) {
|
|
1151
|
+
const runDir = join(workspaceRoot, ".agent-evals", "runs", run.manifest.id);
|
|
1152
|
+
return {
|
|
1153
|
+
id: run.manifest.id,
|
|
1154
|
+
shortId: run.manifest.shortId,
|
|
1155
|
+
status: run.manifest.status,
|
|
1156
|
+
startedAt: run.manifest.startedAt,
|
|
1157
|
+
endedAt: run.manifest.endedAt,
|
|
1158
|
+
target: run.manifest.target,
|
|
1159
|
+
summary: run.summary,
|
|
1160
|
+
files: {
|
|
1161
|
+
dir: runDir,
|
|
1162
|
+
run: join(runDir, "run.json"),
|
|
1163
|
+
summary: join(runDir, "summary.json"),
|
|
1164
|
+
cases: join(runDir, "cases.jsonl"),
|
|
1165
|
+
caseDetailsDir: join(runDir, "case-details"),
|
|
1166
|
+
tracesDir: join(runDir, "traces")
|
|
1167
|
+
},
|
|
1168
|
+
cases: run.cases.map((caseRow) => {
|
|
1169
|
+
const fileName = `${encodeURIComponent(caseRow.caseId)}.json`;
|
|
1170
|
+
return {
|
|
1171
|
+
caseId: caseRow.caseId,
|
|
1172
|
+
evalId: caseRow.evalId,
|
|
1173
|
+
status: caseRow.status,
|
|
1174
|
+
files: {
|
|
1175
|
+
caseDetail: join(runDir, "case-details", fileName),
|
|
1176
|
+
trace: join(runDir, "traces", fileName)
|
|
1177
|
+
}
|
|
1178
|
+
};
|
|
1179
|
+
})
|
|
1180
|
+
};
|
|
1181
|
+
}
|
|
1182
|
+
function resolveRunSnapshot(runner, runRef) {
|
|
1183
|
+
const runs = getSortedRunSnapshots(runner);
|
|
1184
|
+
if (runs.length === 0) return void 0;
|
|
1185
|
+
if (runRef === void 0 || runRef === "latest") return runs[runs.length - 1];
|
|
1186
|
+
return runs.find((run) => run.manifest.id === runRef || run.manifest.shortId === runRef);
|
|
1187
|
+
}
|
|
1188
|
+
function printMissingRun(runRef) {
|
|
1189
|
+
console.error(runRef === void 0 ? "No saved runs found." : `No saved run found for "${runRef}".`);
|
|
1190
|
+
}
|
|
1191
|
+
function getRunStartTime(manifest) {
|
|
1192
|
+
const parsed = new Date(manifest.startedAt).getTime();
|
|
1193
|
+
return Number.isFinite(parsed) ? parsed : 0;
|
|
1194
|
+
}
|
|
1195
|
+
function printJson(value) {
|
|
1196
|
+
console.info(JSON.stringify(value, null, 2));
|
|
1197
|
+
}
|
|
1198
|
+
function printRunFileIndexes(indexes) {
|
|
1199
|
+
if (indexes.length === 0) {
|
|
1200
|
+
console.info("No saved runs.");
|
|
1201
|
+
return;
|
|
1202
|
+
}
|
|
1203
|
+
console.info(`Saved runs (${String(indexes.length)}):\n`);
|
|
1204
|
+
for (const index of indexes) {
|
|
1205
|
+
printRunFileIndex(index);
|
|
1206
|
+
console.info("");
|
|
1207
|
+
}
|
|
1208
|
+
}
|
|
1209
|
+
function printRunFileIndex(index) {
|
|
1210
|
+
console.info(`${index.shortId} (${index.id}) ${index.status} ${formatCaseCounts(index.summary)}`);
|
|
1211
|
+
console.info(` dir: ${index.files.dir}`);
|
|
1212
|
+
console.info(` run: ${index.files.run}`);
|
|
1213
|
+
console.info(` summary: ${index.files.summary}`);
|
|
1214
|
+
console.info(` cases: ${index.files.cases}`);
|
|
1215
|
+
console.info(` case details: ${index.files.caseDetailsDir}`);
|
|
1216
|
+
console.info(` traces: ${index.files.tracesDir}`);
|
|
1217
|
+
if (index.cases.length === 0) return;
|
|
1218
|
+
console.info(" case files:");
|
|
1219
|
+
for (const caseEntry of index.cases) {
|
|
1220
|
+
console.info(` ${caseEntry.caseId} [${caseEntry.evalId}] ${caseEntry.status}`);
|
|
1221
|
+
console.info(` detail: ${caseEntry.files.caseDetail}`);
|
|
1222
|
+
console.info(` trace: ${caseEntry.files.trace}`);
|
|
1223
|
+
}
|
|
1224
|
+
}
|
|
1225
|
+
function formatCaseCounts(summary) {
|
|
1226
|
+
return [
|
|
1227
|
+
`${String(summary.totalCases)} total`,
|
|
1228
|
+
`${String(summary.passedCases)} passed`,
|
|
1229
|
+
`${String(summary.failedCases)} failed`,
|
|
1230
|
+
`${String(summary.errorCases)} errors`,
|
|
1231
|
+
`${String(summary.cancelledCases)} cancelled`
|
|
1232
|
+
].join(", ");
|
|
1233
|
+
}
|
|
1197
1234
|
async function waitForRunCompletion(runner, runId) {
|
|
1198
1235
|
return new Promise((resolvePromise) => {
|
|
1199
1236
|
const check = () => {
|
|
@@ -1255,6 +1292,24 @@ Flags:
|
|
|
1255
1292
|
--clear-cache Clear the cache before starting the run
|
|
1256
1293
|
--no-env Disable automatic .env loading
|
|
1257
1294
|
--help, -h Show this help
|
|
1295
|
+
`);
|
|
1296
|
+
return;
|
|
1297
|
+
}
|
|
1298
|
+
if (topic === "show-runs") {
|
|
1299
|
+
console.info(`
|
|
1300
|
+
agent-evals show-runs - Show saved run artifact file paths
|
|
1301
|
+
|
|
1302
|
+
Usage:
|
|
1303
|
+
agent-evals show-runs [<run-id>|latest] [--json]
|
|
1304
|
+
|
|
1305
|
+
Prints the run directory and stable artifact paths for run.json, summary.json,
|
|
1306
|
+
cases.jsonl, case detail JSON, and trace JSON files. Run ids can be full
|
|
1307
|
+
timestamp ids, short ids such as r0, or latest.
|
|
1308
|
+
|
|
1309
|
+
Flags:
|
|
1310
|
+
--json Output the file index as JSON
|
|
1311
|
+
--no-env Disable automatic .env loading
|
|
1312
|
+
--help, -h Show this help
|
|
1258
1313
|
`);
|
|
1259
1314
|
return;
|
|
1260
1315
|
}
|
|
@@ -1283,6 +1338,7 @@ Commands:
|
|
|
1283
1338
|
app Start server with UI
|
|
1284
1339
|
list List discovered evals
|
|
1285
1340
|
run Run evals
|
|
1341
|
+
show-runs [id|latest] Show saved run artifact file paths
|
|
1286
1342
|
cache list List cached operation entries
|
|
1287
1343
|
cache clear --eval <id> Clear cache entries for one eval
|
|
1288
1344
|
cache clear --all Clear every cached entry
|
package/dist/index.d.mts
CHANGED
|
@@ -1674,6 +1674,12 @@ type AgentEvalsConfig = {
|
|
|
1674
1674
|
* considered outdated. Defaults to `14`.
|
|
1675
1675
|
*/
|
|
1676
1676
|
staleAfterDays?: number;
|
|
1677
|
+
/**
|
|
1678
|
+
* Whether `agent-evals run` may run every discovered eval when no `--eval`
|
|
1679
|
+
* or `--case` filter is provided. Defaults to `false`; set to `true` to
|
|
1680
|
+
* opt into unfiltered CLI runs. Grouped runs in the UI are still allowed.
|
|
1681
|
+
*/
|
|
1682
|
+
allowCliRunAll?: boolean;
|
|
1677
1683
|
/**
|
|
1678
1684
|
* Global trace attribute display config for the UI.
|
|
1679
1685
|
*
|
|
@@ -1762,6 +1768,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
1762
1768
|
}>>;
|
|
1763
1769
|
concurrency: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1764
1770
|
staleAfterDays: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1771
|
+
allowCliRunAll: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
1765
1772
|
traceDisplay: z$1.ZodOptional<z$1.ZodObject<{
|
|
1766
1773
|
attributes: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1767
1774
|
key: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -2720,6 +2727,14 @@ type EvalCaseScope = {
|
|
|
2720
2727
|
caseCacheRefs: TraceCacheRef[]; /** Background promises that should settle before the case scope finalizes. */
|
|
2721
2728
|
pendingBackgroundJobs: Set<Promise<unknown>>;
|
|
2722
2729
|
};
|
|
2730
|
+
/**
|
|
2731
|
+
* Runtime phase currently owned by the eval runner.
|
|
2732
|
+
*
|
|
2733
|
+
* `null` means the current async execution is outside an eval run. `env`
|
|
2734
|
+
* covers run-time module/environment loading, including top-level code in
|
|
2735
|
+
* modules imported while a run is being prepared.
|
|
2736
|
+
*/
|
|
2737
|
+
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
|
|
2723
2738
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
2724
2739
|
declare class EvalAssertionError extends Error {
|
|
2725
2740
|
constructor(message: string);
|
|
@@ -2727,12 +2742,14 @@ declare class EvalAssertionError extends Error {
|
|
|
2727
2742
|
/** Return the current eval scope for the active async context, if any. */
|
|
2728
2743
|
declare function getCurrentScope(): EvalCaseScope | undefined;
|
|
2729
2744
|
/**
|
|
2730
|
-
* Return
|
|
2745
|
+
* Return the current eval runner phase for this async execution.
|
|
2731
2746
|
*
|
|
2732
|
-
*
|
|
2733
|
-
*
|
|
2747
|
+
* Returns `null` outside eval-owned work, `env` while the runner is loading
|
|
2748
|
+
* eval modules for a run, `cases` while generating cases, `eval` while running
|
|
2749
|
+
* case `execute`, `derive` while deriving outputs from traces, `outputsSchema`
|
|
2750
|
+
* while validating outputs, and `scorer` while computing scores.
|
|
2734
2751
|
*/
|
|
2735
|
-
declare function isInEvalScope():
|
|
2752
|
+
declare function isInEvalScope(): EvalRuntimeScope | null;
|
|
2736
2753
|
/**
|
|
2737
2754
|
* Register background work that should settle before eval finalization.
|
|
2738
2755
|
*
|
|
@@ -2762,8 +2779,18 @@ type RunInEvalScopeOptions = {
|
|
|
2762
2779
|
/** Authored input for the active eval case. */input?: unknown; /** Stable prefix used when generating scoped IDs with `nextEvalId()`. */
|
|
2763
2780
|
idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
|
|
2764
2781
|
cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
|
|
2765
|
-
waitForBackgroundJobs?: boolean;
|
|
2782
|
+
waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
|
|
2783
|
+
runtimeScope?: EvalRuntimeScope;
|
|
2766
2784
|
};
|
|
2785
|
+
/** Execute a callback while `isInEvalScope()` reports a runner phase. */
|
|
2786
|
+
declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
|
|
2787
|
+
/**
|
|
2788
|
+
* Execute a callback with an existing case scope and a specific runner phase.
|
|
2789
|
+
*
|
|
2790
|
+
* Runner-internal helper for post-execute phases that still need access to the
|
|
2791
|
+
* completed case scope through output, trace, assertion, and input helpers.
|
|
2792
|
+
*/
|
|
2793
|
+
declare function runInExistingEvalScope<T>(scope: EvalCaseScope, runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
|
|
2767
2794
|
/**
|
|
2768
2795
|
* Execute a callback inside a fresh eval case scope and capture its outputs,
|
|
2769
2796
|
* trace data, and terminal error state.
|
|
@@ -3058,6 +3085,13 @@ type EvalRunner = {
|
|
|
3058
3085
|
subscribeDiscovery(listener: (event: SseEnvelope) => void): () => void; /** Stop background filesystem watchers owned by this runner instance. */
|
|
3059
3086
|
close(): Promise<void>; /** Resolve the workspace root backing this runner instance. */
|
|
3060
3087
|
getWorkspaceRoot(): string;
|
|
3088
|
+
/**
|
|
3089
|
+
* Return whether the current workspace allows an unfiltered CLI run.
|
|
3090
|
+
*
|
|
3091
|
+
* `false` means `agent-evals run` must include `--eval` or `--case`.
|
|
3092
|
+
* Programmatic/server runs are intentionally unaffected.
|
|
3093
|
+
*/
|
|
3094
|
+
getAllowCliRunAll(): boolean;
|
|
3061
3095
|
/**
|
|
3062
3096
|
* Resolved LLM-calls config used by the UI to derive the LLM calls tab.
|
|
3063
3097
|
*
|
|
@@ -3135,4 +3169,4 @@ declare function createRunner({
|
|
|
3135
3169
|
*/
|
|
3136
3170
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3137
3171
|
//#endregion
|
|
3138
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, 
type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, 
runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3172
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type 
ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, 
runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-
|
|
3
|
-
import "./src-
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, 
traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as 
traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-BDyNrRQT.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-BmrtjQj_.mjs";
|
|
3
|
+
import "./src-CEAJYN_X.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, 
traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
import { F as
|
|
1
|
+
import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-BDyNrRQT.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
|
+
import { relative } from "node:path";
|
|
4
5
|
import { z } from "zod/v4";
|
|
6
|
+
import { glob } from "glob";
|
|
5
7
|
//#region ../runner/src/runChild.ts
|
|
6
8
|
const evalMetaSchema = z.object({
|
|
7
9
|
id: z.string(),
|
|
@@ -20,7 +22,7 @@ const runChildContextSchema = z.object({
|
|
|
20
22
|
runDir: z.string(),
|
|
21
23
|
manifest: runManifestSchema,
|
|
22
24
|
summary: runSummarySchema,
|
|
23
|
-
evals: z.array(evalMetaSchema)
|
|
25
|
+
evals: z.array(evalMetaSchema).optional()
|
|
24
26
|
});
|
|
25
27
|
function sendMessage(message) {
|
|
26
28
|
if (process.send === void 0) return;
|
|
@@ -37,6 +39,38 @@ function getTargetEvals(params) {
|
|
|
37
39
|
if (params.request.target.evalIds && params.request.target.evalIds.length > 0) return params.request.target.evalIds.map((id) => params.evals.get(id)).filter((entry) => entry !== void 0);
|
|
38
40
|
return [...params.evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
39
41
|
}
|
|
42
|
+
/**
 * Express a file path relative to the workspace root, written with "/" separators.
 *
 * @param {{ workspaceRoot: string, filePath: string }} params
 *   `workspaceRoot` is the base directory; `filePath` is the path to relativize.
 * @returns {string} The relative path with every backslash replaced by a forward slash.
 */
function toWorkspaceRelativePath(params) {
	const { workspaceRoot, filePath } = params;
	const relativePath = relative(workspaceRoot, filePath);
	// Every backslash is rewritten, so the result looks the same whichever
	// separator `relative` produced.
	return relativePath.split("\\").join("/");
}
|
|
45
|
+
/**
 * Discover the evals declared in files matched by the configured include globs.
 *
 * Every include pattern is globbed relative to the workspace root (absolute
 * results). Each matched file is read as UTF-8, fingerprinted, and parsed for
 * eval metadata. Entries are keyed by eval id, so when an id is produced more
 * than once the entry processed last replaces the earlier one.
 *
 * @param {{ config: { include: Iterable<string> }, workspaceRoot: string }} params
 *   `config.include` holds the glob patterns; `workspaceRoot` is the glob cwd
 *   and the base for the workspace-relative `filePath` of each entry.
 * @returns {Promise<object[]>} Eval entries (`id`, `title`, `filePath`,
 *   `sourceFilePath`, `sourceFingerprint`, empty `columnDefs`, `caseCount: null`)
 *   sorted by `filePath` using `localeCompare`.
 */
async function discoverRunEvals(params) {
	const { config, workspaceRoot } = params;
	// The patterns are independent of each other, so glob them concurrently.
	// Promise.all returns results in pattern order, which keeps the processing
	// order (and therefore which duplicate id wins) the same as a serial loop.
	const matchesByPattern = await Promise.all(Array.from(config.include, (pattern) => glob(pattern, {
		cwd: workspaceRoot,
		absolute: true
	})));
	const evals = /* @__PURE__ */ new Map();
	// Overlapping include patterns can match the same file several times; read,
	// hash and parse each file once and reuse the result for later occurrences.
	const entriesByFile = /* @__PURE__ */ new Map();
	for (const filePath of matchesByPattern.flat()) {
		let entries = entriesByFile.get(filePath);
		if (entries === void 0) {
			const source = await readFile(filePath, "utf-8");
			const sourceFingerprint = getSourceFingerprint(source);
			entries = Array.from(parseEvalMetas(filePath, source), (meta) => ({
				id: meta.id,
				title: meta.title,
				filePath: toWorkspaceRelativePath({
					filePath: meta.filePath,
					workspaceRoot
				}),
				sourceFilePath: meta.filePath,
				sourceFingerprint,
				columnDefs: [],
				caseCount: null
			}));
			entriesByFile.set(filePath, entries);
		}
		// Re-apply on every occurrence so a later match still overrides an entry
		// with the same id that came from another file.
		for (const entry of entries) evals.set(entry.id, entry);
	}
	return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
}
|
|
40
74
|
async function readContext(contextPath) {
|
|
41
75
|
if (contextPath === void 0) throw new Error("Missing run child context path");
|
|
42
76
|
return runChildContextSchema.parse(JSON.parse(await readFile(contextPath, "utf-8")));
|
|
@@ -54,7 +88,11 @@ async function main() {
|
|
|
54
88
|
maxEntriesPerNamespace: config.cache?.maxEntriesPerNamespace ?? config.cache?.maxEntriesPerEval,
|
|
55
89
|
maxEntriesByNamespace: config.cache?.maxEntriesByNamespace
|
|
56
90
|
});
|
|
57
|
-
const
|
|
91
|
+
const evalMetas = await discoverRunEvals({
|
|
92
|
+
config,
|
|
93
|
+
workspaceRoot: context.workspaceRoot
|
|
94
|
+
});
|
|
95
|
+
const evals = new Map(evalMetas.map((evalMeta) => [evalMeta.id, evalMeta]));
|
|
58
96
|
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
59
97
|
const latestRunInfoMap = /* @__PURE__ */ new Map();
|
|
60
98
|
await executeRun({
|