@ls-stack/agent-eval 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-7qDBq_ub.mjs → app-TjV5nDMM.mjs} +5 -5
- package/dist/apps/web/dist/assets/{index-CdxG9-O-.css → index-ClE28i5w.css} +1 -1
- package/dist/apps/web/dist/assets/index-gGumCEnD.js +112 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +39 -6
- package/dist/cli-BTtgQLjB.mjs +1285 -0
- package/dist/index.d.mts +2 -3
- package/dist/index.mjs +3 -2
- package/dist/runChild.d.mts +1 -0
- package/dist/runChild.mjs +107 -0
- package/dist/{cli-C-n-Fd4o.mjs → runOrchestration-HaMahl6b.mjs} +394 -1352
- package/dist/{runner-uzzY8kk1.mjs → runner-CBDZos0Z.mjs} +1 -1
- package/dist/{runner-CwEtnUFf.mjs → runner-DGVoOyJt.mjs} +2 -2
- package/dist/src-Bt5Fz9HS.mjs +3 -0
- package/package.json +5 -4
- package/dist/apps/web/dist/assets/index-J1yKYGfN.js +0 -112
- package/dist/src-Dy31CPXH.mjs +0 -2
|
@@ -1,16 +1,14 @@
|
|
|
1
|
+
import { createRequire, registerHooks } from "node:module";
|
|
1
2
|
import { createHash } from "node:crypto";
|
|
2
3
|
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
3
|
-
import {
|
|
4
|
+
import { extname, isAbsolute, join, relative, resolve } from "node:path";
|
|
4
5
|
import { z, z as z$1 } from "zod/v4";
|
|
5
6
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
6
7
|
import { Buffer as Buffer$1 } from "node:buffer";
|
|
7
8
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
8
|
-
import { watch } from "chokidar";
|
|
9
|
-
import { glob } from "glob";
|
|
10
9
|
import { existsSync } from "node:fs";
|
|
11
10
|
import { resultify } from "t-result";
|
|
12
11
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
13
|
-
import { spawn, spawnSync } from "node:child_process";
|
|
14
12
|
//#region ../sdk/src/defineEval.ts
|
|
15
13
|
const evalRegistry = /* @__PURE__ */ new Map();
|
|
16
14
|
/** Return the in-memory registry of evals defined in the current process. */
|
|
@@ -2230,80 +2228,6 @@ function safeJsonParse(text) {
|
|
|
2230
2228
|
return parsed.value;
|
|
2231
2229
|
}
|
|
2232
2230
|
//#endregion
|
|
2233
|
-
//#region ../runner/src/chartValidation.ts
|
|
2234
|
-
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
|
|
2235
|
-
const columnDef = columnsByKey.get(metric.key);
|
|
2236
|
-
if (!columnDef) {
|
|
2237
|
-
warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
|
|
2238
|
-
return false;
|
|
2239
|
-
}
|
|
2240
|
-
if (metric.aggregate === "passThresholdRate") {
|
|
2241
|
-
if (columnDef.isScore !== true || typeof columnDef.passThreshold !== "number") {
|
|
2242
|
-
warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
|
|
2243
|
-
return false;
|
|
2244
|
-
}
|
|
2245
|
-
}
|
|
2246
|
-
return true;
|
|
2247
|
-
}
|
|
2248
|
-
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
|
|
2249
|
-
const columnDef = columnsByKey.get(extra.key);
|
|
2250
|
-
if (!columnDef) {
|
|
2251
|
-
warnings.push(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
|
|
2252
|
-
return false;
|
|
2253
|
-
}
|
|
2254
|
-
if (extra.aggregate === "passThresholdRate") {
|
|
2255
|
-
if (columnDef.isScore !== true || typeof columnDef.passThreshold !== "number") {
|
|
2256
|
-
warnings.push(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
|
|
2257
|
-
return false;
|
|
2258
|
-
}
|
|
2259
|
-
}
|
|
2260
|
-
return true;
|
|
2261
|
-
}
|
|
2262
|
-
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
|
|
2263
|
-
const metrics = chart.metrics.filter((metric) => {
|
|
2264
|
-
if (metric.source === "builtin") return true;
|
|
2265
|
-
return isValidColumnMetric(metric, columnsByKey, evalId, warnings);
|
|
2266
|
-
});
|
|
2267
|
-
if (metrics.length === 0) {
|
|
2268
|
-
warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
|
|
2269
|
-
return null;
|
|
2270
|
-
}
|
|
2271
|
-
const tooltipExtras = chart.tooltipExtras?.filter((extra) => {
|
|
2272
|
-
if (extra.source === "builtin") return true;
|
|
2273
|
-
return isValidTooltipExtra(extra, columnsByKey, evalId, warnings);
|
|
2274
|
-
});
|
|
2275
|
-
return {
|
|
2276
|
-
...chart,
|
|
2277
|
-
metrics,
|
|
2278
|
-
tooltipExtras: tooltipExtras?.length ? tooltipExtras : void 0
|
|
2279
|
-
};
|
|
2280
|
-
}
|
|
2281
|
-
/**
|
|
2282
|
-
* Validate and sanitize an authored `charts` config against the eval's
|
|
2283
|
-
* declared columns. Drops metrics/extras that reference unknown columns or
|
|
2284
|
-
* misuse `passThresholdRate`, and drops entire charts whose metrics are all
|
|
2285
|
-
* invalid. Returns `charts: undefined` when nothing valid remains so the UI
|
|
2286
|
-
* falls back to rendering no chart (matching the opt-in default).
|
|
2287
|
-
*/
|
|
2288
|
-
function validateCharts(params) {
|
|
2289
|
-
const { charts, columnDefs, evalId } = params;
|
|
2290
|
-
if (!charts || charts.length === 0) return {
|
|
2291
|
-
charts: void 0,
|
|
2292
|
-
warnings: []
|
|
2293
|
-
};
|
|
2294
|
-
const columnsByKey = new Map(columnDefs.map((def) => [def.key, def]));
|
|
2295
|
-
const warnings = [];
|
|
2296
|
-
const sanitized = [];
|
|
2297
|
-
for (const chart of charts) {
|
|
2298
|
-
const result = sanitizeChart(chart, columnsByKey, evalId, warnings);
|
|
2299
|
-
if (result) sanitized.push(result);
|
|
2300
|
-
}
|
|
2301
|
-
return {
|
|
2302
|
-
charts: sanitized.length > 0 ? sanitized : void 0,
|
|
2303
|
-
warnings
|
|
2304
|
-
};
|
|
2305
|
-
}
|
|
2306
|
-
//#endregion
|
|
2307
2231
|
//#region ../runner/src/columnBuilder.ts
|
|
2308
2232
|
/**
|
|
2309
2233
|
* Normalize a user-provided score definition (either a function or an
|
|
@@ -2513,98 +2437,6 @@ async function loadConfig() {
|
|
|
2513
2437
|
}
|
|
2514
2438
|
}
|
|
2515
2439
|
//#endregion
|
|
2516
|
-
//#region ../runner/src/discovery.ts
|
|
2517
|
-
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
|
|
2518
|
-
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
|
|
2519
|
-
function parseEvalMetas(filePath, content) {
|
|
2520
|
-
const metas = [];
|
|
2521
|
-
let searchIndex = 0;
|
|
2522
|
-
while (searchIndex < content.length) {
|
|
2523
|
-
const defineEvalIndex = content.indexOf("defineEval", searchIndex);
|
|
2524
|
-
if (defineEvalIndex === -1) break;
|
|
2525
|
-
const extracted = extractDefineEvalObject(content, defineEvalIndex);
|
|
2526
|
-
if (!extracted) {
|
|
2527
|
-
searchIndex = defineEvalIndex + 10;
|
|
2528
|
-
continue;
|
|
2529
|
-
}
|
|
2530
|
-
const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
|
|
2531
|
-
if (id !== void 0) {
|
|
2532
|
-
const result = {
|
|
2533
|
-
filePath,
|
|
2534
|
-
id
|
|
2535
|
-
};
|
|
2536
|
-
const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
|
|
2537
|
-
if (title !== void 0) result.title = title;
|
|
2538
|
-
metas.push(result);
|
|
2539
|
-
}
|
|
2540
|
-
searchIndex = extracted.nextIndex;
|
|
2541
|
-
}
|
|
2542
|
-
return metas;
|
|
2543
|
-
}
|
|
2544
|
-
function extractDefineEvalObject(content, defineEvalIndex) {
|
|
2545
|
-
const openParenIndex = content.indexOf("(", defineEvalIndex);
|
|
2546
|
-
if (openParenIndex === -1) return void 0;
|
|
2547
|
-
const objectStartIndex = content.indexOf("{", openParenIndex);
|
|
2548
|
-
if (objectStartIndex === -1) return void 0;
|
|
2549
|
-
let depth = 0;
|
|
2550
|
-
let quote;
|
|
2551
|
-
let inBlockComment = false;
|
|
2552
|
-
let inLineComment = false;
|
|
2553
|
-
let isEscaped = false;
|
|
2554
|
-
for (let index = objectStartIndex; index < content.length; index++) {
|
|
2555
|
-
const currentChar = content[index];
|
|
2556
|
-
const nextChar = content[index + 1];
|
|
2557
|
-
if (inLineComment) {
|
|
2558
|
-
if (currentChar === "\n") inLineComment = false;
|
|
2559
|
-
continue;
|
|
2560
|
-
}
|
|
2561
|
-
if (inBlockComment) {
|
|
2562
|
-
if (currentChar === "*" && nextChar === "/") {
|
|
2563
|
-
inBlockComment = false;
|
|
2564
|
-
index++;
|
|
2565
|
-
}
|
|
2566
|
-
continue;
|
|
2567
|
-
}
|
|
2568
|
-
if (quote) {
|
|
2569
|
-
if (isEscaped) {
|
|
2570
|
-
isEscaped = false;
|
|
2571
|
-
continue;
|
|
2572
|
-
}
|
|
2573
|
-
if (currentChar === "\\") {
|
|
2574
|
-
isEscaped = true;
|
|
2575
|
-
continue;
|
|
2576
|
-
}
|
|
2577
|
-
if (currentChar === quote) quote = void 0;
|
|
2578
|
-
continue;
|
|
2579
|
-
}
|
|
2580
|
-
if (currentChar === "/" && nextChar === "/") {
|
|
2581
|
-
inLineComment = true;
|
|
2582
|
-
index++;
|
|
2583
|
-
continue;
|
|
2584
|
-
}
|
|
2585
|
-
if (currentChar === "/" && nextChar === "*") {
|
|
2586
|
-
inBlockComment = true;
|
|
2587
|
-
index++;
|
|
2588
|
-
continue;
|
|
2589
|
-
}
|
|
2590
|
-
if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
|
|
2591
|
-
quote = currentChar;
|
|
2592
|
-
continue;
|
|
2593
|
-
}
|
|
2594
|
-
if (currentChar === "{") {
|
|
2595
|
-
depth++;
|
|
2596
|
-
continue;
|
|
2597
|
-
}
|
|
2598
|
-
if (currentChar === "}") {
|
|
2599
|
-
depth--;
|
|
2600
|
-
if (depth === 0) return {
|
|
2601
|
-
nextIndex: index + 1,
|
|
2602
|
-
objectText: content.slice(objectStartIndex, index + 1)
|
|
2603
|
-
};
|
|
2604
|
-
}
|
|
2605
|
-
}
|
|
2606
|
-
}
|
|
2607
|
-
//#endregion
|
|
2608
2440
|
//#region ../runner/src/evalModuleLoader.ts
|
|
2609
2441
|
/**
|
|
2610
2442
|
* Import one eval module with a cache key derived from its current source so
|
|
@@ -2692,30 +2524,6 @@ function setLatestRunInfoMap(params) {
|
|
|
2692
2524
|
for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
|
|
2693
2525
|
}
|
|
2694
2526
|
//#endregion
|
|
2695
|
-
//#region ../runner/src/gitState.ts
|
|
2696
|
-
function runGitCommand(workspaceRoot, args) {
|
|
2697
|
-
const result = spawnSync("git", args, {
|
|
2698
|
-
cwd: workspaceRoot,
|
|
2699
|
-
encoding: "utf8",
|
|
2700
|
-
stdio: [
|
|
2701
|
-
"ignore",
|
|
2702
|
-
"pipe",
|
|
2703
|
-
"ignore"
|
|
2704
|
-
]
|
|
2705
|
-
});
|
|
2706
|
-
return {
|
|
2707
|
-
status: result.status,
|
|
2708
|
-
stdout: result.stdout.trim()
|
|
2709
|
-
};
|
|
2710
|
-
}
|
|
2711
|
-
/** Read the current git commit for the workspace, if available. */
|
|
2712
|
-
function readGitWorktreeState(workspaceRoot) {
|
|
2713
|
-
const insideWorktree = runGitCommand(workspaceRoot, ["rev-parse", "--is-inside-work-tree"]);
|
|
2714
|
-
if (insideWorktree.status !== 0 || insideWorktree.stdout !== "true") return { commitSha: null };
|
|
2715
|
-
const commitResult = runGitCommand(workspaceRoot, ["rev-parse", "HEAD"]);
|
|
2716
|
-
return { commitSha: commitResult.status === 0 ? commitResult.stdout : null };
|
|
2717
|
-
}
|
|
2718
|
-
//#endregion
|
|
2719
2527
|
//#region ../runner/src/outputArtifacts.ts
|
|
2720
2528
|
const mimeTypeExtensionMap = {
|
|
2721
2529
|
"application/json": ".json",
|
|
@@ -2867,6 +2675,235 @@ async function recomputeEvalStatusesInRuns(params) {
|
|
|
2867
2675
|
return updatedRuns;
|
|
2868
2676
|
}
|
|
2869
2677
|
//#endregion
|
|
2678
|
+
//#region ../runner/src/runPersistence.ts
|
|
2679
|
+
const SHORT_ID_PATTERN = /^r(\d+)$/;
|
|
2680
|
+
/**
|
|
2681
|
+
* Generate a filesystem-safe, sortable run id combining a UTC timestamp
|
|
2682
|
+
* with a short random suffix.
|
|
2683
|
+
*/
|
|
2684
|
+
function generateRunId() {
|
|
2685
|
+
const now = /* @__PURE__ */ new Date();
|
|
2686
|
+
const pad = (n) => String(n).padStart(2, "0");
|
|
2687
|
+
return `${`${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}T${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}Z`}_${Math.random().toString(36).slice(2, 8)}`;
|
|
2688
|
+
}
|
|
2689
|
+
function parseShortIdNum(shortId) {
|
|
2690
|
+
if (shortId === void 0) return null;
|
|
2691
|
+
const match = SHORT_ID_PATTERN.exec(shortId);
|
|
2692
|
+
if (!match) return null;
|
|
2693
|
+
const num = Number(match[1]);
|
|
2694
|
+
if (!Number.isFinite(num)) return null;
|
|
2695
|
+
return num;
|
|
2696
|
+
}
|
|
2697
|
+
/**
|
|
2698
|
+
* Return the next `shortId` number to assign based on the existing
|
|
2699
|
+
* loaded snapshots. Legacy runs that don't match the `r\d+` format are
|
|
2700
|
+
* ignored.
|
|
2701
|
+
*/
|
|
2702
|
+
function nextShortIdFromSnapshots(snapshots) {
|
|
2703
|
+
let maxNum = -1;
|
|
2704
|
+
for (const snapshot of snapshots) {
|
|
2705
|
+
const num = parseShortIdNum(snapshot.manifest.shortId);
|
|
2706
|
+
if (num !== null && num > maxNum) maxNum = num;
|
|
2707
|
+
}
|
|
2708
|
+
return maxNum + 1;
|
|
2709
|
+
}
|
|
2710
|
+
async function loadPersistedRunSnapshots(localStateDir) {
|
|
2711
|
+
const runsDir = join(localStateDir, "runs");
|
|
2712
|
+
const entriesResult = await resultify(() => readdir(runsDir, { withFileTypes: true }));
|
|
2713
|
+
if (entriesResult.error) return [];
|
|
2714
|
+
const snapshots = [];
|
|
2715
|
+
const runDirs = entriesResult.value.filter((entry) => entry.isDirectory()).map((entry) => join(runsDir, entry.name)).toSorted();
|
|
2716
|
+
for (const runDir of runDirs) {
|
|
2717
|
+
const snapshot = await loadPersistedRunSnapshot(runDir);
|
|
2718
|
+
if (!snapshot) continue;
|
|
2719
|
+
snapshots.push(snapshot);
|
|
2720
|
+
}
|
|
2721
|
+
return snapshots;
|
|
2722
|
+
}
|
|
2723
|
+
async function persistCaseDetail(runDir, caseDetail) {
|
|
2724
|
+
await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(caseDetail.caseId)}.json`), JSON.stringify(caseDetail, null, 2));
|
|
2725
|
+
}
|
|
2726
|
+
function getLastRunStatuses(params) {
|
|
2727
|
+
const latestRunInfos = getLatestRunInfos(params);
|
|
2728
|
+
return new Map([...latestRunInfos].map(([evalId, info]) => [evalId, info.status]));
|
|
2729
|
+
}
|
|
2730
|
+
/**
|
|
2731
|
+
* Return the latest scoped run metadata for each eval based on persisted and
|
|
2732
|
+
* in-memory runs.
|
|
2733
|
+
*/
|
|
2734
|
+
function getLatestRunInfos(params) {
|
|
2735
|
+
const { runs, knownEvals } = params;
|
|
2736
|
+
const knownEvalMetas = [...knownEvals];
|
|
2737
|
+
const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
|
|
2738
|
+
const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
|
|
2739
|
+
const latestRunInfos = /* @__PURE__ */ new Map();
|
|
2740
|
+
for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
|
|
2741
|
+
status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
|
|
2742
|
+
startedAt: getRunFreshnessTimestamp(run.manifest),
|
|
2743
|
+
commitSha: run.manifest.commitSha ?? null,
|
|
2744
|
+
evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
|
|
2745
|
+
});
|
|
2746
|
+
return latestRunInfos;
|
|
2747
|
+
}
|
|
2748
|
+
function toLastRunStatus$1(status) {
|
|
2749
|
+
return status === "pending" ? null : status;
|
|
2750
|
+
}
|
|
2751
|
+
async function loadPersistedRunSnapshot(runDir) {
|
|
2752
|
+
const manifest = await readParsedJsonFile(join(runDir, "run.json"), { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
|
|
2753
|
+
if (!manifest) return null;
|
|
2754
|
+
const summary = await readParsedJsonFile(join(runDir, "summary.json"), { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
|
|
2755
|
+
if (!summary) return null;
|
|
2756
|
+
return {
|
|
2757
|
+
runDir,
|
|
2758
|
+
manifest,
|
|
2759
|
+
summary,
|
|
2760
|
+
cases: await readCaseRows(runDir),
|
|
2761
|
+
caseDetails: await readCaseDetails(runDir)
|
|
2762
|
+
};
|
|
2763
|
+
}
|
|
2764
|
+
async function readParsedJsonFile(filePath, schema) {
|
|
2765
|
+
const fileResult = await resultify(() => readFile(filePath, "utf-8"));
|
|
2766
|
+
if (fileResult.error) return null;
|
|
2767
|
+
const jsonResult = resultify(() => JSON.parse(fileResult.value));
|
|
2768
|
+
if (jsonResult.error) return null;
|
|
2769
|
+
const parsed = schema.safeParse(jsonResult.value);
|
|
2770
|
+
if (!parsed.success) return null;
|
|
2771
|
+
return parsed.data;
|
|
2772
|
+
}
|
|
2773
|
+
async function readCaseRows(runDir) {
|
|
2774
|
+
const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
|
|
2775
|
+
if (fileResult.error) return [];
|
|
2776
|
+
const rows = [];
|
|
2777
|
+
for (const rawLine of fileResult.value.split("\n")) {
|
|
2778
|
+
const line = rawLine.trim();
|
|
2779
|
+
if (line.length === 0) continue;
|
|
2780
|
+
const jsonResult = resultify(() => JSON.parse(line));
|
|
2781
|
+
if (jsonResult.error) continue;
|
|
2782
|
+
const parsed = caseRowSchema.safeParse(jsonResult.value);
|
|
2783
|
+
if (!parsed.success) continue;
|
|
2784
|
+
rows.push(parsed.data);
|
|
2785
|
+
}
|
|
2786
|
+
return rows;
|
|
2787
|
+
}
|
|
2788
|
+
async function readCaseDetails(runDir) {
|
|
2789
|
+
const detailsDir = join(runDir, "case-details");
|
|
2790
|
+
const entriesResult = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
|
|
2791
|
+
if (entriesResult.error) return /* @__PURE__ */ new Map();
|
|
2792
|
+
const caseDetails = /* @__PURE__ */ new Map();
|
|
2793
|
+
for (const entry of entriesResult.value) {
|
|
2794
|
+
if (!entry.isFile() || !entry.name.endsWith(".json")) continue;
|
|
2795
|
+
const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
|
|
2796
|
+
if (!detail) continue;
|
|
2797
|
+
caseDetails.set(detail.caseId, detail);
|
|
2798
|
+
}
|
|
2799
|
+
return caseDetails;
|
|
2800
|
+
}
|
|
2801
|
+
function getRunEvalIds(run, knownEvalIds) {
|
|
2802
|
+
const evalIds = new Set(run.cases.map((caseRow) => caseRow.evalId));
|
|
2803
|
+
if (run.manifest.target.mode === "evalIds") for (const evalId of run.manifest.target.evalIds ?? []) evalIds.add(evalId);
|
|
2804
|
+
else if (run.manifest.target.mode === "all" && evalIds.size === 0) for (const evalId of knownEvalIds) evalIds.add(evalId);
|
|
2805
|
+
return [...evalIds];
|
|
2806
|
+
}
|
|
2807
|
+
function getEvalStatusForRun(run, evalId, manualScoreKeys) {
|
|
2808
|
+
const evalCases = run.cases.filter((caseRow) => caseRow.evalId === evalId);
|
|
2809
|
+
if (evalCases.length > 0) {
|
|
2810
|
+
if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
|
|
2811
|
+
return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
|
|
2812
|
+
}
|
|
2813
|
+
return toLastRunStatus$1(deriveStatusFromChildStatuses({
|
|
2814
|
+
statuses: [],
|
|
2815
|
+
lifecycleStatus: run.manifest.status
|
|
2816
|
+
}));
|
|
2817
|
+
}
|
|
2818
|
+
function hasPendingManualScores(caseRows, manualScoreKeys) {
|
|
2819
|
+
if (manualScoreKeys.length === 0) return false;
|
|
2820
|
+
return caseRows.some((caseRow) => manualScoreKeys.some((key) => {
|
|
2821
|
+
const value = caseRow.columns[key];
|
|
2822
|
+
return typeof value !== "number" || !Number.isFinite(value);
|
|
2823
|
+
}));
|
|
2824
|
+
}
|
|
2825
|
+
function encodeCaseDetailFileName(caseId) {
|
|
2826
|
+
return encodeURIComponent(caseId);
|
|
2827
|
+
}
|
|
2828
|
+
//#endregion
|
|
2829
|
+
//#region ../runner/src/moduleIsolation.ts
|
|
2830
|
+
const isolationParam = "agent-evals-isolate";
|
|
2831
|
+
const pathSegmentSeparatorPattern = /[\\/]+/;
|
|
2832
|
+
const isolationStorage = new AsyncLocalStorage();
|
|
2833
|
+
const activeIsolationRoots = /* @__PURE__ */ new Map();
|
|
2834
|
+
let hooksRegistered = false;
|
|
2835
|
+
const requireFromRunner = createRequire(import.meta.url);
|
|
2836
|
+
const agentPackageUrlBySpecifier = new Map([
|
|
2837
|
+
"@ls-stack/agent-eval",
|
|
2838
|
+
"@agent-evals/sdk",
|
|
2839
|
+
"@agent-evals/shared",
|
|
2840
|
+
"@agent-evals/runner",
|
|
2841
|
+
"@agent-evals/runner/run-child"
|
|
2842
|
+
].flatMap((specifier) => {
|
|
2843
|
+
try {
|
|
2844
|
+
return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
|
|
2845
|
+
} catch {
|
|
2846
|
+
return [];
|
|
2847
|
+
}
|
|
2848
|
+
}));
|
|
2849
|
+
function isAgentEvalsPackageSpecifier(specifier) {
|
|
2850
|
+
return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
|
|
2851
|
+
}
|
|
2852
|
+
function getIsolationKeyFromParent(parentURL) {
|
|
2853
|
+
if (!parentURL?.startsWith("file:")) return null;
|
|
2854
|
+
const value = new URL(parentURL).searchParams.get(isolationParam);
|
|
2855
|
+
return activeIsolationRoots.has(value ?? "") ? value : null;
|
|
2856
|
+
}
|
|
2857
|
+
function isWorkspaceFile(url, workspaceRoot) {
|
|
2858
|
+
if (url.protocol !== "file:") return false;
|
|
2859
|
+
const relativePath = relative(workspaceRoot, fileURLToPath(url));
|
|
2860
|
+
if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
|
|
2861
|
+
const segments = relativePath.split(pathSegmentSeparatorPattern);
|
|
2862
|
+
return !segments.includes("node_modules") && !segments.includes(".agent-evals");
|
|
2863
|
+
}
|
|
2864
|
+
function addIsolationParam(url, key) {
|
|
2865
|
+
const moduleUrl = new URL(url);
|
|
2866
|
+
if (moduleUrl.searchParams.get(isolationParam) === key) return url;
|
|
2867
|
+
moduleUrl.searchParams.set(isolationParam, key);
|
|
2868
|
+
return moduleUrl.href;
|
|
2869
|
+
}
|
|
2870
|
+
function registerModuleIsolationHooks() {
|
|
2871
|
+
if (hooksRegistered) return;
|
|
2872
|
+
hooksRegistered = true;
|
|
2873
|
+
registerHooks({ resolve(specifier, context, nextResolve) {
|
|
2874
|
+
const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
|
|
2875
|
+
if (agentPackageUrl !== void 0) return {
|
|
2876
|
+
url: agentPackageUrl,
|
|
2877
|
+
shortCircuit: true
|
|
2878
|
+
};
|
|
2879
|
+
const resolved = nextResolve(specifier, context);
|
|
2880
|
+
if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
|
|
2881
|
+
const activeContext = isolationStorage.getStore();
|
|
2882
|
+
const inferredKey = getIsolationKeyFromParent(context.parentURL);
|
|
2883
|
+
const isolationKey = activeContext?.key ?? inferredKey;
|
|
2884
|
+
if (isolationKey === null) return resolved;
|
|
2885
|
+
const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
|
|
2886
|
+
if (workspaceRoot === void 0) return resolved;
|
|
2887
|
+
if (!isWorkspaceFile(new URL(resolved.url), workspaceRoot)) return resolved;
|
|
2888
|
+
return {
|
|
2889
|
+
...resolved,
|
|
2890
|
+
url: addIsolationParam(resolved.url, isolationKey)
|
|
2891
|
+
};
|
|
2892
|
+
} });
|
|
2893
|
+
}
|
|
2894
|
+
/**
|
|
2895
|
+
* Execute module loading and eval code with fresh workspace module URLs.
|
|
2896
|
+
*
|
|
2897
|
+
* Node does not expose an ESM cache reset API, so the runner appends a
|
|
2898
|
+
* run-scoped query parameter to workspace file imports. Package imports are
|
|
2899
|
+
* left alone so SDK singletons, such as the eval registry, remain shared.
|
|
2900
|
+
*/
|
|
2901
|
+
async function runWithModuleIsolation(context, fn) {
|
|
2902
|
+
registerModuleIsolationHooks();
|
|
2903
|
+
activeIsolationRoots.set(context.key, context.workspaceRoot);
|
|
2904
|
+
return await isolationStorage.run(context, fn);
|
|
2905
|
+
}
|
|
2906
|
+
//#endregion
|
|
2870
2907
|
//#region ../runner/src/traceDisplay.ts
|
|
2871
2908
|
function isRecord$1(value) {
|
|
2872
2909
|
return typeof value === "object" && value !== null;
|
|
@@ -2954,12 +2991,16 @@ async function callWithUnknownResult(fn, args) {
|
|
|
2954
2991
|
return await Reflect.apply(fn, void 0, args);
|
|
2955
2992
|
}
|
|
2956
2993
|
async function runCase(params) {
|
|
2957
|
-
const { evalDef, evalId, evalCase, globalTraceDisplay, trial,
|
|
2994
|
+
const { evalDef, evalId, evalCase, globalTraceDisplay, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, artifactDir, runId } = params;
|
|
2958
2995
|
const { scope, error: executeError } = await runInEvalScope(evalCase.id, async () => {
|
|
2959
|
-
|
|
2960
|
-
input: evalCase.input
|
|
2961
|
-
|
|
2962
|
-
|
|
2996
|
+
const execute = async () => {
|
|
2997
|
+
await Reflect.apply(evalDef.execute, evalDef, [{ input: evalCase.input }]);
|
|
2998
|
+
};
|
|
2999
|
+
if (moduleIsolation === void 0) {
|
|
3000
|
+
await execute();
|
|
3001
|
+
return;
|
|
3002
|
+
}
|
|
3003
|
+
await runWithModuleIsolation(moduleIsolation, execute);
|
|
2963
3004
|
}, {
|
|
2964
3005
|
input: evalCase.input,
|
|
2965
3006
|
cacheContext: cacheAdapter ? {
|
|
@@ -2996,11 +3037,15 @@ async function runCase(params) {
|
|
|
2996
3037
|
const scoringTraces = {};
|
|
2997
3038
|
if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
|
|
2998
3039
|
const { compute, passThreshold, label } = normalizeScoreDef(def);
|
|
2999
|
-
const scoreRun = await runInEvalScope(evalCase.id, async () =>
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
|
|
3040
|
+
const scoreRun = await runInEvalScope(evalCase.id, async () => {
|
|
3041
|
+
const computeScore = async () => await callWithUnknownResult(compute, [{
|
|
3042
|
+
input: evalCase.input,
|
|
3043
|
+
outputs: { ...scope.outputs },
|
|
3044
|
+
case: evalCase
|
|
3045
|
+
}]);
|
|
3046
|
+
if (moduleIsolation === void 0) return await computeScore();
|
|
3047
|
+
return await runWithModuleIsolation(moduleIsolation, computeScore);
|
|
3048
|
+
}, {
|
|
3004
3049
|
input: evalCase.input,
|
|
3005
3050
|
cacheContext: cacheAdapter ? {
|
|
3006
3051
|
adapter: cacheAdapter,
|
|
@@ -3119,172 +3164,20 @@ function toAssertionFailure(message, error = void 0) {
|
|
|
3119
3164
|
} : { message };
|
|
3120
3165
|
}
|
|
3121
3166
|
//#endregion
|
|
3122
|
-
//#region ../runner/src/runPersistence.ts
|
|
3123
|
-
const SHORT_ID_PATTERN = /^r(\d+)$/;
|
|
3124
|
-
/**
|
|
3125
|
-
* Generate a filesystem-safe, sortable run id combining a UTC timestamp
|
|
3126
|
-
* with a short random suffix.
|
|
3127
|
-
*/
|
|
3128
|
-
function generateRunId() {
|
|
3129
|
-
const now = /* @__PURE__ */ new Date();
|
|
3130
|
-
const pad = (n) => String(n).padStart(2, "0");
|
|
3131
|
-
return `${`${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}T${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}Z`}_${Math.random().toString(36).slice(2, 8)}`;
|
|
3132
|
-
}
|
|
3133
|
-
function parseShortIdNum(shortId) {
|
|
3134
|
-
if (shortId === void 0) return null;
|
|
3135
|
-
const match = SHORT_ID_PATTERN.exec(shortId);
|
|
3136
|
-
if (!match) return null;
|
|
3137
|
-
const num = Number(match[1]);
|
|
3138
|
-
if (!Number.isFinite(num)) return null;
|
|
3139
|
-
return num;
|
|
3140
|
-
}
|
|
3141
|
-
/**
|
|
3142
|
-
* Return the next `shortId` number to assign based on the existing
|
|
3143
|
-
* loaded snapshots. Legacy runs that don't match the `r\d+` format are
|
|
3144
|
-
* ignored.
|
|
3145
|
-
*/
|
|
3146
|
-
function nextShortIdFromSnapshots(snapshots) {
|
|
3147
|
-
let maxNum = -1;
|
|
3148
|
-
for (const snapshot of snapshots) {
|
|
3149
|
-
const num = parseShortIdNum(snapshot.manifest.shortId);
|
|
3150
|
-
if (num !== null && num > maxNum) maxNum = num;
|
|
3151
|
-
}
|
|
3152
|
-
return maxNum + 1;
|
|
3153
|
-
}
|
|
3154
|
-
async function loadPersistedRunSnapshots(localStateDir) {
|
|
3155
|
-
const runsDir = join(localStateDir, "runs");
|
|
3156
|
-
const entriesResult = await resultify(() => readdir(runsDir, { withFileTypes: true }));
|
|
3157
|
-
if (entriesResult.error) return [];
|
|
3158
|
-
const snapshots = [];
|
|
3159
|
-
const runDirs = entriesResult.value.filter((entry) => entry.isDirectory()).map((entry) => join(runsDir, entry.name)).toSorted();
|
|
3160
|
-
for (const runDir of runDirs) {
|
|
3161
|
-
const snapshot = await loadPersistedRunSnapshot(runDir);
|
|
3162
|
-
if (!snapshot) continue;
|
|
3163
|
-
snapshots.push(snapshot);
|
|
3164
|
-
}
|
|
3165
|
-
return snapshots;
|
|
3166
|
-
}
|
|
3167
|
-
async function persistCaseDetail(runDir, caseDetail) {
|
|
3168
|
-
await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(caseDetail.caseId)}.json`), JSON.stringify(caseDetail, null, 2));
|
|
3169
|
-
}
|
|
3170
|
-
function getLastRunStatuses(params) {
|
|
3171
|
-
const latestRunInfos = getLatestRunInfos(params);
|
|
3172
|
-
return new Map([...latestRunInfos].map(([evalId, info]) => [evalId, info.status]));
|
|
3173
|
-
}
|
|
3174
|
-
/**
|
|
3175
|
-
* Return the latest scoped run metadata for each eval based on persisted and
|
|
3176
|
-
* in-memory runs.
|
|
3177
|
-
*/
|
|
3178
|
-
function getLatestRunInfos(params) {
|
|
3179
|
-
const { runs, knownEvals } = params;
|
|
3180
|
-
const knownEvalMetas = [...knownEvals];
|
|
3181
|
-
const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
|
|
3182
|
-
const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
|
|
3183
|
-
const latestRunInfos = /* @__PURE__ */ new Map();
|
|
3184
|
-
for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
|
|
3185
|
-
status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
|
|
3186
|
-
startedAt: getRunFreshnessTimestamp(run.manifest),
|
|
3187
|
-
commitSha: run.manifest.commitSha ?? null,
|
|
3188
|
-
evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
|
|
3189
|
-
});
|
|
3190
|
-
return latestRunInfos;
|
|
3191
|
-
}
|
|
3192
|
-
function toLastRunStatus$1(status) {
|
|
3193
|
-
return status === "pending" ? null : status;
|
|
3194
|
-
}
|
|
3195
|
-
async function loadPersistedRunSnapshot(runDir) {
|
|
3196
|
-
const manifest = await readParsedJsonFile(join(runDir, "run.json"), { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
|
|
3197
|
-
if (!manifest) return null;
|
|
3198
|
-
const summary = await readParsedJsonFile(join(runDir, "summary.json"), { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
|
|
3199
|
-
if (!summary) return null;
|
|
3200
|
-
return {
|
|
3201
|
-
runDir,
|
|
3202
|
-
manifest,
|
|
3203
|
-
summary,
|
|
3204
|
-
cases: await readCaseRows(runDir),
|
|
3205
|
-
caseDetails: await readCaseDetails(runDir)
|
|
3206
|
-
};
|
|
3207
|
-
}
|
|
3208
|
-
async function readParsedJsonFile(filePath, schema) {
|
|
3209
|
-
const fileResult = await resultify(() => readFile(filePath, "utf-8"));
|
|
3210
|
-
if (fileResult.error) return null;
|
|
3211
|
-
const jsonResult = resultify(() => JSON.parse(fileResult.value));
|
|
3212
|
-
if (jsonResult.error) return null;
|
|
3213
|
-
const parsed = schema.safeParse(jsonResult.value);
|
|
3214
|
-
if (!parsed.success) return null;
|
|
3215
|
-
return parsed.data;
|
|
3216
|
-
}
|
|
3217
|
-
async function readCaseRows(runDir) {
|
|
3218
|
-
const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
|
|
3219
|
-
if (fileResult.error) return [];
|
|
3220
|
-
const rows = [];
|
|
3221
|
-
for (const rawLine of fileResult.value.split("\n")) {
|
|
3222
|
-
const line = rawLine.trim();
|
|
3223
|
-
if (line.length === 0) continue;
|
|
3224
|
-
const jsonResult = resultify(() => JSON.parse(line));
|
|
3225
|
-
if (jsonResult.error) continue;
|
|
3226
|
-
const parsed = caseRowSchema.safeParse(jsonResult.value);
|
|
3227
|
-
if (!parsed.success) continue;
|
|
3228
|
-
rows.push(parsed.data);
|
|
3229
|
-
}
|
|
3230
|
-
return rows;
|
|
3231
|
-
}
|
|
3232
|
-
async function readCaseDetails(runDir) {
|
|
3233
|
-
const detailsDir = join(runDir, "case-details");
|
|
3234
|
-
const entriesResult = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
|
|
3235
|
-
if (entriesResult.error) return /* @__PURE__ */ new Map();
|
|
3236
|
-
const caseDetails = /* @__PURE__ */ new Map();
|
|
3237
|
-
for (const entry of entriesResult.value) {
|
|
3238
|
-
if (!entry.isFile() || !entry.name.endsWith(".json")) continue;
|
|
3239
|
-
const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
|
|
3240
|
-
if (!detail) continue;
|
|
3241
|
-
caseDetails.set(detail.caseId, detail);
|
|
3242
|
-
}
|
|
3243
|
-
return caseDetails;
|
|
3244
|
-
}
|
|
3245
|
-
function getRunEvalIds(run, knownEvalIds) {
|
|
3246
|
-
const evalIds = new Set(run.cases.map((caseRow) => caseRow.evalId));
|
|
3247
|
-
if (run.manifest.target.mode === "evalIds") for (const evalId of run.manifest.target.evalIds ?? []) evalIds.add(evalId);
|
|
3248
|
-
else if (run.manifest.target.mode === "all" && evalIds.size === 0) for (const evalId of knownEvalIds) evalIds.add(evalId);
|
|
3249
|
-
return [...evalIds];
|
|
3250
|
-
}
|
|
3251
|
-
function getEvalStatusForRun(run, evalId, manualScoreKeys) {
|
|
3252
|
-
const evalCases = run.cases.filter((caseRow) => caseRow.evalId === evalId);
|
|
3253
|
-
if (evalCases.length > 0) {
|
|
3254
|
-
if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
|
|
3255
|
-
return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
|
|
3256
|
-
}
|
|
3257
|
-
return toLastRunStatus$1(deriveStatusFromChildStatuses({
|
|
3258
|
-
statuses: [],
|
|
3259
|
-
lifecycleStatus: run.manifest.status
|
|
3260
|
-
}));
|
|
3261
|
-
}
|
|
3262
|
-
function hasPendingManualScores(caseRows, manualScoreKeys) {
|
|
3263
|
-
if (manualScoreKeys.length === 0) return false;
|
|
3264
|
-
return caseRows.some((caseRow) => manualScoreKeys.some((key) => {
|
|
3265
|
-
const value = caseRow.columns[key];
|
|
3266
|
-
return typeof value !== "number" || !Number.isFinite(value);
|
|
3267
|
-
}));
|
|
3268
|
-
}
|
|
3269
|
-
function encodeCaseDetailFileName(caseId) {
|
|
3270
|
-
return encodeURIComponent(caseId);
|
|
3271
|
-
}
|
|
3272
|
-
//#endregion
|
|
3273
3167
|
//#region ../runner/src/runQueue.ts
|
|
3274
3168
|
async function executeQueuedCases(params) {
|
|
3275
|
-
const {
|
|
3169
|
+
const { queuedCases, concurrency, globalTraceDisplay } = params;
|
|
3276
3170
|
let nextCaseIndex = 0;
|
|
3277
3171
|
let workerError = void 0;
|
|
3278
3172
|
const workerCount = Math.min(concurrency, queuedCases.length);
|
|
3279
3173
|
const workers = Array.from({ length: workerCount }, async () => {
|
|
3280
|
-
while (
|
|
3174
|
+
while (workerError === void 0) {
|
|
3281
3175
|
const queuedCase = queuedCases[nextCaseIndex];
|
|
3282
3176
|
nextCaseIndex += 1;
|
|
3283
3177
|
if (queuedCase === void 0) return;
|
|
3284
3178
|
try {
|
|
3285
3179
|
await executeQueuedCase({
|
|
3286
3180
|
queuedCase,
|
|
3287
|
-
runState,
|
|
3288
3181
|
globalTraceDisplay
|
|
3289
3182
|
});
|
|
3290
3183
|
} catch (error) {
|
|
@@ -3298,11 +3191,10 @@ async function executeQueuedCases(params) {
|
|
|
3298
3191
|
if (workerError !== void 0) throw new Error(typeof workerError === "string" ? workerError : typeof workerError === "number" || typeof workerError === "boolean" || typeof workerError === "bigint" ? String(workerError) : workerError === null ? "null" : "Unknown queue worker error");
|
|
3299
3192
|
}
|
|
3300
3193
|
async function executeQueuedCase(params) {
|
|
3301
|
-
const { queuedCase,
|
|
3194
|
+
const { queuedCase, globalTraceDisplay } = params;
|
|
3302
3195
|
const startTime = Date.now();
|
|
3303
3196
|
const result = await queuedCase.execute({
|
|
3304
3197
|
globalTraceDisplay,
|
|
3305
|
-
signal: runState.abortController.signal,
|
|
3306
3198
|
startTime
|
|
3307
3199
|
});
|
|
3308
3200
|
await queuedCase.onComplete(result);
|
|
@@ -3353,7 +3245,48 @@ function pickWinningTrial(params) {
|
|
|
3353
3245
|
if (medianAttempt === void 0) throw new Error("Expected at least one trial attempt");
|
|
3354
3246
|
return medianAttempt;
|
|
3355
3247
|
}
|
|
3356
|
-
async function
|
|
3248
|
+
async function finalizePreparedCase(params) {
|
|
3249
|
+
const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
|
|
3250
|
+
if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
|
|
3251
|
+
preparedCase.finalized = true;
|
|
3252
|
+
const winningTrial = pickWinningTrial({
|
|
3253
|
+
strategy: runState.manifest.trialSelection,
|
|
3254
|
+
attempts: preparedCase.trialResults,
|
|
3255
|
+
scoreKeys: preparedEval.scoreKeys
|
|
3256
|
+
});
|
|
3257
|
+
if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
|
|
3258
|
+
runState.cases.push(winningTrial.caseRow);
|
|
3259
|
+
runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
|
|
3260
|
+
preparedEval.mergeColumns(winningTrial.caseDetail.columns);
|
|
3261
|
+
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
3262
|
+
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
3263
|
+
else runState.summary.failedCases++;
|
|
3264
|
+
await writeFile(join(runDir, "traces", `${preparedCase.caseId}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
|
|
3265
|
+
await persistCaseDetail(runDir, winningTrial.caseDetail);
|
|
3266
|
+
onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
|
|
3267
|
+
emitEvent(runState, {
|
|
3268
|
+
type: "case.finished",
|
|
3269
|
+
runId: runState.manifest.id,
|
|
3270
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3271
|
+
payload: winningTrial.caseRow
|
|
3272
|
+
});
|
|
3273
|
+
preparedEval.evalCaseRows.push(winningTrial.caseRow);
|
|
3274
|
+
}
|
|
3275
|
+
function getPreparedCaseOrderKey(caseRow) {
|
|
3276
|
+
return `${caseRow.evalId}\u0000${caseRow.caseId}`;
|
|
3277
|
+
}
|
|
3278
|
+
function sortCaseRowsByPreparedOrder(caseRows, preparedEvals) {
|
|
3279
|
+
const orderByCase = /* @__PURE__ */ new Map();
|
|
3280
|
+
let order = 0;
|
|
3281
|
+
for (const preparedEval of preparedEvals) for (const preparedCase of preparedEval.preparedCases) {
|
|
3282
|
+
orderByCase.set(`${preparedEval.evalMeta.id}\u0000${preparedCase.caseId}`, order);
|
|
3283
|
+
order++;
|
|
3284
|
+
}
|
|
3285
|
+
caseRows.sort((left, right) => {
|
|
3286
|
+
return (orderByCase.get(getPreparedCaseOrderKey(left)) ?? Number.MAX_SAFE_INTEGER) - (orderByCase.get(getPreparedCaseOrderKey(right)) ?? Number.MAX_SAFE_INTEGER);
|
|
3287
|
+
});
|
|
3288
|
+
}
|
|
3289
|
+
async function executeRun({ runState, request, runDir, config, evals, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
|
|
3357
3290
|
try {
|
|
3358
3291
|
const targetEvals = getTargetEvals(request);
|
|
3359
3292
|
emitEvent(runState, {
|
|
@@ -3362,14 +3295,16 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3362
3295
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3363
3296
|
payload: runState.manifest
|
|
3364
3297
|
});
|
|
3365
|
-
const allCaseRows = [];
|
|
3366
3298
|
const evalErrors = [];
|
|
3367
3299
|
const queuedCases = [];
|
|
3368
3300
|
const preparedEvals = [];
|
|
3369
3301
|
const cacheMode = runState.manifest.cacheMode ?? "use";
|
|
3370
3302
|
const cacheEnabled = config.cache?.enabled !== false;
|
|
3303
|
+
const moduleIsolation = {
|
|
3304
|
+
key: runState.manifest.id,
|
|
3305
|
+
workspaceRoot
|
|
3306
|
+
};
|
|
3371
3307
|
for (const evalMeta of targetEvals) {
|
|
3372
|
-
if (runState.abortController.signal.aborted) break;
|
|
3373
3308
|
const evalFilePath = evalMeta.sourceFilePath;
|
|
3374
3309
|
let codeFingerprint = "";
|
|
3375
3310
|
try {
|
|
@@ -3381,7 +3316,9 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3381
3316
|
else delete runState.manifest.evalSourceFingerprints[evalMeta.id];
|
|
3382
3317
|
try {
|
|
3383
3318
|
const registry = getEvalRegistry();
|
|
3384
|
-
await
|
|
3319
|
+
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
3320
|
+
await loadEvalModule(evalFilePath, codeFingerprint);
|
|
3321
|
+
});
|
|
3385
3322
|
const entry = registry.get(evalMeta.id);
|
|
3386
3323
|
if (!entry) {
|
|
3387
3324
|
evalErrors.push({
|
|
@@ -3390,74 +3327,87 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3390
3327
|
});
|
|
3391
3328
|
continue;
|
|
3392
3329
|
}
|
|
3393
|
-
await
|
|
3394
|
-
|
|
3395
|
-
|
|
3396
|
-
|
|
3397
|
-
|
|
3398
|
-
|
|
3399
|
-
|
|
3400
|
-
|
|
3401
|
-
|
|
3402
|
-
|
|
3403
|
-
|
|
3404
|
-
|
|
3405
|
-
|
|
3406
|
-
|
|
3407
|
-
|
|
3408
|
-
|
|
3409
|
-
|
|
3410
|
-
|
|
3411
|
-
|
|
3412
|
-
|
|
3413
|
-
|
|
3414
|
-
|
|
3415
|
-
|
|
3416
|
-
const
|
|
3417
|
-
|
|
3418
|
-
|
|
3419
|
-
|
|
3420
|
-
|
|
3421
|
-
|
|
3422
|
-
|
|
3423
|
-
|
|
3424
|
-
|
|
3425
|
-
|
|
3426
|
-
|
|
3427
|
-
|
|
3428
|
-
|
|
3429
|
-
|
|
3430
|
-
trial,
|
|
3431
|
-
signal,
|
|
3432
|
-
startTime,
|
|
3433
|
-
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
3434
|
-
cacheMode,
|
|
3435
|
-
codeFingerprint,
|
|
3436
|
-
artifactDir: join(runDir, "artifacts"),
|
|
3437
|
-
runId: runState.manifest.id
|
|
3438
|
-
});
|
|
3439
|
-
return {
|
|
3440
|
-
caseDetail,
|
|
3441
|
-
caseRow: {
|
|
3442
|
-
caseId: evalCase.id,
|
|
3330
|
+
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
3331
|
+
await entry.use(async (evalDef) => {
|
|
3332
|
+
const cases = filterEvalCases(resolveRunnableEvalCases({
|
|
3333
|
+
cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
|
|
3334
|
+
evalId: evalMeta.id
|
|
3335
|
+
}), request.target.evalIds, request.target.caseIds, evalMeta.id);
|
|
3336
|
+
runState.summary.totalCases += cases.length;
|
|
3337
|
+
const accumulatedColumns = /* @__PURE__ */ new Map();
|
|
3338
|
+
const evalCaseRows = [];
|
|
3339
|
+
const preparedCases = [];
|
|
3340
|
+
const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
|
|
3341
|
+
const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
|
|
3342
|
+
const preparedEval = {
|
|
3343
|
+
evalMeta,
|
|
3344
|
+
accumulatedColumns,
|
|
3345
|
+
evalCaseRows,
|
|
3346
|
+
preparedCases,
|
|
3347
|
+
scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
|
|
3348
|
+
mergeColumns: (columns) => {
|
|
3349
|
+
mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
|
|
3350
|
+
}
|
|
3351
|
+
};
|
|
3352
|
+
preparedEvals.push(preparedEval);
|
|
3353
|
+
for (const evalCase of cases) {
|
|
3354
|
+
const trialResults = [];
|
|
3355
|
+
const preparedCase = {
|
|
3356
|
+
caseId: evalCase.id,
|
|
3357
|
+
trialResults,
|
|
3358
|
+
finalized: false
|
|
3359
|
+
};
|
|
3360
|
+
preparedCases.push(preparedCase);
|
|
3361
|
+
for (let trial = 0; trial < request.trials; trial++) {
|
|
3362
|
+
const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
|
|
3363
|
+
queuedCases.push({
|
|
3364
|
+
execute: async ({ startTime, globalTraceDisplay }) => {
|
|
3365
|
+
const { caseDetail, caseRowUpdate } = await runCase({
|
|
3366
|
+
evalDef,
|
|
3443
3367
|
evalId: evalMeta.id,
|
|
3444
|
-
|
|
3445
|
-
|
|
3446
|
-
|
|
3447
|
-
|
|
3448
|
-
|
|
3449
|
-
|
|
3450
|
-
|
|
3451
|
-
|
|
3452
|
-
|
|
3453
|
-
|
|
3454
|
-
|
|
3455
|
-
|
|
3456
|
-
|
|
3457
|
-
|
|
3458
|
-
|
|
3368
|
+
evalCase,
|
|
3369
|
+
globalTraceDisplay,
|
|
3370
|
+
trial,
|
|
3371
|
+
startTime,
|
|
3372
|
+
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
3373
|
+
cacheMode,
|
|
3374
|
+
codeFingerprint,
|
|
3375
|
+
moduleIsolation,
|
|
3376
|
+
artifactDir: join(runDir, "artifacts"),
|
|
3377
|
+
runId: runState.manifest.id
|
|
3378
|
+
});
|
|
3379
|
+
return {
|
|
3380
|
+
caseDetail,
|
|
3381
|
+
caseRow: {
|
|
3382
|
+
caseId: evalCase.id,
|
|
3383
|
+
evalId: evalMeta.id,
|
|
3384
|
+
status: caseRowUpdate.status ?? "pending",
|
|
3385
|
+
latencyMs: caseRowUpdate.latencyMs ?? null,
|
|
3386
|
+
columns: caseRowUpdate.columns ?? {},
|
|
3387
|
+
trial
|
|
3388
|
+
}
|
|
3389
|
+
};
|
|
3390
|
+
},
|
|
3391
|
+
onComplete: async ({ caseDetail, caseRow }) => {
|
|
3392
|
+
trialResults.push({
|
|
3393
|
+
caseDetail,
|
|
3394
|
+
caseRow,
|
|
3395
|
+
bufferedCacheStore
|
|
3396
|
+
});
|
|
3397
|
+
if (trialResults.length !== request.trials) return;
|
|
3398
|
+
await finalizePreparedCase({
|
|
3399
|
+
runState,
|
|
3400
|
+
runDir,
|
|
3401
|
+
preparedEval,
|
|
3402
|
+
preparedCase,
|
|
3403
|
+
onCaseFinished,
|
|
3404
|
+
emitEvent
|
|
3405
|
+
});
|
|
3406
|
+
}
|
|
3407
|
+
});
|
|
3408
|
+
}
|
|
3459
3409
|
}
|
|
3460
|
-
}
|
|
3410
|
+
});
|
|
3461
3411
|
});
|
|
3462
3412
|
} catch (error) {
|
|
3463
3413
|
console.error(`Error running eval ${evalMeta.id}:`, error);
|
|
@@ -3475,37 +3425,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3475
3425
|
}
|
|
3476
3426
|
}
|
|
3477
3427
|
await executeQueuedCases({
|
|
3478
|
-
runState,
|
|
3479
3428
|
queuedCases,
|
|
3480
3429
|
concurrency: getConfiguredConcurrency(),
|
|
3481
3430
|
globalTraceDisplay: config.traceDisplay
|
|
3482
3431
|
});
|
|
3483
3432
|
for (const preparedEval of preparedEvals) {
|
|
3484
|
-
for (const preparedCase of preparedEval.preparedCases) {
|
|
3485
|
-
|
|
3486
|
-
|
|
3487
|
-
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
runState.cases.push(winningTrial.caseRow);
|
|
3493
|
-
runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
|
|
3494
|
-
preparedEval.mergeColumns(winningTrial.caseDetail.columns);
|
|
3495
|
-
if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
|
|
3496
|
-
else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
|
|
3497
|
-
else runState.summary.failedCases++;
|
|
3498
|
-
await writeFile(join(runDir, "traces", `${preparedCase.caseId}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
|
|
3499
|
-
await persistCaseDetail(runDir, winningTrial.caseDetail);
|
|
3500
|
-
emitEvent(runState, {
|
|
3501
|
-
type: "case.finished",
|
|
3502
|
-
runId: runState.manifest.id,
|
|
3503
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3504
|
-
payload: winningTrial.caseRow
|
|
3505
|
-
});
|
|
3506
|
-
preparedEval.evalCaseRows.push(winningTrial.caseRow);
|
|
3507
|
-
allCaseRows.push(winningTrial.caseRow);
|
|
3508
|
-
}
|
|
3433
|
+
for (const preparedCase of preparedEval.preparedCases) await finalizePreparedCase({
|
|
3434
|
+
runState,
|
|
3435
|
+
runDir,
|
|
3436
|
+
preparedEval,
|
|
3437
|
+
preparedCase,
|
|
3438
|
+
onCaseFinished,
|
|
3439
|
+
emitEvent
|
|
3440
|
+
});
|
|
3509
3441
|
preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
|
|
3510
3442
|
lastRunStatusMap.set(preparedEval.evalMeta.id, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
|
|
3511
3443
|
const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.id) ?? null;
|
|
@@ -3516,9 +3448,11 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3516
3448
|
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.id] ?? null
|
|
3517
3449
|
});
|
|
3518
3450
|
}
|
|
3451
|
+
sortCaseRowsByPreparedOrder(runState.cases, preparedEvals);
|
|
3452
|
+
for (const preparedEval of preparedEvals) sortCaseRowsByPreparedOrder(preparedEval.evalCaseRows, preparedEvals);
|
|
3519
3453
|
const endTime = /* @__PURE__ */ new Date();
|
|
3520
3454
|
runState.summary.totalDurationMs = endTime.getTime() - new Date(runState.manifest.startedAt).getTime();
|
|
3521
|
-
const finalStatus =
|
|
3455
|
+
const finalStatus = evalErrors.length > 0 ? "error" : "completed";
|
|
3522
3456
|
runState.summary.status = finalStatus;
|
|
3523
3457
|
runState.manifest.status = finalStatus;
|
|
3524
3458
|
const completedRunAt = endTime.toISOString();
|
|
@@ -3540,6 +3474,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3540
3474
|
evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalId] ?? null
|
|
3541
3475
|
});
|
|
3542
3476
|
}
|
|
3477
|
+
await persistRunState(runState);
|
|
3543
3478
|
emitEvent(runState, {
|
|
3544
3479
|
type: "run.summary",
|
|
3545
3480
|
runId: runState.manifest.id,
|
|
@@ -3558,7 +3493,6 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3558
3493
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3559
3494
|
payload: runState.summary
|
|
3560
3495
|
});
|
|
3561
|
-
await persistRunState(runState);
|
|
3562
3496
|
emitDiscoveryEvent();
|
|
3563
3497
|
} catch (error) {
|
|
3564
3498
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -3566,13 +3500,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
3566
3500
|
runState.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
3567
3501
|
runState.summary.status = "error";
|
|
3568
3502
|
runState.summary.errorMessage = message;
|
|
3503
|
+
await persistRunState(runState);
|
|
3569
3504
|
emitEvent(runState, {
|
|
3570
3505
|
type: "run.error",
|
|
3571
3506
|
runId: runState.manifest.id,
|
|
3572
3507
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3573
3508
|
payload: { message }
|
|
3574
3509
|
});
|
|
3575
|
-
await persistRunState(runState);
|
|
3576
3510
|
emitDiscoveryEvent();
|
|
3577
3511
|
}
|
|
3578
3512
|
}
|
|
@@ -3580,896 +3514,4 @@ function toLastRunStatus(status) {
|
|
|
3580
3514
|
return status === "pending" ? null : status;
|
|
3581
3515
|
}
|
|
3582
3516
|
//#endregion
|
|
3583
|
-
|
|
3584
|
-
const globMagicCharacters = new Set([
|
|
3585
|
-
"*",
|
|
3586
|
-
"?",
|
|
3587
|
-
"[",
|
|
3588
|
-
"]",
|
|
3589
|
-
"{",
|
|
3590
|
-
"}",
|
|
3591
|
-
"(",
|
|
3592
|
-
")",
|
|
3593
|
-
"!",
|
|
3594
|
-
"+",
|
|
3595
|
-
"@"
|
|
3596
|
-
]);
|
|
3597
|
-
function hasGlobMagic(value) {
|
|
3598
|
-
for (const char of value) if (globMagicCharacters.has(char)) return true;
|
|
3599
|
-
return false;
|
|
3600
|
-
}
|
|
3601
|
-
function getWatchRootForIncludePattern(params) {
|
|
3602
|
-
const segments = params.pattern.replaceAll("\\", "/").split("/").filter((part) => part !== "");
|
|
3603
|
-
const firstGlobSegmentIndex = segments.findIndex(hasGlobMagic);
|
|
3604
|
-
if (firstGlobSegmentIndex === -1) return dirname(resolve(params.workspaceRoot, params.pattern));
|
|
3605
|
-
if (firstGlobSegmentIndex === 0) return params.workspaceRoot;
|
|
3606
|
-
return resolve(params.workspaceRoot, segments.slice(0, firstGlobSegmentIndex).join("/"));
|
|
3607
|
-
}
|
|
3608
|
-
function getWatchRootsForIncludePatterns(params) {
|
|
3609
|
-
const roots = /* @__PURE__ */ new Set();
|
|
3610
|
-
for (const pattern of params.patterns) roots.add(getWatchRootForIncludePattern({
|
|
3611
|
-
pattern,
|
|
3612
|
-
workspaceRoot: params.workspaceRoot
|
|
3613
|
-
}));
|
|
3614
|
-
if (roots.size === 0) return [params.workspaceRoot];
|
|
3615
|
-
return [...roots];
|
|
3616
|
-
}
|
|
3617
|
-
/** Create an in-memory eval runner bound to the current workspace config. */
|
|
3618
|
-
function createRunner({ watchForChanges = true } = {}) {
|
|
3619
|
-
let config;
|
|
3620
|
-
let workspaceRoot;
|
|
3621
|
-
let localStateDir;
|
|
3622
|
-
let cacheStore;
|
|
3623
|
-
const evals = /* @__PURE__ */ new Map();
|
|
3624
|
-
const runs = /* @__PURE__ */ new Map();
|
|
3625
|
-
const lastRunStatusMap = /* @__PURE__ */ new Map();
|
|
3626
|
-
const latestRunInfoMap = /* @__PURE__ */ new Map();
|
|
3627
|
-
const discoveryListeners = /* @__PURE__ */ new Set();
|
|
3628
|
-
let nextShortIdNum = 0;
|
|
3629
|
-
let discoveryWatcher;
|
|
3630
|
-
let discoveryRefreshTimer;
|
|
3631
|
-
function toWorkspaceRelativePath(filePath) {
|
|
3632
|
-
return relative(workspaceRoot, filePath).replaceAll("\\", "/");
|
|
3633
|
-
}
|
|
3634
|
-
function getSortedEvalMetas() {
|
|
3635
|
-
return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
|
|
3636
|
-
}
|
|
3637
|
-
function getSourceFingerprint(source) {
|
|
3638
|
-
return createHash("sha256").update(source).digest("hex");
|
|
3639
|
-
}
|
|
3640
|
-
function getConfiguredConcurrency() {
|
|
3641
|
-
const configuredConcurrency = config.concurrency;
|
|
3642
|
-
if (typeof configuredConcurrency !== "number" || !Number.isFinite(configuredConcurrency)) return 1;
|
|
3643
|
-
return Math.max(1, Math.floor(configuredConcurrency));
|
|
3644
|
-
}
|
|
3645
|
-
const runner = {
|
|
3646
|
-
async init() {
|
|
3647
|
-
config = await loadConfig();
|
|
3648
|
-
workspaceRoot = config.workspaceRoot ?? process.cwd();
|
|
3649
|
-
localStateDir = resolve(workspaceRoot, ".agent-evals");
|
|
3650
|
-
await mkdir(localStateDir, { recursive: true });
|
|
3651
|
-
await mkdir(join(localStateDir, "runs"), { recursive: true });
|
|
3652
|
-
cacheStore = createFsCacheStore({
|
|
3653
|
-
workspaceRoot,
|
|
3654
|
-
dir: config.cache?.dir,
|
|
3655
|
-
maxEntriesPerEval: config.cache?.maxEntriesPerEval
|
|
3656
|
-
});
|
|
3657
|
-
await loadPersistedRuns();
|
|
3658
|
-
await runner.refreshDiscovery();
|
|
3659
|
-
if (watchForChanges) await setupWatcher();
|
|
3660
|
-
},
|
|
3661
|
-
async listCache() {
|
|
3662
|
-
return cacheStore.list();
|
|
3663
|
-
},
|
|
3664
|
-
async clearCache(filter) {
|
|
3665
|
-
await cacheStore.clear(filter);
|
|
3666
|
-
},
|
|
3667
|
-
async recomputeStatusesForEval(evalId) {
|
|
3668
|
-
const evalMeta = evals.get(evalId);
|
|
3669
|
-
if (!evalMeta) return { updatedRuns: 0 };
|
|
3670
|
-
const registry = getEvalRegistry();
|
|
3671
|
-
await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
|
|
3672
|
-
const entry = registry.get(evalId);
|
|
3673
|
-
if (!entry) return { updatedRuns: 0 };
|
|
3674
|
-
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
3675
|
-
entry.use((evalDef) => {
|
|
3676
|
-
for (const [key, def] of Object.entries(evalDef.scores ?? {})) {
|
|
3677
|
-
const threshold = normalizeScoreDef(def).passThreshold;
|
|
3678
|
-
if (threshold !== void 0) scoreThresholds.set(key, threshold);
|
|
3679
|
-
}
|
|
3680
|
-
for (const [key, def] of Object.entries(evalDef.manualScores ?? {})) if (def.passThreshold !== void 0) scoreThresholds.set(key, def.passThreshold);
|
|
3681
|
-
});
|
|
3682
|
-
const updatedRuns = await recomputeEvalStatusesInRuns({
|
|
3683
|
-
runs: runs.values(),
|
|
3684
|
-
evalId,
|
|
3685
|
-
evalExists: evals.has(evalId),
|
|
3686
|
-
scoreThresholds,
|
|
3687
|
-
persistCaseDetail
|
|
3688
|
-
});
|
|
3689
|
-
emitDiscoveryEvent();
|
|
3690
|
-
return { updatedRuns };
|
|
3691
|
-
},
|
|
3692
|
-
async cleanRunsForEval(evalId) {
|
|
3693
|
-
let deletedRuns = 0;
|
|
3694
|
-
for (const [runId, run] of [...runs]) {
|
|
3695
|
-
if (!runTouchesEval({
|
|
3696
|
-
target: run.manifest.target,
|
|
3697
|
-
caseRows: run.cases,
|
|
3698
|
-
evalId,
|
|
3699
|
-
evalExists: evals.has(evalId)
|
|
3700
|
-
})) continue;
|
|
3701
|
-
if (run.manifest.status === "running") continue;
|
|
3702
|
-
runs.delete(runId);
|
|
3703
|
-
await rm(run.runDir, {
|
|
3704
|
-
recursive: true,
|
|
3705
|
-
force: true
|
|
3706
|
-
});
|
|
3707
|
-
deletedRuns += 1;
|
|
3708
|
-
}
|
|
3709
|
-
emitDiscoveryEvent();
|
|
3710
|
-
return { deletedRuns };
|
|
3711
|
-
},
|
|
3712
|
-
async updateManualScore({ runId, caseId, scoreKey, value }) {
|
|
3713
|
-
const run = runs.get(runId);
|
|
3714
|
-
if (!run) return {
|
|
3715
|
-
updated: false,
|
|
3716
|
-
reason: "Run not found"
|
|
3717
|
-
};
|
|
3718
|
-
if (run.manifest.status === "running") return {
|
|
3719
|
-
updated: false,
|
|
3720
|
-
reason: "Run is still running"
|
|
3721
|
-
};
|
|
3722
|
-
const caseRow = run.cases.find((row) => row.caseId === caseId);
|
|
3723
|
-
if (!caseRow) return {
|
|
3724
|
-
updated: false,
|
|
3725
|
-
reason: "Case not found"
|
|
3726
|
-
};
|
|
3727
|
-
const evalMeta = evals.get(caseRow.evalId);
|
|
3728
|
-
if (!evalMeta) return {
|
|
3729
|
-
updated: false,
|
|
3730
|
-
reason: "Eval not found"
|
|
3731
|
-
};
|
|
3732
|
-
if (evalMeta.columnDefs.find((def) => def.key === scoreKey)?.isManualScore !== true) return {
|
|
3733
|
-
updated: false,
|
|
3734
|
-
reason: "Manual score not found"
|
|
3735
|
-
};
|
|
3736
|
-
const caseDetail = run.caseDetails.get(caseId);
|
|
3737
|
-
if (!caseDetail) return {
|
|
3738
|
-
updated: false,
|
|
3739
|
-
reason: "Case detail not found"
|
|
3740
|
-
};
|
|
3741
|
-
caseRow.columns[scoreKey] = value;
|
|
3742
|
-
caseDetail.columns[scoreKey] = value;
|
|
3743
|
-
const scoreThresholds = /* @__PURE__ */ new Map();
|
|
3744
|
-
for (const def of evalMeta.columnDefs) {
|
|
3745
|
-
if (def.isScore !== true || def.passThreshold === void 0) continue;
|
|
3746
|
-
scoreThresholds.set(def.key, def.passThreshold);
|
|
3747
|
-
}
|
|
3748
|
-
const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds);
|
|
3749
|
-
caseRow.status = nextStatus;
|
|
3750
|
-
caseDetail.status = nextStatus;
|
|
3751
|
-
const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
|
|
3752
|
-
run.summary.totalCases = derivedSummary.totalCases;
|
|
3753
|
-
run.summary.passedCases = derivedSummary.passedCases;
|
|
3754
|
-
run.summary.failedCases = derivedSummary.failedCases;
|
|
3755
|
-
run.summary.errorCases = derivedSummary.errorCases;
|
|
3756
|
-
run.summary.cancelledCases = derivedSummary.cancelledCases;
|
|
3757
|
-
run.summary.totalDurationMs = derivedSummary.totalDurationMs;
|
|
3758
|
-
await persistCaseDetail(run.runDir, caseDetail);
|
|
3759
|
-
await persistRunState(run);
|
|
3760
|
-
emitDiscoveryEvent();
|
|
3761
|
-
return {
|
|
3762
|
-
updated: true,
|
|
3763
|
-
run: {
|
|
3764
|
-
manifest: run.manifest,
|
|
3765
|
-
summary: run.summary,
|
|
3766
|
-
cases: run.cases
|
|
3767
|
-
},
|
|
3768
|
-
caseDetail
|
|
3769
|
-
};
|
|
3770
|
-
},
|
|
3771
|
-
async deleteRun(runId) {
|
|
3772
|
-
const run = runs.get(runId);
|
|
3773
|
-
if (!run) return { deleted: false };
|
|
3774
|
-
if (run.manifest.status === "running") return { deleted: false };
|
|
3775
|
-
runs.delete(runId);
|
|
3776
|
-
await rm(run.runDir, {
|
|
3777
|
-
recursive: true,
|
|
3778
|
-
force: true
|
|
3779
|
-
});
|
|
3780
|
-
emitDiscoveryEvent();
|
|
3781
|
-
return { deleted: true };
|
|
3782
|
-
},
|
|
3783
|
-
getEvals() {
|
|
3784
|
-
const gitState = readGitWorktreeState(workspaceRoot);
|
|
3785
|
-
const result = [];
|
|
3786
|
-
for (const meta of getSortedEvalMetas()) result.push(buildEvalSummary({
|
|
3787
|
-
meta,
|
|
3788
|
-
config,
|
|
3789
|
-
gitState,
|
|
3790
|
-
latestRun: latestRunInfoMap.get(meta.id),
|
|
3791
|
-
lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
|
|
3792
|
-
}));
|
|
3793
|
-
return result;
|
|
3794
|
-
},
|
|
3795
|
-
getEval(id) {
|
|
3796
|
-
const meta = evals.get(id);
|
|
3797
|
-
if (!meta) return void 0;
|
|
3798
|
-
return buildEvalSummary({
|
|
3799
|
-
meta,
|
|
3800
|
-
config,
|
|
3801
|
-
gitState: readGitWorktreeState(workspaceRoot),
|
|
3802
|
-
latestRun: latestRunInfoMap.get(meta.id),
|
|
3803
|
-
lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
|
|
3804
|
-
});
|
|
3805
|
-
},
|
|
3806
|
-
async refreshDiscovery() {
|
|
3807
|
-
const patterns = config.include;
|
|
3808
|
-
const discovered = [];
|
|
3809
|
-
for (const pattern of patterns) {
|
|
3810
|
-
const files = await glob(pattern, {
|
|
3811
|
-
cwd: workspaceRoot,
|
|
3812
|
-
absolute: true
|
|
3813
|
-
});
|
|
3814
|
-
discovered.push(...files);
|
|
3815
|
-
}
|
|
3816
|
-
evals.clear();
|
|
3817
|
-
for (const filePath of discovered) try {
|
|
3818
|
-
const content = await readFile(filePath, "utf-8");
|
|
3819
|
-
const discoveredMetas = parseEvalMetas(filePath, content);
|
|
3820
|
-
const sourceFingerprint = getSourceFingerprint(content);
|
|
3821
|
-
const registry = getEvalRegistry();
|
|
3822
|
-
try {
|
|
3823
|
-
await loadEvalModule(filePath, sourceFingerprint);
|
|
3824
|
-
} catch {}
|
|
3825
|
-
for (const meta of discoveredMetas) {
|
|
3826
|
-
const discoveredEntry = registry.get(meta.id);
|
|
3827
|
-
const title = meta.title;
|
|
3828
|
-
let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
|
|
3829
|
-
let stats;
|
|
3830
|
-
let charts;
|
|
3831
|
-
discoveredEntry?.use((evalDef) => {
|
|
3832
|
-
columnDefs = buildDeclaredColumnDefs(evalDef.columns, evalDef.scores, evalDef.manualScores);
|
|
3833
|
-
stats = evalDef.stats;
|
|
3834
|
-
const validated = validateCharts({
|
|
3835
|
-
charts: evalDef.charts,
|
|
3836
|
-
columnDefs,
|
|
3837
|
-
evalId: meta.id
|
|
3838
|
-
});
|
|
3839
|
-
for (const warning of validated.warnings) console.warn(warning);
|
|
3840
|
-
charts = validated.charts;
|
|
3841
|
-
});
|
|
3842
|
-
evals.set(meta.id, {
|
|
3843
|
-
id: meta.id,
|
|
3844
|
-
title,
|
|
3845
|
-
filePath: toWorkspaceRelativePath(meta.filePath),
|
|
3846
|
-
sourceFilePath: meta.filePath,
|
|
3847
|
-
sourceFingerprint,
|
|
3848
|
-
columnDefs,
|
|
3849
|
-
caseCount: null,
|
|
3850
|
-
stats,
|
|
3851
|
-
charts
|
|
3852
|
-
});
|
|
3853
|
-
}
|
|
3854
|
-
} catch {}
|
|
3855
|
-
emitDiscoveryEvent();
|
|
3856
|
-
},
|
|
3857
|
-
async startRun(request) {
|
|
3858
|
-
const runId = generateRunId();
|
|
3859
|
-
const shortId = `r${String(nextShortIdNum++)}`;
|
|
3860
|
-
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
3861
|
-
const cacheMode = request.cache?.mode ?? "use";
|
|
3862
|
-
const runDir = join(localStateDir, "runs", runId);
|
|
3863
|
-
const manifest = {
|
|
3864
|
-
id: runId,
|
|
3865
|
-
shortId,
|
|
3866
|
-
status: "running",
|
|
3867
|
-
startedAt: now,
|
|
3868
|
-
endedAt: null,
|
|
3869
|
-
commitSha: readGitWorktreeState(workspaceRoot).commitSha,
|
|
3870
|
-
evalSourceFingerprints: {},
|
|
3871
|
-
target: request.target,
|
|
3872
|
-
trials: request.trials,
|
|
3873
|
-
trialSelection: config.trialSelection ?? "lowestScore",
|
|
3874
|
-
cacheMode
|
|
3875
|
-
};
|
|
3876
|
-
const summary = {
|
|
3877
|
-
runId,
|
|
3878
|
-
status: "running",
|
|
3879
|
-
totalCases: 0,
|
|
3880
|
-
passedCases: 0,
|
|
3881
|
-
failedCases: 0,
|
|
3882
|
-
errorCases: 0,
|
|
3883
|
-
cancelledCases: 0,
|
|
3884
|
-
totalDurationMs: null,
|
|
3885
|
-
errorMessage: null
|
|
3886
|
-
};
|
|
3887
|
-
const abortController = new AbortController();
|
|
3888
|
-
const runState = {
|
|
3889
|
-
runDir,
|
|
3890
|
-
manifest,
|
|
3891
|
-
summary,
|
|
3892
|
-
cases: [],
|
|
3893
|
-
caseDetails: /* @__PURE__ */ new Map(),
|
|
3894
|
-
listeners: /* @__PURE__ */ new Set(),
|
|
3895
|
-
abortController
|
|
3896
|
-
};
|
|
3897
|
-
runs.set(runId, runState);
|
|
3898
|
-
setLatestRunInfoMap({
|
|
3899
|
-
latestRunInfoMap,
|
|
3900
|
-
evalIds: getTargetEvalIds({
|
|
3901
|
-
request,
|
|
3902
|
-
sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
|
|
3903
|
-
knownEvalIds: new Set(evals.keys())
|
|
3904
|
-
}),
|
|
3905
|
-
info: {
|
|
3906
|
-
status: "running",
|
|
3907
|
-
startedAt: now,
|
|
3908
|
-
commitSha: manifest.commitSha ?? null,
|
|
3909
|
-
evalSourceFingerprint: null
|
|
3910
|
-
}
|
|
3911
|
-
});
|
|
3912
|
-
await mkdir(runDir, { recursive: true });
|
|
3913
|
-
await mkdir(join(runDir, "traces"), { recursive: true });
|
|
3914
|
-
await mkdir(join(runDir, "artifacts"), { recursive: true });
|
|
3915
|
-
await mkdir(join(runDir, "case-details"), { recursive: true });
|
|
3916
|
-
await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
|
|
3917
|
-
executeRun({
|
|
3918
|
-
runState,
|
|
3919
|
-
request,
|
|
3920
|
-
runDir,
|
|
3921
|
-
config,
|
|
3922
|
-
evals,
|
|
3923
|
-
cacheStore,
|
|
3924
|
-
lastRunStatusMap,
|
|
3925
|
-
latestRunInfoMap,
|
|
3926
|
-
emitEvent,
|
|
3927
|
-
emitDiscoveryEvent,
|
|
3928
|
-
getSourceFingerprint,
|
|
3929
|
-
getConfiguredConcurrency,
|
|
3930
|
-
getSortedEvalMetas,
|
|
3931
|
-
getTargetEvals
|
|
3932
|
-
});
|
|
3933
|
-
return {
|
|
3934
|
-
manifest,
|
|
3935
|
-
summary,
|
|
3936
|
-
cases: []
|
|
3937
|
-
};
|
|
3938
|
-
},
|
|
3939
|
-
getRuns() {
|
|
3940
|
-
return [...runs.values()].map((r) => r.manifest);
|
|
3941
|
-
},
|
|
3942
|
-
getRun(id) {
|
|
3943
|
-
const run = runs.get(id);
|
|
3944
|
-
if (!run) return void 0;
|
|
3945
|
-
return {
|
|
3946
|
-
manifest: run.manifest,
|
|
3947
|
-
summary: run.summary,
|
|
3948
|
-
cases: run.cases
|
|
3949
|
-
};
|
|
3950
|
-
},
|
|
3951
|
-
cancelRun(id) {
|
|
3952
|
-
const run = runs.get(id);
|
|
3953
|
-
if (!run) return;
|
|
3954
|
-
run.abortController.abort();
|
|
3955
|
-
run.manifest.status = "cancelled";
|
|
3956
|
-
run.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
3957
|
-
run.summary.status = "cancelled";
|
|
3958
|
-
emitEvent(run, {
|
|
3959
|
-
type: "run.cancelled",
|
|
3960
|
-
runId: id,
|
|
3961
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3962
|
-
payload: run.summary
|
|
3963
|
-
});
|
|
3964
|
-
},
|
|
3965
|
-
getCaseDetail(runId, caseId) {
|
|
3966
|
-
const run = runs.get(runId);
|
|
3967
|
-
if (!run) return void 0;
|
|
3968
|
-
return run.caseDetails.get(caseId);
|
|
3969
|
-
},
|
|
3970
|
-
subscribe(runId, listener) {
|
|
3971
|
-
const run = runs.get(runId);
|
|
3972
|
-
if (!run) return () => {};
|
|
3973
|
-
run.listeners.add(listener);
|
|
3974
|
-
return () => {
|
|
3975
|
-
run.listeners.delete(listener);
|
|
3976
|
-
};
|
|
3977
|
-
},
|
|
3978
|
-
subscribeDiscovery(listener) {
|
|
3979
|
-
discoveryListeners.add(listener);
|
|
3980
|
-
return () => {
|
|
3981
|
-
discoveryListeners.delete(listener);
|
|
3982
|
-
};
|
|
3983
|
-
},
|
|
3984
|
-
async close() {
|
|
3985
|
-
if (discoveryRefreshTimer !== void 0) {
|
|
3986
|
-
clearTimeout(discoveryRefreshTimer);
|
|
3987
|
-
discoveryRefreshTimer = void 0;
|
|
3988
|
-
}
|
|
3989
|
-
const watcher = discoveryWatcher;
|
|
3990
|
-
if (watcher === void 0) return;
|
|
3991
|
-
discoveryWatcher = void 0;
|
|
3992
|
-
await watcher.close();
|
|
3993
|
-
},
|
|
3994
|
-
getWorkspaceRoot() {
|
|
3995
|
-
return workspaceRoot;
|
|
3996
|
-
},
|
|
3997
|
-
getArtifactPath(artifactId_) {
|
|
3998
|
-
return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
|
|
3999
|
-
}
|
|
4000
|
-
};
|
|
4001
|
-
/**
 * Start watching the include-pattern roots and refresh discovery on changes.
 *
 * File and directory events are debounced by 50 ms into a single
 * `runner.refreshDiscovery()` call. The function resolves once the watcher
 * has emitted `ready`.
 */
async function setupWatcher() {
  const watcher = watch(getWatchRootsForIncludePatterns({
    patterns: config.include,
    workspaceRoot
  }), {
    ignoreInitial: true,
    persistent: true
  });
  discoveryWatcher = watcher;

  const describeError = (error) => error instanceof Error ? error.message : String(error);

  // Runs the refresh and reports a failure instead of letting it escape the
  // timer callback as an uncaught exception or unhandled rejection.
  const refreshSafely = async () => {
    try {
      await runner.refreshDiscovery();
    } catch (error) {
      console.warn(`Eval discovery refresh failed: ${describeError(error)}`);
    }
  };

  const scheduleRefresh = () => {
    if (discoveryRefreshTimer !== void 0) clearTimeout(discoveryRefreshTimer);
    discoveryRefreshTimer = setTimeout(() => {
      discoveryRefreshTimer = void 0;
      void refreshSafely();
    }, 50);
  };

  for (const eventName of ["change", "add", "unlink", "addDir", "unlinkDir"]) {
    watcher.on(eventName, scheduleRefresh);
  }
  // An "error" event with no listener is thrown by the emitter, so always attach one.
  watcher.on("error", (error) => {
    console.warn(`Eval discovery watcher error: ${describeError(error)}`);
  });

  await new Promise((ready) => {
    watcher.once("ready", ready);
  });
}
|
|
4026
|
-
/**
 * Recompute the per-eval run status maps and broadcast a discovery update.
 *
 * `lastRunStatusMap` and `latestRunInfoMap` are rebuilt in place from the
 * current runs and known evals, then every discovery listener receives a
 * `discovery.updated` event carrying `runner.getEvals()`.
 */
function emitDiscoveryEvent() {
  const lastRunStatuses = getLastRunStatuses({
    runs: runs.values(),
    knownEvals: evals.values()
  });
  const latestRunInfos = getLatestRunInfos({
    runs: runs.values(),
    knownEvals: evals.values()
  });

  // The maps are shared by reference with other code, so refill them rather than replace them.
  lastRunStatusMap.clear();
  for (const [evalId, status] of lastRunStatuses) lastRunStatusMap.set(evalId, status);
  latestRunInfoMap.clear();
  for (const [evalId, info] of latestRunInfos) latestRunInfoMap.set(evalId, info);

  const event = {
    type: "discovery.updated",
    timestamp: new Date().toISOString(),
    payload: runner.getEvals()
  };
  for (const listener of discoveryListeners) {
    try {
      listener(event);
    } catch {
      // Same best-effort policy as emitEvent: one failing listener must not
      // stop the remaining listeners or fail the caller.
    }
  }
}
|
|
4046
|
-
/**
 * Resolve the evals a run request targets.
 *
 * @param request Run request; only `target.evalIds` is read.
 * @returns The known evals for the requested ids in request order (unknown ids
 * are dropped), or all evals from `getSortedEvalMetas()` when no ids are given.
 */
function getTargetEvals(request) {
  const requestedIds = request.target.evalIds;
  if (!(requestedIds && requestedIds.length > 0)) return getSortedEvalMetas();
  return requestedIds.reduce((found, id) => {
    const meta = evals.get(id);
    if (meta !== void 0) found.push(meta);
    return found;
  }, []);
}
|
|
4050
|
-
/**
 * Deliver an event to every listener of a run.
 *
 * @param runState Run state whose `listeners` set is notified.
 * @param event Event object passed unchanged to each listener.
 */
function emitEvent(runState, event) {
  runState.listeners.forEach((listener) => {
    try {
      listener(event);
    } catch {
      // Best effort: a throwing listener is ignored so the others still run.
    }
  });
}
|
|
4055
|
-
/**
 * Replace the in-memory run map with the snapshots stored on disk.
 *
 * Also resets `nextShortIdNum` from those snapshots. Each restored run gets
 * an empty listener set and a fresh `AbortController`.
 */
async function loadPersistedRuns() {
  runs.clear();
  const snapshots = await loadPersistedRunSnapshots(localStateDir);
  nextShortIdNum = nextShortIdFromSnapshots(snapshots);
  for (const snapshot of snapshots) {
    const runId = snapshot.manifest.id;
    const restored = {
      ...snapshot,
      listeners: new Set(),
      abortController: new AbortController()
    };
    runs.set(runId, restored);
  }
}
|
|
4065
|
-
return runner;
|
|
4066
|
-
}
|
|
4067
|
-
//#endregion
|
|
4068
|
-
//#region src/cli.ts
|
|
4069
|
-
/**
 * Parse raw CLI arguments into the options object consumed by `runCli`.
 *
 * `--no-env` may appear anywhere and only toggles `loadEnv`. The first
 * remaining argument selects the command; `cache` may be followed by a
 * `list` / `clear` subcommand. Flags whose value is missing or invalid are
 * ignored and leave the default in place, as already done for `--cache`.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 * @returns The parsed arguments with defaults filled in.
 */
function parseArgs(argv) {
  const normalizedArgv = argv.filter((arg) => arg !== "--no-env");
  const args = {
    command: "help",
    subcommand: void 0,
    showHelp: false,
    helpTopic: "global",
    unknownHelpTarget: void 0,
    evalIds: [],
    caseIds: [],
    trials: 1,
    json: false,
    port: 4100,
    cacheMode: "use",
    clearCache: false,
    all: false,
    // True only when filtering removed nothing, i.e. no --no-env was passed.
    loadEnv: normalizedArgv.length === argv.length
  };

  const command = normalizedArgv[0];
  if (command === "--help" || command === "-h") {
    args.showHelp = true;
    return args;
  }
  if (isCliCommand(command)) {
    args.command = command;
    args.helpTopic = command === "help" ? "global" : command;
  } else if (command !== void 0 && !command.startsWith("-")) args.unknownHelpTarget = command;

  let cursor = 1;
  if (args.command === "cache") {
    const sub = normalizedArgv[cursor];
    if (sub === "list" || sub === "clear") {
      args.subcommand = sub;
      args.helpTopic = `cache ${sub}`;
      cursor++;
    } else if (sub !== void 0 && !sub.startsWith("-")) args.unknownHelpTarget = `cache ${sub}`;
  }

  // Splits a comma-separated id list, dropping blanks such as the middle of "a,,b".
  const splitIds = (value) => value.split(",").map((id) => id.trim()).filter((id) => id !== "");
  // Returns the integer when it lies within [min, max], otherwise undefined.
  const parseIntegerInRange = (value, min, max) => {
    if (value.trim() === "") return void 0;
    const parsed = Number(value);
    return Number.isInteger(parsed) && parsed >= min && parsed <= max ? parsed : void 0;
  };

  for (let i = cursor; i < normalizedArgv.length; i++) {
    const arg = normalizedArgv[i];
    const next = normalizedArgv[i + 1];
    switch (arg) {
      case "--help":
      case "-h":
        args.showHelp = true;
        break;
      case "--eval":
        if (next) {
          args.evalIds.push(...splitIds(next));
          i++;
        }
        break;
      case "--case":
        if (next) {
          args.caseIds.push(...splitIds(next));
          i++;
        }
        break;
      case "--trials":
        if (next) {
          // At least one trial; NaN, fractions, zero and negatives keep the default.
          args.trials = parseIntegerInRange(next, 1, Number.MAX_SAFE_INTEGER) ?? args.trials;
          i++;
        }
        break;
      case "--json":
        args.json = true;
        break;
      case "--port":
        if (next) {
          // Valid TCP port range.
          args.port = parseIntegerInRange(next, 0, 65535) ?? args.port;
          i++;
        }
        break;
      case "--cache":
        if (next) {
          if (next === "use" || next === "bypass" || next === "refresh") args.cacheMode = next;
          i++;
        }
        break;
      case "--no-cache":
        args.cacheMode = "bypass";
        break;
      case "--refresh-cache":
        args.cacheMode = "refresh";
        break;
      case "--clear-cache":
        args.clearCache = true;
        break;
      case "--all":
        args.all = true;
        break;
      default:
        break;
    }
  }
  return args;
}
|
|
4132
|
-
/**
 * Entry point of the Agent Evals CLI for the current workspace.
 *
 * Loads the workspace `.env` unless disabled, answers help requests, and
 * otherwise dispatches to the handler of the selected command. Commands
 * without a handler fall back to printing help.
 *
 * @param argv Raw command-line arguments excluding the executable name.
 */
async function runCli(argv) {
  const args = parseArgs(argv);

  const envReady = !args.loadEnv || loadWorkspaceEnv();
  if (!envReady) {
    process.exit(1);
    return;
  }

  if (args.showHelp) {
    if (args.unknownHelpTarget === void 0) {
      printHelp(args.helpTopic);
      return;
    }
    console.error(`No help found for "${args.unknownHelpTarget}".`);
    process.exit(1);
    return;
  }

  const handlers = new Map([
    ["app", commandApp],
    ["list", commandList],
    ["run", commandRun],
    ["cache", commandCache]
  ]);
  const handler = handlers.get(args.command);
  if (handler === void 0) {
    printHelp(args.helpTopic);
    return;
  }
  await handler(args);
}
|
|
4170
|
-
/**
 * Tell whether a raw argument names one of the CLI's commands.
 *
 * @param command First positional argument, possibly `undefined`.
 * @returns `true` for `app`, `list`, `run`, `cache` and `help`; `false` otherwise.
 */
function isCliCommand(command) {
  switch (command) {
    case "app":
    case "list":
    case "run":
    case "cache":
    case "help":
      return true;
    default:
      return false;
  }
}
|
|
4173
|
-
/**
 * Load the `.env` file of the current working directory, if there is one.
 *
 * @returns `true` when no `.env` exists or it loaded without error; `false`
 * after reporting a load failure on stderr.
 */
function loadWorkspaceEnv() {
  const envPath = resolve(process.cwd(), ".env");
  if (!existsSync(envPath)) return true;
  const { error } = resultify(() => {
    process.loadEnvFile(envPath);
  });
  if (!error) return true;
  console.error(`Failed to load .env at ${envPath}: ${error.message}`);
  return false;
}
|
|
4185
|
-
// Directory that contains this module file.
const currentDir = dirname(fileURLToPath(import.meta.url));
// Three levels above the module directory; used below to look for the repo's web workspace.
const repoRoot = resolve(currentDir, "..", "..", "..");
// Name of the pnpm executable on the current platform.
const pnpmCommand = process.platform !== "win32" ? "pnpm" : "pnpm.cmd";
|
|
4188
|
-
/**
 * Check whether the web workspace manifest exists under `repoRoot`.
 *
 * @returns `true` when `apps/web/package.json` is present below `repoRoot`.
 */
function hasRepoWebWorkspace() {
  const webManifestPath = resolve(repoRoot, "apps/web/package.json");
  return existsSync(webManifestPath);
}
|
|
4191
|
-
/**
 * Build the web UI through pnpm when running from inside the repository.
 *
 * Does nothing when the web workspace is absent. Otherwise runs
 * `pnpm --filter @agent-evals/web build` in `repoRoot` with inherited stdio
 * and waits for it to finish.
 *
 * @throws {Error} When the process cannot be spawned, is stopped by a signal,
 * or exits with a non-zero code.
 */
async function ensureWebUiIsBuilt() {
  if (!hasRepoWebWorkspace()) return;
  console.info("Preparing web UI...");
  await new Promise((resolvePromise, rejectPromise) => {
    const child = spawn(pnpmCommand, [
      "--filter",
      "@agent-evals/web",
      "build"
    ], {
      cwd: repoRoot,
      stdio: "inherit",
      // `.cmd` shims cannot be spawned directly on Windows; current Node
      // releases reject that with EINVAL unless a shell is used. The
      // arguments are fixed literals, so running through the shell is safe.
      shell: process.platform === "win32"
    });
    child.once("error", (error) => {
      rejectPromise(error);
    });
    child.once("exit", (code, signal) => {
      if (signal) {
        rejectPromise(new Error(`Web UI build stopped with signal ${signal}.`));
        return;
      }
      if (code !== 0) {
        rejectPromise(new Error(`Web UI build failed with exit code ${String(code)}.`));
        return;
      }
      resolvePromise();
    });
  });
}
|
|
4219
|
-
/**
 * Structural check for a module that exposes an `app` object with a `fetch` function.
 *
 * @param mod Value returned by a dynamic import.
 * @returns `true` only when `mod.app` is a non-null object whose `fetch` is a function.
 */
function isHonoAppModule(mod) {
  if (mod === null || typeof mod !== "object") return false;
  if (!("app" in mod)) return false;
  const candidate = mod.app;
  if (candidate === null || typeof candidate !== "object") return false;
  return "fetch" in candidate && typeof candidate.fetch === "function";
}
|
|
4224
|
-
/**
 * Structural check for a module that exposes an `initRunner` function.
 *
 * @param mod Value returned by a dynamic import.
 * @returns `true` only when `mod` is a non-null object whose `initRunner` is a function.
 */
function isServerRunnerModule(mod) {
  const isObject = typeof mod === "object" && mod !== null;
  return isObject && "initRunner" in mod && typeof mod.initRunner === "function";
}
|
|
4228
|
-
/**
 * `app` command: build the web UI if needed, then serve the app over HTTP.
 *
 * @param args Parsed CLI arguments; only `port` is read.
 * @throws {Error} When the app or runner chunk does not have the expected shape.
 */
async function commandApp(args) {
  await ensureWebUiIsBuilt();
  const { serve } = await import("@hono/node-server");

  // Set before the app chunk is imported; presumably read there — TODO confirm.
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;

  const appModule = await import("./app-7qDBq_ub.mjs");
  const runnerModule = await import("./runner-uzzY8kk1.mjs");
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");

  await runnerModule.initRunner();
  const { port } = args;
  console.info(`Agent Evals app: http://localhost:${String(port)}`);
  serve({
    fetch: appModule.app.fetch,
    port
  });
}
|
|
4244
|
-
/**
 * `list` command: print every discovered eval with its id, file and,
 * when available, status and case count.
 *
 * @param args_ Parsed CLI arguments (unused).
 */
async function commandList(args_) {
  const runner = createRunner({ watchForChanges: false });
  await runner.init();
  const discovered = runner.getEvals();
  if (discovered.length === 0) {
    console.info("No eval files found.");
    return;
  }
  console.info("Discovered evals:\n");
  for (const ev of discovered) {
    const { freshnessStatus, stale, outdated, lastRunStatus } = ev;
    const displayStatus = getEvalDisplayStatus({
      freshnessStatus,
      stale,
      outdated,
      lastRunStatus
    });
    const lines = [
      ` ${getEvalTitle(ev)}`,
      ` id: ${ev.id}`,
      ` file: ${ev.filePath}`
    ];
    // "pending" is the uninformative default, so it is not printed.
    if (displayStatus !== "pending") lines.push(` status: ${displayStatus}`);
    if (ev.caseCount !== null) lines.push(` cases: ${String(ev.caseCount)}`);
    lines.push("");
    for (const line of lines) console.info(line);
  }
}
|
|
4269
|
-
/**
 * `run` command: start a run, wait for it to finish and print its summary.
 *
 * The process exits with code 1 when the run can no longer be found, when
 * any case failed or errored, or when the run itself ended in the `error`
 * state, so a run-level failure is not reported as success.
 *
 * @param args Parsed CLI arguments; reads `clearCache`, `json`, `caseIds`,
 * `evalIds`, `trials` and `cacheMode`.
 */
async function commandRun(args) {
  const runner = createRunner({ watchForChanges: false });
  await runner.init();
  if (args.clearCache) {
    await runner.clearCache();
    if (!args.json) {
      console.info("Cleared cache before run.");
      console.info("");
    }
  }

  // Case ids take precedence; eval ids then only narrow the case selection.
  const target = args.caseIds.length > 0 ? {
    mode: "caseIds",
    caseIds: args.caseIds,
    evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
  } : args.evalIds.length > 0 ? {
    mode: "evalIds",
    evalIds: args.evalIds
  } : { mode: "all" };

  const run = await runner.startRun({
    target,
    trials: args.trials,
    cache: { mode: args.cacheMode }
  });
  if (!args.json) {
    console.info(`Run started: ${run.manifest.id}`);
    console.info(`Trials: ${String(args.trials)}`);
    if (args.cacheMode !== "use") console.info(`Cache mode: ${args.cacheMode}`);
    console.info("");
  }

  await waitForRunCompletion(runner, run.manifest.id);
  const finalRun = runner.getRun(run.manifest.id);
  if (!finalRun) {
    process.exit(1);
    return;
  }

  const { summary } = finalRun;
  const runErrored = finalRun.manifest.status === "error" || summary.status === "error";
  if (args.json) console.info(JSON.stringify(summary, null, 2));
  else {
    console.info("--- Run Summary ---");
    console.info(`Status: ${summary.status}`);
    console.info(`Total: ${String(summary.totalCases)}`);
    console.info(`Passed: ${String(summary.passedCases)}`);
    console.info(`Failed: ${String(summary.failedCases)}`);
    console.info(`Errors: ${String(summary.errorCases)}`);
    if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
    if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
    // Show why the run failed; JSON mode already carries errorMessage in the summary.
    if (typeof summary.errorMessage === "string" && summary.errorMessage !== "") {
      console.error(`Error: ${summary.errorMessage}`);
    }
  }

  if (runErrored || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
}
|
|
4318
|
-
/**
 * `cache` command: list cached operation entries or clear them.
 *
 * - `cache` / `cache list` prints all entries (as JSON with `--json`).
 * - `cache clear --eval <id>` removes entries whose namespace starts with `<id>__`.
 * - `cache clear --all` removes everything.
 * - `cache clear` without either flag refuses and exits with code 1.
 *
 * @param args Parsed CLI arguments; reads `subcommand`, `json`, `evalIds`,
 * `all` and `helpTopic`.
 */
async function commandCache(args) {
  const runner = createRunner({ watchForChanges: false });
  await runner.init();

  if (args.subcommand === "list" || args.subcommand === void 0) {
    const entries = await runner.listCache();
    if (args.json) {
      console.info(JSON.stringify(entries, null, 2));
      return;
    }
    if (entries.length === 0) {
      console.info("No cache entries.");
      return;
    }
    console.info(`Cache entries (${String(entries.length)}):\n`);
    for (const entry of entries) {
      console.info(` ${entry.namespace}`);
      console.info(` key: ${entry.key}`);
      const operationLabel = entry.operationType === "span" ? `${entry.operationName} (span ${entry.spanKind ?? "unknown"})` : `${entry.operationName} (value)`;
      console.info(` operation: ${operationLabel}`);
      console.info(` stored: ${entry.storedAt}`);
      console.info(` size: ${String(entry.sizeBytes)} bytes`);
      console.info("");
    }
    return;
  }

  if (args.subcommand === "clear") {
    if (args.evalIds.length > 0) {
      // Read the listing once instead of once per eval id, and collect each
      // matching entry a single time even when two prefixes overlap
      // (for example ids "a" and "a__b").
      const entries = await runner.listCache();
      const pending = new Map();
      for (const evalId of args.evalIds) {
        const prefix = `${evalId}__`;
        for (const entry of entries) {
          if (!entry.namespace.startsWith(prefix)) continue;
          const identity = JSON.stringify([entry.namespace, entry.key]);
          if (!pending.has(identity)) pending.set(identity, entry);
        }
      }
      for (const entry of pending.values()) await runner.clearCache({
        namespace: entry.namespace,
        key: entry.key
      });
      console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
      return;
    }
    if (args.all) {
      await runner.clearCache();
      console.info("Cleared all cache entries.");
      return;
    }
    // This is an error path (exit code 1), so report it on stderr.
    console.error("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
    process.exit(1);
    return;
  }

  printHelp(args.helpTopic);
}
|
|
4368
|
-
/**
 * Wait until a run has finished, polling the runner every 200 ms.
 *
 * A run counts as finished when it can no longer be found or its manifest
 * status is `completed`, `cancelled` or `error`. An exception thrown by
 * `runner.getRun` on any poll rejects the returned promise, so it cannot
 * escape from a timer callback and leave the promise pending.
 *
 * @param runner Object exposing `getRun(runId)`.
 * @param runId Identifier of the run to wait for.
 * @returns A promise that resolves with no value once the run is finished.
 */
async function waitForRunCompletion(runner, runId) {
  const pollIntervalMs = 200;
  const isFinished = () => {
    const run = runner.getRun(runId);
    if (!run) return true;
    const { status } = run.manifest;
    return status === "completed" || status === "cancelled" || status === "error";
  };
  // The first check runs synchronously, so a finished run adds no delay.
  while (!isFinished()) {
    await new Promise((wake) => {
      setTimeout(wake, pollIntervalMs);
    });
  }
}
|
|
4381
|
-
/**
 * Print the help page for a topic.
 *
 * Pages are assembled from shared flag rows so the same flag is described
 * once, and descriptions are aligned in a column per section.
 *
 * @param topic Help topic: `app`, `list`, `run`, `cache`, `cache list` or
 * `cache clear`. Any other value prints the global overview.
 */
function printHelp(topic = "global") {
  // Rows shared by several pages.
  const noEnv = ["--no-env", "Disable automatic .env loading"];
  const showThisHelp = ["--help, -h", "Show this help"];
  const port = ["--port <n>", "Server port (default: 4100)"];
  const runSelection = [
    ["--eval <id>", "Run specific eval(s) (comma-separated)"],
    ["--case <id>", "Run specific case(s) (comma-separated)"],
    ["--trials <n>", "Number of trials per case"]
  ];
  const cacheControl = [
    ["--cache <use|bypass|refresh>", "Cache mode for this run (default: use)"],
    ["--no-cache", "Shortcut for --cache bypass"],
    ["--refresh-cache", "Shortcut for --cache refresh"],
    ["--clear-cache", "Clear the cache before starting the run"]
  ];

  const pages = new Map([
    ["app", {
      title: "agent-evals app - Start server with UI",
      sections: [
        ["Usage", [["agent-evals app [flags]"]]],
        ["Flags", [port, noEnv, showThisHelp]]
      ]
    }],
    ["list", {
      title: "agent-evals list - List discovered evals",
      sections: [
        ["Usage", [["agent-evals list [flags]"]]],
        ["Flags", [noEnv, showThisHelp]]
      ]
    }],
    ["run", {
      title: "agent-evals run - Run evals",
      sections: [
        ["Usage", [["agent-evals run [flags]"]]],
        ["Flags", [
          ...runSelection,
          ["--json", "Output run summary as JSON"],
          ...cacheControl,
          noEnv,
          showThisHelp
        ]]
      ]
    }],
    ["cache", {
      title: "agent-evals cache - Manage cached operation entries",
      sections: [
        ["Usage", [
          ["agent-evals cache list [flags]"],
          ["agent-evals cache clear --eval <id>"],
          ["agent-evals cache clear --all"]
        ]],
        ["Flags", [
          ["--eval <id>", "Clear entries for specific eval(s) (comma-separated)"],
          ["--all", "Confirm clearing every cached entry"],
          ["--json", "Output cache listing as JSON"],
          noEnv,
          showThisHelp
        ]]
      ]
    }],
    ["global", {
      title: "agent-evals - LLM/Agent eval runner",
      sections: [
        ["Commands", [
          ["app", "Start server with UI"],
          ["list", "List discovered evals"],
          ["run", "Run evals"],
          ["cache list", "List cached operation entries"],
          ["cache clear --eval <id>", "Clear cache entries for one eval"],
          ["cache clear --all", "Clear every cached entry"],
          ["help", "Show this help"]
        ]],
        ["Options", [
          ...runSelection,
          ["--json", "Output results as JSON"],
          port,
          ...cacheControl,
          noEnv,
          ["--help, -h", "Show help"]
        ]]
      ]
    }]
  ]);

  // Indents each row and pads labels so the descriptions line up.
  const formatRows = (rows) => {
    const width = Math.max(...rows.map(([label]) => label.length));
    return rows.map(([label, text]) => text === void 0 ? `  ${label}` : `  ${label.padEnd(width)}  ${text}`);
  };

  // Both cache subcommands share the cache page; unknown topics show the overview.
  const pageKey = topic === "cache list" || topic === "cache clear" ? "cache" : topic;
  const page = pages.get(pageKey) ?? pages.get("global");
  const lines = [
    "",
    page.title,
    ...page.sections.flatMap(([heading, rows]) => ["", `${heading}:`, ...formatRows(rows)]),
    ""
  ];
  console.info(lines.join("\n"));
}
|
|
4474
|
-
//#endregion
|
|
4475
|
-
export { columnDefSchema as $, evalStatsConfigSchema as A, evalChartTypeSchema as B, spanCacheOptionsSchema as C, setEvalOutput as Ct, evalFreshnessStatusSchema as D, getEvalRegistry as Dt, caseRowSchema as E, defineEval as Et, evalChartBuiltinMetricSchema as F, traceAttributeDisplaySchema as G, traceAttributeDisplayFormatSchema as H, evalChartColorSchema as I, traceSpanErrorSchema as J, traceDisplayConfigSchema as K, evalChartConfigSchema as L, scoreTraceSchema as M, evalChartAggregateSchema as N, evalStatAggregateSchema as O, evalChartAxisSchema as P, cellValueSchema as Q, evalChartMetricSchema as R, serializedCacheSpanSchema as S, runInEvalScope as St, caseDetailSchema as T, repoFile as Tt, traceAttributeDisplayInputSchema as U, evalChartsConfigSchema as V, traceAttributeDisplayPlacementSchema as W, traceSpanSchema as X, traceSpanKindSchema as Y, traceSpanWarningSchema as Z, cacheListItemSchema as _, getCurrentScope as _t, sseEnvelopeSchema as a, repoFileRefSchema as at, cacheRecordingOpSchema as b, isInEvalScope as bt, deriveScopedSummaryFromCases as c, buildTraceTree as ct, runManifestSchema as d, evalTracer as dt, columnFormatSchema as et, runSummarySchema as f, hashCacheKey as ft, cacheFileSchema as g, evalAssert as gt, cacheEntrySchema as h, appendToEvalOutput as ht, updateManualScoreRequestSchema as i, numberDisplayOptionsSchema as it, evalSummarySchema as j, evalStatItemSchema as k, deriveStatusFromCaseRows as l, captureEvalSpanError as lt, trialSelectionModeSchema as m, EvalAssertionError as mt, createRunner as n, fileRefSchema as nt, getEvalTitle as o, runArtifactRefSchema as ot, agentEvalsConfigSchema as p, hashCacheKeySync as pt, traceDisplayInputConfigSchema as q, createRunRequestSchema as r, jsonCellSchema as rt, getEvalDisplayStatus as s, z$1 as st, runCli as t, columnKindSchema as tt, deriveStatusFromChildStatuses as u, evalSpan as ut, cacheModeSchema as v, getEvalCaseInput as vt, assertionFailureSchema as w, setScopeCacheContext as wt, 
cacheRecordingSchema as x, mergeEvalOutput as xt, cacheOperationTypeSchema as y, incrementEvalOutput as yt, evalChartTooltipExtraSchema as z };
|
|
3517
|
+
export { evalChartAxisSchema as $, runManifestSchema as A, evalTracer as At, cacheRecordingSchema as B, mergeEvalOutput as Bt, updateManualScoreRequestSchema as C, numberDisplayOptionsSchema as Ct, deriveScopedSummaryFromCases as D, buildTraceTree as Dt, getEvalDisplayStatus as E, z$1 as Et, cacheFileSchema as F, evalAssert as Ft, caseRowSchema as G, defineEval as Gt, spanCacheOptionsSchema as H, setEvalOutput as Ht, cacheListItemSchema as I, getCurrentScope as It, evalStatItemSchema as J, evalFreshnessStatusSchema as K, getEvalRegistry as Kt, cacheModeSchema as L, getEvalCaseInput as Lt, agentEvalsConfigSchema as M, hashCacheKeySync as Mt, trialSelectionModeSchema as N, EvalAssertionError as Nt, deriveStatusFromCaseRows as O, captureEvalSpanError as Ot, cacheEntrySchema as P, appendToEvalOutput as Pt, evalChartAggregateSchema as Q, cacheOperationTypeSchema as R, incrementEvalOutput as Rt, createRunRequestSchema as S, jsonCellSchema as St, getEvalTitle as T, runArtifactRefSchema as Tt, assertionFailureSchema as U, setScopeCacheContext as Ut, serializedCacheSpanSchema as V, runInEvalScope as Vt, caseDetailSchema as W, repoFile as Wt, evalSummarySchema as X, evalStatsConfigSchema as Y, scoreTraceSchema as Z, loadEvalModule as _, cellValueSchema as _t, loadPersistedRunSnapshot as a, evalChartTypeSchema as at, normalizeScoreDef as b, columnKindSchema as bt, persistCaseDetail as c, traceAttributeDisplayInputSchema as ct, recomputePersistedCaseStatus as d, traceDisplayConfigSchema as dt, evalChartBuiltinMetricSchema as et, runTouchesEval as f, traceDisplayInputConfigSchema as ft, setLatestRunInfoMap as g, traceSpanWarningSchema as gt, getTargetEvalIds as h, traceSpanSchema as ht, getLatestRunInfos as i, evalChartTooltipExtraSchema as it, runSummarySchema as j, hashCacheKey as jt, deriveStatusFromChildStatuses as k, evalSpan as kt, persistRunState as l, traceAttributeDisplayPlacementSchema as lt, buildEvalSummary as m, traceSpanKindSchema as mt, generateRunId as n, 
evalChartConfigSchema as nt, loadPersistedRunSnapshots as o, evalChartsConfigSchema as ot, resolveArtifactPath as p, traceSpanErrorSchema as pt, evalStatAggregateSchema as q, getLastRunStatuses as r, evalChartMetricSchema as rt, nextShortIdFromSnapshots as s, traceAttributeDisplayFormatSchema as st, executeRun as t, evalChartColorSchema as tt, recomputeEvalStatusesInRuns as u, traceAttributeDisplaySchema as ut, loadConfig as v, columnDefSchema as vt, sseEnvelopeSchema as w, repoFileRefSchema as wt, createFsCacheStore as x, fileRefSchema as xt, buildDeclaredColumnDefs as y, columnFormatSchema as yt, cacheRecordingOpSchema as z, isInEvalScope as zt };
|