@ls-stack/agent-eval 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,14 @@
1
+ import { createRequire, registerHooks } from "node:module";
1
2
  import { createHash } from "node:crypto";
2
3
  import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
3
- import { dirname, extname, join, relative, resolve } from "node:path";
4
+ import { extname, isAbsolute, join, relative, resolve } from "node:path";
4
5
  import { z, z as z$1 } from "zod/v4";
5
6
  import { AsyncLocalStorage } from "node:async_hooks";
6
7
  import { Buffer as Buffer$1 } from "node:buffer";
7
8
  import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
8
- import { watch } from "chokidar";
9
- import { glob } from "glob";
10
9
  import { existsSync } from "node:fs";
11
10
  import { resultify } from "t-result";
12
11
  import { fileURLToPath, pathToFileURL } from "node:url";
13
- import { spawn, spawnSync } from "node:child_process";
14
12
  //#region ../sdk/src/defineEval.ts
15
13
  const evalRegistry = /* @__PURE__ */ new Map();
16
14
  /** Return the in-memory registry of evals defined in the current process. */
@@ -2230,80 +2228,6 @@ function safeJsonParse(text) {
2230
2228
  return parsed.value;
2231
2229
  }
2232
2230
  //#endregion
2233
- //#region ../runner/src/chartValidation.ts
2234
- function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
2235
- const columnDef = columnsByKey.get(metric.key);
2236
- if (!columnDef) {
2237
- warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
2238
- return false;
2239
- }
2240
- if (metric.aggregate === "passThresholdRate") {
2241
- if (columnDef.isScore !== true || typeof columnDef.passThreshold !== "number") {
2242
- warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
2243
- return false;
2244
- }
2245
- }
2246
- return true;
2247
- }
2248
- function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
2249
- const columnDef = columnsByKey.get(extra.key);
2250
- if (!columnDef) {
2251
- warnings.push(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
2252
- return false;
2253
- }
2254
- if (extra.aggregate === "passThresholdRate") {
2255
- if (columnDef.isScore !== true || typeof columnDef.passThreshold !== "number") {
2256
- warnings.push(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
2257
- return false;
2258
- }
2259
- }
2260
- return true;
2261
- }
2262
- function sanitizeChart(chart, columnsByKey, evalId, warnings) {
2263
- const metrics = chart.metrics.filter((metric) => {
2264
- if (metric.source === "builtin") return true;
2265
- return isValidColumnMetric(metric, columnsByKey, evalId, warnings);
2266
- });
2267
- if (metrics.length === 0) {
2268
- warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
2269
- return null;
2270
- }
2271
- const tooltipExtras = chart.tooltipExtras?.filter((extra) => {
2272
- if (extra.source === "builtin") return true;
2273
- return isValidTooltipExtra(extra, columnsByKey, evalId, warnings);
2274
- });
2275
- return {
2276
- ...chart,
2277
- metrics,
2278
- tooltipExtras: tooltipExtras?.length ? tooltipExtras : void 0
2279
- };
2280
- }
2281
- /**
2282
- * Validate and sanitize an authored `charts` config against the eval's
2283
- * declared columns. Drops metrics/extras that reference unknown columns or
2284
- * misuse `passThresholdRate`, and drops entire charts whose metrics are all
2285
- * invalid. Returns `charts: undefined` when nothing valid remains so the UI
2286
- * falls back to rendering no chart (matching the opt-in default).
2287
- */
2288
- function validateCharts(params) {
2289
- const { charts, columnDefs, evalId } = params;
2290
- if (!charts || charts.length === 0) return {
2291
- charts: void 0,
2292
- warnings: []
2293
- };
2294
- const columnsByKey = new Map(columnDefs.map((def) => [def.key, def]));
2295
- const warnings = [];
2296
- const sanitized = [];
2297
- for (const chart of charts) {
2298
- const result = sanitizeChart(chart, columnsByKey, evalId, warnings);
2299
- if (result) sanitized.push(result);
2300
- }
2301
- return {
2302
- charts: sanitized.length > 0 ? sanitized : void 0,
2303
- warnings
2304
- };
2305
- }
2306
- //#endregion
2307
2231
  //#region ../runner/src/columnBuilder.ts
2308
2232
  /**
2309
2233
  * Normalize a user-provided score definition (either a function or an
@@ -2513,98 +2437,6 @@ async function loadConfig() {
2513
2437
  }
2514
2438
  }
2515
2439
  //#endregion
2516
- //#region ../runner/src/discovery.ts
2517
- const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
2518
- const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
2519
- function parseEvalMetas(filePath, content) {
2520
- const metas = [];
2521
- let searchIndex = 0;
2522
- while (searchIndex < content.length) {
2523
- const defineEvalIndex = content.indexOf("defineEval", searchIndex);
2524
- if (defineEvalIndex === -1) break;
2525
- const extracted = extractDefineEvalObject(content, defineEvalIndex);
2526
- if (!extracted) {
2527
- searchIndex = defineEvalIndex + 10;
2528
- continue;
2529
- }
2530
- const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
2531
- if (id !== void 0) {
2532
- const result = {
2533
- filePath,
2534
- id
2535
- };
2536
- const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
2537
- if (title !== void 0) result.title = title;
2538
- metas.push(result);
2539
- }
2540
- searchIndex = extracted.nextIndex;
2541
- }
2542
- return metas;
2543
- }
2544
- function extractDefineEvalObject(content, defineEvalIndex) {
2545
- const openParenIndex = content.indexOf("(", defineEvalIndex);
2546
- if (openParenIndex === -1) return void 0;
2547
- const objectStartIndex = content.indexOf("{", openParenIndex);
2548
- if (objectStartIndex === -1) return void 0;
2549
- let depth = 0;
2550
- let quote;
2551
- let inBlockComment = false;
2552
- let inLineComment = false;
2553
- let isEscaped = false;
2554
- for (let index = objectStartIndex; index < content.length; index++) {
2555
- const currentChar = content[index];
2556
- const nextChar = content[index + 1];
2557
- if (inLineComment) {
2558
- if (currentChar === "\n") inLineComment = false;
2559
- continue;
2560
- }
2561
- if (inBlockComment) {
2562
- if (currentChar === "*" && nextChar === "/") {
2563
- inBlockComment = false;
2564
- index++;
2565
- }
2566
- continue;
2567
- }
2568
- if (quote) {
2569
- if (isEscaped) {
2570
- isEscaped = false;
2571
- continue;
2572
- }
2573
- if (currentChar === "\\") {
2574
- isEscaped = true;
2575
- continue;
2576
- }
2577
- if (currentChar === quote) quote = void 0;
2578
- continue;
2579
- }
2580
- if (currentChar === "/" && nextChar === "/") {
2581
- inLineComment = true;
2582
- index++;
2583
- continue;
2584
- }
2585
- if (currentChar === "/" && nextChar === "*") {
2586
- inBlockComment = true;
2587
- index++;
2588
- continue;
2589
- }
2590
- if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
2591
- quote = currentChar;
2592
- continue;
2593
- }
2594
- if (currentChar === "{") {
2595
- depth++;
2596
- continue;
2597
- }
2598
- if (currentChar === "}") {
2599
- depth--;
2600
- if (depth === 0) return {
2601
- nextIndex: index + 1,
2602
- objectText: content.slice(objectStartIndex, index + 1)
2603
- };
2604
- }
2605
- }
2606
- }
2607
- //#endregion
2608
2440
  //#region ../runner/src/evalModuleLoader.ts
2609
2441
  /**
2610
2442
  * Import one eval module with a cache key derived from its current source so
@@ -2692,30 +2524,6 @@ function setLatestRunInfoMap(params) {
2692
2524
  for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
2693
2525
  }
2694
2526
  //#endregion
2695
- //#region ../runner/src/gitState.ts
2696
- function runGitCommand(workspaceRoot, args) {
2697
- const result = spawnSync("git", args, {
2698
- cwd: workspaceRoot,
2699
- encoding: "utf8",
2700
- stdio: [
2701
- "ignore",
2702
- "pipe",
2703
- "ignore"
2704
- ]
2705
- });
2706
- return {
2707
- status: result.status,
2708
- stdout: result.stdout.trim()
2709
- };
2710
- }
2711
- /** Read the current git commit for the workspace, if available. */
2712
- function readGitWorktreeState(workspaceRoot) {
2713
- const insideWorktree = runGitCommand(workspaceRoot, ["rev-parse", "--is-inside-work-tree"]);
2714
- if (insideWorktree.status !== 0 || insideWorktree.stdout !== "true") return { commitSha: null };
2715
- const commitResult = runGitCommand(workspaceRoot, ["rev-parse", "HEAD"]);
2716
- return { commitSha: commitResult.status === 0 ? commitResult.stdout : null };
2717
- }
2718
- //#endregion
2719
2527
  //#region ../runner/src/outputArtifacts.ts
2720
2528
  const mimeTypeExtensionMap = {
2721
2529
  "application/json": ".json",
@@ -2867,6 +2675,235 @@ async function recomputeEvalStatusesInRuns(params) {
2867
2675
  return updatedRuns;
2868
2676
  }
2869
2677
  //#endregion
2678
+ //#region ../runner/src/runPersistence.ts
2679
+ const SHORT_ID_PATTERN = /^r(\d+)$/;
2680
+ /**
2681
+ * Generate a filesystem-safe, sortable run id combining a UTC timestamp
2682
+ * with a short random suffix.
2683
+ */
2684
+ function generateRunId() {
2685
+ const now = /* @__PURE__ */ new Date();
2686
+ const pad = (n) => String(n).padStart(2, "0");
2687
+ return `${`${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}T${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}Z`}_${Math.random().toString(36).slice(2, 8)}`;
2688
+ }
2689
+ function parseShortIdNum(shortId) {
2690
+ if (shortId === void 0) return null;
2691
+ const match = SHORT_ID_PATTERN.exec(shortId);
2692
+ if (!match) return null;
2693
+ const num = Number(match[1]);
2694
+ if (!Number.isFinite(num)) return null;
2695
+ return num;
2696
+ }
2697
+ /**
2698
+ * Return the next `shortId` number to assign based on the existing
2699
+ * loaded snapshots. Legacy runs that don't match the `r\d+` format are
2700
+ * ignored.
2701
+ */
2702
+ function nextShortIdFromSnapshots(snapshots) {
2703
+ let maxNum = -1;
2704
+ for (const snapshot of snapshots) {
2705
+ const num = parseShortIdNum(snapshot.manifest.shortId);
2706
+ if (num !== null && num > maxNum) maxNum = num;
2707
+ }
2708
+ return maxNum + 1;
2709
+ }
2710
+ async function loadPersistedRunSnapshots(localStateDir) {
2711
+ const runsDir = join(localStateDir, "runs");
2712
+ const entriesResult = await resultify(() => readdir(runsDir, { withFileTypes: true }));
2713
+ if (entriesResult.error) return [];
2714
+ const snapshots = [];
2715
+ const runDirs = entriesResult.value.filter((entry) => entry.isDirectory()).map((entry) => join(runsDir, entry.name)).toSorted();
2716
+ for (const runDir of runDirs) {
2717
+ const snapshot = await loadPersistedRunSnapshot(runDir);
2718
+ if (!snapshot) continue;
2719
+ snapshots.push(snapshot);
2720
+ }
2721
+ return snapshots;
2722
+ }
2723
+ async function persistCaseDetail(runDir, caseDetail) {
2724
+ await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(caseDetail.caseId)}.json`), JSON.stringify(caseDetail, null, 2));
2725
+ }
2726
+ function getLastRunStatuses(params) {
2727
+ const latestRunInfos = getLatestRunInfos(params);
2728
+ return new Map([...latestRunInfos].map(([evalId, info]) => [evalId, info.status]));
2729
+ }
2730
+ /**
2731
+ * Return the latest scoped run metadata for each eval based on persisted and
2732
+ * in-memory runs.
2733
+ */
2734
+ function getLatestRunInfos(params) {
2735
+ const { runs, knownEvals } = params;
2736
+ const knownEvalMetas = [...knownEvals];
2737
+ const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
2738
+ const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
2739
+ const latestRunInfos = /* @__PURE__ */ new Map();
2740
+ for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
2741
+ status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
2742
+ startedAt: getRunFreshnessTimestamp(run.manifest),
2743
+ commitSha: run.manifest.commitSha ?? null,
2744
+ evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
2745
+ });
2746
+ return latestRunInfos;
2747
+ }
2748
+ function toLastRunStatus$1(status) {
2749
+ return status === "pending" ? null : status;
2750
+ }
2751
+ async function loadPersistedRunSnapshot(runDir) {
2752
+ const manifest = await readParsedJsonFile(join(runDir, "run.json"), { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
2753
+ if (!manifest) return null;
2754
+ const summary = await readParsedJsonFile(join(runDir, "summary.json"), { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
2755
+ if (!summary) return null;
2756
+ return {
2757
+ runDir,
2758
+ manifest,
2759
+ summary,
2760
+ cases: await readCaseRows(runDir),
2761
+ caseDetails: await readCaseDetails(runDir)
2762
+ };
2763
+ }
2764
+ async function readParsedJsonFile(filePath, schema) {
2765
+ const fileResult = await resultify(() => readFile(filePath, "utf-8"));
2766
+ if (fileResult.error) return null;
2767
+ const jsonResult = resultify(() => JSON.parse(fileResult.value));
2768
+ if (jsonResult.error) return null;
2769
+ const parsed = schema.safeParse(jsonResult.value);
2770
+ if (!parsed.success) return null;
2771
+ return parsed.data;
2772
+ }
2773
+ async function readCaseRows(runDir) {
2774
+ const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
2775
+ if (fileResult.error) return [];
2776
+ const rows = [];
2777
+ for (const rawLine of fileResult.value.split("\n")) {
2778
+ const line = rawLine.trim();
2779
+ if (line.length === 0) continue;
2780
+ const jsonResult = resultify(() => JSON.parse(line));
2781
+ if (jsonResult.error) continue;
2782
+ const parsed = caseRowSchema.safeParse(jsonResult.value);
2783
+ if (!parsed.success) continue;
2784
+ rows.push(parsed.data);
2785
+ }
2786
+ return rows;
2787
+ }
2788
+ async function readCaseDetails(runDir) {
2789
+ const detailsDir = join(runDir, "case-details");
2790
+ const entriesResult = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
2791
+ if (entriesResult.error) return /* @__PURE__ */ new Map();
2792
+ const caseDetails = /* @__PURE__ */ new Map();
2793
+ for (const entry of entriesResult.value) {
2794
+ if (!entry.isFile() || !entry.name.endsWith(".json")) continue;
2795
+ const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
2796
+ if (!detail) continue;
2797
+ caseDetails.set(detail.caseId, detail);
2798
+ }
2799
+ return caseDetails;
2800
+ }
2801
+ function getRunEvalIds(run, knownEvalIds) {
2802
+ const evalIds = new Set(run.cases.map((caseRow) => caseRow.evalId));
2803
+ if (run.manifest.target.mode === "evalIds") for (const evalId of run.manifest.target.evalIds ?? []) evalIds.add(evalId);
2804
+ else if (run.manifest.target.mode === "all" && evalIds.size === 0) for (const evalId of knownEvalIds) evalIds.add(evalId);
2805
+ return [...evalIds];
2806
+ }
2807
+ function getEvalStatusForRun(run, evalId, manualScoreKeys) {
2808
+ const evalCases = run.cases.filter((caseRow) => caseRow.evalId === evalId);
2809
+ if (evalCases.length > 0) {
2810
+ if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
2811
+ return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
2812
+ }
2813
+ return toLastRunStatus$1(deriveStatusFromChildStatuses({
2814
+ statuses: [],
2815
+ lifecycleStatus: run.manifest.status
2816
+ }));
2817
+ }
2818
+ function hasPendingManualScores(caseRows, manualScoreKeys) {
2819
+ if (manualScoreKeys.length === 0) return false;
2820
+ return caseRows.some((caseRow) => manualScoreKeys.some((key) => {
2821
+ const value = caseRow.columns[key];
2822
+ return typeof value !== "number" || !Number.isFinite(value);
2823
+ }));
2824
+ }
2825
+ function encodeCaseDetailFileName(caseId) {
2826
+ return encodeURIComponent(caseId);
2827
+ }
2828
+ //#endregion
2829
+ //#region ../runner/src/moduleIsolation.ts
2830
+ const isolationParam = "agent-evals-isolate";
2831
+ const pathSegmentSeparatorPattern = /[\\/]+/;
2832
+ const isolationStorage = new AsyncLocalStorage();
2833
+ const activeIsolationRoots = /* @__PURE__ */ new Map();
2834
+ let hooksRegistered = false;
2835
+ const requireFromRunner = createRequire(import.meta.url);
2836
+ const agentPackageUrlBySpecifier = new Map([
2837
+ "@ls-stack/agent-eval",
2838
+ "@agent-evals/sdk",
2839
+ "@agent-evals/shared",
2840
+ "@agent-evals/runner",
2841
+ "@agent-evals/runner/run-child"
2842
+ ].flatMap((specifier) => {
2843
+ try {
2844
+ return [[specifier, pathToFileURL(requireFromRunner.resolve(specifier)).href]];
2845
+ } catch {
2846
+ return [];
2847
+ }
2848
+ }));
2849
+ function isAgentEvalsPackageSpecifier(specifier) {
2850
+ return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
2851
+ }
2852
+ function getIsolationKeyFromParent(parentURL) {
2853
+ if (!parentURL?.startsWith("file:")) return null;
2854
+ const value = new URL(parentURL).searchParams.get(isolationParam);
2855
+ return activeIsolationRoots.has(value ?? "") ? value : null;
2856
+ }
2857
+ function isWorkspaceFile(url, workspaceRoot) {
2858
+ if (url.protocol !== "file:") return false;
2859
+ const relativePath = relative(workspaceRoot, fileURLToPath(url));
2860
+ if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
2861
+ const segments = relativePath.split(pathSegmentSeparatorPattern);
2862
+ return !segments.includes("node_modules") && !segments.includes(".agent-evals");
2863
+ }
2864
+ function addIsolationParam(url, key) {
2865
+ const moduleUrl = new URL(url);
2866
+ if (moduleUrl.searchParams.get(isolationParam) === key) return url;
2867
+ moduleUrl.searchParams.set(isolationParam, key);
2868
+ return moduleUrl.href;
2869
+ }
2870
+ function registerModuleIsolationHooks() {
2871
+ if (hooksRegistered) return;
2872
+ hooksRegistered = true;
2873
+ registerHooks({ resolve(specifier, context, nextResolve) {
2874
+ const agentPackageUrl = agentPackageUrlBySpecifier.get(specifier);
2875
+ if (agentPackageUrl !== void 0) return {
2876
+ url: agentPackageUrl,
2877
+ shortCircuit: true
2878
+ };
2879
+ const resolved = nextResolve(specifier, context);
2880
+ if (isAgentEvalsPackageSpecifier(specifier)) return resolved;
2881
+ const activeContext = isolationStorage.getStore();
2882
+ const inferredKey = getIsolationKeyFromParent(context.parentURL);
2883
+ const isolationKey = activeContext?.key ?? inferredKey;
2884
+ if (isolationKey === null) return resolved;
2885
+ const workspaceRoot = activeContext?.workspaceRoot ?? activeIsolationRoots.get(isolationKey);
2886
+ if (workspaceRoot === void 0) return resolved;
2887
+ if (!isWorkspaceFile(new URL(resolved.url), workspaceRoot)) return resolved;
2888
+ return {
2889
+ ...resolved,
2890
+ url: addIsolationParam(resolved.url, isolationKey)
2891
+ };
2892
+ } });
2893
+ }
2894
+ /**
2895
+ * Execute module loading and eval code with fresh workspace module URLs.
2896
+ *
2897
+ * Node does not expose an ESM cache reset API, so the runner appends a
2898
+ * run-scoped query parameter to workspace file imports. Package imports are
2899
+ * left alone so SDK singletons, such as the eval registry, remain shared.
2900
+ */
2901
+ async function runWithModuleIsolation(context, fn) {
2902
+ registerModuleIsolationHooks();
2903
+ activeIsolationRoots.set(context.key, context.workspaceRoot);
2904
+ return await isolationStorage.run(context, fn);
2905
+ }
2906
+ //#endregion
2870
2907
  //#region ../runner/src/traceDisplay.ts
2871
2908
  function isRecord$1(value) {
2872
2909
  return typeof value === "object" && value !== null;
@@ -2954,12 +2991,16 @@ async function callWithUnknownResult(fn, args) {
2954
2991
  return await Reflect.apply(fn, void 0, args);
2955
2992
  }
2956
2993
  async function runCase(params) {
2957
- const { evalDef, evalId, evalCase, globalTraceDisplay, trial, signal, startTime, cacheAdapter, cacheMode, codeFingerprint, artifactDir, runId } = params;
2994
+ const { evalDef, evalId, evalCase, globalTraceDisplay, trial, startTime, cacheAdapter, cacheMode, codeFingerprint, moduleIsolation, artifactDir, runId } = params;
2958
2995
  const { scope, error: executeError } = await runInEvalScope(evalCase.id, async () => {
2959
- await Reflect.apply(evalDef.execute, evalDef, [{
2960
- input: evalCase.input,
2961
- signal
2962
- }]);
2996
+ const execute = async () => {
2997
+ await Reflect.apply(evalDef.execute, evalDef, [{ input: evalCase.input }]);
2998
+ };
2999
+ if (moduleIsolation === void 0) {
3000
+ await execute();
3001
+ return;
3002
+ }
3003
+ await runWithModuleIsolation(moduleIsolation, execute);
2963
3004
  }, {
2964
3005
  input: evalCase.input,
2965
3006
  cacheContext: cacheAdapter ? {
@@ -2996,11 +3037,15 @@ async function runCase(params) {
2996
3037
  const scoringTraces = {};
2997
3038
  if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
2998
3039
  const { compute, passThreshold, label } = normalizeScoreDef(def);
2999
- const scoreRun = await runInEvalScope(evalCase.id, async () => await callWithUnknownResult(compute, [{
3000
- input: evalCase.input,
3001
- outputs: { ...scope.outputs },
3002
- case: evalCase
3003
- }]), {
3040
+ const scoreRun = await runInEvalScope(evalCase.id, async () => {
3041
+ const computeScore = async () => await callWithUnknownResult(compute, [{
3042
+ input: evalCase.input,
3043
+ outputs: { ...scope.outputs },
3044
+ case: evalCase
3045
+ }]);
3046
+ if (moduleIsolation === void 0) return await computeScore();
3047
+ return await runWithModuleIsolation(moduleIsolation, computeScore);
3048
+ }, {
3004
3049
  input: evalCase.input,
3005
3050
  cacheContext: cacheAdapter ? {
3006
3051
  adapter: cacheAdapter,
@@ -3119,172 +3164,20 @@ function toAssertionFailure(message, error = void 0) {
3119
3164
  } : { message };
3120
3165
  }
3121
3166
  //#endregion
3122
- //#region ../runner/src/runPersistence.ts
3123
- const SHORT_ID_PATTERN = /^r(\d+)$/;
3124
- /**
3125
- * Generate a filesystem-safe, sortable run id combining a UTC timestamp
3126
- * with a short random suffix.
3127
- */
3128
- function generateRunId() {
3129
- const now = /* @__PURE__ */ new Date();
3130
- const pad = (n) => String(n).padStart(2, "0");
3131
- return `${`${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}T${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}Z`}_${Math.random().toString(36).slice(2, 8)}`;
3132
- }
3133
- function parseShortIdNum(shortId) {
3134
- if (shortId === void 0) return null;
3135
- const match = SHORT_ID_PATTERN.exec(shortId);
3136
- if (!match) return null;
3137
- const num = Number(match[1]);
3138
- if (!Number.isFinite(num)) return null;
3139
- return num;
3140
- }
3141
- /**
3142
- * Return the next `shortId` number to assign based on the existing
3143
- * loaded snapshots. Legacy runs that don't match the `r\d+` format are
3144
- * ignored.
3145
- */
3146
- function nextShortIdFromSnapshots(snapshots) {
3147
- let maxNum = -1;
3148
- for (const snapshot of snapshots) {
3149
- const num = parseShortIdNum(snapshot.manifest.shortId);
3150
- if (num !== null && num > maxNum) maxNum = num;
3151
- }
3152
- return maxNum + 1;
3153
- }
3154
- async function loadPersistedRunSnapshots(localStateDir) {
3155
- const runsDir = join(localStateDir, "runs");
3156
- const entriesResult = await resultify(() => readdir(runsDir, { withFileTypes: true }));
3157
- if (entriesResult.error) return [];
3158
- const snapshots = [];
3159
- const runDirs = entriesResult.value.filter((entry) => entry.isDirectory()).map((entry) => join(runsDir, entry.name)).toSorted();
3160
- for (const runDir of runDirs) {
3161
- const snapshot = await loadPersistedRunSnapshot(runDir);
3162
- if (!snapshot) continue;
3163
- snapshots.push(snapshot);
3164
- }
3165
- return snapshots;
3166
- }
3167
- async function persistCaseDetail(runDir, caseDetail) {
3168
- await writeFile(join(runDir, "case-details", `${encodeCaseDetailFileName(caseDetail.caseId)}.json`), JSON.stringify(caseDetail, null, 2));
3169
- }
3170
- function getLastRunStatuses(params) {
3171
- const latestRunInfos = getLatestRunInfos(params);
3172
- return new Map([...latestRunInfos].map(([evalId, info]) => [evalId, info.status]));
3173
- }
3174
- /**
3175
- * Return the latest scoped run metadata for each eval based on persisted and
3176
- * in-memory runs.
3177
- */
3178
- function getLatestRunInfos(params) {
3179
- const { runs, knownEvals } = params;
3180
- const knownEvalMetas = [...knownEvals];
3181
- const manualScoreKeysByEval = new Map(knownEvalMetas.map((evalMeta) => [evalMeta.id, evalMeta.columnDefs.filter((columnDef) => columnDef.isManualScore === true).map((columnDef) => columnDef.key)]));
3182
- const orderedRuns = [...runs].toSorted((a, b) => new Date(getRunFreshnessTimestamp(a.manifest)).getTime() - new Date(getRunFreshnessTimestamp(b.manifest)).getTime());
3183
- const latestRunInfos = /* @__PURE__ */ new Map();
3184
- for (const run of orderedRuns) for (const evalId of getRunEvalIds(run, knownEvalMetas.map((evalMeta) => evalMeta.id))) latestRunInfos.set(evalId, {
3185
- status: getEvalStatusForRun(run, evalId, manualScoreKeysByEval.get(evalId) ?? []),
3186
- startedAt: getRunFreshnessTimestamp(run.manifest),
3187
- commitSha: run.manifest.commitSha ?? null,
3188
- evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalId] ?? null
3189
- });
3190
- return latestRunInfos;
3191
- }
3192
- function toLastRunStatus$1(status) {
3193
- return status === "pending" ? null : status;
3194
- }
3195
- async function loadPersistedRunSnapshot(runDir) {
3196
- const manifest = await readParsedJsonFile(join(runDir, "run.json"), { safeParse: runManifestSchema.safeParse.bind(runManifestSchema) });
3197
- if (!manifest) return null;
3198
- const summary = await readParsedJsonFile(join(runDir, "summary.json"), { safeParse: runSummarySchema.safeParse.bind(runSummarySchema) });
3199
- if (!summary) return null;
3200
- return {
3201
- runDir,
3202
- manifest,
3203
- summary,
3204
- cases: await readCaseRows(runDir),
3205
- caseDetails: await readCaseDetails(runDir)
3206
- };
3207
- }
3208
- async function readParsedJsonFile(filePath, schema) {
3209
- const fileResult = await resultify(() => readFile(filePath, "utf-8"));
3210
- if (fileResult.error) return null;
3211
- const jsonResult = resultify(() => JSON.parse(fileResult.value));
3212
- if (jsonResult.error) return null;
3213
- const parsed = schema.safeParse(jsonResult.value);
3214
- if (!parsed.success) return null;
3215
- return parsed.data;
3216
- }
3217
- async function readCaseRows(runDir) {
3218
- const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
3219
- if (fileResult.error) return [];
3220
- const rows = [];
3221
- for (const rawLine of fileResult.value.split("\n")) {
3222
- const line = rawLine.trim();
3223
- if (line.length === 0) continue;
3224
- const jsonResult = resultify(() => JSON.parse(line));
3225
- if (jsonResult.error) continue;
3226
- const parsed = caseRowSchema.safeParse(jsonResult.value);
3227
- if (!parsed.success) continue;
3228
- rows.push(parsed.data);
3229
- }
3230
- return rows;
3231
- }
3232
- async function readCaseDetails(runDir) {
3233
- const detailsDir = join(runDir, "case-details");
3234
- const entriesResult = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
3235
- if (entriesResult.error) return /* @__PURE__ */ new Map();
3236
- const caseDetails = /* @__PURE__ */ new Map();
3237
- for (const entry of entriesResult.value) {
3238
- if (!entry.isFile() || !entry.name.endsWith(".json")) continue;
3239
- const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
3240
- if (!detail) continue;
3241
- caseDetails.set(detail.caseId, detail);
3242
- }
3243
- return caseDetails;
3244
- }
3245
- function getRunEvalIds(run, knownEvalIds) {
3246
- const evalIds = new Set(run.cases.map((caseRow) => caseRow.evalId));
3247
- if (run.manifest.target.mode === "evalIds") for (const evalId of run.manifest.target.evalIds ?? []) evalIds.add(evalId);
3248
- else if (run.manifest.target.mode === "all" && evalIds.size === 0) for (const evalId of knownEvalIds) evalIds.add(evalId);
3249
- return [...evalIds];
3250
- }
3251
- function getEvalStatusForRun(run, evalId, manualScoreKeys) {
3252
- const evalCases = run.cases.filter((caseRow) => caseRow.evalId === evalId);
3253
- if (evalCases.length > 0) {
3254
- if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
3255
- return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
3256
- }
3257
- return toLastRunStatus$1(deriveStatusFromChildStatuses({
3258
- statuses: [],
3259
- lifecycleStatus: run.manifest.status
3260
- }));
3261
- }
3262
- function hasPendingManualScores(caseRows, manualScoreKeys) {
3263
- if (manualScoreKeys.length === 0) return false;
3264
- return caseRows.some((caseRow) => manualScoreKeys.some((key) => {
3265
- const value = caseRow.columns[key];
3266
- return typeof value !== "number" || !Number.isFinite(value);
3267
- }));
3268
- }
3269
- function encodeCaseDetailFileName(caseId) {
3270
- return encodeURIComponent(caseId);
3271
- }
3272
- //#endregion
3273
3167
  //#region ../runner/src/runQueue.ts
3274
3168
  async function executeQueuedCases(params) {
3275
- const { runState, queuedCases, concurrency, globalTraceDisplay } = params;
3169
+ const { queuedCases, concurrency, globalTraceDisplay } = params;
3276
3170
  let nextCaseIndex = 0;
3277
3171
  let workerError = void 0;
3278
3172
  const workerCount = Math.min(concurrency, queuedCases.length);
3279
3173
  const workers = Array.from({ length: workerCount }, async () => {
3280
- while (!runState.abortController.signal.aborted && workerError === void 0) {
3174
+ while (workerError === void 0) {
3281
3175
  const queuedCase = queuedCases[nextCaseIndex];
3282
3176
  nextCaseIndex += 1;
3283
3177
  if (queuedCase === void 0) return;
3284
3178
  try {
3285
3179
  await executeQueuedCase({
3286
3180
  queuedCase,
3287
- runState,
3288
3181
  globalTraceDisplay
3289
3182
  });
3290
3183
  } catch (error) {
@@ -3298,11 +3191,10 @@ async function executeQueuedCases(params) {
3298
3191
  if (workerError !== void 0) throw new Error(typeof workerError === "string" ? workerError : typeof workerError === "number" || typeof workerError === "boolean" || typeof workerError === "bigint" ? String(workerError) : workerError === null ? "null" : "Unknown queue worker error");
3299
3192
  }
3300
3193
  async function executeQueuedCase(params) {
3301
- const { queuedCase, runState, globalTraceDisplay } = params;
3194
+ const { queuedCase, globalTraceDisplay } = params;
3302
3195
  const startTime = Date.now();
3303
3196
  const result = await queuedCase.execute({
3304
3197
  globalTraceDisplay,
3305
- signal: runState.abortController.signal,
3306
3198
  startTime
3307
3199
  });
3308
3200
  await queuedCase.onComplete(result);
@@ -3353,7 +3245,48 @@ function pickWinningTrial(params) {
3353
3245
  if (medianAttempt === void 0) throw new Error("Expected at least one trial attempt");
3354
3246
  return medianAttempt;
3355
3247
  }
3356
- async function executeRun({ runState, request, runDir, config, evals, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals }) {
3248
+ async function finalizePreparedCase(params) {
3249
+ const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
3250
+ if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
3251
+ preparedCase.finalized = true;
3252
+ const winningTrial = pickWinningTrial({
3253
+ strategy: runState.manifest.trialSelection,
3254
+ attempts: preparedCase.trialResults,
3255
+ scoreKeys: preparedEval.scoreKeys
3256
+ });
3257
+ if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
3258
+ runState.cases.push(winningTrial.caseRow);
3259
+ runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
3260
+ preparedEval.mergeColumns(winningTrial.caseDetail.columns);
3261
+ if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
3262
+ else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
3263
+ else runState.summary.failedCases++;
3264
+ await writeFile(join(runDir, "traces", `${preparedCase.caseId}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
3265
+ await persistCaseDetail(runDir, winningTrial.caseDetail);
3266
+ onCaseFinished?.(winningTrial.caseDetail, winningTrial.caseRow);
3267
+ emitEvent(runState, {
3268
+ type: "case.finished",
3269
+ runId: runState.manifest.id,
3270
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3271
+ payload: winningTrial.caseRow
3272
+ });
3273
+ preparedEval.evalCaseRows.push(winningTrial.caseRow);
3274
+ }
3275
+ function getPreparedCaseOrderKey(caseRow) {
3276
+ return `${caseRow.evalId}\u0000${caseRow.caseId}`;
3277
+ }
3278
+ function sortCaseRowsByPreparedOrder(caseRows, preparedEvals) {
3279
+ const orderByCase = /* @__PURE__ */ new Map();
3280
+ let order = 0;
3281
+ for (const preparedEval of preparedEvals) for (const preparedCase of preparedEval.preparedCases) {
3282
+ orderByCase.set(`${preparedEval.evalMeta.id}\u0000${preparedCase.caseId}`, order);
3283
+ order++;
3284
+ }
3285
+ caseRows.sort((left, right) => {
3286
+ return (orderByCase.get(getPreparedCaseOrderKey(left)) ?? Number.MAX_SAFE_INTEGER) - (orderByCase.get(getPreparedCaseOrderKey(right)) ?? Number.MAX_SAFE_INTEGER);
3287
+ });
3288
+ }
3289
+ async function executeRun({ runState, request, runDir, config, evals, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
3357
3290
  try {
3358
3291
  const targetEvals = getTargetEvals(request);
3359
3292
  emitEvent(runState, {
@@ -3362,14 +3295,16 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3362
3295
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3363
3296
  payload: runState.manifest
3364
3297
  });
3365
- const allCaseRows = [];
3366
3298
  const evalErrors = [];
3367
3299
  const queuedCases = [];
3368
3300
  const preparedEvals = [];
3369
3301
  const cacheMode = runState.manifest.cacheMode ?? "use";
3370
3302
  const cacheEnabled = config.cache?.enabled !== false;
3303
+ const moduleIsolation = {
3304
+ key: runState.manifest.id,
3305
+ workspaceRoot
3306
+ };
3371
3307
  for (const evalMeta of targetEvals) {
3372
- if (runState.abortController.signal.aborted) break;
3373
3308
  const evalFilePath = evalMeta.sourceFilePath;
3374
3309
  let codeFingerprint = "";
3375
3310
  try {
@@ -3381,7 +3316,9 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3381
3316
  else delete runState.manifest.evalSourceFingerprints[evalMeta.id];
3382
3317
  try {
3383
3318
  const registry = getEvalRegistry();
3384
- await loadEvalModule(evalFilePath, codeFingerprint);
3319
+ await runWithModuleIsolation(moduleIsolation, async () => {
3320
+ await loadEvalModule(evalFilePath, codeFingerprint);
3321
+ });
3385
3322
  const entry = registry.get(evalMeta.id);
3386
3323
  if (!entry) {
3387
3324
  evalErrors.push({
@@ -3390,74 +3327,87 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3390
3327
  });
3391
3328
  continue;
3392
3329
  }
3393
- await entry.use(async (evalDef) => {
3394
- const cases = filterEvalCases(resolveRunnableEvalCases({
3395
- cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
3396
- evalId: evalMeta.id
3397
- }), request.target.evalIds, request.target.caseIds, evalMeta.id);
3398
- runState.summary.totalCases += cases.length;
3399
- const accumulatedColumns = /* @__PURE__ */ new Map();
3400
- const evalCaseRows = [];
3401
- const preparedCases = [];
3402
- const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
3403
- const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
3404
- preparedEvals.push({
3405
- evalMeta,
3406
- accumulatedColumns,
3407
- evalCaseRows,
3408
- preparedCases,
3409
- scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
3410
- mergeColumns: (columns) => {
3411
- mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
3412
- }
3413
- });
3414
- for (const evalCase of cases) {
3415
- if (runState.abortController.signal.aborted) break;
3416
- const trialResults = [];
3417
- preparedCases.push({
3418
- caseId: evalCase.id,
3419
- trialResults
3420
- });
3421
- for (let trial = 0; trial < request.trials; trial++) {
3422
- const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
3423
- queuedCases.push({
3424
- execute: async ({ startTime, signal, globalTraceDisplay }) => {
3425
- const { caseDetail, caseRowUpdate } = await runCase({
3426
- evalDef,
3427
- evalId: evalMeta.id,
3428
- evalCase,
3429
- globalTraceDisplay,
3430
- trial,
3431
- signal,
3432
- startTime,
3433
- cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
3434
- cacheMode,
3435
- codeFingerprint,
3436
- artifactDir: join(runDir, "artifacts"),
3437
- runId: runState.manifest.id
3438
- });
3439
- return {
3440
- caseDetail,
3441
- caseRow: {
3442
- caseId: evalCase.id,
3330
+ await runWithModuleIsolation(moduleIsolation, async () => {
3331
+ await entry.use(async (evalDef) => {
3332
+ const cases = filterEvalCases(resolveRunnableEvalCases({
3333
+ cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
3334
+ evalId: evalMeta.id
3335
+ }), request.target.evalIds, request.target.caseIds, evalMeta.id);
3336
+ runState.summary.totalCases += cases.length;
3337
+ const accumulatedColumns = /* @__PURE__ */ new Map();
3338
+ const evalCaseRows = [];
3339
+ const preparedCases = [];
3340
+ const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
3341
+ const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
3342
+ const preparedEval = {
3343
+ evalMeta,
3344
+ accumulatedColumns,
3345
+ evalCaseRows,
3346
+ preparedCases,
3347
+ scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys]),
3348
+ mergeColumns: (columns) => {
3349
+ mergeColumnDefs(accumulatedColumns, columns, evalDef.columns, evalDef.scores, evalDef.manualScores);
3350
+ }
3351
+ };
3352
+ preparedEvals.push(preparedEval);
3353
+ for (const evalCase of cases) {
3354
+ const trialResults = [];
3355
+ const preparedCase = {
3356
+ caseId: evalCase.id,
3357
+ trialResults,
3358
+ finalized: false
3359
+ };
3360
+ preparedCases.push(preparedCase);
3361
+ for (let trial = 0; trial < request.trials; trial++) {
3362
+ const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
3363
+ queuedCases.push({
3364
+ execute: async ({ startTime, globalTraceDisplay }) => {
3365
+ const { caseDetail, caseRowUpdate } = await runCase({
3366
+ evalDef,
3443
3367
  evalId: evalMeta.id,
3444
- status: caseRowUpdate.status ?? "pending",
3445
- latencyMs: caseRowUpdate.latencyMs ?? null,
3446
- columns: caseRowUpdate.columns ?? {},
3447
- trial
3448
- }
3449
- };
3450
- },
3451
- onComplete: ({ caseDetail, caseRow }) => {
3452
- trialResults.push({
3453
- caseDetail,
3454
- caseRow,
3455
- bufferedCacheStore
3456
- });
3457
- }
3458
- });
3368
+ evalCase,
3369
+ globalTraceDisplay,
3370
+ trial,
3371
+ startTime,
3372
+ cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
3373
+ cacheMode,
3374
+ codeFingerprint,
3375
+ moduleIsolation,
3376
+ artifactDir: join(runDir, "artifacts"),
3377
+ runId: runState.manifest.id
3378
+ });
3379
+ return {
3380
+ caseDetail,
3381
+ caseRow: {
3382
+ caseId: evalCase.id,
3383
+ evalId: evalMeta.id,
3384
+ status: caseRowUpdate.status ?? "pending",
3385
+ latencyMs: caseRowUpdate.latencyMs ?? null,
3386
+ columns: caseRowUpdate.columns ?? {},
3387
+ trial
3388
+ }
3389
+ };
3390
+ },
3391
+ onComplete: async ({ caseDetail, caseRow }) => {
3392
+ trialResults.push({
3393
+ caseDetail,
3394
+ caseRow,
3395
+ bufferedCacheStore
3396
+ });
3397
+ if (trialResults.length !== request.trials) return;
3398
+ await finalizePreparedCase({
3399
+ runState,
3400
+ runDir,
3401
+ preparedEval,
3402
+ preparedCase,
3403
+ onCaseFinished,
3404
+ emitEvent
3405
+ });
3406
+ }
3407
+ });
3408
+ }
3459
3409
  }
3460
- }
3410
+ });
3461
3411
  });
3462
3412
  } catch (error) {
3463
3413
  console.error(`Error running eval ${evalMeta.id}:`, error);
@@ -3475,37 +3425,19 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3475
3425
  }
3476
3426
  }
3477
3427
  await executeQueuedCases({
3478
- runState,
3479
3428
  queuedCases,
3480
3429
  concurrency: getConfiguredConcurrency(),
3481
3430
  globalTraceDisplay: config.traceDisplay
3482
3431
  });
3483
3432
  for (const preparedEval of preparedEvals) {
3484
- for (const preparedCase of preparedEval.preparedCases) {
3485
- if (preparedCase.trialResults.length === 0) continue;
3486
- const winningTrial = pickWinningTrial({
3487
- strategy: runState.manifest.trialSelection,
3488
- attempts: preparedCase.trialResults,
3489
- scoreKeys: preparedEval.scoreKeys
3490
- });
3491
- if (winningTrial.bufferedCacheStore !== null) await winningTrial.bufferedCacheStore.commit();
3492
- runState.cases.push(winningTrial.caseRow);
3493
- runState.caseDetails.set(preparedCase.caseId, winningTrial.caseDetail);
3494
- preparedEval.mergeColumns(winningTrial.caseDetail.columns);
3495
- if (winningTrial.caseRow.status === "pass") runState.summary.passedCases++;
3496
- else if (winningTrial.caseRow.status === "error") runState.summary.errorCases++;
3497
- else runState.summary.failedCases++;
3498
- await writeFile(join(runDir, "traces", `${preparedCase.caseId}.json`), JSON.stringify(winningTrial.caseDetail.trace, null, 2));
3499
- await persistCaseDetail(runDir, winningTrial.caseDetail);
3500
- emitEvent(runState, {
3501
- type: "case.finished",
3502
- runId: runState.manifest.id,
3503
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3504
- payload: winningTrial.caseRow
3505
- });
3506
- preparedEval.evalCaseRows.push(winningTrial.caseRow);
3507
- allCaseRows.push(winningTrial.caseRow);
3508
- }
3433
+ for (const preparedCase of preparedEval.preparedCases) await finalizePreparedCase({
3434
+ runState,
3435
+ runDir,
3436
+ preparedEval,
3437
+ preparedCase,
3438
+ onCaseFinished,
3439
+ emitEvent
3440
+ });
3509
3441
  preparedEval.evalMeta.columnDefs = [...preparedEval.accumulatedColumns.values()];
3510
3442
  lastRunStatusMap.set(preparedEval.evalMeta.id, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
3511
3443
  const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.id) ?? null;
@@ -3516,9 +3448,11 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3516
3448
  evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.id] ?? null
3517
3449
  });
3518
3450
  }
3451
+ sortCaseRowsByPreparedOrder(runState.cases, preparedEvals);
3452
+ for (const preparedEval of preparedEvals) sortCaseRowsByPreparedOrder(preparedEval.evalCaseRows, preparedEvals);
3519
3453
  const endTime = /* @__PURE__ */ new Date();
3520
3454
  runState.summary.totalDurationMs = endTime.getTime() - new Date(runState.manifest.startedAt).getTime();
3521
- const finalStatus = runState.abortController.signal.aborted ? "cancelled" : evalErrors.length > 0 ? "error" : "completed";
3455
+ const finalStatus = evalErrors.length > 0 ? "error" : "completed";
3522
3456
  runState.summary.status = finalStatus;
3523
3457
  runState.manifest.status = finalStatus;
3524
3458
  const completedRunAt = endTime.toISOString();
@@ -3540,6 +3474,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3540
3474
  evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalId] ?? null
3541
3475
  });
3542
3476
  }
3477
+ await persistRunState(runState);
3543
3478
  emitEvent(runState, {
3544
3479
  type: "run.summary",
3545
3480
  runId: runState.manifest.id,
@@ -3558,7 +3493,6 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3558
3493
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3559
3494
  payload: runState.summary
3560
3495
  });
3561
- await persistRunState(runState);
3562
3496
  emitDiscoveryEvent();
3563
3497
  } catch (error) {
3564
3498
  const message = error instanceof Error ? error.message : String(error);
@@ -3566,13 +3500,13 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
3566
3500
  runState.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
3567
3501
  runState.summary.status = "error";
3568
3502
  runState.summary.errorMessage = message;
3503
+ await persistRunState(runState);
3569
3504
  emitEvent(runState, {
3570
3505
  type: "run.error",
3571
3506
  runId: runState.manifest.id,
3572
3507
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3573
3508
  payload: { message }
3574
3509
  });
3575
- await persistRunState(runState);
3576
3510
  emitDiscoveryEvent();
3577
3511
  }
3578
3512
  }
@@ -3580,896 +3514,4 @@ function toLastRunStatus(status) {
3580
3514
  return status === "pending" ? null : status;
3581
3515
  }
3582
3516
  //#endregion
3583
- //#region ../runner/src/runner.ts
3584
- const globMagicCharacters = new Set([
3585
- "*",
3586
- "?",
3587
- "[",
3588
- "]",
3589
- "{",
3590
- "}",
3591
- "(",
3592
- ")",
3593
- "!",
3594
- "+",
3595
- "@"
3596
- ]);
3597
- function hasGlobMagic(value) {
3598
- for (const char of value) if (globMagicCharacters.has(char)) return true;
3599
- return false;
3600
- }
3601
- function getWatchRootForIncludePattern(params) {
3602
- const segments = params.pattern.replaceAll("\\", "/").split("/").filter((part) => part !== "");
3603
- const firstGlobSegmentIndex = segments.findIndex(hasGlobMagic);
3604
- if (firstGlobSegmentIndex === -1) return dirname(resolve(params.workspaceRoot, params.pattern));
3605
- if (firstGlobSegmentIndex === 0) return params.workspaceRoot;
3606
- return resolve(params.workspaceRoot, segments.slice(0, firstGlobSegmentIndex).join("/"));
3607
- }
3608
- function getWatchRootsForIncludePatterns(params) {
3609
- const roots = /* @__PURE__ */ new Set();
3610
- for (const pattern of params.patterns) roots.add(getWatchRootForIncludePattern({
3611
- pattern,
3612
- workspaceRoot: params.workspaceRoot
3613
- }));
3614
- if (roots.size === 0) return [params.workspaceRoot];
3615
- return [...roots];
3616
- }
3617
- /** Create an in-memory eval runner bound to the current workspace config. */
3618
- function createRunner({ watchForChanges = true } = {}) {
3619
- let config;
3620
- let workspaceRoot;
3621
- let localStateDir;
3622
- let cacheStore;
3623
- const evals = /* @__PURE__ */ new Map();
3624
- const runs = /* @__PURE__ */ new Map();
3625
- const lastRunStatusMap = /* @__PURE__ */ new Map();
3626
- const latestRunInfoMap = /* @__PURE__ */ new Map();
3627
- const discoveryListeners = /* @__PURE__ */ new Set();
3628
- let nextShortIdNum = 0;
3629
- let discoveryWatcher;
3630
- let discoveryRefreshTimer;
3631
- function toWorkspaceRelativePath(filePath) {
3632
- return relative(workspaceRoot, filePath).replaceAll("\\", "/");
3633
- }
3634
- function getSortedEvalMetas() {
3635
- return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
3636
- }
3637
- function getSourceFingerprint(source) {
3638
- return createHash("sha256").update(source).digest("hex");
3639
- }
3640
- function getConfiguredConcurrency() {
3641
- const configuredConcurrency = config.concurrency;
3642
- if (typeof configuredConcurrency !== "number" || !Number.isFinite(configuredConcurrency)) return 1;
3643
- return Math.max(1, Math.floor(configuredConcurrency));
3644
- }
3645
- const runner = {
3646
- async init() {
3647
- config = await loadConfig();
3648
- workspaceRoot = config.workspaceRoot ?? process.cwd();
3649
- localStateDir = resolve(workspaceRoot, ".agent-evals");
3650
- await mkdir(localStateDir, { recursive: true });
3651
- await mkdir(join(localStateDir, "runs"), { recursive: true });
3652
- cacheStore = createFsCacheStore({
3653
- workspaceRoot,
3654
- dir: config.cache?.dir,
3655
- maxEntriesPerEval: config.cache?.maxEntriesPerEval
3656
- });
3657
- await loadPersistedRuns();
3658
- await runner.refreshDiscovery();
3659
- if (watchForChanges) await setupWatcher();
3660
- },
3661
- async listCache() {
3662
- return cacheStore.list();
3663
- },
3664
- async clearCache(filter) {
3665
- await cacheStore.clear(filter);
3666
- },
3667
- async recomputeStatusesForEval(evalId) {
3668
- const evalMeta = evals.get(evalId);
3669
- if (!evalMeta) return { updatedRuns: 0 };
3670
- const registry = getEvalRegistry();
3671
- await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
3672
- const entry = registry.get(evalId);
3673
- if (!entry) return { updatedRuns: 0 };
3674
- const scoreThresholds = /* @__PURE__ */ new Map();
3675
- entry.use((evalDef) => {
3676
- for (const [key, def] of Object.entries(evalDef.scores ?? {})) {
3677
- const threshold = normalizeScoreDef(def).passThreshold;
3678
- if (threshold !== void 0) scoreThresholds.set(key, threshold);
3679
- }
3680
- for (const [key, def] of Object.entries(evalDef.manualScores ?? {})) if (def.passThreshold !== void 0) scoreThresholds.set(key, def.passThreshold);
3681
- });
3682
- const updatedRuns = await recomputeEvalStatusesInRuns({
3683
- runs: runs.values(),
3684
- evalId,
3685
- evalExists: evals.has(evalId),
3686
- scoreThresholds,
3687
- persistCaseDetail
3688
- });
3689
- emitDiscoveryEvent();
3690
- return { updatedRuns };
3691
- },
3692
- async cleanRunsForEval(evalId) {
3693
- let deletedRuns = 0;
3694
- for (const [runId, run] of [...runs]) {
3695
- if (!runTouchesEval({
3696
- target: run.manifest.target,
3697
- caseRows: run.cases,
3698
- evalId,
3699
- evalExists: evals.has(evalId)
3700
- })) continue;
3701
- if (run.manifest.status === "running") continue;
3702
- runs.delete(runId);
3703
- await rm(run.runDir, {
3704
- recursive: true,
3705
- force: true
3706
- });
3707
- deletedRuns += 1;
3708
- }
3709
- emitDiscoveryEvent();
3710
- return { deletedRuns };
3711
- },
3712
- async updateManualScore({ runId, caseId, scoreKey, value }) {
3713
- const run = runs.get(runId);
3714
- if (!run) return {
3715
- updated: false,
3716
- reason: "Run not found"
3717
- };
3718
- if (run.manifest.status === "running") return {
3719
- updated: false,
3720
- reason: "Run is still running"
3721
- };
3722
- const caseRow = run.cases.find((row) => row.caseId === caseId);
3723
- if (!caseRow) return {
3724
- updated: false,
3725
- reason: "Case not found"
3726
- };
3727
- const evalMeta = evals.get(caseRow.evalId);
3728
- if (!evalMeta) return {
3729
- updated: false,
3730
- reason: "Eval not found"
3731
- };
3732
- if (evalMeta.columnDefs.find((def) => def.key === scoreKey)?.isManualScore !== true) return {
3733
- updated: false,
3734
- reason: "Manual score not found"
3735
- };
3736
- const caseDetail = run.caseDetails.get(caseId);
3737
- if (!caseDetail) return {
3738
- updated: false,
3739
- reason: "Case detail not found"
3740
- };
3741
- caseRow.columns[scoreKey] = value;
3742
- caseDetail.columns[scoreKey] = value;
3743
- const scoreThresholds = /* @__PURE__ */ new Map();
3744
- for (const def of evalMeta.columnDefs) {
3745
- if (def.isScore !== true || def.passThreshold === void 0) continue;
3746
- scoreThresholds.set(def.key, def.passThreshold);
3747
- }
3748
- const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds);
3749
- caseRow.status = nextStatus;
3750
- caseDetail.status = nextStatus;
3751
- const derivedSummary = deriveScopedSummaryFromCases({ caseRows: run.cases });
3752
- run.summary.totalCases = derivedSummary.totalCases;
3753
- run.summary.passedCases = derivedSummary.passedCases;
3754
- run.summary.failedCases = derivedSummary.failedCases;
3755
- run.summary.errorCases = derivedSummary.errorCases;
3756
- run.summary.cancelledCases = derivedSummary.cancelledCases;
3757
- run.summary.totalDurationMs = derivedSummary.totalDurationMs;
3758
- await persistCaseDetail(run.runDir, caseDetail);
3759
- await persistRunState(run);
3760
- emitDiscoveryEvent();
3761
- return {
3762
- updated: true,
3763
- run: {
3764
- manifest: run.manifest,
3765
- summary: run.summary,
3766
- cases: run.cases
3767
- },
3768
- caseDetail
3769
- };
3770
- },
3771
- async deleteRun(runId) {
3772
- const run = runs.get(runId);
3773
- if (!run) return { deleted: false };
3774
- if (run.manifest.status === "running") return { deleted: false };
3775
- runs.delete(runId);
3776
- await rm(run.runDir, {
3777
- recursive: true,
3778
- force: true
3779
- });
3780
- emitDiscoveryEvent();
3781
- return { deleted: true };
3782
- },
3783
- getEvals() {
3784
- const gitState = readGitWorktreeState(workspaceRoot);
3785
- const result = [];
3786
- for (const meta of getSortedEvalMetas()) result.push(buildEvalSummary({
3787
- meta,
3788
- config,
3789
- gitState,
3790
- latestRun: latestRunInfoMap.get(meta.id),
3791
- lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
3792
- }));
3793
- return result;
3794
- },
3795
- getEval(id) {
3796
- const meta = evals.get(id);
3797
- if (!meta) return void 0;
3798
- return buildEvalSummary({
3799
- meta,
3800
- config,
3801
- gitState: readGitWorktreeState(workspaceRoot),
3802
- latestRun: latestRunInfoMap.get(meta.id),
3803
- lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
3804
- });
3805
- },
3806
- async refreshDiscovery() {
3807
- const patterns = config.include;
3808
- const discovered = [];
3809
- for (const pattern of patterns) {
3810
- const files = await glob(pattern, {
3811
- cwd: workspaceRoot,
3812
- absolute: true
3813
- });
3814
- discovered.push(...files);
3815
- }
3816
- evals.clear();
3817
- for (const filePath of discovered) try {
3818
- const content = await readFile(filePath, "utf-8");
3819
- const discoveredMetas = parseEvalMetas(filePath, content);
3820
- const sourceFingerprint = getSourceFingerprint(content);
3821
- const registry = getEvalRegistry();
3822
- try {
3823
- await loadEvalModule(filePath, sourceFingerprint);
3824
- } catch {}
3825
- for (const meta of discoveredMetas) {
3826
- const discoveredEntry = registry.get(meta.id);
3827
- const title = meta.title;
3828
- let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
3829
- let stats;
3830
- let charts;
3831
- discoveredEntry?.use((evalDef) => {
3832
- columnDefs = buildDeclaredColumnDefs(evalDef.columns, evalDef.scores, evalDef.manualScores);
3833
- stats = evalDef.stats;
3834
- const validated = validateCharts({
3835
- charts: evalDef.charts,
3836
- columnDefs,
3837
- evalId: meta.id
3838
- });
3839
- for (const warning of validated.warnings) console.warn(warning);
3840
- charts = validated.charts;
3841
- });
3842
- evals.set(meta.id, {
3843
- id: meta.id,
3844
- title,
3845
- filePath: toWorkspaceRelativePath(meta.filePath),
3846
- sourceFilePath: meta.filePath,
3847
- sourceFingerprint,
3848
- columnDefs,
3849
- caseCount: null,
3850
- stats,
3851
- charts
3852
- });
3853
- }
3854
- } catch {}
3855
- emitDiscoveryEvent();
3856
- },
3857
- async startRun(request) {
3858
- const runId = generateRunId();
3859
- const shortId = `r${String(nextShortIdNum++)}`;
3860
- const now = (/* @__PURE__ */ new Date()).toISOString();
3861
- const cacheMode = request.cache?.mode ?? "use";
3862
- const runDir = join(localStateDir, "runs", runId);
3863
- const manifest = {
3864
- id: runId,
3865
- shortId,
3866
- status: "running",
3867
- startedAt: now,
3868
- endedAt: null,
3869
- commitSha: readGitWorktreeState(workspaceRoot).commitSha,
3870
- evalSourceFingerprints: {},
3871
- target: request.target,
3872
- trials: request.trials,
3873
- trialSelection: config.trialSelection ?? "lowestScore",
3874
- cacheMode
3875
- };
3876
- const summary = {
3877
- runId,
3878
- status: "running",
3879
- totalCases: 0,
3880
- passedCases: 0,
3881
- failedCases: 0,
3882
- errorCases: 0,
3883
- cancelledCases: 0,
3884
- totalDurationMs: null,
3885
- errorMessage: null
3886
- };
3887
- const abortController = new AbortController();
3888
- const runState = {
3889
- runDir,
3890
- manifest,
3891
- summary,
3892
- cases: [],
3893
- caseDetails: /* @__PURE__ */ new Map(),
3894
- listeners: /* @__PURE__ */ new Set(),
3895
- abortController
3896
- };
3897
- runs.set(runId, runState);
3898
- setLatestRunInfoMap({
3899
- latestRunInfoMap,
3900
- evalIds: getTargetEvalIds({
3901
- request,
3902
- sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
3903
- knownEvalIds: new Set(evals.keys())
3904
- }),
3905
- info: {
3906
- status: "running",
3907
- startedAt: now,
3908
- commitSha: manifest.commitSha ?? null,
3909
- evalSourceFingerprint: null
3910
- }
3911
- });
3912
- await mkdir(runDir, { recursive: true });
3913
- await mkdir(join(runDir, "traces"), { recursive: true });
3914
- await mkdir(join(runDir, "artifacts"), { recursive: true });
3915
- await mkdir(join(runDir, "case-details"), { recursive: true });
3916
- await writeFile(join(runDir, "run.json"), JSON.stringify(manifest, null, 2));
3917
- executeRun({
3918
- runState,
3919
- request,
3920
- runDir,
3921
- config,
3922
- evals,
3923
- cacheStore,
3924
- lastRunStatusMap,
3925
- latestRunInfoMap,
3926
- emitEvent,
3927
- emitDiscoveryEvent,
3928
- getSourceFingerprint,
3929
- getConfiguredConcurrency,
3930
- getSortedEvalMetas,
3931
- getTargetEvals
3932
- });
3933
- return {
3934
- manifest,
3935
- summary,
3936
- cases: []
3937
- };
3938
- },
3939
- getRuns() {
3940
- return [...runs.values()].map((r) => r.manifest);
3941
- },
3942
- getRun(id) {
3943
- const run = runs.get(id);
3944
- if (!run) return void 0;
3945
- return {
3946
- manifest: run.manifest,
3947
- summary: run.summary,
3948
- cases: run.cases
3949
- };
3950
- },
3951
- cancelRun(id) {
3952
- const run = runs.get(id);
3953
- if (!run) return;
3954
- run.abortController.abort();
3955
- run.manifest.status = "cancelled";
3956
- run.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
3957
- run.summary.status = "cancelled";
3958
- emitEvent(run, {
3959
- type: "run.cancelled",
3960
- runId: id,
3961
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3962
- payload: run.summary
3963
- });
3964
- },
3965
- getCaseDetail(runId, caseId) {
3966
- const run = runs.get(runId);
3967
- if (!run) return void 0;
3968
- return run.caseDetails.get(caseId);
3969
- },
3970
- subscribe(runId, listener) {
3971
- const run = runs.get(runId);
3972
- if (!run) return () => {};
3973
- run.listeners.add(listener);
3974
- return () => {
3975
- run.listeners.delete(listener);
3976
- };
3977
- },
3978
- subscribeDiscovery(listener) {
3979
- discoveryListeners.add(listener);
3980
- return () => {
3981
- discoveryListeners.delete(listener);
3982
- };
3983
- },
3984
- async close() {
3985
- if (discoveryRefreshTimer !== void 0) {
3986
- clearTimeout(discoveryRefreshTimer);
3987
- discoveryRefreshTimer = void 0;
3988
- }
3989
- const watcher = discoveryWatcher;
3990
- if (watcher === void 0) return;
3991
- discoveryWatcher = void 0;
3992
- await watcher.close();
3993
- },
3994
- getWorkspaceRoot() {
3995
- return workspaceRoot;
3996
- },
3997
- getArtifactPath(artifactId_) {
3998
- return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
3999
- }
4000
- };
4001
- async function setupWatcher() {
4002
- const watcher = watch(getWatchRootsForIncludePatterns({
4003
- patterns: config.include,
4004
- workspaceRoot
4005
- }), {
4006
- ignoreInitial: true,
4007
- persistent: true
4008
- });
4009
- discoveryWatcher = watcher;
4010
- const scheduleRefresh = () => {
4011
- if (discoveryRefreshTimer !== void 0) clearTimeout(discoveryRefreshTimer);
4012
- discoveryRefreshTimer = setTimeout(() => {
4013
- discoveryRefreshTimer = void 0;
4014
- runner.refreshDiscovery();
4015
- }, 50);
4016
- };
4017
- watcher.on("change", scheduleRefresh);
4018
- watcher.on("add", scheduleRefresh);
4019
- watcher.on("unlink", scheduleRefresh);
4020
- watcher.on("addDir", scheduleRefresh);
4021
- watcher.on("unlinkDir", scheduleRefresh);
4022
- await new Promise((ready) => {
4023
- watcher.once("ready", ready);
4024
- });
4025
- }
4026
- function emitDiscoveryEvent() {
4027
- const lastRunStatuses = getLastRunStatuses({
4028
- runs: runs.values(),
4029
- knownEvals: evals.values()
4030
- });
4031
- const latestRunInfos = getLatestRunInfos({
4032
- runs: runs.values(),
4033
- knownEvals: evals.values()
4034
- });
4035
- lastRunStatusMap.clear();
4036
- for (const [evalId, status] of lastRunStatuses) lastRunStatusMap.set(evalId, status);
4037
- latestRunInfoMap.clear();
4038
- for (const [evalId, info] of latestRunInfos) latestRunInfoMap.set(evalId, info);
4039
- const event = {
4040
- type: "discovery.updated",
4041
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4042
- payload: runner.getEvals()
4043
- };
4044
- for (const listener of discoveryListeners) listener(event);
4045
- }
4046
- function getTargetEvals(request) {
4047
- if (request.target.evalIds && request.target.evalIds.length > 0) return request.target.evalIds.map((id) => evals.get(id)).filter((e) => e !== void 0);
4048
- return getSortedEvalMetas();
4049
- }
4050
- function emitEvent(runState, event) {
4051
- for (const listener of runState.listeners) try {
4052
- listener(event);
4053
- } catch {}
4054
- }
4055
- async function loadPersistedRuns() {
4056
- runs.clear();
4057
- const persistedRuns = await loadPersistedRunSnapshots(localStateDir);
4058
- nextShortIdNum = nextShortIdFromSnapshots(persistedRuns);
4059
- for (const persistedRun of persistedRuns) runs.set(persistedRun.manifest.id, {
4060
- ...persistedRun,
4061
- listeners: /* @__PURE__ */ new Set(),
4062
- abortController: new AbortController()
4063
- });
4064
- }
4065
- return runner;
4066
- }
4067
- //#endregion
4068
- //#region src/cli.ts
4069
- function parseArgs(argv) {
4070
- const normalizedArgv = argv.filter((arg) => arg !== "--no-env");
4071
- const args = {
4072
- command: "help",
4073
- subcommand: void 0,
4074
- showHelp: false,
4075
- helpTopic: "global",
4076
- unknownHelpTarget: void 0,
4077
- evalIds: [],
4078
- caseIds: [],
4079
- trials: 1,
4080
- json: false,
4081
- port: 4100,
4082
- cacheMode: "use",
4083
- clearCache: false,
4084
- all: false,
4085
- loadEnv: normalizedArgv.length === argv.length
4086
- };
4087
- const command = normalizedArgv[0];
4088
- if (command === "--help" || command === "-h") {
4089
- args.showHelp = true;
4090
- return args;
4091
- }
4092
- if (isCliCommand(command)) {
4093
- args.command = command;
4094
- args.helpTopic = command === "help" ? "global" : command;
4095
- } else if (command !== void 0 && !command.startsWith("-")) args.unknownHelpTarget = command;
4096
- let cursor = 1;
4097
- if (args.command === "cache") {
4098
- const sub = normalizedArgv[cursor];
4099
- if (sub === "list" || sub === "clear") {
4100
- args.subcommand = sub;
4101
- args.helpTopic = `cache ${sub}`;
4102
- cursor++;
4103
- } else if (sub !== void 0 && !sub.startsWith("-")) args.unknownHelpTarget = `cache ${sub}`;
4104
- }
4105
- for (let i = cursor; i < normalizedArgv.length; i++) {
4106
- const arg = normalizedArgv[i];
4107
- const next = normalizedArgv[i + 1];
4108
- if (arg === "--help" || arg === "-h") args.showHelp = true;
4109
- else if (arg === "--eval" && next) {
4110
- args.evalIds.push(...next.split(","));
4111
- i++;
4112
- } else if (arg === "--case" && next) {
4113
- args.caseIds.push(...next.split(","));
4114
- i++;
4115
- } else if (arg === "--trials" && next) {
4116
- args.trials = Number(next);
4117
- i++;
4118
- } else if (arg === "--json") args.json = true;
4119
- else if (arg === "--port" && next) {
4120
- args.port = Number(next);
4121
- i++;
4122
- } else if (arg === "--cache" && next) {
4123
- if (next === "use" || next === "bypass" || next === "refresh") args.cacheMode = next;
4124
- i++;
4125
- } else if (arg === "--no-cache") args.cacheMode = "bypass";
4126
- else if (arg === "--refresh-cache") args.cacheMode = "refresh";
4127
- else if (arg === "--clear-cache") args.clearCache = true;
4128
- else if (arg === "--all") args.all = true;
4129
- }
4130
- return args;
4131
- }
4132
- /**
4133
- * Run the Agent Evals CLI against the current workspace.
4134
- *
4135
- * @param argv Raw command-line arguments excluding the executable name.
4136
- */
4137
- async function runCli(argv) {
4138
- const args = parseArgs(argv);
4139
- if (args.loadEnv && !loadWorkspaceEnv()) {
4140
- process.exit(1);
4141
- return;
4142
- }
4143
- if (args.showHelp) {
4144
- if (args.unknownHelpTarget !== void 0) {
4145
- console.error(`No help found for "${args.unknownHelpTarget}".`);
4146
- process.exit(1);
4147
- return;
4148
- }
4149
- printHelp(args.helpTopic);
4150
- return;
4151
- }
4152
- switch (args.command) {
4153
- case "app":
4154
- await commandApp(args);
4155
- break;
4156
- case "list":
4157
- await commandList(args);
4158
- break;
4159
- case "run":
4160
- await commandRun(args);
4161
- break;
4162
- case "cache":
4163
- await commandCache(args);
4164
- break;
4165
- default:
4166
- printHelp(args.helpTopic);
4167
- break;
4168
- }
4169
- }
4170
- function isCliCommand(command) {
4171
- return command === "app" || command === "list" || command === "run" || command === "cache" || command === "help";
4172
- }
4173
- function loadWorkspaceEnv() {
4174
- const envPath = resolve(process.cwd(), ".env");
4175
- if (!existsSync(envPath)) return true;
4176
- const loadResult = resultify(() => {
4177
- process.loadEnvFile(envPath);
4178
- });
4179
- if (loadResult.error) {
4180
- console.error(`Failed to load .env at ${envPath}: ${loadResult.error.message}`);
4181
- return false;
4182
- }
4183
- return true;
4184
- }
4185
- const currentDir = dirname(fileURLToPath(import.meta.url));
4186
- const repoRoot = resolve(currentDir, "../../..");
4187
- const pnpmCommand = process.platform === "win32" ? "pnpm.cmd" : "pnpm";
4188
- function hasRepoWebWorkspace() {
4189
- return existsSync(resolve(repoRoot, "apps/web/package.json"));
4190
- }
4191
- async function ensureWebUiIsBuilt() {
4192
- if (!hasRepoWebWorkspace()) return;
4193
- console.info("Preparing web UI...");
4194
- await new Promise((resolvePromise, rejectPromise) => {
4195
- const child = spawn(pnpmCommand, [
4196
- "--filter",
4197
- "@agent-evals/web",
4198
- "build"
4199
- ], {
4200
- cwd: repoRoot,
4201
- stdio: "inherit"
4202
- });
4203
- child.once("error", (error) => {
4204
- rejectPromise(error);
4205
- });
4206
- child.once("exit", (code, signal) => {
4207
- if (signal) {
4208
- rejectPromise(/* @__PURE__ */ new Error(`Web UI build stopped with signal ${signal}.`));
4209
- return;
4210
- }
4211
- if (code !== 0) {
4212
- rejectPromise(/* @__PURE__ */ new Error(`Web UI build failed with exit code ${String(code)}.`));
4213
- return;
4214
- }
4215
- resolvePromise();
4216
- });
4217
- });
4218
- }
4219
- function isHonoAppModule(mod) {
4220
- if (typeof mod !== "object" || mod === null || !("app" in mod)) return false;
4221
- const { app } = mod;
4222
- return typeof app === "object" && app !== null && "fetch" in app && typeof app.fetch === "function";
4223
- }
4224
- function isServerRunnerModule(mod) {
4225
- if (typeof mod !== "object" || mod === null || !("initRunner" in mod)) return false;
4226
- return typeof mod.initRunner === "function";
4227
- }
4228
- async function commandApp(args) {
4229
- await ensureWebUiIsBuilt();
4230
- const { serve } = await import("@hono/node-server");
4231
- const bundledWebDist = resolve(currentDir, "apps/web/dist");
4232
- if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
4233
- const appModule = await import("./app-7qDBq_ub.mjs");
4234
- const runnerModule = await import("./runner-uzzY8kk1.mjs");
4235
- if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
4236
- if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
4237
- await runnerModule.initRunner();
4238
- console.info(`Agent Evals app: http://localhost:${String(args.port)}`);
4239
- serve({
4240
- fetch: appModule.app.fetch,
4241
- port: args.port
4242
- });
4243
- }
4244
- async function commandList(args_) {
4245
- const runner = createRunner({ watchForChanges: false });
4246
- await runner.init();
4247
- const evals = runner.getEvals();
4248
- if (evals.length === 0) {
4249
- console.info("No eval files found.");
4250
- return;
4251
- }
4252
- console.info("Discovered evals:\n");
4253
- for (const ev of evals) {
4254
- const displayStatus = getEvalDisplayStatus({
4255
- freshnessStatus: ev.freshnessStatus,
4256
- stale: ev.stale,
4257
- outdated: ev.outdated,
4258
- lastRunStatus: ev.lastRunStatus
4259
- });
4260
- const title = getEvalTitle(ev);
4261
- console.info(` ${title}`);
4262
- console.info(` id: ${ev.id}`);
4263
- console.info(` file: ${ev.filePath}`);
4264
- if (displayStatus !== "pending") console.info(` status: ${displayStatus}`);
4265
- if (ev.caseCount !== null) console.info(` cases: ${String(ev.caseCount)}`);
4266
- console.info("");
4267
- }
4268
- }
4269
- async function commandRun(args) {
4270
- const runner = createRunner({ watchForChanges: false });
4271
- await runner.init();
4272
- if (args.clearCache) {
4273
- await runner.clearCache();
4274
- if (!args.json) {
4275
- console.info("Cleared cache before run.");
4276
- console.info("");
4277
- }
4278
- }
4279
- const target = args.caseIds.length > 0 ? {
4280
- mode: "caseIds",
4281
- caseIds: args.caseIds,
4282
- evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
4283
- } : args.evalIds.length > 0 ? {
4284
- mode: "evalIds",
4285
- evalIds: args.evalIds
4286
- } : { mode: "all" };
4287
- const run = await runner.startRun({
4288
- target,
4289
- trials: args.trials,
4290
- cache: { mode: args.cacheMode }
4291
- });
4292
- if (!args.json) {
4293
- console.info(`Run started: ${run.manifest.id}`);
4294
- console.info(`Trials: ${String(args.trials)}`);
4295
- if (args.cacheMode !== "use") console.info(`Cache mode: ${args.cacheMode}`);
4296
- console.info("");
4297
- }
4298
- await waitForRunCompletion(runner, run.manifest.id);
4299
- const finalRun = runner.getRun(run.manifest.id);
4300
- if (!finalRun) {
4301
- process.exit(1);
4302
- return;
4303
- }
4304
- const { summary } = finalRun;
4305
- if (args.json) console.info(JSON.stringify(summary, null, 2));
4306
- else {
4307
- console.info("--- Run Summary ---");
4308
- console.info(`Status: ${summary.status}`);
4309
- console.info(`Total: ${String(summary.totalCases)}`);
4310
- console.info(`Passed: ${String(summary.passedCases)}`);
4311
- console.info(`Failed: ${String(summary.failedCases)}`);
4312
- console.info(`Errors: ${String(summary.errorCases)}`);
4313
- if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
4314
- if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
4315
- }
4316
- if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
4317
- }
4318
- async function commandCache(args) {
4319
- const runner = createRunner({ watchForChanges: false });
4320
- await runner.init();
4321
- if (args.subcommand === "list" || args.subcommand === void 0) {
4322
- const entries = await runner.listCache();
4323
- if (args.json) {
4324
- console.info(JSON.stringify(entries, null, 2));
4325
- return;
4326
- }
4327
- if (entries.length === 0) {
4328
- console.info("No cache entries.");
4329
- return;
4330
- }
4331
- console.info(`Cache entries (${String(entries.length)}):\n`);
4332
- for (const entry of entries) {
4333
- console.info(` ${entry.namespace}`);
4334
- console.info(` key: ${entry.key}`);
4335
- const operationLabel = entry.operationType === "span" ? `${entry.operationName} (span ${entry.spanKind ?? "unknown"})` : `${entry.operationName} (value)`;
4336
- console.info(` operation: ${operationLabel}`);
4337
- console.info(` stored: ${entry.storedAt}`);
4338
- console.info(` size: ${String(entry.sizeBytes)} bytes`);
4339
- console.info("");
4340
- }
4341
- return;
4342
- }
4343
- if (args.subcommand === "clear") {
4344
- if (args.evalIds.length > 0) {
4345
- for (const evalId of args.evalIds) {
4346
- const entries = await runner.listCache();
4347
- const prefix = `${evalId}__`;
4348
- const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
4349
- for (const entry of matching) await runner.clearCache({
4350
- namespace: entry.namespace,
4351
- key: entry.key
4352
- });
4353
- }
4354
- console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
4355
- return;
4356
- }
4357
- if (args.all) {
4358
- await runner.clearCache();
4359
- console.info("Cleared all cache entries.");
4360
- return;
4361
- }
4362
- console.info("Refusing to clear cache without --eval <id> or --all. Use one of these flags to confirm.");
4363
- process.exit(1);
4364
- return;
4365
- }
4366
- printHelp(args.helpTopic);
4367
- }
4368
- async function waitForRunCompletion(runner, runId) {
4369
- return new Promise((resolvePromise) => {
4370
- const check = () => {
4371
- const run = runner.getRun(runId);
4372
- if (!run || run.manifest.status === "completed" || run.manifest.status === "cancelled" || run.manifest.status === "error") {
4373
- resolvePromise();
4374
- return;
4375
- }
4376
- setTimeout(check, 200);
4377
- };
4378
- check();
4379
- });
4380
- }
4381
- function printHelp(topic = "global") {
4382
- if (topic === "app") {
4383
- console.info(`
4384
- agent-evals app - Start server with UI
4385
-
4386
- Usage:
4387
- agent-evals app [flags]
4388
-
4389
- Flags:
4390
- --port <n> Server port (default: 4100)
4391
- --no-env Disable automatic .env loading
4392
- --help, -h Show this help
4393
- `);
4394
- return;
4395
- }
4396
- if (topic === "list") {
4397
- console.info(`
4398
- agent-evals list - List discovered evals
4399
-
4400
- Usage:
4401
- agent-evals list [flags]
4402
-
4403
- Flags:
4404
- --no-env Disable automatic .env loading
4405
- --help, -h Show this help
4406
- `);
4407
- return;
4408
- }
4409
- if (topic === "run") {
4410
- console.info(`
4411
- agent-evals run - Run evals
4412
-
4413
- Usage:
4414
- agent-evals run [flags]
4415
-
4416
- Flags:
4417
- --eval <id> Run specific eval(s) (comma-separated)
4418
- --case <id> Run specific case(s) (comma-separated)
4419
- --trials <n> Number of trials per case
4420
- --json Output run summary as JSON
4421
- --cache <use|bypass|refresh> Cache mode for this run (default: use)
4422
- --no-cache Shortcut for --cache bypass
4423
- --refresh-cache Shortcut for --cache refresh
4424
- --clear-cache Clear the cache before starting the run
4425
- --no-env Disable automatic .env loading
4426
- --help, -h Show this help
4427
- `);
4428
- return;
4429
- }
4430
- if (topic === "cache" || topic === "cache list" || topic === "cache clear") {
4431
- console.info(`
4432
- agent-evals cache - Manage cached operation entries
4433
-
4434
- Usage:
4435
- agent-evals cache list [flags]
4436
- agent-evals cache clear --eval <id>
4437
- agent-evals cache clear --all
4438
-
4439
- Flags:
4440
- --eval <id> Clear entries for specific eval(s) (comma-separated)
4441
- --all Confirm clearing every cached entry
4442
- --json Output cache listing as JSON
4443
- --no-env Disable automatic .env loading
4444
- --help, -h Show this help
4445
- `);
4446
- return;
4447
- }
4448
- console.info(`
4449
- agent-evals - LLM/Agent eval runner
4450
-
4451
- Commands:
4452
- app Start server with UI
4453
- list List discovered evals
4454
- run Run evals
4455
- cache list List cached operation entries
4456
- cache clear --eval <id> Clear cache entries for one eval
4457
- cache clear --all Clear every cached entry
4458
- help Show this help
4459
-
4460
- Options:
4461
- --eval <id> Run specific eval(s) (comma-separated)
4462
- --case <id> Run specific case(s) (comma-separated)
4463
- --trials <n> Number of trials per case
4464
- --json Output results as JSON
4465
- --port <n> Server port (default: 4100)
4466
- --cache <use|bypass|refresh> Cache mode for this run (default: use)
4467
- --no-cache Shortcut for --cache bypass
4468
- --refresh-cache Shortcut for --cache refresh
4469
- --clear-cache Clear the cache before starting the run
4470
- --no-env Disable automatic .env loading
4471
- --help, -h Show help
4472
- `);
4473
- }
4474
- //#endregion
4475
- export { columnDefSchema as $, evalStatsConfigSchema as A, evalChartTypeSchema as B, spanCacheOptionsSchema as C, setEvalOutput as Ct, evalFreshnessStatusSchema as D, getEvalRegistry as Dt, caseRowSchema as E, defineEval as Et, evalChartBuiltinMetricSchema as F, traceAttributeDisplaySchema as G, traceAttributeDisplayFormatSchema as H, evalChartColorSchema as I, traceSpanErrorSchema as J, traceDisplayConfigSchema as K, evalChartConfigSchema as L, scoreTraceSchema as M, evalChartAggregateSchema as N, evalStatAggregateSchema as O, evalChartAxisSchema as P, cellValueSchema as Q, evalChartMetricSchema as R, serializedCacheSpanSchema as S, runInEvalScope as St, caseDetailSchema as T, repoFile as Tt, traceAttributeDisplayInputSchema as U, evalChartsConfigSchema as V, traceAttributeDisplayPlacementSchema as W, traceSpanSchema as X, traceSpanKindSchema as Y, traceSpanWarningSchema as Z, cacheListItemSchema as _, getCurrentScope as _t, sseEnvelopeSchema as a, repoFileRefSchema as at, cacheRecordingOpSchema as b, isInEvalScope as bt, deriveScopedSummaryFromCases as c, buildTraceTree as ct, runManifestSchema as d, evalTracer as dt, columnFormatSchema as et, runSummarySchema as f, hashCacheKey as ft, cacheFileSchema as g, evalAssert as gt, cacheEntrySchema as h, appendToEvalOutput as ht, updateManualScoreRequestSchema as i, numberDisplayOptionsSchema as it, evalSummarySchema as j, evalStatItemSchema as k, deriveStatusFromCaseRows as l, captureEvalSpanError as lt, trialSelectionModeSchema as m, EvalAssertionError as mt, createRunner as n, fileRefSchema as nt, getEvalTitle as o, runArtifactRefSchema as ot, agentEvalsConfigSchema as p, hashCacheKeySync as pt, traceDisplayInputConfigSchema as q, createRunRequestSchema as r, jsonCellSchema as rt, getEvalDisplayStatus as s, z$1 as st, runCli as t, columnKindSchema as tt, deriveStatusFromChildStatuses as u, evalSpan as ut, cacheModeSchema as v, getEvalCaseInput as vt, assertionFailureSchema as w, setScopeCacheContext as wt, cacheRecordingSchema as x, mergeEvalOutput as xt, cacheOperationTypeSchema as y, incrementEvalOutput as yt, evalChartTooltipExtraSchema as z };
3517
+ export { evalChartAxisSchema as $, runManifestSchema as A, evalTracer as At, cacheRecordingSchema as B, mergeEvalOutput as Bt, updateManualScoreRequestSchema as C, numberDisplayOptionsSchema as Ct, deriveScopedSummaryFromCases as D, buildTraceTree as Dt, getEvalDisplayStatus as E, z$1 as Et, cacheFileSchema as F, evalAssert as Ft, caseRowSchema as G, defineEval as Gt, spanCacheOptionsSchema as H, setEvalOutput as Ht, cacheListItemSchema as I, getCurrentScope as It, evalStatItemSchema as J, evalFreshnessStatusSchema as K, getEvalRegistry as Kt, cacheModeSchema as L, getEvalCaseInput as Lt, agentEvalsConfigSchema as M, hashCacheKeySync as Mt, trialSelectionModeSchema as N, EvalAssertionError as Nt, deriveStatusFromCaseRows as O, captureEvalSpanError as Ot, cacheEntrySchema as P, appendToEvalOutput as Pt, evalChartAggregateSchema as Q, cacheOperationTypeSchema as R, incrementEvalOutput as Rt, createRunRequestSchema as S, jsonCellSchema as St, getEvalTitle as T, runArtifactRefSchema as Tt, assertionFailureSchema as U, setScopeCacheContext as Ut, serializedCacheSpanSchema as V, runInEvalScope as Vt, caseDetailSchema as W, repoFile as Wt, evalSummarySchema as X, evalStatsConfigSchema as Y, scoreTraceSchema as Z, loadEvalModule as _, cellValueSchema as _t, loadPersistedRunSnapshot as a, evalChartTypeSchema as at, normalizeScoreDef as b, columnKindSchema as bt, persistCaseDetail as c, traceAttributeDisplayInputSchema as ct, recomputePersistedCaseStatus as d, traceDisplayConfigSchema as dt, evalChartBuiltinMetricSchema as et, runTouchesEval as f, traceDisplayInputConfigSchema as ft, setLatestRunInfoMap as g, traceSpanWarningSchema as gt, getTargetEvalIds as h, traceSpanSchema as ht, getLatestRunInfos as i, evalChartTooltipExtraSchema as it, runSummarySchema as j, hashCacheKey as jt, deriveStatusFromChildStatuses as k, evalSpan as kt, persistRunState as l, traceAttributeDisplayPlacementSchema as lt, buildEvalSummary as m, traceSpanKindSchema as mt, generateRunId as n, evalChartConfigSchema as nt, loadPersistedRunSnapshots as o, evalChartsConfigSchema as ot, resolveArtifactPath as p, traceSpanErrorSchema as pt, evalStatAggregateSchema as q, getLastRunStatuses as r, evalChartMetricSchema as rt, nextShortIdFromSnapshots as s, traceAttributeDisplayFormatSchema as st, executeRun as t, evalChartColorSchema as tt, recomputeEvalStatusesInRuns as u, traceAttributeDisplaySchema as ut, loadConfig as v, columnDefSchema as vt, sseEnvelopeSchema as w, repoFileRefSchema as wt, createFsCacheStore as x, fileRefSchema as xt, buildDeclaredColumnDefs as y, columnFormatSchema as yt, cacheRecordingOpSchema as z, isInEvalScope as zt };