@ls-stack/agent-eval 0.24.0 → 0.25.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-KbbX3NYr.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-r0dVFK0B.css">
28
+ <script type="module" crossorigin src="/assets/index-DqR1YaMG.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-DNsZjOms.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-Be0x8CS3.mjs";
2
+ import { t as runCli } from "./cli-ETfZ15RB.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, Rn as getEvalRegistry, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, nt as resolveLlmCallsConfig, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveApiCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-D697g6Qe.mjs";
1
+ import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-B31SV_Bq.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -8,6 +8,34 @@ import { existsSync } from "node:fs";
8
8
  import { resultify } from "t-result";
9
9
  import { fileURLToPath } from "node:url";
10
10
  import { spawn, spawnSync } from "node:child_process";
11
+ //#region ../runner/src/evalSummaries.ts
12
+ /** Build the API/UI summary payload for one discovered eval. */
13
+ function buildEvalSummary(params) {
14
+ const { meta, config, gitState, latestRun, lastRunStatus } = params;
15
+ const { sourceFingerprint, ...summaryMeta } = meta;
16
+ const freshness = deriveEvalFreshness({
17
+ latestRun,
18
+ gitState,
19
+ currentEvalSourceFingerprint: sourceFingerprint,
20
+ staleAfterDays: config.staleAfterDays ?? 14
21
+ });
22
+ return {
23
+ ...summaryMeta,
24
+ stale: freshness.stale,
25
+ outdated: freshness.outdated,
26
+ freshnessStatus: freshness.freshnessStatus,
27
+ latestRunAt: latestRun?.startedAt ?? null,
28
+ latestRunCommitSha: latestRun?.commitSha ?? null,
29
+ currentCommitSha: gitState.commitSha,
30
+ lastRunStatus
31
+ };
32
+ }
33
+ /** Write one latest-run snapshot to each targeted eval id. */
34
+ function setLatestRunInfoMap(params) {
35
+ const { latestRunInfoMap, evalIds, info } = params;
36
+ for (const evalId of evalIds) latestRunInfoMap.set(evalId, info);
37
+ }
38
+ //#endregion
11
39
  //#region ../runner/src/gitState.ts
12
40
  function runGitCommand(workspaceRoot, args) {
13
41
  const result = spawnSync("git", args, {
@@ -142,16 +170,16 @@ function handleRunChildMessage(params) {
142
170
  handleRunChildEvent(runState, message.event, managerContext);
143
171
  }
144
172
  function upsertFinishedCase(runState, caseDetail, caseRow) {
145
- const existingIndex = runState.cases.findIndex((row) => row.evalId === caseRow.evalId && row.caseId === caseRow.caseId && row.trial === caseRow.trial);
173
+ const existingIndex = runState.cases.findIndex((row) => getCaseRowCaseKey(row) === getCaseRowCaseKey(caseRow) && row.trial === caseRow.trial);
146
174
  if (existingIndex === -1) runState.cases.push(caseRow);
147
175
  else runState.cases[existingIndex] = caseRow;
148
- runState.caseDetails.set(caseDetail.caseId, caseDetail);
176
+ runState.caseDetails.set(caseDetail.caseKey ?? caseDetail.caseId, caseDetail);
149
177
  }
150
178
  function applyChildEvalMetas(evals, childMetas) {
151
179
  for (const childMeta of childMetas) {
152
- const evalMeta = evals.get(childMeta.id);
180
+ const evalMeta = evals.get(childMeta.key);
153
181
  if (evalMeta === void 0) {
154
- evals.set(childMeta.id, childMeta);
182
+ evals.set(childMeta.key, childMeta);
155
183
  continue;
156
184
  }
157
185
  evalMeta.columnDefs = childMeta.columnDefs;
@@ -266,6 +294,7 @@ function createRunner({ watchForChanges = true } = {}) {
266
294
  let llmCallsConfig = resolveLlmCallsConfig(void 0);
267
295
  let apiCallsConfig = resolveApiCallsConfig(void 0);
268
296
  const evals = /* @__PURE__ */ new Map();
297
+ let discoveryIssues = [];
269
298
  const runs = /* @__PURE__ */ new Map();
270
299
  const lastRunStatusMap = /* @__PURE__ */ new Map();
271
300
  const latestRunInfoMap = /* @__PURE__ */ new Map();
@@ -279,7 +308,13 @@ function createRunner({ watchForChanges = true } = {}) {
279
308
  return relative(workspaceRoot, filePath).replaceAll("\\", "/");
280
309
  }
281
310
  function getSortedEvalMetas() {
282
- return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath));
311
+ return [...evals.values()].toSorted((a, b) => a.filePath.localeCompare(b.filePath) || a.id.localeCompare(b.id));
312
+ }
313
+ function resolveEvalMeta(evalRef) {
314
+ const exactMatch = evals.get(evalRef);
315
+ if (exactMatch !== void 0) return exactMatch;
316
+ const matches = getSortedEvalMetas().filter((ev) => ev.id === evalRef);
317
+ return matches.length === 1 ? matches[0] : void 0;
283
318
  }
284
319
  function getSourceFingerprint(source) {
285
320
  return createHash("sha256").update(source).digest("hex");
@@ -312,12 +347,12 @@ function createRunner({ watchForChanges = true } = {}) {
312
347
  async clearCache(filter) {
313
348
  await cacheStore.clear(filter);
314
349
  },
315
- async recomputeStatusesForEval(evalId) {
316
- const evalMeta = evals.get(evalId);
350
+ async recomputeStatusesForEval(evalKey) {
351
+ const evalMeta = resolveEvalMeta(evalKey);
317
352
  if (!evalMeta) return { updatedRuns: 0 };
318
353
  const registry = getEvalRegistry();
319
354
  await loadEvalModule(evalMeta.sourceFilePath, evalMeta.sourceFingerprint ?? void 0);
320
- const entry = registry.get(evalId);
355
+ const entry = registry.get(evalMeta.id);
321
356
  if (!entry) return { updatedRuns: 0 };
322
357
  const scoreThresholds = /* @__PURE__ */ new Map();
323
358
  entry.use((evalDef) => {
@@ -329,22 +364,25 @@ function createRunner({ watchForChanges = true } = {}) {
329
364
  });
330
365
  const updatedRuns = await recomputeEvalStatusesInRuns({
331
366
  runs: runs.values(),
332
- evalId,
333
- evalExists: evals.has(evalId),
367
+ evalKey: evalMeta.key,
368
+ evalId: evalMeta.id,
369
+ evalExists: evals.has(evalMeta.key),
334
370
  scoreThresholds,
335
371
  persistCaseDetail
336
372
  });
337
373
  emitDiscoveryEvent();
338
374
  return { updatedRuns };
339
375
  },
340
- async cleanRunsForEval(evalId) {
376
+ async cleanRunsForEval(evalKey) {
377
+ const evalMeta = resolveEvalMeta(evalKey);
341
378
  let deletedRuns = 0;
342
379
  for (const [runId, run] of [...runs]) {
343
380
  if (!runTouchesEval({
344
381
  target: run.manifest.target,
345
382
  caseRows: run.cases,
346
- evalId,
347
- evalExists: evals.has(evalId)
383
+ evalKey: evalMeta?.key ?? evalKey,
384
+ evalId: evalMeta?.id,
385
+ evalExists: evalMeta !== void 0
348
386
  })) continue;
349
387
  if (run.manifest.status === "running") continue;
350
388
  runs.delete(runId);
@@ -367,12 +405,12 @@ function createRunner({ watchForChanges = true } = {}) {
367
405
  updated: false,
368
406
  reason: "Run is still running"
369
407
  };
370
- const caseRow = run.cases.find((row) => row.caseId === caseId);
408
+ const caseRow = run.cases.find((row) => getCaseRowCaseKey(row) === caseId || row.caseId === caseId);
371
409
  if (!caseRow) return {
372
410
  updated: false,
373
411
  reason: "Case not found"
374
412
  };
375
- const evalMeta = evals.get(caseRow.evalId);
413
+ const evalMeta = evals.get(getCaseRowEvalKey(caseRow));
376
414
  if (!evalMeta) return {
377
415
  updated: false,
378
416
  reason: "Eval not found"
@@ -381,7 +419,7 @@ function createRunner({ watchForChanges = true } = {}) {
381
419
  updated: false,
382
420
  reason: "Manual score not found"
383
421
  };
384
- const caseDetail = run.caseDetails.get(caseId);
422
+ const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
385
423
  if (!caseDetail) return {
386
424
  updated: false,
387
425
  reason: "Case detail not found"
@@ -435,22 +473,25 @@ function createRunner({ watchForChanges = true } = {}) {
435
473
  meta,
436
474
  config,
437
475
  gitState,
438
- latestRun: latestRunInfoMap.get(meta.id),
439
- lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
476
+ latestRun: latestRunInfoMap.get(meta.key),
477
+ lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
440
478
  }));
441
479
  return result;
442
480
  },
443
481
  getEval(id) {
444
- const meta = evals.get(id);
482
+ const meta = resolveEvalMeta(id);
445
483
  if (!meta) return void 0;
446
484
  return buildEvalSummary({
447
485
  meta,
448
486
  config,
449
487
  gitState: readGitWorktreeState(workspaceRoot),
450
- latestRun: latestRunInfoMap.get(meta.id),
451
- lastRunStatus: lastRunStatusMap.get(meta.id) ?? null
488
+ latestRun: latestRunInfoMap.get(meta.key),
489
+ lastRunStatus: lastRunStatusMap.get(meta.key) ?? null
452
490
  });
453
491
  },
492
+ getDiscoveryIssues() {
493
+ return discoveryIssues;
494
+ },
454
495
  async refreshDiscovery() {
455
496
  const patterns = config.include;
456
497
  const discovered = [];
@@ -462,16 +503,25 @@ function createRunner({ watchForChanges = true } = {}) {
462
503
  discovered.push(...files);
463
504
  }
464
505
  evals.clear();
506
+ discoveryIssues = [];
465
507
  for (const filePath of discovered) try {
466
508
  const content = await readFile(filePath, "utf-8");
467
- const discoveredMetas = parseEvalMetas(filePath, content);
509
+ const discovery = parseEvalDiscovery(filePath, content);
510
+ const discoveredMetas = discovery.metas;
511
+ discoveryIssues.push(...discovery.issues.map((issue) => ({
512
+ ...issue,
513
+ filePath: toWorkspaceRelativePath(issue.filePath),
514
+ message: `Duplicate eval id "${issue.evalId}" in ${toWorkspaceRelativePath(issue.filePath)}. Eval ids must be unique within one file.`
515
+ })));
468
516
  const sourceFingerprint = getSourceFingerprint(content);
469
517
  const registry = getEvalRegistry();
518
+ let moduleLoaded = false;
470
519
  try {
471
520
  await loadEvalModule(filePath, sourceFingerprint);
521
+ moduleLoaded = true;
472
522
  } catch {}
473
523
  for (const meta of discoveredMetas) {
474
- const discoveredEntry = registry.get(meta.id);
524
+ const discoveredEntry = moduleLoaded ? registry.get(meta.id) : void 0;
475
525
  const title = meta.title;
476
526
  let columnDefs = buildDeclaredColumnDefs(void 0, void 0, void 0);
477
527
  let stats;
@@ -491,10 +541,16 @@ function createRunner({ watchForChanges = true } = {}) {
491
541
  for (const warning of validated.warnings) console.warn(warning);
492
542
  charts = validated.charts;
493
543
  });
494
- evals.set(meta.id, {
544
+ const relativeFilePath = toWorkspaceRelativePath(meta.filePath);
545
+ const key = buildEvalKey({
546
+ filePath: relativeFilePath,
547
+ evalId: meta.id
548
+ });
549
+ evals.set(key, {
550
+ key,
495
551
  id: meta.id,
496
552
  title,
497
- filePath: toWorkspaceRelativePath(meta.filePath),
553
+ filePath: relativeFilePath,
498
554
  sourceFilePath: meta.filePath,
499
555
  sourceFingerprint,
500
556
  columnDefs,
@@ -549,10 +605,9 @@ function createRunner({ watchForChanges = true } = {}) {
549
605
  runs.set(runId, runState);
550
606
  setLatestRunInfoMap({
551
607
  latestRunInfoMap,
552
- evalIds: getTargetEvalIds({
608
+ evalIds: getTargetEvalKeys({
553
609
  request,
554
- sortedEvalIds: getSortedEvalMetas().map((meta) => meta.id),
555
- knownEvalIds: new Set(evals.keys())
610
+ sortedEvals: getSortedEvalMetas()
556
611
  }),
557
612
  info: {
558
613
  status: "running",
@@ -633,7 +688,7 @@ function createRunner({ watchForChanges = true } = {}) {
633
688
  getCaseDetail(runId, caseId) {
634
689
  const run = runs.get(runId);
635
690
  if (!run) return void 0;
636
- return run.caseDetails.get(caseId);
691
+ return run.caseDetails.get(caseId) ?? run.caseDetails.get(getCaseRowCaseKey(run.cases.find((caseRow) => getCaseRowCaseKey(caseRow) === caseId || caseRow.caseId === caseId) ?? { caseId }));
637
692
  },
638
693
  subscribe(runId, listener) {
639
694
  const run = runs.get(runId);
@@ -799,6 +854,7 @@ function parseArgs(argv) {
799
854
  helpTopic: "global",
800
855
  unknownHelpTarget: void 0,
801
856
  evalIds: [],
857
+ files: [],
802
858
  caseIds: [],
803
859
  trials: 1,
804
860
  json: false,
@@ -834,6 +890,9 @@ function parseArgs(argv) {
834
890
  else if (arg === "--eval" && next) {
835
891
  args.evalIds.push(...next.split(","));
836
892
  i++;
893
+ } else if (arg === "--file" && next) {
894
+ args.files.push(...next.split(","));
895
+ i++;
837
896
  } else if (arg === "--case" && next) {
838
897
  args.caseIds.push(...next.split(","));
839
898
  i++;
@@ -899,6 +958,28 @@ async function runCli(argv) {
899
958
  function isCliCommand(command) {
900
959
  return command === "app" || command === "list" || command === "run" || command === "show-runs" || command === "cache" || command === "help";
901
960
  }
961
+ function escapeRegex(value) {
962
+ return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
963
+ }
964
+ function globToRegex(pattern) {
965
+ const normalized = pattern.replaceAll("\\", "/");
966
+ let regex = "^";
967
+ for (let i = 0; i < normalized.length; i++) {
968
+ const char = normalized[i];
969
+ const next = normalized[i + 1];
970
+ if (char === "*" && next === "*") {
971
+ regex += ".*";
972
+ i++;
973
+ } else if (char === "*") regex += "[^/]*";
974
+ else if (char === "?") regex += "[^/]";
975
+ else regex += escapeRegex(char ?? "");
976
+ }
977
+ return new RegExp(`${regex}$`);
978
+ }
979
+ function fileMatches(pattern, filePath) {
980
+ const normalized = pattern.replaceAll("\\", "/");
981
+ return normalized === filePath || globToRegex(normalized).test(filePath);
982
+ }
902
983
  function loadWorkspaceEnv() {
903
984
  const envPath = resolve(process.cwd(), ".env");
904
985
  if (!existsSync(envPath)) return true;
@@ -959,8 +1040,8 @@ async function commandApp(args) {
959
1040
  const { serve } = await import("@hono/node-server");
960
1041
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
961
1042
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
962
- const appModule = await import("./app-DYRmucgj.mjs");
963
- const runnerModule = await import("./runner-jSujaSKt.mjs");
1043
+ const appModule = await import("./app-DS3j_AyX.mjs");
1044
+ const runnerModule = await import("./runner-B2f2TEjp.mjs");
964
1045
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
965
1046
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
966
1047
  await runnerModule.initRunner();
@@ -973,9 +1054,16 @@ async function commandApp(args) {
973
1054
  async function commandList(args_) {
974
1055
  const runner = createRunner({ watchForChanges: false });
975
1056
  await runner.init();
1057
+ const discoveryIssues = runner.getDiscoveryIssues();
1058
+ if (discoveryIssues.length > 0) {
1059
+ console.error("Discovery errors:\n");
1060
+ for (const issue of discoveryIssues) console.error(` ${issue.message}`);
1061
+ console.error("");
1062
+ }
976
1063
  const evals = runner.getEvals();
977
1064
  if (evals.length === 0) {
978
1065
  console.info("No eval files found.");
1066
+ if (discoveryIssues.length > 0) process.exit(1);
979
1067
  return;
980
1068
  }
981
1069
  console.info("Discovered evals:\n");
@@ -994,12 +1082,13 @@ async function commandList(args_) {
994
1082
  if (ev.caseCount !== null) console.info(` cases: ${String(ev.caseCount)}`);
995
1083
  console.info("");
996
1084
  }
1085
+ if (discoveryIssues.length > 0) process.exit(1);
997
1086
  }
998
1087
  async function commandRun(args) {
999
1088
  const runner = createRunner({ watchForChanges: false });
1000
1089
  await runner.init();
1001
- if (args.evalIds.length === 0 && args.caseIds.length === 0 && !runner.getAllowCliRunAll()) {
1002
- console.error("This workspace disables running all evals from the CLI. Pass --eval <id> or --case <id> to run a targeted subset.");
1090
+ if (args.evalIds.length === 0 && args.caseIds.length === 0 && args.files.length === 0 && !runner.getAllowCliRunAll()) {
1091
+ console.error("This workspace disables running all evals from the CLI. Pass --eval <id>, --file <path|glob>, or --case <id> to run a targeted subset.");
1003
1092
  process.exit(1);
1004
1093
  return;
1005
1094
  }
@@ -1013,10 +1102,15 @@ async function commandRun(args) {
1013
1102
  const target = args.caseIds.length > 0 ? {
1014
1103
  mode: "caseIds",
1015
1104
  caseIds: args.caseIds,
1016
- evalIds: args.evalIds.length > 0 ? args.evalIds : void 0
1105
+ evalIds: args.evalIds.length > 0 ? args.evalIds : void 0,
1106
+ files: args.files.length > 0 ? args.files : void 0
1017
1107
  } : args.evalIds.length > 0 ? {
1018
1108
  mode: "evalIds",
1019
- evalIds: args.evalIds
1109
+ evalIds: args.evalIds,
1110
+ files: args.files.length > 0 ? args.files : void 0
1111
+ } : args.files.length > 0 ? {
1112
+ mode: "evalIds",
1113
+ files: args.files
1020
1114
  } : { mode: "all" };
1021
1115
  const run = await runner.startRun({
1022
1116
  target,
@@ -1046,8 +1140,12 @@ async function commandRun(args) {
1046
1140
  console.info(`Errors: ${String(summary.errorCases)}`);
1047
1141
  if (summary.totalCases > 0) console.info(`Pass Rate: ${String(summary.passedCases)}/${String(summary.totalCases)}`);
1048
1142
  if (summary.totalDurationMs !== null) console.info(`Duration: ${(summary.totalDurationMs / 1e3).toFixed(1)}s`);
1143
+ if (summary.errorMessage !== null) {
1144
+ console.info("");
1145
+ console.info(summary.errorMessage);
1146
+ }
1049
1147
  }
1050
- if (summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
1148
+ if (summary.status === "error" || summary.failedCases > 0 || summary.errorCases > 0) process.exit(1);
1051
1149
  }
1052
1150
  async function commandShowRuns(args) {
1053
1151
  const runner = createRunner({ watchForChanges: false });
@@ -1101,8 +1199,9 @@ async function commandCache(args) {
1101
1199
  return;
1102
1200
  }
1103
1201
  if (args.subcommand === "clear") {
1104
- if (args.evalIds.length > 0) {
1105
- for (const evalId of args.evalIds) {
1202
+ if (args.evalIds.length > 0 || args.files.length > 0) {
1203
+ const evalIds = runner.getEvals().filter((ev) => (args.evalIds.length === 0 || args.evalIds.includes(ev.id)) && (args.files.length === 0 || args.files.some((file) => fileMatches(file, ev.filePath)))).map((ev) => ev.id);
1204
+ for (const evalId of evalIds) {
1106
1205
  const entries = await runner.listCache();
1107
1206
  const prefix = `${evalId}__`;
1108
1207
  const matching = entries.filter((entry) => entry.namespace.startsWith(prefix));
@@ -1111,7 +1210,7 @@ async function commandCache(args) {
1111
1210
  key: entry.key
1112
1211
  });
1113
1212
  }
1114
- console.info(`Cleared cache entries for: ${args.evalIds.join(", ")}`);
1213
+ console.info(`Cleared cache entries for: ${evalIds.join(", ")}`);
1115
1214
  return;
1116
1215
  }
1117
1216
  if (args.all) {
@@ -1130,6 +1229,9 @@ function getSortedRunSnapshots(runner) {
1130
1229
  }
1131
1230
  function buildRunFileIndex(workspaceRoot, run) {
1132
1231
  const runDir = join(workspaceRoot, ".agent-evals", "runs", run.manifest.id);
1232
+ const caseIdCounts = /* @__PURE__ */ new Map();
1233
+ for (const caseRow of run.cases) caseIdCounts.set(caseRow.caseId, (caseIdCounts.get(caseRow.caseId) ?? 0) + 1);
1234
+ const seenCaseIds = /* @__PURE__ */ new Set();
1133
1235
  return {
1134
1236
  id: run.manifest.id,
1135
1237
  shortId: run.manifest.shortId,
@@ -1147,10 +1249,16 @@ function buildRunFileIndex(workspaceRoot, run) {
1147
1249
  tracesDir: join(runDir, "traces")
1148
1250
  },
1149
1251
  cases: run.cases.map((caseRow) => {
1150
- const fileName = `${encodeURIComponent(caseRow.caseId)}.json`;
1252
+ const duplicateCaseIdCount = caseIdCounts.get(caseRow.caseId) ?? 0;
1253
+ const hasPreviousCaseWithId = seenCaseIds.has(caseRow.caseId);
1254
+ const fileId = duplicateCaseIdCount > 1 && hasPreviousCaseWithId ? caseRow.caseKey ?? caseRow.caseId : caseRow.caseId;
1255
+ seenCaseIds.add(caseRow.caseId);
1256
+ const fileName = `${encodeURIComponent(fileId)}.json`;
1151
1257
  return {
1152
1258
  caseId: caseRow.caseId,
1259
+ caseKey: caseRow.caseKey,
1153
1260
  evalId: caseRow.evalId,
1261
+ evalKey: caseRow.evalKey,
1154
1262
  status: caseRow.status,
1155
1263
  files: {
1156
1264
  caseDetail: join(runDir, "case-details", fileName),
@@ -1262,7 +1370,8 @@ Usage:
1262
1370
 
1263
1371
  Flags:
1264
1372
  --eval <id> Run specific eval(s) (comma-separated)
1265
- --case <id> Run specific case(s) (comma-separated)
1373
+ --file <path|glob> Run eval files matching path/glob (comma-separated)
1374
+ --case <id> Run case(s); combine with --file/--eval if ambiguous
1266
1375
  --trials <n> Number of trials per case
1267
1376
  --inspect[=host:port] Run with the Node.js inspector enabled
1268
1377
  --inspect-brk[=host:port] Enable inspector and pause before startup
package/dist/index.d.mts CHANGED
@@ -457,6 +457,7 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
457
457
  type EvalStatsConfig = z$1.infer<typeof evalStatsConfigSchema>;
458
458
  /** Schema summarizing a discovered eval for list and overview screens. */
459
459
  declare const evalSummarySchema: z$1.ZodObject<{
460
+ key: z$1.ZodDefault<z$1.ZodString>;
460
461
  id: z$1.ZodString;
461
462
  title: z$1.ZodOptional<z$1.ZodString>;
462
463
  filePath: z$1.ZodString;
@@ -635,6 +636,8 @@ declare const evalSummarySchema: z$1.ZodObject<{
635
636
  type EvalSummary = z$1.infer<typeof evalSummarySchema>;
636
637
  /** Schema for one case row in an eval run result table. */
637
638
  declare const caseRowSchema: z$1.ZodObject<{
639
+ evalKey: z$1.ZodOptional<z$1.ZodString>;
640
+ caseKey: z$1.ZodOptional<z$1.ZodString>;
638
641
  caseId: z$1.ZodString;
639
642
  evalId: z$1.ZodString;
640
643
  status: z$1.ZodEnum<{
@@ -796,6 +799,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
796
799
  type ScoreTrace = z$1.infer<typeof scoreTraceSchema>;
797
800
  /** Schema for the detailed payload shown when opening a specific case. */
798
801
  declare const caseDetailSchema: z$1.ZodObject<{
802
+ evalKey: z$1.ZodOptional<z$1.ZodString>;
803
+ caseKey: z$1.ZodOptional<z$1.ZodString>;
799
804
  caseId: z$1.ZodString;
800
805
  evalId: z$1.ZodString;
801
806
  status: z$1.ZodEnum<{
@@ -1009,6 +1014,43 @@ declare const caseDetailSchema: z$1.ZodObject<{
1009
1014
  }, z$1.core.$strip>;
1010
1015
  /** Full case payload including inputs, trace, outputs, and failures. */
1011
1016
  type CaseDetail = z$1.infer<typeof caseDetailSchema>;
1017
+ /** Schema for discovery problems that should be shown before running evals. */
1018
+ declare const discoveryIssueSchema: z$1.ZodObject<{
1019
+ type: z$1.ZodEnum<{
1020
+ "duplicate-eval-id": "duplicate-eval-id";
1021
+ }>;
1022
+ severity: z$1.ZodEnum<{
1023
+ error: "error";
1024
+ }>;
1025
+ filePath: z$1.ZodString;
1026
+ evalId: z$1.ZodString;
1027
+ message: z$1.ZodString;
1028
+ }, z$1.core.$strip>;
1029
+ /** Discovery problem found while scanning eval files. */
1030
+ type DiscoveryIssue = z$1.infer<typeof discoveryIssueSchema>;
1031
+ //#endregion
1032
+ //#region ../shared/src/evalIdentity.d.ts
1033
+ /** Build the stable identity for one eval inside a workspace. */
1034
+ declare function buildEvalKey(params: {
1035
+ filePath: string;
1036
+ evalId: string;
1037
+ }): string;
1038
+ /** Build the stable identity for one eval case inside a workspace. */
1039
+ declare function buildCaseKey(params: {
1040
+ filePath: string;
1041
+ evalId: string;
1042
+ caseId: string;
1043
+ }): string;
1044
+ /** Return the collision-safe eval key stored on a row, falling back for legacy data. */
1045
+ declare function getCaseRowEvalKey(row: {
1046
+ evalKey?: string;
1047
+ evalId: string;
1048
+ }): string;
1049
+ /** Return the collision-safe case key stored on a row, falling back for legacy data. */
1050
+ declare function getCaseRowCaseKey(row: {
1051
+ caseKey?: string;
1052
+ caseId: string;
1053
+ }): string;
1012
1054
  //#endregion
1013
1055
  //#region ../shared/src/schemas/chart.d.ts
1014
1056
  /** Chart type rendered for a single eval history chart. */
@@ -1338,6 +1380,8 @@ declare const runManifestSchema: z$1.ZodObject<{
1338
1380
  evalIds: "evalIds";
1339
1381
  caseIds: "caseIds";
1340
1382
  }>;
1383
+ evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1384
+ files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1341
1385
  evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1342
1386
  caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1343
1387
  }, z$1.core.$strip>;
@@ -2772,6 +2816,8 @@ declare const createRunRequestSchema: z$1.ZodObject<{
2772
2816
  evalIds: "evalIds";
2773
2817
  caseIds: "caseIds";
2774
2818
  }>;
2819
+ evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
2820
+ files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
2775
2821
  evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
2776
2822
  caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
2777
2823
  }, z$1.core.$strip>;
@@ -3564,7 +3610,8 @@ type CacheClearFilter = {
3564
3610
  type EvalRunner = {
3565
3611
  /** Load workspace config, discover evals, and start file watching when enabled. */init(): Promise<void>; /** Return the currently discovered eval summaries for the active workspace. */
3566
3612
  getEvals(): EvalSummary[]; /** Look up one discovered eval by id. */
3567
- getEval(id: string): EvalSummary | undefined; /** Re-scan configured eval files and emit a discovery update to listeners. */
3613
+ getEval(id: string): EvalSummary | undefined; /** Return discovery errors that should be shown before running evals. */
3614
+ getDiscoveryIssues(): DiscoveryIssue[]; /** Re-scan configured eval files and emit a discovery update to listeners. */
3568
3615
  refreshDiscovery(): Promise<void>;
3569
3616
  startRun(request: CreateRunRequest): Promise<{
3570
3617
  manifest: RunManifest;
@@ -3617,11 +3664,19 @@ type EvalRunner = {
3617
3664
  * Remove cache entries matching `filter`, or all entries when no filter is
3618
3665
  * supplied.
3619
3666
  */
3620
- clearCache(filter?: CacheClearFilter): Promise<void>; /** Recompute persisted case and run statuses for terminal runs touching one eval. */
3621
- recomputeStatusesForEval(evalId: string): Promise<{
3667
+ clearCache(filter?: CacheClearFilter): Promise<void>;
3668
+ /**
3669
+ * Recompute persisted case and run statuses for terminal runs touching one
3670
+ * eval. Accepts the exact eval key, with a legacy fallback for unique eval ids.
3671
+ */
3672
+ recomputeStatusesForEval(evalKey: string): Promise<{
3622
3673
  updatedRuns: number;
3623
- }>; /** Delete terminal persisted runs that touch one eval from in-memory history and disk. */
3624
- cleanRunsForEval(evalId: string): Promise<{
3674
+ }>;
3675
+ /**
3676
+ * Delete terminal persisted runs that touch one eval from memory and disk.
3677
+ * Accepts the exact eval key, with a legacy fallback for unique eval ids.
3678
+ */
3679
+ cleanRunsForEval(evalKey: string): Promise<{
3625
3680
  deletedRuns: number;
3626
3681
  }>; /** Persist a UI-authored manual score for one case and recompute affected summaries. */
3627
3682
  updateManualScore(params: {
@@ -3667,4 +3722,4 @@ declare function createRunner({
3667
3722
  */
3668
3723
  declare function runCli(argv: string[]): Promise<void>;
3669
3724
  //#endregion
3670
- export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type 
NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, 
evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
3725
+ export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type 
LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, 
evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as llmCallsConfigSchema, $t as columnFormatSchema, A as extractApiCalls, An as runInEvalRuntimeScope, At as cacheEntryWithDebugKeySchema, B as runSummarySchema, Bt as traceCacheRefSchema, Cn as getCurrentScope, Ct as evalChartMetricSchema, D as sseEnvelopeSchema, Dn as isInEvalScope, Dt as cacheDebugKeyEntrySchema, E as updateManualScoreRequestSchema, En as incrementEvalOutput, Et as evalChartsConfigSchema, F as getEvalDisplayStatus, Fn as startEvalBackgroundJob, Ft as cacheRecordingOpSchema, G as apiCallMetricPlacementSchema, Gt as traceDisplayConfigSchema, H as DEFAULT_LLM_CALLS_CONFIG, Ht as traceAttributeDisplayInputSchema, I as deriveScopedSummaryFromCases, In as repoFile, It as cacheRecordingSchema, J as defaultConfigKeySchema, Jt as traceSpanKindSchema, K as apiCallMetricSchema, Kt as traceDisplayInputConfigSchema, L as deriveStatusFromCaseRows, Ln as defineEval, Lt as cacheStatusSchema, M as applyDerivedCallAttributes, Mn as runInExistingEvalScope, Mt as cacheListItemSchema, N as getNestedAttribute, Nn as setEvalOutput, Nt as cacheModeSchema, O as extractCacheEntries, On as mergeEvalOutput, Ot as cacheDebugKeyFileSchema, P as getEvalTitle, Pn as setScopeCacheContext, Pt as cacheOperationTypeSchema, Q as llmCallPricingSchema, Qt as columnDefSchema, R as deriveStatusFromChildStatuses, Rn as getEvalRegistry, Rt as serializedCacheSpanSchema, Sn as evalLog, St as evalChartConfigSchema, T as createRunRequestSchema, Tn as getEvalStartTime, Tt as evalChartTypeSchema, U as agentEvalsConfigSchema, Ut as traceAttributeDisplayPlacementSchema, V as DEFAULT_API_CALLS_CONFIG, Vt as traceAttributeDisplayFormatSchema, W as apiCallMetricFormatSchema, Wt as traceAttributeDisplaySchema, X as llmCallMetricPlacementSchema, Xt as traceSpanWarningSchema, Y as llmCallMetricFormatSchema, Yt as traceSpanSchema, Z as llmCallMetricSchema, Zt as cellValueSchema, _n as EvalAssertionError, _t as scoreTraceSchema, an as runArtifactRefSchema, at as assertionFailureSchema, bt as 
evalChartBuiltinMetricSchema, cn as captureEvalSpanError, ct as evalFreshnessStatusSchema, dn as hashCacheKey, dt as evalStatsConfigSchema, en as columnKindSchema, et as removeDefaultConfigSchema, fn as hashCacheKeySync, ft as evalSummarySchema, gn as serializeCacheValue, gt as runLogPhaseSchema, hn as serializeCacheRecording, ht as runLogLocationSchema, in as repoFileRefSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as runInEvalScope, jt as cacheFileSchema, k as extractCacheHits, kn as nextEvalId, kt as cacheEntrySchema, ln as evalSpan, lt as evalStatAggregateSchema, mn as deserializeCacheValue, mt as runLogLevelSchema, nn as jsonCellSchema, nt as resolveLlmCallsConfig, on as z, ot as caseDetailSchema, pn as deserializeCacheRecording, pt as runLogEntrySchema, q as apiCallsConfigSchema, qt as traceSpanErrorSchema, rn as numberDisplayOptionsSchema, rt as runLogsConfigSchema, sn as buildTraceTree, st as caseRowSchema, tn as fileRefSchema, tt as resolveApiCallsConfig, un as evalTracer, ut as evalStatItemSchema, vn as advanceEvalTime, vt as evalChartAggregateSchema, wn as getEvalCaseInput, wt as evalChartTooltipExtraSchema, xn as evalAssert, xt as evalChartColorSchema, yn as appendToEvalOutput, yt as evalChartAxisSchema, z as runManifestSchema, zt as spanCacheOptionsSchema } from "./runOrchestration-D697g6Qe.mjs";
2
- import { n as createRunner, t as runCli } from "./cli-Be0x8CS3.mjs";
3
- import "./src-D6cettg0.mjs";
4
- export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, 
runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
1
+ import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as 
columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-B31SV_Bq.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-ETfZ15RB.mjs";
3
+ import "./src-CyNb2ycA.mjs";
4
+ export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, 
runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };