@ls-stack/agent-eval 0.54.0 → 0.55.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,7 +10,7 @@ import { createHash, randomUUID } from "node:crypto";
10
10
  import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
11
11
  import { existsSync } from "node:fs";
12
12
  import { brotliCompressSync, brotliDecompressSync } from "node:zlib";
13
- import { Result, resultify } from "t-result";
13
+ import { resultify } from "t-result";
14
14
  import { fileURLToPath, pathToFileURL } from "node:url";
15
15
  //#region ../sdk/src/defineEval.ts
16
16
  const evalRegistry = /* @__PURE__ */ new Map();
@@ -1692,10 +1692,10 @@ function evaluateTagExpression(expression, tags) {
1692
1692
  }
1693
1693
  function tagMatchesPattern(tag, pattern) {
1694
1694
  if (!pattern.includes("*")) return tag === pattern;
1695
- const source = pattern.split("*").map(escapeRegex$1).join(".*");
1695
+ const source = pattern.split("*").map(escapeRegex).join(".*");
1696
1696
  return new RegExp(`^${source}$`).test(tag);
1697
1697
  }
1698
- function escapeRegex$1(value) {
1698
+ function escapeRegex(value) {
1699
1699
  return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
1700
1700
  }
1701
1701
  //#endregion
@@ -4928,6 +4928,9 @@ const externalJsonCacheSerializationMarker = "v1:ExternalJson";
4928
4928
  const externalJsonBlobExtension = ".json.br";
4929
4929
  const cacheEntryExtension = ".json.br";
4930
4930
  const debugEntryExtension = ".json";
4931
+ async function commitPendingCacheWrites(params) {
4932
+ for (const pendingWrite of params.pendingWrites) await params.backingStore.write(pendingWrite.entry, pendingWrite.debugKey);
4933
+ }
4931
4934
  /**
4932
4935
  * Create a filesystem-backed cache adapter rooted at `<workspaceRoot>/<dir>`.
4933
4936
  *
@@ -5068,10 +5071,16 @@ function createBufferedCacheStore(backingStore) {
5068
5071
  return Promise.resolve();
5069
5072
  },
5070
5073
  async commit() {
5071
- for (const pending of pendingEntries.values()) await backingStore.write(pending.entry, pending.debugKey);
5074
+ await commitPendingCacheWrites({
5075
+ backingStore,
5076
+ pendingWrites: [...pendingEntries.values()].map((pending) => ({ ...pending }))
5077
+ });
5072
5078
  },
5073
5079
  getPendingEntries() {
5074
5080
  return [...pendingEntries.values()].map((pending) => pending.entry);
5081
+ },
5082
+ getPendingWrites() {
5083
+ return [...pendingEntries.values()].map((pending) => ({ ...pending }));
5075
5084
  }
5076
5085
  };
5077
5086
  }
@@ -5412,80 +5421,6 @@ function isRecordLike(value) {
5412
5421
  return typeof value === "object" && value !== null && !Array.isArray(value);
5413
5422
  }
5414
5423
  //#endregion
5415
- //#region ../runner/src/chartValidation.ts
5416
- function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
5417
- const columnDef = columnsByKey.get(metric.key);
5418
- if (!columnDef) {
5419
- warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
5420
- return false;
5421
- }
5422
- if (metric.aggregate === "passThresholdRate") {
5423
- if (columnDef.isScore !== true || typeof columnDef.passThreshold !== "number") {
5424
- warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
5425
- return false;
5426
- }
5427
- }
5428
- return true;
5429
- }
5430
- function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
5431
- const columnDef = columnsByKey.get(extra.key);
5432
- if (!columnDef) {
5433
- warnings.push(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
5434
- return false;
5435
- }
5436
- if (extra.aggregate === "passThresholdRate") {
5437
- if (columnDef.isScore !== true || typeof columnDef.passThreshold !== "number") {
5438
- warnings.push(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
5439
- return false;
5440
- }
5441
- }
5442
- return true;
5443
- }
5444
- function sanitizeChart(chart, columnsByKey, evalId, warnings) {
5445
- const metrics = chart.metrics.filter((metric) => {
5446
- if (metric.source === "builtin") return true;
5447
- return isValidColumnMetric(metric, columnsByKey, evalId, warnings);
5448
- });
5449
- if (metrics.length === 0) {
5450
- warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
5451
- return null;
5452
- }
5453
- const tooltipExtras = chart.tooltipExtras?.filter((extra) => {
5454
- if (extra.source === "builtin") return true;
5455
- return isValidTooltipExtra(extra, columnsByKey, evalId, warnings);
5456
- });
5457
- return {
5458
- ...chart,
5459
- metrics,
5460
- tooltipExtras: tooltipExtras?.length ? tooltipExtras : void 0
5461
- };
5462
- }
5463
- /**
5464
- * Validate and sanitize an authored `charts` config against the eval's
5465
- * declared columns. Drops metrics/extras that reference unknown columns or
5466
- * misuse `passThresholdRate`, and drops entire charts whose metrics are all
5467
- * invalid. Returns `charts: undefined` when nothing valid remains so the UI
5468
- * falls back to rendering no chart (matching the opt-in default).
5469
- */
5470
- function validateCharts(params) {
5471
- const { charts, columnDefs, evalId } = params;
5472
- if (!charts || charts.length === 0) return {
5473
- charts: void 0,
5474
- warnings: []
5475
- };
5476
- const columnsByKey = new Map(columnDefs.map((def) => [def.key, def]));
5477
- const warnings = [];
5478
- const sanitized = [];
5479
- for (const chart of charts) {
5480
- const result = sanitizeChart(chart, columnsByKey, evalId, warnings);
5481
- if (result) sanitized.push(result);
5482
- }
5483
- return {
5484
- charts: sanitized.length > 0 ? sanitized : void 0,
5485
- warnings
5486
- };
5487
- }
5488
- //#endregion
5489
5424
  //#region ../runner/src/columnBuilder.ts
5490
5425
  /**
5491
5426
  * Normalize a user-provided score definition (either a function or an
@@ -5991,112 +5926,6 @@ function addDefaultOutputs(params) {
5991
5926
  });
5992
5927
  }
5993
5928
  //#endregion
5994
- //#region ../runner/src/discovery.ts
5995
- const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
5996
- const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
5997
- /** Parse static eval metadata and discovery issues from one eval file. */
5998
- function parseEvalDiscovery(filePath, content) {
5999
- const metas = [];
6000
- let searchIndex = 0;
6001
- while (searchIndex < content.length) {
6002
- const defineEvalIndex = content.indexOf("defineEval", searchIndex);
6003
- if (defineEvalIndex === -1) break;
6004
- const extracted = extractDefineEvalObject(content, defineEvalIndex);
6005
- if (!extracted) {
6006
- searchIndex = defineEvalIndex + 10;
6007
- continue;
6008
- }
6009
- const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
6010
- if (id !== void 0) {
6011
- const result = {
6012
- filePath,
6013
- id
6014
- };
6015
- const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
6016
- if (title !== void 0) result.title = title;
6017
- metas.push(result);
6018
- }
6019
- searchIndex = extracted.nextIndex;
6020
- }
6021
- const countsById = /* @__PURE__ */ new Map();
6022
- for (const meta of metas) countsById.set(meta.id, (countsById.get(meta.id) ?? 0) + 1);
6023
- const duplicateIds = new Set([...countsById].filter(([, count]) => count > 1).map(([id]) => id));
6024
- const issues = [...duplicateIds].map((evalId) => ({
6025
- type: "duplicate-eval-id",
6026
- severity: "error",
6027
- filePath,
6028
- evalId,
6029
- message: `Duplicate eval id "${evalId}" in ${filePath}. Eval ids must be unique within one file.`
6030
- }));
6031
- return {
6032
- metas: metas.filter((meta) => !duplicateIds.has(meta.id)),
6033
- issues
6034
- };
6035
- }
6036
- function extractDefineEvalObject(content, defineEvalIndex) {
6037
- const openParenIndex = content.indexOf("(", defineEvalIndex);
6038
- if (openParenIndex === -1) return void 0;
6039
- const objectStartIndex = content.indexOf("{", openParenIndex);
6040
- if (objectStartIndex === -1) return void 0;
6041
- let depth = 0;
6042
- let quote;
6043
- let inBlockComment = false;
6044
- let inLineComment = false;
6045
- let isEscaped = false;
6046
- for (let index = objectStartIndex; index < content.length; index++) {
6047
- const currentChar = content[index];
6048
- const nextChar = content[index + 1];
6049
- if (inLineComment) {
6050
- if (currentChar === "\n") inLineComment = false;
6051
- continue;
6052
- }
6053
- if (inBlockComment) {
6054
- if (currentChar === "*" && nextChar === "/") {
6055
- inBlockComment = false;
6056
- index++;
6057
- }
6058
- continue;
6059
- }
6060
- if (quote) {
6061
- if (isEscaped) {
6062
- isEscaped = false;
6063
- continue;
6064
- }
6065
- if (currentChar === "\\") {
6066
- isEscaped = true;
6067
- continue;
6068
- }
6069
- if (currentChar === quote) quote = void 0;
6070
- continue;
6071
- }
6072
- if (currentChar === "/" && nextChar === "/") {
6073
- inLineComment = true;
6074
- index++;
6075
- continue;
6076
- }
6077
- if (currentChar === "/" && nextChar === "*") {
6078
- inBlockComment = true;
6079
- index++;
6080
- continue;
6081
- }
6082
- if (currentChar === "\"" || currentChar === "'" || currentChar === "`") {
6083
- quote = currentChar;
6084
- continue;
6085
- }
6086
- if (currentChar === "{") {
6087
- depth++;
6088
- continue;
6089
- }
6090
- if (currentChar === "}") {
6091
- depth--;
6092
- if (depth === 0) return {
6093
- nextIndex: index + 1,
6094
- objectText: content.slice(objectStartIndex, index + 1)
6095
- };
6096
- }
6097
- }
6098
- }
6099
- //#endregion
6100
5929
  //#region ../runner/src/evalModuleLoader.ts
6101
5930
  /**
6102
5931
  * Import one eval module with a cache key derived from its current source so
@@ -6121,6 +5950,7 @@ const agentPackageUrlBySpecifier = new Map([
6121
5950
  "@agent-evals/sdk",
6122
5951
  "@agent-evals/shared",
6123
5952
  "@agent-evals/runner",
5953
+ "@agent-evals/runner/case-child",
6124
5954
  "@agent-evals/runner/run-child"
6125
5955
  ].flatMap((specifier) => {
6126
5956
  try {
@@ -6129,6 +5959,7 @@ const agentPackageUrlBySpecifier = new Map([
6129
5959
  return [];
6130
5960
  }
6131
5961
  }));
5962
+ const agentPackageDirectoryPaths = [...new Set([...agentPackageUrlBySpecifier.values()].map((packageUrl) => dirname(fileURLToPath(packageUrl))))];
6132
5963
  function isAgentEvalsPackageSpecifier(specifier) {
6133
5964
  return specifier === "@ls-stack/agent-eval" || specifier === "@agent-evals/sdk" || specifier === "@agent-evals/shared" || specifier === "@agent-evals/runner" || specifier.startsWith("@ls-stack/agent-eval/") || specifier.startsWith("@agent-evals/sdk/") || specifier.startsWith("@agent-evals/shared/") || specifier.startsWith("@agent-evals/runner/");
6134
5965
  }
@@ -6142,9 +5973,17 @@ function isIsolatableFile(url, workspaceRoot) {
6142
5973
  return isIsolatableFilePath(fileURLToPath(url), workspaceRoot);
6143
5974
  }
6144
5975
  function isIsolatableFilePath(filePath, workspaceRoot) {
5976
+ if (isAgentEvalsPackageFilePath(filePath)) return false;
6145
5977
  const relativePath = relative(workspaceRoot, filePath);
6146
5978
  if (relativePath === "" || relativePath.startsWith("..") || isAbsolute(relativePath)) return false;
6147
- return !relativePath.split(pathSegmentSeparatorPattern).includes(".agent-evals");
5979
+ const segments = relativePath.split(pathSegmentSeparatorPattern);
5980
+ return !segments.includes(".agent-evals") && !segments.includes("node_modules");
5981
+ }
5982
+ function isAgentEvalsPackageFilePath(filePath) {
5983
+ return agentPackageDirectoryPaths.some((packageDirectoryPath) => {
5984
+ const packageRelativePath = relative(packageDirectoryPath, filePath);
5985
+ return packageRelativePath === "" || !packageRelativePath.startsWith("..") && !isAbsolute(packageRelativePath);
5986
+ });
6148
5987
  }
6149
5988
  function addIsolationParam(url, key) {
6150
5989
  const moduleUrl = new URL(url);
@@ -6176,6 +6015,9 @@ function registerModuleIsolationHooks() {
6176
6015
  };
6177
6016
  } });
6178
6017
  }
6018
+ function registerAgentEvalsPackageResolutionHooks() {
6019
+ registerModuleIsolationHooks();
6020
+ }
6179
6021
  function clearWorkspaceRequireCacheOnce(context) {
6180
6022
  if (clearedRequireCacheKeys.has(context.key)) return;
6181
6023
  clearedRequireCacheKeys.add(context.key);
@@ -6197,432 +6039,6 @@ async function runWithModuleIsolation(context, fn) {
6197
6039
  return await isolationStorage.run(context, fn);
6198
6040
  }
6199
6041
  //#endregion
6200
- //#region ../runner/src/evalRegistryLoader.ts
6201
- async function loadIsolatedEvalRegistry(params) {
6202
- return await runWithEvalRegistry(async (registry) => {
6203
- await runWithModuleIsolation(params.moduleIsolation, async () => {
6204
- await runInEvalRuntimeScope(params.runtimeScope, async () => {
6205
- await loadEvalModule(params.evalFilePath, params.sourceFingerprint);
6206
- });
6207
- });
6208
- return registry;
6209
- });
6210
- }
6211
- async function useIsolatedEvalDefinition(params) {
6212
- const entry = (await loadIsolatedEvalRegistry(params)).get(params.evalId);
6213
- if (entry === void 0) throw new Error(`Eval "${params.evalId}" was not registered after importing ${params.evalFilePath}`);
6214
- return await entry.use(async (evalDef) => {
6215
- return await params.use(evalDef);
6216
- });
6217
- }
6218
- //#endregion
6219
- //#region ../runner/src/freshness.ts
6220
- /**
6221
- * Derive eval freshness from the latest run, current eval-file fingerprint,
6222
- * current git commit, and an age threshold.
6223
- */
6224
- function deriveEvalFreshness(params) {
6225
- const { latestRun, gitState, currentEvalSourceFingerprint, staleAfterDays, now = /* @__PURE__ */ new Date() } = params;
6226
- const stale = latestRun?.evalSourceFingerprint !== void 0 && latestRun.evalSourceFingerprint !== null && currentEvalSourceFingerprint !== null && currentEvalSourceFingerprint !== latestRun.evalSourceFingerprint;
6227
- const latestRunCommitSha = latestRun?.commitSha;
6228
- if (latestRunCommitSha === void 0 || latestRunCommitSha === null) return {
6229
- freshnessStatus: stale ? "stale" : "fresh",
6230
- stale,
6231
- outdated: false
6232
- };
6233
- if (gitState.commitSha === null) return {
6234
- freshnessStatus: stale ? "stale" : "fresh",
6235
- stale,
6236
- outdated: false
6237
- };
6238
- if (latestRunCommitSha === gitState.commitSha) return {
6239
- freshnessStatus: stale ? "stale" : "fresh",
6240
- stale,
6241
- outdated: false
6242
- };
6243
- const latestRunStartedAt = new Date(latestRun?.startedAt ?? "").getTime();
6244
- if (!Number.isFinite(latestRunStartedAt)) return {
6245
- freshnessStatus: stale ? "stale" : "fresh",
6246
- stale,
6247
- outdated: false
6248
- };
6249
- const outdated = now.getTime() - latestRunStartedAt >= staleAfterDays * 24 * 60 * 60 * 1e3;
6250
- return {
6251
- freshnessStatus: stale ? "stale" : outdated ? "outdated" : "fresh",
6252
- stale,
6253
- outdated
6254
- };
6255
- }
6256
- /** Return the timestamp used when ordering and displaying a run recency. */
6257
- function getRunFreshnessTimestamp(manifest) {
6258
- return manifest.endedAt ?? manifest.startedAt;
6259
- }
6260
- //#endregion
6261
- //#region ../runner/src/manualInput/walker.ts
6262
- function isObject(value) {
6263
- return typeof value === "object" && value !== null;
6264
- }
6265
- function getZodDef(schema) {
6266
- if (!isObject(schema)) return null;
6267
- const zodHolder = schema._zod;
6268
- if (!isObject(zodHolder)) return null;
6269
- const def = zodHolder.def;
6270
- if (!isObject(def)) return null;
6271
- if (typeof def.type !== "string") return null;
6272
- return {
6273
- ...def,
6274
- type: def.type
6275
- };
6276
- }
6277
- function getDescription(schema) {
6278
- if (!isObject(schema)) return void 0;
6279
- const description = schema.description;
6280
- return typeof description === "string" ? description : void 0;
6281
- }
6282
- function getInnerSchema(def) {
6283
- return def.innerType;
6284
- }
6285
- function getChecks(def) {
6286
- const checks = def.checks;
6287
- if (!Array.isArray(checks)) return [];
6288
- const out = [];
6289
- for (const check of checks) {
6290
- if (!isObject(check)) continue;
6291
- const zodHolder = check._zod;
6292
- if (!isObject(zodHolder)) continue;
6293
- const checkDef = zodHolder.def;
6294
- if (!isObject(checkDef)) continue;
6295
- if (typeof checkDef.check !== "string") continue;
6296
- out.push({
6297
- ...checkDef,
6298
- check: checkDef.check
6299
- });
6300
- }
6301
- return out;
6302
- }
6303
- function findCheck(checks, name) {
6304
- return checks.find((check) => check.check === name);
6305
- }
6306
- function unwrap(schema) {
6307
- let current = schema;
6308
- let required = true;
6309
- let defaultValue = void 0;
6310
- for (let depth = 0; depth < 8; depth += 1) {
6311
- const def = getZodDef(current);
6312
- if (!def) return null;
6313
- if (def.type === "optional" || def.type === "nullable") {
6314
- required = false;
6315
- current = getInnerSchema(def);
6316
- continue;
6317
- }
6318
- if (def.type === "nullish") {
6319
- required = false;
6320
- current = getInnerSchema(def);
6321
- continue;
6322
- }
6323
- if (def.type === "default" || def.type === "prefault") {
6324
- const raw = def.defaultValue;
6325
- if (typeof raw === "function") defaultValue = Reflect.apply(raw, void 0, []);
6326
- else defaultValue = raw;
6327
- current = getInnerSchema(def);
6328
- continue;
6329
- }
6330
- if (def.type === "readonly" || def.type === "pipe") {
6331
- current = getInnerSchema(def) ?? def.in;
6332
- continue;
6333
- }
6334
- return {
6335
- schema: current,
6336
- def,
6337
- required,
6338
- defaultValue
6339
- };
6340
- }
6341
- return null;
6342
- }
6343
- function humaniseKey(key) {
6344
- const spaced = key.replace(/([a-z0-9])([A-Z])/g, "$1 $2").replace(/[_-]+/g, " ").trim();
6345
- if (!spaced) return key;
6346
- const lowered = spaced.toLowerCase();
6347
- return lowered.charAt(0).toUpperCase() + lowered.slice(1);
6348
- }
6349
- function normaliseSelectOptions(raw) {
6350
- if (!raw) return void 0;
6351
- return raw.map((entry) => {
6352
- if (typeof entry === "string") return {
6353
- value: entry,
6354
- label: entry
6355
- };
6356
- return {
6357
- value: entry.value,
6358
- label: entry.label ?? entry.value
6359
- };
6360
- });
6361
- }
6362
- function enumOptionsFromEntries(def) {
6363
- const entries = def.entries;
6364
- if (!isObject(entries)) return null;
6365
- const out = [];
6366
- for (const [label, value] of Object.entries(entries)) if (typeof value === "string") out.push({
6367
- value,
6368
- label
6369
- });
6370
- else if (typeof value === "number") out.push({
6371
- value: String(value),
6372
- label
6373
- });
6374
- else return null;
6375
- return out;
6376
- }
6377
- function literalUnionOptions(def) {
6378
- const options = def.options;
6379
- if (!Array.isArray(options)) return null;
6380
- const out = [];
6381
- for (const option of options) {
6382
- const optDef = getZodDef(option);
6383
- if (optDef?.type !== "literal") return null;
6384
- const values = optDef.values;
6385
- if (!Array.isArray(values) || values.length !== 1) return null;
6386
- const value = values[0];
6387
- if (typeof value === "string") out.push({
6388
- value,
6389
- label: value
6390
- });
6391
- else if (typeof value === "number") {
6392
- const stringValue = String(value);
6393
- out.push({
6394
- value: stringValue,
6395
- label: stringValue
6396
- });
6397
- } else return null;
6398
- }
6399
- return out.length > 0 ? out : null;
6400
- }
6401
- function literalSelectOptions(def) {
6402
- const values = def.values;
6403
- if (!Array.isArray(values)) return null;
6404
- const out = [];
6405
- for (const value of values) if (typeof value === "string") out.push({
6406
- value,
6407
- label: value
6408
- });
6409
- else if (typeof value === "number") {
6410
- const stringValue = String(value);
6411
- out.push({
6412
- value: stringValue,
6413
- label: stringValue
6414
- });
6415
- } else return null;
6416
- return out;
6417
- }
6418
- function readStringChecks(def) {
6419
- const checks = getChecks(def);
6420
- const out = {};
6421
- const min = findCheck(checks, "min_length");
6422
- if (min && typeof min.minimum === "number") out.minLength = min.minimum;
6423
- const max = findCheck(checks, "max_length");
6424
- if (max && typeof max.maximum === "number") out.maxLength = max.maximum;
6425
- return out;
6426
- }
6427
- const integerNumberFormats = new Set([
6428
- "int",
6429
- "safeint",
6430
- "int32",
6431
- "uint32",
6432
- "int64",
6433
- "uint64"
6434
- ]);
6435
- function readNumberChecks(def) {
6436
- const checks = getChecks(def);
6437
- const out = {};
6438
- const gt = findCheck(checks, "greater_than");
6439
- if (gt && typeof gt.value === "number" && gt.inclusive === true) out.min = gt.value;
6440
- const lt = findCheck(checks, "less_than");
6441
- if (lt && typeof lt.value === "number" && lt.inclusive === true) out.max = lt.value;
6442
- const format = findCheck(checks, "number_format");
6443
- if (format && typeof format.format === "string" && integerNumberFormats.has(format.format)) out.integer = true;
6444
- return out;
6445
- }
6446
- function buildField(key, fieldSchema, override) {
6447
- const unwrapped = unwrap(fieldSchema);
6448
- if (!unwrapped) return Result.err(/* @__PURE__ */ new Error(`manualInput: field "${key}" uses an unsupported Zod schema (could not introspect)`));
6449
- const inner = unwrapped.def;
6450
- const description = override?.description ?? getDescription(unwrapped.schema);
6451
- const base = {
6452
- key,
6453
- label: override?.label ?? humaniseKey(key),
6454
- description,
6455
- placeholder: override?.placeholder,
6456
- required: unwrapped.required,
6457
- defaultValue: override?.defaultValue !== void 0 ? override.defaultValue : unwrapped.defaultValue
6458
- };
6459
- if (override?.asJson === true) {
6460
- const rows = override.rows;
6461
- return Result.ok({
6462
- ...base,
6463
- kind: "json",
6464
- rows
6465
- });
6466
- }
6467
- if (override?.asFile === true) return Result.ok({
6468
- ...base,
6469
- kind: "file",
6470
- accept: override.accept,
6471
- maxSizeBytes: override.maxSizeBytes
6472
- });
6473
- const overrideOptions = normaliseSelectOptions(override?.options);
6474
- if (overrideOptions) return Result.ok({
6475
- ...base,
6476
- kind: "select",
6477
- options: overrideOptions
6478
- });
6479
- switch (inner.type) {
6480
- case "string": {
6481
- const checks = readStringChecks(inner);
6482
- if (override?.multiline === true) return Result.ok({
6483
- ...base,
6484
- kind: "multiline",
6485
- rows: override.rows,
6486
- minLength: checks.minLength,
6487
- maxLength: checks.maxLength
6488
- });
6489
- return Result.ok({
6490
- ...base,
6491
- kind: "text",
6492
- minLength: checks.minLength,
6493
- maxLength: checks.maxLength
6494
- });
6495
- }
6496
- case "number":
6497
- case "int":
6498
- case "bigint": {
6499
- const checks = readNumberChecks(inner);
6500
- return Result.ok({
6501
- ...base,
6502
- kind: "number",
6503
- min: checks.min,
6504
- max: checks.max,
6505
- integer: checks.integer
6506
- });
6507
- }
6508
- case "boolean": return Result.ok({
6509
- ...base,
6510
- kind: "boolean"
6511
- });
6512
- case "enum": {
6513
- const options = enumOptionsFromEntries(inner);
6514
- if (options) return Result.ok({
6515
- ...base,
6516
- kind: "select",
6517
- options
6518
- });
6519
- return Result.ok({
6520
- ...base,
6521
- kind: "json",
6522
- rows: override?.rows
6523
- });
6524
- }
6525
- case "literal": {
6526
- const options = literalSelectOptions(inner);
6527
- if (options && options.length > 0) return Result.ok({
6528
- ...base,
6529
- kind: "select",
6530
- options
6531
- });
6532
- return Result.ok({
6533
- ...base,
6534
- kind: "json",
6535
- rows: override?.rows
6536
- });
6537
- }
6538
- case "union": {
6539
- const options = literalUnionOptions(inner);
6540
- if (options) return Result.ok({
6541
- ...base,
6542
- kind: "select",
6543
- options
6544
- });
6545
- return Result.ok({
6546
- ...base,
6547
- kind: "json",
6548
- rows: override?.rows
6549
- });
6550
- }
6551
- default: return Result.ok({
6552
- ...base,
6553
- kind: "json",
6554
- rows: override?.rows
6555
- });
6556
- }
6557
- }
6558
- function getObjectShape(schema) {
6559
- const def = getZodDef(schema);
6560
- if (!def) return null;
6561
- if (def.type !== "object") return null;
6562
- const shape = def.shape;
6563
- if (!isObject(shape)) return null;
6564
- return shape;
6565
- }
6566
- /**
6567
- * Walk an eval's `manualInput` configuration and produce the wire-format
6568
- * descriptor consumed by the web UI. The schema must resolve to a top-level
6569
- * `z.object(...)`; nested objects, arrays, unions, and other unsupported
6570
- * shapes inside fields fall back to the JSON textarea widget.
6571
- *
6572
- * Returns a `Result` so the caller (eval discovery) can surface a discovery
6573
- * issue without throwing when the schema is incompatible.
6574
- */
6575
- function buildManualInputDescriptor(config) {
6576
- const shape = getObjectShape(config.schema);
6577
- if (!shape) return Result.err(/* @__PURE__ */ new Error("manualInput.schema must be a top-level z.object(...). Wrap nested types in an object schema."));
6578
- const overrides = {};
6579
- const rawOverrides = config.fields;
6580
- if (rawOverrides) {
6581
- for (const [key, override] of Object.entries(rawOverrides)) if (override) overrides[key] = override;
6582
- }
6583
- const fields = [];
6584
- for (const [key, fieldSchema] of Object.entries(shape)) {
6585
- const fieldResult = buildField(key, fieldSchema, overrides[key]);
6586
- if (fieldResult.error) return fieldResult.errorResult();
6587
- fields.push(fieldResult.value);
6588
- }
6589
- return Result.ok({
6590
- title: config.title,
6591
- description: config.description,
6592
- submitLabel: config.submitLabel,
6593
- fields
6594
- });
6595
- }
6596
- /**
6597
- * Resolve an eval's `manualInput` Zod schema against a raw user submission.
6598
- * Returns the parsed value typed against the eval's `TInput` generic, or a
6599
- * structured `Error` carrying the Zod issues for the caller to surface.
6600
- */
6601
- function parseManualInputValues(config, raw) {
6602
- const parsed = config.schema.safeParse(raw);
6603
- if (!parsed.success) return Result.err(new ManualInputValidationError(parsed.error.issues.map(formatIssue)));
6604
- return Result.ok(parsed.data);
6605
- }
6606
- /**
6607
- * Error thrown / returned when manual-input values fail validation against
6608
- * the eval's `manualInput.schema`. Carries the structured Zod issues so the
6609
- * CLI and HTTP layers can surface them per-field.
6610
- */
6611
- var ManualInputValidationError = class extends Error {
6612
- issues;
6613
- constructor(issues) {
6614
- super(issues.length === 0 ? "manualInput validation failed" : `manualInput validation failed: ${issues.map((issue) => issue.path ? `${issue.path}: ${issue.message}` : issue.message).join("; ")}`);
6615
- this.name = "ManualInputValidationError";
6616
- this.issues = issues;
6617
- }
6618
- };
6619
- function formatIssue(issue) {
6620
- return {
6621
- path: issue.path.map((segment) => typeof segment === "string" || typeof segment === "number" ? String(segment) : "").filter((segment) => segment !== "").join("."),
6622
- message: issue.message
6623
- };
6624
- }
6625
- //#endregion
6626
6042
  //#region ../runner/src/outputArtifacts.ts
6627
6043
  const mimeTypeExtensionMap = {
6628
6044
  "application/json": ".json",
@@ -6768,254 +6184,23 @@ function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
6768
6184
  };
6769
6185
  }
6770
6186
  //#endregion
6771
- //#region ../runner/src/runMaintenance.ts
6772
/**
 * Write a run's state into `runState.runDir`: `summary.json` and `run.json`
 * as pretty-printed JSON, then `cases.jsonl` with one case row per line.
 * The three writes happen sequentially, in that order.
 */
async function persistRunState(runState) {
  const writePrettyJson = (fileName, value) => writeFile(join(runState.runDir, fileName), JSON.stringify(value, null, 2));
  await writePrettyJson("summary.json", runState.summary);
  await writePrettyJson("run.json", runState.manifest);
  const serializedRows = [];
  for (const caseRow of runState.cases) serializedRows.push(JSON.stringify(caseRow));
  await writeFile(join(runState.runDir, "cases.jsonl"), serializedRows.join("\n"));
}
6778
/**
 * Derive the status a persisted case should have under the given score
 * thresholds.
 *
 * Precedence: a cancelled row stays "cancelled"; a detail carrying an error
 * yields "error"; any recorded assertion failure yields "fail"; a numeric
 * score below its `passThreshold` yields "fail" (the row's column wins over
 * the detail's column, non-numeric values are ignored). Otherwise a row that
 * was already "error" stays "error" and everything else becomes "pass".
 *
 * @param caseRow - Persisted row (`status`, `columns`).
 * @param caseDetail - Optional persisted detail for the same case.
 * @param scoreThresholds - Iterable of `[scoreKey, passThreshold]` pairs.
 */
function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
  if (caseRow.status === "cancelled") return "cancelled";
  if (caseDetail?.error != null) return "error";
  const assertionFailureCount = caseDetail?.assertionFailures.length ?? 0;
  if (assertionFailureCount > 0) return "fail";
  for (const [key, passThreshold] of scoreThresholds) {
    const value = caseRow.columns[key] ?? caseDetail?.columns[key];
    if (typeof value === "number" && value < passThreshold) return "fail";
  }
  if (caseRow.status === "error") return "error";
  return "pass";
}
6797
/**
 * Decide whether a run involves the eval identified by `params.evalKey`.
 *
 * True when any case row belongs to that eval. Otherwise an "all" target
 * counts only if the eval exists, an "evalIds" target counts only if its
 * `evalKeys` list includes the key, and every other target mode does not.
 */
function runTouchesEval(params) {
  const { caseRows, evalKey, target } = params;
  for (const caseRow of caseRows) {
    if (caseRow.evalKey === evalKey) return true;
  }
  switch (target.mode) {
    case "all":
      return params.evalExists;
    case "evalIds":
      return target.evalKeys?.includes(evalKey) ?? false;
    default:
      return false;
  }
}
6803
/**
 * Remove every run whose manifest is flagged `temporary: true`.
 *
 * A temporary run that is still "running" is first marked cancelled (manifest
 * and summary, including `endedAt` and `totalDurationMs`) and handed to
 * `params.cancelRunningRun`. Each temporary run is then dropped from
 * `params.runs` and its directory is removed recursively.
 *
 * @returns The number of runs deleted.
 */
async function deleteTemporaryRuns(params) {
  let deletedRuns = 0;
  // Iterate over a copy because entries are deleted from the map as we go.
  const entries = [...params.runs];
  for (const [runId, run] of entries) {
    const { manifest } = run;
    if (manifest.temporary !== true) continue;
    if (manifest.status === "running") {
      const endedAt = new Date();
      manifest.status = "cancelled";
      manifest.endedAt = endedAt.toISOString();
      run.summary.status = "cancelled";
      run.summary.totalDurationMs = endedAt.getTime() - new Date(manifest.startedAt).getTime();
      params.cancelRunningRun(run);
    }
    params.runs.delete(runId);
    await rm(run.runDir, { recursive: true, force: true });
    deletedRuns += 1;
  }
  return deletedRuns;
}
6824
/**
 * Re-evaluate the status of every case of one eval across the given runs.
 *
 * Runs that do not touch the eval, or that are still running, are skipped.
 * When a case's status changes, the row and its detail (if any) are updated
 * and the detail is persisted through `params.persistCaseDetail`. A run with
 * at least one change gets its summary counters recomputed and is persisted.
 *
 * @returns The number of runs that were updated.
 */
async function recomputeEvalStatusesInRuns(params) {
  const { evalKey, evalExists, scoreThresholds } = params;
  let updatedRuns = 0;
  for (const run of params.runs) {
    const touchesEval = runTouchesEval({
      target: run.manifest.target,
      caseRows: run.cases,
      evalKey,
      evalExists
    });
    if (!touchesEval || run.manifest.status === "running") continue;
    let changed = false;
    for (const caseRow of run.cases) {
      if (caseRow.evalKey !== evalKey) continue;
      const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
      const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds);
      if (nextStatus === caseRow.status) continue;
      caseRow.status = nextStatus;
      if (caseDetail) {
        caseDetail.status = nextStatus;
        await params.persistCaseDetail(run.runDir, caseDetail);
      }
      changed = true;
    }
    if (!changed) continue;
    const { totalCases, passedCases, failedCases, errorCases, cancelledCases } = deriveScopedSummaryFromCases({ caseRows: run.cases });
    run.summary.totalCases = totalCases;
    run.summary.passedCases = passedCases;
    run.summary.failedCases = failedCases;
    run.summary.errorCases = errorCases;
    run.summary.cancelledCases = cancelledCases;
    await persistRunState(run);
    updatedRuns += 1;
  }
  return updatedRuns;
}
6859
- //#endregion
6860
- //#region ../runner/src/runPersistence.ts
6861
- const SHORT_ID_PATTERN = /^r(\d+)$/;
6862
/**
 * Generate a filesystem-safe, sortable run id: a UTC timestamp of the form
 * `YYYY-MM-DDTHH-mm-ssZ`, an underscore, and a six-character base-36 suffix.
 *
 * The suffix is padded to exactly six characters. `Math.random().toString(36)`
 * can be shorter than eight characters (0.5 is "0.i", 0 is "0"), which
 * previously produced ids with a truncated or even empty suffix.
 */
function generateRunId() {
  const now = new Date();
  const pad = (n) => String(n).padStart(2, "0");
  const date = `${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}`;
  const time = `${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}`;
  const suffix = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `${date}T${time}Z_${suffix}`;
}
6871
/**
 * Extract the numeric part of a short id accepted by `SHORT_ID_PATTERN`.
 *
 * @returns The parsed number, or `null` when `shortId` is undefined, does not
 * match the pattern, or its digits do not convert to a finite number.
 */
function parseShortIdNum(shortId) {
  if (shortId === void 0) return null;
  const digits = SHORT_ID_PATTERN.exec(shortId)?.[1];
  if (digits === void 0) return null;
  const value = Number(digits);
  return Number.isFinite(value) ? value : null;
}
6879
/**
 * Compute the next short-id number: one more than the highest number found
 * among the snapshots' `manifest.shortId` values, or 0 when none parses.
 * Snapshots whose short id `parseShortIdNum` rejects are ignored.
 */
function nextShortIdFromSnapshots(snapshots) {
  let highest = -1;
  for (const { manifest } of snapshots) {
    const parsed = parseShortIdNum(manifest.shortId);
    if (parsed === null) continue;
    if (parsed > highest) highest = parsed;
  }
  return highest + 1;
}
6892
/**
 * Load every readable run snapshot under `<localStateDir>/runs`.
 *
 * Sub-directories are visited in sorted path order, one at a time. A missing
 * or unreadable `runs` directory yields an empty list, and directories whose
 * snapshot cannot be loaded are skipped.
 */
async function loadPersistedRunSnapshots(localStateDir) {
  const runsDir = join(localStateDir, "runs");
  const listing = await resultify(() => readdir(runsDir, { withFileTypes: true }));
  if (listing.error) return [];
  const runDirs = [];
  for (const entry of listing.value) {
    if (entry.isDirectory()) runDirs.push(join(runsDir, entry.name));
  }
  runDirs.sort();
  const snapshots = [];
  for (const runDir of runDirs) {
    const snapshot = await loadPersistedRunSnapshot(runDir);
    if (snapshot) snapshots.push(snapshot);
  }
  return snapshots;
}
6905
/**
 * Write a case detail as pretty-printed JSON to
 * `<runDir>/case-details/<encoded fileId>.json`.
 *
 * @param fileId - Identifier used for the file name; defaults to the detail's
 * `caseId`.
 */
async function persistCaseDetail(runDir, caseDetail, fileId = caseDetail.caseId) {
  const fileName = `${encodeCaseDetailFileName(fileId)}.json`;
  const targetPath = join(runDir, "case-details", fileName);
  await writeFile(targetPath, JSON.stringify(caseDetail, null, 2));
}
6908
/**
 * Project the latest run info of each eval down to its status.
 *
 * @returns A Map keyed like the result of `getLatestRunInfos`, holding only
 * each entry's `status`.
 */
function getLastRunStatuses(params) {
  const statuses = new Map();
  for (const [evalId, info] of getLatestRunInfos(params)) statuses.set(evalId, info.status);
  return statuses;
}
6912
/**
 * Build, for each eval key, the metadata of the most recent run touching it.
 *
 * Runs are processed from oldest to newest by their freshness timestamp, so
 * later runs overwrite earlier entries. Each entry carries the eval's status
 * in that run, the run's freshness timestamp, its commit SHA (or null) and
 * the eval's source fingerprint recorded in the manifest (or null).
 */
function getLatestRunInfos(params) {
  const { runs, knownEvals } = params;
  const knownEvalMetas = [...knownEvals];
  // Manual-score column keys per eval, used to detect runs awaiting scoring.
  const manualScoreKeysByEval = new Map();
  for (const evalMeta of knownEvalMetas) {
    const manualKeys = [];
    for (const columnDef of evalMeta.columnDefs) {
      if (columnDef.isManualScore === true) manualKeys.push(columnDef.key);
    }
    manualScoreKeysByEval.set(evalMeta.key, manualKeys);
  }
  const freshnessMs = (run) => new Date(getRunFreshnessTimestamp(run.manifest)).getTime();
  const orderedRuns = [...runs].sort((a, b) => freshnessMs(a) - freshnessMs(b));
  const latestRunInfos = new Map();
  for (const run of orderedRuns) {
    for (const evalKey of getRunEvalKeys(run, knownEvalMetas)) {
      latestRunInfos.set(evalKey, {
        status: getEvalStatusForRun(run, evalKey, manualScoreKeysByEval.get(evalKey) ?? []),
        startedAt: getRunFreshnessTimestamp(run.manifest),
        commitSha: run.manifest.commitSha ?? null,
        evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? null
      });
    }
  }
  return latestRunInfos;
}
6930
/** Map the "pending" status to `null`; every other status passes through. */
function toLastRunStatus$1(status) {
  if (status === "pending") return null;
  return status;
}
6933
/**
 * Load one persisted run from `runDir`.
 *
 * `run.json` and `summary.json` must both parse and validate, otherwise
 * `null` is returned. Case rows and case details are then read in that order.
 */
async function loadPersistedRunSnapshot(runDir) {
  const readValidated = (fileName, schema) => readParsedJsonFile(join(runDir, fileName), { safeParse: schema.safeParse.bind(schema) });
  const manifest = await readValidated("run.json", runManifestSchema);
  if (!manifest) return null;
  const summary = await readValidated("summary.json", runSummarySchema);
  if (!summary) return null;
  const cases = await readCaseRows(runDir);
  const caseDetails = await readCaseDetails(runDir);
  return { runDir, manifest, summary, cases, caseDetails };
}
6946
/**
 * Read a UTF-8 file, parse it as JSON and validate it with `schema.safeParse`.
 *
 * @returns The validated data, or `null` if reading, parsing or validation
 * fails.
 */
async function readParsedJsonFile(filePath, schema) {
  const contents = await resultify(() => readFile(filePath, "utf-8"));
  if (contents.error) return null;
  const decoded = resultify(() => JSON.parse(contents.value));
  if (decoded.error) return null;
  const validated = schema.safeParse(decoded.value);
  return validated.success ? validated.data : null;
}
6955
/**
 * Read `<runDir>/cases.jsonl` into validated case rows.
 *
 * Blank lines, lines that are not valid JSON and rows rejected by
 * `caseRowSchema` are skipped; an unreadable file yields an empty list.
 */
async function readCaseRows(runDir) {
  const contents = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
  if (contents.error) return [];
  return contents.value
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .filter((line) => line.length > 0)
    .flatMap((line) => {
      const decoded = resultify(() => JSON.parse(line));
      if (decoded.error) return [];
      const validated = caseRowSchema.safeParse(decoded.value);
      return validated.success ? [validated.data] : [];
    });
}
6970
/**
 * Read every `*.json` file in `<runDir>/case-details` into a Map keyed by the
 * detail's `caseKey`, falling back to its `caseId`.
 *
 * Files that fail to parse or validate are skipped; an unreadable directory
 * yields an empty Map.
 */
async function readCaseDetails(runDir) {
  const caseDetails = new Map();
  const detailsDir = join(runDir, "case-details");
  const listing = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
  if (listing.error) return caseDetails;
  for (const entry of listing.value) {
    const isJsonFile = entry.isFile() && entry.name.endsWith(".json");
    if (!isJsonFile) continue;
    const detail = await readParsedJsonFile(join(detailsDir, entry.name), { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) });
    if (detail) caseDetails.set(detail.caseKey ?? detail.caseId, detail);
  }
  return caseDetails;
}
6983
/**
 * List the eval keys a run relates to, without duplicates.
 *
 * Keys come from the run's case rows. An "evalIds" target additionally
 * contributes its `evalKeys`; an "all" target with no case rows falls back to
 * every known eval.
 */
function getRunEvalKeys(run, knownEvals) {
  const knownEvalMetas = [...knownEvals];
  const { target } = run.manifest;
  const evalKeys = new Set();
  run.cases.forEach((caseRow) => {
    if (caseRow.evalKey !== void 0) evalKeys.add(caseRow.evalKey);
  });
  if (target.mode === "evalIds") {
    (target.evalKeys ?? []).forEach((evalKey) => evalKeys.add(evalKey));
  } else if (target.mode === "all" && evalKeys.size === 0) {
    knownEvalMetas.forEach((evalMeta) => evalKeys.add(evalMeta.key));
  }
  return [...evalKeys];
}
6991
/**
 * Determine one eval's status within a run.
 *
 * With no case rows for the eval, the status is derived from the run's
 * lifecycle status alone. Otherwise the eval is "unscored" while any manual
 * score is pending, and else takes the status derived from its case rows.
 * A derived "pending" status is reported as `null`.
 */
function getEvalStatusForRun(run, evalKey, manualScoreKeys) {
  const evalCases = run.cases.filter((caseRow) => caseRow.evalKey === evalKey);
  if (evalCases.length === 0) {
    return toLastRunStatus$1(deriveStatusFromChildStatuses({
      statuses: [],
      lifecycleStatus: run.manifest.status
    }));
  }
  if (hasPendingManualScores(evalCases, manualScoreKeys)) return "unscored";
  return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: evalCases }));
}
7002
/**
 * Report whether any case row lacks a finite numeric value for at least one
 * of the given manual score keys. Always false when no keys are given.
 */
function hasPendingManualScores(caseRows, manualScoreKeys) {
  if (manualScoreKeys.length === 0) return false;
  for (const caseRow of caseRows) {
    for (const key of manualScoreKeys) {
      // Number.isFinite never coerces, so non-numbers count as pending too.
      if (!Number.isFinite(caseRow.columns[key])) return true;
    }
  }
  return false;
}
7009
/** Percent-encode a case id so it can be used as a single file-name segment. */
function encodeCaseDetailFileName(caseId) {
  const encodedName = encodeURIComponent(caseId);
  return encodedName;
}
7012
- //#endregion
7013
6187
  //#region ../runner/src/stackFormatting.ts
7014
6188
/**
 * Matches `[<params>m` fragments such as `[0m` or `[1;31m`: the shape of an
 * ANSI SGR sequence without its leading ESC byte.
 */
const orphanedAnsiSgrPattern = /\[(?:\d{1,3}(?:;\d{1,3})*)?m/g;
/**
 * Remove terminal control sequences from `value`, then remove any remaining
 * bracketed SGR fragments that `orphanedAnsiSgrPattern` matches.
 */
function stripTerminalControlCodes(value) {
  const withoutControlCodes = stripVTControlCharacters(value);
  return withoutControlCodes.replace(orphanedAnsiSgrPattern, "");
}
7018
6192
  //#endregion
6193
+ //#region ../runner/src/caseChildProtocol.ts
6194
/**
 * Check that `value` is a non-null object whose `type` is "start" and that
 * has a `context` property.
 */
function isCaseChildParentMessage(value) {
  if (value === null || typeof value !== "object") return false;
  if (!("type" in value) || value.type !== "start") return false;
  return "context" in value;
}
6197
/**
 * Check that `value` is one of the two message shapes accepted here: a "done"
 * message with a `result` property, or an "error" message with a `message`
 * property.
 */
function isCaseChildMessage(value) {
  if (value === null || typeof value !== "object" || !("type" in value)) return false;
  switch (value.type) {
    case "done":
      return "result" in value;
    case "error":
      return "message" in value;
    default:
      return false;
  }
}
6203
+ //#endregion
7019
6204
  //#region ../runner/src/runExecution.ts
7020
6205
  function filterEvalCases(cases, caseIds) {
7021
6206
  if (!caseIds || caseIds.length === 0) return cases;
@@ -7327,645 +6512,4 @@ function toAssertionFailure(message, error = void 0, nameOverride = void 0) {
7327
6512
  };
7328
6513
  }
7329
6514
  //#endregion
7330
- //#region ../runner/src/runQueue.ts
7331
/**
 * Run every queued case through a pool of async workers.
 *
 * Workers pull cases in queue order. After the first failure no further case
 * is started; cases already in flight are allowed to finish, and then the
 * first error is rethrown (non-Error throwables are wrapped in an `Error`).
 *
 * @param params.queuedCases - Cases exposing `execute` and `onComplete`.
 * @param params.concurrency - Desired number of workers. Values that are not
 * a number of at least 1 (0, negatives, fractions below 1, NaN, undefined)
 * fall back to a single worker; previously they produced zero workers and
 * every case was skipped without any error.
 * @param params.globalTraceDisplay - Forwarded to each case's `execute`.
 */
async function executeQueuedCases(params) {
  const { queuedCases, concurrency, globalTraceDisplay } = params;
  if (queuedCases.length === 0) return;
  const requested = typeof concurrency === "number" && !Number.isNaN(concurrency) ? Math.floor(concurrency) : 1;
  const workerCount = Math.max(1, Math.min(requested, queuedCases.length));
  let nextCaseIndex = 0;
  let workerError;
  const runWorker = async () => {
    while (workerError === void 0) {
      const queuedCase = queuedCases[nextCaseIndex];
      nextCaseIndex += 1;
      if (queuedCase === void 0) return;
      try {
        await executeQueuedCase({
          queuedCase,
          globalTraceDisplay
        });
      } catch (error) {
        // Workers never reject; the first failure is recorded and rethrown
        // once every worker has drained.
        workerError = error instanceof Error ? error : new Error(String(error));
        return;
      }
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  if (workerError !== void 0) throw workerError;
}
/**
 * Execute one queued case and pass its result to the case's `onComplete`.
 * `execute` receives the global trace display and the start timestamp.
 */
async function executeQueuedCase(params) {
  const { queuedCase, globalTraceDisplay } = params;
  const startTime = Date.now();
  const result = await queuedCase.execute({
    globalTraceDisplay,
    startTime
  });
  await queuedCase.onComplete(result);
}
7365
- //#endregion
7366
- //#region ../runner/src/tags.ts
7367
/**
 * Collect one message per tag that `validateEvalTagName` rejects.
 *
 * @param params.tags - Tags to validate; undefined/null is treated as empty.
 * @param params.source - Label placed at the start of each message.
 */
function getInvalidTagMessages(params) {
  const messages = [];
  for (const tag of params.tags ?? []) {
    const validation = validateEvalTagName(tag);
    if (!validation.ok) messages.push(`${params.source} tag "${tag}" is invalid: ${validation.message}`);
  }
  return messages;
}
7373
/**
 * Resolve effective eval-level tags and discovery issues for one eval.
 *
 * Effective tags are the config tags minus the eval's `removeTags`, followed
 * by the eval's own tags, deduplicated. Issues are reported for invalid tag
 * names (config, eval and removeTags) and for `removeTags` entries that are
 * not config tags.
 */
function resolveEvalTags(params) {
  const { evalDef, evalId, filePath } = params;
  const configTags = params.configTags ?? [];
  const removeTags = evalDef.removeTags ?? [];
  const sources = [
    { tags: configTags, source: "config" },
    { tags: evalDef.tags, source: "eval" },
    { tags: removeTags, source: "removeTags" }
  ];
  const messages = sources.flatMap((entry) => getInvalidTagMessages(entry));
  const configTagSet = new Set(configTags);
  for (const tag of removeTags) {
    if (configTagSet.has(tag)) continue;
    messages.push(`removeTags tag "${tag}" is not defined in AgentEvalsConfig.tags.`);
  }
  const removedTagSet = new Set(removeTags);
  const keptConfigTags = configTags.filter((tag) => !removedTagSet.has(tag));
  const tags = dedupeEvalTags([...keptConfigTags, ...evalDef.tags ?? []]);
  const issues = messages.map((message) => ({
    type: "invalid-tags",
    severity: "error",
    filePath,
    evalId,
    message: `Invalid tags for eval "${evalId}" in ${filePath}: ${message}`
  }));
  return { tags, issues };
}
7405
/**
 * Return the effective tags of a case: the eval's tags followed by the
 * case's own tags, deduplicated.
 *
 * @throws Error when any tag authored on the case is invalid.
 */
function resolveCaseTags(params) {
  const { evalCase, evalId, filePath, evalTags } = params;
  const messages = getInvalidTagMessages({
    tags: evalCase.tags,
    source: `case "${evalCase.id}"`
  });
  if (messages.length === 0) return dedupeEvalTags([...evalTags, ...evalCase.tags ?? []]);
  throw new Error(`Invalid tags for case "${evalCase.id}" in ${filePath}#${evalId}: ${messages.join("; ")}`);
}
7414
/**
 * Validate CLI/API tags filters.
 *
 * @returns The message for the first invalid filter, or `null` when every
 * filter is valid or no filters were given.
 */
function validateTagsFilters(filters) {
  if (filters == null) return null;
  for (const filter of filters) {
    const problem = validateTagsFilterExpression(filter);
    if (problem === null) continue;
    return `Invalid --tags-filter "${filter}": ${problem}`;
  }
  return null;
}
7422
/**
 * Filter cases by Vitest-style tag expressions.
 *
 * With no filters (undefined or empty) a shallow copy of `cases` is returned;
 * otherwise only cases whose tags satisfy `matchesTagsFilter` are kept.
 */
function filterEvalCasesByTags(cases, tagsFilter) {
  const hasFilters = tagsFilter !== void 0 && tagsFilter.length > 0;
  if (!hasFilters) return [...cases];
  const kept = [];
  for (const evalCase of cases) {
    if (matchesTagsFilter({ tags: evalCase.tags, filters: tagsFilter })) kept.push(evalCase);
  }
  return kept;
}
7430
/**
 * Return whether eval-level tags alone satisfy the run's tag filters, by
 * delegating to `matchesTagsFilter`.
 */
function evalTagsMatchFilter(params) {
  const { tags, tagsFilter: filters } = params;
  return matchesTagsFilter({ tags, filters });
}
7437
- //#endregion
7438
- //#region ../runner/src/targeting.ts
7439
/** Backslash-escape the regex metacharacters `| \ { } ( ) [ ] ^ $ + ? .`. */
function escapeRegex(value) {
  return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
}
/**
 * Compile a path glob into an anchored regular expression.
 *
 * Backslashes in the pattern are normalized to "/". Supported wildcards:
 * - `**` followed by "/" at the start of a path segment matches zero or more
 *   whole directories, so `src/**` + `/a.ts` matches both `src/a.ts` and
 *   `src/x/y/a.ts`. Previously at least one directory was required.
 * - any other `**` matches any run of characters, including "/";
 * - `*` matches any run of characters within one segment;
 * - `?` matches exactly one character other than "/".
 * Every other character is matched literally.
 */
function globToRegex(pattern) {
  const normalized = pattern.replace(/\\/g, "/");
  let regex = "^";
  let i = 0;
  while (i < normalized.length) {
    const char = normalized[i];
    if (char === "*" && normalized[i + 1] === "*") {
      const startsSegment = i === 0 || normalized[i - 1] === "/";
      if (startsSegment && normalized[i + 2] === "/") {
        // Consume the trailing "/" too so the directory part can be empty.
        regex += "(?:.*/)?";
        i += 3;
      } else {
        regex += ".*";
        i += 2;
      }
      continue;
    }
    if (char === "*") regex += "[^/]*";
    else if (char === "?") regex += "[^/]";
    else regex += escapeRegex(char);
    i += 1;
  }
  return new RegExp(`${regex}$`);
}
7458
/**
 * Test a file path against a pattern: after normalizing backslashes in the
 * pattern to "/", either the two are identical or the pattern matches the
 * path as a glob.
 */
function fileMatches(pattern, filePath) {
  const normalizedPattern = pattern.split("\\").join("/");
  return normalizedPattern === filePath || globToRegex(normalizedPattern).test(filePath);
}
7463
/**
 * True when no file patterns are given, or when the eval's `filePath`
 * matches at least one of them.
 */
function matchesFiles(evalMeta, files) {
  if (files === void 0 || files.length === 0) return true;
  for (const file of files) {
    if (fileMatches(file, evalMeta.filePath)) return true;
  }
  return false;
}
7467
/** True when no eval ids are given, or when the list contains `evalMeta.id`. */
function matchesEvalIds(evalMeta, evalIds) {
  const unrestricted = evalIds === void 0 || evalIds.length === 0;
  return unrestricted || evalIds.includes(evalMeta.id);
}
7471
/** True when no eval keys are given, or when the list contains `evalMeta.key`. */
function matchesEvalKeys(evalMeta, evalKeys) {
  const unrestricted = evalKeys === void 0 || evalKeys.length === 0;
  return unrestricted || evalKeys.includes(evalMeta.key);
}
7475
/**
 * Return the discovered evals selected by a run target: those passing the
 * target's `evalKeys`, `evalIds` and `files` filters, sorted by file path.
 */
function getTargetEvals(params) {
  const { evalKeys, evalIds, files } = params.request.target;
  const selected = [];
  for (const evalMeta of params.evals) {
    const isSelected = matchesEvalKeys(evalMeta, evalKeys) && matchesEvalIds(evalMeta, evalIds) && matchesFiles(evalMeta, files);
    if (isSelected) selected.push(evalMeta);
  }
  return selected.sort((a, b) => a.filePath.localeCompare(b.filePath));
}
7480
/**
 * Resolve which exact eval keys a run request can affect: the `key` of each
 * eval that `getTargetEvals` selects from `params.sortedEvals`.
 */
function getTargetEvalKeys(params) {
  const { sortedEvals: evals, request } = params;
  const keys = [];
  for (const evalMeta of getTargetEvals({ evals, request })) keys.push(evalMeta.key);
  return keys;
}
7487
- //#endregion
7488
- //#region ../runner/src/runOrchestration.ts
7489
/** Turn an empty fingerprint into `undefined`; non-empty ones pass through. */
function toOptionalSourceFingerprint(sourceFingerprint) {
  if (sourceFingerprint.length > 0) return sourceFingerprint;
  return void 0;
}
7492
/**
 * Build the module-isolation descriptor for one trial of one case. The key
 * is `<runId>:<evalKey>:<caseId>:trial-<trial>`.
 */
function buildCaseModuleIsolation(params) {
  const { runId, evalKey, caseId, trial, workspaceRoot } = params;
  const keyParts = [runId, evalKey, caseId, `trial-${String(trial)}`];
  return { key: keyParts.join(":"), workspaceRoot };
}
7503
/**
 * Build the module-isolation descriptor used while preparing an eval. The
 * key is `<runId>:<evalKey>:prepare`.
 */
function buildEvalPreparationModuleIsolation(params) {
  const { runId, evalKey, workspaceRoot } = params;
  const keyParts = [runId, evalKey, "prepare"];
  return { key: keyParts.join(":"), workspaceRoot };
}
7513
/**
 * Rank a case status from worst (0) to best (2): "error" is 0, "pass" is 2,
 * and every other status shares rank 1 with "fail".
 */
function statusRank(status) {
  if (status === "pass") return 2;
  if (status === "error") return 0;
  return 1;
}
/**
 * Return the smallest finite numeric value among the given score columns of
 * a case row, or `-Infinity` when none of them holds a finite number.
 */
function minScoreValue(caseRow, scoreKeys) {
  let min = Number.POSITIVE_INFINITY;
  for (const key of scoreKeys) {
    const v = caseRow.columns[key];
    if (typeof v === "number" && Number.isFinite(v) && v < min) min = v;
  }
  return Number.isFinite(min) ? min : Number.NEGATIVE_INFINITY;
}
/**
 * Comparator ordering trial results from worst to best: by status rank, then
 * by minimum score, then by trial number.
 *
 * The scores are compared before being subtracted. When neither trial has a
 * numeric score both minimums are `-Infinity`, and subtracting them gives
 * NaN; the comparator used to return that NaN, so the trial-number tiebreak
 * was never reached.
 */
function compareTrialResults(left, right, scoreKeys) {
  const statusDiff = statusRank(left.caseRow.status) - statusRank(right.caseRow.status);
  if (statusDiff !== 0) return statusDiff;
  const leftScore = minScoreValue(left.caseRow, scoreKeys);
  const rightScore = minScoreValue(right.caseRow, scoreKeys);
  // Unequal scores can include at most one -Infinity, so this is never NaN.
  if (leftScore !== rightScore) return leftScore - rightScore;
  return left.caseRow.trial - right.caseRow.trial;
}
7546
/**
 * Choose the trial attempt that represents a case.
 *
 * Attempts are ordered with `compareTrialResults`. The "lowestScore" strategy
 * takes the first attempt in that order; any other strategy takes the
 * lower-median attempt.
 *
 * @throws Error when `params.attempts` is empty.
 */
function pickWinningTrial(params) {
  const ranked = [...params.attempts].sort((left, right) => compareTrialResults(left, right, params.scoreKeys));
  const index = params.strategy === "lowestScore" ? 0 : Math.floor((ranked.length - 1) / 2);
  const winner = ranked[index];
  if (winner === void 0) throw new Error("Expected at least one trial attempt");
  return winner;
}
7557
/**
 * Render an arbitrary thrown value as text: an Error's stack (or its message
 * when there is no stack), a string as is, and anything else via `String()`.
 */
function formatUnknownErrorDetails(error) {
  if (!(error instanceof Error)) return typeof error === "string" ? error : String(error);
  return error.stack ?? error.message;
}
7562
/** Return, in sorted order, every case id that appears more than once. */
function findDuplicateCaseIds(cases) {
  const occurrences = new Map();
  for (const { id } of cases) occurrences.set(id, (occurrences.get(id) ?? 0) + 1);
  const duplicates = [];
  occurrences.forEach((count, id) => {
    if (count > 1) duplicates.push(id);
  });
  return duplicates.sort();
}
7567
/**
 * Throw a single Error whose message lists every issue message on its own
 * line; do nothing when there are no issues.
 */
function throwIfDiscoveryIssues(issues) {
  if (issues.length > 0) {
    const lines = issues.map(({ message }) => message);
    throw new Error(lines.join("\n"));
  }
}
7571
/**
 * Find case ids that are claimed by more than one eval.
 *
 * @returns One entry per ambiguous id, formatted as
 * `<caseId> (<filePath>#<evalId>, ...)`.
 */
function findAmbiguousTargetCaseIds(preparedEvals) {
  const ownersByCaseId = new Map();
  for (const { evalMeta, preparedCases } of preparedEvals) {
    for (const { caseId } of preparedCases) {
      let owners = ownersByCaseId.get(caseId);
      if (owners === void 0) {
        owners = new Set();
        ownersByCaseId.set(caseId, owners);
      }
      owners.add(`${evalMeta.filePath}#${evalMeta.id}`);
    }
  }
  const ambiguous = [];
  for (const [caseId, owners] of ownersByCaseId) {
    if (owners.size > 1) ambiguous.push(`${caseId} (${[...owners].join(", ")})`);
  }
  return ambiguous;
}
7580
/**
 * Combine per-eval errors into one message.
 *
 * Each entry becomes `[<evalId>] <first line, trimmed>`, followed on new
 * lines by the remaining detail lines (trimmed as a whole) when there are
 * any. Entries are joined with newlines.
 */
function buildRunErrorMessage(errors) {
  const blocks = [];
  for (const { evalId, details } of errors) {
    const lines = details.split("\n");
    const headline = lines[0]?.trim() ?? "Unknown error";
    const rest = lines.slice(1).join("\n").trim();
    blocks.push(rest.length === 0 ? `[${evalId}] ${headline}` : `[${evalId}] ${headline}\n${rest}`);
  }
  return blocks.join("\n");
}
7589
/**
 * Finalize a prepared case once, after at least one trial has a result.
 *
 * Picks the winning trial with the run's `trialSelection` strategy, commits
 * that trial's buffered cache store (if any), records the winning row and
 * detail on the run state, bumps the matching summary counter, writes the
 * trace and case-detail files, then notifies `onCaseFinished` and emits a
 * "case.finished" event. Does nothing if the case is already finalized or
 * has no trial results.
 */
async function finalizePreparedCase(params) {
  const { runState, runDir, preparedEval, preparedCase, onCaseFinished, emitEvent } = params;
  if (preparedCase.finalized || preparedCase.trialResults.length === 0) return;
  // Mark as finalized before the first await so this body runs only once.
  preparedCase.finalized = true;
  const winner = pickWinningTrial({
    strategy: runState.manifest.trialSelection,
    attempts: preparedCase.trialResults,
    scoreKeys: preparedEval.scoreKeys
  });
  const { caseRow, caseDetail, bufferedCacheStore } = winner;
  if (bufferedCacheStore !== null) await bufferedCacheStore.commit();
  // Resolve the artifact id before the row is added to runState.cases.
  const artifactFileId = getCaseArtifactFileId(runState, caseRow);
  runState.cases.push(caseRow);
  runState.caseDetails.set(getCaseRowCaseKey(caseRow), caseDetail);
  const { summary } = runState;
  switch (caseRow.status) {
    case "pass":
      summary.passedCases++;
      break;
    case "error":
      summary.errorCases++;
      break;
    default:
      summary.failedCases++;
  }
  const tracePath = join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`);
  await writeFile(tracePath, JSON.stringify(caseDetail.trace, null, 2));
  await persistCaseDetail(runDir, caseDetail, artifactFileId);
  onCaseFinished?.(caseDetail, caseRow);
  emitEvent(runState, {
    type: "case.finished",
    runId: runState.manifest.id,
    timestamp: new Date().toISOString(),
    payload: caseRow
  });
  preparedEval.evalCaseRows.push(caseRow);
}
7616
/**
 * Build the ordering key of a case row: its `evalKey` (or `evalId` when the
 * key is missing) and its `caseId`, separated by a NUL character.
 */
function getPreparedCaseOrderKey(caseRow) {
  const evalPart = caseRow.evalKey ?? caseRow.evalId;
  return `${evalPart}\u0000${caseRow.caseId}`;
}
7619
/**
 * Choose the file id for a case's artifacts: the plain `caseId`, unless the
 * run already holds a row with the same `caseId` but a different case key,
 * in which case the case key is used.
 */
function getCaseArtifactFileId(runState, caseRow) {
  const caseKey = getCaseRowCaseKey(caseRow);
  for (const existing of runState.cases) {
    if (existing.caseId === caseRow.caseId && getCaseRowCaseKey(existing) !== caseKey) return caseKey;
  }
  return caseRow.caseId;
}
7623
/**
 * Sort `caseRows` in place into the order in which their cases were
 * prepared. Rows without a matching prepared case sort last.
 */
function sortCaseRowsByPreparedOrder(caseRows, preparedEvals) {
  const orderByCase = new Map();
  let order = 0;
  for (const { evalMeta, preparedCases } of preparedEvals) {
    for (const { caseId } of preparedCases) {
      orderByCase.set(`${evalMeta.key}\u0000${caseId}`, order);
      order++;
    }
  }
  const rankOf = (caseRow) => orderByCase.get(getPreparedCaseOrderKey(caseRow)) ?? Number.MAX_SAFE_INTEGER;
  caseRows.sort((left, right) => rankOf(left) - rankOf(right));
}
7634
- async function executeRun({ runState, request, runDir, config, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
7635
- try {
7636
- const tagsFilterError = validateTagsFilters(request.target.tagsFilter);
7637
- if (tagsFilterError !== null) throw new Error(tagsFilterError);
7638
- const targetEvals = getTargetEvals(request);
7639
- emitEvent(runState, {
7640
- type: "run.started",
7641
- runId: runState.manifest.id,
7642
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7643
- payload: runState.manifest
7644
- });
7645
- const evalErrors = [];
7646
- const queuedCases = [];
7647
- const preparedEvals = [];
7648
- const cacheMode = runState.manifest.cacheMode ?? "use";
7649
- const cacheEnabled = config.cache?.enabled !== false;
7650
- const llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
7651
- const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
7652
- for (const evalMeta of targetEvals) {
7653
- const evalFilePath = evalMeta.sourceFilePath;
7654
- const evalModuleIsolation = buildEvalPreparationModuleIsolation({
7655
- runId: runState.manifest.id,
7656
- evalKey: evalMeta.key,
7657
- workspaceRoot
7658
- });
7659
- let sourceFingerprint = "";
7660
- try {
7661
- sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
7662
- } catch {
7663
- sourceFingerprint = "";
7664
- }
7665
- if (sourceFingerprint.length > 0) {
7666
- runState.manifest.evalSourceFingerprints[evalMeta.key] = sourceFingerprint;
7667
- evalMeta.sourceFingerprint = sourceFingerprint;
7668
- } else {
7669
- delete runState.manifest.evalSourceFingerprints[evalMeta.key];
7670
- evalMeta.sourceFingerprint = null;
7671
- }
7672
- try {
7673
- const entry = (await loadIsolatedEvalRegistry({
7674
- evalFilePath,
7675
- sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
7676
- moduleIsolation: evalModuleIsolation,
7677
- runtimeScope: "env"
7678
- })).get(evalMeta.id);
7679
- if (!entry) {
7680
- evalErrors.push({
7681
- evalId: evalMeta.id,
7682
- details: `Eval "${evalMeta.id}" was not registered after importing ${evalFilePath}`
7683
- });
7684
- continue;
7685
- }
7686
- await runWithModuleIsolation(evalModuleIsolation, async () => {
7687
- await runInEvalRuntimeScope("cases", async () => {
7688
- await entry.use(async (evalDef) => {
7689
- const evalTagsResult = resolveEvalTags({
7690
- configTags: config.tags,
7691
- evalDef,
7692
- evalId: evalMeta.id,
7693
- filePath: evalMeta.filePath
7694
- });
7695
- throwIfDiscoveryIssues(evalTagsResult.issues);
7696
- evalMeta.tags = evalTagsResult.tags;
7697
- if (evalDef.manualInput && evalDef.cases !== void 0) throw new Error(`Eval "${evalMeta.id}" cannot declare both "cases" and "manualInput". Remove one of them.`);
7698
- let manualInputCase = null;
7699
- if (evalDef.manualInput) {
7700
- const manualTags = evalTagsResult.tags;
7701
- if (!filterEvalCasesByTags([{
7702
- id: `${evalMeta.id}-manual`,
7703
- input: {},
7704
- tags: manualTags
7705
- }], request.target.tagsFilter).length) {
7706
- evalMeta.caseCount = 1;
7707
- evalMeta.caseIds = [`${evalMeta.id}-manual`];
7708
- return;
7709
- }
7710
- const rawValue = request.manualInputs?.[evalMeta.key];
7711
- if (rawValue === void 0) throw new Error(`Eval "${evalMeta.id}" requires manual input. Provide it via the run modal in the web UI or "--input" / "--input-file" on the CLI.`);
7712
- const parsed = parseManualInputValues(evalDef.manualInput, rawValue);
7713
- if (parsed.error) {
7714
- const formatted = parsed.error.issues.map((issue) => issue.path ? `${issue.path}: ${issue.message}` : issue.message).join("; ");
7715
- throw new Error(`Invalid manual input for eval "${evalMeta.id}": ${formatted}`);
7716
- }
7717
- manualInputCase = {
7718
- id: `${evalMeta.id}-manual`,
7719
- input: parsed.value,
7720
- tags: manualTags
7721
- };
7722
- }
7723
- const evalCases = manualInputCase ? [manualInputCase] : typeof evalDef.cases === "function" && !evalTagsMatchFilter({
7724
- tags: evalTagsResult.tags,
7725
- tagsFilter: request.target.tagsFilter
7726
- }) ? [] : await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime });
7727
- const runnableCases = (manualInputCase ? evalCases : resolveRunnableEvalCases({
7728
- cases: evalCases,
7729
- evalId: evalMeta.id
7730
- })).map((evalCase) => ({
7731
- ...evalCase,
7732
- tags: resolveCaseTags({
7733
- evalTags: evalTagsResult.tags,
7734
- evalCase,
7735
- evalId: evalMeta.id,
7736
- filePath: evalMeta.filePath
7737
- })
7738
- }));
7739
- const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
7740
- if (duplicateCaseIds.length > 0) throw new Error(`Duplicate case id${duplicateCaseIds.length === 1 ? "" : "s"} in ${evalMeta.filePath}#${evalMeta.id}: ${duplicateCaseIds.join(", ")}`);
7741
- const cases = filterEvalCasesByTags(filterEvalCases(runnableCases, request.target.caseIds), request.target.tagsFilter);
7742
- evalMeta.caseCount = runnableCases.length;
7743
- evalMeta.caseIds = runnableCases.map((evalCase) => evalCase.id);
7744
- runState.summary.totalCases += cases.length;
7745
- const defaultConfig = resolveEvalDefaultConfig({
7746
- evalDef,
7747
- globalColumns: config.columns,
7748
- globalStats: config.stats,
7749
- globalDefaultStatAggregate: config.defaultStatAggregate,
7750
- globalRemove: config.removeDefaultConfig
7751
- });
7752
- const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
7753
- const validatedCharts = validateCharts({
7754
- charts: defaultConfig.charts,
7755
- columnDefs: declaredColumnDefs,
7756
- evalId: evalMeta.id
7757
- });
7758
- for (const warning of validatedCharts.warnings) console.warn(warning);
7759
- evalMeta.columnDefs = declaredColumnDefs;
7760
- evalMeta.stats = defaultConfig.stats;
7761
- evalMeta.defaultStatAggregate = defaultConfig.defaultStatAggregate;
7762
- evalMeta.charts = validatedCharts.charts;
7763
- const evalCaseRows = [];
7764
- const preparedCases = [];
7765
- const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
7766
- const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
7767
- const preparedEval = {
7768
- evalMeta,
7769
- evalCaseRows,
7770
- preparedCases,
7771
- scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys])
7772
- };
7773
- preparedEvals.push(preparedEval);
7774
- for (const evalCase of cases) {
7775
- const trialResults = [];
7776
- const preparedCase = {
7777
- caseId: evalCase.id,
7778
- trialResults,
7779
- finalized: false
7780
- };
7781
- preparedCases.push(preparedCase);
7782
- for (let trial = 0; trial < request.trials; trial++) {
7783
- const bufferedCacheStore = cacheEnabled && cacheMode !== "bypass" ? createBufferedCacheStore(cacheStore) : null;
7784
- const caseModuleIsolation = buildCaseModuleIsolation({
7785
- runId: runState.manifest.id,
7786
- evalKey: evalMeta.key,
7787
- caseId: evalCase.id,
7788
- trial,
7789
- workspaceRoot
7790
- });
7791
- queuedCases.push({
7792
- execute: async ({ startTime, globalTraceDisplay }) => {
7793
- const { caseDetail, caseRowUpdate } = await useIsolatedEvalDefinition({
7794
- evalId: evalMeta.id,
7795
- evalFilePath,
7796
- sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
7797
- moduleIsolation: caseModuleIsolation,
7798
- runtimeScope: "env",
7799
- use: async (isolatedEvalDef) => await runCase({
7800
- evalDef: isolatedEvalDef,
7801
- evalId: evalMeta.id,
7802
- evalKey: evalMeta.key,
7803
- evalCase,
7804
- globalTraceDisplay,
7805
- globalColumns: config.columns,
7806
- globalDeriveFromTracing: config.deriveFromTracing,
7807
- llmCallsConfig,
7808
- apiCallsConfig,
7809
- globalRemoveDefaultConfig: config.removeDefaultConfig,
7810
- trial,
7811
- startTime,
7812
- cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
7813
- cacheMode,
7814
- moduleIsolation: caseModuleIsolation,
7815
- evalFilePath,
7816
- evalFileRelativePath: evalMeta.filePath,
7817
- workspaceRoot,
7818
- artifactDir: join(runDir, "artifacts"),
7819
- runId: runState.manifest.id
7820
- })
7821
- });
7822
- return {
7823
- caseDetail,
7824
- caseRow: {
7825
- caseId: evalCase.id,
7826
- evalId: evalMeta.id,
7827
- evalKey: evalMeta.key,
7828
- caseKey: caseDetail.caseKey,
7829
- tags: caseDetail.tags,
7830
- status: caseRowUpdate.status ?? "pending",
7831
- durationMs: caseRowUpdate.durationMs ?? null,
7832
- cacheHits: caseRowUpdate.cacheHits ?? 0,
7833
- cacheOperations: caseRowUpdate.cacheOperations ?? 0,
7834
- columns: caseRowUpdate.columns ?? {},
7835
- trial
7836
- }
7837
- };
7838
- },
7839
- onComplete: async ({ caseDetail, caseRow }) => {
7840
- trialResults.push({
7841
- caseDetail,
7842
- caseRow,
7843
- bufferedCacheStore
7844
- });
7845
- if (trialResults.length !== request.trials) return;
7846
- await finalizePreparedCase({
7847
- runState,
7848
- runDir,
7849
- preparedEval,
7850
- preparedCase,
7851
- onCaseFinished,
7852
- emitEvent
7853
- });
7854
- }
7855
- });
7856
- }
7857
- }
7858
- });
7859
- });
7860
- });
7861
- } catch (error) {
7862
- console.error(`Error running eval ${evalMeta.id}:`, error);
7863
- evalErrors.push({
7864
- evalId: evalMeta.id,
7865
- details: formatUnknownErrorDetails(error)
7866
- });
7867
- lastRunStatusMap.set(evalMeta.key, "error");
7868
- latestRunInfoMap.set(evalMeta.key, {
7869
- status: "error",
7870
- startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
7871
- commitSha: runState.manifest.commitSha ?? null,
7872
- evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.key] ?? null
7873
- });
7874
- }
7875
- }
7876
- const ambiguousCaseTargets = request.target.caseIds && request.target.caseIds.length > 0 ? findAmbiguousTargetCaseIds(preparedEvals) : [];
7877
- if (ambiguousCaseTargets.length > 0) {
7878
- queuedCases.length = 0;
7879
- evalErrors.push({
7880
- evalId: "target",
7881
- details: `Ambiguous --case target. Narrow it with --file and/or --eval: ${ambiguousCaseTargets.join("; ")}`
7882
- });
7883
- } else await executeQueuedCases({
7884
- queuedCases,
7885
- concurrency: getConfiguredConcurrency(),
7886
- globalTraceDisplay: config.traceDisplay
7887
- });
7888
- for (const preparedEval of preparedEvals) {
7889
- for (const preparedCase of preparedEval.preparedCases) await finalizePreparedCase({
7890
- runState,
7891
- runDir,
7892
- preparedEval,
7893
- preparedCase,
7894
- onCaseFinished,
7895
- emitEvent
7896
- });
7897
- lastRunStatusMap.set(preparedEval.evalMeta.key, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
7898
- const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.key) ?? null;
7899
- latestRunInfoMap.set(preparedEval.evalMeta.key, {
7900
- status: latestStatus,
7901
- startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
7902
- commitSha: runState.manifest.commitSha ?? null,
7903
- evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.key] ?? null
7904
- });
7905
- }
7906
- sortCaseRowsByPreparedOrder(runState.cases, preparedEvals);
7907
- for (const preparedEval of preparedEvals) sortCaseRowsByPreparedOrder(preparedEval.evalCaseRows, preparedEvals);
7908
- const endTime = /* @__PURE__ */ new Date();
7909
- runState.summary.totalDurationMs = endTime.getTime() - new Date(runState.manifest.startedAt).getTime();
7910
- const finalStatus = evalErrors.length > 0 ? "error" : "completed";
7911
- runState.summary.status = finalStatus;
7912
- runState.manifest.status = finalStatus;
7913
- const completedRunAt = endTime.toISOString();
7914
- runState.manifest.endedAt = completedRunAt;
7915
- runState.summary.errorMessage = evalErrors.length > 0 ? buildRunErrorMessage(evalErrors) : null;
7916
- for (const evalKey of getTargetEvalKeys({
7917
- request,
7918
- sortedEvals: getSortedEvalMetas()
7919
- })) {
7920
- const latestStatus = lastRunStatusMap.get(evalKey) ?? toLastRunStatus(deriveStatusFromCaseRows({
7921
- caseRows: [],
7922
- lifecycleStatus: runState.manifest.status
7923
- }));
7924
- latestRunInfoMap.set(evalKey, {
7925
- status: latestStatus,
7926
- startedAt: completedRunAt,
7927
- commitSha: runState.manifest.commitSha ?? null,
7928
- evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalKey] ?? null
7929
- });
7930
- }
7931
- await persistRunState(runState);
7932
- emitEvent(runState, {
7933
- type: "run.summary",
7934
- runId: runState.manifest.id,
7935
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7936
- payload: runState.summary
7937
- });
7938
- if (finalStatus === "error") emitEvent(runState, {
7939
- type: "run.error",
7940
- runId: runState.manifest.id,
7941
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7942
- payload: { message: buildRunErrorMessage(evalErrors) }
7943
- });
7944
- else emitEvent(runState, {
7945
- type: "run.finished",
7946
- runId: runState.manifest.id,
7947
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7948
- payload: runState.summary
7949
- });
7950
- emitDiscoveryEvent();
7951
- } catch (error) {
7952
- const message = formatUnknownErrorDetails(error);
7953
- runState.manifest.status = "error";
7954
- runState.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
7955
- runState.summary.status = "error";
7956
- runState.summary.errorMessage = message;
7957
- await persistRunState(runState);
7958
- emitEvent(runState, {
7959
- type: "run.error",
7960
- runId: runState.manifest.id,
7961
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
7962
- payload: { message }
7963
- });
7964
- emitDiscoveryEvent();
7965
- }
7966
- }
7967
- function toLastRunStatus(status) {
7968
- return status === "pending" ? null : status;
7969
- }
7970
- //#endregion
7971
- export { getCurrentScope as $, validateCharts as A, buildEvalKey as At, deserializeCacheValue as B, deriveEvalFreshness as C, getEvalDisplayStatus as Ct, loadConfig as D, runSummarySchema as Dt, resolveEvalDefaultConfig as E, runManifestSchema as Et, evalSpan as F, evalChartsConfigSchema as Ft, readManualInputFile as G, serializeCacheValue as H, evalTracer as I, columnDefSchema as It, appendToEvalOutput as J, evalExpect as K, hashCacheKey as L, defineEval as Lt, z$1 as M, evalStatAggregateSchema as Mt, buildTraceTree as N, evalStatsConfigSchema as Nt, buildDeclaredColumnDefs as O, resolveApiCallsConfig as Ot, captureEvalSpanError as P, manualInputDescriptorSchema as Pt, evalTime as Q, hashCacheKeySync as R, getEvalRegistry as Rt, parseManualInputValues as S, getEvalTitle as St, parseEvalDiscovery as T, matchesTagsFilter as Tt, repoFile as U, serializeCacheRecording as V, manualInputFileValueSchema as W, evalAssert as X, configureEvalRunLogs as Y, evalLog as Z, recomputePersistedCaseStatus as _, extractLlmCalls as _t, validateTagsFilters as a, nextEvalId as at, resolveArtifactPath as b, applyDerivedCallAttributes as bt, getLastRunStatuses as c, runInExistingEvalScope as ct, loadPersistedRunSnapshots as d, startEvalBackgroundJob as dt, getEvalCaseInput as et, nextShortIdFromSnapshots as f, createRunRequestSchema as ft, recomputeEvalStatusesInRuns as g, extractApiCalls as gt, persistRunState as h, extractCacheHits as ht, resolveEvalTags as i, mergeEvalOutput as it, createFsCacheStore as j, getCaseRowCaseKey as jt, normalizeScoreDef as k, resolveLlmCallsConfig as kt, getLatestRunInfos as l, setEvalOutput as lt, deleteTemporaryRuns as m, extractCacheEntries as mt, getTargetEvalKeys as n, isInEvalScope as nt, stripTerminalControlCodes as o, runInEvalRuntimeScope as ot, persistCaseDetail as p, updateManualScoreRequestSchema as pt, EvalAssertionError as q, getTargetEvals as r, matchesEvalTags as rt, generateRunId as s, runInEvalScope as st, executeRun as t, 
incrementEvalOutput as tt, loadPersistedRunSnapshot as u, setScopeCacheContext as ut, runTouchesEval as v, simulateLlmCallCost as vt, loadIsolatedEvalRegistry as w, deriveScopedSummaryFromCases as wt, buildManualInputDescriptor as x, getNestedAttribute as xt, resolveTracePresentation as y, simulateTokenAllocation as yt, deserializeCacheRecording as z };
6515
+ export { startEvalBackgroundJob as $, repoFile as A, evalChartsConfigSchema as At, getCurrentScope as B, evalTracer as C, buildEvalKey as Ct, deserializeCacheValue as D, evalStatAggregateSchema as Dt, deserializeCacheRecording as E, caseRowSchema as Et, appendToEvalOutput as F, mergeEvalOutput as G, incrementEvalOutput as H, configureEvalRunLogs as I, runInEvalScope as J, nextEvalId as K, evalAssert as L, readManualInputFile as M, defineEval as Mt, evalExpect as N, getEvalRegistry as Nt, serializeCacheRecording as O, evalStatsConfigSchema as Ot, EvalAssertionError as P, runWithEvalRegistry as Pt, setScopeCacheContext as Q, evalLog as R, evalSpan as S, resolveLlmCallsConfig as St, hashCacheKeySync as T, caseDetailSchema as Tt, isInEvalScope as U, getEvalCaseInput as V, matchesEvalTags as W, runWithEvalClock as X, runInExistingEvalScope as Y, setEvalOutput as Z, createBufferedCacheStore as _, validateEvalTagName as _t, isCaseChildParentMessage as a, extractLlmCalls as at, buildTraceTree as b, runSummarySchema as bt, resolveArtifactPath as c, applyDerivedCallAttributes as ct, loadEvalModule as d, getEvalDisplayStatus as dt, createRunRequestSchema as et, resolveEvalDefaultConfig as f, deriveScopedSummaryFromCases as ft, commitPendingCacheWrites as g, matchesTagsFilter as gt, normalizeScoreDef as h, dedupeEvalTags as ht, isCaseChildMessage as i, extractApiCalls as it, manualInputFileValueSchema as j, columnDefSchema as jt, serializeCacheValue as k, manualInputDescriptorSchema as kt, registerAgentEvalsPackageResolutionHooks as l, getNestedAttribute as lt, buildDeclaredColumnDefs as m, deriveStatusFromChildStatuses as mt, resolveRunnableEvalCases as n, extractCacheEntries as nt, stripTerminalControlCodes as o, simulateLlmCallCost as ot, loadConfig as p, deriveStatusFromCaseRows as pt, runInEvalRuntimeScope as q, runCase as r, extractCacheHits as rt, resolveTracePresentation as s, simulateTokenAllocation as st, filterEvalCases as t, updateManualScoreRequestSchema as tt, 
runWithModuleIsolation as u, getEvalTitle as ut, createFsCacheStore as v, validateTagsFilterExpression as vt, hashCacheKey as w, getCaseRowCaseKey as wt, captureEvalSpanError as x, resolveApiCallsConfig as xt, z$1 as y, runManifestSchema as yt, evalTime as z };