@ls-stack/agent-eval 0.54.0 → 0.55.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1596 @@
1
+ import { Et as caseRowSchema, Pt as runWithEvalRegistry, Tt as caseDetailSchema, X as runWithEvalClock, _t as validateEvalTagName, bt as runSummarySchema, d as loadEvalModule, f as resolveEvalDefaultConfig, ft as deriveScopedSummaryFromCases, g as commitPendingCacheWrites, gt as matchesTagsFilter, ht as dedupeEvalTags, i as isCaseChildMessage, m as buildDeclaredColumnDefs, mt as deriveStatusFromChildStatuses, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, pt as deriveStatusFromCaseRows, q as runInEvalRuntimeScope, t as filterEvalCases, u as runWithModuleIsolation, vt as validateTagsFilterExpression, wt as getCaseRowCaseKey, yt as runManifestSchema } from "./runExecution-Sw38bCaq.mjs";
2
+ import { readFile, readdir, rm, writeFile } from "node:fs/promises";
3
+ import { dirname, join } from "node:path";
4
+ import { existsSync } from "node:fs";
5
+ import { Result, resultify } from "t-result";
6
+ import { fileURLToPath } from "node:url";
7
+ import { spawn } from "node:child_process";
8
+ //#region ../runner/src/chartValidation.ts
9
/**
 * Check one column-sourced chart metric against the declared columns.
 *
 * Pushes a human-readable warning onto `warnings` and returns `false` when
 * the metric references an unknown column, or asks for `passThresholdRate`
 * on a column that is not a score with a numeric `passThreshold`.
 */
function isValidColumnMetric(metric, columnsByKey, evalId, warnings) {
  const column = columnsByKey.get(metric.key);
  if (!column) {
    warnings.push(`[${evalId}] chart metric references unknown column "${metric.key}" — dropped`);
    return false;
  }
  // Only the threshold-rate aggregate places extra demands on the column.
  if (metric.aggregate !== "passThresholdRate") return true;
  const isThresholdScore = column.isScore === true && typeof column.passThreshold === "number";
  if (isThresholdScore) return true;
  warnings.push(`[${evalId}] chart metric "${metric.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
  return false;
}
23
/**
 * Check one column-sourced tooltip extra against the declared columns.
 *
 * Pushes a warning onto `warnings` and returns `false` when the extra
 * references an unknown column, or asks for `passThresholdRate` on a column
 * that is not a score with a numeric `passThreshold`.
 */
function isValidTooltipExtra(extra, columnsByKey, evalId, warnings) {
  const column = columnsByKey.get(extra.key);
  if (!column) {
    warnings.push(`[${evalId}] chart tooltip extra references unknown column "${extra.key}" — dropped`);
    return false;
  }
  // Only the threshold-rate aggregate places extra demands on the column.
  if (extra.aggregate !== "passThresholdRate") return true;
  const isThresholdScore = column.isScore === true && typeof column.passThreshold === "number";
  if (isThresholdScore) return true;
  warnings.push(`[${evalId}] chart tooltip extra "${extra.key}" uses "passThresholdRate" but the column is not a score with passThreshold — dropped`);
  return false;
}
37
/**
 * Strip invalid metrics and tooltip extras from a single chart.
 *
 * Entries whose `source` is "builtin" are always kept; all others are
 * validated against `columnsByKey`. Returns `null` (after recording a
 * warning) when no metric survives, otherwise a shallow copy of the chart
 * with the filtered lists. `tooltipExtras` becomes `undefined` when absent
 * or empty after filtering.
 */
function sanitizeChart(chart, columnsByKey, evalId, warnings) {
  const keepMetric = (metric) => metric.source === "builtin" || isValidColumnMetric(metric, columnsByKey, evalId, warnings);
  const keepExtra = (extra) => extra.source === "builtin" || isValidTooltipExtra(extra, columnsByKey, evalId, warnings);
  const metrics = chart.metrics.filter(keepMetric);
  if (metrics.length === 0) {
    warnings.push(`[${evalId}] chart had no valid metrics after validation — chart dropped`);
    return null;
  }
  const keptExtras = chart.tooltipExtras?.filter(keepExtra);
  const hasExtras = Boolean(keptExtras?.length);
  return {
    ...chart,
    metrics,
    tooltipExtras: hasExtras ? keptExtras : void 0
  };
}
56
/**
 * Validate an authored `charts` config against the eval's declared columns.
 *
 * Each chart is passed through `sanitizeChart`, which removes metrics and
 * tooltip extras that reference unknown columns or misuse
 * `passThresholdRate`; charts left without any metric are dropped entirely.
 *
 * @param params `{ charts, columnDefs, evalId }`.
 * @returns `{ charts, warnings }` where `charts` is `undefined` when the
 *   input was missing/empty or nothing valid remained, and `warnings` lists
 *   every dropped item.
 */
function validateCharts(params) {
  const { charts, columnDefs, evalId } = params;
  const warnings = [];
  if (!charts || charts.length === 0) return {
    charts: void 0,
    warnings
  };
  const columnsByKey = new Map(columnDefs.map((def) => [def.key, def]));
  const sanitized = [];
  for (const chart of charts) {
    const cleaned = sanitizeChart(chart, columnsByKey, evalId, warnings);
    if (cleaned) sanitized.push(cleaned);
  }
  return {
    charts: sanitized.length === 0 ? void 0 : sanitized,
    warnings
  };
}
81
+ //#endregion
82
+ //#region ../runner/src/discovery.ts
83
/** Captures the quoted value of the first `id:` property found in a text (single or double quotes). */
const evalIdMatchRegex = /\bid\s*:\s*['"]([^'"]+)['"]/;
/** Captures the quoted value of the first `title:` property found in a text (single or double quotes). */
const evalTitleMatchRegex = /\btitle\s*:\s*['"]([^'"]+)['"]/;
85
/**
 * Statically scan one eval file for `defineEval(...)` calls.
 *
 * For every call whose object literal can be extracted, the first quoted
 * `id` (and optional `title`) is read with a regex; calls without an `id`
 * are ignored. Ids appearing more than once in the file produce a
 * `duplicate-eval-id` issue and all metas with that id are removed.
 *
 * @returns `{ metas, issues }`.
 */
function parseEvalDiscovery(filePath, content) {
  const marker = "defineEval";
  const found = [];
  let cursor = 0;
  while (cursor < content.length) {
    const markerAt = content.indexOf(marker, cursor);
    if (markerAt === -1) break;
    const extracted = extractDefineEvalObject(content, markerAt);
    if (!extracted) {
      // Could not isolate an object literal: resume just past this occurrence.
      cursor = markerAt + marker.length;
      continue;
    }
    cursor = extracted.nextIndex;
    const id = evalIdMatchRegex.exec(extracted.objectText)?.[1];
    if (id === void 0) continue;
    const meta = {
      filePath,
      id
    };
    const title = evalTitleMatchRegex.exec(extracted.objectText)?.[1];
    if (title !== void 0) meta.title = title;
    found.push(meta);
  }
  // Count occurrences first so duplicates are reported in first-seen order.
  const occurrences = /* @__PURE__ */ new Map();
  for (const { id } of found) occurrences.set(id, (occurrences.get(id) ?? 0) + 1);
  const duplicateIds = /* @__PURE__ */ new Set();
  for (const [id, count] of occurrences) if (count > 1) duplicateIds.add(id);
  const issues = [];
  for (const evalId of duplicateIds) issues.push({
    type: "duplicate-eval-id",
    severity: "error",
    filePath,
    evalId,
    message: `Duplicate eval id "${evalId}" in ${filePath}. Eval ids must be unique within one file.`
  });
  return {
    metas: found.filter((meta) => !duplicateIds.has(meta.id)),
    issues
  };
}
124
/**
 * Extract the text of the first `{ ... }` object that follows the first `(`
 * at or after `defineEvalIndex`.
 *
 * The scan balances braces while skipping over line comments, block
 * comments, and quoted strings (single, double, or backtick quotes, with
 * backslash escapes). Braces inside a backtick string, including those of
 * `${...}` placeholders, are treated as string content.
 *
 * @returns `{ nextIndex, objectText }` where `nextIndex` is the position
 *   just after the closing brace, or `undefined` when no `(`/`{` is found or
 *   the braces never balance.
 */
function extractDefineEvalObject(content, defineEvalIndex) {
  const parenAt = content.indexOf("(", defineEvalIndex);
  if (parenAt < 0) return void 0;
  const braceAt = content.indexOf("{", parenAt);
  if (braceAt < 0) return void 0;
  let openBraces = 0;
  // One of: "code", "lineComment", "blockComment", "string".
  let mode = "code";
  let closingQuote = "";
  let position = braceAt;
  while (position < content.length) {
    const char = content[position];
    const following = content[position + 1];
    switch (mode) {
      case "lineComment":
        if (char === "\n") mode = "code";
        position += 1;
        break;
      case "blockComment":
        if (char === "*" && following === "/") {
          mode = "code";
          position += 2;
        } else position += 1;
        break;
      case "string":
        if (char === "\\") {
          // An escape consumes the backslash and the character after it.
          position += 2;
        } else {
          if (char === closingQuote) mode = "code";
          position += 1;
        }
        break;
      default:
        if (char === "/" && following === "/") {
          mode = "lineComment";
          position += 2;
        } else if (char === "/" && following === "*") {
          mode = "blockComment";
          position += 2;
        } else if (char === "\"" || char === "'" || char === "`") {
          mode = "string";
          closingQuote = char;
          position += 1;
        } else if (char === "{") {
          openBraces += 1;
          position += 1;
        } else if (char === "}") {
          openBraces -= 1;
          if (openBraces === 0) return {
            nextIndex: position + 1,
            objectText: content.slice(braceAt, position + 1)
          };
          position += 1;
        } else position += 1;
        break;
    }
  }
  return void 0;
}
187
+ //#endregion
188
+ //#region ../runner/src/evalRegistryLoader.ts
189
/**
 * Load one eval module inside a fresh eval registry and return that registry.
 *
 * The import is nested in three scopes, outermost first: `runWithEvalRegistry`,
 * `runWithModuleIsolation(params.moduleIsolation, ...)`, and
 * `runInEvalRuntimeScope(params.runtimeScope, ...)`.
 */
async function loadIsolatedEvalRegistry(params) {
  const importModule = async () => {
    await loadEvalModule(params.evalFilePath, params.sourceFingerprint);
  };
  const importInRuntimeScope = async () => {
    await runInEvalRuntimeScope(params.runtimeScope, importModule);
  };
  return await runWithEvalRegistry(async (registry) => {
    await runWithModuleIsolation(params.moduleIsolation, importInRuntimeScope);
    return registry;
  });
}
199
+ //#endregion
200
+ //#region ../runner/src/freshness.ts
201
/**
 * Derive an eval's freshness from its latest run.
 *
 * - `stale`: the latest run recorded an eval-source fingerprint, a current
 *   fingerprint is available (not `null`), and the two differ.
 * - `outdated`: the latest run was made on a different commit than the
 *   current one and started at least `staleAfterDays` days before `now`.
 *   It is `false` whenever either commit SHA is missing, the commits match,
 *   or the run's `startedAt` cannot be parsed as a date.
 *
 * `freshnessStatus` is "stale" when stale, else "outdated" when outdated,
 * else "fresh". `now` defaults to the current time.
 */
function deriveEvalFreshness(params) {
  const { latestRun, gitState, currentEvalSourceFingerprint, staleAfterDays, now = /* @__PURE__ */ new Date() } = params;
  const recordedFingerprint = latestRun?.evalSourceFingerprint;
  const stale = recordedFingerprint !== void 0 && recordedFingerprint !== null && currentEvalSourceFingerprint !== null && currentEvalSourceFingerprint !== recordedFingerprint;
  // Shared result for every path on which age cannot mark the run outdated.
  const withoutAgeCheck = () => ({
    freshnessStatus: stale ? "stale" : "fresh",
    stale,
    outdated: false
  });
  const runCommitSha = latestRun?.commitSha;
  if (runCommitSha === void 0 || runCommitSha === null) return withoutAgeCheck();
  if (gitState.commitSha === null || gitState.commitSha === runCommitSha) return withoutAgeCheck();
  const runStartedMs = new Date(latestRun?.startedAt ?? "").getTime();
  if (!Number.isFinite(runStartedMs)) return withoutAgeCheck();
  const maxAgeMs = staleAfterDays * 24 * 60 * 60 * 1e3;
  const outdated = now.getTime() - runStartedMs >= maxAgeMs;
  let freshnessStatus = "fresh";
  if (stale) freshnessStatus = "stale";
  else if (outdated) freshnessStatus = "outdated";
  return {
    freshnessStatus,
    stale,
    outdated
  };
}
237
/**
 * Pick the timestamp used to order and display a run's recency: `endedAt`
 * when it is set (neither `null` nor `undefined`), otherwise `startedAt`.
 */
function getRunFreshnessTimestamp(manifest) {
  const { endedAt } = manifest;
  if (endedAt === null || endedAt === void 0) return manifest.startedAt;
  return endedAt;
}
241
+ //#endregion
242
+ //#region ../runner/src/manualInput/walker.ts
243
/** True for any non-null value whose `typeof` is "object" (arrays included, functions excluded). */
function isObject(value) {
  if (value === null) return false;
  return typeof value === "object";
}
246
/**
 * Read `schema._zod.def` defensively.
 *
 * @returns a shallow copy of the def when `schema`, `schema._zod`, and the
 *   def are all objects and `def.type` is a string; otherwise `null`.
 */
function getZodDef(schema) {
  const holder = isObject(schema) ? schema._zod : void 0;
  const def = isObject(holder) ? holder.def : void 0;
  if (!isObject(def) || typeof def.type !== "string") return null;
  return {
    ...def,
    type: def.type
  };
}
258
/** Return `schema.description` when the schema is an object and the value is a string, else `undefined`. */
function getDescription(schema) {
  const description = isObject(schema) ? schema.description : void 0;
  if (typeof description === "string") return description;
  return void 0;
}
263
/** Return the schema stored under a def's `innerType` property (`undefined` when absent). */
function getInnerSchema(def) {
  const { innerType } = def;
  return innerType;
}
266
/**
 * Collect the defs of a schema def's `checks`.
 *
 * Each entry of `def.checks` contributes a shallow copy of its `_zod.def`
 * when that def is an object with a string `check` name; anything else is
 * skipped. Returns `[]` when `def.checks` is not an array.
 */
function getChecks(def) {
  const rawChecks = def.checks;
  if (!Array.isArray(rawChecks)) return [];
  const collected = [];
  for (const candidate of rawChecks) {
    const holder = isObject(candidate) ? candidate._zod : void 0;
    const checkDef = isObject(holder) ? holder.def : void 0;
    if (!isObject(checkDef) || typeof checkDef.check !== "string") continue;
    collected.push({
      ...checkDef,
      check: checkDef.check
    });
  }
  return collected;
}
284
/** Return the first check def whose `check` name equals `name`, or `undefined`. */
function findCheck(checks, name) {
  for (const candidate of checks) {
    if (candidate.check === name) return candidate;
  }
  return void 0;
}
287
/**
 * Peel wrapper schemas off a field schema until a concrete type is reached.
 *
 * - `optional` / `nullable` / `nullish` wrappers mark the field not required.
 * - `default` / `prefault` wrappers record the default value (calling it
 *   with no arguments when it is a function).
 * - `readonly` / `pipe` wrappers continue with `innerType`, or `in` when
 *   `innerType` is nullish.
 *
 * @returns `{ schema, def, required, defaultValue }` for the innermost
 *   schema, or `null` when a def cannot be read or more than eight wrapper
 *   layers are encountered.
 */
function unwrap(schema) {
  const maxLayers = 8;
  let current = schema;
  let required = true;
  let defaultValue = void 0;
  for (let layer = 0; layer < maxLayers; layer += 1) {
    const def = getZodDef(current);
    if (!def) return null;
    switch (def.type) {
      case "optional":
      case "nullable":
      case "nullish":
        required = false;
        current = getInnerSchema(def);
        break;
      case "default":
      case "prefault": {
        const raw = def.defaultValue;
        defaultValue = typeof raw === "function" ? Reflect.apply(raw, void 0, []) : raw;
        current = getInnerSchema(def);
        break;
      }
      case "readonly":
      case "pipe":
        current = getInnerSchema(def) ?? def.in;
        break;
      default:
        return {
          schema: current,
          def,
          required,
          defaultValue
        };
    }
  }
  return null;
}
324
/**
 * Turn a field key into a sentence-case label: a space is inserted at each
 * lower-case/digit to upper-case boundary, runs of `_`/`-` become one space,
 * and the result is lower-cased with its first character upper-cased.
 * Returns the key unchanged when nothing remains after trimming.
 */
function humaniseKey(key) {
  const words = key.replace(/([a-z0-9])([A-Z])/g, "$1 $2").replace(/[_-]+/g, " ").trim();
  if (words.length === 0) return key;
  const lower = words.toLowerCase();
  const head = lower.charAt(0);
  const tail = lower.slice(1);
  return `${head.toUpperCase()}${tail}`;
}
330
/**
 * Normalise authored select options to `{ value, label }` pairs.
 *
 * String entries use the string for both fields; object entries fall back
 * to `value` when `label` is nullish. Returns `undefined` for a falsy input.
 */
function normaliseSelectOptions(raw) {
  if (!raw) return void 0;
  const toOption = (entry) => {
    const isPlain = typeof entry === "string";
    const value = isPlain ? entry : entry.value;
    const label = isPlain ? entry : entry.label ?? entry.value;
    return {
      value,
      label
    };
  };
  return raw.map((entry) => toOption(entry));
}
343
/**
 * Build select options from an enum def's `entries` object.
 *
 * Each entry becomes `{ value, label }` with the entry key as the label and
 * the (stringified) entry value as the value.
 *
 * TypeScript numeric enums compile to objects that also contain reverse
 * mappings (`{ 0: "A", A: 0 }`). Those reverse-mapping keys are not enum
 * members, so they are skipped; otherwise every numeric member would show
 * up twice, once with its name as a bogus value.
 *
 * @returns the options, or `null` when `entries` is not an object or any
 *   member value is neither a string nor a number.
 */
function enumOptionsFromEntries(def) {
  const entries = def.entries;
  if (typeof entries !== "object" || entries === null) return null;
  const pairs = Object.entries(entries);
  const numericValues = /* @__PURE__ */ new Set();
  for (const [, value] of pairs) if (typeof value === "number") numericValues.add(value);
  const out = [];
  for (const [label, value] of pairs) {
    // A key that parses to one of the enum's numeric values is a reverse mapping.
    const keyAsNumber = Number(label);
    if (!Number.isNaN(keyAsNumber) && numericValues.has(keyAsNumber)) continue;
    if (typeof value === "string") out.push({
      value,
      label
    });
    else if (typeof value === "number") out.push({
      value: String(value),
      label
    });
    else return null;
  }
  return out;
}
358
/**
 * Build select options from a union def whose members are all
 * single-value string/number literals.
 *
 * @returns one `{ value, label }` per member (numbers stringified), or
 *   `null` when `def.options` is not an array, is empty, or any member is
 *   not a literal holding exactly one string or number.
 */
function literalUnionOptions(def) {
  const members = def.options;
  if (!Array.isArray(members)) return null;
  const out = [];
  for (const member of members) {
    const memberDef = getZodDef(member);
    if (memberDef?.type !== "literal") return null;
    const literals = memberDef.values;
    if (!Array.isArray(literals) || literals.length !== 1) return null;
    const [literal] = literals;
    if (typeof literal !== "string" && typeof literal !== "number") return null;
    const text = typeof literal === "string" ? literal : String(literal);
    out.push({
      value: text,
      label: text
    });
  }
  if (out.length === 0) return null;
  return out;
}
382
/**
 * Build select options from a literal def's `values`.
 *
 * @returns one `{ value, label }` per value (numbers stringified; possibly
 *   an empty array), or `null` when `values` is not an array or contains
 *   anything other than strings and numbers.
 */
function literalSelectOptions(def) {
  const literals = def.values;
  if (!Array.isArray(literals)) return null;
  const out = [];
  for (const literal of literals) {
    const kind = typeof literal;
    if (kind !== "string" && kind !== "number") return null;
    const text = kind === "string" ? literal : String(literal);
    out.push({
      value: text,
      label: text
    });
  }
  return out;
}
399
/**
 * Read length limits from a string def's checks.
 *
 * `minLength` comes from the first `min_length` check's numeric `minimum`,
 * `maxLength` from the first `max_length` check's numeric `maximum`; a key
 * is omitted when its check is missing or non-numeric.
 */
function readStringChecks(def) {
  const checks = getChecks(def);
  const limits = {};
  const minimum = findCheck(checks, "min_length")?.minimum;
  if (typeof minimum === "number") limits.minLength = minimum;
  const maximum = findCheck(checks, "max_length")?.maximum;
  if (typeof maximum === "number") limits.maxLength = maximum;
  return limits;
}
408
/** `number_format` check formats that `readNumberChecks` treats as integer-only. */
const integerNumberFormats = new Set([
  "int",
  "safeint",
  "int32",
  "uint32",
  "int64",
  "uint64"
]);
416
/**
 * Read numeric bounds and integer-ness from a number def's checks.
 *
 * `min` / `max` are taken from the first `greater_than` / `less_than` check,
 * but only when that check is inclusive and its `value` is a number.
 * `integer` is set to `true` when the first `number_format` check names one
 * of the formats in `integerNumberFormats`. Keys are omitted otherwise.
 */
function readNumberChecks(def) {
  const checks = getChecks(def);
  const bounds = {};
  const lower = findCheck(checks, "greater_than");
  if (lower?.inclusive === true && typeof lower.value === "number") bounds.min = lower.value;
  const upper = findCheck(checks, "less_than");
  if (upper?.inclusive === true && typeof upper.value === "number") bounds.max = upper.value;
  const formatName = findCheck(checks, "number_format")?.format;
  if (typeof formatName === "string" && integerNumberFormats.has(formatName)) bounds.integer = true;
  return bounds;
}
427
/**
 * Build the UI descriptor for one manual-input field.
 *
 * Precedence, highest first:
 * 1. `override.asJson` gives a "json" widget.
 * 2. `override.asFile` gives a "file" widget.
 * 3. `override.options` gives a "select" with the authored options.
 * 4. The unwrapped schema type: string gives "text" (or "multiline" when
 *    `override.multiline`), number/int/bigint gives "number", boolean gives
 *    "boolean", and enum/literal/union give "select" when options can be
 *    derived.
 * 5. Everything else falls back to "json".
 *
 * @returns `Result.ok(field)`, or `Result.err` when the schema cannot be
 *   introspected.
 */
function buildField(key, fieldSchema, override) {
  const unwrapped = unwrap(fieldSchema);
  if (!unwrapped) return Result.err(/* @__PURE__ */ new Error(`manualInput: field "${key}" uses an unsupported Zod schema (could not introspect)`));
  const inner = unwrapped.def;
  const description = override?.description ?? getDescription(unwrapped.schema);
  const base = {
    key,
    label: override?.label ?? humaniseKey(key),
    description,
    placeholder: override?.placeholder,
    required: unwrapped.required,
    defaultValue: override?.defaultValue !== void 0 ? override.defaultValue : unwrapped.defaultValue
  };
  // Every widget shares `base`; only `kind` and the widget-specific extras vary.
  const widget = (kind, extras) => Result.ok({
    ...base,
    kind,
    ...extras
  });
  const jsonFallback = () => widget("json", { rows: override?.rows });
  if (override?.asJson === true) return widget("json", { rows: override.rows });
  if (override?.asFile === true) return widget("file", {
    accept: override.accept,
    maxSizeBytes: override.maxSizeBytes
  });
  const authoredOptions = normaliseSelectOptions(override?.options);
  if (authoredOptions) return widget("select", { options: authoredOptions });
  const schemaType = inner.type;
  if (schemaType === "string") {
    const { minLength, maxLength } = readStringChecks(inner);
    if (override?.multiline === true) return widget("multiline", {
      rows: override.rows,
      minLength,
      maxLength
    });
    return widget("text", {
      minLength,
      maxLength
    });
  }
  if (schemaType === "number" || schemaType === "int" || schemaType === "bigint") {
    const bounds = readNumberChecks(inner);
    return widget("number", {
      min: bounds.min,
      max: bounds.max,
      integer: bounds.integer
    });
  }
  if (schemaType === "boolean") return widget("boolean", {});
  let derivedOptions = null;
  if (schemaType === "enum") derivedOptions = enumOptionsFromEntries(inner);
  else if (schemaType === "literal") {
    // Unlike enums, a literal with no usable values falls back to JSON.
    const literalOptions = literalSelectOptions(inner);
    derivedOptions = literalOptions && literalOptions.length > 0 ? literalOptions : null;
  } else if (schemaType === "union") derivedOptions = literalUnionOptions(inner);
  if (derivedOptions) return widget("select", { options: derivedOptions });
  return jsonFallback();
}
539
/**
 * Return the `shape` of a schema whose def type is "object", or `null` when
 * the def cannot be read, is another type, or has no object `shape`.
 */
function getObjectShape(schema) {
  const def = getZodDef(schema);
  if (def === null || def.type !== "object") return null;
  return isObject(def.shape) ? def.shape : null;
}
547
/**
 * Produce the descriptor for an eval's `manualInput` configuration: the
 * config's `title`, `description`, and `submitLabel` plus one field
 * descriptor per key of the schema's object shape.
 *
 * `config.schema` must resolve to a top-level `z.object(...)`. Per-field
 * entries in `config.fields` are passed to `buildField` as overrides (falsy
 * entries are ignored).
 *
 * @returns `Result.ok(descriptor)`, or a `Result` error when the schema is
 *   not an object schema or a field cannot be built, so callers can report
 *   the problem without a throw.
 */
function buildManualInputDescriptor(config) {
  const shape = getObjectShape(config.schema);
  if (!shape) return Result.err(/* @__PURE__ */ new Error("manualInput.schema must be a top-level z.object(...). Wrap nested types in an object schema."));
  const overrides = {};
  const authoredOverrides = config.fields ? Object.entries(config.fields) : [];
  for (const [key, override] of authoredOverrides) {
    if (override) overrides[key] = override;
  }
  const fields = [];
  for (const [key, fieldSchema] of Object.entries(shape)) {
    const built = buildField(key, fieldSchema, overrides[key]);
    // Stop at the first field that cannot be described.
    if (built.error) return built.errorResult();
    fields.push(built.value);
  }
  const { title, description, submitLabel } = config;
  return Result.ok({
    title,
    description,
    submitLabel,
    fields
  });
}
577
/**
 * Validate a raw submission with `config.schema.safeParse`.
 *
 * @returns `Result.ok` with the parsed data on success, otherwise
 *   `Result.err` holding a `ManualInputValidationError` built from the
 *   formatted schema issues.
 */
function parseManualInputValues(config, raw) {
  const outcome = config.schema.safeParse(raw);
  if (outcome.success) return Result.ok(outcome.data);
  const issues = outcome.error.issues.map(formatIssue);
  return Result.err(new ManualInputValidationError(issues));
}
587
/**
 * Error describing manual-input values that failed schema validation.
 *
 * `issues` holds the `{ path, message }` entries passed to the constructor.
 * The message is "manualInput validation failed", followed (when there are
 * issues) by each issue rendered as `path: message`, or just `message` for
 * an empty path, joined with "; ".
 */
var ManualInputValidationError = class extends Error {
  issues;
  constructor(issues) {
    const headline = "manualInput validation failed";
    const renderIssue = (issue) => issue.path ? `${issue.path}: ${issue.message}` : issue.message;
    const details = issues.map(renderIssue).join("; ");
    super(issues.length === 0 ? headline : `${headline}: ${details}`);
    this.name = "ManualInputValidationError";
    this.issues = issues;
  }
};
600
/**
 * Reduce a schema issue to `{ path, message }`, with `path` being the
 * string/number path segments joined by ".". Segments of any other type,
 * and empty-string segments, are left out.
 */
function formatIssue(issue) {
  const segments = [];
  for (const segment of issue.path) {
    if (typeof segment !== "string" && typeof segment !== "number") continue;
    const text = String(segment);
    if (text !== "") segments.push(text);
  }
  return {
    path: segments.join("."),
    message: issue.message
  };
}
606
+ //#endregion
607
+ //#region ../runner/src/runMaintenance.ts
608
/**
 * Write a run's state into its run directory, one file after another:
 * `summary.json` and `run.json` (pretty-printed JSON of the summary and
 * manifest) and `cases.jsonl` (one JSON case row per line, newline-joined).
 */
async function persistRunState(runState) {
  const pretty = (value) => JSON.stringify(value, null, 2);
  // Serialisers are thunks so each file is rendered right before it is written.
  const outputs = [
    ["summary.json", () => pretty(runState.summary)],
    ["run.json", () => pretty(runState.manifest)],
    ["cases.jsonl", () => runState.cases.map((caseRow) => JSON.stringify(caseRow)).join("\n")]
  ];
  for (const [fileName, serialise] of outputs) {
    await writeFile(join(runState.runDir, fileName), serialise());
  }
}
614
/**
 * Recompute a persisted case's status against a set of score thresholds.
 *
 * Checks run in this order:
 * 1. A "cancelled" row stays "cancelled".
 * 2. A case detail carrying an error yields "error".
 * 3. A case detail with assertion failures yields "fail".
 * 4. Any thresholded score whose numeric value (row column first, then
 *    detail column) is below its threshold yields "fail"; non-numeric
 *    values are ignored.
 * 5. Otherwise an "error" row stays "error" and everything else is "pass".
 *
 * @param scoreThresholds iterable of `[columnKey, passThreshold]` pairs.
 */
function recomputePersistedCaseStatus(caseRow, caseDetail, scoreThresholds) {
  if (caseRow.status === "cancelled") return "cancelled";
  const detailError = caseDetail?.error;
  if (detailError !== null && detailError !== void 0) return "error";
  const assertionFailureCount = caseDetail?.assertionFailures.length ?? 0;
  if (assertionFailureCount > 0) return "fail";
  for (const [key, passThreshold] of scoreThresholds) {
    const score = caseRow.columns[key] ?? caseDetail?.columns[key];
    if (typeof score === "number" && score < passThreshold) return "fail";
  }
  if (caseRow.status === "error") return "error";
  return "pass";
}
633
/**
 * Decide whether a run involves the given eval.
 *
 * True when any case row belongs to `evalKey`. Otherwise the run target
 * decides: mode "all" defers to `evalExists`, mode "evalIds" checks the
 * target's `evalKeys` list, and any other mode is false.
 */
function runTouchesEval(params) {
  const { caseRows, target, evalKey } = params;
  for (const caseRow of caseRows) {
    if (caseRow.evalKey === evalKey) return true;
  }
  switch (target.mode) {
    case "all": return params.evalExists;
    case "evalIds": return target.evalKeys?.includes(evalKey) ?? false;
    default: return false;
  }
}
639
/**
 * Remove every run whose manifest is flagged `temporary`.
 *
 * A temporary run that is still "running" is first marked "cancelled" in
 * both manifest and summary (with `endedAt` and `totalDurationMs` set from
 * the current time) and handed to `params.cancelRunningRun`. The run is
 * then dropped from `params.runs` and its directory deleted recursively;
 * a missing directory is not an error.
 *
 * @returns the number of runs deleted.
 */
async function deleteTemporaryRuns(params) {
  let deletedRuns = 0;
  // Iterate over a snapshot because entries are deleted from the map as we go.
  const snapshot = Array.from(params.runs);
  for (const [runId, run] of snapshot) {
    const { manifest } = run;
    if (manifest.temporary !== true) continue;
    if (manifest.status === "running") {
      const endedAt = /* @__PURE__ */ new Date();
      const startedMs = new Date(manifest.startedAt).getTime();
      manifest.status = "cancelled";
      manifest.endedAt = endedAt.toISOString();
      run.summary.status = "cancelled";
      run.summary.totalDurationMs = endedAt.getTime() - startedMs;
      params.cancelRunningRun(run);
    }
    params.runs.delete(runId);
    await rm(run.runDir, {
      recursive: true,
      force: true
    });
    deletedRuns++;
  }
  return deletedRuns;
}
660
/**
 * Re-evaluate the status of one eval's cases across the given runs.
 *
 * Runs that do not touch `params.evalKey`, or that are still "running", are
 * skipped. For each case of the eval whose recomputed status differs, the
 * row (and its case detail, persisted through `params.persistCaseDetail`)
 * is updated. A run with at least one change has its summary case counters
 * re-derived from all of its case rows and is written back to disk.
 *
 * @returns the number of runs that were changed and persisted.
 */
async function recomputeEvalStatusesInRuns(params) {
  const summaryCounters = [
    "totalCases",
    "passedCases",
    "failedCases",
    "errorCases",
    "cancelledCases"
  ];
  let updatedRuns = 0;
  for (const run of params.runs) {
    const touchesEval = runTouchesEval({
      target: run.manifest.target,
      caseRows: run.cases,
      evalKey: params.evalKey,
      evalExists: params.evalExists
    });
    if (!touchesEval || run.manifest.status === "running") continue;
    let changed = false;
    for (const caseRow of run.cases) {
      if (caseRow.evalKey !== params.evalKey) continue;
      const caseDetail = run.caseDetails.get(getCaseRowCaseKey(caseRow));
      const nextStatus = recomputePersistedCaseStatus(caseRow, caseDetail, params.scoreThresholds);
      if (nextStatus === caseRow.status) continue;
      caseRow.status = nextStatus;
      changed = true;
      if (!caseDetail) continue;
      caseDetail.status = nextStatus;
      await params.persistCaseDetail(run.runDir, caseDetail);
    }
    if (!changed) continue;
    const derived = deriveScopedSummaryFromCases({ caseRows: run.cases });
    for (const counter of summaryCounters) run.summary[counter] = derived[counter];
    await persistRunState(run);
    updatedRuns += 1;
  }
  return updatedRuns;
}
695
+ //#endregion
696
+ //#region ../runner/src/runPersistence.ts
697
/** Matches a short run id of the form `r<digits>` and captures the digits. */
const SHORT_ID_PATTERN = /^r(\d+)$/;
698
/**
 * Generate a filesystem-safe, sortable run id of the form
 * `YYYY-MM-DDTHH-MM-SSZ_xxxxxx`: the current UTC time followed by a
 * six-character base-36 random suffix.
 */
function generateRunId() {
  const now = /* @__PURE__ */ new Date();
  const pad = (n) => String(n).padStart(2, "0");
  const date = `${String(now.getUTCFullYear())}-${pad(now.getUTCMonth() + 1)}-${pad(now.getUTCDate())}`;
  const time = `${pad(now.getUTCHours())}-${pad(now.getUTCMinutes())}-${pad(now.getUTCSeconds())}`;
  // The base-36 rendering of a random number can have fewer than six
  // fractional digits (0.5 is "0.i", 0 is "0"), so pad to a fixed width to
  // keep the suffix from collapsing to one character or to nothing.
  const suffix = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `${date}T${time}Z_${suffix}`;
}
707
/**
 * Extract the numeric part of a short id matching `SHORT_ID_PATTERN`.
 *
 * @returns the number, or `null` when the id is `undefined`, does not
 *   match, or its digits do not convert to a finite number.
 */
function parseShortIdNum(shortId) {
  if (shortId === void 0) return null;
  const digits = SHORT_ID_PATTERN.exec(shortId)?.[1];
  if (digits === void 0) return null;
  const parsed = Number(digits);
  return Number.isFinite(parsed) ? parsed : null;
}
715
/**
 * Compute the next short-id number: one more than the highest number found
 * among the snapshots' `manifest.shortId` values, or `0` when none parses.
 * Short ids that `parseShortIdNum` rejects are ignored.
 */
function nextShortIdFromSnapshots(snapshots) {
  let highest = -1;
  for (const { manifest } of snapshots) {
    const parsed = parseShortIdNum(manifest.shortId);
    if (parsed === null) continue;
    highest = Math.max(highest, parsed);
  }
  return highest + 1;
}
728
/**
 * Load every readable run snapshot under `<localStateDir>/runs`.
 *
 * Sub-directories are visited in sorted path order, one at a time; those
 * for which `loadPersistedRunSnapshot` yields nothing are skipped. Returns
 * `[]` when the runs directory cannot be listed.
 */
async function loadPersistedRunSnapshots(localStateDir) {
  const runsDir = join(localStateDir, "runs");
  const listing = await resultify(() => readdir(runsDir, { withFileTypes: true }));
  if (listing.error) return [];
  const runDirs = [];
  for (const entry of listing.value) {
    if (entry.isDirectory()) runDirs.push(join(runsDir, entry.name));
  }
  const loaded = [];
  for (const runDir of runDirs.toSorted()) {
    const snapshot = await loadPersistedRunSnapshot(runDir);
    if (snapshot) loaded.push(snapshot);
  }
  return loaded;
}
741
/**
 * Write a case detail as pretty-printed JSON to
 * `<runDir>/case-details/<encoded fileId>.json`. `fileId` defaults to the
 * detail's `caseId`.
 */
async function persistCaseDetail(runDir, caseDetail, fileId = caseDetail.caseId) {
  const fileName = `${encodeCaseDetailFileName(fileId)}.json`;
  const targetPath = join(runDir, "case-details", fileName);
  const payload = JSON.stringify(caseDetail, null, 2);
  await writeFile(targetPath, payload);
}
744
/** Map each eval key from `getLatestRunInfos(params)` to just its latest `status`. */
function getLastRunStatuses(params) {
  const statuses = /* @__PURE__ */ new Map();
  for (const [evalId, info] of getLatestRunInfos(params)) statuses.set(evalId, info.status);
  return statuses;
}
748
/**
 * Collect, per eval key, the metadata of the most recent run involving it.
 *
 * Runs are ordered by their freshness timestamp (oldest first) and applied
 * in that order, so later runs overwrite earlier ones. Each entry carries
 * the eval's status within the run, the run's freshness timestamp (as
 * `startedAt`), its commit SHA (or `null`), and the eval-source fingerprint
 * recorded for that eval (or `null`).
 *
 * @param params `{ runs, knownEvals }`.
 * @returns a `Map` from eval key to that metadata.
 */
function getLatestRunInfos(params) {
  const { runs, knownEvals } = params;
  const evalMetas = [...knownEvals];
  const manualScoreKeysByEval = /* @__PURE__ */ new Map();
  for (const evalMeta of evalMetas) {
    const manualKeys = [];
    for (const columnDef of evalMeta.columnDefs) {
      if (columnDef.isManualScore === true) manualKeys.push(columnDef.key);
    }
    manualScoreKeysByEval.set(evalMeta.key, manualKeys);
  }
  const timeOf = (run) => new Date(getRunFreshnessTimestamp(run.manifest)).getTime();
  const chronological = [...runs].toSorted((a, b) => timeOf(a) - timeOf(b));
  const latestRunInfos = /* @__PURE__ */ new Map();
  for (const run of chronological) {
    for (const evalKey of getRunEvalKeys(run, evalMetas)) {
      latestRunInfos.set(evalKey, {
        status: getEvalStatusForRun(run, evalKey, manualScoreKeysByEval.get(evalKey) ?? []),
        startedAt: getRunFreshnessTimestamp(run.manifest),
        commitSha: run.manifest.commitSha ?? null,
        evalSourceFingerprint: run.manifest.evalSourceFingerprints[evalKey] ?? null
      });
    }
  }
  return latestRunInfos;
}
766
/** Convert a derived status into a last-run status: "pending" becomes `null`, anything else passes through. */
function toLastRunStatus$1(status) {
  if (status === "pending") return null;
  return status;
}
769
/**
 * Load one run directory into a snapshot.
 *
 * `run.json` and `summary.json` must both be readable and schema-valid,
 * otherwise `null` is returned. Case rows and case details are then read,
 * in that order.
 */
async function loadPersistedRunSnapshot(runDir) {
  const asParser = (schema) => ({ safeParse: schema.safeParse.bind(schema) });
  const manifest = await readParsedJsonFile(join(runDir, "run.json"), asParser(runManifestSchema));
  if (!manifest) return null;
  const summary = await readParsedJsonFile(join(runDir, "summary.json"), asParser(runSummarySchema));
  if (!summary) return null;
  const cases = await readCaseRows(runDir);
  const caseDetails = await readCaseDetails(runDir);
  return {
    runDir,
    manifest,
    summary,
    cases,
    caseDetails
  };
}
782
/**
 * Read a UTF-8 file, parse it as JSON, and validate it with
 * `schema.safeParse`.
 *
 * @returns the validated data, or `null` when reading, JSON parsing, or
 *   validation fails.
 */
async function readParsedJsonFile(filePath, schema) {
  const text = await resultify(() => readFile(filePath, "utf-8"));
  if (text.error) return null;
  const json = resultify(() => JSON.parse(text.value));
  if (json.error) return null;
  const outcome = schema.safeParse(json.value);
  return outcome.success ? outcome.data : null;
}
791
/**
 * Read `<runDir>/cases.jsonl` into validated case rows.
 *
 * Blank lines, lines that are not valid JSON, and rows rejected by
 * `caseRowSchema` are skipped. Returns `[]` when the file cannot be read.
 */
async function readCaseRows(runDir) {
  const fileResult = await resultify(() => readFile(join(runDir, "cases.jsonl"), "utf-8"));
  if (fileResult.error) return [];
  const rows = [];
  const lines = fileResult.value.split("\n").map((rawLine) => rawLine.trim()).filter((line) => line.length > 0);
  for (const line of lines) {
    const jsonResult = resultify(() => JSON.parse(line));
    if (jsonResult.error) continue;
    const parsed = caseRowSchema.safeParse(jsonResult.value);
    if (parsed.success) rows.push(parsed.data);
  }
  return rows;
}
806
/**
 * Read every `*.json` file in `<runDir>/case-details` into a `Map` keyed by
 * the detail's `caseKey`, falling back to `caseId` when `caseKey` is
 * nullish. Unreadable or schema-invalid files are skipped. Returns an empty
 * map when the directory cannot be listed.
 */
async function readCaseDetails(runDir) {
  const caseDetails = /* @__PURE__ */ new Map();
  const detailsDir = join(runDir, "case-details");
  const listing = await resultify(() => readdir(detailsDir, { withFileTypes: true }));
  if (listing.error) return caseDetails;
  const detailParser = { safeParse: caseDetailSchema.safeParse.bind(caseDetailSchema) };
  for (const entry of listing.value) {
    const isJsonFile = entry.isFile() && entry.name.endsWith(".json");
    if (!isJsonFile) continue;
    const detail = await readParsedJsonFile(join(detailsDir, entry.name), detailParser);
    if (detail) caseDetails.set(detail.caseKey ?? detail.caseId, detail);
  }
  return caseDetails;
}
819
/**
 * List the eval keys a run involves, without duplicates.
 *
 * Starts with the `evalKey` of every case row that has one. A run targeting
 * "evalIds" also adds the target's `evalKeys`. A run targeting "all" that
 * has no keys from its case rows adds every known eval's key.
 */
function getRunEvalKeys(run, knownEvals) {
  const evalMetas = [...knownEvals];
  const { target } = run.manifest;
  const collected = /* @__PURE__ */ new Set();
  for (const caseRow of run.cases) {
    if (caseRow.evalKey === void 0) continue;
    collected.add(caseRow.evalKey);
  }
  if (target.mode === "evalIds") {
    for (const evalKey of target.evalKeys ?? []) collected.add(evalKey);
  } else if (target.mode === "all" && collected.size === 0) {
    for (const evalMeta of evalMetas) collected.add(evalMeta.key);
  }
  return Array.from(collected);
}
827
/**
 * Derive the last-run status of one eval within a persisted run.
 *
 * When the run has case rows for the eval, the status is `"unscored"` if any
 * manual score is still missing, otherwise it is derived from those rows.
 * When there are no rows, the status is derived from the run's lifecycle
 * status alone.
 *
 * @param run Persisted run exposing `cases` and `manifest.status`.
 * @param evalKey Key of the eval to inspect.
 * @param manualScoreKeys Column keys of the eval's manual scores.
 */
function getEvalStatusForRun(run, evalKey, manualScoreKeys) {
	const rowsForEval = run.cases.filter((row) => row.evalKey === evalKey);
	if (rowsForEval.length === 0) {
		const lifecycleOnly = deriveStatusFromChildStatuses({
			statuses: [],
			lifecycleStatus: run.manifest.status
		});
		return toLastRunStatus$1(lifecycleOnly);
	}
	if (hasPendingManualScores(rowsForEval, manualScoreKeys)) return "unscored";
	return toLastRunStatus$1(deriveStatusFromCaseRows({ caseRows: rowsForEval }));
}
838
/**
 * Report whether any case row is missing a finite numeric value for at least
 * one manual score column.
 *
 * @param caseRows Rows exposing a `columns` record.
 * @param manualScoreKeys Column keys that must hold a finite number.
 * @returns `false` when there are no manual score keys or every row has a
 *   finite number for each key; `true` otherwise.
 */
function hasPendingManualScores(caseRows, manualScoreKeys) {
	if (manualScoreKeys.length === 0) return false;
	for (const row of caseRows) {
		for (const key of manualScoreKeys) {
			const score = row.columns[key];
			const isScored = typeof score === "number" && Number.isFinite(score);
			if (!isScored) return true;
		}
	}
	return false;
}
845
/**
 * Percent-encode a case id with `encodeURIComponent` so it can be used as a
 * single file-name component.
 *
 * @param caseId Raw case identifier.
 * @returns The encoded identifier.
 */
function encodeCaseDetailFileName(caseId) {
	const encoded = encodeURIComponent(caseId);
	return encoded;
}
848
+ //#endregion
849
+ //#region ../runner/src/caseChildManager.ts
850
/** Node flag that is always passed to case child processes. */
const moduleMocksFlag = "--experimental-test-module-mocks";
/** Inspector flags that are removed from the exec arguments inherited by case children. */
const inspectFlagPrefix = "--inspect";
const inspectBrkFlagPrefix = "--inspect-brk";
/** Maximum number of characters kept per stream in a case child's output tail. */
const childOutputTailMaxLength = 12000;
/** Maximum number of characters of the stderr headline quoted in exit errors. */
const outputHeadlineMaxLength = 240;
855
/**
 * Run one eval case in a dedicated Node child process and resolve with the
 * structured result the child reports over IPC.
 *
 * The child is started with the current Node binary, the filtered exec
 * arguments from `getCaseChildExecArgv()` and the case child entrypoint. Its
 * stdout/stderr are piped (and tailed for error reporting), and a
 * `{ type: "start", context }` message is sent over the IPC channel.
 *
 * @param context Case execution context forwarded to the child. Its
 *   `workspaceRoot` is used as the child's working directory.
 * @returns The `result` carried by the child's result message.
 * @throws Error when the child cannot be started, the start message cannot be
 *   delivered, the child reports an `"error"` message, or it exits without a
 *   result, with a non-zero code, or because of a signal.
 */
async function executeCaseChild(context) {
	const child = spawn(process.execPath, [...getCaseChildExecArgv(), resolveCaseChildEntrypoint()], {
		cwd: context.workspaceRoot,
		env: process.env,
		serialization: "advanced",
		stdio: [
			"ignore",
			"pipe",
			"pipe",
			"ipc"
		]
	});
	const outputTail = createCaseChildOutputTail(child);
	return await new Promise((resolvePromise, rejectPromise) => {
		let result;
		let childError;
		let settled = false;
		function settleWithError(error) {
			if (settled) return;
			settled = true;
			rejectPromise(error);
		}
		// A child that never receives its start message would otherwise stay
		// alive waiting on the IPC channel, so it is killed before rejecting.
		function failStart(error) {
			const reason = error instanceof Error ? error.message : String(error);
			if (childError === void 0) childError = new Error(`Failed to send start message to case child: ${reason}`);
			child.kill();
			settleWithError(childError);
		}
		// `on` rather than `once`: a second "error" event with no listener
		// attached would be thrown as an uncaught exception in this process.
		child.on("error", (error) => {
			if (childError === void 0) childError = /* @__PURE__ */ new Error(`Failed to start case child: ${error.message}`);
		});
		child.on("message", (message) => {
			if (!isCaseChildMessage(message)) return;
			if (message.type === "error") {
				childError = new Error(message.message);
				return;
			}
			result = message.result;
		});
		child.once("close", (code, signal) => {
			if (childError !== void 0) {
				settleWithError(childError);
				return;
			}
			if (result !== void 0 && code === 0 && signal === null) {
				if (settled) return;
				settled = true;
				resolvePromise(result);
				return;
			}
			const reason = formatChildExitReason(code, signal);
			settleWithError(new Error(formatUnexpectedCaseChildExit(reason, outputTail)));
		});
		try {
			// Delivery failures are reported through the callback; values the
			// "advanced" serializer cannot clone make `send` throw synchronously.
			child.send({
				type: "start",
				context
			}, (error) => {
				if (error) failStart(error);
			});
		} catch (error) {
			failStart(error);
		}
	});
}
908
/**
 * Mirror a case child's stdout/stderr to this process and keep a bounded tail
 * of each stream for error reporting.
 *
 * Each stream is decoded with its own streaming `TextDecoder`, so a
 * multi-byte UTF-8 character split across two chunks is kept intact in the
 * tail instead of being replaced by U+FFFD on both sides of the split. The
 * bytes forwarded to `process.stdout` / `process.stderr` are not altered.
 *
 * @param child Child process whose `stdout` / `stderr` streams (if present)
 *   emit `"data"` events.
 * @returns A live object `{ stdout, stderr, stdoutTruncated, stderrTruncated }`
 *   that is updated as output arrives.
 */
function createCaseChildOutputTail(child) {
	const tail = {
		stdout: "",
		stderr: "",
		stdoutTruncated: false,
		stderrTruncated: false
	};
	function capture(source, sink, textKey, truncatedKey) {
		// `ignoreBOM: true` keeps a leading BOM in the text, as `Buffer#toString` does.
		const decoder = new TextDecoder("utf-8", { ignoreBOM: true });
		source?.on("data", (chunk) => {
			sink.write(chunk);
			const text = typeof chunk === "string" ? chunk : decoder.decode(chunk, { stream: true });
			const nextTail = appendOutputTail(tail[textKey], text);
			tail[textKey] = nextTail.text;
			tail[truncatedKey] = tail[truncatedKey] || nextTail.truncated;
		});
	}
	capture(child.stdout, process.stdout, "stdout", "stdoutTruncated");
	capture(child.stderr, process.stderr, "stderr", "stderrTruncated");
	return tail;
}
929
/**
 * Convert a stream chunk to text.
 *
 * @param chunk A string, or an object whose `toString("utf-8")` yields text
 *   (such as a Buffer).
 * @returns The chunk as a string.
 */
function chunkToText(chunk) {
	if (typeof chunk === "string") return chunk;
	return chunk.toString("utf-8");
}
932
/**
 * Append `next` to `current`, keeping at most `childOutputTailMaxLength`
 * trailing characters.
 *
 * @param current Text retained so far.
 * @param next Newly received text.
 * @returns `{ text, truncated }`, where `truncated` is true when leading
 *   characters had to be dropped by this call.
 */
function appendOutputTail(current, next) {
	const merged = current + next;
	const overflow = merged.length - childOutputTailMaxLength;
	if (overflow > 0) return {
		text: merged.slice(overflow),
		truncated: true
	};
	return {
		text: merged,
		truncated: false
	};
}
943
/**
 * Build the error message for a case child that exited without delivering a
 * usable structured result.
 *
 * The message has a summary line (quoting the first non-empty stderr line
 * when there is one), the exit reason, and the captured stderr/stdout tails
 * when they are non-empty, separated by blank lines.
 *
 * @param reason Exit description from `formatChildExitReason`.
 * @param outputTail Tail object produced by `createCaseChildOutputTail`.
 */
function formatUnexpectedCaseChildExit(reason, outputTail) {
	const cleanStderr = stripTerminalControlCodes(outputTail.stderr).trim();
	const cleanStdout = stripTerminalControlCodes(outputTail.stdout).trim();
	const headline = getChildStderrHeadline(cleanStderr);
	let summary;
	if (headline === null) summary = `${reason} before sending a structured case result.`;
	else summary = `Case child exited before sending a structured case result: ${headline}`;
	const parts = [summary, reason];
	if (cleanStderr.length > 0) parts.push(formatOutputSection("stderr", cleanStderr, outputTail.stderrTruncated));
	if (cleanStdout.length > 0) parts.push(formatOutputSection("stdout", cleanStdout, outputTail.stdoutTruncated));
	return parts.join("\n\n");
}
952
/**
 * Describe how a case child exited.
 *
 * @param code Exit code reported by the `"close"` event.
 * @param signal Terminating signal, or `null` when the child was not signalled.
 * @returns A sentence naming the signal when there is one, otherwise the code.
 */
function formatChildExitReason(code, signal) {
	return signal === null ? `Case child exited with code ${String(code)}` : `Case child exited with signal ${signal}`;
}
956
/**
 * Pick the first non-blank line of `stderr` as a short headline.
 *
 * @param stderr Captured stderr text.
 * @returns The trimmed line, cut to `outputHeadlineMaxLength` characters plus
 *   `"..."` when longer, or `null` when every line is blank.
 */
function getChildStderrHeadline(stderr) {
	for (const rawLine of stderr.split("\n")) {
		const candidate = rawLine.trim();
		if (candidate.length === 0) continue;
		return candidate.length > outputHeadlineMaxLength ? `${candidate.slice(0, outputHeadlineMaxLength)}...` : candidate;
	}
	return null;
}
962
/**
 * Render one captured output stream as a labelled section.
 *
 * @param streamName Stream label, e.g. `"stderr"`.
 * @param output Text to show.
 * @param truncated Whether the text is only a tail; when true the heading
 *   states how many characters are shown.
 */
function formatOutputSection(streamName, output, truncated) {
	let heading = `Case child ${streamName}`;
	if (truncated) heading = `Case child ${streamName} (last ${String(output.length)} chars)`;
	return `${heading}:\n${output}`;
}
965
/**
 * Build the Node exec arguments for a case child from `process.execArgv`.
 *
 * The module-mocks flag always comes first. Inherited arguments are forwarded
 * except for inline-script flags (`--eval`/`-e`, `--print`/`-p`,
 * `--input-type`) together with their values, a duplicate module-mocks flag,
 * and anything `isInspectArg` recognises.
 *
 * @returns The exec arguments to pass to the child.
 */
function getCaseChildExecArgv() {
	const flagsWithSeparateValue = new Set([
		"--eval",
		"-e",
		"--print",
		"-p",
		"--input-type"
	]);
	const inlineValuePrefixes = [
		"--eval=",
		"--print=",
		"--input-type="
	];
	const forwarded = [moduleMocksFlag];
	const inherited = process.execArgv;
	for (let index = 0; index < inherited.length; index++) {
		const arg = inherited[index];
		if (flagsWithSeparateValue.has(arg)) {
			// The flag's value is the following argument; skip it as well.
			index++;
			continue;
		}
		if (inlineValuePrefixes.some((prefix) => arg.startsWith(prefix))) continue;
		if (arg === moduleMocksFlag || isInspectArg(arg)) continue;
		forwarded.push(arg);
	}
	return forwarded;
}
988
/**
 * Report whether `arg` is a Node flag that activates the inspector.
 *
 * Covers `--inspect`, `--inspect-brk` and `--inspect-wait`, each either bare
 * or with an `=[host:]port` value. These are withheld from case children
 * rather than inherited from the parent process.
 *
 * @param arg A single exec argument.
 */
function isInspectArg(arg) {
	return [
		inspectFlagPrefix,
		inspectBrkFlagPrefix,
		"--inspect-wait"
	].some((flag) => arg === flag || arg.startsWith(`${flag}=`));
}
991
/**
 * Locate the case child entrypoint next to this module.
 *
 * Tries `caseChild.ts`, `caseChild.mjs` and `caseChild.js`, in that order, in
 * this module's directory.
 *
 * @returns Absolute path of the first candidate that exists.
 * @throws Error when none of the candidates exists.
 */
function resolveCaseChildEntrypoint() {
	const moduleDir = dirname(fileURLToPath(import.meta.url));
	const entrypoint = [
		"caseChild.ts",
		"caseChild.mjs",
		"caseChild.js"
	].map((fileName) => join(moduleDir, fileName)).find((candidate) => existsSync(candidate));
	if (entrypoint === void 0) throw new Error("Unable to locate the Agent Evals case child entrypoint.");
	return entrypoint;
}
1003
+ //#endregion
1004
+ //#region ../runner/src/runQueue.ts
1005
+ async function executeQueuedCases(params) {
1006
+ const { queuedCases, concurrency, globalTraceDisplay } = params;
1007
+ let nextCaseIndex = 0;
1008
+ let workerError = void 0;
1009
+ const workerCount = Math.min(concurrency, queuedCases.length);
1010
+ const workers = Array.from({ length: workerCount }, async () => {
1011
+ while (workerError === void 0) {
1012
+ const queuedCase = queuedCases[nextCaseIndex];
1013
+ nextCaseIndex += 1;
1014
+ if (queuedCase === void 0) return;
1015
+ try {
1016
+ await executeQueuedCase({
1017
+ queuedCase,
1018
+ globalTraceDisplay
1019
+ });
1020
+ } catch (error) {
1021
+ workerError = error instanceof Error ? error : new Error(String(error));
1022
+ return;
1023
+ }
1024
+ }
1025
+ });
1026
+ await Promise.all(workers);
1027
+ if (workerError instanceof Error) throw workerError;
1028
+ if (workerError !== void 0) throw new Error(typeof workerError === "string" ? workerError : typeof workerError === "number" || typeof workerError === "boolean" || typeof workerError === "bigint" ? String(workerError) : workerError === null ? "null" : "Unknown queue worker error");
1029
+ }
1030
+ async function executeQueuedCase(params) {
1031
+ const { queuedCase, globalTraceDisplay } = params;
1032
+ const startTime = Date.now();
1033
+ const result = await queuedCase.execute({
1034
+ globalTraceDisplay,
1035
+ startTime
1036
+ });
1037
+ await queuedCase.onComplete(result);
1038
+ }
1039
+ //#endregion
1040
+ //#region ../runner/src/tags.ts
1041
/**
 * Describe every tag in `params.tags` that `validateEvalTagName` rejects.
 *
 * @param params.tags Tags to check; a missing list is treated as empty.
 * @param params.source Label placed at the start of each message.
 * @returns One message per invalid tag, in input order.
 */
function getInvalidTagMessages(params) {
	const problems = [];
	for (const tag of params.tags ?? []) {
		const verdict = validateEvalTagName(tag);
		if (!verdict.ok) problems.push(`${params.source} tag "${tag}" is invalid: ${verdict.message}`);
	}
	return problems;
}
1047
/**
 * Resolve the effective eval-level tags and the tag-related discovery issues
 * for one eval.
 *
 * Effective tags are the config tags minus the eval's `removeTags`, followed
 * by the eval's own tags, de-duplicated. Issues are reported for invalid tag
 * names in any of the three lists and for `removeTags` entries that are not
 * config tags.
 *
 * @returns `{ tags, issues }`, where each issue is an `"invalid-tags"` error
 *   carrying the eval id and file path.
 */
function resolveEvalTags(params) {
	const { evalDef } = params;
	const configTags = params.configTags ?? [];
	const removeTags = evalDef.removeTags ?? [];
	const problems = [
		{
			tags: configTags,
			source: "config"
		},
		{
			tags: evalDef.tags,
			source: "eval"
		},
		{
			tags: removeTags,
			source: "removeTags"
		}
	].flatMap((group) => getInvalidTagMessages(group));
	const knownConfigTags = new Set(configTags);
	for (const tag of removeTags) {
		if (knownConfigTags.has(tag)) continue;
		problems.push(`removeTags tag "${tag}" is not defined in AgentEvalsConfig.tags.`);
	}
	const removed = new Set(removeTags);
	const inheritedTags = configTags.filter((tag) => !removed.has(tag));
	const issues = problems.map((message) => ({
		type: "invalid-tags",
		severity: "error",
		filePath: params.filePath,
		evalId: params.evalId,
		message: `Invalid tags for eval "${params.evalId}" in ${params.filePath}: ${message}`
	}));
	return {
		tags: dedupeEvalTags([...inheritedTags, ...evalDef.tags ?? []]),
		issues
	};
}
1079
/**
 * Return the effective tags of a case: the eval's tags followed by the
 * case's own tags, de-duplicated.
 *
 * @throws Error when any tag authored on the case has an invalid name.
 */
function resolveCaseTags(params) {
	const { evalCase } = params;
	const problems = getInvalidTagMessages({
		tags: evalCase.tags,
		source: `case "${evalCase.id}"`
	});
	if (problems.length > 0) {
		throw new Error(`Invalid tags for case "${evalCase.id}" in ${params.filePath}#${params.evalId}: ${problems.join("; ")}`);
	}
	const ownTags = evalCase.tags ?? [];
	return dedupeEvalTags([...params.evalTags, ...ownTags]);
}
1088
/**
 * Validate CLI/API tag filter expressions.
 *
 * @param filters Filter expressions; `undefined`/`null` means none.
 * @returns The message for the first invalid expression, or `null` when all
 *   are valid.
 */
function validateTagsFilters(filters) {
	if (filters == null) return null;
	for (const expression of filters) {
		const problem = validateTagsFilterExpression(expression);
		if (problem === null) continue;
		return `Invalid --tags-filter "${expression}": ${problem}`;
	}
	return null;
}
1096
/**
 * Filter cases by Vitest-style tag expressions.
 *
 * @param cases Cases exposing `tags`.
 * @param tagsFilter Filter expressions; when undefined or empty every case is
 *   kept.
 * @returns A new array containing the matching cases.
 */
function filterEvalCasesByTags(cases, tagsFilter) {
	const hasFilter = tagsFilter !== void 0 && tagsFilter.length !== 0;
	if (!hasFilter) return [...cases];
	return cases.filter(({ tags }) => matchesTagsFilter({
		tags,
		filters: tagsFilter
	}));
}
1104
/**
 * Return whether the eval-level tags alone satisfy the run's tag filters.
 *
 * @param params.tags Effective eval-level tags.
 * @param params.tagsFilter Filter expressions of the run target.
 */
function evalTagsMatchFilter(params) {
	const { tags, tagsFilter } = params;
	return matchesTagsFilter({
		tags,
		filters: tagsFilter
	});
}
1111
+ //#endregion
1112
+ //#region ../runner/src/targeting.ts
1113
/**
 * Escape the regular-expression metacharacters that can reach this helper
 * from a glob pattern (`*` is handled by the glob translation itself).
 */
function escapeRegex(value) {
	return value.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
}
/**
 * Translate a glob pattern into an anchored regular expression.
 *
 * Backslashes are treated as path separators. Supported wildcards:
 * - `?` matches one character other than `/`;
 * - `*` matches any run of characters other than `/`;
 * - `**` matches any run of characters including `/`;
 * - a whole `**` segment followed by `/` (e.g. `src/**` + `/a.ts`) matches
 *   zero or more directories, so the pattern also matches `src/a.ts`.
 *
 * @param pattern Glob pattern.
 * @returns A RegExp that must match the whole path.
 */
function globToRegex(pattern) {
	const normalized = pattern.replaceAll("\\", "/");
	let regex = "^";
	for (let i = 0; i < normalized.length; i++) {
		const char = normalized[i];
		if (char === "*" && normalized[i + 1] === "*") {
			const startsSegment = i === 0 || normalized[i - 1] === "/";
			if (startsSegment && normalized[i + 2] === "/") {
				// "**/" as a full segment: optional directories, consuming the slash.
				regex += "(?:.*/)?";
				i += 2;
			} else {
				regex += ".*";
				i++;
			}
		} else if (char === "*") regex += "[^/]*";
		else if (char === "?") regex += "[^/]";
		else regex += escapeRegex(char ?? "");
	}
	regex += "$";
	return new RegExp(regex);
}
1132
/**
 * Test a file path against a target pattern.
 *
 * The pattern's backslashes are normalised to `/`. It matches when it equals
 * the path exactly or when its glob translation matches the path.
 *
 * @param pattern Literal path or glob pattern.
 * @param filePath Path to test.
 */
function fileMatches(pattern, filePath) {
	const normalizedPattern = pattern.replaceAll("\\", "/");
	return normalizedPattern === filePath || globToRegex(normalizedPattern).test(filePath);
}
1137
/**
 * Return whether an eval passes the target's file filter.
 *
 * @param evalMeta Eval metadata exposing `filePath`.
 * @param files File patterns; undefined or empty means no restriction.
 */
function matchesFiles(evalMeta, files) {
	if (files === void 0) return true;
	if (files.length === 0) return true;
	for (const pattern of files) {
		if (fileMatches(pattern, evalMeta.filePath)) return true;
	}
	return false;
}
1141
/**
 * Return whether an eval passes the target's eval-id filter.
 *
 * @param evalMeta Eval metadata exposing `id`.
 * @param evalIds Allowed ids; undefined or empty means no restriction.
 */
function matchesEvalIds(evalMeta, evalIds) {
	const unrestricted = evalIds === void 0 || evalIds.length === 0;
	if (unrestricted) return true;
	return evalIds.includes(evalMeta.id);
}
1145
/**
 * Return whether an eval passes the target's eval-key filter.
 *
 * @param evalMeta Eval metadata exposing `key`.
 * @param evalKeys Allowed keys; undefined or empty means no restriction.
 */
function matchesEvalKeys(evalMeta, evalKeys) {
	const unrestricted = evalKeys === void 0 || evalKeys.length === 0;
	if (unrestricted) return true;
	return evalKeys.includes(evalMeta.key);
}
1149
/**
 * Return the discovered evals selected by a run target.
 *
 * An eval is selected when it passes the target's eval-key, eval-id and file
 * filters. The result is a new array ordered by `filePath`.
 *
 * @param params.evals Iterable of discovered eval metadata.
 * @param params.request Run request exposing `target`.
 */
function getTargetEvals(params) {
	const { target } = params.request;
	const selected = [];
	for (const evalMeta of [...params.evals]) {
		const isTargeted = matchesEvalKeys(evalMeta, target.evalKeys) && matchesEvalIds(evalMeta, target.evalIds) && matchesFiles(evalMeta, target.files);
		if (isTargeted) selected.push(evalMeta);
	}
	return selected.sort((a, b) => a.filePath.localeCompare(b.filePath));
}
1154
/**
 * Resolve which exact eval keys a run request can affect.
 *
 * @param params.sortedEvals Discovered eval metadata.
 * @param params.request Run request exposing `target`.
 * @returns The keys of the targeted evals, in `getTargetEvals` order.
 */
function getTargetEvalKeys(params) {
	const { sortedEvals, request } = params;
	const targeted = getTargetEvals({
		evals: sortedEvals,
		request
	});
	return targeted.map(({ key }) => key);
}
1161
+ //#endregion
1162
+ //#region ../runner/src/runOrchestration.ts
1163
/**
 * Convert an empty fingerprint string into `undefined`.
 *
 * @param sourceFingerprint Fingerprint string, possibly empty.
 * @returns The fingerprint, or `undefined` when it is empty.
 */
function toOptionalSourceFingerprint(sourceFingerprint) {
	if (sourceFingerprint.length > 0) return sourceFingerprint;
	return void 0;
}
1166
/**
 * Build the module-isolation descriptor used while preparing one eval.
 *
 * @param params.runId Id of the run.
 * @param params.evalKey Key of the eval being prepared.
 * @param params.workspaceRoot Workspace root directory.
 * @returns `{ key, workspaceRoot }`, where `key` is
 *   `<runId>:<evalKey>:prepare`.
 */
function buildEvalPreparationModuleIsolation(params) {
	const { runId, evalKey, workspaceRoot } = params;
	const keyParts = [
		runId,
		evalKey,
		"prepare"
	];
	return {
		key: keyParts.join(":"),
		workspaceRoot
	};
}
1176
+ /**
1177
+ * Ranks case statuses from worst to best. Used to order trial attempts so the
1178
+ * pessimistic (`lowestScore`) strategy can pick the worst attempt. Any
1179
+ * non-terminal status outside `pass`/`fail`/`error` is treated as indistinct
1180
+ * from `fail` for comparison purposes.
1181
+ */
1182
+ function statusRank(status) {
1183
+ if (status === "pass") return 2;
1184
+ if (status === "error") return 0;
1185
+ return 1;
1186
+ }
1187
+ /**
1188
+ * Returns the minimum numeric value across the declared score columns for a
1189
+ * trial, or `-Infinity` when no score has a numeric value. Used as a
1190
+ * tiebreaker between trials that share the same status.
1191
+ */
1192
+ function minScoreValue(caseRow, scoreKeys) {
1193
+ let min = Number.POSITIVE_INFINITY;
1194
+ for (const key of scoreKeys) {
1195
+ const v = caseRow.columns[key];
1196
+ if (typeof v === "number" && Number.isFinite(v)) {
1197
+ if (v < min) min = v;
1198
+ }
1199
+ }
1200
+ return Number.isFinite(min) ? min : Number.NEGATIVE_INFINITY;
1201
+ }
1202
+ function compareTrialResults(left, right, scoreKeys) {
1203
+ const statusDiff = statusRank(left.caseRow.status) - statusRank(right.caseRow.status);
1204
+ if (statusDiff !== 0) return statusDiff;
1205
+ const scoreDiff = minScoreValue(left.caseRow, scoreKeys) - minScoreValue(right.caseRow, scoreKeys);
1206
+ if (scoreDiff !== 0) return scoreDiff;
1207
+ return left.caseRow.trial - right.caseRow.trial;
1208
+ }
1209
/**
 * Choose the attempt that represents a case among its trials.
 *
 * Attempts are ordered worst-to-best with `compareTrialResults`. The
 * `"lowestScore"` strategy picks the first (worst) attempt; any other
 * strategy picks the median, taking the lower of the two middle attempts
 * when the count is even.
 *
 * @param params.attempts Trial attempts of the case.
 * @param params.scoreKeys Score column keys used for ordering.
 * @param params.strategy Trial selection strategy.
 * @throws Error when there are no attempts.
 */
function pickWinningTrial(params) {
	const ranked = [...params.attempts].sort((left, right) => compareTrialResults(left, right, params.scoreKeys));
	const chosenIndex = params.strategy === "lowestScore" ? 0 : Math.floor((ranked.length - 1) / 2);
	const chosen = ranked[chosenIndex];
	if (chosen === void 0) throw new Error("Expected at least one trial attempt");
	return chosen;
}
1220
/**
 * Turn a caught value into text suitable for an error report.
 *
 * @param error Any thrown value.
 * @returns The stack (or message when there is no stack) for Error
 *   instances, the value itself for strings, and `String(value)` otherwise.
 */
function formatUnknownErrorDetails(error) {
	if (!(error instanceof Error)) return typeof error === "string" ? error : String(error);
	return error.stack ?? error.message;
}
1225
/**
 * Find case ids that occur more than once.
 *
 * @param cases Cases exposing `id`.
 * @returns The duplicated ids, each listed once, in sorted order.
 */
function findDuplicateCaseIds(cases) {
	const seen = /* @__PURE__ */ new Set();
	const repeated = /* @__PURE__ */ new Set();
	for (const evalCase of cases) {
		if (seen.has(evalCase.id)) repeated.add(evalCase.id);
		else seen.add(evalCase.id);
	}
	return [...repeated].sort();
}
1230
+ function throwIfDiscoveryIssues(issues) {
1231
+ if (issues.length === 0) return;
1232
+ throw new Error(issues.map((issue) => issue.message).join("\n"));
1233
+ }
1234
/**
 * Find case ids that are prepared by more than one eval.
 *
 * @param preparedEvals Prepared evals exposing `evalMeta` (`filePath`, `id`)
 *   and `preparedCases` (`caseId`).
 * @returns One `"<caseId> (<file>#<eval>, ...)"` entry per ambiguous case id,
 *   in first-seen order.
 */
function findAmbiguousTargetCaseIds(preparedEvals) {
	const ownersByCaseId = /* @__PURE__ */ new Map();
	for (const { evalMeta, preparedCases } of preparedEvals) {
		for (const { caseId } of preparedCases) {
			let owners = ownersByCaseId.get(caseId);
			if (owners === void 0) {
				owners = /* @__PURE__ */ new Set();
				ownersByCaseId.set(caseId, owners);
			}
			owners.add(`${evalMeta.filePath}#${evalMeta.id}`);
		}
	}
	const ambiguous = [];
	for (const [caseId, owners] of ownersByCaseId) {
		if (owners.size > 1) ambiguous.push(`${caseId} (${[...owners].join(", ")})`);
	}
	return ambiguous;
}
1243
/**
 * Render collected eval errors as one message.
 *
 * Each entry becomes `[<evalId>] <first line of details>`, followed by the
 * remaining detail lines when there are any. Entries are joined with
 * newlines.
 *
 * @param errors Entries exposing `evalId` and a `details` string.
 */
function buildRunErrorMessage(errors) {
	const rendered = [];
	for (const { evalId, details } of errors) {
		const lines = details.split("\n");
		const headline = lines[0]?.trim() ?? "Unknown error";
		const rest = lines.slice(1).join("\n").trim();
		rendered.push(rest.length === 0 ? `[${evalId}] ${headline}` : `[${evalId}] ${headline}\n${rest}`);
	}
	return rendered.join("\n");
}
1252
/**
 * Record the outcome of a prepared case once, using its winning trial.
 *
 * Does nothing when the case is already finalized or has no trial results.
 * Otherwise it selects the winning trial, commits that trial's pending cache
 * writes, adds the case row and detail to the run state, updates the summary
 * counters, writes the trace and case-detail files, notifies
 * `onCaseFinished`, emits `"case.finished"` and records the row on the eval.
 *
 * @param params Run state, run directory, cache store, the prepared eval and
 *   case, plus the `onCaseFinished` and `emitEvent` callbacks.
 */
async function finalizePreparedCase(params) {
	const { preparedCase, preparedEval, runState, runDir, emitEvent, onCaseFinished } = params;
	const nothingToFinalize = preparedCase.finalized || preparedCase.trialResults.length === 0;
	if (nothingToFinalize) return;
	// Marked before the first await so a concurrent call cannot finalize twice.
	preparedCase.finalized = true;
	const winner = pickWinningTrial({
		strategy: runState.manifest.trialSelection,
		attempts: preparedCase.trialResults,
		scoreKeys: preparedEval.scoreKeys
	});
	if (winner.pendingCacheWrites.length > 0) {
		await commitPendingCacheWrites({
			backingStore: params.cacheStore,
			pendingWrites: winner.pendingCacheWrites
		});
	}
	const { caseRow, caseDetail } = winner;
	// Resolved before the row is pushed: the id depends on rows already recorded.
	const artifactFileId = getCaseArtifactFileId(runState, caseRow);
	runState.cases.push(caseRow);
	runState.caseDetails.set(getCaseRowCaseKey(caseRow), caseDetail);
	switch (caseRow.status) {
		case "pass":
			runState.summary.passedCases++;
			break;
		case "error":
			runState.summary.errorCases++;
			break;
		default:
			runState.summary.failedCases++;
	}
	const tracePath = join(runDir, "traces", `${encodeURIComponent(artifactFileId)}.json`);
	await writeFile(tracePath, JSON.stringify(caseDetail.trace, null, 2));
	await persistCaseDetail(runDir, caseDetail, artifactFileId);
	onCaseFinished?.(caseDetail, caseRow);
	emitEvent(runState, {
		type: "case.finished",
		runId: runState.manifest.id,
		timestamp: (/* @__PURE__ */ new Date()).toISOString(),
		payload: caseRow
	});
	preparedEval.evalCaseRows.push(caseRow);
}
1282
/**
 * Build the lookup key used to order a case row: the row's eval key (or its
 * eval id when it has no eval key) and its case id, separated by a NUL
 * character.
 *
 * @param caseRow Row exposing `evalKey`, `evalId` and `caseId`.
 */
function getPreparedCaseOrderKey(caseRow) {
	const owner = caseRow.evalKey ?? caseRow.evalId;
	return `${owner}\u0000${caseRow.caseId}`;
}
1285
/**
 * Choose the id used for a case's artifact files.
 *
 * The plain case id is used unless the run already holds a row with the same
 * case id but a different case key; in that case the case key is used.
 *
 * @param runState Run state exposing the rows recorded so far in `cases`.
 * @param caseRow Row about to be recorded.
 */
function getCaseArtifactFileId(runState, caseRow) {
	const caseKey = getCaseRowCaseKey(caseRow);
	for (const existing of runState.cases) {
		if (existing.caseId === caseRow.caseId && getCaseRowCaseKey(existing) !== caseKey) return caseKey;
	}
	return caseRow.caseId;
}
1289
/**
 * Sort case rows in place so they follow the order in which cases were
 * prepared (eval by eval, case by case). Rows without a matching prepared
 * case sort last.
 *
 * @param caseRows Rows to sort; the array is mutated.
 * @param preparedEvals Prepared evals exposing `evalMeta.key` and
 *   `preparedCases`.
 */
function sortCaseRowsByPreparedOrder(caseRows, preparedEvals) {
	const positionByKey = /* @__PURE__ */ new Map();
	let position = 0;
	for (const { evalMeta, preparedCases } of preparedEvals) {
		for (const { caseId } of preparedCases) {
			positionByKey.set(`${evalMeta.key}\u0000${caseId}`, position);
			position += 1;
		}
	}
	const positionOf = (row) => positionByKey.get(getPreparedCaseOrderKey(row)) ?? Number.MAX_SAFE_INTEGER;
	caseRows.sort((left, right) => positionOf(left) - positionOf(right));
}
1300
/**
 * Execute one run end to end: prepare every targeted eval, run the queued
 * case trials, finalize results, persist the run state and emit run events.
 *
 * Phases:
 * 1. Validate the tag filters and emit `"run.started"`.
 * 2. For each target eval, fingerprint its source, load it in isolation,
 *    resolve tags/cases/columns/charts and queue one entry per case trial.
 *    A failure while preparing one eval is recorded and does not stop the
 *    remaining evals.
 * 3. Run the queue, unless the `caseIds` target is ambiguous across evals.
 * 4. Finalize cases, update the status maps, persist, and emit
 *    `"run.summary"` followed by `"run.finished"` or `"run.error"`.
 *
 * Any error that escapes these phases marks the run as `"error"`, persists
 * it and emits `"run.error"`; this function does not rethrow it.
 *
 * Note that the `getTargetEvals` parameter shadows the module-level function
 * of the same name and is called with the request only.
 */
async function executeRun({ runState, request, runDir, config, cacheStore, lastRunStatusMap, latestRunInfoMap, emitEvent, emitDiscoveryEvent, workspaceRoot, getSourceFingerprint, getConfiguredConcurrency, getSortedEvalMetas, getTargetEvals, onCaseFinished }) {
	try {
		const tagsFilterError = validateTagsFilters(request.target.tagsFilter);
		if (tagsFilterError !== null) throw new Error(tagsFilterError);
		const targetEvals = getTargetEvals(request);
		emitEvent(runState, {
			type: "run.started",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: runState.manifest
		});
		const evalErrors = [];
		const queuedCases = [];
		const preparedEvals = [];
		const cacheMode = runState.manifest.cacheMode ?? "use";
		// The cache is enabled unless the config explicitly sets `enabled: false`.
		const cacheEnabled = config.cache?.enabled !== false;
		for (const evalMeta of targetEvals) {
			const evalFilePath = evalMeta.sourceFilePath;
			const evalModuleIsolation = buildEvalPreparationModuleIsolation({
				runId: runState.manifest.id,
				evalKey: evalMeta.key,
				workspaceRoot
			});
			// An unreadable source file leaves the fingerprint empty instead of failing the eval here.
			let sourceFingerprint = "";
			try {
				sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
			} catch {
				sourceFingerprint = "";
			}
			if (sourceFingerprint.length > 0) {
				runState.manifest.evalSourceFingerprints[evalMeta.key] = sourceFingerprint;
				evalMeta.sourceFingerprint = sourceFingerprint;
			} else {
				delete runState.manifest.evalSourceFingerprints[evalMeta.key];
				evalMeta.sourceFingerprint = null;
			}
			try {
				const entry = (await loadIsolatedEvalRegistry({
					evalFilePath,
					sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
					moduleIsolation: evalModuleIsolation,
					runtimeScope: "env"
				})).get(evalMeta.id);
				if (!entry) {
					evalErrors.push({
						evalId: evalMeta.id,
						details: `Eval "${evalMeta.id}" was not registered after importing ${evalFilePath}`
					});
					continue;
				}
				await runWithModuleIsolation(evalModuleIsolation, async () => {
					await runInEvalRuntimeScope("cases", async () => {
						await entry.use(async (evalDef) => {
							const evalTagsResult = resolveEvalTags({
								configTags: config.tags,
								evalDef,
								evalId: evalMeta.id,
								filePath: evalMeta.filePath
							});
							throwIfDiscoveryIssues(evalTagsResult.issues);
							evalMeta.tags = evalTagsResult.tags;
							if (evalDef.manualInput && evalDef.cases !== void 0) throw new Error(`Eval "${evalMeta.id}" cannot declare both "cases" and "manualInput". Remove one of them.`);
							let manualInputCase = null;
							if (evalDef.manualInput) {
								const manualTags = evalTagsResult.tags;
								// A manual-input eval excluded by the tag filter is recorded as having one case but nothing is queued.
								if (!filterEvalCasesByTags([{
									id: `${evalMeta.id}-manual`,
									input: {},
									tags: manualTags
								}], request.target.tagsFilter).length) {
									evalMeta.caseCount = 1;
									evalMeta.caseIds = [`${evalMeta.id}-manual`];
									return;
								}
								const rawValue = request.manualInputs?.[evalMeta.key];
								if (rawValue === void 0) throw new Error(`Eval "${evalMeta.id}" requires manual input. Provide it via the run modal in the web UI or "--input" / "--input-file" on the CLI.`);
								const parsed = parseManualInputValues(evalDef.manualInput, rawValue);
								if (parsed.error) {
									const formatted = parsed.error.issues.map((issue) => issue.path ? `${issue.path}: ${issue.message}` : issue.message).join("; ");
									throw new Error(`Invalid manual input for eval "${evalMeta.id}": ${formatted}`);
								}
								manualInputCase = {
									id: `${evalMeta.id}-manual`,
									input: parsed.value,
									tags: manualTags
								};
							}
							// A `cases` function is not invoked when the eval-level tags already fail the tag filter.
							const evalCases = manualInputCase ? [manualInputCase] : typeof evalDef.cases === "function" && !evalTagsMatchFilter({
								tags: evalTagsResult.tags,
								tagsFilter: request.target.tagsFilter
							}) ? [] : await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime });
							const runnableCases = (manualInputCase ? evalCases : resolveRunnableEvalCases({
								cases: evalCases,
								evalId: evalMeta.id
							})).map((evalCase) => ({
								...evalCase,
								tags: resolveCaseTags({
									evalTags: evalTagsResult.tags,
									evalCase,
									evalId: evalMeta.id,
									filePath: evalMeta.filePath
								})
							}));
							const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
							if (duplicateCaseIds.length > 0) throw new Error(`Duplicate case id${duplicateCaseIds.length === 1 ? "" : "s"} in ${evalMeta.filePath}#${evalMeta.id}: ${duplicateCaseIds.join(", ")}`);
							const cases = filterEvalCasesByTags(filterEvalCases(runnableCases, request.target.caseIds), request.target.tagsFilter);
							// The eval metadata counts every runnable case; the run summary counts only the cases selected for this run.
							evalMeta.caseCount = runnableCases.length;
							evalMeta.caseIds = runnableCases.map((evalCase) => evalCase.id);
							runState.summary.totalCases += cases.length;
							const defaultConfig = resolveEvalDefaultConfig({
								evalDef,
								globalColumns: config.columns,
								globalStats: config.stats,
								globalDefaultStatAggregate: config.defaultStatAggregate,
								globalRemove: config.removeDefaultConfig
							});
							const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
							const validatedCharts = validateCharts({
								charts: defaultConfig.charts,
								columnDefs: declaredColumnDefs,
								evalId: evalMeta.id
							});
							for (const warning of validatedCharts.warnings) console.warn(warning);
							evalMeta.columnDefs = declaredColumnDefs;
							evalMeta.stats = defaultConfig.stats;
							evalMeta.defaultStatAggregate = defaultConfig.defaultStatAggregate;
							evalMeta.charts = validatedCharts.charts;
							const evalCaseRows = [];
							const preparedCases = [];
							const scoreKeys = Object.freeze(Object.keys(evalDef.scores ?? {}));
							const manualScoreKeys = Object.freeze(Object.keys(evalDef.manualScores ?? {}));
							const preparedEval = {
								evalMeta,
								evalCaseRows,
								preparedCases,
								scoreKeys: Object.freeze([...scoreKeys, ...manualScoreKeys])
							};
							preparedEvals.push(preparedEval);
							for (const evalCase of cases) {
								const trialResults = [];
								const preparedCase = {
									caseId: evalCase.id,
									trialResults,
									finalized: false
								};
								preparedCases.push(preparedCase);
								// One queue entry per trial; each trial runs in its own case child process.
								for (let trial = 0; trial < request.trials; trial++) queuedCases.push({
									execute: async ({ startTime, globalTraceDisplay }) => await executeCaseChild({
										evalId: evalMeta.id,
										evalKey: evalMeta.key,
										evalFilePath,
										evalFileRelativePath: evalMeta.filePath,
										sourceFingerprint: toOptionalSourceFingerprint(sourceFingerprint),
										evalCase,
										trial,
										startTime,
										cacheMode,
										cacheEnabled,
										globalTraceDisplay,
										workspaceRoot,
										artifactDir: join(runDir, "artifacts"),
										runId: runState.manifest.id
									}),
									onComplete: async ({ caseDetail, caseRow, pendingCacheWrites }) => {
										trialResults.push({
											caseDetail,
											caseRow,
											pendingCacheWrites
										});
										// The case is finalized as soon as its last trial has reported.
										if (trialResults.length !== request.trials) return;
										await finalizePreparedCase({
											runState,
											runDir,
											cacheStore,
											preparedEval,
											preparedCase,
											onCaseFinished,
											emitEvent
										});
									}
								});
							}
						});
					});
				});
			} catch (error) {
				// A failure while preparing one eval is recorded; the remaining evals are still prepared.
				console.error(`Error running eval ${evalMeta.id}:`, error);
				evalErrors.push({
					evalId: evalMeta.id,
					details: formatUnknownErrorDetails(error)
				});
				lastRunStatusMap.set(evalMeta.key, "error");
				latestRunInfoMap.set(evalMeta.key, {
					status: "error",
					startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
					commitSha: runState.manifest.commitSha ?? null,
					evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalMeta.key] ?? null
				});
			}
		}
		const ambiguousCaseTargets = request.target.caseIds && request.target.caseIds.length > 0 ? findAmbiguousTargetCaseIds(preparedEvals) : [];
		if (ambiguousCaseTargets.length > 0) {
			// When a targeted case id exists in several evals, nothing is run and the ambiguity is reported as an error.
			queuedCases.length = 0;
			evalErrors.push({
				evalId: "target",
				details: `Ambiguous --case target. Narrow it with --file and/or --eval: ${ambiguousCaseTargets.join("; ")}`
			});
		} else await executeQueuedCases({
			queuedCases,
			concurrency: getConfiguredConcurrency(),
			globalTraceDisplay: config.traceDisplay
		});
		for (const preparedEval of preparedEvals) {
			// `finalizePreparedCase` returns immediately for cases that are already finalized or have no trial results.
			for (const preparedCase of preparedEval.preparedCases) await finalizePreparedCase({
				runState,
				runDir,
				cacheStore,
				preparedEval,
				preparedCase,
				onCaseFinished,
				emitEvent
			});
			lastRunStatusMap.set(preparedEval.evalMeta.key, toLastRunStatus(deriveStatusFromCaseRows({ caseRows: preparedEval.evalCaseRows })));
			const latestStatus = lastRunStatusMap.get(preparedEval.evalMeta.key) ?? null;
			latestRunInfoMap.set(preparedEval.evalMeta.key, {
				status: latestStatus,
				startedAt: runState.manifest.endedAt ?? runState.manifest.startedAt,
				commitSha: runState.manifest.commitSha ?? null,
				evalSourceFingerprint: runState.manifest.evalSourceFingerprints[preparedEval.evalMeta.key] ?? null
			});
		}
		// Rows were recorded in completion order; restore the prepared order.
		sortCaseRowsByPreparedOrder(runState.cases, preparedEvals);
		for (const preparedEval of preparedEvals) sortCaseRowsByPreparedOrder(preparedEval.evalCaseRows, preparedEvals);
		const endTime = /* @__PURE__ */ new Date();
		runState.summary.totalDurationMs = endTime.getTime() - new Date(runState.manifest.startedAt).getTime();
		// Any recorded eval error makes the whole run an "error" run.
		const finalStatus = evalErrors.length > 0 ? "error" : "completed";
		runState.summary.status = finalStatus;
		runState.manifest.status = finalStatus;
		const completedRunAt = endTime.toISOString();
		runState.manifest.endedAt = completedRunAt;
		runState.summary.errorMessage = evalErrors.length > 0 ? buildRunErrorMessage(evalErrors) : null;
		// Refresh the latest-run info of every eval the request targets, including evals that produced no status above.
		for (const evalKey of getTargetEvalKeys({
			request,
			sortedEvals: getSortedEvalMetas()
		})) {
			const latestStatus = lastRunStatusMap.get(evalKey) ?? toLastRunStatus(deriveStatusFromCaseRows({
				caseRows: [],
				lifecycleStatus: runState.manifest.status
			}));
			latestRunInfoMap.set(evalKey, {
				status: latestStatus,
				startedAt: completedRunAt,
				commitSha: runState.manifest.commitSha ?? null,
				evalSourceFingerprint: runState.manifest.evalSourceFingerprints[evalKey] ?? null
			});
		}
		await persistRunState(runState);
		emitEvent(runState, {
			type: "run.summary",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: runState.summary
		});
		if (finalStatus === "error") emitEvent(runState, {
			type: "run.error",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: { message: buildRunErrorMessage(evalErrors) }
		});
		else emitEvent(runState, {
			type: "run.finished",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: runState.summary
		});
		emitDiscoveryEvent();
	} catch (error) {
		// Reached for failures outside per-eval preparation, e.g. an invalid tag filter or a failing queued case.
		const message = formatUnknownErrorDetails(error);
		runState.manifest.status = "error";
		runState.manifest.endedAt = (/* @__PURE__ */ new Date()).toISOString();
		runState.summary.status = "error";
		runState.summary.errorMessage = message;
		await persistRunState(runState);
		emitEvent(runState, {
			type: "run.error",
			runId: runState.manifest.id,
			timestamp: (/* @__PURE__ */ new Date()).toISOString(),
			payload: { message }
		});
		emitDiscoveryEvent();
	}
}
1592
/**
 * Normalizes a derived status before it is recorded as an eval's last-run status.
 *
 * The "pending" status is mapped to null; any other value is handed back untouched.
 *
 * @param status - Status value to normalize.
 * @returns null when `status` is "pending", otherwise `status` itself.
 */
function toLastRunStatus(status) {
	// Guard clause: "pending" is the only value that gets rewritten.
	if (status === "pending") return null;
	return status;
}
1595
+ //#endregion
1596
+ export { validateCharts as C, parseEvalDiscovery as S, runTouchesEval as _, validateTagsFilters as a, deriveEvalFreshness as b, getLatestRunInfos as c, nextShortIdFromSnapshots as d, persistCaseDetail as f, recomputePersistedCaseStatus as g, recomputeEvalStatusesInRuns as h, resolveEvalTags as i, loadPersistedRunSnapshot as l, persistRunState as m, getTargetEvalKeys as n, generateRunId as o, deleteTemporaryRuns as p, getTargetEvals as r, getLastRunStatuses as s, executeRun as t, loadPersistedRunSnapshots as u, buildManualInputDescriptor as v, loadIsolatedEvalRegistry as x, parseManualInputValues as y };