agentv 2.6.0 → 2.7.1-next.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.md +128 -33
  2. package/dist/chunk-3L2L5GIL.js +51 -0
  3. package/dist/{chunk-BKMQNEUD.js.map → chunk-3L2L5GIL.js.map} +1 -1
  4. package/dist/{chunk-BKMQNEUD.js → chunk-5H446C7X.js} +2 -45
  5. package/dist/{chunk-LJVS3JAK.js → chunk-BL4PVUAT.js} +6 -4
  6. package/dist/{chunk-LJVS3JAK.js.map → chunk-BL4PVUAT.js.map} +1 -1
  7. package/dist/{chunk-MGK6HHRR.js → chunk-BWLYFF5N.js} +9813 -11237
  8. package/dist/chunk-BWLYFF5N.js.map +1 -0
  9. package/dist/chunk-C5GOHBQM.js +84 -0
  10. package/dist/chunk-C5GOHBQM.js.map +1 -0
  11. package/dist/chunk-EJEG3DU2.js +5476 -0
  12. package/dist/chunk-EJEG3DU2.js.map +1 -0
  13. package/dist/chunk-FV32QHPB.js +565 -0
  14. package/dist/chunk-FV32QHPB.js.map +1 -0
  15. package/dist/chunk-H5FFZCKI.js +2957 -0
  16. package/dist/chunk-H5FFZCKI.js.map +1 -0
  17. package/dist/chunk-JK6V4KVD.js +114 -0
  18. package/dist/chunk-JK6V4KVD.js.map +1 -0
  19. package/dist/chunk-LRULMAAA.js +1711 -0
  20. package/dist/chunk-LRULMAAA.js.map +1 -0
  21. package/dist/chunk-SR4I5KET.js +1238 -0
  22. package/dist/chunk-SR4I5KET.js.map +1 -0
  23. package/dist/chunk-VQ2ZO7XJ.js +2098 -0
  24. package/dist/chunk-VQ2ZO7XJ.js.map +1 -0
  25. package/dist/chunk-XALGXSKB.js +21 -0
  26. package/dist/chunk-XALGXSKB.js.map +1 -0
  27. package/dist/cli.js +8 -2
  28. package/dist/cli.js.map +1 -1
  29. package/dist/dist-R3OCWGXH.js +257 -0
  30. package/dist/dist-R3OCWGXH.js.map +1 -0
  31. package/dist/esm-5Q4BZALM-5REQWAUV.js +924 -0
  32. package/dist/esm-5Q4BZALM-5REQWAUV.js.map +1 -0
  33. package/dist/esm-DX3WQKEN.js +32 -0
  34. package/dist/esm-DX3WQKEN.js.map +1 -0
  35. package/dist/esm-QNEMCJPL.js +933 -0
  36. package/dist/esm-QNEMCJPL.js.map +1 -0
  37. package/dist/esm-R77SNOF5.js +65 -0
  38. package/dist/esm-R77SNOF5.js.map +1 -0
  39. package/dist/esm-RVQPUGWH.js +1207 -0
  40. package/dist/esm-RVQPUGWH.js.map +1 -0
  41. package/dist/getMachineId-bsd-HSK5LZMG.js +41 -0
  42. package/dist/getMachineId-bsd-HSK5LZMG.js.map +1 -0
  43. package/dist/getMachineId-darwin-4DP6CCJV.js +41 -0
  44. package/dist/getMachineId-darwin-4DP6CCJV.js.map +1 -0
  45. package/dist/getMachineId-linux-44LJ5UJB.js +33 -0
  46. package/dist/getMachineId-linux-44LJ5UJB.js.map +1 -0
  47. package/dist/getMachineId-unsupported-NVK6IATM.js +24 -0
  48. package/dist/getMachineId-unsupported-NVK6IATM.js.map +1 -0
  49. package/dist/getMachineId-win-YZ36S7VA.js +43 -0
  50. package/dist/getMachineId-win-YZ36S7VA.js.map +1 -0
  51. package/dist/index.js +10 -2
  52. package/dist/interactive-33TCZXLF.js +333 -0
  53. package/dist/interactive-33TCZXLF.js.map +1 -0
  54. package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js +9 -0
  55. package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js.map +1 -0
  56. package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js +9 -0
  57. package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js.map +1 -0
  58. package/dist/src-2N5EJ2N6.js +1733 -0
  59. package/dist/src-2N5EJ2N6.js.map +1 -0
  60. package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +84 -0
  61. package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +144 -0
  62. package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +67 -0
  63. package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +101 -0
  64. package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +433 -0
  65. package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +36 -0
  66. package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +118 -0
  67. package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +251 -0
  68. package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +77 -0
  69. package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +50 -0
  70. package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +78 -0
  71. package/dist/templates/.agentv/.env.example +23 -23
  72. package/dist/templates/.agentv/config.yaml +15 -15
  73. package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +38 -13
  74. package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +9 -6
  75. package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +4 -4
  76. package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +7 -9
  77. package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
  78. package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
  79. package/dist/{token-D3IYDJQZ.js → token-POXF46NU.js} +6 -4
  80. package/dist/{token-D3IYDJQZ.js.map → token-POXF46NU.js.map} +1 -1
  81. package/dist/{token-util-FWFPR2BV.js → token-util-6GWYZWGE.js} +4 -3
  82. package/dist/token-util-6GWYZWGE.js.map +1 -0
  83. package/package.json +7 -3
  84. package/dist/chunk-MGK6HHRR.js.map +0 -1
  85. /package/dist/{token-util-FWFPR2BV.js.map → chunk-5H446C7X.js.map} +0 -0
@@ -0,0 +1,2957 @@
1
+ import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
+ import {
3
+ CLI_PLACEHOLDERS,
4
+ KNOWN_PROVIDERS,
5
+ PROVIDER_ALIASES,
6
+ ResponseCache,
7
+ buildDirectoryChain,
8
+ buildSearchRoots,
9
+ ensureVSCodeSubagents,
10
+ findGitRoot,
11
+ isEvaluatorKind,
12
+ listTargetNames,
13
+ loadTestSuite,
14
+ loadTsConfig,
15
+ normalizeLineEndings,
16
+ readTargetDefinitions,
17
+ readTestSuiteMetadata,
18
+ resolveFileReference,
19
+ resolveTargetDefinition,
20
+ runEvaluation,
21
+ shouldEnableCache,
22
+ shouldSkipCacheForTemperature,
23
+ subscribeToCodexLogEntries,
24
+ subscribeToCopilotSdkLogEntries,
25
+ subscribeToPiLogEntries
26
+ } from "./chunk-BWLYFF5N.js";
27
+
28
+ // src/commands/eval/shared.ts
29
+ import { constants } from "node:fs";
30
+ import { access, stat } from "node:fs/promises";
31
+ import path from "node:path";
32
+ import fg from "fast-glob";
33
/**
 * Expands eval path arguments (literal paths or glob patterns) into a sorted,
 * de-duplicated list of eval file paths.
 *
 * @param {Array<string|null|undefined>} evalPaths - Raw path/glob arguments; blank entries are ignored.
 * @param {string} cwd - Directory used to resolve relative paths and to run globs from.
 * @returns {Promise<string[]>} Matching `.yaml`, `.yml` or `.jsonl` files, sorted.
 * @throws {Error} When no usable input is given, or when any input matches no eval file.
 */
async function resolveEvalPaths(evalPaths, cwd) {
  const evalFilePattern = /\.(ya?ml|jsonl)$/i;
  const patterns = [];
  for (const raw of evalPaths) {
    const trimmed = raw?.trim();
    if (trimmed) {
      patterns.push(trimmed);
    }
  }
  if (patterns.length === 0) {
    throw new Error("No eval paths provided.");
  }
  const withoutMatches = [];
  const collected = /* @__PURE__ */ new Set();
  for (const pattern of patterns) {
    const literalPath = path.isAbsolute(pattern) ? path.normalize(pattern) : path.resolve(cwd, pattern);
    // A stat failure only means the argument is not a literal file; it is then tried as a glob.
    const isLiteralEvalFile = await stat(literalPath).then(
      (info) => info.isFile() && evalFilePattern.test(literalPath),
      () => false
    );
    if (isLiteralEvalFile) {
      collected.add(literalPath);
      continue;
    }
    // The glob is given forward slashes even when the argument used backslashes.
    const globbed = await fg(pattern.replace(/\\/g, "/"), {
      cwd,
      absolute: true,
      onlyFiles: true,
      unique: true,
      dot: true,
      followSymbolicLinks: true
    });
    const evalFiles = globbed.filter((filePath) => evalFilePattern.test(filePath));
    if (evalFiles.length === 0) {
      withoutMatches.push(pattern);
      continue;
    }
    evalFiles.forEach((filePath) => collected.add(path.normalize(filePath)));
  }
  if (withoutMatches.length > 0) {
    const listed = withoutMatches.join(", ");
    throw new Error(
      `No eval files matched: ${listed}. Provide YAML or JSONL paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl").`
    );
  }
  return [...collected].sort();
}
79
/**
 * Walks upwards from `start` looking for a directory that contains a `.git` entry.
 *
 * @param {string} start - Path to begin the search from.
 * @returns {Promise<string>} The first ancestor (or `start` itself) containing `.git`;
 *   the resolved `start` when the filesystem root is reached without finding one.
 */
async function findRepoRoot(start) {
  const origin = path.resolve(start);
  let dir = origin;
  for (; ; ) {
    const hasGitEntry = await access(path.join(dir, ".git"), constants.F_OK).then(
      () => true,
      () => false
    );
    if (hasGitEntry) {
      return dir;
    }
    const parentDir = path.dirname(dir);
    // dirname() of the filesystem root is the root itself: nothing left to search.
    if (parentDir === dir) {
      return origin;
    }
    dir = parentDir;
  }
}
97
+
98
+ // src/utils/targets.ts
99
+ import { constants as constants2 } from "node:fs";
100
+ import { access as access2 } from "node:fs/promises";
101
+ import path2 from "node:path";
102
// Relative locations probed, in order, inside each candidate directory.
var TARGET_FILE_CANDIDATES = [
  "targets.yaml",
  "targets.yml",
  path2.join(".agentv", "targets.yaml"),
  path2.join(".agentv", "targets.yml")
];

/**
 * Reports whether `filePath` exists (file or directory).
 *
 * @param {string} filePath
 * @returns {Promise<boolean>} `true` when `access(F_OK)` succeeds, `false` otherwise.
 */
async function fileExists(filePath) {
  try {
    await access2(filePath, constants2.F_OK);
    return true;
  } catch {
    return false;
  }
}

/**
 * Locates the targets definition file.
 *
 * With `explicitPath`: a non-directory path is returned as-is; a directory is
 * searched for the known candidate names. Without it: every directory yielded by
 * `buildDirectoryChain(testFilePath, repoRoot)`, followed by `cwd`, is searched.
 *
 * @param {{explicitPath?: string, testFilePath: string, repoRoot: string, cwd: string}} options
 * @returns {Promise<string>} Path of the first targets file found.
 * @throws {Error} When no targets file can be located.
 */
async function discoverTargetsFile(options) {
  const { explicitPath, testFilePath, repoRoot, cwd } = options;
  if (explicitPath) {
    const resolvedExplicit = path2.resolve(explicitPath);
    // A plain existence check also succeeds for directories, which previously made
    // an explicit directory be returned as if it were the targets file itself.
    const explicitStats = await stat(resolvedExplicit).catch(() => void 0);
    if (explicitStats && !explicitStats.isDirectory()) {
      return resolvedExplicit;
    }
    if (explicitStats) {
      for (const candidate of TARGET_FILE_CANDIDATES) {
        const nested = path2.join(resolvedExplicit, candidate);
        if (await fileExists(nested)) {
          return nested;
        }
      }
    }
    throw new Error(`targets.yaml not found at provided path: ${resolvedExplicit}`);
  }
  const directories = [...buildDirectoryChain(testFilePath, repoRoot)];
  const resolvedCwd = path2.resolve(cwd);
  if (!directories.includes(resolvedCwd)) {
    directories.push(resolvedCwd);
  }
  for (const directory of directories) {
    for (const candidate of TARGET_FILE_CANDIDATES) {
      const fullPath = path2.join(directory, candidate);
      if (await fileExists(fullPath)) {
        return fullPath;
      }
    }
  }
  throw new Error("Unable to locate targets.yaml. Use --targets to specify the file explicitly.");
}
146
+
147
+ // src/commands/eval/run-eval.ts
148
+ import { constants as constants4 } from "node:fs";
149
+ import { access as access4 } from "node:fs/promises";
150
+ import path10 from "node:path";
151
+ import { pathToFileURL } from "node:url";
152
+
153
+ // src/commands/eval/env.ts
154
+ import { constants as constants3 } from "node:fs";
155
+ import { access as access3 } from "node:fs/promises";
156
+ import path3 from "node:path";
157
+ import { config as loadDotenv } from "dotenv";
158
/**
 * Resolves every directory to an absolute path and drops duplicates,
 * keeping the first occurrence of each.
 *
 * @param {Iterable<string>} directories
 * @returns {string[]} Absolute paths in first-seen order.
 */
function uniqueDirs(directories) {
  const absolutePaths = Array.from(directories, (dir) => path3.resolve(dir));
  // A Set iterates in insertion order, so first occurrences keep their position.
  return [...new Set(absolutePaths)];
}
171
/**
 * Reports whether `filePath` exists.
 *
 * @param {string} filePath
 * @returns {Promise<boolean>} `true` when `access(F_OK)` succeeds, `false` on any failure.
 */
async function fileExists2(filePath) {
  return access3(filePath, constants3.F_OK).then(
    () => true,
    () => false
  );
}
179
/**
 * Lists `start` and each of its ancestors, stopping after `boundary` is added
 * or once the filesystem root has been added.
 *
 * @param {string} start - Directory to begin from.
 * @param {string} boundary - Directory at which the walk stops (inclusive).
 * @returns {string[]} Absolute directories ordered from `start` upwards.
 */
function collectAncestorDirectories(start, boundary) {
  const stopAt = path3.resolve(boundary);
  const chain = [path3.resolve(start)];
  for (; ; ) {
    const tail = chain[chain.length - 1];
    if (tail === stopAt) {
      break;
    }
    const parentDir = path3.dirname(tail);
    // dirname() of the root returns the root: there is nothing further up.
    if (parentDir === tail) {
      break;
    }
    chain.push(parentDir);
  }
  return chain;
}
196
/**
 * Loads every `.env` file found between the test file's directory and the repo
 * root, plus the repo root itself and the current working directory.
 *
 * @param {{testFilePath: string, repoRoot: string, verbose?: boolean}} options
 * @returns {Promise<string|undefined>} The first `.env` found in search order
 *   (starting at the test file's directory), or `undefined` when none exists.
 */
async function loadEnvFromHierarchy(options) {
  const { testFilePath, repoRoot, verbose } = options;
  const startDir = path3.dirname(path3.resolve(testFilePath));
  const searchDirs = uniqueDirs([
    ...collectAncestorDirectories(startDir, repoRoot),
    repoRoot,
    process.cwd()
  ]);
  const envFiles = [];
  for (const dir of searchDirs) {
    const envPath = path3.join(dir, ".env");
    if (await fileExists2(envPath)) {
      envFiles.push(envPath);
    }
  }
  if (envFiles.length === 0) {
    if (verbose) {
      console.log("No .env file found in hierarchy");
    }
    return void 0;
  }
  // Files are loaded from the end of the search list back to the start.
  // NOTE(review): with `override: false` dotenv is documented to leave already-set
  // variables untouched, so files loaded earlier in this pass would take
  // precedence over the returned first file - confirm this is intended.
  for (const envFile of [...envFiles].reverse()) {
    loadDotenv({ path: envFile, override: false });
    if (verbose) {
      console.log(`Loaded environment from: ${envFile}`);
    }
  }
  return envFiles[0];
}
223
+
224
+ // src/commands/eval/output-writer.ts
225
+ import path8 from "node:path";
226
+
227
+ // src/commands/eval/json-writer.ts
228
+ import { mkdir, writeFile } from "node:fs/promises";
229
+ import path4 from "node:path";
230
+
231
+ // src/utils/case-conversion.ts
232
/**
 * Converts a camelCase identifier to snake_case.
 *
 * @param {string} str - Key to convert.
 * @returns {string} The converted key; strings starting with an uppercase
 *   ASCII letter are returned unchanged.
 */
function toSnakeCase(str) {
  if (/^[A-Z]/.test(str)) {
    return str;
  }
  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
}

/**
 * Recursively rewrites the keys of objects (including objects nested in arrays)
 * with `toSnakeCase`. Primitives, `null` and `undefined` are returned as-is.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} A new structure with converted keys; the input is not modified.
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  if (typeof obj === "object") {
    // Object.fromEntries defines own properties, so a key such as "__proto__"
    // (possible in parsed JSON) is kept as data instead of replacing the prototype.
    return Object.fromEntries(
      Object.entries(obj).map(([key, value]) => [toSnakeCase(key), toSnakeCaseDeep(value)])
    );
  }
  return obj;
}
255
+
256
+ // src/commands/eval/json-writer.ts
257
/**
 * Buffers evaluation results in memory and writes them, together with
 * pass/fail statistics, as one pretty-printed JSON document on close().
 */
var JsonWriter = class _JsonWriter {
  filePath;
  results = [];
  closed = false;
  /** @param {string} filePath - Destination of the JSON report. */
  constructor(filePath) {
    this.filePath = filePath;
  }
  /**
   * Creates the destination directory (recursively) and returns a writer.
   * @param {string} filePath
   * @returns {Promise<JsonWriter>}
   */
  static async open(filePath) {
    const parentDir = path4.dirname(filePath);
    await mkdir(parentDir, { recursive: true });
    return new _JsonWriter(filePath);
  }
  /**
   * Buffers one result.
   * @throws {Error} When the writer is already closed.
   */
  async append(result) {
    if (this.closed) {
      throw new Error("Cannot write to closed JSON writer");
    }
    this.results.push(result);
  }
  /** Writes the report file; calls after the first one are no-ops. */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    const total = this.results.length;
    // A result counts as passed when its score is at least 0.5.
    let passed = 0;
    for (const entry of this.results) {
      if (entry.score >= 0.5) {
        passed += 1;
      }
    }
    const report = toSnakeCaseDeep({
      stats: {
        total,
        passed,
        failed: total - passed,
        passRate: total === 0 ? 0 : passed / total
      },
      results: this.results
    });
    const serialized = JSON.stringify(report, null, 2);
    await writeFile(this.filePath, `${serialized}\n`, "utf8");
  }
};
296
+
297
+ // src/commands/eval/jsonl-writer.ts
298
+ import { createWriteStream } from "node:fs";
299
+ import { mkdir as mkdir2 } from "node:fs/promises";
300
+ import path5 from "node:path";
301
+ import { finished } from "node:stream/promises";
302
+
303
+ // ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
304
// Shared error instances of the bundled async-mutex module.
var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
var E_ALREADY_LOCKED = new Error("mutex already locked");
// Default rejection reason used by Semaphore#cancel().
var E_CANCELED = new Error("request for lock canceled");
307
/**
 * Drives a generator function as an async routine: every yielded value is
 * awaited and its outcome is fed back through `next` / `throw`.
 *
 * @param {*} thisArg - `this` for the generator function.
 * @param {ArrayLike<*>} [_arguments] - Arguments passed to the generator function.
 * @param {PromiseConstructor} [P] - Promise implementation; defaults to `Promise`.
 * @param {GeneratorFunction} generator - Generator function to run.
 * @returns {Promise<*>} Settles with the generator's return value or thrown error.
 */
var __awaiter$2 = function(thisArg, _arguments, P, generator) {
  const adopt = (value) => value instanceof P ? value : new P((resolve) => {
    resolve(value);
  });
  return new (P || (P = Promise))((resolve, reject) => {
    const step = (result) => {
      if (result.done) {
        resolve(result.value);
      } else {
        adopt(result.value).then(onFulfilled, onRejected);
      }
    };
    // Builds the continuation that resumes the generator via `next` or `throw`.
    const resumeWith = (method) => (value) => {
      try {
        step(generator[method](value));
      } catch (e) {
        reject(e);
      }
    };
    const onFulfilled = resumeWith("next");
    const onRejected = resumeWith("throw");
    step((generator = generator.apply(thisArg, _arguments || [])).next());
  });
};
334
/**
 * Weighted, priority-aware counting semaphore (bundled from async-mutex).
 * `_value` is the remaining capacity; `_queue` holds pending acquirers ordered
 * by descending priority; `_weightedWaiters[w - 1]` holds waitForUnlock()
 * callers waiting for capacity `w`.
 */
var Semaphore = class {
  /**
   * @param {number} _value - Initial capacity.
   * @param {Error} [_cancelError] - Rejection reason used by cancel().
   */
  constructor(_value, _cancelError = E_CANCELED) {
    this._value = _value;
    this._cancelError = _cancelError;
    this._queue = [];
    this._weightedWaiters = [];
  }
  /**
   * Requests `weight` units of capacity.
   * @returns {Promise<[number, Function]>} Capacity before acquisition and a releaser.
   * @throws {Error} When `weight` is not positive.
   */
  acquire(weight = 1, priority = 0) {
    if (weight <= 0) {
      throw new Error(`invalid weight ${weight}: must be positive`);
    }
    return new Promise((resolve, reject) => {
      const task = { resolve, reject, weight, priority };
      // Index of the last queued task whose priority is at least this one's.
      const insertAfter = findIndexFromEnd(this._queue, (queued) => priority <= queued.priority);
      const canRunNow = insertAfter === -1 && weight <= this._value;
      if (canRunNow) {
        this._dispatchItem(task);
      } else {
        this._queue.splice(insertAfter + 1, 0, task);
      }
    });
  }
  /** Acquires, runs `callback(value)`, and always releases afterwards. */
  runExclusive(callback_1) {
    return __awaiter$2(this, arguments, void 0, function* (callback, weight = 1, priority = 0) {
      const lease = yield this.acquire(weight, priority);
      const capacityBefore = lease[0];
      const release = lease[1];
      try {
        return yield callback(capacityBefore);
      } finally {
        release();
      }
    });
  }
  /** Resolves once `weight` units could be acquired, without acquiring them. */
  waitForUnlock(weight = 1, priority = 0) {
    if (weight <= 0) {
      throw new Error(`invalid weight ${weight}: must be positive`);
    }
    if (this._couldLockImmediately(weight, priority)) {
      return Promise.resolve();
    }
    return new Promise((resolve) => {
      const slot = weight - 1;
      if (!this._weightedWaiters[slot]) {
        this._weightedWaiters[slot] = [];
      }
      insertSorted(this._weightedWaiters[slot], { resolve, priority });
    });
  }
  isLocked() {
    return this._value <= 0;
  }
  getValue() {
    return this._value;
  }
  setValue(value) {
    this._value = value;
    this._dispatchQueue();
  }
  /** Returns `weight` units of capacity and wakes whoever can now proceed. */
  release(weight = 1) {
    if (weight <= 0) {
      throw new Error(`invalid weight ${weight}: must be positive`);
    }
    this._value += weight;
    this._dispatchQueue();
  }
  /** Rejects every queued acquirer with the cancel error and empties the queue. */
  cancel() {
    for (const entry of this._queue) {
      entry.reject(this._cancelError);
    }
    this._queue = [];
  }
  _dispatchQueue() {
    this._drainUnlockWaiters();
    for (; ; ) {
      const head = this._queue[0];
      if (head === void 0 || !(head.weight <= this._value)) {
        break;
      }
      this._dispatchItem(this._queue.shift());
      this._drainUnlockWaiters();
    }
  }
  _dispatchItem(item) {
    const previousValue = this._value;
    this._value -= item.weight;
    item.resolve([previousValue, this._newReleaser(item.weight)]);
  }
  _newReleaser(weight) {
    // The releaser is single-use: later calls are ignored.
    let released = false;
    return () => {
      if (released) {
        return;
      }
      released = true;
      this.release(weight);
    };
  }
  _drainUnlockWaiters() {
    const notify = (waiter) => waiter.resolve();
    if (this._queue.length === 0) {
      // Nothing queued: wake every waiter whose weight fits the current capacity.
      for (let weight = this._value; weight > 0; weight--) {
        const waiters = this._weightedWaiters[weight - 1];
        if (!waiters) {
          continue;
        }
        waiters.forEach(notify);
        this._weightedWaiters[weight - 1] = [];
      }
      return;
    }
    // Otherwise only wake waiters with a higher priority than the queue head.
    const queuedPriority = this._queue[0].priority;
    for (let weight = this._value; weight > 0; weight--) {
      const waiters = this._weightedWaiters[weight - 1];
      if (!waiters) {
        continue;
      }
      const cutoff = waiters.findIndex((waiter) => waiter.priority <= queuedPriority);
      const toWake = cutoff === -1 ? waiters : waiters.splice(0, cutoff);
      toWake.forEach(notify);
    }
  }
  _couldLockImmediately(weight, priority) {
    const nothingAhead = this._queue.length === 0 || this._queue[0].priority < priority;
    return nothingAhead && weight <= this._value;
  }
};
442
/**
 * Inserts `v` into `a`, which is kept ordered by descending `priority`;
 * among equal priorities the new item goes after the existing ones.
 *
 * @param {Array<{priority: number}>} a - Array to modify in place.
 * @param {{priority: number}} v - Item to insert.
 */
function insertSorted(a, v) {
  // Scan from the end for the last element whose priority is >= v's.
  let position = a.length;
  while (position > 0 && !(v.priority <= a[position - 1].priority)) {
    position -= 1;
  }
  a.splice(position, 0, v);
}
446
/**
 * Finds the last element of `a` that satisfies `predicate`.
 *
 * @param {Array<*>} a - Array to search.
 * @param {(item: *) => boolean} predicate - Called with each element, last to first.
 * @returns {number} Index of the match, or -1 when nothing matches.
 */
function findIndexFromEnd(a, predicate) {
  let index = a.length;
  while (index-- > 0) {
    if (predicate(a[index])) {
      return index;
    }
  }
  return -1;
}
454
/**
 * Runs a generator function as an async routine, awaiting each yielded value
 * and resuming the generator with its result or rejection.
 *
 * @param {*} thisArg - `this` for the generator function.
 * @param {ArrayLike<*>} [_arguments] - Arguments for the generator function.
 * @param {PromiseConstructor} [P] - Promise implementation; defaults to `Promise`.
 * @param {GeneratorFunction} generator - Generator function to run.
 * @returns {Promise<*>} Settles with the generator's return value or thrown error.
 */
var __awaiter$1 = function(thisArg, _arguments, P, generator) {
  const adopt = (value) => value instanceof P ? value : new P((resolve) => {
    resolve(value);
  });
  return new (P || (P = Promise))((resolve, reject) => {
    const step = (result) => {
      if (result.done) {
        resolve(result.value);
        return;
      }
      adopt(result.value).then(fulfilled, rejected);
    };
    const fulfilled = (value) => {
      try {
        step(generator.next(value));
      } catch (e) {
        reject(e);
      }
    };
    const rejected = (reason) => {
      try {
        step(generator["throw"](reason));
      } catch (e) {
        reject(e);
      }
    };
    step((generator = generator.apply(thisArg, _arguments || [])).next());
  });
};
481
/**
 * Mutual-exclusion lock (bundled from async-mutex), implemented as a
 * Semaphore with a capacity of one.
 */
var Mutex = class {
  /** @param {Error} [cancelError] - Rejection reason used by cancel(). */
  constructor(cancelError) {
    this._semaphore = new Semaphore(1, cancelError);
  }
  /** @returns {Promise<Function>} Resolves with the releaser once the lock is held. */
  acquire() {
    return __awaiter$1(this, arguments, void 0, function* (priority = 0) {
      const lease = yield this._semaphore.acquire(1, priority);
      return lease[1];
    });
  }
  /** Runs `callback` while holding the lock; the lock is released afterwards. */
  runExclusive(callback, priority = 0) {
    const withoutArgs = () => callback();
    return this._semaphore.runExclusive(withoutArgs, 1, priority);
  }
  isLocked() {
    return this._semaphore.isLocked();
  }
  waitForUnlock(priority = 0) {
    return this._semaphore.waitForUnlock(1, priority);
  }
  /** Releases the lock; does nothing when it is not currently held. */
  release() {
    if (!this._semaphore.isLocked()) {
      return;
    }
    this._semaphore.release();
  }
  cancel() {
    return this._semaphore.cancel();
  }
};
508
+
509
+ // src/commands/eval/jsonl-writer.ts
510
/**
 * Streams evaluation records to a file as JSON Lines (one snake_case JSON
 * object per line). Appends are serialized through a mutex.
 */
var JsonlWriter = class _JsonlWriter {
  stream;
  mutex = new Mutex();
  closed = false;
  /** @param {import("node:fs").WriteStream} stream - Destination stream. */
  constructor(stream) {
    this.stream = stream;
  }
  /**
   * Creates the destination directory (recursively) and opens the file for writing.
   * @param {string} filePath
   * @returns {Promise<JsonlWriter>}
   */
  static async open(filePath) {
    await mkdir2(path5.dirname(filePath), { recursive: true });
    const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
    return new _JsonlWriter(stream);
  }
  /**
   * Writes one record as a single line, waiting for `drain` when the stream
   * signals back-pressure.
   * @param {object} record
   * @throws {Error} When the writer is closed, or the stream errors while draining.
   */
  async append(record) {
    await this.mutex.runExclusive(async () => {
      if (this.closed) {
        throw new Error("Cannot write to closed JSONL writer");
      }
      const snakeCaseRecord = toSnakeCaseDeep(record);
      const line = `${JSON.stringify(snakeCaseRecord)}\n`;
      if (this.stream.write(line)) {
        return;
      }
      await new Promise((resolve, reject) => {
        // Whichever event fires first removes the other listener, so repeated
        // back-pressure waits do not pile up listeners on the stream.
        const onDrain = () => {
          this.stream.off("error", onError);
          resolve();
        };
        const onError = (error) => {
          this.stream.off("drain", onDrain);
          reject(error);
        };
        this.stream.once("drain", onDrain);
        this.stream.once("error", onError);
      });
    });
  }
  /** Ends the stream and waits until it has finished; later calls are no-ops. */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    this.stream.end();
    await finished(this.stream);
  }
};
547
+
548
+ // src/commands/eval/junit-writer.ts
549
+ import { mkdir as mkdir3, writeFile as writeFile2 } from "node:fs/promises";
550
+ import path6 from "node:path";
551
/**
 * Escapes text for use in XML content or attribute values.
 * Control characters that XML 1.0 does not allow (for example the ESC byte of
 * ANSI colour sequences) are removed, because they make the document unparseable.
 *
 * @param {string} str
 * @returns {string}
 */
function escapeXml(str) {
  return str
    .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\uFFFE\uFFFF]/g, "")
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}

/**
 * Buffers evaluation results and writes them as a JUnit XML report on close().
 * Results are grouped into one test suite per `dataset` ("default" when absent).
 * A result with an `error` is reported as an error; otherwise a score below 0.5
 * is reported as a failure.
 */
var JunitWriter = class _JunitWriter {
  filePath;
  results = [];
  closed = false;
  /** @param {string} filePath - Destination of the XML report. */
  constructor(filePath) {
    this.filePath = filePath;
  }
  /**
   * Creates the destination directory (recursively) and returns a writer.
   * @param {string} filePath
   * @returns {Promise<JunitWriter>}
   */
  static async open(filePath) {
    await mkdir3(path6.dirname(filePath), { recursive: true });
    return new _JunitWriter(filePath);
  }
  /**
   * Buffers one result.
   * @throws {Error} When the writer is already closed.
   */
  async append(result) {
    if (this.closed) {
      throw new Error("Cannot write to closed JUnit writer");
    }
    this.results.push(result);
  }
  /** Writes the report file; calls after the first one are no-ops. */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    // One classification is used for both the counters and the rendered
    // elements, so errors and failures are disjoint and always consistent.
    const isError = (r) => r.error !== void 0 && r.error !== null;
    const isFailure = (r) => !isError(r) && r.score < 0.5;
    const grouped = /* @__PURE__ */ new Map();
    for (const result of this.results) {
      const suite = result.dataset ?? "default";
      const existing = grouped.get(suite);
      if (existing) {
        existing.push(result);
      } else {
        grouped.set(suite, [result]);
      }
    }
    const suiteXmls = [];
    for (const [suiteName, results] of grouped) {
      const failures = results.filter(isFailure).length;
      const errors = results.filter(isError).length;
      const testCases = results.map((r) => {
        const time = r.trace?.durationMs ? (r.trace.durationMs / 1e3).toFixed(3) : "0.000";
        let inner = "";
        if (isError(r)) {
          const errorText = escapeXml(String(r.error));
          inner = `\n<error message="${errorText}">${errorText}</error>\n`;
        } else if (isFailure(r)) {
          const message = `score=${r.score.toFixed(3)}`;
          // `misses` may be missing on a result; treat that as "no misses".
          const misses = r.misses ?? [];
          const detail = [
            `Score: ${r.score.toFixed(3)}`,
            r.reasoning ? `Reasoning: ${r.reasoning}` : "",
            misses.length > 0 ? `Misses: ${misses.join(", ")}` : ""
          ].filter(Boolean).join("\n");
          inner = `\n<failure message="${escapeXml(message)}">${escapeXml(detail)}</failure>\n`;
        }
        return ` <testcase name="${escapeXml(r.testId)}" classname="${escapeXml(suiteName)}" time="${time}">${inner}</testcase>`;
      });
      suiteXmls.push(
        ` <testsuite name="${escapeXml(suiteName)}" tests="${results.length}" failures="${failures}" errors="${errors}">\n${testCases.join("\n")}\n</testsuite>`
      );
    }
    const totalTests = this.results.length;
    const totalFailures = this.results.filter(isFailure).length;
    const totalErrors = this.results.filter(isError).length;
    const xml = `<?xml version="1.0" encoding="UTF-8"?>\n<testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">\n${suiteXmls.join("\n")}\n</testsuites>\n`;
    await writeFile2(this.filePath, xml, "utf8");
  }
};
627
+
628
+ // src/commands/eval/yaml-writer.ts
629
+ import { createWriteStream as createWriteStream2 } from "node:fs";
630
+ import { mkdir as mkdir4 } from "node:fs/promises";
631
+ import path7 from "node:path";
632
+ import { finished as finished2 } from "node:stream/promises";
633
+ import { stringify as stringifyYaml } from "yaml";
634
/**
 * Streams evaluation records to a file as a multi-document YAML stream; each
 * record becomes one document introduced by `---`. Appends are serialized
 * through a mutex.
 */
var YamlWriter = class _YamlWriter {
  stream;
  mutex = new Mutex();
  closed = false;
  isFirst = true;
  /** @param {import("node:fs").WriteStream} stream - Destination stream. */
  constructor(stream) {
    this.stream = stream;
  }
  /**
   * Creates the destination directory (recursively) and opens the file for writing.
   * @param {string} filePath
   * @returns {Promise<YamlWriter>}
   */
  static async open(filePath) {
    await mkdir4(path7.dirname(filePath), { recursive: true });
    const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
    return new _YamlWriter(stream);
  }
  /**
   * Writes one record as a YAML document, waiting for `drain` when the stream
   * signals back-pressure.
   * @param {object} record
   * @throws {Error} When the writer is closed, or the stream errors while draining.
   */
  async append(record) {
    await this.mutex.runExclusive(async () => {
      if (this.closed) {
        throw new Error("Cannot write to closed YAML writer");
      }
      const snakeCaseRecord = toSnakeCaseDeep(record);
      const yamlDoc = stringifyYaml(snakeCaseRecord, {
        indent: 2,
        // lineWidth 0 disables line wrapping; string style is left to the YAML library.
        lineWidth: 0
      });
      const normalizedYaml = normalizeLineEndings(yamlDoc);
      const separator = this.isFirst ? "---\n" : "\n---\n";
      this.isFirst = false;
      const content = `${separator}${normalizedYaml}`;
      if (this.stream.write(content)) {
        return;
      }
      await new Promise((resolve, reject) => {
        // Whichever event fires first removes the other listener, so repeated
        // back-pressure waits do not pile up listeners on the stream.
        const onDrain = () => {
          this.stream.off("error", onError);
          resolve();
        };
        const onError = (error) => {
          this.stream.off("drain", onDrain);
          reject(error);
        };
        this.stream.once("drain", onDrain);
        this.stream.once("error", onError);
      });
    });
  }
  /** Ends the stream and waits until it has finished; later calls are no-ops. */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    this.stream.end();
    await finished2(this.stream);
  }
};
681
+
682
+ // src/commands/eval/output-writer.ts
683
/**
 * Opens the writer that corresponds to an output format name.
 *
 * @param {string} filePath - Destination file.
 * @param {"jsonl"|"yaml"} format - Output format.
 * @returns {Promise<object>} An opened JsonlWriter or YamlWriter.
 * @throws {Error} For any other format.
 */
async function createOutputWriter(filePath, format) {
  if (format === "jsonl") {
    return JsonlWriter.open(filePath);
  }
  if (format === "yaml") {
    return YamlWriter.open(filePath);
  }
  throw new Error(`Unsupported output format: ${format}`);
}
695
/**
 * Returns the default file extension for an output format.
 *
 * @param {"jsonl"|"yaml"} format
 * @returns {string} ".jsonl" or ".yaml".
 * @throws {Error} For any other format.
 */
function getDefaultExtension(format) {
  if (format === "jsonl" || format === "yaml") {
    return `.${format}`;
  }
  throw new Error(`Unsupported output format: ${format}`);
}
707
// Extensions accepted by createWriterFromPath, listed in its error message.
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml"]);

/**
 * Opens the writer that matches a file's extension (case-insensitive).
 *
 * @param {string} filePath - Destination file.
 * @returns {Promise<object>} An opened JSONL, JSON, JUnit or YAML writer.
 * @throws {Error} Synchronously, when the extension is not supported.
 */
function createWriterFromPath(filePath) {
  const ext = path8.extname(filePath).toLowerCase();
  if (ext === ".jsonl") {
    return JsonlWriter.open(filePath);
  }
  if (ext === ".json") {
    return JsonWriter.open(filePath);
  }
  if (ext === ".xml") {
    return JunitWriter.open(filePath);
  }
  if (ext === ".yaml" || ext === ".yml") {
    return YamlWriter.open(filePath);
  }
  const supported = Array.from(SUPPORTED_EXTENSIONS).join(", ");
  throw new Error(`Unsupported output file extension "${ext}". Supported: ${supported}`);
}
726
/**
 * Opens one writer per output path and returns a facade that forwards
 * append() and close() to all of them.
 *
 * @param {string[]} filePaths - Output files; the writer type follows each extension.
 * @returns {Promise<{append: (result: object) => Promise<void>, close: () => Promise<void>}>}
 */
async function createMultiWriter(filePaths) {
  const openings = filePaths.map((fp) => createWriterFromPath(fp));
  const writers = await Promise.all(openings);
  // Applies one operation to every writer and waits for all of them.
  const broadcast = (operation) => Promise.all(writers.map(operation));
  return {
    async append(result) {
      await broadcast((writer) => writer.append(result));
    },
    async close() {
      await broadcast((writer) => writer.close());
    }
  };
}
737
+
738
+ // src/commands/eval/progress-display.ts
739
/**
 * Line-oriented progress reporter: prints one console line per relevant
 * worker status change, plus a numbered list of provider log paths.
 */
var ProgressDisplay = class {
  workers = /* @__PURE__ */ new Map();
  totalTests = 0;
  completedTests = 0;
  logPaths = [];
  logPathSet = /* @__PURE__ */ new Set();
  hasPrintedLogHeader = false;
  started = false;
  finished = false;
  verbose;
  /**
   * @param {number} _maxWorkers - Not used by this implementation.
   * @param {{verbose?: boolean}} [options] - `verbose` also prints pending tests.
   */
  constructor(_maxWorkers, options) {
    this.verbose = options?.verbose ?? false;
  }
  isInteractiveMode() {
    return false;
  }
  start() {
    this.started = true;
    this.finished = false;
  }
  setTotalTests(count) {
    this.totalTests = count;
  }
  /**
   * Records a worker's latest state and prints a line when it is newsworthy.
   * @param {{workerId: *, status: string, testId: string, targetLabel?: string, error?: string}} progress
   */
  updateWorker(progress) {
    const previous = this.workers.get(progress.workerId);
    this.workers.set(progress.workerId, progress);
    const { status } = progress;
    if (status === "completed" || status === "failed") {
      this.completedTests++;
    }
    const targetSuffix = progress.targetLabel ? ` | ${progress.targetLabel}` : "";
    const countPrefix = `${this.completedTests}/${this.totalTests}`;
    const printLine = (icon, tail = "") => {
      console.log(`${countPrefix} ${icon} ${progress.testId}${targetSuffix}${tail}`);
    };
    if (status === "pending") {
      // Pending is only announced in verbose mode, and only on first sight of the worker.
      if (this.verbose && !previous) {
        printLine("\u23F3");
      }
    } else if (status === "running") {
      // Repeated "running" updates for the same worker are not reprinted.
      if (!previous || previous.status === "pending") {
        printLine("\u{1F504}");
      }
    } else if (status === "completed") {
      printLine("\u2705");
    } else if (status === "failed") {
      printLine("\u274C", progress.error ? `: ${progress.error}` : "");
    }
  }
  /**
   * Prints log paths not seen before, numbered continuously across calls.
   * The header is printed once, labelled after the provider of the first call.
   * @param {Iterable<string>} paths
   * @param {string} [provider] - "pi", "copilot", or anything else (treated as Codex).
   */
  addLogPaths(paths, provider) {
    const fresh = [];
    for (const logPath of paths) {
      if (!this.logPathSet.has(logPath)) {
        this.logPathSet.add(logPath);
        fresh.push(logPath);
      }
    }
    if (fresh.length === 0) {
      return;
    }
    const firstNumber = this.logPaths.length + 1;
    this.logPaths.push(...fresh);
    if (!this.hasPrintedLogHeader) {
      console.log("");
      let label = "Codex CLI";
      if (provider === "pi") {
        label = "Pi Coding Agent";
      } else if (provider === "copilot") {
        label = "Copilot CLI";
      }
      console.log(`${label} logs:`);
      this.hasPrintedLogHeader = true;
    }
    fresh.forEach((logPath, offset) => {
      console.log(`${firstNumber + offset}. ${logPath}`);
    });
  }
  finish() {
    this.finished = true;
    console.log("");
  }
  clear() {
  }
};
822
+
823
+ // src/commands/eval/statistics.ts
824
// Bin edges of the score histogram: consecutive pairs form the bin ranges.
var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
825
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values
 * @returns {number} The mean, or 0 for an empty list.
 */
function computeMean(values) {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  for (let index = 0; index < count; index += 1) {
    total += values[index];
  }
  return total / count;
}
832
/**
 * Median of a list of numbers; the input array is not modified.
 *
 * @param {number[]} values
 * @returns {number} The middle value (average of the two middle values for an
 *   even count), or 0 for an empty list.
 */
function computeMedian(values) {
  if (values.length === 0) {
    return 0;
  }
  const ascending = Array.from(values).sort((left, right) => left - right);
  const middle = Math.floor(ascending.length / 2);
  const hasEvenCount = ascending.length % 2 === 0;
  return hasEvenCount ? (ascending[middle - 1] + ascending[middle]) / 2 : ascending[middle];
}
843
/**
 * Sample standard deviation (divides by n - 1) of a list of numbers.
 *
 * @param {number[]} values - Numbers to inspect.
 * @returns {number | undefined} The standard deviation, or `undefined` when
 *   fewer than two values are supplied.
 */
function computeStandardDeviation(values) {
  const count = values.length;
  if (count < 2) {
    return undefined;
  }
  // Mean computed inline; with count >= 2 the empty-input case cannot occur.
  const average = values.reduce((total, current) => total + current, 0) / count;
  const squaredDeviationSum = values.reduce(
    (total, current) => total + (current - average) ** 2,
    0
  );
  return Math.sqrt(squaredDeviationSum / (count - 1));
}
851
/**
 * Counts how many values fall into each bin defined by consecutive entries of
 * HISTOGRAM_BREAKPOINTS.
 *
 * Each value is counted in the first bin that accepts it. Non-final bins accept
 * `start <= value < end + 1e-9`; the final bin accepts `start <= value <= end`.
 * Values that match no bin are not counted.
 *
 * @param {number[]} values - Scores to bucket.
 * @returns {{ range: [number, number], count: number }[]} One entry per bin.
 */
function buildHistogram(values) {
  const finalEdge = HISTOGRAM_BREAKPOINTS[HISTOGRAM_BREAKPOINTS.length - 1];
  const bins = HISTOGRAM_BREAKPOINTS.slice(0, -1).map((lower, position) => ({
    range: [lower, HISTOGRAM_BREAKPOINTS[position + 1]],
    count: 0
  }));

  const accepts = (bin, value) => {
    const [lower, upper] = bin.range;
    if (!(value >= lower)) {
      return false;
    }
    return upper === finalEdge ? value <= upper : value < upper + 1e-9;
  };

  for (const value of values) {
    const match = bins.find((bin) => accepts(bin, value));
    if (match) {
      match.count += 1;
    }
  }
  return bins;
}
872
/**
 * Aggregates evaluation results into summary statistics.
 *
 * @param results - Result records; `score`, `testId` and `error` are read.
 * @returns An object with `total`, `mean`, `median`, `min`, `max`,
 *   `standardDeviation`, `histogram`, `topResults` (up to three highest
 *   scores), `bottomResults` (up to three lowest scores, still in descending
 *   order), `errorCount` and `errors` (`{ testId, error }` for every result
 *   whose `error` is defined). For an empty input all numeric fields are 0 and
 *   the lists are empty.
 */
function calculateEvaluationSummary(results) {
  const scores = results.map((entry) => entry.score);
  const total = results.length;
  const errors = results
    .filter((entry) => entry.error !== undefined)
    .map((entry) => ({ testId: entry.testId, error: entry.error }));

  if (total === 0) {
    return {
      total: 0,
      mean: 0,
      median: 0,
      min: 0,
      max: 0,
      standardDeviation: undefined,
      histogram: buildHistogram([]),
      topResults: [],
      bottomResults: [],
      errorCount: 0,
      errors: []
    };
  }

  // Highest score first; top and bottom excerpts are both taken from this order.
  const ranked = [...results].sort((first, second) => second.score - first.score);
  const excerptSize = Math.min(3, ranked.length);

  return {
    total,
    mean: computeMean(scores),
    median: computeMedian(scores),
    min: Math.min(...scores),
    max: Math.max(...scores),
    standardDeviation: computeStandardDeviation(scores),
    histogram: buildHistogram(scores),
    topResults: ranked.slice(0, excerptSize),
    bottomResults: ranked.slice(-excerptSize),
    errorCount: errors.length,
    errors
  };
}
915
/**
 * Formats a score with exactly three decimal places.
 *
 * @param {number} value - Score to format.
 * @returns {string} Fixed-point representation, e.g. "0.500".
 */
function formatScore(value) {
  const decimals = 3;
  return value.toFixed(decimals);
}
918
/**
 * Renders an evaluation summary as multi-line text.
 *
 * The output contains, in order: an ERRORS section (only when
 * `summary.errorCount > 0`), the EVALUATION SUMMARY statistics, the score
 * distribution, and the top and lowest performing tests.
 *
 * @param summary - Summary object with the fields read below (`total`,
 *   `errorCount`, `errors`, `mean`, `median`, `min`, `max`,
 *   `standardDeviation`, `histogram`, `topResults`, `bottomResults`).
 * @returns {string} The formatted report, or a short notice when
 *   `summary.total` is 0.
 */
function formatEvaluationSummary(summary) {
  if (summary.total === 0) {
    return "\nNo results to summarize";
  }

  const rule = "==================================================";
  const hasErrors = summary.errorCount > 0;
  const output = [];

  if (hasErrors) {
    output.push(`\n${rule}`, "ERRORS", rule);
    for (const failure of summary.errors) {
      output.push(`\n\u274C ${failure.testId}`);
      output.push(` ${failure.error}`);
    }
    output.push("");
  }

  output.push(`\n${rule}`, "EVALUATION SUMMARY", rule);
  output.push(`Total tests: ${summary.total}`);
  if (hasErrors) {
    output.push(`Failed: ${summary.errorCount}`);
    output.push(`Passed: ${summary.total - summary.errorCount}`);
  }

  output.push(`Mean score: ${formatScore(summary.mean)}`);
  output.push(`Median score: ${formatScore(summary.median)}`);
  output.push(`Min score: ${formatScore(summary.min)}`);
  output.push(`Max score: ${formatScore(summary.max)}`);
  // standardDeviation is undefined when there were fewer than two scores.
  if (typeof summary.standardDeviation === "number") {
    output.push(`Std deviation: ${formatScore(summary.standardDeviation)}`);
  }

  output.push("\nScore distribution:");
  for (const { range, count } of summary.histogram) {
    const [low, high] = range;
    output.push(` ${low.toFixed(1)}-${high.toFixed(1)}: ${count}`);
  }

  const rankedLine = (entry, position) =>
    ` ${position + 1}. ${entry.testId}: ${formatScore(entry.score)}`;

  output.push("\nTop performing tests:");
  summary.topResults.forEach((entry, position) => {
    output.push(rankedLine(entry, position));
  });

  output.push("\nLowest performing tests:");
  summary.bottomResults.forEach((entry, position) => {
    output.push(rankedLine(entry, position));
  });

  return output.join("\n");
}
964
/**
 * Renders a tests-by-targets score table with a per-target average row.
 *
 * @param results - Result records; `target`, `testId` and `score` are read.
 * @returns {string} The formatted table, or an empty string when fewer than
 *   two distinct targets are present.
 */
function formatMatrixSummary(results) {
  const targets = [...new Set(results.map((entry) => entry.target))].sort();
  const testIds = [...new Set(results.map((entry) => entry.testId))].sort();
  if (targets.length < 2) {
    return "";
  }

  // testId -> (target -> score); a later result for the same pair overwrites an earlier one.
  const scoresByTest = /* @__PURE__ */ new Map();
  for (const entry of results) {
    let row = scoresByTest.get(entry.testId);
    if (!row) {
      row = /* @__PURE__ */ new Map();
      scoresByTest.set(entry.testId, row);
    }
    row.set(entry.target, entry.score);
  }

  const rule = "==================================================";
  const testIdColWidth = Math.max(7, ...testIds.map((id) => id.length));
  const targetColWidth = Math.max(7, ...targets.map((name) => name.length));
  const padCell = (text) => text.padEnd(targetColWidth);

  const header = `${"Test".padEnd(testIdColWidth)} ${targets.map(padCell).join(" ")}`;
  const separator = "-".repeat(header.length);

  const output = [`\n${rule}`, "MATRIX RESULTS (tests \xD7 targets)", rule, header, separator];

  for (const testId of testIds) {
    const row = scoresByTest.get(testId);
    const cells = targets.map((target) => {
      const score = row?.get(target);
      // Missing combinations are shown as a dash.
      return padCell(score !== undefined ? formatScore(score) : "-");
    });
    output.push(`${testId.padEnd(testIdColWidth)} ${cells.join(" ")}`);
  }
  output.push(separator);

  const averageCells = targets.map((target) => {
    const targetScores = results
      .filter((entry) => entry.target === target)
      .map((entry) => entry.score);
    let average = 0;
    if (targetScores.length > 0) {
      average = targetScores.reduce((total, current) => total + current, 0) / targetScores.length;
    }
    return padCell(formatScore(average));
  });
  output.push(`${"Average".padEnd(testIdColWidth)} ${averageCells.join(" ")}`);

  return output.join("\n");
}
1008
+
1009
+ // ../../packages/core/dist/evaluation/validation/index.js
1010
+ import { readFile } from "node:fs/promises";
1011
+ import path9 from "node:path";
1012
+ import { parse } from "yaml";
1013
+ import { readFile as readFile2 } from "node:fs/promises";
1014
+ import path22 from "node:path";
1015
+ import { parse as parse2 } from "yaml";
1016
+ import { readFile as readFile3 } from "node:fs/promises";
1017
+ import path32 from "node:path";
1018
+ import { parse as parse3 } from "yaml";
1019
+ import { readFile as readFile4 } from "node:fs/promises";
1020
+ import { parse as parse4 } from "yaml";
1021
+ import { readFile as readFile5 } from "node:fs/promises";
1022
+ import path42 from "node:path";
1023
+ import { parse as parse5 } from "yaml";
1024
// `$schema` values recognised when classifying a file by its content.
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
var SCHEMA_CONFIG_V2 = "agentv-config-v2";

/**
 * Determines whether a YAML file is an eval, targets or config file.
 *
 * The file's top-level `$schema` string decides the type when it matches one
 * of the known schema identifiers. In every other case (unreadable file,
 * parse failure, non-object document, missing or unknown `$schema`) the type
 * is inferred from the file path instead.
 *
 * @param {string} filePath - Path of the file to classify.
 * @returns {Promise<string>} "eval", "targets" or "config".
 */
async function detectFileType(filePath) {
  try {
    const document = parse(await readFile(filePath, "utf8"));
    const isRecord = typeof document === "object" && document !== null;
    const schema = isRecord ? document.$schema : undefined;

    if (schema === SCHEMA_EVAL_V2) {
      return "eval";
    }
    if (schema === SCHEMA_TARGETS_V2) {
      return "targets";
    }
    if (schema === SCHEMA_CONFIG_V2) {
      return "config";
    }
    return inferFileTypeFromPath(filePath);
  } catch {
    // Read and parse failures are deliberately not surfaced here; fall back to the path heuristic.
    return inferFileTypeFromPath(filePath);
  }
}
1053
/**
 * Infers the file type from its location and name alone.
 *
 * Files named `config.yaml`/`config.yml` or `targets.yaml`/`targets.yml` that
 * live under a `.agentv` directory are classified as "config" or "targets";
 * everything else is treated as "eval".
 *
 * @param {string} filePath - Absolute or relative path of the file.
 * @returns {string} "config", "targets" or "eval".
 */
function inferFileTypeFromPath(filePath) {
  // Normalise separators so the directory check works for Windows-style paths too.
  const normalized = path9.normalize(filePath).replace(/\\/g, "/");
  const basename = path9.basename(filePath);

  // A relative path such as ".agentv/targets.yaml" (or "./.agentv/...", which
  // normalises to the same thing) has no slash before ".agentv", so the
  // leading-directory case has to be checked explicitly.
  const insideAgentvDir = normalized.startsWith(".agentv/") || normalized.includes("/.agentv/");

  if (insideAgentvDir) {
    if (basename === "config.yaml" || basename === "config.yml") {
      return "config";
    }
    if (basename === "targets.yaml" || basename === "targets.yml") {
      return "targets";
    }
  }
  return "eval";
}
1066
// Assertion types that must carry a string `value` field.
var ASSERTION_TYPES_WITH_VALUE = /* @__PURE__ */ new Set(["contains", "equals", "regex"]);
// File extensions accepted when `tests` is given as a path string.
var VALID_TEST_FILE_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".jsonl"]);
// Expected format of the eval file's `name`: lowercase letters, digits and hyphens.
var NAME_PATTERN = /^[a-z0-9-]+$/;

/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for plain objects and other non-array objects.
 */
function isObject(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
1072
/**
 * Validates the structure of an eval YAML file.
 *
 * Checks performed, in order: the file parses as YAML and is an object;
 * metadata (`name`/`description`); the `tests` field (with deprecated
 * `eval_cases`/`evalcases` accepted with a warning), which may be a file path
 * string or an array; and, for each array entry, its `id`, `criteria`
 * (deprecated alias `expected_outcome`), `input`, `expected_output` and
 * `assert` fields.
 *
 * @param {string} filePath - Path of the eval file; resolved to an absolute path.
 * @returns {Promise<{valid: boolean, filePath: string, fileType: string, errors: object[]}>}
 *   `errors` holds both errors and warnings; `valid` is false when any entry
 *   has severity "error".
 */
async function validateEvalFile(filePath) {
  const errors = [];
  const absolutePath = path22.resolve(filePath);

  // Records one issue; the `location` key is left out entirely when not given.
  const report = (severity, message, location) => {
    const issue = { severity, filePath: absolutePath };
    if (location !== undefined) {
      issue.location = location;
    }
    issue.message = message;
    errors.push(issue);
  };
  const buildResult = (valid) => ({
    valid,
    filePath: absolutePath,
    fileType: "eval",
    errors
  });
  const hasNoErrors = () => errors.every((issue) => issue.severity !== "error");

  // Validates a single entry of the `tests` array.
  const checkCase = (evalCase, location) => {
    if (!isObject(evalCase)) {
      report("error", "Eval case must be an object", location);
      return;
    }

    const { id } = evalCase;
    if (typeof id !== "string" || id.trim().length === 0) {
      report(
        "error",
        "Missing or invalid 'id' field (must be a non-empty string)",
        `${location}.id`
      );
    }

    let criteria = evalCase.criteria;
    if (criteria === undefined && "expected_outcome" in evalCase) {
      criteria = evalCase.expected_outcome;
      report(
        "warning",
        "'expected_outcome' is deprecated. Use 'criteria' instead.",
        `${location}.expected_outcome`
      );
    }
    const criteriaIsNonEmptyString = typeof criteria === "string" && criteria.trim().length > 0;
    if (criteria !== undefined && !criteriaIsNonEmptyString) {
      report(
        "error",
        "Invalid 'criteria' field (must be a non-empty string if provided)",
        `${location}.criteria`
      );
    }

    const inputField = evalCase.input;
    if (inputField === undefined) {
      report(
        "error",
        "Missing 'input' field (must be a string or array of messages)",
        `${location}.input`
      );
    } else if (Array.isArray(inputField)) {
      validateMessages(inputField, `${location}.input`, absolutePath, errors);
    } else if (typeof inputField !== "string") {
      report(
        "error",
        "Invalid 'input' field (must be a string or array of messages)",
        `${location}.input`
      );
    }

    const expectedOutputField = evalCase.expected_output;
    if (expectedOutputField !== undefined) {
      if (Array.isArray(expectedOutputField)) {
        // Only arrays whose first element looks like a message (has `role`) are checked as messages.
        const first = expectedOutputField[0];
        if (expectedOutputField.length > 0 && isObject(first) && "role" in first) {
          validateMessages(
            expectedOutputField,
            `${location}.expected_output`,
            absolutePath,
            errors
          );
        }
      } else if (typeof expectedOutputField !== "string" && !isObject(expectedOutputField)) {
        report(
          "error",
          "Invalid 'expected_output' field (must be a string, object, or array)",
          `${location}.expected_output`
        );
      }
    }

    const assertField = evalCase.assert;
    if (assertField !== undefined) {
      validateAssertArray(assertField, location, absolutePath, errors);
    }
  };

  let parsed;
  try {
    parsed = parse2(await readFile2(absolutePath, "utf8"));
  } catch (error) {
    report("error", `Failed to parse YAML: ${error.message}`);
    return buildResult(false);
  }

  if (!isObject(parsed)) {
    report("error", "File must contain a YAML object");
    return buildResult(false);
  }

  validateMetadata(parsed, absolutePath, errors);

  // `tests` wins; deprecated keys are consulted only while no value has been found.
  let cases = parsed.tests;
  for (const legacyKey of ["eval_cases", "evalcases"]) {
    if (cases === undefined && legacyKey in parsed) {
      cases = parsed[legacyKey];
      report("warning", `'${legacyKey}' is deprecated. Use 'tests' instead.`, legacyKey);
    }
  }

  if (typeof cases === "string") {
    validateTestsStringPath(cases, absolutePath, errors);
    return buildResult(hasNoErrors());
  }

  if (!Array.isArray(cases)) {
    report(
      "error",
      "Missing or invalid 'tests' field (must be an array or a file path string)",
      "tests"
    );
    return buildResult(errors.length === 0);
  }

  for (const [index, evalCase] of cases.entries()) {
    checkCase(evalCase, `tests[${index}]`);
  }

  return buildResult(hasNoErrors());
}
1242
/**
 * Validates an array of chat messages and appends any problems to `errors`.
 *
 * Each message must be an object with a `role` of "system", "user" or
 * "assistant" and a `content` that is either a string or an array of strings
 * and typed content objects. Text content is additionally scanned for role
 * markers.
 *
 * @param {unknown[]} messages - Messages to validate.
 * @param {string} location - Location prefix used in reported issues.
 * @param {string} filePath - File path attached to reported issues.
 * @param {object[]} errors - Issue list that is appended to in place.
 */
function validateMessages(messages, location, filePath, errors) {
  const allowedRoles = ["system", "user", "assistant"];
  const fail = (where, message) => {
    errors.push({ severity: "error", filePath, location: where, message });
  };

  // Validates one element of an array-valued `content`.
  const checkContentItem = (item, where) => {
    if (typeof item === "string") {
      validateContentForRoleMarkers(item, where, filePath, errors);
      return;
    }
    if (!isObject(item)) {
      fail(where, "Content array items must be strings or objects");
      return;
    }
    const kind = item.type;
    if (typeof kind !== "string") {
      fail(`${where}.type`, "Content object must have a 'type' field");
    }
    if (kind !== "text") {
      return;
    }
    const text = item.value;
    if (typeof text === "string") {
      validateContentForRoleMarkers(text, `${where}.value`, filePath, errors);
    } else {
      fail(`${where}.value`, "Content with type 'text' must have a 'value' field");
    }
  };

  for (const [index, entry] of messages.entries()) {
    const where = `${location}[${index}]`;
    if (!isObject(entry)) {
      fail(where, "Message must be an object");
      continue;
    }

    const { role, content } = entry;
    if (!allowedRoles.includes(role)) {
      fail(
        `${where}.role`,
        `Invalid role '${role}'. Must be one of: ${allowedRoles.join(", ")}`
      );
    }

    if (typeof content === "string") {
      validateContentForRoleMarkers(content, `${where}.content`, filePath, errors);
    } else if (Array.isArray(content)) {
      content.forEach((item, position) => {
        checkContentItem(item, `${where}.content[${position}]`);
      });
    } else {
      fail(`${where}.content`, "Missing or invalid 'content' field (must be a string or array)");
    }
  }
}
1316
/**
 * Checks the optional `name`/`description` metadata of an eval file.
 *
 * Nothing is reported when `name` is absent. When present, a string `name`
 * that does not match NAME_PATTERN yields a warning, and a missing
 * `description` yields a second warning. All issues are warnings.
 *
 * @param {object} parsed - Parsed YAML document.
 * @param {string} filePath - File path attached to reported issues.
 * @param {object[]} errors - Issue list that is appended to in place.
 */
function validateMetadata(parsed, filePath, errors) {
  const { name } = parsed;
  if (name === undefined) {
    return;
  }

  const warn = (message) => {
    errors.push({ severity: "warning", filePath, location: "name", message });
  };

  if (typeof name === "string" && !NAME_PATTERN.test(name)) {
    warn(
      `Invalid 'name' format '${name}'. Must match pattern /^[a-z0-9-]+$/ (lowercase alphanumeric with hyphens).`
    );
  }

  const hasDescription = "description" in parsed && parsed.description !== undefined;
  if (!hasDescription) {
    warn("When 'name' is present, 'description' should also be provided.");
  }
}
1339
/**
 * Warns when a `tests` file path uses an extension that is not listed in
 * VALID_TEST_FILE_EXTENSIONS.
 *
 * @param {string} testsPath - Value of the `tests` field.
 * @param {string} filePath - File path attached to the reported issue.
 * @param {object[]} errors - Issue list that is appended to in place.
 */
function validateTestsStringPath(testsPath, filePath, errors) {
  const extension = path22.extname(testsPath);
  if (VALID_TEST_FILE_EXTENSIONS.has(extension)) {
    return;
  }
  const supported = [...VALID_TEST_FILE_EXTENSIONS].join(", ");
  errors.push({
    severity: "warning",
    filePath,
    location: "tests",
    message: `Unsupported file extension '${extension}' for tests path '${testsPath}'. Supported extensions: ${supported}`
  });
}
1350
/**
 * Validates the `assert` field of an eval case. Every problem is reported as
 * a warning.
 *
 * For each item: it must be an object with a string `type` that
 * `isEvaluatorKind` accepts; types listed in ASSERTION_TYPES_WITH_VALUE need a
 * string `value` (and a compilable pattern for "regex"); an optional
 * `required` field is checked last. An item stops being checked at its first
 * failing step, except that an invalid regex still allows the `required`
 * check to run.
 *
 * @param {unknown} assertField - Value of the `assert` field.
 * @param {string} parentLocation - Location of the owning eval case.
 * @param {string} filePath - File path attached to reported issues.
 * @param {object[]} errors - Issue list that is appended to in place.
 */
function validateAssertArray(assertField, parentLocation, filePath, errors) {
  const warn = (location, message) => {
    errors.push({ severity: "warning", filePath, location, message });
  };

  if (!Array.isArray(assertField)) {
    warn(`${parentLocation}.assert`, "'assert' must be an array of assertion objects.");
    return;
  }

  const compilesAsRegex = (pattern) => {
    try {
      new RegExp(pattern);
      return true;
    } catch {
      return false;
    }
  };

  const checkItem = (item, location) => {
    if (!isObject(item)) {
      warn(location, "Assertion item must be an object with a type field.");
      return;
    }

    const kind = item.type;
    if (typeof kind !== "string") {
      warn(`${location}.type`, "Assertion item is missing a 'type' field.");
      return;
    }
    if (!isEvaluatorKind(kind)) {
      warn(`${location}.type`, `Unknown assertion type '${kind}'.`);
      return;
    }

    if (ASSERTION_TYPES_WITH_VALUE.has(kind)) {
      const expected = item.value;
      if (typeof expected !== "string") {
        warn(`${location}.value`, `Assertion type '${kind}' requires a 'value' field (string).`);
        return;
      }
      if (kind === "regex" && !compilesAsRegex(expected)) {
        warn(
          `${location}.value`,
          `Invalid regex pattern '${expected}': not a valid regular expression.`
        );
      }
    }

    const { required } = item;
    if (required !== undefined) {
      validateRequiredField(required, location, filePath, errors);
    }
  };

  for (const [index, item] of assertField.entries()) {
    checkItem(item, `${parentLocation}.assert[${index}]`);
  }
}
1421
/**
 * Validates an assertion's `required` field: it must be a boolean, or a
 * number in the interval (0, 1]. Problems are reported as warnings.
 *
 * @param {unknown} required - Value of the `required` field (already known to be defined).
 * @param {string} parentLocation - Location of the owning assertion item.
 * @param {string} filePath - File path attached to the reported issue.
 * @param {object[]} errors - Issue list that is appended to in place.
 */
function validateRequiredField(required, parentLocation, filePath, errors) {
  if (typeof required === "boolean") {
    return;
  }

  const location = `${parentLocation}.required`;

  if (typeof required === "number") {
    // Phrased as a positive range test so that NaN (e.g. YAML `.nan`), which
    // fails every comparison, is rejected instead of slipping through.
    const withinRange = required > 0 && required <= 1;
    if (!withinRange) {
      errors.push({
        severity: "warning",
        filePath,
        location,
        message: `Invalid 'required' value ${required}. When a number, it must be between 0 (exclusive) and 1 (inclusive).`
      });
    }
    return;
  }

  errors.push({
    severity: "warning",
    filePath,
    location,
    message: `Invalid 'required' value. Must be a boolean or a number between 0 (exclusive) and 1 (inclusive).`
  });
}
1443
/**
 * Warns about role-marker strings ("@[System]:", "@[User]:", "@[Assistant]:",
 * "@[Tool]:") embedded in message text. Matching is case-insensitive and one
 * warning is emitted per marker found.
 *
 * @param {string} content - Text to scan.
 * @param {string} location - Location attached to reported issues.
 * @param {string} filePath - File path attached to reported issues.
 * @param {object[]} errors - Issue list that is appended to in place.
 */
function validateContentForRoleMarkers(content, location, filePath, errors) {
  const haystack = content.toLowerCase();
  const found = ["@[System]:", "@[User]:", "@[Assistant]:", "@[Tool]:"].filter((marker) =>
    haystack.includes(marker.toLowerCase())
  );
  for (const marker of found) {
    errors.push({
      severity: "warning",
      filePath,
      location,
      message: `Content contains potential role marker '${marker}'. This may confuse agentic providers or cause prompt injection.`
    });
  }
}
1456
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for plain objects and other non-array objects.
 */
function isObject2(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
1459
// Per-provider sets of recognised target settings. Most settings are listed in
// both snake_case and camelCase spellings.

// Settings shared by every provider set below.
var COMMON_SETTINGS = /* @__PURE__ */ new Set(["provider_batching", "providerBatching"]);

// Retry tuning, spread into the Azure, Anthropic and Gemini sets.
var RETRY_SETTINGS = /* @__PURE__ */ new Set([
  "max_retries", "maxRetries",
  "retry_initial_delay_ms", "retryInitialDelayMs",
  "retry_max_delay_ms", "retryMaxDelayMs",
  "retry_backoff_factor", "retryBackoffFactor",
  "retry_status_codes", "retryStatusCodes"
]);

var AZURE_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  ...RETRY_SETTINGS,
  "endpoint", "resource", "resourceName",
  "api_key", "apiKey",
  "deployment", "deploymentName",
  "model",
  "version", "api_version",
  "temperature",
  "max_output_tokens", "maxTokens"
]);

var ANTHROPIC_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  ...RETRY_SETTINGS,
  "api_key", "apiKey",
  "model", "deployment", "variant",
  "temperature",
  "max_output_tokens", "maxTokens",
  "thinking_budget", "thinkingBudget"
]);

var GEMINI_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  ...RETRY_SETTINGS,
  "api_key", "apiKey",
  "model", "deployment", "variant",
  "temperature",
  "max_output_tokens", "maxTokens"
]);

var CODEX_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "model",
  "executable", "command", "binary",
  "args", "arguments",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir", "log_directory", "logDirectory",
  "log_format", "logFormat", "log_output_format", "logOutputFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate"
]);

var COPILOT_SDK_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "cli_url", "cliUrl",
  "cli_path", "cliPath",
  "github_token", "githubToken",
  "model",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir",
  "log_format", "logFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate"
]);

var VSCODE_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "executable",
  "workspace_template", "workspaceTemplate",
  "wait",
  "dry_run", "dryRun",
  "subagent_root", "subagentRoot",
  "timeout_seconds", "timeoutSeconds"
]);

var MOCK_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "response",
  "delayMs", "delayMinMs", "delayMaxMs",
  // For testing tool_trajectory evaluator
  "trace"
]);

var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "model",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir", "log_directory", "logDirectory",
  "log_format", "logFormat", "log_output_format", "logOutputFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate",
  "max_turns", "maxTurns",
  "max_budget_usd", "maxBudgetUsd"
]);
1605
/**
 * Looks up the set of recognised settings for a provider name.
 *
 * The name is matched case-insensitively against the aliases listed below.
 *
 * @param {string} provider - Provider name from a target definition.
 * @returns {Set<string> | null} The matching settings set, or `null` for
 *   "cli" and for any provider that is not listed.
 */
function getKnownSettings(provider) {
  const normalizedProvider = provider.toLowerCase();
  const settingsByAliases = [
    [["azure", "azure-openai"], AZURE_SETTINGS],
    [["anthropic"], ANTHROPIC_SETTINGS],
    [["gemini", "google", "google-gemini"], GEMINI_SETTINGS],
    [["codex", "codex-cli"], CODEX_SETTINGS],
    [["copilot", "copilot-sdk", "copilot_sdk", "copilot-cli"], COPILOT_SDK_SETTINGS],
    [["claude", "claude-code", "claude-sdk"], CLAUDE_SETTINGS],
    [["vscode", "vscode-insiders"], VSCODE_SETTINGS],
    [["mock"], MOCK_SETTINGS]
  ];
  for (const [aliases, settings] of settingsByAliases) {
    if (aliases.includes(normalizedProvider)) {
      return settings;
    }
  }
  // "cli" has no settings set here and, like unknown providers, yields null.
  return null;
}
1640
/**
 * Warns about keys on a target definition that are neither base fields nor
 * recognised settings for its provider. Nothing is checked when
 * `getKnownSettings` returns no settings set for the provider.
 *
 * @param {object} target - Target definition whose keys are inspected.
 * @param {string} provider - Provider name used for the lookup and in the message.
 * @param {string} absolutePath - File path attached to reported issues.
 * @param {string} location - Location prefix of the target in the file.
 * @param {object[]} errors - Issue list that is appended to in place.
 */
function validateUnknownSettings(target, provider, absolutePath, location, errors) {
  const knownSettings = getKnownSettings(provider);
  if (!knownSettings) {
    return;
  }
  // Fields accepted on every target regardless of provider.
  const baseFields = /* @__PURE__ */ new Set([
    "name",
    "provider",
    "judge_target",
    "workers",
    "$schema",
    "targets"
  ]);
  const unrecognised = Object.keys(target).filter(
    (key) => !(baseFields.has(key) || knownSettings.has(key))
  );
  for (const key of unrecognised) {
    errors.push({
      severity: "warning",
      filePath: absolutePath,
      location: `${location}.${key}`,
      message: `Unknown setting '${key}' for ${provider} provider. This property will be ignored.`
    });
  }
}
1657
+ async function validateTargetsFile(filePath) {
1658
+ const errors = [];
1659
+ const absolutePath = path32.resolve(filePath);
1660
+ let parsed;
1661
+ try {
1662
+ const content = await readFile3(absolutePath, "utf8");
1663
+ parsed = parse3(content);
1664
+ } catch (error) {
1665
+ errors.push({
1666
+ severity: "error",
1667
+ filePath: absolutePath,
1668
+ message: `Failed to parse YAML: ${error.message}`
1669
+ });
1670
+ return {
1671
+ valid: false,
1672
+ filePath: absolutePath,
1673
+ fileType: "targets",
1674
+ errors
1675
+ };
1676
+ }
1677
+ function validateCliSettings(target, absolutePath2, location, errors2) {
1678
+ const commandTemplate = target.command_template ?? target.commandTemplate;
1679
+ if (typeof commandTemplate !== "string" || commandTemplate.trim().length === 0) {
1680
+ errors2.push({
1681
+ severity: "error",
1682
+ filePath: absolutePath2,
1683
+ location: `${location}.commandTemplate`,
1684
+ message: "CLI provider requires 'command_template' or 'commandTemplate' as a non-empty string"
1685
+ });
1686
+ } else {
1687
+ recordUnknownPlaceholders(
1688
+ commandTemplate,
1689
+ absolutePath2,
1690
+ `${location}.commandTemplate`,
1691
+ errors2
1692
+ );
1693
+ }
1694
+ const healthcheck = target.healthcheck;
1695
+ if (healthcheck !== void 0) {
1696
+ validateCliHealthcheck(healthcheck, absolutePath2, `${location}.healthcheck`, errors2);
1697
+ }
1698
+ }
1699
+ function validateCliHealthcheck(healthcheck, absolutePath2, location, errors2) {
1700
+ if (!isObject2(healthcheck)) {
1701
+ errors2.push({
1702
+ severity: "error",
1703
+ filePath: absolutePath2,
1704
+ location,
1705
+ message: "'healthcheck' must be an object when provided"
1706
+ });
1707
+ return;
1708
+ }
1709
+ const type = healthcheck.type;
1710
+ if (type !== "http" && type !== "command") {
1711
+ errors2.push({
1712
+ severity: "error",
1713
+ filePath: absolutePath2,
1714
+ location: `${location}.type`,
1715
+ message: "healthcheck.type must be either 'http' or 'command'"
1716
+ });
1717
+ return;
1718
+ }
1719
+ const timeoutSeconds = healthcheck.timeout_seconds ?? healthcheck.timeoutSeconds;
1720
+ if (timeoutSeconds !== void 0) {
1721
+ const numericTimeout = Number(timeoutSeconds);
1722
+ if (!Number.isFinite(numericTimeout) || numericTimeout <= 0) {
1723
+ errors2.push({
1724
+ severity: "error",
1725
+ filePath: absolutePath2,
1726
+ location: `${location}.timeoutSeconds`,
1727
+ message: "healthcheck.timeoutSeconds must be a positive number when provided"
1728
+ });
1729
+ }
1730
+ }
1731
+ if (type === "http") {
1732
+ const url = healthcheck.url;
1733
+ if (typeof url !== "string" || url.trim().length === 0) {
1734
+ errors2.push({
1735
+ severity: "error",
1736
+ filePath: absolutePath2,
1737
+ location: `${location}.url`,
1738
+ message: "healthcheck.url must be a non-empty string for http checks"
1739
+ });
1740
+ }
1741
+ return;
1742
+ }
1743
+ const commandTemplate = healthcheck.command_template ?? healthcheck.commandTemplate;
1744
+ if (typeof commandTemplate !== "string" || commandTemplate.trim().length === 0) {
1745
+ errors2.push({
1746
+ severity: "error",
1747
+ filePath: absolutePath2,
1748
+ location: `${location}.commandTemplate`,
1749
+ message: "healthcheck.commandTemplate must be a non-empty string for command checks"
1750
+ });
1751
+ } else {
1752
+ recordUnknownPlaceholders(
1753
+ commandTemplate,
1754
+ absolutePath2,
1755
+ `${location}.commandTemplate`,
1756
+ errors2
1757
+ );
1758
+ }
1759
+ const cwd = healthcheck.cwd;
1760
+ if (cwd !== void 0 && typeof cwd !== "string") {
1761
+ errors2.push({
1762
+ severity: "error",
1763
+ filePath: absolutePath2,
1764
+ location: `${location}.cwd`,
1765
+ message: "healthcheck.cwd must be a string when provided"
1766
+ });
1767
+ }
1768
+ }
1769
+ function recordUnknownPlaceholders(template, absolutePath2, location, errors2) {
1770
+ const placeholders = extractPlaceholders(template);
1771
+ for (const placeholder of placeholders) {
1772
+ if (!CLI_PLACEHOLDERS.has(placeholder)) {
1773
+ errors2.push({
1774
+ severity: "error",
1775
+ filePath: absolutePath2,
1776
+ location,
1777
+ message: `Unknown CLI placeholder '{${placeholder}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
1778
+ });
1779
+ }
1780
+ }
1781
+ }
1782
+ function extractPlaceholders(template) {
1783
+ const matches = template.matchAll(/\{([A-Z_]+)\}/g);
1784
+ const result = [];
1785
+ for (const match of matches) {
1786
+ const placeholder = match[1];
1787
+ if (placeholder) {
1788
+ result.push(placeholder);
1789
+ }
1790
+ }
1791
+ return result;
1792
+ }
1793
+ if (!isObject2(parsed)) {
1794
+ errors.push({
1795
+ severity: "error",
1796
+ filePath: absolutePath,
1797
+ message: "File must contain a YAML object"
1798
+ });
1799
+ return {
1800
+ valid: false,
1801
+ filePath: absolutePath,
1802
+ fileType: "targets",
1803
+ errors
1804
+ };
1805
+ }
1806
+ const targets = parsed.targets;
1807
+ if (!Array.isArray(targets)) {
1808
+ errors.push({
1809
+ severity: "error",
1810
+ filePath: absolutePath,
1811
+ location: "targets",
1812
+ message: "Missing or invalid 'targets' field (must be an array)"
1813
+ });
1814
+ return {
1815
+ valid: errors.length === 0,
1816
+ filePath: absolutePath,
1817
+ fileType: "targets",
1818
+ errors
1819
+ };
1820
+ }
1821
+ const knownProviders = [...KNOWN_PROVIDERS, ...PROVIDER_ALIASES];
1822
+ for (let i = 0; i < targets.length; i++) {
1823
+ const target = targets[i];
1824
+ const location = `targets[${i}]`;
1825
+ if (!isObject2(target)) {
1826
+ errors.push({
1827
+ severity: "error",
1828
+ filePath: absolutePath,
1829
+ location,
1830
+ message: "Target must be an object"
1831
+ });
1832
+ continue;
1833
+ }
1834
+ const name = target.name;
1835
+ if (typeof name !== "string" || name.trim().length === 0) {
1836
+ errors.push({
1837
+ severity: "error",
1838
+ filePath: absolutePath,
1839
+ location: `${location}.name`,
1840
+ message: "Missing or invalid 'name' field (must be a non-empty string)"
1841
+ });
1842
+ }
1843
+ const provider = target.provider;
1844
+ const providerValue = typeof provider === "string" ? provider.trim().toLowerCase() : void 0;
1845
+ if (typeof provider !== "string" || provider.trim().length === 0) {
1846
+ errors.push({
1847
+ severity: "error",
1848
+ filePath: absolutePath,
1849
+ location: `${location}.provider`,
1850
+ message: "Missing or invalid 'provider' field (must be a non-empty string)"
1851
+ });
1852
+ } else if (!knownProviders.includes(provider)) {
1853
+ errors.push({
1854
+ severity: "warning",
1855
+ filePath: absolutePath,
1856
+ location: `${location}.provider`,
1857
+ message: `Unknown provider '${provider}'. Known providers: ${knownProviders.join(", ")}`
1858
+ });
1859
+ }
1860
+ if (providerValue === "cli") {
1861
+ validateCliSettings(target, absolutePath, location, errors);
1862
+ }
1863
+ if (typeof provider === "string") {
1864
+ validateUnknownSettings(target, provider, absolutePath, location, errors);
1865
+ }
1866
+ const judgeTarget = target.judge_target;
1867
+ if (judgeTarget !== void 0 && typeof judgeTarget !== "string") {
1868
+ errors.push({
1869
+ severity: "error",
1870
+ filePath: absolutePath,
1871
+ location: `${location}.judge_target`,
1872
+ message: "Invalid 'judge_target' field (must be a string)"
1873
+ });
1874
+ }
1875
+ }
1876
+ return {
1877
+ valid: errors.filter((e) => e.severity === "error").length === 0,
1878
+ filePath: absolutePath,
1879
+ fileType: "targets",
1880
+ errors
1881
+ };
1882
+ }
1883
/**
 * Validates one optional config field that must be an array of strings.
 * Pushes an error when the field is not an array or holds a non-string
 * entry, and a warning when it is an empty array. Does nothing when the
 * field is absent.
 *
 * @param {object} config - Parsed config object.
 * @param {string} fieldName - Key to validate; also used as the location.
 * @param {string} filePath - File path recorded on each reported entry.
 * @param {Array<object>} errors - Collector that is mutated in place.
 */
function validateConfigPatternField(config, fieldName, filePath, errors) {
  const patterns = config[fieldName];
  if (patterns === void 0) {
    return;
  }
  if (!Array.isArray(patterns)) {
    errors.push({
      severity: "error",
      filePath,
      location: fieldName,
      message: `Field '${fieldName}' must be an array`
    });
  } else if (!patterns.every((p) => typeof p === "string")) {
    errors.push({
      severity: "error",
      filePath,
      location: fieldName,
      message: `All entries in '${fieldName}' must be strings`
    });
  } else if (patterns.length === 0) {
    errors.push({
      severity: "warning",
      filePath,
      location: fieldName,
      message: `Field '${fieldName}' is empty. Consider removing it or adding patterns.`
    });
  }
}

/**
 * Reads and validates a YAML config file.
 *
 * Checks that the document is a mapping, that `guideline_patterns` and
 * `eval_patterns` (when present) are arrays of strings, and warns about any
 * key other than `$schema`, `guideline_patterns` and `eval_patterns`.
 *
 * @param {string} filePath - Path of the config file to read.
 * @returns {Promise<{valid: boolean, filePath: string, fileType: "config", errors: Array<object>}>}
 *   `valid` is false when at least one entry has severity "error". Read and
 *   parse failures are reported as an error entry rather than thrown.
 */
async function validateConfigFile(filePath) {
  const errors = [];
  try {
    const content = await readFile4(filePath, "utf8");
    const parsed = parse4(content);
    // A top-level YAML sequence is typeof "object" too, so reject it
    // explicitly; otherwise its indices would be reported as unexpected
    // fields and the file would still count as valid.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      errors.push({
        severity: "error",
        filePath,
        message: "Config file must contain a valid YAML object"
      });
      return { valid: false, filePath, fileType: "config", errors };
    }
    const config = parsed;
    validateConfigPatternField(config, "guideline_patterns", filePath, errors);
    validateConfigPatternField(config, "eval_patterns", filePath, errors);
    const allowedFields = /* @__PURE__ */ new Set(["$schema", "guideline_patterns", "eval_patterns"]);
    const unexpectedFields = Object.keys(config).filter((key) => !allowedFields.has(key));
    if (unexpectedFields.length > 0) {
      errors.push({
        severity: "warning",
        filePath,
        message: `Unexpected fields: ${unexpectedFields.join(", ")}`
      });
    }
    return {
      valid: !errors.some((e) => e.severity === "error"),
      filePath,
      fileType: "config",
      errors
    };
  } catch (error) {
    // Anything can be thrown; do not assume an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    errors.push({
      severity: "error",
      filePath,
      message: `Failed to parse config file: ${reason}`
    });
    return { valid: false, filePath, fileType: "config", errors };
  }
}
1971
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for objects other than arrays and null.
 */
function isObject3(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
1974
/**
 * Validates the file references found in the `input` and `expected_output`
 * message lists of every test case in an eval file.
 *
 * Test cases are read from `tests`, falling back to `eval_cases` and then
 * `evalcases` when the earlier key is undefined. Reported locations always
 * use the `tests[i]` prefix.
 *
 * @param {string} evalFilePath - Path of the eval file; resolved to absolute.
 * @returns {Promise<Array<object>>} Collected entries. Contains a single
 *   error when no git root is found; is returned as-is (possibly empty) when
 *   the file cannot be read or parsed, or does not have the expected shape.
 */
async function validateFileReferences(evalFilePath) {
  const errors = [];
  const absolutePath = path42.resolve(evalFilePath);
  const gitRoot = await findGitRoot(absolutePath);
  if (!gitRoot) {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      message: "Cannot validate file references: git repository root not found"
    });
    return errors;
  }
  const searchRoots = buildSearchRoots(absolutePath, gitRoot);

  let parsed;
  try {
    parsed = parse5(await readFile5(absolutePath, "utf8"));
  } catch {
    // Read/parse problems are not reported from here.
    return errors;
  }
  if (!isObject3(parsed)) {
    return errors;
  }

  let cases = parsed.tests;
  for (const alias of ["eval_cases", "evalcases"]) {
    if (cases === void 0 && alias in parsed) {
      cases = parsed[alias];
    }
  }
  if (!Array.isArray(cases)) {
    return errors;
  }

  for (const [caseIndex, evalCase] of cases.entries()) {
    if (!isObject3(evalCase)) {
      continue;
    }
    for (const field of ["input", "expected_output"]) {
      const messages = evalCase[field];
      if (Array.isArray(messages)) {
        await validateMessagesFileRefs(
          messages,
          `tests[${caseIndex}].${field}`,
          searchRoots,
          absolutePath,
          errors
        );
      }
    }
  }
  return errors;
}
2035
/**
 * Checks every `type: "file"` content item in a list of messages.
 *
 * For each such item it reports an error when `value` is not a string, when
 * the reference cannot be resolved, or when the resolved file cannot be
 * read, and a warning when the file is empty or whitespace-only. Messages
 * whose `content` is not an array are ignored.
 *
 * @param {Array<unknown>} messages - Message list to inspect.
 * @param {string} location - Location prefix for reported entries.
 * @param {unknown} searchRoots - Passed through to resolveFileReference.
 * @param {string} filePath - File path recorded on each reported entry.
 * @param {Array<object>} errors - Collector that is mutated in place.
 * @returns {Promise<void>}
 */
async function validateMessagesFileRefs(messages, location, searchRoots, filePath, errors) {
  for (const [messageIndex, message] of messages.entries()) {
    const parts = isObject3(message) ? message.content : void 0;
    if (!Array.isArray(parts)) {
      continue;
    }
    for (const [partIndex, part] of parts.entries()) {
      if (!isObject3(part) || part.type !== "file") {
        continue;
      }
      const partLocation = `${location}[${messageIndex}].content[${partIndex}]`;
      const value = part.value;
      if (typeof value !== "string") {
        errors.push({
          severity: "error",
          filePath,
          location: `${partLocation}.value`,
          message: "File reference must have a 'value' field with the file path"
        });
        continue;
      }
      const { resolvedPath } = await resolveFileReference(value, searchRoots);
      if (!resolvedPath) {
        errors.push({
          severity: "error",
          filePath,
          location: partLocation,
          message: `Referenced file not found: ${value}`
        });
        continue;
      }
      try {
        const fileContent = await readFile5(resolvedPath, "utf8");
        const isBlank = fileContent.trim().length === 0;
        if (isBlank) {
          errors.push({
            severity: "warning",
            filePath,
            location: partLocation,
            message: `Referenced file is empty: ${value}`
          });
        }
      } catch (error) {
        errors.push({
          severity: "error",
          filePath,
          location: partLocation,
          message: `Cannot read referenced file: ${value} (${error.message})`
        });
      }
    }
  }
}
2098
+
2099
+ // src/commands/eval/targets.ts
2100
+ var ANSI_YELLOW = "\x1B[33m";
2101
+ var ANSI_RED = "\x1B[31m";
2102
+ var ANSI_RESET = "\x1B[0m";
2103
/**
 * Reports whether standard output is attached to a terminal.
 *
 * @returns {boolean} `process.stdout.isTTY`, or false when it is nullish.
 */
function isTTY() {
  const { isTTY: attached } = process.stdout;
  return attached ?? false;
}
2106
/**
 * Reads the `target` entry from a test suite's metadata.
 *
 * @param {string} testFilePath - Path of the test suite file.
 * @returns {Promise<unknown>} The `target` property of the metadata
 *   returned by readTestSuiteMetadata.
 */
async function readTestSuiteTarget(testFilePath) {
  const { target } = await readTestSuiteMetadata(testFilePath);
  return target;
}
2110
/**
 * Chooses which target name to use and records where it came from.
 *
 * Precedence: a CLI name that is non-blank and not "default", then a
 * non-blank name from the test file, then the literal "default".
 *
 * @param {{cliTargetName?: string, fileTargetName?: string}} options
 * @returns {{name: string, source: "cli" | "test-file" | "default"}}
 */
function pickTargetName(options) {
  const requested = options.cliTargetName?.trim();
  const cliOverrides = Boolean(requested) && requested !== "default";
  if (cliOverrides) {
    return { name: requested, source: "cli" };
  }
  // Read the file-level name only after the CLI name has been ruled out.
  const declared = options.fileTargetName?.trim();
  return declared
    ? { name: declared, source: "test-file" }
    : { name: "default", source: "default" };
}
2121
/**
 * Prints a heading followed by one line per validation issue. Output is
 * wrapped in the given ANSI colour only when stdout is a TTY.
 *
 * @param {string} targetsFilePath - File named in the heading.
 * @param {Array<{location?: string, message: string}>} issues - Entries to print.
 * @param {{heading: string, symbol: string, color: string, log: (line: string) => void}} style
 */
function reportTargetsFileIssues(targetsFilePath, issues, style) {
  const { heading, symbol, color, log } = style;
  const useColors = isTTY();
  log(`\n${heading} in ${targetsFilePath}:`);
  for (const issue of issues) {
    const location = issue.location ? ` [${issue.location}]` : "";
    const prefix = useColors ? `${color} ${symbol}${ANSI_RESET}` : ` ${symbol}`;
    const message = useColors ? `${color}${issue.message}${ANSI_RESET}` : issue.message;
    log(`${prefix}${location} ${message}`);
  }
}

/**
 * Locates the targets file, validates it, prints warnings and errors, and
 * loads its target definitions.
 *
 * @param {{explicitTargetsPath?: string, testFilePath: string, repoRoot: string, cwd: string}} options
 * @returns {Promise<{targetsFilePath: string, definitions: Array<object>}>}
 * @throws {Error} When validation reports at least one "error" entry.
 */
async function loadValidatedTargetDefinitions(options) {
  const { explicitTargetsPath, testFilePath, repoRoot, cwd } = options;
  const targetsFilePath = await discoverTargetsFile({
    explicitPath: explicitTargetsPath,
    testFilePath,
    repoRoot,
    cwd
  });
  const validationResult = await validateTargetsFile(targetsFilePath);

  const warnings = validationResult.errors.filter((e) => e.severity === "warning");
  if (warnings.length > 0) {
    reportTargetsFileIssues(targetsFilePath, warnings, {
      heading: "Warnings",
      symbol: "\u26A0",
      color: ANSI_YELLOW,
      log: (line) => console.warn(line)
    });
    console.warn("");
  }

  const errors = validationResult.errors.filter((e) => e.severity === "error");
  if (errors.length > 0) {
    reportTargetsFileIssues(targetsFilePath, errors, {
      heading: "Errors",
      symbol: "\u2717",
      color: ANSI_RED,
      log: (line) => console.error(line)
    });
    throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
  }

  const definitions = await readTargetDefinitions(targetsFilePath);
  return { targetsFilePath, definitions };
}

/**
 * Builds the selection record for one named target: a mock target in
 * dry-run mode, otherwise the definition resolved against `env`.
 *
 * @param {object} options - Caller options (dryRun, dryRunDelay*, env, testFilePath).
 * @param {{name: string, source: string, definitions: Array<object>, targetsFilePath: string}} choice
 * @returns {{definitions: Array<object>, resolvedTarget: object, targetName: string, targetSource: string, targetsFilePath: string}}
 * @throws {Error} When the name is not defined or resolution fails.
 */
function buildTargetSelection(options, choice) {
  const { dryRun, dryRunDelay, dryRunDelayMin, dryRunDelayMax, env, testFilePath } = options;
  const { name, source, definitions, targetsFilePath } = choice;

  const targetDefinition = definitions.find((definition) => definition.name === name);
  if (!targetDefinition) {
    const available = listTargetNames(definitions).join(", ");
    throw new Error(
      `Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`
    );
  }

  let resolvedTarget;
  if (dryRun) {
    resolvedTarget = {
      kind: "mock",
      name: `${targetDefinition.name}-dry-run`,
      judgeTarget: void 0,
      config: {
        response: '{"answer":"Mock dry-run response"}',
        delayMs: dryRunDelay,
        delayMinMs: dryRunDelayMin,
        delayMaxMs: dryRunDelayMax
      }
    };
  } else {
    try {
      resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      // Keep the original failure reachable for debugging.
      throw new Error(`Failed to resolve target '${name}': ${message}`, { cause: error });
    }
  }

  return {
    definitions,
    resolvedTarget,
    targetName: name,
    targetSource: source,
    targetsFilePath
  };
}

/**
 * Selects and resolves a single target for a test file.
 *
 * The name comes from pickTargetName, which prefers the CLI name, then the
 * test file's own target, then "default".
 *
 * @param {object} options - testFilePath, repoRoot, cwd, explicitTargetsPath,
 *   cliTargetName, dryRun, dryRunDelay, dryRunDelayMin, dryRunDelayMax, env.
 * @returns {Promise<object>} Selection with definitions, resolvedTarget,
 *   targetName, targetSource and targetsFilePath.
 * @throws {Error} On validation errors, an unknown target name, or a
 *   resolution failure.
 */
async function selectTarget(options) {
  const { testFilePath, cliTargetName } = options;
  const { targetsFilePath, definitions } = await loadValidatedTargetDefinitions(options);
  const fileTargetName = await readTestSuiteTarget(testFilePath);
  const targetChoice = pickTargetName({ cliTargetName, fileTargetName });
  return buildTargetSelection(options, {
    name: targetChoice.name,
    source: targetChoice.source,
    definitions,
    targetsFilePath
  });
}

/**
 * Selects and resolves several named targets from the same targets file.
 * Every returned selection is labelled with targetSource "cli".
 *
 * @param {object} options - testFilePath, repoRoot, cwd, explicitTargetsPath,
 *   dryRun, dryRunDelay, dryRunDelayMin, dryRunDelayMax, env, targetNames.
 * @returns {Promise<Array<object>>} One selection per entry of `targetNames`,
 *   in the same order.
 * @throws {Error} On validation errors, or at the first unknown or
 *   unresolvable target name.
 */
async function selectMultipleTargets(options) {
  const { targetNames } = options;
  const { targetsFilePath, definitions } = await loadValidatedTargetDefinitions(options);
  const results = [];
  for (const name of targetNames) {
    results.push(
      buildTargetSelection(options, { name, source: "cli", definitions, targetsFilePath })
    );
  }
  return results;
}
2306
+
2307
+ // src/commands/eval/run-eval.ts
2308
+ var DEFAULT_WORKERS = 3;
2309
/**
 * Coerces a raw option to a strict boolean.
 *
 * @param {unknown} value - Raw option value.
 * @returns {boolean} True only when `value` is exactly `true`.
 */
function normalizeBoolean(value) {
  return Object.is(value, true);
}
2312
/**
 * Trims a raw option and treats blank or non-string input as absent.
 *
 * @param {unknown} value - Raw option value.
 * @returns {string | undefined} The trimmed string, or undefined when the
 *   input is not a string or is empty after trimming.
 */
function normalizeString(value) {
  const trimmed = typeof value === "string" ? value.trim() : "";
  return trimmed === "" ? void 0 : trimmed;
}
2319
/**
 * Converts a raw option to a number, using `fallback` when that fails.
 *
 * Finite numbers pass through unchanged. Strings are parsed with
 * `Number.parseInt(value, 10)`, so a leading integer is accepted and any
 * fractional part is dropped.
 *
 * @param {unknown} value - Raw option value.
 * @param {number} fallback - Result when `value` yields no number.
 * @returns {number} The normalized number or `fallback`.
 */
function normalizeNumber(value, fallback) {
  switch (typeof value) {
    case "number":
      return Number.isFinite(value) ? value : fallback;
    case "string": {
      const parsed = Number.parseInt(value, 10);
      return Number.isNaN(parsed) ? fallback : parsed;
    }
    default:
      return fallback;
  }
}
2331
/**
 * Converts a raw option to a number, or undefined when that fails.
 *
 * Finite numbers pass through unchanged. Strings are parsed with
 * `Number.parseInt(value, 10)`; every other input yields undefined.
 *
 * @param {unknown} value - Raw option value.
 * @returns {number | undefined} The normalized number, if any.
 */
function normalizeOptionalNumber(value) {
  const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
  if (isFiniteNumber) {
    return value;
  }
  if (typeof value !== "string") {
    return void 0;
  }
  const parsed = Number.parseInt(value, 10);
  return Number.isNaN(parsed) ? void 0 : parsed;
}
2343
/**
 * Merges raw CLI options with the loaded config into one normalized object.
 *
 * CLI values win over config values. Defaults applied here: format "jsonl",
 * agent timeout 120 seconds, max retries 2, dry-run delays 0. `workers` is
 * undefined unless a positive number was supplied.
 *
 * @param {object} rawOptions - Options as received from the CLI parser.
 * @param {object | null | undefined} config - Loaded config, if any.
 * @returns {object} Normalized options for the eval command.
 */
function normalizeOptions(rawOptions, config) {
  const isNonBlankString = (candidate) =>
    typeof candidate === "string" && candidate.trim().length > 0;

  // Anything other than "yaml" collapses to "jsonl".
  const requestedFormat =
    normalizeString(rawOptions.outputFormat) ?? config?.output?.format ?? "jsonl";
  const format = requestedFormat === "yaml" ? "yaml" : "jsonl";

  const requestedWorkers =
    normalizeOptionalNumber(rawOptions.workers) ?? config?.execution?.workers ?? 0;

  const outputPaths = Array.isArray(rawOptions.output)
    ? rawOptions.output.filter(isNonBlankString)
    : [];

  // `target` may be repeated (array) or given once (string).
  const rawTarget = rawOptions.target;
  let cliTargets = [];
  let singleTarget;
  if (Array.isArray(rawTarget)) {
    cliTargets = rawTarget.filter(isNonBlankString);
    if (cliTargets.length === 1) {
      singleTarget = cliTargets[0];
    }
  } else if (typeof rawTarget === "string") {
    const trimmed = rawTarget.trim();
    if (trimmed !== "" && trimmed !== "default") {
      cliTargets = [trimmed];
      singleTarget = trimmed;
    }
  }

  // Config stores the agent timeout in milliseconds; options use seconds.
  const configTimeoutMs = config?.execution?.agentTimeoutMs;
  const configAgentTimeoutSeconds = configTimeoutMs != null ? configTimeoutMs / 1e3 : void 0;
  const cliAgentTimeout = normalizeOptionalNumber(rawOptions.agentTimeout);

  const cliMaxRetries = normalizeOptionalNumber(rawOptions.maxRetries);
  const configMaxRetries = config?.execution?.maxRetries;

  // --cache always enables; otherwise config may enable unless --no-cache.
  const cliCache = normalizeBoolean(rawOptions.cache);
  const cliNoCache = normalizeBoolean(rawOptions.noCache);
  const configEnablesCache = config?.cache?.enabled === true;
  const resolvedCache = cliCache || (!cliNoCache && configEnablesCache);

  const outPath = normalizeString(rawOptions.out) ?? config?.output?.dir;

  return {
    target: singleTarget,
    cliTargets,
    targetsPath: normalizeString(rawOptions.targets),
    filter: normalizeString(rawOptions.filter),
    workers: requestedWorkers > 0 ? requestedWorkers : void 0,
    outPath,
    outputPaths,
    format,
    dryRun: normalizeBoolean(rawOptions.dryRun),
    dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
    dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0),
    dryRunDelayMax: normalizeNumber(rawOptions.dryRunDelayMax, 0),
    agentTimeoutSeconds: cliAgentTimeout ?? configAgentTimeoutSeconds ?? 120,
    maxRetries: cliMaxRetries ?? configMaxRetries ?? 2,
    cache: resolvedCache,
    noCache: cliNoCache,
    verbose: normalizeBoolean(rawOptions.verbose),
    keepWorkspaces: normalizeBoolean(rawOptions.keepWorkspaces),
    cleanupWorkspaces: normalizeBoolean(rawOptions.cleanupWorkspaces),
    trace: normalizeBoolean(rawOptions.trace),
    otelFile: normalizeString(rawOptions.otelFile),
    traceFile: normalizeString(rawOptions.traceFile),
    exportOtel: normalizeBoolean(rawOptions.exportOtel),
    otelBackend: normalizeString(rawOptions.otelBackend),
    otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
    otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns)
  };
}
2406
/**
 * Verifies that a path exists, failing with a readable message otherwise.
 *
 * @param {string} filePath - Path to check.
 * @param {string} description - Human-readable label used in the error.
 * @returns {Promise<void>}
 * @throws {Error} `"<description> not found: <filePath>"` when the access
 *   check rejects for any reason.
 */
async function ensureFileExists(filePath, description) {
  await access4(filePath, constants4.F_OK).catch(() => {
    throw new Error(`${description} not found: ${filePath}`);
  });
}
2413
/**
 * Builds the default results path under `<cwd>/.agentv/results`.
 *
 * The file name is `eval_<timestamp><extension>`, where the timestamp is the
 * current ISO-8601 time with ":" and "." replaced by "-".
 *
 * @param {string} cwd - Directory that holds the `.agentv` folder.
 * @param {string} format - Output format passed to getDefaultExtension.
 * @returns {string} The joined output path.
 */
function buildDefaultOutputPath(cwd, format) {
  const isoNow = (/* @__PURE__ */ new Date()).toISOString();
  const timestamp = isoNow.replace(/[:.]/g, "-");
  const fileName = `eval_${timestamp}${getDefaultExtension(format)}`;
  return path10.join(cwd, ".agentv", "results", fileName);
}
2419
/**
 * Wraps a ProgressDisplay in a small reporter object.
 *
 * `isInteractive` is read once, when the reporter is created.
 *
 * @param {number} maxWorkers - Passed to the ProgressDisplay constructor.
 * @param {object} options - Passed to the ProgressDisplay constructor.
 * @returns {{isInteractive: unknown, start: Function, setTotal: Function, update: Function, finish: Function, addLogPaths: Function}}
 */
function createProgressReporter(maxWorkers, options) {
  const display = new ProgressDisplay(maxWorkers, options);
  const isInteractive = display.isInteractiveMode();
  return {
    isInteractive,
    start() {
      return display.start();
    },
    setTotal(total) {
      return display.setTotalTests(total);
    },
    update(workerId, progress) {
      // The explicit workerId overrides any workerId inside `progress`.
      return display.updateWorker({ ...progress, workerId });
    },
    finish() {
      return display.finish();
    },
    addLogPaths(paths, provider) {
      return display.addLogPaths(paths, provider);
    }
  };
}
2430
/**
 * Builds a key identifying one eval case within one test file.
 *
 * @param {string} testFilePath - Test file path; resolved to absolute.
 * @param {string} evalId - Identifier of the eval case.
 * @returns {string} `<absolute test file path>::<evalId>`.
 */
function makeEvalKey(testFilePath, evalId) {
  const absoluteFile = path10.resolve(testFilePath);
  return `${absoluteFile}::${evalId}`;
}
2433
/**
 * Creates a tracker that hands out sequential display ids, starting at 1.
 * A key always maps to the id it was given on first use.
 *
 * @returns {{getOrAssign: (evalKey: string) => number}}
 */
function createDisplayIdTracker() {
  const assignedIds = /* @__PURE__ */ new Map();
  return {
    getOrAssign(evalKey) {
      if (!assignedIds.has(evalKey)) {
        // Entries are never removed, so size + 1 is the next unused id.
        assignedIds.set(evalKey, assignedIds.size + 1);
      }
      return assignedIds.get(evalKey);
    }
  };
}
2448
/**
 * Applies the CLI verbose flag to a selection whose target kind is "cli".
 *
 * Selections of any other kind are returned unchanged (same reference). For
 * "cli" targets a new selection is returned with `config.verbose` replaced;
 * the input is not mutated.
 *
 * @param {{resolvedTarget: {kind: string, config?: object}}} selection
 * @param {boolean} cliVerbose - Value to store as `config.verbose`.
 * @returns {object} The original or the overridden selection.
 */
function applyVerboseOverride(selection, cliVerbose) {
  const target = selection.resolvedTarget;
  if (target.kind === "cli") {
    const config = { ...target.config, verbose: cliVerbose };
    return { ...selection, resolvedTarget: { ...target, config } };
  }
  return selection;
}
2464
/**
 * Loads one test file and resolves the target(s) it should run against.
 *
 * Target names come from the CLI when any were given, otherwise from the
 * suite's own `targets`. With more than one name, selectMultipleTargets is
 * used; otherwise selectTarget is used.
 *
 * @param {{testFilePath: string, repoRoot: string, cwd: string, options: object}} params
 * @returns {Promise<{evalIds: Array, evalCases: Array, selections: Array<{selection: object, inlineTargetLabel: string}>, trialsConfig: unknown, suiteTargets: unknown, yamlCache: unknown, yamlCachePath: unknown}>}
 */
async function prepareFileMetadata(params) {
  const { testFilePath, repoRoot, cwd, options } = params;
  await ensureFileExists(testFilePath, "Test file");
  await loadEnvFromHierarchy({ testFilePath, repoRoot, verbose: options.verbose });
  const suite = await loadTestSuite(testFilePath, repoRoot, {
    verbose: options.verbose,
    filter: options.filter
  });
  const filteredIds = suite.tests.map((value) => value.id);

  const { cliTargets } = options;
  const suiteTargets = suite.targets;
  let targetNames = [];
  if (cliTargets.length > 0) {
    targetNames = cliTargets;
  } else if (suiteTargets && suiteTargets.length > 0) {
    targetNames = suiteTargets;
  }

  // Options common to both target-selection entry points.
  const sharedSelectionOptions = {
    testFilePath,
    repoRoot,
    cwd,
    explicitTargetsPath: options.targetsPath,
    dryRun: options.dryRun,
    dryRunDelay: options.dryRunDelay,
    dryRunDelayMin: options.dryRunDelayMin,
    dryRunDelayMax: options.dryRunDelayMax,
    env: process.env
  };

  let resolved;
  if (targetNames.length > 1) {
    resolved = await selectMultipleTargets({ ...sharedSelectionOptions, targetNames });
  } else {
    const single = await selectTarget({
      ...sharedSelectionOptions,
      cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target
    });
    resolved = [single];
  }

  const selections = resolved.map((selection) => {
    const { kind } = selection.resolvedTarget;
    const providerLabel = options.dryRun ? `${kind} (dry-run)` : kind;
    return {
      selection,
      inlineTargetLabel: `${selection.targetName} [provider=${providerLabel}]`
    };
  });

  return {
    evalIds: filteredIds,
    evalCases: suite.tests,
    selections,
    trialsConfig: suite.trials,
    suiteTargets,
    yamlCache: suite.cacheConfig?.enabled,
    yamlCachePath: suite.cacheConfig?.cachePath
  };
}
2539
/**
 * Runs `task` once for every item, with at most `limit` tasks in flight.
 *
 * Items are handed out in order from a shared cursor; each worker awaits its
 * current task before taking the next item. The returned promise rejects
 * with the first task failure.
 *
 * @param {Array<unknown>} items - Work items.
 * @param {number} limit - Maximum concurrency. Values below 1, and values
 *   that are not a number (NaN, undefined), are treated as 1; values above
 *   the item count, including Infinity, are capped at the item count.
 * @param {(item: unknown) => Promise<unknown>} task - Async work per item.
 * @returns {Promise<void>} Settles when every worker has finished.
 */
async function runWithLimit(items, limit, task) {
  // Math.max(1, NaN) is NaN, which would create zero workers and silently
  // skip every item, so normalise before clamping.
  const numericLimit = Number(limit);
  const requested = Number.isNaN(numericLimit) ? 1 : Math.floor(numericLimit);
  // Never spawn more workers than items; this also keeps Infinity usable.
  const workerCount = Math.min(Math.max(1, requested), Math.max(1, items.length));

  let cursor = 0;
  const worker = async () => {
    while (cursor < items.length) {
      const current = items[cursor];
      cursor += 1;
      await task(current);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
}
2551
/**
 * Runs one test file against one already-selected target.
 *
 * Steps: confirm the test file exists, apply the CLI verbose flag to the
 * target, decide the worker count (override, then options, then the
 * target's own setting, then DEFAULT_WORKERS), prepare VS Code subagents
 * when the provider needs them, then call `evaluationRunner` with callbacks
 * that stream results to the output writer, the OTel exporter and the
 * progress reporter.
 *
 * @param {object} params - testFilePath, repoRoot, options, outputWriter,
 *   otelExporter, cache, evaluationRunner, workersOverride, progressReporter,
 *   seenEvalCases, displayIdTracker, selection, inlineTargetLabel, evalCases,
 *   trialsConfig, matrixMode.
 * @returns {Promise<{results: Array}>} A copy of the runner's results.
 * @throws {Error} When the resolved worker count is outside 1..50, or when
 *   the test file does not exist.
 */
async function runSingleEvalFile(params) {
  const {
    testFilePath,
    repoRoot,
    options,
    outputWriter,
    otelExporter,
    cache,
    evaluationRunner,
    workersOverride,
    progressReporter,
    seenEvalCases,
    displayIdTracker,
    selection,
    inlineTargetLabel,
    evalCases,
    trialsConfig,
    matrixMode
  } = params;
  const { targetName } = selection;
  await ensureFileExists(testFilePath, "Test file");

  const activeSelection = applyVerboseOverride(selection, options.verbose);
  const target = activeSelection.resolvedTarget;
  const providerLabel = options.dryRun ? `${target.kind} (dry-run)` : target.kind;
  let targetMessage = `Using target: ${inlineTargetLabel}`;
  if (options.verbose) {
    targetMessage = `Using target (${activeSelection.targetSource}): ${activeSelection.targetName} [provider=${providerLabel}] via ${activeSelection.targetsFilePath}`;
  }
  if (!progressReporter.isInteractive || options.verbose) {
    console.log(targetMessage);
  }

  const agentTimeoutMs = Math.max(0, options.agentTimeoutSeconds) * 1e3;

  let resolvedWorkers = workersOverride ?? options.workers ?? target.workers ?? DEFAULT_WORKERS;
  if (resolvedWorkers < 1 || resolvedWorkers > 50) {
    throw new Error(`Workers must be between 1 and 50, got: ${resolvedWorkers}`);
  }

  const isVSCodeProvider = target.kind === "vscode" || target.kind === "vscode-insiders";
  if (isVSCodeProvider && resolvedWorkers > 1) {
    console.warn(
      `Warning: VSCode providers require window focus. Limiting workers from ${resolvedWorkers} to 1 to prevent race conditions.`
    );
    resolvedWorkers = 1;
  }
  if (isVSCodeProvider && !options.dryRun) {
    await ensureVSCodeSubagents({
      kind: target.kind,
      count: resolvedWorkers,
      verbose: options.verbose,
      vscodeCmd: target.config.executable
    });
  }

  const streamingObserver = otelExporter?.createStreamingObserver() ?? null;

  // Caching needs a cache instance and a target for which
  // shouldSkipCacheForTemperature returns false.
  let useCache = Boolean(cache);
  if (useCache && shouldSkipCacheForTemperature(target.config)) {
    if (options.verbose) {
      console.log("Cache skipped: target temperature > 0");
    }
    useCache = false;
  }

  const handleResult = async (result) => {
    streamingObserver?.finalizeEvalCase(result.score, result.error);
    // The `output` field is left out of what gets written.
    const { output: _, ...resultWithoutTrace } = result;
    await outputWriter.append(resultWithoutTrace);
    if (!otelExporter || streamingObserver) {
      return;
    }
    try {
      await otelExporter.exportResult(result);
    } catch (err) {
      if (options.verbose) {
        console.warn(
          `OTel export warning: ${err instanceof Error ? err.message : String(err)}`
        );
      }
    }
  };

  const handleProgress = async (event) => {
    // In matrix mode the target name is appended to the test id.
    const shownTestId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
    const evalKey = makeEvalKey(testFilePath, shownTestId);
    if (event.status === "pending" && !seenEvalCases.has(evalKey)) {
      seenEvalCases.add(evalKey);
      progressReporter.setTotal(seenEvalCases.size);
    }
    const displayId = displayIdTracker.getOrAssign(evalKey);
    if (event.status === "running" && streamingObserver) {
      streamingObserver.startEvalCase(event.testId, targetName, testFilePath);
    }
    progressReporter.update(displayId, {
      workerId: displayId,
      testId: shownTestId,
      status: event.status,
      startedAt: event.startedAt,
      completedAt: event.completedAt,
      error: event.error,
      targetLabel: inlineTargetLabel
    });
  };

  const results = await evaluationRunner({
    testFilePath,
    repoRoot,
    target,
    targets: activeSelection.definitions,
    env: process.env,
    maxRetries: Math.max(0, options.maxRetries),
    agentTimeoutMs,
    cache,
    useCache,
    evalCases,
    verbose: options.verbose,
    maxConcurrency: resolvedWorkers,
    keepWorkspaces: options.keepWorkspaces,
    cleanupWorkspaces: options.cleanupWorkspaces,
    trials: trialsConfig,
    streamCallbacks: streamingObserver?.getStreamCallbacks(),
    onResult: handleResult,
    onProgress: handleProgress
  });
  return { results: [...results] };
}
2671
/**
 * Entry point for the eval command: resolves options, sets up optional OTel
 * export and the result writer(s), registers every eval case with the progress
 * reporter, runs each test file (bounded by the worker budget) and prints the
 * summaries.
 *
 * @param {object} input
 * @param {object} input.rawOptions - Unnormalised CLI options; merged with the
 *   loaded config by `normalizeOptions`.
 * @param {string[]} input.testFiles - Eval file paths; resolved to absolute
 *   paths with `path10.resolve`.
 * @returns {Promise<void>}
 * @throws {Error} When no test matches the provided filters, or when metadata
 *   for a resolved test file is missing.
 */
async function runEvalCommand(input) {
  const cwd = process.cwd();
  let config = null;
  try {
    config = await loadTsConfig(cwd);
  } catch (err) {
    // A broken config is not fatal: continue with CLI options only.
    console.warn(
      `Warning: Failed to load agentv config: ${err instanceof Error ? err.message : String(err)}`
    );
  }
  const options = normalizeOptions(input.rawOptions, config);
  const repoRoot = await findRepoRoot(cwd);
  if (options.keepWorkspaces && options.cleanupWorkspaces) {
    console.warn(
      "Warning: Both --keep-workspaces and --cleanup-workspaces specified. --cleanup-workspaces takes precedence."
    );
  }
  if (options.verbose) {
    console.log(`Repository root: ${repoRoot}`);
  }
  let otelExporter = null;
  const useFileExport = !!(options.otelFile || options.traceFile);
  if (options.exportOtel || useFileExport) {
    try {
      // Loaded lazily so the OTel chunk is only imported when export is requested.
      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-R3OCWGXH.js");
      let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
      let headers = {};
      if (options.otelBackend) {
        const preset = OTEL_BACKEND_PRESETS[options.otelBackend];
        if (preset) {
          endpoint = preset.endpoint;
          headers = preset.headers(process.env);
        } else {
          console.warn(`Unknown OTel backend preset: ${options.otelBackend}`);
        }
      }
      if (process.env.OTEL_EXPORTER_OTLP_HEADERS) {
        // Comma-separated `name=value` pairs; values may themselves contain `=`.
        for (const pair of process.env.OTEL_EXPORTER_OTLP_HEADERS.split(",")) {
          const [rawName, ...rest] = pair.split("=");
          // Trim before testing so a whitespace-only name is skipped instead of
          // registering an empty-string header.
          const headerName = rawName.trim();
          if (headerName) headers[headerName] = rest.join("=").trim();
        }
      }
      const captureContent = options.otelCaptureContent || process.env.AGENTV_OTEL_CAPTURE_CONTENT === "true";
      otelExporter = new OtelTraceExporter({
        endpoint,
        headers,
        captureContent,
        groupTurns: options.otelGroupTurns,
        otlpFilePath: options.otelFile ? path10.resolve(options.otelFile) : void 0,
        traceFilePath: options.traceFile ? path10.resolve(options.traceFile) : void 0
      });
      const initialized = await otelExporter.init();
      if (!initialized) {
        console.warn(
          "OTel export requested but @opentelemetry packages not available. Install them to enable export."
        );
        otelExporter = null;
      }
    } catch (err) {
      // Export is best-effort: the evaluation still runs without it.
      console.warn(
        `OTel export initialization failed: ${err instanceof Error ? err.message : String(err)}`
      );
      otelExporter = null;
    }
  }
  // Cleanup handles are declared before the try block so that `finally` can
  // release whatever was acquired, even when setup fails part-way (for example
  // metadata preparation throws, or no tests match the filters).
  let outputWriter;
  let unsubscribeCodexLogs;
  let unsubscribePiLogs;
  let unsubscribeCopilotLogs;
  try {
    const outputPath = options.outPath ? path10.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
    const extraOutputPaths = options.outputPaths.map((p) => path10.resolve(p));
    const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
    const uniqueOutputPaths = [...new Set(allOutputPaths)];
    if (uniqueOutputPaths.length === 1) {
      outputWriter = await createOutputWriter(outputPath, options.format);
      console.log(`Output path: ${outputPath}`);
    } else {
      outputWriter = await createMultiWriter(uniqueOutputPaths);
      console.log("Output paths:");
      for (const p of uniqueOutputPaths) {
        console.log(` ${p}`);
      }
    }
    const resolvedTestFiles = input.testFiles.map((file) => path10.resolve(file));
    if (options.otelFile) {
      console.log(`OTLP JSON file: ${path10.resolve(options.otelFile)}`);
    }
    if (options.traceFile) {
      console.log(`Trace file: ${path10.resolve(options.traceFile)}`);
    }
    const evaluationRunner = await resolveEvaluationRunner();
    const allResults = [];
    const seenEvalCases = /* @__PURE__ */ new Set();
    const displayIdTracker = createDisplayIdTracker();
    const totalWorkers = options.workers ?? DEFAULT_WORKERS;
    // Run at most one file per worker, and never more files than exist.
    const fileConcurrency = Math.min(
      Math.max(1, totalWorkers),
      Math.max(1, resolvedTestFiles.length)
    );
    // Only split the budget across files when the user set one explicitly;
    // otherwise pass no override.
    const perFileWorkers = options.workers ? Math.max(1, Math.floor(totalWorkers / fileConcurrency)) : void 0;
    const fileMetadata = /* @__PURE__ */ new Map();
    for (const testFilePath of resolvedTestFiles) {
      const meta = await prepareFileMetadata({
        testFilePath,
        repoRoot,
        cwd,
        options
      });
      fileMetadata.set(testFilePath, meta);
    }
    // Cache settings are taken from the first file's metadata only.
    const firstMeta = fileMetadata.values().next().value;
    const yamlCacheEnabled = firstMeta?.yamlCache;
    const yamlCachePath = firstMeta?.yamlCachePath;
    const cacheEnabled = shouldEnableCache({
      cliCache: options.cache,
      cliNoCache: options.noCache,
      yamlCache: yamlCacheEnabled
    });
    const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path10.resolve(yamlCachePath) : void 0) : void 0;
    if (cacheEnabled) {
      console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
    }
    const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
    let totalEvalCount = 0;
    for (const meta of fileMetadata.values()) {
      const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
      for (const test of meta.evalCases) {
        // A test with its own `targets` list counts once per listed target that
        // the suite selected; otherwise once per suite target (minimum 1).
        const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
        totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
      }
    }
    if (totalEvalCount === 0) {
      throw new Error("No tests matched the provided filters.");
    }
    const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
    progressReporter.start();
    progressReporter.setTotal(totalEvalCount);
    // Forward each provider log file path to the reporter once per distinct path.
    const seenCodexLogPaths = /* @__PURE__ */ new Set();
    unsubscribeCodexLogs = subscribeToCodexLogEntries((entry) => {
      if (!entry.filePath || seenCodexLogPaths.has(entry.filePath)) {
        return;
      }
      seenCodexLogPaths.add(entry.filePath);
      progressReporter.addLogPaths([entry.filePath], "codex");
    });
    const seenPiLogPaths = /* @__PURE__ */ new Set();
    unsubscribePiLogs = subscribeToPiLogEntries((entry) => {
      if (!entry.filePath || seenPiLogPaths.has(entry.filePath)) {
        return;
      }
      seenPiLogPaths.add(entry.filePath);
      progressReporter.addLogPaths([entry.filePath], "pi");
    });
    const seenCopilotLogPaths = /* @__PURE__ */ new Set();
    unsubscribeCopilotLogs = subscribeToCopilotSdkLogEntries((entry) => {
      if (!entry.filePath || seenCopilotLogPaths.has(entry.filePath)) {
        return;
      }
      seenCopilotLogPaths.add(entry.filePath);
      progressReporter.addLogPaths([entry.filePath], "copilot");
    });
    // Pre-register every (file, target, test) combination as "pending" before
    // any file starts running.
    for (const [testFilePath, meta] of fileMetadata.entries()) {
      for (const { selection, inlineTargetLabel } of meta.selections) {
        for (const testId of meta.evalIds) {
          const evalKey = makeEvalKey(
            testFilePath,
            meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId
          );
          seenEvalCases.add(evalKey);
          const displayId = displayIdTracker.getOrAssign(evalKey);
          progressReporter.update(displayId, {
            workerId: displayId,
            testId: meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId,
            status: "pending",
            targetLabel: inlineTargetLabel
          });
        }
      }
    }
    await runWithLimit(resolvedTestFiles, fileConcurrency, async (testFilePath) => {
      const targetPrep = fileMetadata.get(testFilePath);
      if (!targetPrep) {
        throw new Error(`Missing metadata for ${testFilePath}`);
      }
      for (const { selection, inlineTargetLabel } of targetPrep.selections) {
        const targetName = selection.targetName;
        // With several targets, a test that lists its own targets only runs
        // against the ones it names.
        const applicableEvalCases = targetPrep.selections.length > 1 ? targetPrep.evalCases.filter((test) => {
          if (test.targets && test.targets.length > 0) {
            return test.targets.includes(targetName);
          }
          return true;
        }) : targetPrep.evalCases;
        if (applicableEvalCases.length === 0) {
          continue;
        }
        const result = await runSingleEvalFile({
          testFilePath,
          cwd,
          repoRoot,
          options,
          outputWriter,
          otelExporter,
          cache,
          evaluationRunner,
          workersOverride: perFileWorkers,
          progressReporter,
          seenEvalCases,
          displayIdTracker,
          selection,
          inlineTargetLabel,
          evalCases: applicableEvalCases,
          trialsConfig: targetPrep.trialsConfig,
          matrixMode: targetPrep.selections.length > 1
        });
        allResults.push(...result.results);
      }
    });
    progressReporter.finish();
    const summary = calculateEvaluationSummary(allResults);
    console.log(formatEvaluationSummary(summary));
    if (isMatrixMode && allResults.length > 0) {
      console.log(formatMatrixSummary(allResults));
    }
    const failedWithWorkspaces = allResults.filter(
      (r) => r.workspacePath && (r.error || r.score < 0.5)
    );
    if (failedWithWorkspaces.length > 0) {
      console.log("\nWorkspaces preserved for debugging:");
      for (const result of failedWithWorkspaces) {
        console.log(` ${result.testId}: ${result.workspacePath}`);
      }
    }
    if (allResults.length > 0) {
      if (uniqueOutputPaths.length === 1) {
        console.log(`\nResults written to: ${outputPath}`);
      } else {
        console.log("\nResults written to:");
        for (const p of uniqueOutputPaths) {
          console.log(` ${p}`);
        }
      }
    }
  } finally {
    // Each handle is guarded because setup may have failed before it was created.
    unsubscribeCodexLogs?.();
    unsubscribePiLogs?.();
    unsubscribeCopilotLogs?.();
    if (outputWriter) {
      // Best-effort close: a close failure must not mask the original error.
      await outputWriter.close().catch(() => void 0);
    }
    if (otelExporter) {
      try {
        await otelExporter.shutdown();
      } catch {
        // Deliberately ignored: exporter shutdown is best-effort.
      }
    }
  }
}
2926
/**
 * Picks the evaluation runner to use.
 *
 * When the `AGENTEVO_CLI_EVAL_RUNNER` environment variable is unset or empty,
 * the built-in `runEvaluation` is returned. Otherwise the variable is treated
 * as a module path (relative paths are resolved against the current working
 * directory), the module is dynamically imported, and its `runEvaluation`
 * export is returned.
 *
 * @returns {Promise<Function>} The runner function to invoke.
 * @throws {Error} When the override module has no `runEvaluation` function export.
 */
async function resolveEvaluationRunner() {
  const runnerOverride = process.env.AGENTEVO_CLI_EVAL_RUNNER;
  if (runnerOverride) {
    let modulePath = runnerOverride;
    if (!path10.isAbsolute(modulePath)) {
      modulePath = path10.resolve(process.cwd(), modulePath);
    }
    // Dynamic import needs a file:// URL rather than a bare filesystem path.
    const { runEvaluation: customRunner } = await import(pathToFileURL(modulePath).href);
    if (typeof customRunner === "function") {
      return customRunner;
    }
    throw new Error(
      `Module '${modulePath}' must export a 'runEvaluation' function to override the default implementation`
    );
  }
  return runEvaluation;
}
2942
+
2943
+ export {
2944
+ toSnakeCaseDeep,
2945
+ resolveEvalPaths,
2946
+ findRepoRoot,
2947
+ detectFileType,
2948
+ validateEvalFile,
2949
+ validateTargetsFile,
2950
+ validateConfigFile,
2951
+ validateFileReferences,
2952
+ TARGET_FILE_CANDIDATES,
2953
+ fileExists,
2954
+ selectTarget,
2955
+ runEvalCommand
2956
+ };
2957
+ //# sourceMappingURL=chunk-H5FFZCKI.js.map