agentv 2.19.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/README.md +62 -36
  2. package/dist/templates/.agentv/config.yaml +5 -0
  3. package/dist/templates/.agentv/targets.yaml +7 -7
  4. package/package.json +2 -2
  5. package/dist/chunk-3L2L5GIL.js +0 -51
  6. package/dist/chunk-3L2L5GIL.js.map +0 -1
  7. package/dist/chunk-4MSAOMCC.js +0 -3338
  8. package/dist/chunk-4MSAOMCC.js.map +0 -1
  9. package/dist/chunk-5H446C7X.js +0 -50
  10. package/dist/chunk-5H446C7X.js.map +0 -1
  11. package/dist/chunk-BL4PVUAT.js +0 -261
  12. package/dist/chunk-BL4PVUAT.js.map +0 -1
  13. package/dist/chunk-C5GOHBQM.js +0 -84
  14. package/dist/chunk-C5GOHBQM.js.map +0 -1
  15. package/dist/chunk-FV32QHPB.js +0 -565
  16. package/dist/chunk-FV32QHPB.js.map +0 -1
  17. package/dist/chunk-GC6T3RD4.js +0 -5954
  18. package/dist/chunk-GC6T3RD4.js.map +0 -1
  19. package/dist/chunk-JK6V4KVD.js +0 -114
  20. package/dist/chunk-JK6V4KVD.js.map +0 -1
  21. package/dist/chunk-LRULMAAA.js +0 -1711
  22. package/dist/chunk-LRULMAAA.js.map +0 -1
  23. package/dist/chunk-SR4I5KET.js +0 -1238
  24. package/dist/chunk-SR4I5KET.js.map +0 -1
  25. package/dist/chunk-VQ2ZO7XJ.js +0 -2098
  26. package/dist/chunk-VQ2ZO7XJ.js.map +0 -1
  27. package/dist/chunk-XALGXSKB.js +0 -21
  28. package/dist/chunk-XALGXSKB.js.map +0 -1
  29. package/dist/chunk-XTYMR4I5.js +0 -49811
  30. package/dist/chunk-XTYMR4I5.js.map +0 -1
  31. package/dist/cli.js +0 -28
  32. package/dist/cli.js.map +0 -1
  33. package/dist/dist-MQBGD6LP.js +0 -289
  34. package/dist/dist-MQBGD6LP.js.map +0 -1
  35. package/dist/esm-5Q4BZALM-5REQWAUV.js +0 -924
  36. package/dist/esm-5Q4BZALM-5REQWAUV.js.map +0 -1
  37. package/dist/esm-DX3WQKEN.js +0 -32
  38. package/dist/esm-DX3WQKEN.js.map +0 -1
  39. package/dist/esm-QNEMCJPL.js +0 -933
  40. package/dist/esm-QNEMCJPL.js.map +0 -1
  41. package/dist/esm-R77SNOF5.js +0 -65
  42. package/dist/esm-R77SNOF5.js.map +0 -1
  43. package/dist/esm-RVQPUGWH.js +0 -1207
  44. package/dist/esm-RVQPUGWH.js.map +0 -1
  45. package/dist/getMachineId-bsd-HSK5LZMG.js +0 -41
  46. package/dist/getMachineId-bsd-HSK5LZMG.js.map +0 -1
  47. package/dist/getMachineId-darwin-4DP6CCJV.js +0 -41
  48. package/dist/getMachineId-darwin-4DP6CCJV.js.map +0 -1
  49. package/dist/getMachineId-linux-44LJ5UJB.js +0 -33
  50. package/dist/getMachineId-linux-44LJ5UJB.js.map +0 -1
  51. package/dist/getMachineId-unsupported-NVK6IATM.js +0 -24
  52. package/dist/getMachineId-unsupported-NVK6IATM.js.map +0 -1
  53. package/dist/getMachineId-win-YZ36S7VA.js +0 -43
  54. package/dist/getMachineId-win-YZ36S7VA.js.map +0 -1
  55. package/dist/index.js +0 -19
  56. package/dist/index.js.map +0 -1
  57. package/dist/interactive-3TDBCSDW.js +0 -333
  58. package/dist/interactive-3TDBCSDW.js.map +0 -1
  59. package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js +0 -9
  60. package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js.map +0 -1
  61. package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js +0 -9
  62. package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js.map +0 -1
  63. package/dist/src-2N5EJ2N6.js +0 -1733
  64. package/dist/src-2N5EJ2N6.js.map +0 -1
  65. package/dist/token-POXF46NU.js +0 -66
  66. package/dist/token-POXF46NU.js.map +0 -1
  67. package/dist/token-util-6GWYZWGE.js +0 -8
  68. package/dist/token-util-6GWYZWGE.js.map +0 -1
@@ -1,3338 +0,0 @@
1
- import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
- import {
3
- CLI_PLACEHOLDERS,
4
- KNOWN_PROVIDERS,
5
- PROVIDER_ALIASES,
6
- ResponseCache,
7
- buildDirectoryChain,
8
- buildSearchRoots,
9
- ensureVSCodeSubagents,
10
- findGitRoot,
11
- isEvaluatorKind,
12
- listTargetNames,
13
- loadConfig,
14
- loadTestSuite,
15
- loadTsConfig,
16
- normalizeLineEndings,
17
- readTargetDefinitions,
18
- readTestSuiteMetadata,
19
- resolveFileReference,
20
- resolveTargetDefinition,
21
- runEvaluation,
22
- shouldEnableCache,
23
- shouldSkipCacheForTemperature,
24
- subscribeToCodexLogEntries,
25
- subscribeToCopilotCliLogEntries,
26
- subscribeToCopilotSdkLogEntries,
27
- subscribeToPiLogEntries
28
- } from "./chunk-XTYMR4I5.js";
29
-
30
- // package.json
31
- var package_default = {
32
- name: "agentv",
33
- version: "2.19.0",
34
- description: "CLI entry point for AgentV",
35
- type: "module",
36
- repository: {
37
- type: "git",
38
- url: "https://github.com/EntityProcess/agentv.git"
39
- },
40
- homepage: "https://github.com/EntityProcess/agentv#readme",
41
- bugs: {
42
- url: "https://github.com/EntityProcess/agentv/issues"
43
- },
44
- bin: {
45
- agentv: "./dist/cli.js"
46
- },
47
- files: ["dist", "README.md"],
48
- scripts: {
49
- dev: "bun src/cli.ts",
50
- build: "tsup && bun run copy-readme",
51
- "copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
52
- prepublishOnly: "bun run copy-readme",
53
- typecheck: "tsc --noEmit",
54
- lint: "biome check .",
55
- format: "biome format --write .",
56
- fix: "biome check --write .",
57
- test: "bun test",
58
- "test:watch": "bun test --watch"
59
- },
60
- dependencies: {
61
- "@anthropic-ai/claude-agent-sdk": "^0.2.49",
62
- "@github/copilot-sdk": "^0.1.25",
63
- "@inquirer/prompts": "^8.2.1",
64
- "@mariozechner/pi-agent-core": "^0.54.2",
65
- "@mariozechner/pi-ai": "^0.54.2",
66
- "@openai/codex-sdk": "^0.104.0",
67
- "cmd-ts": "^0.14.3",
68
- dotenv: "^16.4.5",
69
- "fast-glob": "^3.3.3",
70
- json5: "^2.2.3",
71
- micromatch: "^4.0.8",
72
- semver: "^7.7.4",
73
- yaml: "^2.6.1"
74
- },
75
- devDependencies: {
76
- "@agentv/core": "workspace:*",
77
- "@types/semver": "^7.7.1",
78
- execa: "^9.3.0"
79
- }
80
- };
81
-
82
- // src/commands/eval/shared.ts
83
- import { constants } from "node:fs";
84
- import { access, stat } from "node:fs/promises";
85
- import path from "node:path";
86
- import fg from "fast-glob";
87
- async function resolveEvalPaths(evalPaths, cwd) {
88
- const normalizedInputs = evalPaths.map((value) => value?.trim()).filter((value) => value);
89
- if (normalizedInputs.length === 0) {
90
- throw new Error("No eval paths provided.");
91
- }
92
- const unmatched = [];
93
- const results = /* @__PURE__ */ new Set();
94
- for (const pattern of normalizedInputs) {
95
- const candidatePath = path.isAbsolute(pattern) ? path.normalize(pattern) : path.resolve(cwd, pattern);
96
- try {
97
- const stats = await stat(candidatePath);
98
- if (stats.isFile() && /\.(ya?ml|jsonl)$/i.test(candidatePath)) {
99
- results.add(candidatePath);
100
- continue;
101
- }
102
- } catch {
103
- }
104
- const globPattern = pattern.includes("\\") ? pattern.replace(/\\/g, "/") : pattern;
105
- const matches = await fg(globPattern, {
106
- cwd,
107
- absolute: true,
108
- onlyFiles: true,
109
- unique: true,
110
- dot: true,
111
- followSymbolicLinks: true
112
- });
113
- const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl)$/i.test(filePath));
114
- if (yamlMatches.length === 0) {
115
- unmatched.push(pattern);
116
- continue;
117
- }
118
- for (const filePath of yamlMatches) {
119
- results.add(path.normalize(filePath));
120
- }
121
- }
122
- if (unmatched.length > 0) {
123
- throw new Error(
124
- `No eval files matched: ${unmatched.join(
125
- ", "
126
- )}. Provide YAML or JSONL paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl").`
127
- );
128
- }
129
- const sorted = Array.from(results);
130
- sorted.sort();
131
- return sorted;
132
- }
133
- async function findRepoRoot(start) {
134
- const fallback = path.resolve(start);
135
- let current = fallback;
136
- while (current !== void 0) {
137
- const candidate = path.join(current, ".git");
138
- try {
139
- await access(candidate, constants.F_OK);
140
- return current;
141
- } catch {
142
- const parent = path.dirname(current);
143
- if (parent === current) {
144
- break;
145
- }
146
- current = parent;
147
- }
148
- }
149
- return fallback;
150
- }
151
-
152
- // src/utils/targets.ts
153
- import { constants as constants2 } from "node:fs";
154
- import { access as access2 } from "node:fs/promises";
155
- import path2 from "node:path";
156
- var TARGET_FILE_CANDIDATES = [
157
- "targets.yaml",
158
- "targets.yml",
159
- path2.join(".agentv", "targets.yaml"),
160
- path2.join(".agentv", "targets.yml")
161
- ];
162
- async function fileExists(filePath) {
163
- try {
164
- await access2(filePath, constants2.F_OK);
165
- return true;
166
- } catch {
167
- return false;
168
- }
169
- }
170
- async function discoverTargetsFile(options) {
171
- const { explicitPath, testFilePath, repoRoot, cwd } = options;
172
- if (explicitPath) {
173
- const resolvedExplicit = path2.resolve(explicitPath);
174
- if (await fileExists(resolvedExplicit)) {
175
- return resolvedExplicit;
176
- }
177
- for (const candidate of TARGET_FILE_CANDIDATES) {
178
- const nested = path2.join(resolvedExplicit, candidate);
179
- if (await fileExists(nested)) {
180
- return nested;
181
- }
182
- }
183
- throw new Error(`targets.yaml not found at provided path: ${resolvedExplicit}`);
184
- }
185
- const directories = [...buildDirectoryChain(testFilePath, repoRoot)];
186
- const resolvedCwd = path2.resolve(cwd);
187
- if (!directories.includes(resolvedCwd)) {
188
- directories.push(resolvedCwd);
189
- }
190
- for (const directory of directories) {
191
- for (const candidate of TARGET_FILE_CANDIDATES) {
192
- const fullPath = path2.join(directory, candidate);
193
- if (await fileExists(fullPath)) {
194
- return fullPath;
195
- }
196
- }
197
- }
198
- throw new Error("Unable to locate targets.yaml. Use --targets to specify the file explicitly.");
199
- }
200
-
201
- // src/commands/eval/run-eval.ts
202
- import { constants as constants4 } from "node:fs";
203
- import { access as access4 } from "node:fs/promises";
204
- import path10 from "node:path";
205
- import { pathToFileURL } from "node:url";
206
-
207
- // src/version-check.ts
208
- import { satisfies, validRange } from "semver";
209
- var ANSI_YELLOW = "\x1B[33m";
210
- var ANSI_RED = "\x1B[31m";
211
- var ANSI_RESET = "\x1B[0m";
212
- function checkVersion(requiredVersion) {
213
- const currentVersion = package_default.version;
214
- if (!requiredVersion.trim() || !validRange(requiredVersion)) {
215
- throw new Error(
216
- `Invalid required_version "${requiredVersion}" in .agentv/config.yaml. Must be a valid semver range (e.g., ">=2.11.0", "^2.11.0").`
217
- );
218
- }
219
- return {
220
- satisfied: satisfies(currentVersion, requiredVersion),
221
- currentVersion,
222
- requiredRange: requiredVersion
223
- };
224
- }
225
- async function enforceRequiredVersion(requiredVersion, options) {
226
- let result;
227
- try {
228
- result = checkVersion(requiredVersion);
229
- } catch (err) {
230
- console.error(`${ANSI_RED}Error: ${err.message}${ANSI_RESET}`);
231
- process.exit(1);
232
- }
233
- if (result.satisfied) {
234
- return;
235
- }
236
- const warning = `${ANSI_YELLOW}Warning: This project requires agentv ${result.requiredRange} but you have ${result.currentVersion}.${ANSI_RESET}
237
- Run \`agentv self update\` to upgrade.`;
238
- if (options?.strict) {
239
- console.error(warning);
240
- console.error(
241
- `${ANSI_RED}Aborting: --strict mode requires the installed version to satisfy the required range.${ANSI_RESET}`
242
- );
243
- process.exit(1);
244
- }
245
- if (process.stdin.isTTY && process.stdout.isTTY) {
246
- console.warn(warning);
247
- const shouldContinue = await promptContinue();
248
- if (!shouldContinue) {
249
- process.exit(1);
250
- }
251
- } else {
252
- process.stderr.write(`${warning}
253
- `);
254
- }
255
- }
256
- async function promptContinue() {
257
- const { confirm } = await import("@inquirer/prompts");
258
- return confirm({ message: "Continue anyway?", default: false });
259
- }
260
-
261
- // src/commands/eval/env.ts
262
- import { constants as constants3 } from "node:fs";
263
- import { access as access3 } from "node:fs/promises";
264
- import path3 from "node:path";
265
- import { config as loadDotenv } from "dotenv";
266
- function uniqueDirs(directories) {
267
- const seen = /* @__PURE__ */ new Set();
268
- const result = [];
269
- for (const dir of directories) {
270
- const absolute = path3.resolve(dir);
271
- if (seen.has(absolute)) {
272
- continue;
273
- }
274
- seen.add(absolute);
275
- result.push(absolute);
276
- }
277
- return result;
278
- }
279
- async function fileExists2(filePath) {
280
- try {
281
- await access3(filePath, constants3.F_OK);
282
- return true;
283
- } catch {
284
- return false;
285
- }
286
- }
287
- function collectAncestorDirectories(start, boundary) {
288
- const directories = [];
289
- const boundaryDir = path3.resolve(boundary);
290
- let current = path3.resolve(start);
291
- while (current !== void 0) {
292
- directories.push(current);
293
- if (current === boundaryDir) {
294
- break;
295
- }
296
- const parent = path3.dirname(current);
297
- if (parent === current) {
298
- break;
299
- }
300
- current = parent;
301
- }
302
- return directories;
303
- }
304
- async function loadEnvFromHierarchy(options) {
305
- const { testFilePath, repoRoot, verbose } = options;
306
- const testDir = path3.dirname(path3.resolve(testFilePath));
307
- const cwd = process.cwd();
308
- const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
309
- const envFiles = [];
310
- for (const dir of searchDirs) {
311
- const candidate = path3.join(dir, ".env");
312
- if (await fileExists2(candidate)) {
313
- envFiles.push(candidate);
314
- }
315
- }
316
- if (envFiles.length === 0) {
317
- if (verbose) {
318
- console.log("No .env file found in hierarchy");
319
- }
320
- return void 0;
321
- }
322
- for (let i = envFiles.length - 1; i >= 0; i--) {
323
- const envFile = envFiles[i];
324
- loadDotenv({ path: envFile, override: false });
325
- if (verbose) {
326
- console.log(`Loaded environment from: ${envFile}`);
327
- }
328
- }
329
- return envFiles[0];
330
- }
331
-
332
- // src/commands/eval/output-writer.ts
333
- import path8 from "node:path";
334
-
335
- // src/commands/eval/json-writer.ts
336
- import { mkdir, writeFile } from "node:fs/promises";
337
- import path4 from "node:path";
338
-
339
- // src/utils/case-conversion.ts
340
- function toSnakeCase(str) {
341
- if (/^[A-Z]/.test(str)) {
342
- return str;
343
- }
344
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
345
- }
346
- function toSnakeCaseDeep(obj) {
347
- if (obj === null || obj === void 0) {
348
- return obj;
349
- }
350
- if (Array.isArray(obj)) {
351
- return obj.map((item) => toSnakeCaseDeep(item));
352
- }
353
- if (typeof obj === "object") {
354
- const result = {};
355
- for (const [key, value] of Object.entries(obj)) {
356
- const snakeKey = toSnakeCase(key);
357
- result[snakeKey] = toSnakeCaseDeep(value);
358
- }
359
- return result;
360
- }
361
- return obj;
362
- }
363
-
364
- // src/commands/eval/json-writer.ts
365
- var JsonWriter = class _JsonWriter {
366
- filePath;
367
- results = [];
368
- closed = false;
369
- constructor(filePath) {
370
- this.filePath = filePath;
371
- }
372
- static async open(filePath) {
373
- await mkdir(path4.dirname(filePath), { recursive: true });
374
- return new _JsonWriter(filePath);
375
- }
376
- async append(result) {
377
- if (this.closed) {
378
- throw new Error("Cannot write to closed JSON writer");
379
- }
380
- this.results.push(result);
381
- }
382
- async close() {
383
- if (this.closed) {
384
- return;
385
- }
386
- this.closed = true;
387
- const passed = this.results.filter((r) => r.score >= 0.5).length;
388
- const failed = this.results.length - passed;
389
- const total = this.results.length;
390
- const output = {
391
- stats: {
392
- total,
393
- passed,
394
- failed,
395
- passRate: total > 0 ? passed / total : 0
396
- },
397
- results: this.results
398
- };
399
- const snakeCaseOutput = toSnakeCaseDeep(output);
400
- await writeFile(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}
401
- `, "utf8");
402
- }
403
- };
404
-
405
- // src/commands/eval/jsonl-writer.ts
406
- import { createWriteStream } from "node:fs";
407
- import { mkdir as mkdir2 } from "node:fs/promises";
408
- import path5 from "node:path";
409
- import { finished } from "node:stream/promises";
410
-
411
- // ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
412
- var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
413
- var E_ALREADY_LOCKED = new Error("mutex already locked");
414
- var E_CANCELED = new Error("request for lock canceled");
415
- var __awaiter$2 = function(thisArg, _arguments, P, generator) {
416
- function adopt(value) {
417
- return value instanceof P ? value : new P(function(resolve) {
418
- resolve(value);
419
- });
420
- }
421
- return new (P || (P = Promise))(function(resolve, reject) {
422
- function fulfilled(value) {
423
- try {
424
- step(generator.next(value));
425
- } catch (e) {
426
- reject(e);
427
- }
428
- }
429
- function rejected(value) {
430
- try {
431
- step(generator["throw"](value));
432
- } catch (e) {
433
- reject(e);
434
- }
435
- }
436
- function step(result) {
437
- result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected);
438
- }
439
- step((generator = generator.apply(thisArg, _arguments || [])).next());
440
- });
441
- };
442
- var Semaphore = class {
443
- constructor(_value, _cancelError = E_CANCELED) {
444
- this._value = _value;
445
- this._cancelError = _cancelError;
446
- this._queue = [];
447
- this._weightedWaiters = [];
448
- }
449
- acquire(weight = 1, priority = 0) {
450
- if (weight <= 0)
451
- throw new Error(`invalid weight ${weight}: must be positive`);
452
- return new Promise((resolve, reject) => {
453
- const task = { resolve, reject, weight, priority };
454
- const i = findIndexFromEnd(this._queue, (other) => priority <= other.priority);
455
- if (i === -1 && weight <= this._value) {
456
- this._dispatchItem(task);
457
- } else {
458
- this._queue.splice(i + 1, 0, task);
459
- }
460
- });
461
- }
462
- runExclusive(callback_1) {
463
- return __awaiter$2(this, arguments, void 0, function* (callback, weight = 1, priority = 0) {
464
- const [value, release] = yield this.acquire(weight, priority);
465
- try {
466
- return yield callback(value);
467
- } finally {
468
- release();
469
- }
470
- });
471
- }
472
- waitForUnlock(weight = 1, priority = 0) {
473
- if (weight <= 0)
474
- throw new Error(`invalid weight ${weight}: must be positive`);
475
- if (this._couldLockImmediately(weight, priority)) {
476
- return Promise.resolve();
477
- } else {
478
- return new Promise((resolve) => {
479
- if (!this._weightedWaiters[weight - 1])
480
- this._weightedWaiters[weight - 1] = [];
481
- insertSorted(this._weightedWaiters[weight - 1], { resolve, priority });
482
- });
483
- }
484
- }
485
- isLocked() {
486
- return this._value <= 0;
487
- }
488
- getValue() {
489
- return this._value;
490
- }
491
- setValue(value) {
492
- this._value = value;
493
- this._dispatchQueue();
494
- }
495
- release(weight = 1) {
496
- if (weight <= 0)
497
- throw new Error(`invalid weight ${weight}: must be positive`);
498
- this._value += weight;
499
- this._dispatchQueue();
500
- }
501
- cancel() {
502
- this._queue.forEach((entry) => entry.reject(this._cancelError));
503
- this._queue = [];
504
- }
505
- _dispatchQueue() {
506
- this._drainUnlockWaiters();
507
- while (this._queue.length > 0 && this._queue[0].weight <= this._value) {
508
- this._dispatchItem(this._queue.shift());
509
- this._drainUnlockWaiters();
510
- }
511
- }
512
- _dispatchItem(item) {
513
- const previousValue = this._value;
514
- this._value -= item.weight;
515
- item.resolve([previousValue, this._newReleaser(item.weight)]);
516
- }
517
- _newReleaser(weight) {
518
- let called = false;
519
- return () => {
520
- if (called)
521
- return;
522
- called = true;
523
- this.release(weight);
524
- };
525
- }
526
- _drainUnlockWaiters() {
527
- if (this._queue.length === 0) {
528
- for (let weight = this._value; weight > 0; weight--) {
529
- const waiters = this._weightedWaiters[weight - 1];
530
- if (!waiters)
531
- continue;
532
- waiters.forEach((waiter) => waiter.resolve());
533
- this._weightedWaiters[weight - 1] = [];
534
- }
535
- } else {
536
- const queuedPriority = this._queue[0].priority;
537
- for (let weight = this._value; weight > 0; weight--) {
538
- const waiters = this._weightedWaiters[weight - 1];
539
- if (!waiters)
540
- continue;
541
- const i = waiters.findIndex((waiter) => waiter.priority <= queuedPriority);
542
- (i === -1 ? waiters : waiters.splice(0, i)).forEach((waiter) => waiter.resolve());
543
- }
544
- }
545
- }
546
- _couldLockImmediately(weight, priority) {
547
- return (this._queue.length === 0 || this._queue[0].priority < priority) && weight <= this._value;
548
- }
549
- };
550
- function insertSorted(a, v) {
551
- const i = findIndexFromEnd(a, (other) => v.priority <= other.priority);
552
- a.splice(i + 1, 0, v);
553
- }
554
- function findIndexFromEnd(a, predicate) {
555
- for (let i = a.length - 1; i >= 0; i--) {
556
- if (predicate(a[i])) {
557
- return i;
558
- }
559
- }
560
- return -1;
561
- }
562
- var __awaiter$1 = function(thisArg, _arguments, P, generator) {
563
- function adopt(value) {
564
- return value instanceof P ? value : new P(function(resolve) {
565
- resolve(value);
566
- });
567
- }
568
- return new (P || (P = Promise))(function(resolve, reject) {
569
- function fulfilled(value) {
570
- try {
571
- step(generator.next(value));
572
- } catch (e) {
573
- reject(e);
574
- }
575
- }
576
- function rejected(value) {
577
- try {
578
- step(generator["throw"](value));
579
- } catch (e) {
580
- reject(e);
581
- }
582
- }
583
- function step(result) {
584
- result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected);
585
- }
586
- step((generator = generator.apply(thisArg, _arguments || [])).next());
587
- });
588
- };
589
- var Mutex = class {
590
- constructor(cancelError) {
591
- this._semaphore = new Semaphore(1, cancelError);
592
- }
593
- acquire() {
594
- return __awaiter$1(this, arguments, void 0, function* (priority = 0) {
595
- const [, releaser] = yield this._semaphore.acquire(1, priority);
596
- return releaser;
597
- });
598
- }
599
- runExclusive(callback, priority = 0) {
600
- return this._semaphore.runExclusive(() => callback(), 1, priority);
601
- }
602
- isLocked() {
603
- return this._semaphore.isLocked();
604
- }
605
- waitForUnlock(priority = 0) {
606
- return this._semaphore.waitForUnlock(1, priority);
607
- }
608
- release() {
609
- if (this._semaphore.isLocked())
610
- this._semaphore.release();
611
- }
612
- cancel() {
613
- return this._semaphore.cancel();
614
- }
615
- };
616
-
617
- // src/commands/eval/jsonl-writer.ts
618
- var JsonlWriter = class _JsonlWriter {
619
- stream;
620
- mutex = new Mutex();
621
- closed = false;
622
- constructor(stream) {
623
- this.stream = stream;
624
- }
625
- static async open(filePath) {
626
- await mkdir2(path5.dirname(filePath), { recursive: true });
627
- const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
628
- return new _JsonlWriter(stream);
629
- }
630
- async append(record) {
631
- await this.mutex.runExclusive(async () => {
632
- if (this.closed) {
633
- throw new Error("Cannot write to closed JSONL writer");
634
- }
635
- const snakeCaseRecord = toSnakeCaseDeep(record);
636
- const line = `${JSON.stringify(snakeCaseRecord)}
637
- `;
638
- if (!this.stream.write(line)) {
639
- await new Promise((resolve, reject) => {
640
- this.stream.once("drain", resolve);
641
- this.stream.once("error", reject);
642
- });
643
- }
644
- });
645
- }
646
- async close() {
647
- if (this.closed) {
648
- return;
649
- }
650
- this.closed = true;
651
- this.stream.end();
652
- await finished(this.stream);
653
- }
654
- };
655
-
656
- // src/commands/eval/junit-writer.ts
657
- import { mkdir as mkdir3, writeFile as writeFile2 } from "node:fs/promises";
658
- import path6 from "node:path";
659
- function escapeXml(str) {
660
- return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
661
- }
662
- var JunitWriter = class _JunitWriter {
663
- filePath;
664
- results = [];
665
- closed = false;
666
- constructor(filePath) {
667
- this.filePath = filePath;
668
- }
669
- static async open(filePath) {
670
- await mkdir3(path6.dirname(filePath), { recursive: true });
671
- return new _JunitWriter(filePath);
672
- }
673
- async append(result) {
674
- if (this.closed) {
675
- throw new Error("Cannot write to closed JUnit writer");
676
- }
677
- this.results.push(result);
678
- }
679
- async close() {
680
- if (this.closed) {
681
- return;
682
- }
683
- this.closed = true;
684
- const grouped = /* @__PURE__ */ new Map();
685
- for (const result of this.results) {
686
- const suite = result.dataset ?? "default";
687
- const existing = grouped.get(suite);
688
- if (existing) {
689
- existing.push(result);
690
- } else {
691
- grouped.set(suite, [result]);
692
- }
693
- }
694
- const suiteXmls = [];
695
- for (const [suiteName, results] of grouped) {
696
- const failures = results.filter((r) => r.score < 0.5).length;
697
- const errors = results.filter((r) => r.error !== void 0).length;
698
- const testCases = results.map((r) => {
699
- const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
700
- let inner = "";
701
- if (r.error) {
702
- inner = `
703
- <error message="${escapeXml(r.error)}">${escapeXml(r.error)}</error>
704
- `;
705
- } else if (r.score < 0.5) {
706
- const message = `score=${r.score.toFixed(3)}`;
707
- const detail = [
708
- `Score: ${r.score.toFixed(3)}`,
709
- r.reasoning ? `Reasoning: ${r.reasoning}` : "",
710
- r.misses.length > 0 ? `Misses: ${r.misses.join(", ")}` : ""
711
- ].filter(Boolean).join("\n");
712
- inner = `
713
- <failure message="${escapeXml(message)}">${escapeXml(detail)}</failure>
714
- `;
715
- }
716
- return ` <testcase name="${escapeXml(r.testId)}" classname="${escapeXml(suiteName)}" time="${time}">${inner}</testcase>`;
717
- });
718
- suiteXmls.push(
719
- ` <testsuite name="${escapeXml(suiteName)}" tests="${results.length}" failures="${failures}" errors="${errors}">
720
- ${testCases.join("\n")}
721
- </testsuite>`
722
- );
723
- }
724
- const totalTests = this.results.length;
725
- const totalFailures = this.results.filter((r) => r.score < 0.5).length;
726
- const totalErrors = this.results.filter((r) => r.error !== void 0).length;
727
- const xml = `<?xml version="1.0" encoding="UTF-8"?>
728
- <testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">
729
- ${suiteXmls.join("\n")}
730
- </testsuites>
731
- `;
732
- await writeFile2(this.filePath, xml, "utf8");
733
- }
734
- };
735
-
736
- // src/commands/eval/yaml-writer.ts
737
- import { createWriteStream as createWriteStream2 } from "node:fs";
738
- import { mkdir as mkdir4 } from "node:fs/promises";
739
- import path7 from "node:path";
740
- import { finished as finished2 } from "node:stream/promises";
741
- import { stringify as stringifyYaml } from "yaml";
742
- var YamlWriter = class _YamlWriter {
743
- stream;
744
- mutex = new Mutex();
745
- closed = false;
746
- isFirst = true;
747
- constructor(stream) {
748
- this.stream = stream;
749
- }
750
- static async open(filePath) {
751
- await mkdir4(path7.dirname(filePath), { recursive: true });
752
- const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
753
- return new _YamlWriter(stream);
754
- }
755
- async append(record) {
756
- await this.mutex.runExclusive(async () => {
757
- if (this.closed) {
758
- throw new Error("Cannot write to closed YAML writer");
759
- }
760
- const snakeCaseRecord = toSnakeCaseDeep(record);
761
- const yamlDoc = stringifyYaml(snakeCaseRecord, {
762
- indent: 2,
763
- lineWidth: 0
764
- // Disable line wrapping
765
- // Let YAML library choose appropriate string style based on content
766
- // (will use block literal for multiline strings with actual newlines)
767
- });
768
- const normalizedYaml = normalizeLineEndings(yamlDoc);
769
- const separator = this.isFirst ? "---\n" : "\n---\n";
770
- this.isFirst = false;
771
- const content = `${separator}${normalizedYaml}`;
772
- if (!this.stream.write(content)) {
773
- await new Promise((resolve, reject) => {
774
- this.stream.once("drain", resolve);
775
- this.stream.once("error", reject);
776
- });
777
- }
778
- });
779
- }
780
- async close() {
781
- if (this.closed) {
782
- return;
783
- }
784
- this.closed = true;
785
- this.stream.end();
786
- await finished2(this.stream);
787
- }
788
- };
789
-
790
- // src/commands/eval/output-writer.ts
791
- async function createOutputWriter(filePath, format) {
792
- switch (format) {
793
- case "jsonl":
794
- return JsonlWriter.open(filePath);
795
- case "yaml":
796
- return YamlWriter.open(filePath);
797
- default: {
798
- const exhaustiveCheck = format;
799
- throw new Error(`Unsupported output format: ${exhaustiveCheck}`);
800
- }
801
- }
802
- }
803
- function getDefaultExtension(format) {
804
- switch (format) {
805
- case "jsonl":
806
- return ".jsonl";
807
- case "yaml":
808
- return ".yaml";
809
- default: {
810
- const exhaustiveCheck = format;
811
- throw new Error(`Unsupported output format: ${exhaustiveCheck}`);
812
- }
813
- }
814
- }
815
- var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml"]);
816
- function createWriterFromPath(filePath) {
817
- const ext = path8.extname(filePath).toLowerCase();
818
- switch (ext) {
819
- case ".jsonl":
820
- return JsonlWriter.open(filePath);
821
- case ".json":
822
- return JsonWriter.open(filePath);
823
- case ".xml":
824
- return JunitWriter.open(filePath);
825
- case ".yaml":
826
- case ".yml":
827
- return YamlWriter.open(filePath);
828
- default:
829
- throw new Error(
830
- `Unsupported output file extension "${ext}". Supported: ${[...SUPPORTED_EXTENSIONS].join(", ")}`
831
- );
832
- }
833
- }
834
- async function createMultiWriter(filePaths) {
835
- const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp)));
836
- return {
837
- async append(result) {
838
- await Promise.all(writers.map((w) => w.append(result)));
839
- },
840
- async close() {
841
- await Promise.all(writers.map((w) => w.close()));
842
- }
843
- };
844
- }
845
-
846
- // src/commands/eval/progress-display.ts
847
/**
 * Line-oriented progress reporter: every state change is printed with
 * console.log as a "<completed>/<total>" prefixed line.
 */
var ProgressDisplay = class {
  workers = /* @__PURE__ */ new Map();
  totalTests = 0;
  completedTests = 0;
  logPaths = [];
  logPathSet = /* @__PURE__ */ new Set();
  hasPrintedLogHeader = false;
  started = false;
  finished = false;
  verbose;

  /**
   * @param {number} _maxWorkers - Accepted but not used by this implementation.
   * @param {{verbose?: boolean}} [options] - `verbose` also prints "pending" lines.
   */
  constructor(_maxWorkers, options) {
    this.verbose = options?.verbose ?? false;
  }

  /** Always false: this display never redraws, it only appends lines. */
  isInteractiveMode() {
    return false;
  }

  /** Marks the display as started and not finished. */
  start() {
    this.started = true;
    this.finished = false;
  }

  /** Sets the denominator shown in the "<completed>/<total>" prefix. */
  setTotalTests(count) {
    this.totalTests = count;
  }

  /**
   * Records the latest state for a worker and prints a line when the status
   * warrants one. "completed" and "failed" each bump the completed counter.
   */
  updateWorker(progress) {
    const { workerId, status, testId, targetLabel } = progress;
    const earlier = this.workers.get(workerId);
    this.workers.set(workerId, progress);
    if (status === "completed" || status === "failed") {
      this.completedTests++;
    }
    const prefix = `${this.completedTests}/${this.totalTests}`;
    const suffix = targetLabel ? ` | ${targetLabel}` : "";
    if (status === "pending") {
      // Pending lines are verbose-only and printed once per worker.
      if (this.verbose && !earlier) {
        console.log(`${prefix} \u23F3 ${testId}${suffix}`);
      }
    } else if (status === "running") {
      // Only the first transition into "running" is announced.
      if (!earlier || earlier.status === "pending") {
        console.log(`${prefix} \u{1F504} ${testId}${suffix}`);
      }
    } else if (status === "completed") {
      console.log(`${prefix} \u2705 ${testId}${suffix}`);
    } else if (status === "failed") {
      const detail = progress.error ? `: ${progress.error}` : "";
      console.log(`${prefix} \u274C ${testId}${suffix}${detail}`);
    }
  }

  /**
   * Prints log file paths that have not been printed before, numbered
   * continuously across calls. The header is printed once, labelled after the
   * provider of the first call that contributes a new path.
   */
  addLogPaths(paths, provider) {
    const fresh = [];
    for (const candidate of paths) {
      if (!this.logPathSet.has(candidate)) {
        this.logPathSet.add(candidate);
        fresh.push(candidate);
      }
    }
    if (fresh.length === 0) {
      return;
    }
    const firstNumber = this.logPaths.length + 1;
    this.logPaths.push(...fresh);
    if (!this.hasPrintedLogHeader) {
      let label = "Codex CLI";
      if (provider === "pi") {
        label = "Pi Coding Agent";
      } else if (provider === "copilot") {
        label = "Copilot CLI";
      }
      console.log("");
      console.log(`${label} logs:`);
      this.hasPrintedLogHeader = true;
    }
    fresh.forEach((logPath, offset) => {
      console.log(`${firstNumber + offset}. ${logPath}`);
    });
  }

  /** Marks the display as finished and prints a trailing blank line. */
  finish() {
    this.finished = true;
    console.log("");
  }

  /** No-op: there is nothing to erase in line-oriented output. */
  clear() {
  }
};
930
-
931
- // src/commands/eval/retry-errors.ts
932
- import { createReadStream } from "node:fs";
933
- import { createInterface } from "node:readline";
934
/**
 * Reads a JSONL results file and collects the test ids of records whose
 * executionStatus is "execution_error".
 *
 * Blank lines and lines that cannot be processed as JSON records are skipped.
 *
 * @param {string} jsonlPath - Path of the JSONL file to scan.
 * @returns {Promise<Array>} Distinct test ids in order of first appearance.
 */
async function loadErrorTestIds(jsonlPath) {
  const uniqueIds = /* @__PURE__ */ new Set();
  const reader = createInterface({
    input: createReadStream(jsonlPath),
    crlfDelay: Number.POSITIVE_INFINITY
  });
  for await (const rawLine of reader) {
    const text = rawLine.trim();
    if (text.length === 0) {
      continue;
    }
    try {
      const record = JSON.parse(text);
      const isExecutionError = record.executionStatus === "execution_error";
      if (isExecutionError && record.testId) {
        uniqueIds.add(record.testId);
      }
    } catch {
      // Best-effort scan: a malformed line must not abort the whole read.
    }
  }
  return Array.from(uniqueIds);
}
953
/**
 * Reads a JSONL results file and returns every record that has a testId and a
 * score and whose executionStatus is not "execution_error".
 *
 * Blank lines and lines that cannot be processed as JSON records are skipped.
 *
 * @param {string} jsonlPath - Path of the JSONL file to scan.
 * @returns {Promise<object[]>} The kept records, in file order.
 */
async function loadNonErrorResults(jsonlPath) {
  const kept = [];
  const reader = createInterface({
    input: createReadStream(jsonlPath),
    crlfDelay: Number.POSITIVE_INFINITY
  });
  for await (const rawLine of reader) {
    const text = rawLine.trim();
    if (text.length === 0) {
      continue;
    }
    try {
      const record = JSON.parse(text);
      const isScored = Boolean(record.testId) && record.score !== void 0;
      if (isScored && record.executionStatus !== "execution_error") {
        kept.push(record);
      }
    } catch {
      // Best-effort scan: a malformed line must not abort the whole read.
    }
  }
  return kept;
}
973
-
974
- // src/commands/eval/statistics.ts
975
// Histogram bin edges: 0, 0.2, 0.4, 0.6, 0.8, 1 (five equal-width bins).
var HISTOGRAM_BREAKPOINTS = Array.from({ length: 6 }, (_, step) => step / 5);
976
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values
 * @returns {number} The mean, or 0 for an empty list.
 */
function computeMean(values) {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / count;
}
983
/**
 * Median of a list of numbers; the input array is not modified.
 *
 * @param {number[]} values
 * @returns {number} The middle value (or the mean of the two middle values
 *   for an even count), or 0 for an empty list.
 */
function computeMedian(values) {
  if (values.length === 0) {
    return 0;
  }
  const ordered = Array.from(values).sort((left, right) => left - right);
  const upperMiddle = Math.trunc(ordered.length / 2);
  const hasSingleMiddle = ordered.length % 2 !== 0;
  return hasSingleMiddle
    ? ordered[upperMiddle]
    : (ordered[upperMiddle - 1] + ordered[upperMiddle]) / 2;
}
994
/**
 * Sample standard deviation (divides by n - 1).
 *
 * @param {number[]} values
 * @returns {number | undefined} The deviation, or undefined when fewer than
 *   two values are given.
 */
function computeStandardDeviation(values) {
  const count = values.length;
  if (count < 2) {
    return void 0;
  }
  const center = computeMean(values);
  let squaredDeviationSum = 0;
  values.forEach((value) => {
    squaredDeviationSum += (value - center) ** 2;
  });
  return Math.sqrt(squaredDeviationSum / (count - 1));
}
1002
/**
 * Counts values into the bins delimited by HISTOGRAM_BREAKPOINTS.
 *
 * Each value is counted in the first bin that accepts it. The last bin accepts
 * start <= value <= end; every other bin accepts start <= value < end + 1e-9.
 * Values accepted by no bin are not counted.
 *
 * @param {number[]} values
 * @returns {{range: [number, number], count: number}[]} One entry per bin.
 */
function buildHistogram(values) {
  const topEdge = HISTOGRAM_BREAKPOINTS[HISTOGRAM_BREAKPOINTS.length - 1];
  const bins = HISTOGRAM_BREAKPOINTS.slice(0, -1).map((lower, position) => ({
    range: [lower, HISTOGRAM_BREAKPOINTS[position + 1]],
    count: 0
  }));
  const accepts = (bin, value) => {
    const [lower, upper] = bin.range;
    if (value < lower || Number.isNaN(value - lower)) {
      return false;
    }
    return upper === topEdge ? value <= upper : value < upper + 1e-9;
  };
  for (const value of values) {
    const target = bins.find((bin) => accepts(bin, value));
    if (target !== void 0) {
      target.count += 1;
    }
  }
  return bins;
}
1023
/**
 * Aggregates evaluation results into summary statistics.
 *
 * Score statistics (mean, median, min, max, standard deviation, histogram,
 * top/bottom results) are computed only over results whose executionStatus is
 * not "execution_error". Failure stage/reason tallies are computed only over
 * the execution errors.
 *
 * @param {object[]} results - Evaluation results; reads testId, score, error,
 *   executionStatus, failureStage and failureReasonCode.
 * @returns {object} The summary; all-zero/empty when `results` is empty.
 */
function calculateEvaluationSummary(results) {
  if (results.length === 0) {
    return {
      total: 0,
      mean: 0,
      median: 0,
      min: 0,
      max: 0,
      standardDeviation: void 0,
      histogram: buildHistogram([]),
      topResults: [],
      bottomResults: [],
      errorCount: 0,
      errors: [],
      executionErrorCount: 0,
      qualityFailureCount: 0,
      passedCount: 0,
      byFailureStage: {},
      byFailureReason: {}
    };
  }

  // Single pass: partition the results and collect the per-status counters.
  const errors = [];
  const executionErrors = [];
  const qualityResults = [];
  let qualityFailureCount = 0;
  let passedCount = 0;
  for (const entry of results) {
    if (entry.error !== void 0) {
      errors.push({ testId: entry.testId, error: entry.error });
    }
    if (entry.executionStatus === "execution_error") {
      executionErrors.push(entry);
    } else {
      qualityResults.push(entry);
    }
    if (entry.executionStatus === "quality_failure") {
      qualityFailureCount += 1;
    } else if (entry.executionStatus === "ok") {
      passedCount += 1;
    }
  }

  const qualityScores = qualityResults.map((entry) => entry.score);
  const hasScores = qualityScores.length > 0;
  const ranked = [...qualityResults].sort((a, b) => b.score - a.score);
  const shown = Math.min(3, ranked.length);

  const byFailureStage = {};
  const byFailureReason = {};
  const tally = (counter, key) => {
    if (key) {
      counter[key] = (counter[key] ?? 0) + 1;
    }
  };
  for (const failure of executionErrors) {
    tally(byFailureStage, failure.failureStage);
    tally(byFailureReason, failure.failureReasonCode);
  }

  return {
    total: results.length,
    mean: computeMean(qualityScores),
    median: computeMedian(qualityScores),
    min: hasScores ? Math.min(...qualityScores) : 0,
    max: hasScores ? Math.max(...qualityScores) : 0,
    standardDeviation: computeStandardDeviation(qualityScores),
    histogram: buildHistogram(qualityScores),
    topResults: ranked.slice(0, shown),
    bottomResults: ranked.slice(-shown),
    errorCount: errors.length,
    errors,
    executionErrorCount: executionErrors.length,
    qualityFailureCount,
    passedCount,
    byFailureStage,
    byFailureReason
  };
}
1091
/**
 * Renders a score with exactly three decimal places.
 *
 * @param {number} value
 * @returns {string}
 */
function formatScore(value) {
  const decimals = 3;
  return value.toFixed(decimals);
}
1094
/**
 * Renders an evaluation summary as a multi-line, human-readable report.
 *
 * Sections, in order: execution errors (only when errorCount > 0), headline
 * counts and score statistics, score distribution, top and lowest performing
 * tests, then execution-error tallies by stage and by reason when present.
 *
 * @param {object} summary - The summary object to render.
 * @returns {string} The report, or a short notice when `summary.total` is 0.
 */
function formatEvaluationSummary(summary) {
  if (summary.total === 0) {
    return "\nNo results to summarize";
  }
  const out = [];
  const banner = (title) => {
    out.push("\n==================================================");
    out.push(title);
    out.push("==================================================");
  };
  const rankedList = (heading, entries) => {
    out.push(heading);
    entries.forEach((entry, position) => {
      out.push(` ${position + 1}. ${entry.testId}: ${formatScore(entry.score)}`);
    });
  };
  const tallyList = (heading, counter) => {
    const pairs = Object.entries(counter);
    if (pairs.length === 0) {
      return;
    }
    out.push(heading);
    for (const [key, count] of pairs) {
      out.push(` ${key}: ${count}`);
    }
  };

  if (summary.errorCount > 0) {
    banner("EXECUTION ERRORS");
    for (const failure of summary.errors) {
      out.push(`\n\u274C ${failure.testId}`);
      out.push(` ${failure.error}`);
    }
    out.push("");
  }

  banner("EVALUATION SUMMARY");
  out.push(`Total tests: ${summary.total}`);
  out.push(`Passed: ${summary.passedCount}`);
  if (summary.qualityFailureCount > 0) {
    out.push(`Quality failures: ${summary.qualityFailureCount}`);
  }
  const excluded = summary.executionErrorCount;
  if (excluded > 0) {
    out.push(`Execution errors: ${excluded}`);
    // Spell out how many tests the mean actually covers.
    const qualityCount = summary.total - excluded;
    out.push(
      `Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${excluded} execution errors excluded)`
    );
  } else {
    out.push(`Mean score: ${formatScore(summary.mean)}`);
  }
  out.push(`Median score: ${formatScore(summary.median)}`);
  out.push(`Min score: ${formatScore(summary.min)}`);
  out.push(`Max score: ${formatScore(summary.max)}`);
  if (typeof summary.standardDeviation === "number") {
    out.push(`Std deviation: ${formatScore(summary.standardDeviation)}`);
  }

  out.push("\nScore distribution:");
  for (const { range, count } of summary.histogram) {
    out.push(` ${range[0].toFixed(1)}-${range[1].toFixed(1)}: ${count}`);
  }

  rankedList("\nTop performing tests:", summary.topResults);
  rankedList("\nLowest performing tests:", summary.bottomResults);
  tallyList("\nExecution errors by stage:", summary.byFailureStage);
  tallyList("\nExecution errors by reason:", summary.byFailureReason);
  return out.join("\n");
}
1164
/**
 * Renders a tests-by-targets score table with a per-target average row.
 *
 * @param {object[]} results - Results; reads testId, target and score.
 * @returns {string} The table, or "" when fewer than two distinct targets exist.
 */
function formatMatrixSummary(results) {
  const targets = [...new Set(results.map((entry) => entry.target))].sort();
  if (targets.length < 2) {
    return "";
  }
  const testIds = [...new Set(results.map((entry) => entry.testId))].sort();

  // testId -> (target -> score); a later result for the same cell overwrites an earlier one.
  const scoreByTest = /* @__PURE__ */ new Map();
  for (const entry of results) {
    let row = scoreByTest.get(entry.testId);
    if (row === void 0) {
      row = /* @__PURE__ */ new Map();
      scoreByTest.set(entry.testId, row);
    }
    row.set(entry.target, entry.score);
  }

  const testIdColWidth = Math.max(7, ...testIds.map((id) => id.length));
  const targetColWidth = Math.max(7, ...targets.map((name) => name.length));
  const tableRow = (label, cells) =>
    `${label.padEnd(testIdColWidth)} ${cells.map((cell) => cell.padEnd(targetColWidth)).join(" ")}`;

  const header = tableRow("Test", targets);
  const separator = "-".repeat(header.length);
  const out = [
    "\n==================================================",
    "MATRIX RESULTS (tests \xD7 targets)",
    "==================================================",
    header,
    separator
  ];
  for (const testId of testIds) {
    const row = scoreByTest.get(testId);
    const cells = targets.map((target) => {
      const score = row?.get(target);
      return score === void 0 ? "-" : formatScore(score);
    });
    out.push(tableRow(testId, cells));
  }
  out.push(separator);

  const averages = targets.map((target) => {
    const scores = results.filter((entry) => entry.target === target).map((entry) => entry.score);
    const average = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
    return formatScore(average);
  });
  out.push(tableRow("Average", averages));
  return out.join("\n");
}
1208
-
1209
- // ../../packages/core/dist/evaluation/validation/index.js
1210
- import { readFile } from "node:fs/promises";
1211
- import path9 from "node:path";
1212
- import { parse } from "yaml";
1213
- import { readFile as readFile2 } from "node:fs/promises";
1214
- import path22 from "node:path";
1215
- import { parse as parse2 } from "yaml";
1216
- import { readFile as readFile3 } from "node:fs/promises";
1217
- import path32 from "node:path";
1218
- import { parse as parse3 } from "yaml";
1219
- import { readFile as readFile4 } from "node:fs/promises";
1220
- import { parse as parse4 } from "yaml";
1221
- import { readFile as readFile5 } from "node:fs/promises";
1222
- import path42 from "node:path";
1223
- import { parse as parse5 } from "yaml";
1224
// `$schema` values that identify each kind of agentv YAML file.
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
var SCHEMA_CONFIG_V2 = "agentv-config-v2";

/**
 * Determines whether a YAML file is an eval, targets, or config file.
 *
 * The file's top-level `$schema` string decides when it matches a known
 * schema id. In every other case (unreadable file, parse failure, non-object
 * document, missing or unknown `$schema`) the type is inferred from the path.
 *
 * @param {string} filePath - Path of the YAML file to inspect.
 * @returns {Promise<"eval" | "targets" | "config">}
 */
async function detectFileType(filePath) {
  let document;
  try {
    document = parse(await readFile(filePath, "utf8"));
  } catch {
    return inferFileTypeFromPath(filePath);
  }
  const schema = typeof document === "object" && document !== null ? document.$schema : void 0;
  if (schema === SCHEMA_EVAL_V2) {
    return "eval";
  }
  if (schema === SCHEMA_TARGETS_V2) {
    return "targets";
  }
  if (schema === SCHEMA_CONFIG_V2) {
    return "config";
  }
  return inferFileTypeFromPath(filePath);
}
1253
/**
 * Infers the file type from its location: `config.yaml`/`config.yml` and
 * `targets.yaml`/`targets.yml` anywhere below a `.agentv` directory are
 * "config" and "targets"; everything else is "eval".
 *
 * @param {string} filePath - Absolute or relative path, with either separator style.
 * @returns {"eval" | "targets" | "config"}
 */
function inferFileTypeFromPath(filePath) {
  const segments = path9.normalize(filePath).replace(/\\/g, "/").split("/");
  const basename = path9.basename(filePath);
  // Compare whole directory segments rather than searching for "/.agentv/":
  // a relative path such as ".agentv/targets.yaml" (or "./.agentv/...", which
  // normalizes to the same thing) has no slash in front of ".agentv".
  const insideAgentvDir = segments.slice(0, -1).includes(".agentv");
  if (insideAgentvDir) {
    if (basename === "config.yaml" || basename === "config.yml") {
      return "config";
    }
    if (basename === "targets.yaml" || basename === "targets.yml") {
      return "targets";
    }
  }
  return "eval";
}
1266
// Assertion types whose `value` must be a single string.
var ASSERTION_TYPES_WITH_STRING_VALUE = /* @__PURE__ */ new Set(
  ["contains", "icontains", "starts-with", "ends-with", "equals", "regex"]
);
// Assertion types whose `value` must be a non-empty array.
var ASSERTION_TYPES_WITH_ARRAY_VALUE = /* @__PURE__ */ new Set(
  ["contains-any", "contains-all", "icontains-any", "icontains-all"]
);
// File extensions accepted when `tests` is given as a path.
var VALID_TEST_FILE_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".jsonl"]);
// Lowercase letters, digits and hyphens only.
var NAME_PATTERN = /^[a-z0-9-]+$/;
1282
/**
 * True for non-null, non-array values whose typeof is "object".
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isObject(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
1285
/**
 * Validates one entry of the `tests` array and appends any findings to `errors`.
 *
 * Checks: the entry is an object; `id` is a non-empty string; `criteria` (or
 * the deprecated `expected_outcome`) is a non-empty string when present;
 * `input` exists and is a string or message array; `expected_output` is a
 * string, object or array when present; `assert` is delegated to
 * validateAssertArray when present.
 */
function validateEvalCaseEntry(evalCase, location, absolutePath, errors) {
  if (!isObject(evalCase)) {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      location,
      message: "Eval case must be an object"
    });
    return;
  }
  const id = evalCase.id;
  if (typeof id !== "string" || id.trim().length === 0) {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      location: `${location}.id`,
      message: "Missing or invalid 'id' field (must be a non-empty string)"
    });
  }
  let criteria = evalCase.criteria;
  if (criteria === void 0 && "expected_outcome" in evalCase) {
    criteria = evalCase.expected_outcome;
    errors.push({
      severity: "warning",
      filePath: absolutePath,
      location: `${location}.expected_outcome`,
      message: "'expected_outcome' is deprecated. Use 'criteria' instead."
    });
  }
  if (criteria !== void 0 && (typeof criteria !== "string" || criteria.trim().length === 0)) {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      location: `${location}.criteria`,
      message: "Invalid 'criteria' field (must be a non-empty string if provided)"
    });
  }
  const inputField = evalCase.input;
  if (inputField === void 0) {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      location: `${location}.input`,
      message: "Missing 'input' field (must be a string or array of messages)"
    });
  } else if (Array.isArray(inputField)) {
    validateMessages(inputField, `${location}.input`, absolutePath, errors);
  } else if (typeof inputField !== "string") {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      location: `${location}.input`,
      message: "Invalid 'input' field (must be a string or array of messages)"
    });
  }
  const expectedOutputField = evalCase.expected_output;
  if (expectedOutputField !== void 0) {
    if (Array.isArray(expectedOutputField)) {
      // Only arrays that look like message lists (first item has a role) are
      // validated as messages; other arrays are accepted as-is.
      const first = expectedOutputField[0];
      if (expectedOutputField.length > 0 && isObject(first) && "role" in first) {
        validateMessages(
          expectedOutputField,
          `${location}.expected_output`,
          absolutePath,
          errors
        );
      }
    } else if (typeof expectedOutputField !== "string" && !isObject(expectedOutputField)) {
      errors.push({
        severity: "error",
        filePath: absolutePath,
        location: `${location}.expected_output`,
        message: "Invalid 'expected_output' field (must be a string, object, or array)"
      });
    }
  }
  const assertField = evalCase.assert;
  if (assertField !== void 0) {
    validateAssertArray(assertField, location, absolutePath, errors);
  }
}

/**
 * Validates an eval suite YAML file.
 *
 * Covers suite metadata, the suite-level `input`, the `tests` field (an inline
 * array, or a string path to an external file; the deprecated `eval_cases` /
 * `evalcases` aliases are accepted with a warning) and the `workspace` config.
 *
 * @param {string} filePath - Path of the eval file; resolved to an absolute path.
 * @returns {Promise<{valid: boolean, filePath: string, fileType: "eval", errors: object[]}>}
 *   `valid` is false when at least one finding has severity "error".
 */
async function validateEvalFile(filePath) {
  const errors = [];
  const absolutePath = path22.resolve(filePath);
  const buildResult = () => ({
    valid: !errors.some((e) => e.severity === "error"),
    filePath: absolutePath,
    fileType: "eval",
    errors
  });
  let parsed;
  try {
    const content = await readFile2(absolutePath, "utf8");
    parsed = parse2(content);
  } catch (error) {
    // A thrown value is not guaranteed to be an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    errors.push({
      severity: "error",
      filePath: absolutePath,
      message: `Failed to parse YAML: ${reason}`
    });
    return buildResult();
  }
  if (!isObject(parsed)) {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      message: "File must contain a YAML object"
    });
    return buildResult();
  }
  validateMetadata(parsed, absolutePath, errors);
  const suiteInput = parsed.input;
  if (suiteInput !== void 0) {
    if (Array.isArray(suiteInput)) {
      validateMessages(suiteInput, "input", absolutePath, errors);
    } else if (typeof suiteInput !== "string") {
      errors.push({
        severity: "error",
        filePath: absolutePath,
        location: "input",
        message: "Invalid suite-level 'input' field (must be a string or array of messages)"
      });
    }
  }
  let cases = parsed.tests;
  for (const legacyKey of ["eval_cases", "evalcases"]) {
    if (cases === void 0 && legacyKey in parsed) {
      cases = parsed[legacyKey];
      errors.push({
        severity: "warning",
        filePath: absolutePath,
        location: legacyKey,
        message: `'${legacyKey}' is deprecated. Use 'tests' instead.`
      });
    }
  }
  if (typeof cases === "string") {
    validateTestsStringPath(cases, absolutePath, errors);
  } else if (Array.isArray(cases)) {
    for (let i = 0; i < cases.length; i++) {
      validateEvalCaseEntry(cases[i], `tests[${i}]`, absolutePath, errors);
    }
  } else {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      location: "tests",
      message: "Missing or invalid 'tests' field (must be an array or a file path string)"
    });
    return buildResult();
  }
  // Runs for external test files too: the workspace block is suite-level and
  // does not depend on where the test cases are stored.
  if (isObject(parsed.workspace)) {
    validateWorkspaceRepoConfig(parsed.workspace, absolutePath, errors);
  }
  return buildResult();
}
1472
/**
 * Appends warnings for questionable workspace settings:
 *  - a repo whose clone.depth is smaller than checkout.ancestor + 1;
 *  - an after_each reset (other than "none") when no repos are configured;
 *  - any after_each reset combined with isolation "per_test".
 *
 * @param {object} workspace - The `workspace` object of an eval file.
 * @param {string} filePath - Path reported in each warning.
 * @param {object[]} errors - Findings list; mutated in place.
 */
function validateWorkspaceRepoConfig(workspace, filePath, errors) {
  const { repos, hooks, isolation } = workspace;
  const warn = (location, message) => {
    errors.push({ severity: "warning", filePath, location, message });
  };

  const repoList = Array.isArray(repos) ? repos : [];
  for (const repo of repoList) {
    if (!isObject(repo) || !isObject(repo.checkout) || !isObject(repo.clone)) {
      continue;
    }
    const ancestor = repo.checkout.ancestor;
    const depth = repo.clone.depth;
    const bothNumeric = typeof ancestor === "number" && typeof depth === "number";
    if (bothNumeric && depth < ancestor + 1) {
      warn(
        `workspace.repos[path=${repo.path}]`,
        `clone.depth (${depth}) may be insufficient for checkout.ancestor (${ancestor}). Recommend depth >= ${ancestor + 1}.`
      );
    }
  }

  const afterEachHook = isObject(hooks) ? hooks.after_each : void 0;
  const resetMode = isObject(afterEachHook) ? afterEachHook.reset : void 0;
  if (!resetMode) {
    return;
  }
  if (resetMode !== "none" && repoList.length === 0) {
    warn(
      "workspace.hooks.after_each",
      `hooks.after_each.reset '${resetMode}' has no effect without repos.`
    );
  }
  if (isolation === "per_test") {
    warn(
      "workspace.hooks.after_each",
      "hooks.after_each.reset is redundant with isolation: per_test (each test gets a fresh workspace)."
    );
  }
}
1515
/**
 * Validates an array of chat messages and appends findings to `errors`.
 *
 * Each message must be an object with a role of system, user or assistant and
 * a `content` that is a string or an array. Array items must be strings or
 * objects with a string `type`; items of type "text" must carry a string
 * `value`. Every string payload is also scanned for role markers.
 *
 * @param {unknown[]} messages - The message array to validate.
 * @param {string} location - Location prefix used in findings (e.g. "input").
 * @param {string} filePath - Path reported in each finding.
 * @param {object[]} errors - Findings list; mutated in place.
 */
function validateMessages(messages, location, filePath, errors) {
  const validRoles = ["system", "user", "assistant"];
  const fail = (where, message) => {
    errors.push({ severity: "error", filePath, location: where, message });
  };
  const scan = (text, where) => {
    validateContentForRoleMarkers(text, where, filePath, errors);
  };

  const checkContentItem = (item, where) => {
    if (typeof item === "string") {
      scan(item, where);
      return;
    }
    if (!isObject(item)) {
      fail(where, "Content array items must be strings or objects");
      return;
    }
    const type = item.type;
    if (typeof type !== "string") {
      fail(`${where}.type`, "Content object must have a 'type' field");
    }
    if (type !== "text") {
      return;
    }
    const value = item.value;
    if (typeof value === "string") {
      scan(value, `${where}.value`);
    } else {
      fail(`${where}.value`, "Content with type 'text' must have a 'value' field");
    }
  };

  for (const [index, message] of messages.entries()) {
    const msgLocation = `${location}[${index}]`;
    if (!isObject(message)) {
      fail(msgLocation, "Message must be an object");
      continue;
    }
    const role = message.role;
    if (!validRoles.includes(role)) {
      fail(
        `${msgLocation}.role`,
        `Invalid role '${role}'. Must be one of: ${validRoles.join(", ")}`
      );
    }
    const content = message.content;
    if (typeof content === "string") {
      scan(content, `${msgLocation}.content`);
    } else if (Array.isArray(content)) {
      for (const [position, item] of content.entries()) {
        checkContentItem(item, `${msgLocation}.content[${position}]`);
      }
    } else {
      fail(
        `${msgLocation}.content`,
        "Missing or invalid 'content' field (must be a string or array)"
      );
    }
  }
}
1589
/**
 * Appends warnings about suite metadata. Only runs when `name` is present:
 * a string name must match NAME_PATTERN, and a `description` is expected
 * alongside any name.
 *
 * @param {object} parsed - The parsed eval file.
 * @param {string} filePath - Path reported in each warning.
 * @param {object[]} errors - Findings list; mutated in place.
 */
function validateMetadata(parsed, filePath, errors) {
  const name = parsed.name;
  if (name === void 0) {
    return;
  }
  const warn = (message) => {
    errors.push({ severity: "warning", filePath, location: "name", message });
  };
  if (typeof name === "string" && !NAME_PATTERN.test(name)) {
    warn(
      `Invalid 'name' format '${name}'. Must match pattern /^[a-z0-9-]+$/ (lowercase alphanumeric with hyphens).`
    );
  }
  if (parsed.description === void 0) {
    warn("When 'name' is present, 'description' should also be provided.");
  }
}
1612
/**
 * Appends a warning when the external tests path does not end in one of the
 * extensions listed in VALID_TEST_FILE_EXTENSIONS.
 *
 * @param {string} testsPath - The `tests` value given as a file path.
 * @param {string} filePath - Path of the eval file, reported in the warning.
 * @param {object[]} errors - Findings list; mutated in place.
 */
function validateTestsStringPath(testsPath, filePath, errors) {
  const ext = path22.extname(testsPath);
  if (VALID_TEST_FILE_EXTENSIONS.has(ext)) {
    return;
  }
  const supported = Array.from(VALID_TEST_FILE_EXTENSIONS).join(", ");
  errors.push({
    severity: "warning",
    filePath,
    location: "tests",
    message: `Unsupported file extension '${ext}' for tests path '${testsPath}'. Supported extensions: ${supported}`
  });
}
1623
/**
 * Validates the `assert` field of an eval case and appends warnings to `errors`.
 *
 * Each item must be an object with a known string `type` (underscores are
 * accepted in place of hyphens). String-valued types need a string `value`
 * (and a compilable pattern for "regex"); array-valued types need a non-empty
 * array of strings. `required`, when present, is delegated to
 * validateRequiredField. An item that fails a check is not checked further.
 *
 * @param {unknown} assertField - The raw `assert` value.
 * @param {string} parentLocation - Location of the owning eval case.
 * @param {string} filePath - Path reported in each warning.
 * @param {object[]} errors - Findings list; mutated in place.
 */
function validateAssertArray(assertField, parentLocation, filePath, errors) {
  const warn = (location, message) => {
    errors.push({ severity: "warning", filePath, location, message });
  };
  if (!Array.isArray(assertField)) {
    warn(`${parentLocation}.assert`, "'assert' must be an array of assertion objects.");
    return;
  }
  for (let i = 0; i < assertField.length; i++) {
    const item = assertField[i];
    const location = `${parentLocation}.assert[${i}]`;
    if (!isObject(item)) {
      warn(location, "Assertion item must be an object with a type field.");
      continue;
    }
    const rawTypeValue = item.type;
    if (typeof rawTypeValue !== "string") {
      warn(`${location}.type`, "Assertion item is missing a 'type' field.");
      continue;
    }
    // snake_case spellings are normalized to the kebab-case evaluator names.
    const typeValue = rawTypeValue.replace(/_/g, "-");
    if (!isEvaluatorKind(typeValue)) {
      warn(`${location}.type`, `Unknown assertion type '${rawTypeValue}'.`);
      continue;
    }
    const value = item.value;
    if (ASSERTION_TYPES_WITH_STRING_VALUE.has(typeValue)) {
      if (typeof value !== "string") {
        warn(`${location}.value`, `Assertion type '${typeValue}' requires a 'value' field (string).`);
        continue;
      }
      if (typeValue === "regex") {
        try {
          new RegExp(value);
        } catch {
          warn(
            `${location}.value`,
            `Invalid regex pattern '${value}': not a valid regular expression.`
          );
        }
      }
    }
    if (ASSERTION_TYPES_WITH_ARRAY_VALUE.has(typeValue)) {
      // The contract stated in the message is a non-empty *string* array, so
      // the element types are checked as well as the container.
      const isStringArray = Array.isArray(value) && value.length > 0 && value.every((entry) => typeof entry === "string");
      if (!isStringArray) {
        warn(
          `${location}.value`,
          `Assertion type '${typeValue}' requires a 'value' field (non-empty string array).`
        );
        continue;
      }
    }
    const required = item.required;
    if (required !== void 0) {
      validateRequiredField(required, location, filePath, errors);
    }
  }
}
1707
/**
 * Validates an assertion's `required` value and appends a warning when it is
 * neither a boolean nor a number in the range (0, 1].
 *
 * @param {unknown} required - The raw `required` value.
 * @param {string} parentLocation - Location of the owning assertion.
 * @param {string} filePath - Path reported in the warning.
 * @param {object[]} errors - Findings list; mutated in place.
 */
function validateRequiredField(required, parentLocation, filePath, errors) {
  if (typeof required === "boolean") {
    return;
  }
  const location = `${parentLocation}.required`;
  if (typeof required === "number") {
    // Phrased as "not inside the range" so that NaN, which fails every
    // comparison, is reported instead of slipping through.
    const inRange = required > 0 && required <= 1;
    if (!inRange) {
      errors.push({
        severity: "warning",
        filePath,
        location,
        message: `Invalid 'required' value ${required}. When a number, it must be between 0 (exclusive) and 1 (inclusive).`
      });
    }
    return;
  }
  errors.push({
    severity: "warning",
    filePath,
    location,
    message: `Invalid 'required' value. Must be a boolean or a number between 0 (exclusive) and 1 (inclusive).`
  });
}
1729
/**
 * Appends one warning per role marker ("@[System]:", "@[User]:",
 * "@[Assistant]:", "@[Tool]:") found in the content, matched case-insensitively.
 *
 * @param {string} content - Text to scan.
 * @param {string} location - Location reported in each warning.
 * @param {string} filePath - Path reported in each warning.
 * @param {object[]} errors - Findings list; mutated in place.
 */
function validateContentForRoleMarkers(content, location, filePath, errors) {
  const haystack = content.toLowerCase();
  const found = ["@[System]:", "@[User]:", "@[Assistant]:", "@[Tool]:"].filter(
    (marker) => haystack.includes(marker.toLowerCase())
  );
  found.forEach((marker) => {
    errors.push({
      severity: "warning",
      filePath,
      location,
      message: `Content contains potential role marker '${marker}'. This may confuse agentic providers or cause prompt injection.`
    });
  });
}
1742
/**
 * True for non-null, non-array values whose typeof is "object".
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isObject2(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
1745
// Per-provider sets of setting keys, each listed in snake_case and camelCase
// where both spellings exist. NOTE(review): presumably the keys accepted in a
// target definition for that provider — the consumers are outside this section.

// Keys shared by every provider set below.
var COMMON_SETTINGS = /* @__PURE__ */ new Set(["provider_batching", "providerBatching"]);

// Retry tuning keys; included by the Azure, Anthropic and Gemini sets.
var RETRY_SETTINGS = /* @__PURE__ */ new Set([
  "max_retries", "maxRetries",
  "retry_initial_delay_ms", "retryInitialDelayMs",
  "retry_max_delay_ms", "retryMaxDelayMs",
  "retry_backoff_factor", "retryBackoffFactor",
  "retry_status_codes", "retryStatusCodes"
]);

var AZURE_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  ...RETRY_SETTINGS,
  "endpoint", "resource", "resourceName",
  "api_key", "apiKey",
  "deployment", "deploymentName",
  "model", "version", "api_version",
  "temperature",
  "max_output_tokens", "maxTokens"
]);

var ANTHROPIC_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  ...RETRY_SETTINGS,
  "api_key", "apiKey",
  "model", "deployment", "variant",
  "temperature",
  "max_output_tokens", "maxTokens",
  "thinking_budget", "thinkingBudget"
]);

var GEMINI_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  ...RETRY_SETTINGS,
  "api_key", "apiKey",
  "model", "deployment", "variant",
  "temperature",
  "max_output_tokens", "maxTokens"
]);

var CODEX_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "model",
  "executable", "command", "binary",
  "args", "arguments",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir", "log_directory", "logDirectory",
  "log_format", "logFormat", "log_output_format", "logOutputFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate"
]);

var COPILOT_SDK_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "cli_url", "cliUrl",
  "cli_path", "cliPath",
  "github_token", "githubToken",
  "model",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir",
  "log_format", "logFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate"
]);

var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "executable", "command", "binary",
  "args", "arguments",
  "model",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir",
  "log_format", "logFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate"
]);

var VSCODE_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "executable",
  "workspace_template", "workspaceTemplate",
  "wait",
  "dry_run", "dryRun",
  "subagent_root", "subagentRoot",
  "timeout_seconds", "timeoutSeconds"
]);

var MOCK_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "response",
  "delayMs", "delayMinMs", "delayMaxMs",
  // For testing tool-trajectory evaluator
  "trace"
]);
1888
- var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
1889
- ...COMMON_SETTINGS,
1890
- "model",
1891
- "cwd",
1892
- "timeout_seconds",
1893
- "timeoutSeconds",
1894
- "log_dir",
1895
- "logDir",
1896
- "log_directory",
1897
- "logDirectory",
1898
- "log_format",
1899
- "logFormat",
1900
- "log_output_format",
1901
- "logOutputFormat",
1902
- "system_prompt",
1903
- "systemPrompt",
1904
- "workspace_template",
1905
- "workspaceTemplate",
1906
- "max_turns",
1907
- "maxTurns",
1908
- "max_budget_usd",
1909
- "maxBudgetUsd"
1910
- ]);
1911
- function getKnownSettings(provider) {
1912
- const normalizedProvider = provider.toLowerCase();
1913
- switch (normalizedProvider) {
1914
- case "azure":
1915
- case "azure-openai":
1916
- return AZURE_SETTINGS;
1917
- case "anthropic":
1918
- return ANTHROPIC_SETTINGS;
1919
- case "gemini":
1920
- case "google":
1921
- case "google-gemini":
1922
- return GEMINI_SETTINGS;
1923
- case "codex":
1924
- case "codex-cli":
1925
- return CODEX_SETTINGS;
1926
- case "copilot-sdk":
1927
- case "copilot_sdk":
1928
- return COPILOT_SDK_SETTINGS;
1929
- case "copilot":
1930
- case "copilot-cli":
1931
- return COPILOT_CLI_SETTINGS;
1932
- case "claude":
1933
- case "claude-code":
1934
- case "claude-sdk":
1935
- return CLAUDE_SETTINGS;
1936
- case "vscode":
1937
- case "vscode-insiders":
1938
- return VSCODE_SETTINGS;
1939
- case "mock":
1940
- return MOCK_SETTINGS;
1941
- case "cli":
1942
- return null;
1943
- default:
1944
- return null;
1945
- }
1946
- }
1947
- function validateUnknownSettings(target, provider, absolutePath, location, errors) {
1948
- const removedTargetFields = /* @__PURE__ */ new Set(["workspace_template", "workspaceTemplate"]);
1949
- const knownSettings = getKnownSettings(provider);
1950
- if (!knownSettings) {
1951
- return;
1952
- }
1953
- const baseFields = /* @__PURE__ */ new Set(["name", "provider", "judge_target", "workers", "$schema", "targets"]);
1954
- for (const key of Object.keys(target)) {
1955
- if (removedTargetFields.has(key)) {
1956
- errors.push({
1957
- severity: "error",
1958
- filePath: absolutePath,
1959
- location: `${location}.${key}`,
1960
- message: "target-level workspace_template has been removed. Use eval-level workspace.template."
1961
- });
1962
- continue;
1963
- }
1964
- if (!baseFields.has(key) && !knownSettings.has(key)) {
1965
- errors.push({
1966
- severity: "warning",
1967
- filePath: absolutePath,
1968
- location: `${location}.${key}`,
1969
- message: `Unknown setting '${key}' for ${provider} provider. This property will be ignored.`
1970
- });
1971
- }
1972
- }
1973
- }
1974
- async function validateTargetsFile(filePath) {
1975
- const errors = [];
1976
- const absolutePath = path32.resolve(filePath);
1977
- let parsed;
1978
- try {
1979
- const content = await readFile3(absolutePath, "utf8");
1980
- parsed = parse3(content);
1981
- } catch (error) {
1982
- errors.push({
1983
- severity: "error",
1984
- filePath: absolutePath,
1985
- message: `Failed to parse YAML: ${error.message}`
1986
- });
1987
- return {
1988
- valid: false,
1989
- filePath: absolutePath,
1990
- fileType: "targets",
1991
- errors
1992
- };
1993
- }
1994
- function validateCliSettings(target, absolutePath2, location, errors2) {
1995
- const command = target.command;
1996
- if (typeof command !== "string" || command.trim().length === 0) {
1997
- errors2.push({
1998
- severity: "error",
1999
- filePath: absolutePath2,
2000
- location: `${location}.command`,
2001
- message: "CLI provider requires 'command' as a non-empty string"
2002
- });
2003
- } else {
2004
- recordUnknownPlaceholders(command, absolutePath2, `${location}.command`, errors2);
2005
- }
2006
- const healthcheck = target.healthcheck;
2007
- if (healthcheck !== void 0) {
2008
- validateCliHealthcheck(healthcheck, absolutePath2, `${location}.healthcheck`, errors2);
2009
- }
2010
- }
2011
- function validateCliHealthcheck(healthcheck, absolutePath2, location, errors2) {
2012
- if (!isObject2(healthcheck)) {
2013
- errors2.push({
2014
- severity: "error",
2015
- filePath: absolutePath2,
2016
- location,
2017
- message: "'healthcheck' must be an object when provided"
2018
- });
2019
- return;
2020
- }
2021
- const timeoutSeconds = healthcheck.timeout_seconds ?? healthcheck.timeoutSeconds;
2022
- if (timeoutSeconds !== void 0) {
2023
- const numericTimeout = Number(timeoutSeconds);
2024
- if (!Number.isFinite(numericTimeout) || numericTimeout <= 0) {
2025
- errors2.push({
2026
- severity: "error",
2027
- filePath: absolutePath2,
2028
- location: `${location}.timeoutSeconds`,
2029
- message: "healthcheck.timeoutSeconds must be a positive number when provided"
2030
- });
2031
- }
2032
- }
2033
- const hasUrl = typeof healthcheck.url === "string" && healthcheck.url.trim().length > 0;
2034
- const hasCommand = typeof healthcheck.command === "string" && healthcheck.command.trim().length > 0;
2035
- if (!hasUrl && !hasCommand) {
2036
- errors2.push({
2037
- severity: "error",
2038
- filePath: absolutePath2,
2039
- location,
2040
- message: "healthcheck must have either 'url' (HTTP) or 'command' (command)"
2041
- });
2042
- return;
2043
- }
2044
- if (hasUrl) {
2045
- return;
2046
- }
2047
- recordUnknownPlaceholders(
2048
- healthcheck.command,
2049
- absolutePath2,
2050
- `${location}.command`,
2051
- errors2
2052
- );
2053
- const cwd = healthcheck.cwd;
2054
- if (cwd !== void 0 && typeof cwd !== "string") {
2055
- errors2.push({
2056
- severity: "error",
2057
- filePath: absolutePath2,
2058
- location: `${location}.cwd`,
2059
- message: "healthcheck.cwd must be a string when provided"
2060
- });
2061
- }
2062
- }
2063
- function recordUnknownPlaceholders(template, absolutePath2, location, errors2) {
2064
- const placeholders = extractPlaceholders(template);
2065
- for (const placeholder of placeholders) {
2066
- if (!CLI_PLACEHOLDERS.has(placeholder)) {
2067
- errors2.push({
2068
- severity: "error",
2069
- filePath: absolutePath2,
2070
- location,
2071
- message: `Unknown CLI placeholder '{${placeholder}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
2072
- });
2073
- }
2074
- }
2075
- }
2076
- function extractPlaceholders(template) {
2077
- const matches = template.matchAll(/\{([A-Z_]+)\}/g);
2078
- const result = [];
2079
- for (const match of matches) {
2080
- const placeholder = match[1];
2081
- if (placeholder) {
2082
- result.push(placeholder);
2083
- }
2084
- }
2085
- return result;
2086
- }
2087
- if (!isObject2(parsed)) {
2088
- errors.push({
2089
- severity: "error",
2090
- filePath: absolutePath,
2091
- message: "File must contain a YAML object"
2092
- });
2093
- return {
2094
- valid: false,
2095
- filePath: absolutePath,
2096
- fileType: "targets",
2097
- errors
2098
- };
2099
- }
2100
- const targets = parsed.targets;
2101
- if (!Array.isArray(targets)) {
2102
- errors.push({
2103
- severity: "error",
2104
- filePath: absolutePath,
2105
- location: "targets",
2106
- message: "Missing or invalid 'targets' field (must be an array)"
2107
- });
2108
- return {
2109
- valid: errors.length === 0,
2110
- filePath: absolutePath,
2111
- fileType: "targets",
2112
- errors
2113
- };
2114
- }
2115
- const knownProviders = [...KNOWN_PROVIDERS, ...PROVIDER_ALIASES];
2116
- for (let i = 0; i < targets.length; i++) {
2117
- const target = targets[i];
2118
- const location = `targets[${i}]`;
2119
- if (!isObject2(target)) {
2120
- errors.push({
2121
- severity: "error",
2122
- filePath: absolutePath,
2123
- location,
2124
- message: "Target must be an object"
2125
- });
2126
- continue;
2127
- }
2128
- const name = target.name;
2129
- if (typeof name !== "string" || name.trim().length === 0) {
2130
- errors.push({
2131
- severity: "error",
2132
- filePath: absolutePath,
2133
- location: `${location}.name`,
2134
- message: "Missing or invalid 'name' field (must be a non-empty string)"
2135
- });
2136
- }
2137
- const provider = target.provider;
2138
- const providerValue = typeof provider === "string" ? provider.trim().toLowerCase() : void 0;
2139
- if (typeof provider !== "string" || provider.trim().length === 0) {
2140
- errors.push({
2141
- severity: "error",
2142
- filePath: absolutePath,
2143
- location: `${location}.provider`,
2144
- message: "Missing or invalid 'provider' field (must be a non-empty string)"
2145
- });
2146
- } else if (!knownProviders.includes(provider)) {
2147
- errors.push({
2148
- severity: "warning",
2149
- filePath: absolutePath,
2150
- location: `${location}.provider`,
2151
- message: `Unknown provider '${provider}'. Known providers: ${knownProviders.join(", ")}`
2152
- });
2153
- }
2154
- if (providerValue === "cli") {
2155
- validateCliSettings(target, absolutePath, location, errors);
2156
- }
2157
- if (typeof provider === "string") {
2158
- validateUnknownSettings(target, provider, absolutePath, location, errors);
2159
- }
2160
- const judgeTarget = target.judge_target;
2161
- if (judgeTarget !== void 0 && typeof judgeTarget !== "string") {
2162
- errors.push({
2163
- severity: "error",
2164
- filePath: absolutePath,
2165
- location: `${location}.judge_target`,
2166
- message: "Invalid 'judge_target' field (must be a string)"
2167
- });
2168
- }
2169
- }
2170
- return {
2171
- valid: errors.filter((e) => e.severity === "error").length === 0,
2172
- filePath: absolutePath,
2173
- fileType: "targets",
2174
- errors
2175
- };
2176
- }
2177
- async function validateConfigFile(filePath) {
2178
- const errors = [];
2179
- try {
2180
- const content = await readFile4(filePath, "utf8");
2181
- const parsed = parse4(content);
2182
- if (typeof parsed !== "object" || parsed === null) {
2183
- errors.push({
2184
- severity: "error",
2185
- filePath,
2186
- message: "Config file must contain a valid YAML object"
2187
- });
2188
- return { valid: false, filePath, fileType: "config", errors };
2189
- }
2190
- const config = parsed;
2191
- const guidelinePatterns = config.guideline_patterns;
2192
- if (guidelinePatterns !== void 0) {
2193
- if (!Array.isArray(guidelinePatterns)) {
2194
- errors.push({
2195
- severity: "error",
2196
- filePath,
2197
- location: "guideline_patterns",
2198
- message: "Field 'guideline_patterns' must be an array"
2199
- });
2200
- } else if (!guidelinePatterns.every((p) => typeof p === "string")) {
2201
- errors.push({
2202
- severity: "error",
2203
- filePath,
2204
- location: "guideline_patterns",
2205
- message: "All entries in 'guideline_patterns' must be strings"
2206
- });
2207
- } else if (guidelinePatterns.length === 0) {
2208
- errors.push({
2209
- severity: "warning",
2210
- filePath,
2211
- location: "guideline_patterns",
2212
- message: "Field 'guideline_patterns' is empty. Consider removing it or adding patterns."
2213
- });
2214
- }
2215
- }
2216
- const evalPatterns = config.eval_patterns;
2217
- if (evalPatterns !== void 0) {
2218
- if (!Array.isArray(evalPatterns)) {
2219
- errors.push({
2220
- severity: "error",
2221
- filePath,
2222
- location: "eval_patterns",
2223
- message: "Field 'eval_patterns' must be an array"
2224
- });
2225
- } else if (!evalPatterns.every((p) => typeof p === "string")) {
2226
- errors.push({
2227
- severity: "error",
2228
- filePath,
2229
- location: "eval_patterns",
2230
- message: "All entries in 'eval_patterns' must be strings"
2231
- });
2232
- } else if (evalPatterns.length === 0) {
2233
- errors.push({
2234
- severity: "warning",
2235
- filePath,
2236
- location: "eval_patterns",
2237
- message: "Field 'eval_patterns' is empty. Consider removing it or adding patterns."
2238
- });
2239
- }
2240
- }
2241
- const allowedFields = /* @__PURE__ */ new Set(["$schema", "guideline_patterns", "eval_patterns"]);
2242
- const unexpectedFields = Object.keys(config).filter((key) => !allowedFields.has(key));
2243
- if (unexpectedFields.length > 0) {
2244
- errors.push({
2245
- severity: "warning",
2246
- filePath,
2247
- message: `Unexpected fields: ${unexpectedFields.join(", ")}`
2248
- });
2249
- }
2250
- return {
2251
- valid: errors.filter((e) => e.severity === "error").length === 0,
2252
- filePath,
2253
- fileType: "config",
2254
- errors
2255
- };
2256
- } catch (error) {
2257
- errors.push({
2258
- severity: "error",
2259
- filePath,
2260
- message: `Failed to parse config file: ${error.message}`
2261
- });
2262
- return { valid: false, filePath, fileType: "config", errors };
2263
- }
2264
- }
2265
- function isObject3(value) {
2266
- return typeof value === "object" && value !== null && !Array.isArray(value);
2267
- }
2268
- async function validateFileReferences(evalFilePath) {
2269
- const errors = [];
2270
- const absolutePath = path42.resolve(evalFilePath);
2271
- const gitRoot = await findGitRoot(absolutePath);
2272
- if (!gitRoot) {
2273
- errors.push({
2274
- severity: "error",
2275
- filePath: absolutePath,
2276
- message: "Cannot validate file references: git repository root not found"
2277
- });
2278
- return errors;
2279
- }
2280
- const searchRoots = buildSearchRoots(absolutePath, gitRoot);
2281
- let parsed;
2282
- try {
2283
- const content = await readFile5(absolutePath, "utf8");
2284
- parsed = parse5(content);
2285
- } catch {
2286
- return errors;
2287
- }
2288
- if (!isObject3(parsed)) {
2289
- return errors;
2290
- }
2291
- let cases = parsed.tests;
2292
- if (cases === void 0 && "eval_cases" in parsed) {
2293
- cases = parsed.eval_cases;
2294
- }
2295
- if (cases === void 0 && "evalcases" in parsed) {
2296
- cases = parsed.evalcases;
2297
- }
2298
- if (!Array.isArray(cases)) {
2299
- return errors;
2300
- }
2301
- for (let i = 0; i < cases.length; i++) {
2302
- const evalCase = cases[i];
2303
- if (!isObject3(evalCase)) {
2304
- continue;
2305
- }
2306
- const inputField = evalCase.input;
2307
- if (Array.isArray(inputField)) {
2308
- await validateMessagesFileRefs(
2309
- inputField,
2310
- `tests[${i}].input`,
2311
- searchRoots,
2312
- absolutePath,
2313
- errors
2314
- );
2315
- }
2316
- const expectedOutputField = evalCase.expected_output;
2317
- if (Array.isArray(expectedOutputField)) {
2318
- await validateMessagesFileRefs(
2319
- expectedOutputField,
2320
- `tests[${i}].expected_output`,
2321
- searchRoots,
2322
- absolutePath,
2323
- errors
2324
- );
2325
- }
2326
- }
2327
- return errors;
2328
- }
2329
- async function validateMessagesFileRefs(messages, location, searchRoots, filePath, errors) {
2330
- for (let i = 0; i < messages.length; i++) {
2331
- const message = messages[i];
2332
- if (!isObject3(message)) {
2333
- continue;
2334
- }
2335
- const content = message.content;
2336
- if (typeof content === "string") {
2337
- continue;
2338
- }
2339
- if (!Array.isArray(content)) {
2340
- continue;
2341
- }
2342
- for (let j = 0; j < content.length; j++) {
2343
- const contentItem = content[j];
2344
- if (!isObject3(contentItem)) {
2345
- continue;
2346
- }
2347
- const type = contentItem.type;
2348
- if (type !== "file") {
2349
- continue;
2350
- }
2351
- const value = contentItem.value;
2352
- if (typeof value !== "string") {
2353
- errors.push({
2354
- severity: "error",
2355
- filePath,
2356
- location: `${location}[${i}].content[${j}].value`,
2357
- message: "File reference must have a 'value' field with the file path"
2358
- });
2359
- continue;
2360
- }
2361
- const { resolvedPath } = await resolveFileReference(value, searchRoots);
2362
- if (!resolvedPath) {
2363
- errors.push({
2364
- severity: "error",
2365
- filePath,
2366
- location: `${location}[${i}].content[${j}]`,
2367
- message: `Referenced file not found: ${value}`
2368
- });
2369
- } else {
2370
- try {
2371
- const fileContent = await readFile5(resolvedPath, "utf8");
2372
- if (fileContent.trim().length === 0) {
2373
- errors.push({
2374
- severity: "warning",
2375
- filePath,
2376
- location: `${location}[${i}].content[${j}]`,
2377
- message: `Referenced file is empty: ${value}`
2378
- });
2379
- }
2380
- } catch (error) {
2381
- errors.push({
2382
- severity: "error",
2383
- filePath,
2384
- location: `${location}[${i}].content[${j}]`,
2385
- message: `Cannot read referenced file: ${value} (${error.message})`
2386
- });
2387
- }
2388
- }
2389
- }
2390
- }
2391
- }
2392
-
2393
- // src/commands/eval/targets.ts
2394
- var ANSI_YELLOW2 = "\x1B[33m";
2395
- var ANSI_RED2 = "\x1B[31m";
2396
- var ANSI_RESET2 = "\x1B[0m";
2397
- function isTTY() {
2398
- return process.stdout.isTTY ?? false;
2399
- }
2400
- async function readTestSuiteTarget(testFilePath) {
2401
- const metadata = await readTestSuiteMetadata(testFilePath);
2402
- return metadata.target;
2403
- }
2404
- function pickTargetName(options) {
2405
- const cliName = options.cliTargetName?.trim();
2406
- if (cliName && cliName !== "default") {
2407
- return { name: cliName, source: "cli" };
2408
- }
2409
- const fileName = options.fileTargetName?.trim();
2410
- if (fileName && fileName.length > 0) {
2411
- return { name: fileName, source: "test-file" };
2412
- }
2413
- return { name: "default", source: "default" };
2414
- }
2415
- async function selectTarget(options) {
2416
- const {
2417
- testFilePath,
2418
- repoRoot,
2419
- cwd,
2420
- explicitTargetsPath,
2421
- cliTargetName,
2422
- dryRun,
2423
- dryRunDelay,
2424
- dryRunDelayMin,
2425
- dryRunDelayMax,
2426
- env
2427
- } = options;
2428
- const targetsFilePath = await discoverTargetsFile({
2429
- explicitPath: explicitTargetsPath,
2430
- testFilePath,
2431
- repoRoot,
2432
- cwd
2433
- });
2434
- const validationResult = await validateTargetsFile(targetsFilePath);
2435
- const warnings = validationResult.errors.filter((e) => e.severity === "warning");
2436
- const useColors = isTTY();
2437
- if (warnings.length > 0) {
2438
- console.warn(`
2439
- Warnings in ${targetsFilePath}:`);
2440
- for (const warning of warnings) {
2441
- const location = warning.location ? ` [${warning.location}]` : "";
2442
- const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
2443
- const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
2444
- console.warn(`${prefix}${location} ${message}`);
2445
- }
2446
- console.warn("");
2447
- }
2448
- const errors = validationResult.errors.filter((e) => e.severity === "error");
2449
- if (errors.length > 0) {
2450
- console.error(`
2451
- Errors in ${targetsFilePath}:`);
2452
- for (const error of errors) {
2453
- const location = error.location ? ` [${error.location}]` : "";
2454
- const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
2455
- const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
2456
- console.error(`${prefix}${location} ${message}`);
2457
- }
2458
- throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
2459
- }
2460
- const definitions = await readTargetDefinitions(targetsFilePath);
2461
- const fileTargetName = await readTestSuiteTarget(testFilePath);
2462
- const targetChoice = pickTargetName({ cliTargetName, fileTargetName });
2463
- const targetDefinition = definitions.find(
2464
- (definition) => definition.name === targetChoice.name
2465
- );
2466
- if (!targetDefinition) {
2467
- const available = listTargetNames(definitions).join(", ");
2468
- throw new Error(
2469
- `Target '${targetChoice.name}' not found in ${targetsFilePath}. Available targets: ${available}`
2470
- );
2471
- }
2472
- if (dryRun) {
2473
- const mockTarget = {
2474
- kind: "mock",
2475
- name: `${targetDefinition.name}-dry-run`,
2476
- judgeTarget: void 0,
2477
- config: {
2478
- response: '{"answer":"Mock dry-run response"}',
2479
- delayMs: dryRunDelay,
2480
- delayMinMs: dryRunDelayMin,
2481
- delayMaxMs: dryRunDelayMax
2482
- }
2483
- };
2484
- return {
2485
- definitions,
2486
- resolvedTarget: mockTarget,
2487
- targetName: targetChoice.name,
2488
- targetSource: targetChoice.source,
2489
- targetsFilePath
2490
- };
2491
- }
2492
- try {
2493
- const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
2494
- return {
2495
- definitions,
2496
- resolvedTarget,
2497
- targetName: targetChoice.name,
2498
- targetSource: targetChoice.source,
2499
- targetsFilePath
2500
- };
2501
- } catch (error) {
2502
- const message = error instanceof Error ? error.message : String(error);
2503
- throw new Error(`Failed to resolve target '${targetChoice.name}': ${message}`);
2504
- }
2505
- }
2506
- async function selectMultipleTargets(options) {
2507
- const {
2508
- testFilePath,
2509
- repoRoot,
2510
- cwd,
2511
- explicitTargetsPath,
2512
- dryRun,
2513
- dryRunDelay,
2514
- dryRunDelayMin,
2515
- dryRunDelayMax,
2516
- env,
2517
- targetNames
2518
- } = options;
2519
- const targetsFilePath = await discoverTargetsFile({
2520
- explicitPath: explicitTargetsPath,
2521
- testFilePath,
2522
- repoRoot,
2523
- cwd
2524
- });
2525
- const validationResult = await validateTargetsFile(targetsFilePath);
2526
- const warnings = validationResult.errors.filter((e) => e.severity === "warning");
2527
- const useColors = isTTY();
2528
- if (warnings.length > 0) {
2529
- console.warn(`
2530
- Warnings in ${targetsFilePath}:`);
2531
- for (const warning of warnings) {
2532
- const location = warning.location ? ` [${warning.location}]` : "";
2533
- const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
2534
- const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
2535
- console.warn(`${prefix}${location} ${message}`);
2536
- }
2537
- console.warn("");
2538
- }
2539
- const errors = validationResult.errors.filter((e) => e.severity === "error");
2540
- if (errors.length > 0) {
2541
- console.error(`
2542
- Errors in ${targetsFilePath}:`);
2543
- for (const error of errors) {
2544
- const location = error.location ? ` [${error.location}]` : "";
2545
- const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
2546
- const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
2547
- console.error(`${prefix}${location} ${message}`);
2548
- }
2549
- throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
2550
- }
2551
- const definitions = await readTargetDefinitions(targetsFilePath);
2552
- const results = [];
2553
- for (const name of targetNames) {
2554
- const targetDefinition = definitions.find(
2555
- (definition) => definition.name === name
2556
- );
2557
- if (!targetDefinition) {
2558
- const available = listTargetNames(definitions).join(", ");
2559
- throw new Error(
2560
- `Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`
2561
- );
2562
- }
2563
- if (dryRun) {
2564
- const mockTarget = {
2565
- kind: "mock",
2566
- name: `${targetDefinition.name}-dry-run`,
2567
- judgeTarget: void 0,
2568
- config: {
2569
- response: '{"answer":"Mock dry-run response"}',
2570
- delayMs: dryRunDelay,
2571
- delayMinMs: dryRunDelayMin,
2572
- delayMaxMs: dryRunDelayMax
2573
- }
2574
- };
2575
- results.push({
2576
- definitions,
2577
- resolvedTarget: mockTarget,
2578
- targetName: name,
2579
- targetSource: "cli",
2580
- targetsFilePath
2581
- });
2582
- } else {
2583
- try {
2584
- const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
2585
- results.push({
2586
- definitions,
2587
- resolvedTarget,
2588
- targetName: name,
2589
- targetSource: "cli",
2590
- targetsFilePath
2591
- });
2592
- } catch (error) {
2593
- const message = error instanceof Error ? error.message : String(error);
2594
- throw new Error(`Failed to resolve target '${name}': ${message}`);
2595
- }
2596
- }
2597
- }
2598
- return results;
2599
- }
2600
-
2601
- // src/commands/eval/run-eval.ts
2602
- var DEFAULT_WORKERS = 3;
2603
- function normalizeBoolean(value) {
2604
- return value === true;
2605
- }
2606
- function normalizeString(value) {
2607
- if (typeof value !== "string") {
2608
- return void 0;
2609
- }
2610
- const trimmed = value.trim();
2611
- return trimmed.length > 0 ? trimmed : void 0;
2612
- }
2613
- function resolveTimestampPlaceholder(value) {
2614
- if (!value.includes("{timestamp}")) {
2615
- return value;
2616
- }
2617
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2618
- return value.replaceAll("{timestamp}", timestamp);
2619
- }
2620
- function normalizeNumber(value, fallback) {
2621
- if (typeof value === "number" && Number.isFinite(value)) {
2622
- return value;
2623
- }
2624
- if (typeof value === "string") {
2625
- const parsed = Number.parseInt(value, 10);
2626
- if (!Number.isNaN(parsed)) {
2627
- return parsed;
2628
- }
2629
- }
2630
- return fallback;
2631
- }
2632
- function normalizeOptionalNumber(value) {
2633
- if (typeof value === "number" && Number.isFinite(value)) {
2634
- return value;
2635
- }
2636
- if (typeof value === "string") {
2637
- const parsed = Number.parseInt(value, 10);
2638
- if (!Number.isNaN(parsed)) {
2639
- return parsed;
2640
- }
2641
- }
2642
- return void 0;
2643
- }
2644
- function normalizeWorkspaceMode(value) {
2645
- return value === "pooled" || value === "temp" || value === "static" ? value : void 0;
2646
- }
2647
- function normalizeOptions(rawOptions, config, yamlExecution) {
2648
- const cliFormat = normalizeString(rawOptions.outputFormat);
2649
- const configFormat = config?.output?.format;
2650
- const formatStr = cliFormat ?? configFormat ?? "jsonl";
2651
- const format = formatStr === "yaml" ? "yaml" : "jsonl";
2652
- const cliWorkers = normalizeOptionalNumber(rawOptions.workers);
2653
- const configWorkers = config?.execution?.workers;
2654
- const workers = cliWorkers ?? configWorkers ?? 0;
2655
- const rawOutputPaths = rawOptions.output;
2656
- const outputPaths = Array.isArray(rawOutputPaths) ? rawOutputPaths.filter((v) => typeof v === "string" && v.trim().length > 0) : [];
2657
- const rawTarget = rawOptions.target;
2658
- let cliTargets = [];
2659
- let singleTarget;
2660
- if (Array.isArray(rawTarget)) {
2661
- cliTargets = rawTarget.filter((v) => typeof v === "string" && v.trim().length > 0);
2662
- singleTarget = cliTargets.length === 1 ? cliTargets[0] : void 0;
2663
- } else if (typeof rawTarget === "string") {
2664
- const trimmed = rawTarget.trim();
2665
- if (trimmed.length > 0 && trimmed !== "default") {
2666
- cliTargets = [trimmed];
2667
- singleTarget = trimmed;
2668
- }
2669
- }
2670
- const cliAgentTimeout = normalizeOptionalNumber(rawOptions.agentTimeout);
2671
- const configAgentTimeoutSeconds = config?.execution?.agentTimeoutMs != null ? config.execution.agentTimeoutMs / 1e3 : void 0;
2672
- const cliMaxRetries = normalizeOptionalNumber(rawOptions.maxRetries);
2673
- const configMaxRetries = config?.execution?.maxRetries;
2674
- const cliCache = normalizeBoolean(rawOptions.cache);
2675
- const cliNoCache = normalizeBoolean(rawOptions.noCache);
2676
- const configCacheEnabled = config?.cache?.enabled;
2677
- const resolvedCache = cliCache || !cliNoCache && configCacheEnabled === true;
2678
- const resolvedNoCache = cliNoCache;
2679
- const cliOut = normalizeString(rawOptions.out);
2680
- const configOut = config?.output?.dir;
2681
- const cliWorkspacePath = normalizeString(rawOptions.workspacePath);
2682
- const cliWorkspaceModeRaw = normalizeString(rawOptions.workspaceMode);
2683
- const cliWorkspaceMode = normalizeWorkspaceMode(rawOptions.workspaceMode);
2684
- if (cliWorkspacePath && cliWorkspaceModeRaw && cliWorkspaceMode !== "static") {
2685
- throw new Error("--workspace-path requires --workspace-mode=static (or omit --workspace-mode)");
2686
- }
2687
- const yamlExecutionRecord = yamlExecution;
2688
- const yamlWorkspaceMode = normalizeWorkspaceMode(yamlExecutionRecord?.workspace_mode);
2689
- const yamlWorkspacePath = normalizeString(yamlExecutionRecord?.workspace_path);
2690
- const workspacePath = cliWorkspacePath ?? yamlWorkspacePath;
2691
- const workspaceMode = cliWorkspacePath ? "static" : cliWorkspaceMode ?? yamlWorkspaceMode;
2692
- return {
2693
- target: singleTarget,
2694
- cliTargets,
2695
- targetsPath: normalizeString(rawOptions.targets),
2696
- filter: normalizeString(rawOptions.filter),
2697
- workers: workers > 0 ? workers : void 0,
2698
- outPath: cliOut ?? configOut,
2699
- outputPaths,
2700
- format,
2701
- dryRun: normalizeBoolean(rawOptions.dryRun),
2702
- dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
2703
- dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0),
2704
- dryRunDelayMax: normalizeNumber(rawOptions.dryRunDelayMax, 0),
2705
- agentTimeoutSeconds: cliAgentTimeout ?? configAgentTimeoutSeconds,
2706
- maxRetries: cliMaxRetries ?? configMaxRetries ?? 2,
2707
- cache: resolvedCache,
2708
- noCache: resolvedNoCache,
2709
- // Boolean OR: config `true` cannot be overridden to `false` from CLI.
2710
- // Intentional — there are no --no-verbose / --no-keep-workspaces flags.
2711
- // Precedence: CLI > YAML config > TS config
2712
- verbose: normalizeBoolean(rawOptions.verbose) || yamlExecution?.verbose === true || config?.execution?.verbose === true,
2713
- // Precedence: CLI > YAML config > TS config
2714
- otelFile: normalizeString(rawOptions.otelFile) ?? (yamlExecution?.otel_file ? resolveTimestampPlaceholder(yamlExecution.otel_file) : void 0) ?? (config?.execution?.otelFile ? resolveTimestampPlaceholder(config.execution.otelFile) : void 0),
2715
- traceFile: normalizeString(rawOptions.traceFile) ?? (yamlExecution?.trace_file ? resolveTimestampPlaceholder(yamlExecution.trace_file) : void 0) ?? (config?.execution?.traceFile ? resolveTimestampPlaceholder(config.execution.traceFile) : void 0),
2716
- exportOtel: normalizeBoolean(rawOptions.exportOtel),
2717
- otelBackend: normalizeString(rawOptions.otelBackend),
2718
- otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
2719
- otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
2720
- retryErrors: normalizeString(rawOptions.retryErrors),
2721
- workspaceMode,
2722
- workspacePath
2723
- };
2724
- }
2725
- async function ensureFileExists(filePath, description) {
2726
- try {
2727
- await access4(filePath, constants4.F_OK);
2728
- } catch {
2729
- throw new Error(`${description} not found: ${filePath}`);
2730
- }
2731
- }
2732
/**
 * Builds the default results path: `<cwd>/.agentv/results/eval_<timestamp><ext>`.
 *
 * @param {string} cwd - Directory under which `.agentv/results` is placed.
 * @param {string} format - Output format, passed to `getDefaultExtension` to pick the extension.
 * @returns {string} The joined output path.
 */
function buildDefaultOutputPath(cwd, format) {
  // ISO timestamp with ":" and "." replaced by "-".
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const fileName = `eval_${stamp}${getDefaultExtension(format)}`;
  return path10.join(cwd, ".agentv", "results", fileName);
}
2738
/**
 * Wraps a `ProgressDisplay` in a plain reporter object.
 *
 * @param {number} maxWorkers - Forwarded to the `ProgressDisplay` constructor.
 * @param {object} options - Forwarded to the `ProgressDisplay` constructor.
 * @returns {object} Reporter exposing `isInteractive` (sampled once, at creation) plus
 *   `start`, `setTotal`, `update`, `finish` and `addLogPaths`, each delegating to the display.
 */
function createProgressReporter(maxWorkers, options) {
  const progress = new ProgressDisplay(maxWorkers, options);
  const reporter = {
    isInteractive: progress.isInteractiveMode(),
    start() {
      return progress.start();
    },
    setTotal(total) {
      return progress.setTotalTests(total);
    },
    update(workerId, state) {
      // The explicit workerId argument wins over any workerId carried in `state`.
      return progress.updateWorker({ ...state, workerId });
    },
    finish() {
      return progress.finish();
    },
    addLogPaths(paths, provider) {
      return progress.addLogPaths(paths, provider);
    }
  };
  return reporter;
}
2749
/**
 * Builds the key used to identify an eval case: `<absolute test file path>::<evalId>`.
 *
 * @param {string} testFilePath - Test file path; resolved to an absolute path.
 * @param {string} evalId - Identifier of the eval case within that file.
 * @returns {string} The composite key.
 */
function makeEvalKey(testFilePath, evalId) {
  const absoluteFile = path10.resolve(testFilePath);
  return `${absoluteFile}::${evalId}`;
}
2752
/**
 * Creates a tracker that hands out stable, sequential display ids (1, 2, 3, ...) per eval key.
 *
 * @returns {{ getOrAssign(evalKey: string): number }} `getOrAssign` returns the id already
 *   assigned to the key, or assigns and returns the next unused one.
 */
function createDisplayIdTracker() {
  const idsByKey = new Map();
  return {
    getOrAssign(evalKey) {
      if (!idsByKey.has(evalKey)) {
        // Entries are never removed, so size + 1 is always the next sequential id.
        idsByKey.set(evalKey, idsByKey.size + 1);
      }
      return idsByKey.get(evalKey);
    }
  };
}
2767
/**
 * Applies the CLI verbose flag to a target selection whose resolved target is of kind "cli".
 *
 * @param {object} selection - Target selection; only `resolvedTarget.kind` and
 *   `resolvedTarget.config` are read.
 * @param {boolean} cliVerbose - Value written to `resolvedTarget.config.verbose`.
 * @returns {object} The same `selection` object for non-"cli" targets; otherwise a shallow
 *   copy with a copied `resolvedTarget` and `config` (the input is not mutated).
 */
function applyVerboseOverride(selection, cliVerbose) {
  const target = selection.resolvedTarget;
  if (target.kind === "cli") {
    const config = { ...target.config, verbose: cliVerbose };
    return { ...selection, resolvedTarget: { ...target, config } };
  }
  return selection;
}
2783
/**
 * Loads one eval file and resolves the target(s) it will run against.
 *
 * Target names come from `options.cliTargets` when non-empty, otherwise from the suite's own
 * `targets`, otherwise none. More than one name goes through `selectMultipleTargets`; zero or
 * one goes through `selectTarget` (with `options.target` as the name when none were listed).
 *
 * @param {object} params
 * @param {string} params.testFilePath - Eval file to load; must exist.
 * @param {string} params.repoRoot - Passed to the env loader, suite loader and target selection.
 * @param {string} params.cwd - Passed to target selection.
 * @param {object} params.options - Normalized CLI options.
 * @returns {Promise<object>} Metadata: `evalIds`, `evalCases`, `selections` (each
 *   `{ selection, inlineTargetLabel }`), `trialsConfig`, `suiteTargets`, `yamlCache`,
 *   `yamlCachePath`, `totalBudgetUsd`, `failOnError`.
 * @throws {Error} When the test file does not exist (via `ensureFileExists`).
 */
async function prepareFileMetadata(params) {
  const { testFilePath, repoRoot, cwd, options } = params;
  await ensureFileExists(testFilePath, "Test file");
  await loadEnvFromHierarchy({ testFilePath, repoRoot, verbose: options.verbose });
  const suite = await loadTestSuite(testFilePath, repoRoot, {
    verbose: options.verbose,
    filter: options.filter
  });

  const cliTargets = options.cliTargets;
  const suiteTargets = suite.targets;
  const hasSuiteTargets = Boolean(suiteTargets && suiteTargets.length > 0);
  // CLI-provided targets take precedence over the suite's own list.
  const targetNames = cliTargets.length > 0 ? cliTargets : hasSuiteTargets ? suiteTargets : [];

  // Arguments common to both selection helpers.
  const sharedArgs = {
    testFilePath,
    repoRoot,
    cwd,
    explicitTargetsPath: options.targetsPath,
    dryRun: options.dryRun,
    dryRunDelay: options.dryRunDelay,
    dryRunDelayMin: options.dryRunDelayMin,
    dryRunDelayMax: options.dryRunDelayMax,
    env: process.env
  };

  let resolvedSelections;
  if (targetNames.length > 1) {
    resolvedSelections = await selectMultipleTargets({ ...sharedArgs, targetNames });
  } else {
    const single = await selectTarget({
      ...sharedArgs,
      cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target
    });
    resolvedSelections = [single];
  }

  const selections = resolvedSelections.map((sel) => {
    const kind = sel.resolvedTarget.kind;
    const providerLabel = options.dryRun ? `${kind} (dry-run)` : kind;
    return {
      selection: sel,
      inlineTargetLabel: `${sel.targetName} [provider=${providerLabel}]`
    };
  });

  return {
    evalIds: suite.tests.map((test) => test.id),
    evalCases: suite.tests,
    selections,
    trialsConfig: suite.trials,
    suiteTargets,
    yamlCache: suite.cacheConfig?.enabled,
    yamlCachePath: suite.cacheConfig?.cachePath,
    totalBudgetUsd: suite.totalBudgetUsd,
    failOnError: suite.failOnError
  };
}
2860
/**
 * Runs `task` over `items` with at most `limit` tasks in flight at once.
 *
 * Workers pull from a shared cursor, so items are started in order. The first task rejection
 * rejects the returned promise (via `Promise.all`).
 *
 * @param {Array} items - Items to process.
 * @param {number} limit - Maximum concurrency. Values below 1 are treated as 1; a non-numeric
 *   or NaN limit falls back to 1 instead of silently processing nothing; the effective
 *   worker count never exceeds `items.length`.
 * @param {(item: any) => Promise<void>} task - Async callback invoked once per item.
 * @returns {Promise<void>} Resolves once every item has been processed.
 */
async function runWithLimit(items, limit, task) {
  // Math.max(1, NaN) is NaN, which would create zero workers and skip every item.
  const requested = typeof limit === "number" && !Number.isNaN(limit) ? limit : 1;
  // Never spawn more workers than there are items (also keeps Infinity safe).
  const workerCount = Math.min(items.length, Math.max(1, Math.floor(requested)));
  let index = 0;
  const workers = Array.from({ length: workerCount }, async () => {
    while (index < items.length) {
      const current = items[index];
      index += 1;
      await task(current);
    }
  });
  await Promise.all(workers);
}
2872
/**
 * Runs one eval file against one resolved target and streams results and progress.
 *
 * Steps: verify the file exists, apply the verbose override to the selection, print which
 * target is used (skipped in interactive mode unless verbose), resolve the worker count
 * (must be 1..50; VSCode providers are forced to 1), provision VSCode subagents when needed,
 * then invoke `evaluationRunner` with callbacks that append each result to `outputWriter`,
 * export it to OTel, and update the progress reporter.
 *
 * @param {object} params - Run parameters (paths, options, writers, runner, trackers, the
 *   target `selection` with its `inlineTargetLabel`, `evalCases`, `trialsConfig`,
 *   `matrixMode`, `totalBudgetUsd`, `failOnError`, `workersOverride`, `cache`, `otelExporter`).
 * @returns {Promise<{ results: Array }>} A copy of the results returned by the runner.
 * @throws {Error} When the test file is missing or the worker count is outside 1..50.
 */
async function runSingleEvalFile(params) {
  const {
    testFilePath,
    repoRoot,
    options,
    outputWriter,
    otelExporter,
    cache,
    evaluationRunner,
    workersOverride,
    progressReporter,
    seenEvalCases,
    displayIdTracker,
    selection,
    inlineTargetLabel,
    evalCases,
    trialsConfig,
    matrixMode,
    totalBudgetUsd,
    failOnError
  } = params;
  const targetName = selection.targetName;
  await ensureFileExists(testFilePath, "Test file");

  const chosen = applyVerboseOverride(selection, options.verbose);
  const target = chosen.resolvedTarget;
  const providerLabel = options.dryRun ? `${target.kind} (dry-run)` : target.kind;
  let targetMessage;
  if (options.verbose) {
    targetMessage = `Using target (${chosen.targetSource}): ${chosen.targetName} [provider=${providerLabel}] via ${chosen.targetsFilePath}`;
  } else {
    targetMessage = `Using target: ${inlineTargetLabel}`;
  }
  if (!progressReporter.isInteractive || options.verbose) {
    console.log(targetMessage);
  }

  // Seconds -> milliseconds; negative values clamp to 0, unset stays undefined.
  let agentTimeoutMs;
  if (options.agentTimeoutSeconds != null) {
    agentTimeoutMs = Math.max(0, options.agentTimeoutSeconds) * 1e3;
  }

  // Precedence: per-file override > CLI option > target setting > default.
  let resolvedWorkers = workersOverride ?? options.workers ?? target.workers ?? DEFAULT_WORKERS;
  if (resolvedWorkers < 1 || resolvedWorkers > 50) {
    throw new Error(`Workers must be between 1 and 50, got: ${resolvedWorkers}`);
  }

  const isVSCodeProvider = target.kind === "vscode" || target.kind === "vscode-insiders";
  if (isVSCodeProvider) {
    if (resolvedWorkers > 1) {
      console.warn(
        `Warning: VSCode providers require window focus. Limiting workers from ${resolvedWorkers} to 1 to prevent race conditions.`
      );
      resolvedWorkers = 1;
    }
    if (!options.dryRun) {
      await ensureVSCodeSubagents({
        kind: target.kind,
        count: resolvedWorkers,
        verbose: options.verbose,
        vscodeCmd: target.config.executable
      });
    }
  }

  const streamingObserver = otelExporter?.createStreamingObserver() ?? null;

  // Caching needs a cache instance and a target whose temperature does not rule it out.
  let useCache = false;
  if (cache) {
    if (shouldSkipCacheForTemperature(target.config)) {
      if (options.verbose) {
        console.log("Cache skipped: target temperature > 0");
      }
    } else {
      useCache = true;
    }
  }

  const handleResult = async (result) => {
    streamingObserver?.finalizeEvalCase(result.score, result.error);
    // `output` is dropped from what gets written to the output file(s).
    const { output: _, ...resultWithoutTrace } = result;
    await outputWriter.append(resultWithoutTrace);
    // Batch-export only when no streaming observer is handling the export.
    if (!otelExporter || streamingObserver) {
      return;
    }
    try {
      await otelExporter.exportResult(result);
    } catch (err) {
      if (options.verbose) {
        console.warn(
          `OTel export warning: ${err instanceof Error ? err.message : String(err)}`
        );
      }
    }
  };

  const handleProgress = async (event) => {
    // In matrix mode the same test id runs on several targets, so it is suffixed with the target.
    const labelledTestId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
    const evalKey = makeEvalKey(testFilePath, labelledTestId);
    if (event.status === "pending" && !seenEvalCases.has(evalKey)) {
      seenEvalCases.add(evalKey);
      progressReporter.setTotal(seenEvalCases.size);
    }
    const displayId = displayIdTracker.getOrAssign(evalKey);
    if (event.status === "running" && streamingObserver) {
      streamingObserver.startEvalCase(event.testId, targetName, testFilePath);
    }
    progressReporter.update(displayId, {
      workerId: displayId,
      testId: labelledTestId,
      status: event.status,
      startedAt: event.startedAt,
      completedAt: event.completedAt,
      error: event.error,
      targetLabel: inlineTargetLabel
    });
  };

  const results = await evaluationRunner({
    testFilePath,
    repoRoot,
    target,
    targets: chosen.definitions,
    env: process.env,
    maxRetries: Math.max(0, options.maxRetries),
    agentTimeoutMs,
    cache,
    useCache,
    evalCases,
    verbose: options.verbose,
    maxConcurrency: resolvedWorkers,
    workspaceMode: options.workspaceMode,
    workspacePath: options.workspacePath,
    trials: trialsConfig,
    totalBudgetUsd,
    failOnError,
    streamCallbacks: streamingObserver?.getStreamCallbacks(),
    onResult: handleResult,
    onProgress: handleProgress
  });
  return { results: [...results] };
}
2996
/**
 * Decides whether an eval case runs against a given target.
 *
 * Outside matrix mode (a single target) every case runs. In matrix mode a case that lists
 * its own `targets` only runs on those; a case without a list runs on every target.
 */
function isEvalCaseApplicableToTarget(test, targetName, matrixMode) {
  if (!matrixMode) {
    return true;
  }
  if (test.targets && test.targets.length > 0) {
    return test.targets.includes(targetName);
  }
  return true;
}

/**
 * Entry point of the `eval` command: loads configuration, prepares writers/exporters/cache,
 * runs every eval file against its target(s) and prints a summary.
 *
 * @param {object} input
 * @param {object} input.rawOptions - Raw CLI options; normalized via `normalizeOptions`.
 * @param {string[]} input.testFiles - Eval file paths; resolved against the current directory.
 * @returns {Promise<void>} Resolves when the run completes; returns early when
 *   `--retry-errors` finds nothing to retry.
 * @throws {Error} When a required file/directory is missing, no tests match the filters,
 *   or an eval run rejects.
 */
async function runEvalCommand(input) {
  const cwd = process.cwd();

  // A broken TS config is not fatal: warn and continue without it.
  let config = null;
  try {
    config = await loadTsConfig(cwd);
  } catch (err) {
    console.warn(
      `Warning: Failed to load agentv config: ${err instanceof Error ? err.message : String(err)}`
    );
  }
  const repoRoot = await findRepoRoot(cwd);
  // NOTE(review): "_" looks like a placeholder file name so that loadConfig starts its
  // lookup from cwd — confirm against loadConfig.
  const yamlConfig = await loadConfig(path10.join(cwd, "_"), repoRoot);
  if (yamlConfig?.required_version) {
    await enforceRequiredVersion(yamlConfig.required_version, {
      strict: normalizeBoolean(input.rawOptions.strict)
    });
  }
  let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);

  // --retry-errors: re-run only the tests that errored in a previous output file and carry
  // the non-error results over into the new output.
  let retryNonErrorResults;
  if (options.retryErrors) {
    const retryPath = path10.resolve(options.retryErrors);
    await ensureFileExists(retryPath, "Retry-errors JSONL file");
    const errorIds = await loadErrorTestIds(retryPath);
    if (errorIds.length === 0) {
      console.log("No execution errors found in the previous output. Nothing to retry.");
      return;
    }
    console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
    // One id is used as-is; several are combined into a `{a,b,c}` filter pattern.
    const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
    options = { ...options, filter: filterPattern };
    retryNonErrorResults = await loadNonErrorResults(retryPath);
  }

  // --workspace-path must point at an existing directory.
  if (options.workspacePath) {
    const resolvedWorkspace = path10.resolve(options.workspacePath);
    try {
      const { stat: stat2 } = await import("node:fs/promises");
      const stats = await stat2(resolvedWorkspace);
      if (!stats.isDirectory()) {
        throw new Error(`--workspace-path is not a directory: ${resolvedWorkspace}`);
      }
    } catch (err) {
      if (err.code === "ENOENT") {
        throw new Error(`--workspace-path does not exist: ${resolvedWorkspace}`);
      }
      throw err;
    }
    options = { ...options, workspacePath: resolvedWorkspace };
  }
  if (options.verbose) {
    console.log(`Repository root: ${repoRoot}`);
  }

  // OTel export is best-effort: any failure here downgrades to "no exporter" with a warning.
  let otelExporter = null;
  const useFileExport = !!(options.otelFile || options.traceFile);
  if (options.exportOtel || useFileExport) {
    try {
      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-MQBGD6LP.js");
      let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
      let headers = {};
      if (options.otelBackend) {
        const preset = OTEL_BACKEND_PRESETS[options.otelBackend];
        if (preset) {
          endpoint = preset.endpoint;
          headers = preset.headers(process.env);
        } else {
          console.warn(`Unknown OTel backend preset: ${options.otelBackend}`);
        }
      }
      // Comma-separated `key=value` pairs; only the first "=" splits, so values may contain "=".
      if (process.env.OTEL_EXPORTER_OTLP_HEADERS) {
        for (const pair of process.env.OTEL_EXPORTER_OTLP_HEADERS.split(",")) {
          const [key, ...rest] = pair.split("=");
          if (key) headers[key.trim()] = rest.join("=").trim();
        }
      }
      const captureContent = options.otelCaptureContent || process.env.AGENTV_OTEL_CAPTURE_CONTENT === "true";
      otelExporter = new OtelTraceExporter({
        endpoint,
        headers,
        captureContent,
        groupTurns: options.otelGroupTurns,
        otlpFilePath: options.otelFile ? path10.resolve(options.otelFile) : void 0,
        traceFilePath: options.traceFile ? path10.resolve(options.traceFile) : void 0
      });
      const initialized = await otelExporter.init();
      if (!initialized) {
        console.warn(
          "OTel export requested but @opentelemetry packages not available. Install them to enable export."
        );
        otelExporter = null;
      }
    } catch (err) {
      console.warn(
        `OTel export initialization failed: ${err instanceof Error ? err.message : String(err)}`
      );
      otelExporter = null;
    }
  }

  // Output writer(s): the primary path plus any extra paths, de-duplicated.
  const outputPath = options.outPath ? path10.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
  const extraOutputPaths = options.outputPaths.map((p) => path10.resolve(p));
  const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
  const uniqueOutputPaths = [...new Set(allOutputPaths)];
  let outputWriter;
  if (uniqueOutputPaths.length === 1) {
    outputWriter = await createOutputWriter(outputPath, options.format);
    console.log(`Output path: ${outputPath}`);
  } else {
    outputWriter = await createMultiWriter(uniqueOutputPaths);
    console.log("Output paths:");
    for (const p of uniqueOutputPaths) {
      console.log(`  ${p}`);
    }
  }
  const resolvedTestFiles = input.testFiles.map((file) => path10.resolve(file));
  if (options.otelFile) {
    console.log(`OTLP JSON file: ${path10.resolve(options.otelFile)}`);
  }
  if (options.traceFile) {
    console.log(`Trace file: ${path10.resolve(options.traceFile)}`);
  }

  const evaluationRunner = await resolveEvaluationRunner();
  const allResults = [];
  const seenEvalCases = /* @__PURE__ */ new Set();
  const displayIdTracker = createDisplayIdTracker();

  // Split the worker budget: files run concurrently, and when --workers was given each file
  // gets an equal share of it (otherwise each target's own setting applies).
  const totalWorkers = options.workers ?? DEFAULT_WORKERS;
  const fileConcurrency = Math.min(
    Math.max(1, totalWorkers),
    Math.max(1, resolvedTestFiles.length)
  );
  const perFileWorkers = options.workers ? Math.max(1, Math.floor(totalWorkers / fileConcurrency)) : void 0;

  // Load every file's suite and target selection up front (sequentially).
  const fileMetadata = /* @__PURE__ */ new Map();
  for (const testFilePath of resolvedTestFiles) {
    const meta = await prepareFileMetadata({
      testFilePath,
      repoRoot,
      cwd,
      options
    });
    fileMetadata.set(testFilePath, meta);
  }

  // Cache settings from YAML are taken from the first eval file only.
  const firstMeta = fileMetadata.values().next().value;
  const yamlCacheEnabled = firstMeta?.yamlCache;
  const yamlCachePath = firstMeta?.yamlCachePath;
  const cacheEnabled = shouldEnableCache({
    cliCache: options.cache,
    cliNoCache: options.noCache,
    yamlCache: yamlCacheEnabled
  });
  const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path10.resolve(yamlCachePath) : void 0) : void 0;
  if (cacheEnabled) {
    console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
  }

  const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
  let totalEvalCount = 0;
  for (const meta of fileMetadata.values()) {
    const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
    for (const test of meta.evalCases) {
      const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
      totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
    }
  }
  if (totalEvalCount === 0) {
    throw new Error("No tests matched the provided filters.");
  }

  const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
  progressReporter.start();
  progressReporter.setTotal(totalEvalCount);

  // Surface provider log files in the progress display, each path at most once.
  const seenCodexLogPaths = /* @__PURE__ */ new Set();
  const unsubscribeCodexLogs = subscribeToCodexLogEntries((entry) => {
    if (!entry.filePath || seenCodexLogPaths.has(entry.filePath)) {
      return;
    }
    seenCodexLogPaths.add(entry.filePath);
    progressReporter.addLogPaths([entry.filePath], "codex");
  });
  const seenPiLogPaths = /* @__PURE__ */ new Set();
  const unsubscribePiLogs = subscribeToPiLogEntries((entry) => {
    if (!entry.filePath || seenPiLogPaths.has(entry.filePath)) {
      return;
    }
    seenPiLogPaths.add(entry.filePath);
    progressReporter.addLogPaths([entry.filePath], "pi");
  });
  // The Copilot SDK and CLI subscriptions share one de-duplication set.
  const seenCopilotLogPaths = /* @__PURE__ */ new Set();
  const unsubscribeCopilotSdkLogs = subscribeToCopilotSdkLogEntries((entry) => {
    if (!entry.filePath || seenCopilotLogPaths.has(entry.filePath)) {
      return;
    }
    seenCopilotLogPaths.add(entry.filePath);
    progressReporter.addLogPaths([entry.filePath], "copilot");
  });
  const unsubscribeCopilotCliLogs = subscribeToCopilotCliLogEntries((entry) => {
    if (!entry.filePath || seenCopilotLogPaths.has(entry.filePath)) {
      return;
    }
    seenCopilotLogPaths.add(entry.filePath);
    progressReporter.addLogPaths([entry.filePath], "copilot");
  });

  // Pre-register a "pending" row for every (test, target) pair that will actually run.
  // Pairs excluded by a test's own `targets` list are skipped here exactly as in the run
  // phase below, so no row is registered that could never leave "pending".
  for (const [testFilePath, meta] of fileMetadata.entries()) {
    const matrixMode = meta.selections.length > 1;
    for (const { selection, inlineTargetLabel } of meta.selections) {
      for (const test of meta.evalCases) {
        if (!isEvalCaseApplicableToTarget(test, selection.targetName, matrixMode)) {
          continue;
        }
        const displayTestId = matrixMode ? `${test.id}@${selection.targetName}` : test.id;
        const evalKey = makeEvalKey(testFilePath, displayTestId);
        seenEvalCases.add(evalKey);
        const displayId = displayIdTracker.getOrAssign(evalKey);
        progressReporter.update(displayId, {
          workerId: displayId,
          testId: displayTestId,
          status: "pending",
          targetLabel: inlineTargetLabel
        });
      }
    }
  }

  try {
    await runWithLimit(resolvedTestFiles, fileConcurrency, async (testFilePath) => {
      const targetPrep = fileMetadata.get(testFilePath);
      if (!targetPrep) {
        throw new Error(`Missing metadata for ${testFilePath}`);
      }
      const matrixMode = targetPrep.selections.length > 1;
      // All targets of one file run concurrently.
      const targetResults = await Promise.all(
        targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
          const targetName = selection.targetName;
          const applicableEvalCases = matrixMode ? targetPrep.evalCases.filter((test) => isEvalCaseApplicableToTarget(test, targetName, matrixMode)) : targetPrep.evalCases;
          if (applicableEvalCases.length === 0) {
            return [];
          }
          const result = await runSingleEvalFile({
            testFilePath,
            cwd,
            repoRoot,
            options,
            outputWriter,
            otelExporter,
            cache,
            evaluationRunner,
            workersOverride: perFileWorkers,
            progressReporter,
            seenEvalCases,
            displayIdTracker,
            selection,
            inlineTargetLabel,
            evalCases: applicableEvalCases,
            trialsConfig: targetPrep.trialsConfig,
            matrixMode,
            totalBudgetUsd: targetPrep.totalBudgetUsd,
            failOnError: targetPrep.failOnError
          });
          return result.results;
        })
      );
      for (const results of targetResults) {
        allResults.push(...results);
      }
    });
    progressReporter.finish();

    // Carry over the non-error results of the previous run so the new output is complete.
    if (retryNonErrorResults && retryNonErrorResults.length > 0) {
      for (const preserved of retryNonErrorResults) {
        await outputWriter.append(preserved);
      }
      allResults.push(...retryNonErrorResults);
      console.log(
        `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
      );
    }

    const summary = calculateEvaluationSummary(allResults);
    console.log(formatEvaluationSummary(summary));
    if (isMatrixMode && allResults.length > 0) {
      console.log(formatMatrixSummary(allResults));
    }

    // List workspaces of results that errored or scored below 0.5.
    const failedWithWorkspaces = allResults.filter(
      (r) => r.workspacePath && (r.error || r.score < 0.5)
    );
    if (failedWithWorkspaces.length > 0) {
      console.log("\nWorkspaces preserved for debugging:");
      for (const result of failedWithWorkspaces) {
        console.log(`  ${result.testId}: ${result.workspacePath}`);
      }
    }
    if (allResults.length > 0) {
      if (uniqueOutputPaths.length === 1) {
        console.log(`\nResults written to: ${outputPath}`);
      } else {
        console.log("\nResults written to:");
        for (const p of uniqueOutputPaths) {
          console.log(`  ${p}`);
        }
      }
    }
  } finally {
    unsubscribeCodexLogs();
    unsubscribePiLogs();
    unsubscribeCopilotSdkLogs();
    unsubscribeCopilotCliLogs();
    // Cleanup is best-effort: failures while closing must not mask the run's own outcome.
    await outputWriter.close().catch(() => void 0);
    if (otelExporter) {
      try {
        await otelExporter.shutdown();
      } catch {
      }
    }
  }
}
3306
/**
 * Picks the evaluation runner: the built-in `runEvaluation`, or the `runEvaluation` export of
 * the module named by the `AGENTEVO_CLI_EVAL_RUNNER` environment variable when it is set.
 *
 * @returns {Promise<Function>} The runner function to use.
 * @throws {Error} When the override module does not export a `runEvaluation` function
 *   (import failures propagate as-is).
 */
async function resolveEvaluationRunner() {
  const overridePath = process.env.AGENTEVO_CLI_EVAL_RUNNER;
  if (!overridePath) {
    return runEvaluation;
  }
  // Relative override paths are resolved against the current working directory.
  let resolved = overridePath;
  if (!path10.isAbsolute(overridePath)) {
    resolved = path10.resolve(process.cwd(), overridePath);
  }
  const { runEvaluation: candidate } = await import(pathToFileURL(resolved).href);
  if (typeof candidate === "function") {
    return candidate;
  }
  throw new Error(
    `Module '${resolved}' must export a 'runEvaluation' function to override the default implementation`
  );
}
3322
-
3323
- export {
3324
- package_default,
3325
- toSnakeCaseDeep,
3326
- resolveEvalPaths,
3327
- findRepoRoot,
3328
- detectFileType,
3329
- validateEvalFile,
3330
- validateTargetsFile,
3331
- validateConfigFile,
3332
- validateFileReferences,
3333
- TARGET_FILE_CANDIDATES,
3334
- fileExists,
3335
- selectTarget,
3336
- runEvalCommand
3337
- };
3338
- //# sourceMappingURL=chunk-4MSAOMCC.js.map