agentv 3.1.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/dist/agentv-provider-5CJVBBGG-2XVZBW7L.js +9 -0
  2. package/dist/agentv-provider-5CJVBBGG-2XVZBW7L.js.map +1 -0
  3. package/dist/chunk-3L2L5GIL.js +51 -0
  4. package/dist/chunk-3L2L5GIL.js.map +1 -0
  5. package/dist/chunk-5H446C7X.js +50 -0
  6. package/dist/chunk-5H446C7X.js.map +1 -0
  7. package/dist/chunk-6GSYTMXD.js +31520 -0
  8. package/dist/chunk-6GSYTMXD.js.map +1 -0
  9. package/dist/chunk-BL4PVUAT.js +261 -0
  10. package/dist/chunk-BL4PVUAT.js.map +1 -0
  11. package/dist/chunk-C5GOHBQM.js +84 -0
  12. package/dist/chunk-C5GOHBQM.js.map +1 -0
  13. package/dist/chunk-FTPA72PY.js +6149 -0
  14. package/dist/chunk-FTPA72PY.js.map +1 -0
  15. package/dist/chunk-JK6V4KVD.js +114 -0
  16. package/dist/chunk-JK6V4KVD.js.map +1 -0
  17. package/dist/chunk-LRULMAAA.js +1711 -0
  18. package/dist/chunk-LRULMAAA.js.map +1 -0
  19. package/dist/chunk-OR4WXZAF.js +24302 -0
  20. package/dist/chunk-OR4WXZAF.js.map +1 -0
  21. package/dist/chunk-PCQA43SA.js +4248 -0
  22. package/dist/chunk-PCQA43SA.js.map +1 -0
  23. package/dist/chunk-SR4I5KET.js +1238 -0
  24. package/dist/chunk-SR4I5KET.js.map +1 -0
  25. package/dist/chunk-VQ2ZO7XJ.js +2098 -0
  26. package/dist/chunk-VQ2ZO7XJ.js.map +1 -0
  27. package/dist/chunk-XALGXSKB.js +21 -0
  28. package/dist/chunk-XALGXSKB.js.map +1 -0
  29. package/dist/chunk-XOSNETAV.js +565 -0
  30. package/dist/chunk-XOSNETAV.js.map +1 -0
  31. package/dist/cli.js +29 -0
  32. package/dist/cli.js.map +1 -0
  33. package/dist/dist-3BMOAU4X.js +305 -0
  34. package/dist/dist-3BMOAU4X.js.map +1 -0
  35. package/dist/esm-5Q4BZALM-5REQWAUV.js +924 -0
  36. package/dist/esm-5Q4BZALM-5REQWAUV.js.map +1 -0
  37. package/dist/esm-CZAWIY6F.js +32 -0
  38. package/dist/esm-CZAWIY6F.js.map +1 -0
  39. package/dist/esm-QNEMCJPL.js +933 -0
  40. package/dist/esm-QNEMCJPL.js.map +1 -0
  41. package/dist/esm-R77SNOF5.js +65 -0
  42. package/dist/esm-R77SNOF5.js.map +1 -0
  43. package/dist/esm-RVQPUGWH.js +1207 -0
  44. package/dist/esm-RVQPUGWH.js.map +1 -0
  45. package/dist/getMachineId-bsd-HSK5LZMG.js +41 -0
  46. package/dist/getMachineId-bsd-HSK5LZMG.js.map +1 -0
  47. package/dist/getMachineId-darwin-4DP6CCJV.js +41 -0
  48. package/dist/getMachineId-darwin-4DP6CCJV.js.map +1 -0
  49. package/dist/getMachineId-linux-44LJ5UJB.js +33 -0
  50. package/dist/getMachineId-linux-44LJ5UJB.js.map +1 -0
  51. package/dist/getMachineId-unsupported-NVK6IATM.js +24 -0
  52. package/dist/getMachineId-unsupported-NVK6IATM.js.map +1 -0
  53. package/dist/getMachineId-win-YZ36S7VA.js +43 -0
  54. package/dist/getMachineId-win-YZ36S7VA.js.map +1 -0
  55. package/dist/index.js +20 -0
  56. package/dist/index.js.map +1 -0
  57. package/dist/interactive-DLHPNSZ7.js +334 -0
  58. package/dist/interactive-DLHPNSZ7.js.map +1 -0
  59. package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js +9 -0
  60. package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js.map +1 -0
  61. package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js +9 -0
  62. package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js.map +1 -0
  63. package/dist/src-ML4D2MC2.js +1733 -0
  64. package/dist/src-ML4D2MC2.js.map +1 -0
  65. package/dist/templates/.agentv/targets.yaml +5 -24
  66. package/dist/token-POXF46NU.js +66 -0
  67. package/dist/token-POXF46NU.js.map +1 -0
  68. package/dist/token-util-6GWYZWGE.js +8 -0
  69. package/dist/token-util-6GWYZWGE.js.map +1 -0
  70. package/package.json +1 -1
@@ -0,0 +1,4248 @@
1
+ import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
+ import {
3
+ CLI_PLACEHOLDERS,
4
+ KNOWN_PROVIDERS,
5
+ PROVIDER_ALIASES,
6
+ ResponseCache,
7
+ buildDirectoryChain,
8
+ buildSearchRoots,
9
+ ensureVSCodeSubagents,
10
+ findGitRoot,
11
+ isEvaluatorKind,
12
+ listTargetNames,
13
+ loadConfig,
14
+ loadTestSuite,
15
+ loadTsConfig,
16
+ normalizeLineEndings,
17
+ readTargetDefinitions,
18
+ readTestSuiteMetadata,
19
+ resolveFileReference,
20
+ resolveTargetDefinition,
21
+ runEvaluation,
22
+ shouldEnableCache,
23
+ shouldSkipCacheForTemperature,
24
+ subscribeToCodexLogEntries,
25
+ subscribeToCopilotCliLogEntries,
26
+ subscribeToCopilotSdkLogEntries,
27
+ subscribeToPiLogEntries
28
+ } from "./chunk-OR4WXZAF.js";
29
+
30
+ // package.json
31
// Inlined copy of the CLI package manifest; `version` is read by checkVersion().
var package_default = {
  "name": "agentv",
  "version": "3.2.0",
  "description": "CLI entry point for AgentV",
  "type": "module",
  "repository": {
    "type": "git",
    "url": "https://github.com/EntityProcess/agentv.git"
  },
  "homepage": "https://github.com/EntityProcess/agentv#readme",
  "bugs": {
    "url": "https://github.com/EntityProcess/agentv/issues"
  },
  "bin": {
    "agentv": "./dist/cli.js"
  },
  "files": ["dist", "README.md"],
  "scripts": {
    "dev": "bun src/cli.ts",
    "build": "tsup && bun run copy-readme",
    "copy-readme": `bun -e "import { cpSync } from 'fs'; cpSync('../../README.md', 'README.md')"`,
    "prepublishOnly": "bun run copy-readme",
    "typecheck": "tsc --noEmit",
    "lint": "biome check .",
    "format": "biome format --write .",
    "fix": "biome check --write .",
    "test": "bun test",
    "test:watch": "bun test --watch"
  },
  "dependencies": {
    "@anthropic-ai/claude-agent-sdk": "^0.2.49",
    "@github/copilot-sdk": "^0.1.25",
    "@inquirer/prompts": "^8.2.1",
    "@mariozechner/pi-agent-core": "^0.54.2",
    "@mariozechner/pi-ai": "^0.54.2",
    "@openai/codex-sdk": "^0.104.0",
    "cmd-ts": "^0.14.3",
    "dotenv": "^16.4.5",
    "fast-glob": "^3.3.3",
    "json5": "^2.2.3",
    "micromatch": "^4.0.8",
    "semver": "^7.7.4",
    "yaml": "^2.6.1"
  },
  "devDependencies": {
    "@agentv/core": "workspace:*",
    "@types/semver": "^7.7.1",
    "execa": "^9.3.0"
  }
};
81
+
82
+ // src/commands/eval/shared.ts
83
+ import { constants } from "node:fs";
84
+ import { access, stat } from "node:fs/promises";
85
+ import path from "node:path";
86
+ import fg from "fast-glob";
87
/**
 * Resolve user-supplied eval paths and glob patterns to a sorted,
 * de-duplicated list of eval files.
 *
 * Each entry is first tried as a literal file path (absolute, or relative to
 * `cwd`). Anything that is not an existing YAML/JSONL/JSON file is then passed
 * to fast-glob with `cwd` as the base directory.
 *
 * @param {Array<string | null | undefined>} evalPaths - Raw paths or globs; blank entries are ignored.
 * @param {string} cwd - Directory that relative paths and globs are resolved against.
 * @returns {Promise<string[]>} Matching file paths in default sort order.
 * @throws {Error} When no usable input is given, or when any input matches no eval file.
 */
async function resolveEvalPaths(evalPaths, cwd) {
  const evalFilePattern = /\.(ya?ml|jsonl|json)$/i;
  const patterns = [];
  for (const raw of evalPaths) {
    const trimmed = raw?.trim();
    if (trimmed) {
      patterns.push(trimmed);
    }
  }
  if (patterns.length === 0) {
    throw new Error("No eval paths provided.");
  }

  const found = new Set();
  const missing = [];
  for (const pattern of patterns) {
    const literalPath = path.isAbsolute(pattern) ? path.normalize(pattern) : path.resolve(cwd, pattern);
    // A failed stat only means "not a literal file"; fall through to globbing.
    const literalStats = await stat(literalPath).catch(() => void 0);
    if (literalStats?.isFile() && evalFilePattern.test(literalPath)) {
      found.add(literalPath);
      continue;
    }

    // Convert backslashes to forward slashes before globbing.
    const globMatches = await fg(pattern.replace(/\\/g, "/"), {
      cwd,
      absolute: true,
      onlyFiles: true,
      unique: true,
      dot: true,
      followSymbolicLinks: true
    });
    const evalMatches = globMatches.filter((filePath) => evalFilePattern.test(filePath));
    if (evalMatches.length === 0) {
      missing.push(pattern);
      continue;
    }
    for (const filePath of evalMatches) {
      found.add(path.normalize(filePath));
    }
  }

  if (missing.length > 0) {
    const missingList = missing.join(", ");
    throw new Error(
      `No eval files matched: ${missingList}. Provide YAML, JSONL, or JSON paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl", "evals.json").`
    );
  }
  return [...found].sort();
}
133
/**
 * Walk upward from `start` looking for the nearest directory that contains a
 * `.git` entry.
 *
 * @param {string} start - Directory to begin the search from.
 * @returns {Promise<string>} The first ancestor (or `start` itself) containing
 *   `.git`, or the resolved `start` when the filesystem root is reached first.
 */
async function findRepoRoot(start) {
  const origin = path.resolve(start);
  let dir = origin;
  for (;;) {
    const hasGitEntry = await access(path.join(dir, ".git"), constants.F_OK).then(
      () => true,
      () => false
    );
    if (hasGitEntry) {
      return dir;
    }
    const parentDir = path.dirname(dir);
    // dirname() returns its input unchanged at the filesystem root.
    if (parentDir === dir) {
      return origin;
    }
    dir = parentDir;
  }
}
151
+
152
+ // src/utils/targets.ts
153
+ import { constants as constants2 } from "node:fs";
154
+ import { access as access2 } from "node:fs/promises";
155
+ import path2 from "node:path";
156
// Relative locations probed for a targets file, in priority order: the
// directory itself first, then its `.agentv` subdirectory; `.yaml` before `.yml`.
var TARGET_FILE_CANDIDATES = ["", ".agentv"].flatMap(
  (dir) => ["targets.yaml", "targets.yml"].map(
    (fileName) => dir === "" ? fileName : path2.join(dir, fileName)
  )
);
162
/**
 * Report whether `filePath` is present on disk.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when the access check succeeds, `false` on any failure.
 */
async function fileExists(filePath) {
  return access2(filePath, constants2.F_OK).then(
    () => true,
    () => false
  );
}
170
/**
 * Locate the targets file to use for an eval run.
 *
 * With `explicitPath`, that path is accepted if it exists as-is; otherwise it
 * is treated as a directory and probed with each TARGET_FILE_CANDIDATES entry.
 * Without it, every directory from buildDirectoryChain(testFilePath, repoRoot)
 * is probed in order, followed by `cwd` if it is not already in the chain.
 *
 * @param {{ explicitPath?: string, testFilePath: string, repoRoot: string, cwd: string }} options
 * @returns {Promise<string>} Path of the first targets file found.
 * @throws {Error} When nothing is found at the explicit path or in any searched directory.
 */
async function discoverTargetsFile(options) {
  const { explicitPath, testFilePath, repoRoot, cwd } = options;

  if (explicitPath) {
    const explicitBase = path2.resolve(explicitPath);
    const explicitCandidates = [
      explicitBase,
      ...TARGET_FILE_CANDIDATES.map((name) => path2.join(explicitBase, name))
    ];
    for (const candidatePath of explicitCandidates) {
      if (await fileExists(candidatePath)) {
        return candidatePath;
      }
    }
    throw new Error(`targets.yaml not found at provided path: ${explicitBase}`);
  }

  const searchDirs = [...buildDirectoryChain(testFilePath, repoRoot)];
  const cwdDir = path2.resolve(cwd);
  if (!searchDirs.includes(cwdDir)) {
    searchDirs.push(cwdDir);
  }
  for (const dir of searchDirs) {
    for (const name of TARGET_FILE_CANDIDATES) {
      const candidatePath = path2.join(dir, name);
      if (await fileExists(candidatePath)) {
        return candidatePath;
      }
    }
  }
  throw new Error("Unable to locate targets.yaml. Use --targets to specify the file explicitly.");
}
200
+
201
+ // src/commands/eval/run-eval.ts
202
+ import { constants as constants4 } from "node:fs";
203
+ import { access as access4 } from "node:fs/promises";
204
+ import path12 from "node:path";
205
+ import { pathToFileURL } from "node:url";
206
+
207
+ // src/version-check.ts
208
+ import { coerce, satisfies, validRange } from "semver";
209
// ANSI escape sequences (yellow, red, and attribute reset) that enforceRequiredVersion wraps around its console messages.
+ var ANSI_YELLOW = "\x1B[33m";
210
+ var ANSI_RED = "\x1B[31m";
211
+ var ANSI_RESET = "\x1B[0m";
212
/**
 * Compare the bundled agentv version against a required semver range.
 *
 * @param {string} requiredVersion - Semver range taken from `.agentv/config.yaml`.
 * @returns {{ satisfied: boolean, currentVersion: string, requiredRange: string }}
 * @throws {Error} When the range is blank or not a valid semver range.
 */
function checkVersion(requiredVersion) {
  const isUsableRange = requiredVersion.trim() !== "" && Boolean(validRange(requiredVersion));
  if (!isUsableRange) {
    throw new Error(
      `Invalid required_version "${requiredVersion}" in .agentv/config.yaml. Must be a valid semver range (e.g., ">=2.11.0", "^2.11.0").`
    );
  }
  const installed = package_default.version;
  // Compare against the coerced version when coerce() yields one; otherwise use the raw string.
  const comparable = coerce(installed) ?? installed;
  return {
    satisfied: satisfies(comparable, requiredVersion),
    currentVersion: installed,
    requiredRange: requiredVersion
  };
}
225
/**
 * Enforce the project's `required_version` constraint.
 *
 * - Invalid range: print the error and exit with code 1.
 * - Satisfied: return silently.
 * - Unsatisfied with `options.strict`: print the warning plus an abort notice, exit with code 1.
 * - Unsatisfied on an interactive terminal: warn and ask whether to continue; exit with code 1 on "no".
 * - Unsatisfied otherwise: write the warning to stderr and carry on.
 *
 * @param {string} requiredVersion - Semver range from the project config.
 * @param {{ strict?: boolean }} [options]
 * @returns {Promise<void>}
 */
async function enforceRequiredVersion(requiredVersion, options) {
  let check;
  try {
    check = checkVersion(requiredVersion);
  } catch (err) {
    console.error(`${ANSI_RED}Error: ${err.message}${ANSI_RESET}`);
    process.exit(1);
  }
  if (check.satisfied) {
    return;
  }

  const { requiredRange, currentVersion } = check;
  const warning = `${ANSI_YELLOW}Warning: This project requires agentv ${requiredRange} but you have ${currentVersion}.${ANSI_RESET}\nRun \`agentv self update\` to upgrade.`;

  if (options?.strict) {
    console.error(warning);
    console.error(
      `${ANSI_RED}Aborting: --strict mode requires the installed version to satisfy the required range.${ANSI_RESET}`
    );
    process.exit(1);
  }

  const isInteractive = process.stdin.isTTY && process.stdout.isTTY;
  if (!isInteractive) {
    process.stderr.write(`${warning}\n`);
    return;
  }
  console.warn(warning);
  const keepGoing = await promptContinue();
  if (!keepGoing) {
    process.exit(1);
  }
}
256
/**
 * Ask the user whether to continue despite a version mismatch.
 * The prompt library is imported on demand, only when this function runs.
 *
 * @returns {Promise<boolean>} The user's answer; the prompt defaults to "no".
 */
async function promptContinue() {
  const { confirm: askYesNo } = await import("@inquirer/prompts");
  const question = { message: "Continue anyway?", default: false };
  return askYesNo(question);
}
260
+
261
+ // src/commands/eval/artifact-writer.ts
262
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
263
+ import path3 from "node:path";
264
// Minimum score (inclusive) at which computePassRate counts a result or an individual evaluator score as passing.
+ var PASS_THRESHOLD = 0.8;
265
/**
 * Compute the mean and population standard deviation of a list of numbers,
 * each rounded to three decimal places.
 *
 * @param {number[]} values
 * @returns {{ mean: number, stddev: number }} Both fields are 0 for an empty list.
 */
function computeStats(values) {
  const count = values.length;
  if (count === 0) {
    return { mean: 0, stddev: 0 };
  }
  const roundTo3 = (x) => Math.round(x * 1e3) / 1e3;

  let total = 0;
  for (const value of values) {
    total += value;
  }
  const mean = total / count;

  let squaredDeviation = 0;
  for (const value of values) {
    squaredDeviation += (value - mean) ** 2;
  }
  // Divides by N (not N - 1): this is the population variance.
  const variance = squaredDeviation / count;

  return { mean: roundTo3(mean), stddev: roundTo3(Math.sqrt(variance)) };
}
276
/**
 * Fraction of a result's evaluator scores that reach PASS_THRESHOLD.
 * When the result has no per-evaluator scores, the overall `score` is used
 * instead and the outcome is either 1 (pass) or 0 (fail).
 *
 * @param {{ score: number, scores?: Array<{ score: number }> }} result
 * @returns {number} A value between 0 and 1.
 */
function computePassRate(result) {
  const evaluatorScores = result.scores;
  if (!evaluatorScores || !(evaluatorScores.length > 0)) {
    return result.score >= PASS_THRESHOLD ? 1 : 0;
  }
  let passing = 0;
  for (const entry of evaluatorScores) {
    if (entry.score >= PASS_THRESHOLD) {
      passing += 1;
    }
  }
  return passing / evaluatorScores.length;
}
284
/**
 * Tally tool invocations recorded in a result's trace.
 * A step counts as a tool call when it has a truthy `toolName` or its `type`
 * is "tool"; steps with no `toolName` are tallied under "unknown".
 *
 * @param {{ trace?: { steps?: Array<{ toolName?: string, type?: string }> } }} result
 * @returns {{ toolCalls: Record<string, number>, total: number }}
 */
function countToolCalls(result) {
  const perTool = {};
  let callCount = 0;
  const steps = result.trace?.steps;
  if (!steps) {
    return { toolCalls: perTool, total: callCount };
  }
  for (const { toolName, type } of steps) {
    const isToolStep = toolName || type === "tool";
    if (!isToolStep) {
      continue;
    }
    const key = toolName ?? "unknown";
    perTool[key] = (perTool[key] ?? 0) + 1;
    callCount += 1;
  }
  return { toolCalls: perTool, total: callCount };
}
299
/**
 * Summarise a diff string describing workspace file changes.
 * Lines starting with `--- /dev/null` count as created files and lines
 * starting with `--- a/` count as modified files. The summary is the diff
 * itself, cut to its first 20 lines (plus a trailer) when it is longer.
 *
 * @param {string | undefined | null} fileChanges - Diff text.
 * @returns {{ files_modified: number, files_created: number, diff_summary: string } | undefined}
 *   `undefined` when no diff text is supplied.
 */
function parseWorkspaceChanges(fileChanges) {
  if (!fileChanges) {
    return void 0;
  }
  const summaryLimit = 20;
  const diffLines = fileChanges.split("\n");
  const createdCount = diffLines.filter((line) => line.startsWith("--- /dev/null")).length;
  const modifiedCount = diffLines.filter((line) => line.startsWith("--- a/")).length;

  let summary = fileChanges;
  if (diffLines.length > summaryLimit) {
    const head = diffLines.slice(0, summaryLimit).join("\n");
    summary = `${head}\n... (${diffLines.length - summaryLimit} more lines)`;
  }
  return {
    files_modified: modifiedCount,
    files_created: createdCount,
    diff_summary: summary
  };
}
322
/**
 * Flatten a result's hits and misses into a list of expectation records.
 *
 * When the result carries per-evaluator `scores`, each evaluator contributes
 * its own hits (passed) followed by its misses (failed), with that evaluator's
 * reasoning as evidence. Otherwise the result-level hits, misses and reasoning
 * are used.
 *
 * A missing `hits` or `misses` array is treated as empty, so a result that
 * lacks them no longer throws a TypeError.
 *
 * @param {{ scores?: Array<object>, hits?: string[], misses?: string[], reasoning?: string }} result
 * @returns {Array<{ text: string, passed: boolean, evidence: string }>}
 */
function buildExpectations(result) {
  const expectations = [];
  // Append one source's hits, then its misses, sharing the same evidence text.
  const collect = (source) => {
    const evidence = source.reasoning ?? "";
    for (const text of source.hits ?? []) {
      expectations.push({ text, passed: true, evidence });
    }
    for (const text of source.misses ?? []) {
      expectations.push({ text, passed: false, evidence });
    }
  };

  if (result.scores && result.scores.length > 0) {
    for (const evaluator of result.scores) {
      collect(evaluator);
    }
  } else {
    collect(result);
  }
  return expectations;
}
351
/**
 * Project per-evaluator score records onto the fields written to the grading
 * artifact. A nullish `reasoning` becomes an empty string; every other field
 * is copied through unchanged.
 *
 * @param {Array<object> | undefined | null} scores
 * @returns {Array<object> | undefined} `undefined` when there are no scores.
 */
function buildEvaluators(scores) {
  if (!scores || scores.length === 0) {
    return void 0;
  }
  const evaluators = [];
  for (const entry of scores) {
    const { name, type, score, reasoning, weight, verdict, hits, misses, details } = entry;
    evaluators.push({
      name,
      type,
      score,
      reasoning: reasoning ?? "",
      weight,
      verdict,
      hits,
      misses,
      details
    });
  }
  return evaluators;
}
367
/**
 * Assemble the grading artifact for a single evaluation result: the flattened
 * expectations with a pass/fail summary, tool-call and error metrics, the
 * per-evaluator breakdown, workspace changes, and conversation info when the
 * result has a `conversationId`.
 *
 * @param {object} result - One evaluation result.
 * @returns {object} The grading artifact (snake_case keys).
 */
function buildGradingArtifact(result) {
  const expectations = buildExpectations(result);
  const total = expectations.length;
  let passed = 0;
  for (const expectation of expectations) {
    if (expectation.passed) {
      passed += 1;
    }
  }
  const failed = total - passed;
  const passRate = total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0;

  const toolUsage = countToolCalls(result);

  let conversation = void 0;
  if (result.conversationId) {
    conversation = {
      turns: result.trace?.steps?.length ?? 0,
      conversation_id: result.conversationId
    };
  }

  return {
    expectations,
    summary: { passed, failed, total, pass_rate: passRate },
    execution_metrics: {
      tool_calls: toolUsage.toolCalls,
      total_tool_calls: toolUsage.total,
      errors_encountered: result.error ? 1 : 0
    },
    evaluators: buildEvaluators(result.scores),
    workspace_changes: parseWorkspaceChanges(result.fileChanges),
    conversation
  };
}
395
/**
 * Aggregate token usage and wall-clock duration across all results.
 * Results without `tokenUsage` or `durationMs` contribute nothing to the
 * corresponding totals.
 *
 * @param {Array<{ tokenUsage?: { input?: number, output?: number }, durationMs?: number }>} results
 * @returns {{ total_tokens: number, duration_ms: number, total_duration_seconds: number,
 *   token_usage: { input: number, output: number } }}
 */
function buildTimingArtifact(results) {
  const totals = results.reduce(
    (acc, { tokenUsage, durationMs }) => {
      if (tokenUsage) {
        acc.input += tokenUsage.input ?? 0;
        acc.output += tokenUsage.output ?? 0;
      }
      if (durationMs != null) {
        acc.durationMs += durationMs;
      }
      return acc;
    },
    { input: 0, output: 0, durationMs: 0 }
  );
  return {
    total_tokens: totals.input + totals.output,
    duration_ms: totals.durationMs,
    total_duration_seconds: Math.round(totals.durationMs / 1e3 * 1e3) / 1e3,
    token_usage: {
      input: totals.input,
      output: totals.output
    }
  };
}
419
/**
 * Build the benchmark artifact for a set of evaluation results: per-target
 * statistics (pass rate, time, tokens, and tool calls / cost when present),
 * per-evaluator score statistics, run metadata, and free-form notes.
 *
 * @param {Array<object>} results - Evaluation results, possibly spanning several targets.
 * @param {string} [evalFile=""] - Eval file name recorded in the metadata.
 * @returns {{ metadata: object, run_summary: object, per_evaluator_summary: object | undefined, notes: string[] }}
 */
function buildBenchmarkArtifact(results, evalFile = "") {
  const targets = [...new Set(results.map((r) => r.target))].sort();
  const testIds = [...new Set(results.map((r) => r.testId))].sort();
  const tokenTotal = ({ tokenUsage }) => (tokenUsage.input ?? 0) + (tokenUsage.output ?? 0);

  const runSummary = {};
  for (const target of targets) {
    const forTarget = results.filter((r) => r.target === target);
    const summary = {
      pass_rate: computeStats(forTarget.map(computePassRate)),
      time_seconds: computeStats(
        forTarget.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3)
      ),
      tokens: computeStats(forTarget.filter((r) => r.tokenUsage != null).map(tokenTotal))
    };
    // Tool-call and cost statistics are only emitted when there is data for them.
    const toolCallTotals = forTarget.map((r) => countToolCalls(r).total);
    if (toolCallTotals.some((count) => count > 0)) {
      summary.tool_calls = computeStats(toolCallTotals);
    }
    const costs = forTarget.filter((r) => r.costUsd != null).map((r) => r.costUsd);
    if (costs.length > 0) {
      summary.cost_usd = computeStats(costs);
    }
    runSummary[target] = summary;
  }

  // Group raw scores by "<evaluator name>:<evaluator type>".
  const scoresByEvaluator = new Map();
  for (const { scores } of results) {
    if (!scores) {
      continue;
    }
    for (const { name, type, score } of scores) {
      const key = `${name}:${type}`;
      let bucket = scoresByEvaluator.get(key);
      if (!bucket) {
        bucket = [];
        scoresByEvaluator.set(key, bucket);
      }
      bucket.push(score);
    }
  }
  let perEvaluatorSummary;
  if (scoresByEvaluator.size > 0) {
    perEvaluatorSummary = {};
    scoresByEvaluator.forEach((bucket, key) => {
      perEvaluatorSummary[key] = computeStats(bucket);
    });
  }

  const notes = [];
  const errorCount = results.reduce(
    (count, r) => r.executionStatus === "execution_error" ? count + 1 : count,
    0
  );
  if (errorCount > 0) {
    notes.push(
      `${errorCount} test(s) had execution errors and are included in pass_rate as failures`
    );
  }
  if (results.length === 0) {
    notes.push("No results to summarize");
  }

  // The run timestamp is the first result's; fall back to "now" when it has none.
  const timestamp = results[0]?.timestamp ?? new Date().toISOString();
  return {
    metadata: {
      eval_file: evalFile,
      timestamp,
      targets,
      tests_run: testIds
    },
    run_summary: runSummary,
    per_evaluator_summary: perEvaluatorSummary,
    notes
  };
}
495
/**
 * Write the grading, timing and benchmark artifacts for a run into `outputDir`.
 *
 * Layout:
 *   <outputDir>/grading/<test id>.json   one file per result
 *   <outputDir>/timing.json
 *   <outputDir>/benchmark.json
 *
 * Grading file names are derived from the test id with filesystem-unsafe
 * characters replaced by "_". When several results map to the same sanitised
 * test id (for example the same test run against multiple targets), each of
 * them is written as `<test id>__<target>.json`, with a numeric suffix if that
 * is still not unique. Previously such results silently overwrote each other.
 * Results with a unique test id keep the plain `<test id>.json` name.
 *
 * @param {Array<object>} results - Evaluation results.
 * @param {string} outputDir - Directory to write into; `grading/` is created as needed.
 * @param {{ evalFile?: string }} [options]
 * @returns {Promise<{ gradingDir: string, timingPath: string, benchmarkPath: string }>}
 */
async function writeArtifactsFromResults(results, outputDir, options) {
  const gradingDir = path3.join(outputDir, "grading");
  const timingPath = path3.join(outputDir, "timing.json");
  const benchmarkPath = path3.join(outputDir, "benchmark.json");
  await mkdir(gradingDir, { recursive: true });

  const sanitize = (value) => String(value).replace(/[/\\:*?"<>|]/g, "_");
  const toJsonText = (value) => `${JSON.stringify(value, null, 2)}\n`;

  // Count sanitised ids up front so every member of a colliding group is
  // renamed the same way, independent of result order.
  const idCounts = new Map();
  for (const result of results) {
    const safeTestId = sanitize(result.testId);
    idCounts.set(safeTestId, (idCounts.get(safeTestId) ?? 0) + 1);
  }

  const usedNames = new Set();
  for (const result of results) {
    const safeTestId = sanitize(result.testId);
    const baseName = idCounts.get(safeTestId) > 1
      ? `${safeTestId}__${sanitize(result.target ?? "unknown")}`
      : safeTestId;
    let fileName = baseName;
    for (let attempt = 2; usedNames.has(fileName); attempt += 1) {
      fileName = `${baseName}_${attempt}`;
    }
    usedNames.add(fileName);

    const gradingPath = path3.join(gradingDir, `${fileName}.json`);
    await writeFile(gradingPath, toJsonText(buildGradingArtifact(result)), "utf8");
  }

  await writeFile(timingPath, toJsonText(buildTimingArtifact(results)), "utf8");
  await writeFile(
    benchmarkPath,
    toJsonText(buildBenchmarkArtifact(results, options?.evalFile)),
    "utf8"
  );
  return { gradingDir, timingPath, benchmarkPath };
}
515
+
516
+ // src/commands/eval/benchmark-writer.ts
517
+ import { writeFile as writeFile2 } from "node:fs/promises";
518
// Minimum score (inclusive) at which computePassRate2 counts a result or an individual evaluator score as passing.
+ var PASS_THRESHOLD2 = 0.8;
519
/**
 * Mean and population standard deviation of `values`, rounded to three
 * decimal places. Used by the benchmark.json writer.
 *
 * @param {number[]} values
 * @returns {{ mean: number, stddev: number }} Zeros for an empty list.
 */
function computeStats2(values) {
  const n = values.length;
  if (n === 0) {
    return { mean: 0, stddev: 0 };
  }
  let sum = 0;
  for (let i = 0; i < n; i++) {
    sum += values[i];
  }
  const mean = sum / n;
  let sumOfSquares = 0;
  for (let i = 0; i < n; i++) {
    sumOfSquares += (values[i] - mean) ** 2;
  }
  const stddev = Math.sqrt(sumOfSquares / n);
  return {
    mean: Math.round(mean * 1e3) / 1e3,
    stddev: Math.round(stddev * 1e3) / 1e3
  };
}
530
/**
 * Share of evaluator scores at or above PASS_THRESHOLD2. Falls back to a 1/0
 * verdict on the overall `score` when there are no evaluator scores.
 *
 * @param {{ score: number, scores?: Array<{ score: number }> }} result
 * @returns {number} A value between 0 and 1.
 */
function computePassRate2(result) {
  const { scores } = result;
  const hasEvaluatorScores = scores && scores.length > 0;
  if (!hasEvaluatorScores) {
    return result.score >= PASS_THRESHOLD2 ? 1 : 0;
  }
  const passingCount = scores.reduce(
    (count, entry) => entry.score >= PASS_THRESHOLD2 ? count + 1 : count,
    0
  );
  return passingCount / scores.length;
}
538
/**
 * Build the benchmark.json payload: pass-rate, time (seconds) and token
 * statistics over all results, reported under `run_summary.with_skill`.
 *
 * @param {Array<object>} results - Evaluation results.
 * @returns {{ run_summary: { with_skill: { pass_rate: object, time_seconds: object, tokens: object } } }}
 */
function buildBenchmarkJson(results) {
  const passRates = [];
  const seconds = [];
  const tokenTotals = [];
  for (const result of results) {
    passRates.push(computePassRate2(result));
    if (result.durationMs != null) {
      seconds.push(result.durationMs / 1e3);
    }
    if (result.tokenUsage != null) {
      const { input, output } = result.tokenUsage;
      tokenTotals.push((input ?? 0) + (output ?? 0));
    }
  }
  const withSkill = {
    pass_rate: computeStats2(passRates),
    time_seconds: computeStats2(seconds),
    tokens: computeStats2(tokenTotals)
  };
  return { run_summary: { with_skill: withSkill } };
}
555
/**
 * Serialise the benchmark summary for `results` as pretty-printed JSON
 * (two-space indent, trailing newline) and write it to `outputPath`.
 *
 * @param {string} outputPath - Destination file.
 * @param {Array<object>} results - Evaluation results.
 * @returns {Promise<void>}
 */
async function writeBenchmarkJson(outputPath, results) {
  const serialized = JSON.stringify(buildBenchmarkJson(results), null, 2);
  await writeFile2(outputPath, `${serialized}\n`, "utf8");
}
560
+
561
+ // src/commands/eval/env.ts
562
+ import { constants as constants3 } from "node:fs";
563
+ import { access as access3 } from "node:fs/promises";
564
+ import path4 from "node:path";
565
+ import { config as loadDotenv } from "dotenv";
566
/**
 * Resolve each directory to an absolute path and drop duplicates, keeping the
 * first occurrence of each in its original position.
 *
 * @param {Iterable<string>} directories
 * @returns {string[]} Unique absolute paths.
 */
function uniqueDirs(directories) {
  // A Set iterates in insertion order, so first-seen order is preserved.
  const resolvedDirs = new Set();
  for (const dir of directories) {
    resolvedDirs.add(path4.resolve(dir));
  }
  return [...resolvedDirs];
}
579
/**
 * Check whether a path exists on disk.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` if the access check succeeds; `false` if it fails for any reason.
 */
async function fileExists2(filePath) {
  let exists = true;
  await access3(filePath, constants3.F_OK).catch(() => {
    exists = false;
  });
  return exists;
}
587
/**
 * List `start` and each of its ancestors, nearest first, stopping after
 * `boundary` has been included. If `boundary` is not an ancestor of `start`,
 * the walk continues up to the filesystem root.
 *
 * @param {string} start - Directory to begin from.
 * @param {string} boundary - Last directory to include.
 * @returns {string[]} Absolute directory paths.
 */
function collectAncestorDirectories(start, boundary) {
  const stopAt = path4.resolve(boundary);
  const chain = [];
  let dir = path4.resolve(start);
  let visited;
  do {
    chain.push(dir);
    visited = dir;
    dir = path4.dirname(visited);
    // dirname() returns its input at the root, which ends the walk.
  } while (visited !== stopAt && dir !== visited);
  return chain;
}
604
/**
 * Load `.env` files found between the test file's directory and the repo
 * root, plus the repo root and the current working directory.
 *
 * Files are loaded nearest-first with `override: false`, so a variable that is
 * already set (by the environment or an earlier file) is not replaced.
 *
 * @param {{ testFilePath: string, repoRoot: string, verbose?: boolean }} options
 * @returns {Promise<string | undefined>} Path of the first `.env` file loaded,
 *   or `undefined` when none was found.
 */
async function loadEnvFromHierarchy(options) {
  const { testFilePath, repoRoot, verbose } = options;
  const testDir = path4.dirname(path4.resolve(testFilePath));
  const workingDir = process.cwd();
  const searchDirs = uniqueDirs([
    ...collectAncestorDirectories(testDir, repoRoot),
    repoRoot,
    workingDir
  ]);

  const envFiles = [];
  for (const dir of searchDirs) {
    const envPath = path4.join(dir, ".env");
    const present = await fileExists2(envPath);
    if (present) {
      envFiles.push(envPath);
    }
  }

  if (envFiles.length === 0) {
    if (verbose) {
      console.log("No .env file found in hierarchy");
    }
    return void 0;
  }

  for (const envFile of envFiles) {
    loadDotenv({ path: envFile, override: false });
    if (verbose) {
      console.log(`Loaded environment from: ${envFile}`);
    }
  }
  const [nearest] = envFiles;
  return nearest;
}
631
+
632
+ // src/commands/eval/output-writer.ts
633
+ import path10 from "node:path";
634
+
635
+ // src/commands/eval/html-writer.ts
636
+ import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
637
+ import path5 from "node:path";
638
+
639
+ // ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
640
// Shared error instances for the bundled async-mutex code. In the visible code only E_CANCELED is referenced
// (as the default cancel error of Semaphore); the other two are presumably used elsewhere — verify before removing.
+ var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
641
+ var E_ALREADY_LOCKED = new Error("mutex already locked");
642
+ var E_CANCELED = new Error("request for lock canceled");
643
/**
 * Down-levelled async/await helper: drives a generator function to completion,
 * treating every yielded value as something to await.
 *
 * @param {*} thisArg - `this` value for the generator function.
 * @param {ArrayLike<*> | undefined} _arguments - Arguments for the generator function.
 * @param {PromiseConstructor | undefined} P - Promise implementation; defaults to the global Promise.
 * @param {GeneratorFunction} generator - Generator function holding the async body.
 * @returns {Promise<*>} Settles with the generator's return value, or rejects with its uncaught error.
 */
var __awaiter$2 = function(thisArg, _arguments, P, generator) {
  const PromiseImpl = P || Promise;
  // Wrap plain values so that every yielded value can be chained with .then().
  const toPromise = (value) => value instanceof PromiseImpl ? value : new PromiseImpl((resolve) => {
    resolve(value);
  });
  return new PromiseImpl((resolve, reject) => {
    const iterator = generator.apply(thisArg, _arguments || []);
    const advance = (outcome) => {
      if (outcome.done) {
        resolve(outcome.value);
      } else {
        toPromise(outcome.value).then(onFulfilled, onRejected);
      }
    };
    const onFulfilled = (value) => {
      try {
        advance(iterator.next(value));
      } catch (error) {
        reject(error);
      }
    };
    const onRejected = (reason) => {
      try {
        advance(iterator.throw(reason));
      } catch (error) {
        reject(error);
      }
    };
    advance(iterator.next());
  });
};
670
/**
 * Weighted, priority-aware counting semaphore (bundled from async-mutex).
 *
 * `_value` is the number of units currently available. `_queue` holds pending
 * acquire requests; new requests are inserted after the last queued entry
 * whose priority is greater than or equal to theirs. `_weightedWaiters[w - 1]`
 * holds the waitForUnlock() callers waiting for `w` units to become available.
 */
var Semaphore = class {
  /**
   * @param {number} _value - Initial number of available units.
   * @param {Error} [_cancelError=E_CANCELED] - Rejection reason used by cancel().
   */
  constructor(_value, _cancelError = E_CANCELED) {
    this._value = _value;
    this._cancelError = _cancelError;
    this._queue = [];
    this._weightedWaiters = [];
  }

  /**
   * Request `weight` units.
   * @returns {Promise<[number, () => void]>} The value before acquisition and a release function.
   * @throws {Error} When `weight` is not positive.
   */
  acquire(weight = 1, priority = 0) {
    if (weight <= 0) {
      throw new Error(`invalid weight ${weight}: must be positive`);
    }
    return new Promise((resolve, reject) => {
      const request = { resolve, reject, weight, priority };
      const lastAhead = findIndexFromEnd(this._queue, (queued) => priority <= queued.priority);
      const canRunNow = lastAhead === -1 && weight <= this._value;
      if (canRunNow) {
        this._dispatchItem(request);
        return;
      }
      this._queue.splice(lastAhead + 1, 0, request);
    });
  }

  /**
   * Acquire, run `callback` with the pre-acquisition value, and always release afterwards.
   * Accepts optional `weight` (default 1) and `priority` (default 0) as 2nd and 3rd arguments.
   */
  runExclusive(callback_1) {
    return __awaiter$2(this, arguments, void 0, function* (callback, weight = 1, priority = 0) {
      const acquired = yield this.acquire(weight, priority);
      const valueBefore = acquired[0];
      const releaseLock = acquired[1];
      try {
        return yield callback(valueBefore);
      } finally {
        releaseLock();
      }
    });
  }

  /**
   * Resolve once `weight` units could be acquired, without acquiring them.
   * @throws {Error} When `weight` is not positive.
   */
  waitForUnlock(weight = 1, priority = 0) {
    if (weight <= 0) {
      throw new Error(`invalid weight ${weight}: must be positive`);
    }
    if (this._couldLockImmediately(weight, priority)) {
      return Promise.resolve();
    }
    return new Promise((resolve) => {
      const slot = weight - 1;
      if (!this._weightedWaiters[slot]) {
        this._weightedWaiters[slot] = [];
      }
      insertSorted(this._weightedWaiters[slot], { resolve, priority });
    });
  }

  /** @returns {boolean} `true` when no units are available. */
  isLocked() {
    return this._value <= 0;
  }

  /** @returns {number} Units currently available. */
  getValue() {
    return this._value;
  }

  /** Overwrite the available units, then serve whatever now fits. */
  setValue(value) {
    this._value = value;
    this._dispatchQueue();
  }

  /**
   * Return `weight` units, then serve whatever now fits.
   * @throws {Error} When `weight` is not positive.
   */
  release(weight = 1) {
    if (weight <= 0) {
      throw new Error(`invalid weight ${weight}: must be positive`);
    }
    this._value += weight;
    this._dispatchQueue();
  }

  /** Reject every queued acquire request with the cancel error and empty the queue. */
  cancel() {
    for (const request of this._queue) {
      request.reject(this._cancelError);
    }
    this._queue = [];
  }

  /** Serve queued requests from the front while the head request fits. */
  _dispatchQueue() {
    this._drainUnlockWaiters();
    for (;;) {
      if (this._queue.length === 0) {
        break;
      }
      if (!(this._queue[0].weight <= this._value)) {
        break;
      }
      this._dispatchItem(this._queue.shift());
      this._drainUnlockWaiters();
    }
  }

  /** Deduct the request's weight and resolve it with [previous value, releaser]. */
  _dispatchItem(item) {
    const valueBefore = this._value;
    this._value = valueBefore - item.weight;
    item.resolve([valueBefore, this._newReleaser(item.weight)]);
  }

  /** Create a release function that only has an effect the first time it is called. */
  _newReleaser(weight) {
    let pending = true;
    return () => {
      if (!pending) {
        return;
      }
      pending = false;
      this.release(weight);
    };
  }

  /**
   * Resolve waitForUnlock() callers whose weight now fits. With an empty
   * queue every fitting waiter is resolved and its list reset; otherwise only
   * the waiters listed ahead of the first one whose priority is <= the queue
   * head's priority are resolved.
   */
  _drainUnlockWaiters() {
    const queueIsEmpty = this._queue.length === 0;
    const headPriority = queueIsEmpty ? void 0 : this._queue[0].priority;
    for (let weight = this._value; weight > 0; weight--) {
      const slot = weight - 1;
      const waiters = this._weightedWaiters[slot];
      if (!waiters) {
        continue;
      }
      if (queueIsEmpty) {
        waiters.forEach((waiter) => waiter.resolve());
        this._weightedWaiters[slot] = [];
        continue;
      }
      const cut = waiters.findIndex((waiter) => waiter.priority <= headPriority);
      const ready = cut === -1 ? waiters : waiters.splice(0, cut);
      ready.forEach((waiter) => waiter.resolve());
    }
  }

  /** Whether a request of this weight and priority would be served without queueing. */
  _couldLockImmediately(weight, priority) {
    const nothingAhead = this._queue.length === 0 || this._queue[0].priority < priority;
    return nothingAhead && weight <= this._value;
  }
};
778
/**
 * Insert `v` into `a`, immediately after the last element whose priority is
 * greater than or equal to `v.priority`, or at the front if there is none.
 *
 * @param {Array<{ priority: number }>} a - List to insert into (mutated).
 * @param {{ priority: number }} v - Entry to insert.
 * @returns {void}
 */
function insertSorted(a, v) {
  // Scan backwards past every entry that v outranks.
  let insertAt = a.length;
  while (insertAt > 0 && !(v.priority <= a[insertAt - 1].priority)) {
    insertAt -= 1;
  }
  a.splice(insertAt, 0, v);
}
782
/**
 * Find the index of the last element that satisfies `predicate`.
 *
 * @param {Array<*>} a - Array to search.
 * @param {(item: *) => boolean} predicate - Called with each element, from last to first.
 * @returns {number} Index of the last match, or -1 if nothing matches.
 */
function findIndexFromEnd(a, predicate) {
  let index = a.length;
  while (index > 0) {
    index -= 1;
    if (predicate(a[index])) {
      return index;
    }
  }
  return -1;
}
790
/**
 * Down-levelled async/await helper used by Mutex: runs a generator function,
 * awaiting each yielded value and feeding the outcome back into the generator.
 *
 * @param {*} thisArg - `this` value for the generator function.
 * @param {ArrayLike<*> | undefined} _arguments - Arguments for the generator function.
 * @param {PromiseConstructor | undefined} P - Promise implementation; defaults to the global Promise.
 * @param {GeneratorFunction} generator - Generator function holding the async body.
 * @returns {Promise<*>} Settles with the generator's return value, or rejects with its uncaught error.
 */
var __awaiter$1 = function(thisArg, _arguments, P, generator) {
  const Impl = P || Promise;
  const asPromise = (candidate) => {
    if (candidate instanceof Impl) {
      return candidate;
    }
    return new Impl((settle) => {
      settle(candidate);
    });
  };
  return new Impl((resolve, reject) => {
    const running = generator.apply(thisArg, _arguments || []);
    const handle = (state) => {
      if (state.done) {
        resolve(state.value);
        return;
      }
      asPromise(state.value).then(resume, fail);
    };
    const resume = (value) => {
      try {
        handle(running.next(value));
      } catch (err) {
        reject(err);
      }
    };
    const fail = (reason) => {
      try {
        handle(running.throw(reason));
      } catch (err) {
        reject(err);
      }
    };
    handle(running.next());
  });
};
817
/**
 * Mutual-exclusion lock built on a Semaphore with a single unit
 * (bundled from async-mutex).
 */
var Mutex = class {
  /** @param {Error} [cancelError] - Passed through to the underlying Semaphore. */
  constructor(cancelError) {
    this._semaphore = new Semaphore(1, cancelError);
  }

  /**
   * Acquire the lock. Accepts an optional `priority` argument (default 0).
   * @returns {Promise<() => void>} Function that releases the lock.
   */
  acquire() {
    return __awaiter$1(this, arguments, void 0, function* (priority = 0) {
      const acquired = yield this._semaphore.acquire(1, priority);
      const release = acquired[1];
      return release;
    });
  }

  /** Run `callback` while holding the lock; the lock is released afterwards. */
  runExclusive(callback, priority = 0) {
    const guarded = () => callback();
    return this._semaphore.runExclusive(guarded, 1, priority);
  }

  /** @returns {boolean} Whether the lock is currently held. */
  isLocked() {
    return this._semaphore.isLocked();
  }

  /** Resolve once the lock could be acquired, without acquiring it. */
  waitForUnlock(priority = 0) {
    return this._semaphore.waitForUnlock(1, priority);
  }

  /** Release the lock; does nothing when it is not held. */
  release() {
    if (!this._semaphore.isLocked()) {
      return;
    }
    this._semaphore.release();
  }

  /** Reject all pending acquire requests. */
  cancel() {
    return this._semaphore.cancel();
  }
};
844
+
845
+ // src/commands/eval/html-writer.ts
846
/**
 * Writes evaluation results to a single HTML report file.
 *
 * The whole report is regenerated and rewritten after every appended result.
 * While the writer is open the report is rendered in "live" mode; close()
 * rewrites it once more with live mode off. append() and close() are
 * serialised through a mutex.
 */
var HtmlWriter = class _HtmlWriter {
  filePath;
  results = [];
  mutex = new Mutex();
  closed = false;
  isLive = true;

  /** @param {string} filePath - Destination of the HTML report. */
  constructor(filePath) {
    this.filePath = filePath;
  }

  /**
   * Create the parent directory if needed, write an initial (empty) report and
   * return the writer.
   *
   * @param {string} filePath - Destination of the HTML report.
   * @returns {Promise<HtmlWriter>}
   */
  static async open(filePath) {
    const parentDir = path5.dirname(filePath);
    await mkdir2(parentDir, { recursive: true });
    const instance = new _HtmlWriter(filePath);
    await instance.writeHtml();
    return instance;
  }

  /**
   * Add a result and rewrite the report.
   *
   * @param {object} result - Evaluation result to record.
   * @throws {Error} When the writer has already been closed.
   */
  async append(result) {
    const addAndRewrite = async () => {
      if (this.closed) {
        throw new Error("Cannot write to closed HTML writer");
      }
      this.results.push(result);
      await this.writeHtml();
    };
    await this.mutex.runExclusive(addAndRewrite);
  }

  /** Mark the writer closed and write the final, non-live report. Later calls do nothing. */
  async close() {
    const finalize = async () => {
      if (!this.closed) {
        this.closed = true;
        this.isLive = false;
        await this.writeHtml();
      }
    };
    await this.mutex.runExclusive(finalize);
  }

  /** Render the current results and overwrite the report file. */
  async writeHtml() {
    await writeFile3(this.filePath, generateHtml(this.results, this.isLive), "utf8");
  }
};
885
/**
 * Render the complete, self-contained HTML report document.
 *
 * The results are embedded as a JSON literal inside an inline `<script>`
 * (assigned to `DATA`), followed by `IS_LIVE` and the `SCRIPT` text; `STYLES`
 * is embedded in an inline `<style>`.
 *
 * @param {object[]} results - Evaluation results; the `requests` and `trace`
 *   properties of each result are omitted from the embedded data.
 * @param {boolean} isLive - When true, a 2-second meta refresh and a LIVE badge
 *   are emitted; otherwise the current ISO timestamp is shown instead.
 * @returns {string} The HTML document.
 */
function generateHtml(results, isLive) {
  const lightResults = results.map(({ requests, trace, ...rest }) => rest);
  // Escape every "<" (not only "</"): sequences such as "<!--<script" inside
  // an inline script change how the HTML parser looks for the closing tag, so
  // result data could otherwise break out of, or swallow, the script element.
  // "\u003c" is decoded back to "<" by the JavaScript parser. U+2028/U+2029
  // are escaped as well because they are line terminators in older engines.
  const dataJson = JSON.stringify(lightResults)
    .replace(/</g, "\\u003c")
    .replace(/\u2028/g, "\\u2028")
    .replace(/\u2029/g, "\\u2029");
  const metaRefresh = isLive ? '  <meta http-equiv="refresh" content="2">\n' : "";
  const liveIndicator = isLive
    ? '<span class="live-badge">\u25CF LIVE</span>'
    : `<span class="timestamp">${escapeHtml(new Date().toISOString())}</span>`;
  return `<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
${metaRefresh}  <title>AgentV Evaluation Report</title>
  <style>
${STYLES}
  </style>
</head>
<body>
  <header class="header">
    <div class="header-left">
      <h1 class="header-title">AgentV</h1>
      <span class="header-subtitle">Evaluation Report</span>
    </div>
    <div class="header-right">${liveIndicator}</div>
  </header>
  <nav class="tabs" id="tabs">
    <button class="tab active" data-tab="overview">Overview</button>
    <button class="tab" data-tab="tests">Test Cases</button>
  </nav>
  <main id="app"></main>
  <script>
var DATA = ${dataJson};
var IS_LIVE = ${String(isLive)};
${SCRIPT}
  </script>
</body>
</html>`;
}
924
/**
 * Escape a value for inclusion in HTML text or a quoted attribute value.
 *
 * Replaces `&`, `<`, `>`, `"` and `'`. The single quote is included so the
 * result is also safe inside single-quoted attributes.
 *
 * @param {*} s - Value to escape; non-strings are converted with `String()`.
 * @returns {string} The escaped text, or "" for `null` / `undefined`.
 */
function escapeHtml(s) {
  if (s == null) {
    return "";
  }
  return String(s)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
927
+ var STYLES = `
928
+ *{margin:0;padding:0;box-sizing:border-box}
929
+ :root{
930
+ --bg:#f6f8fa;--surface:#fff;--border:#d0d7de;--border-light:#e8ebee;
931
+ --text:#1f2328;--text-muted:#656d76;
932
+ --primary:#0969da;--primary-bg:#ddf4ff;
933
+ --success:#1a7f37;--success-bg:#dafbe1;
934
+ --danger:#cf222e;--danger-bg:#ffebe9;
935
+ --warning:#9a6700;--warning-bg:#fff8c5;
936
+ --radius:6px;
937
+ --shadow:0 1px 3px rgba(31,35,40,.04),0 1px 2px rgba(31,35,40,.06);
938
+ --font:-apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif;
939
+ --mono:ui-monospace,SFMono-Regular,"SF Mono",Menlo,Consolas,monospace;
940
+ }
941
+ body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5;font-size:14px}
942
+
943
+ /* Header */
944
+ .header{background:var(--surface);border-bottom:1px solid var(--border);padding:12px 24px;display:flex;align-items:center;justify-content:space-between}
945
+ .header-left{display:flex;align-items:baseline;gap:12px}
946
+ .header-title{font-size:18px;font-weight:600}
947
+ .header-subtitle{font-size:14px;color:var(--text-muted)}
948
+ .live-badge{color:var(--success);font-size:12px;font-weight:600;animation:pulse 2s infinite}
949
+ @keyframes pulse{0%,100%{opacity:1}50%{opacity:.4}}
950
+ .timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)}
951
+
952
+ /* Tabs */
953
+ .tabs{background:var(--surface);border-bottom:1px solid var(--border);padding:0 24px;display:flex}
954
+ .tab{background:none;border:none;padding:10px 16px;font-size:14px;color:var(--text-muted);cursor:pointer;border-bottom:2px solid transparent;font-family:var(--font);transition:color .15s,border-color .15s}
955
+ .tab:hover{color:var(--text)}
956
+ .tab.active{color:var(--text);font-weight:600;border-bottom-color:var(--primary)}
957
+
958
+ #app{max-width:1280px;margin:0 auto;padding:24px}
959
+
960
+ /* Stat cards */
961
+ .stats-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:12px;margin-bottom:24px}
962
+ .stat-card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;text-align:center;box-shadow:var(--shadow)}
963
+ .stat-card.pass .stat-value{color:var(--success)}
964
+ .stat-card.fail .stat-value{color:var(--danger)}
965
+ .stat-card.error .stat-value{color:var(--danger)}
966
+ .stat-card.warn .stat-value{color:var(--warning)}
967
+ .stat-card.total .stat-value{color:var(--primary)}
968
+ .stat-value{font-size:28px;font-weight:700;line-height:1.2}
969
+ .stat-label{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.5px;margin-top:4px}
970
+
971
+ /* Sections */
972
+ .section{margin-bottom:24px}
973
+ .section-title{font-size:16px;font-weight:600;margin-bottom:12px}
974
+
975
+ /* Tables */
976
+ .table-wrap{overflow-x:auto;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);box-shadow:var(--shadow)}
977
+ .data-table{width:100%;border-collapse:collapse;font-size:13px}
978
+ .data-table th{background:var(--bg);border-bottom:1px solid var(--border);padding:8px 12px;text-align:left;font-weight:600;font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;white-space:nowrap}
979
+ .data-table th.sortable{cursor:pointer;user-select:none}
980
+ .data-table th.sortable:hover{color:var(--text)}
981
+ .data-table td{padding:8px 12px;border-bottom:1px solid var(--border-light);vertical-align:middle}
982
+ .data-table tbody tr:last-child td{border-bottom:none}
983
+
984
+ /* Status icons */
985
+ .status-icon{display:inline-flex;align-items:center;justify-content:center;width:22px;height:22px;border-radius:50%;font-size:12px;font-weight:700}
986
+ .status-icon.pass{background:var(--success-bg);color:var(--success)}
987
+ .status-icon.fail{background:var(--danger-bg);color:var(--danger)}
988
+ .status-icon.error{background:var(--warning-bg);color:var(--warning)}
989
+
990
+ /* Score colors */
991
+ .score-high{color:var(--success);font-weight:600}
992
+ .score-mid{color:var(--warning);font-weight:600}
993
+ .score-low{color:var(--danger);font-weight:600}
994
+
995
+ /* Pass-rate bar */
996
+ .bar-bg{width:100px;height:8px;background:var(--border-light);border-radius:4px;overflow:hidden}
997
+ .bar-fill{height:100%;border-radius:4px;transition:width .3s}
998
+ .bar-fill.score-high{background:var(--success)}
999
+ .bar-fill.score-mid{background:var(--warning)}
1000
+ .bar-fill.score-low{background:var(--danger)}
1001
+
1002
+ /* Histogram */
1003
+ .histogram{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;box-shadow:var(--shadow)}
1004
+ .hist-row{display:flex;align-items:center;gap:12px;margin-bottom:8px}
1005
+ .hist-row:last-child{margin-bottom:0}
1006
+ .hist-label{width:60px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
1007
+ .hist-bar-bg{flex:1;height:20px;background:var(--border-light);border-radius:3px;overflow:hidden}
1008
+ .hist-bar{height:100%;border-radius:3px;transition:width .3s}
1009
+ .hist-count{width:30px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
1010
+
1011
+ /* Filters */
1012
+ .filter-bar{display:flex;gap:8px;margin-bottom:16px;align-items:center;flex-wrap:wrap}
1013
+ .filter-select,.filter-search{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font)}
1014
+ .filter-search{flex:1;min-width:200px}
1015
+ .filter-count{font-size:12px;color:var(--text-muted);margin-left:auto}
1016
+
1017
+ /* Test rows */
1018
+ .test-row{cursor:pointer;transition:background .1s}
1019
+ .test-row:hover{background:var(--bg)!important}
1020
+ .test-row.expanded{background:var(--primary-bg)!important}
1021
+ .expand-col{width:32px;text-align:center}
1022
+ .expand-icon{color:var(--text-muted);font-size:12px}
1023
+ .fw-medium{font-weight:500}
1024
+ .text-pass{color:var(--success)}.text-fail{color:var(--danger)}.text-error{color:var(--warning)}
1025
+
1026
+ /* Detail panel */
1027
+ .detail-row td{padding:0!important;background:var(--bg)!important}
1028
+ .detail-panel{padding:16px 24px}
1029
+ .detail-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px}
1030
+ .detail-block h4{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;margin-bottom:6px}
1031
+ .detail-pre{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:12px;font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow-y:auto;line-height:1.6}
1032
+ .detail-panel h4{font-size:13px;font-weight:600;margin:16px 0 8px}
1033
+ .eval-table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);margin-bottom:12px}
1034
+ .eval-table th{background:var(--bg);padding:6px 10px;text-align:left;font-size:11px;font-weight:600;color:var(--text-muted);text-transform:uppercase;border-bottom:1px solid var(--border)}
1035
+ .eval-table td{padding:8px 10px;border-bottom:1px solid var(--border-light)}
1036
+ .reasoning-cell{max-width:500px;font-size:12px;color:var(--text-muted)}
1037
+ .expect-list{list-style:none;padding:0;margin-bottom:12px}
1038
+ .expect-list li{padding:4px 8px 4px 24px;position:relative;font-size:13px}
1039
+ .expect-list.pass li::before{content:"\\2713";position:absolute;left:4px;color:var(--success);font-weight:700}
1040
+ .expect-list.fail li::before{content:"\\2717";position:absolute;left:4px;color:var(--danger);font-weight:700}
1041
+ .error-box{background:var(--danger-bg);border:1px solid var(--danger);border-radius:var(--radius);padding:12px;margin-bottom:12px}
1042
+ .error-box h4{color:var(--danger);margin:0 0 6px}
1043
+ .error-box pre{font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word}
1044
+ .detail-meta{font-size:12px;color:var(--text-muted);margin-top:12px;padding-top:12px;border-top:1px solid var(--border-light)}
1045
+ .empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)}
1046
+ .empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)}
1047
+ `;
1048
+ var SCRIPT = `
1049
+ (function(){
1050
+ /* ---- helpers ---- */
1051
+ function esc(s){
1052
+ if(s==null)return"";
1053
+ return String(s).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;");
1054
+ }
1055
+ function getStatus(r){
1056
+ if(r.executionStatus==="execution_error")return"error";
1057
+ if(r.executionStatus==="quality_failure")return"fail";
1058
+ if(r.executionStatus==="ok")return"pass";
1059
+ if(r.error)return"error";
1060
+ return r.score>=0.5?"pass":"fail";
1061
+ }
1062
+ function sIcon(s){
1063
+ if(s==="pass")return'<span class="status-icon pass">\\u2713</span>';
1064
+ if(s==="fail")return'<span class="status-icon fail">\\u2717</span>';
1065
+ return'<span class="status-icon error">!</span>';
1066
+ }
1067
+ function fmtDur(ms){
1068
+ if(ms==null)return"\\u2014";
1069
+ if(ms<1000)return ms+"ms";
1070
+ if(ms<60000)return(ms/1000).toFixed(1)+"s";
1071
+ return Math.floor(ms/60000)+"m "+Math.round((ms%60000)/1000)+"s";
1072
+ }
1073
+ function fmtTok(n){
1074
+ if(n==null)return"\\u2014";
1075
+ if(n>=1e6)return(n/1e6).toFixed(1)+"M";
1076
+ if(n>=1e3)return(n/1e3).toFixed(1)+"K";
1077
+ return String(n);
1078
+ }
1079
+ function fmtCost(u){if(u==null)return"\\u2014";if(u<0.01)return"<$0.01";return"$"+u.toFixed(2);}
1080
+ function fmtPct(v){if(v==null)return"\\u2014";return(v*100).toFixed(1)+"%";}
1081
+ function sCls(v){if(v==null)return"";if(v>=0.9)return"score-high";if(v>=0.5)return"score-mid";return"score-low";}
1082
+
1083
+ /* ---- compute stats ---- */
1084
+ function computeStats(d){
1085
+ var t=d.length,p=0,f=0,e=0,dur=0,ti=0,to=0,cost=0,sc=[];
1086
+ for(var i=0;i<d.length;i++){
1087
+ var r=d[i],s=getStatus(r);
1088
+ if(s==="pass")p++;else if(s==="fail")f++;else e++;
1089
+ if(r.durationMs)dur+=r.durationMs;
1090
+ if(r.tokenUsage){ti+=(r.tokenUsage.input||0);to+=(r.tokenUsage.output||0);}
1091
+ if(r.costUsd)cost+=r.costUsd;
1092
+ if(s!=="error")sc.push(r.score);
1093
+ }
1094
+ var g=t-e;
1095
+ return{total:t,passed:p,failed:f,errors:e,passRate:g>0?p/g:0,dur:dur,tokens:ti+to,inTok:ti,outTok:to,cost:cost,scores:sc};
1096
+ }
1097
+ function computeTargets(d){
1098
+ var m={};
1099
+ for(var i=0;i<d.length;i++){
1100
+ var r=d[i],tgt=r.target||"unknown";
1101
+ if(!m[tgt])m[tgt]={target:tgt,results:[],p:0,f:0,e:0,ts:0,sc:0,dur:0,tok:0,cost:0};
1102
+ var o=m[tgt];o.results.push(r);
1103
+ var s=getStatus(r);
1104
+ if(s==="pass")o.p++;else if(s==="fail")o.f++;else o.e++;
1105
+ if(s!=="error"){o.ts+=r.score;o.sc++;}
1106
+ if(r.durationMs)o.dur+=r.durationMs;
1107
+ if(r.tokenUsage)o.tok+=(r.tokenUsage.input||0)+(r.tokenUsage.output||0);
1108
+ if(r.costUsd)o.cost+=r.costUsd;
1109
+ }
1110
+ var a=[];for(var k in m)a.push(m[k]);return a;
1111
+ }
1112
+ function getEvalNames(){
1113
+ var n={};
1114
+ for(var i=0;i<DATA.length;i++){
1115
+ var sc=DATA[i].scores;
1116
+ if(sc)for(var j=0;j<sc.length;j++)n[sc[j].name]=true;
1117
+ }
1118
+ return Object.keys(n);
1119
+ }
1120
+ function getEvalScore(r,name){
1121
+ if(!r.scores)return null;
1122
+ for(var i=0;i<r.scores.length;i++)if(r.scores[i].name===name)return r.scores[i].score;
1123
+ return null;
1124
+ }
1125
+
1126
+ var stats=computeStats(DATA);
1127
+ var tgtStats=computeTargets(DATA);
1128
+ var tgtNames=tgtStats.map(function(t){return t.target;});
1129
+
1130
+ /* ---- state ---- */
1131
+ var state={tab:"overview",filter:{status:"all",target:"all",search:""},sort:{col:"testId",dir:"asc"},expanded:{}};
1132
+
1133
+ /* ---- DOM refs ---- */
1134
+ var app=document.getElementById("app");
1135
+ var tabBtns=document.querySelectorAll(".tab");
1136
+
1137
+ /* ---- tabs ---- */
1138
+ function setTab(t){
1139
+ state.tab=t;
1140
+ for(var i=0;i<tabBtns.length;i++)tabBtns[i].classList.toggle("active",tabBtns[i].getAttribute("data-tab")===t);
1141
+ render();
1142
+ }
1143
+ for(var i=0;i<tabBtns.length;i++){
1144
+ tabBtns[i].addEventListener("click",(function(b){return function(){setTab(b.getAttribute("data-tab"));};})(tabBtns[i]));
1145
+ }
1146
+
1147
+ /* ---- render ---- */
1148
+ function render(){
1149
+ if(DATA.length===0){app.innerHTML='<div class="empty-state"><h3>No results yet</h3><p>'+(IS_LIVE?"Waiting for evaluation results\\u2026 Page will auto-refresh.":"Run an evaluation to generate results.")+"</p></div>";return;}
1150
+ if(state.tab==="overview")renderOverview();else renderTests();
1151
+ }
1152
+
1153
+ /* ---- stat card helper ---- */
1154
+ function card(label,value,type){
1155
+ return'<div class="stat-card '+type+'"><div class="stat-value">'+value+'</div><div class="stat-label">'+label+"</div></div>";
1156
+ }
1157
+
1158
+ /* ---- overview ---- */
1159
+ function renderOverview(){
1160
+ var h='<div class="stats-grid">';
1161
+ h+=card("Total Tests",stats.total,"total");
1162
+ h+=card("Passed",stats.passed,"pass");
1163
+ h+=card("Failed",stats.failed,"fail");
1164
+ h+=card("Errors",stats.errors,"error");
1165
+ var prCls=stats.passRate>=0.9?"pass":stats.passRate>=0.5?"warn":"fail";
1166
+ h+=card("Pass Rate",fmtPct(stats.passRate),prCls);
1167
+ h+=card("Duration",fmtDur(stats.dur),"neutral");
1168
+ h+=card("Tokens",fmtTok(stats.tokens),"neutral");
1169
+ h+=card("Est. Cost",fmtCost(stats.cost),"neutral");
1170
+ h+="</div>";
1171
+
1172
+ /* targets table */
1173
+ if(tgtStats.length>1){
1174
+ h+='<div class="section"><h2 class="section-title">Targets</h2><div class="table-wrap"><table class="data-table">';
1175
+ h+="<thead><tr><th>Target</th><th>Pass Rate</th><th></th><th>Passed</th><th>Failed</th><th>Errors</th><th>Avg Score</th><th>Duration</th><th>Tokens</th><th>Cost</th></tr></thead><tbody>";
1176
+ for(var i=0;i<tgtStats.length;i++){
1177
+ var t=tgtStats[i],g=t.p+t.f,pr=g>0?t.p/g:0,avg=t.sc>0?t.ts/t.sc:0;
1178
+ h+="<tr><td class=\\"fw-medium\\">"+esc(t.target)+"</td><td>"+fmtPct(pr)+'</td><td><div class="bar-bg"><div class="bar-fill '+sCls(pr)+'" style="width:'+(pr*100)+'%"></div></div></td>';
1179
+ h+='<td class="text-pass">'+t.p+'</td><td class="text-fail">'+t.f+'</td><td class="text-error">'+t.e+"</td>";
1180
+ h+='<td class="'+sCls(avg)+'">'+fmtPct(avg)+"</td><td>"+fmtDur(t.dur)+"</td><td>"+fmtTok(t.tok)+"</td><td>"+fmtCost(t.cost)+"</td></tr>";
1181
+ }
1182
+ h+="</tbody></table></div></div>";
1183
+ }
1184
+
1185
+ /* histogram */
1186
+ if(stats.scores.length>0){
1187
+ var bk=[0,0,0,0,0];
1188
+ for(var i=0;i<stats.scores.length;i++){var idx=Math.min(Math.floor(stats.scores[i]*5),4);bk[idx]++;}
1189
+ var mx=Math.max.apply(null,bk);
1190
+ var lb=["0\\u201320%","20\\u201340%","40\\u201360%","60\\u201380%","80\\u2013100%"];
1191
+ h+='<div class="section"><h2 class="section-title">Score Distribution</h2><div class="histogram">';
1192
+ for(var i=0;i<bk.length;i++){
1193
+ var pct=mx>0?(bk[i]/mx*100):0;
1194
+ h+='<div class="hist-row"><span class="hist-label">'+lb[i]+'</span><div class="hist-bar-bg"><div class="hist-bar '+(i>=4?"score-high":i>=2?"score-mid":"score-low")+'" style="width:'+pct+'%"></div></div><span class="hist-count">'+bk[i]+"</span></div>";
1195
+ }
1196
+ h+="</div></div>";
1197
+ }
1198
+ app.innerHTML=h;
1199
+ }
1200
+
1201
+ /* ---- test cases ---- */
1202
+ function renderTests(){
1203
+ var evalNames=getEvalNames();
1204
+ var h='<div class="filter-bar">';
1205
+ h+='<select id="flt-status" class="filter-select"><option value="all">All Status</option><option value="pass">Passed</option><option value="fail">Failed</option><option value="error">Errors</option></select>';
1206
+ if(tgtNames.length>1){
1207
+ h+='<select id="flt-target" class="filter-select"><option value="all">All Targets</option>';
1208
+ for(var i=0;i<tgtNames.length;i++)h+='<option value="'+esc(tgtNames[i])+'">'+esc(tgtNames[i])+"</option>";
1209
+ h+="</select>";
1210
+ }
1211
+ h+='<input type="text" id="flt-search" class="filter-search" placeholder="Search tests..." value="'+esc(state.filter.search)+'">';
1212
+ h+='<span class="filter-count" id="flt-count"></span></div>';
1213
+
1214
+ h+='<div class="table-wrap"><table class="data-table" id="test-tbl"><thead><tr>';
1215
+ h+='<th class="expand-col"></th>';
1216
+ h+=sHdr("Status","status");
1217
+ h+=sHdr("Test ID","testId");
1218
+ if(tgtNames.length>1)h+=sHdr("Target","target");
1219
+ h+=sHdr("Score","score");
1220
+ for(var i=0;i<evalNames.length;i++)h+="<th>"+esc(evalNames[i])+"</th>";
1221
+ h+=sHdr("Duration","durationMs");
1222
+ h+=sHdr("Cost","costUsd");
1223
+ h+="</tr></thead><tbody id=\\"test-body\\"></tbody></table></div>";
1224
+ app.innerHTML=h;
1225
+
1226
+ /* wire events */
1227
+ var selS=document.getElementById("flt-status");
1228
+ selS.value=state.filter.status;
1229
+ selS.addEventListener("change",function(e){state.filter.status=e.target.value;renderRows();});
1230
+ var selT=document.getElementById("flt-target");
1231
+ if(selT){selT.value=state.filter.target;selT.addEventListener("change",function(e){state.filter.target=e.target.value;renderRows();});}
1232
+ document.getElementById("flt-search").addEventListener("input",function(e){state.filter.search=e.target.value;renderRows();});
1233
+ var ths=document.querySelectorAll("th[data-sort]");
1234
+ for(var i=0;i<ths.length;i++){
1235
+ ths[i].addEventListener("click",(function(th){return function(){
1236
+ var c=th.getAttribute("data-sort");
1237
+ if(state.sort.col===c)state.sort.dir=state.sort.dir==="asc"?"desc":"asc";
1238
+ else{state.sort.col=c;state.sort.dir="asc";}
1239
+ renderTests();
1240
+ };})(ths[i]));
1241
+ }
1242
+ renderRows();
1243
+ }
1244
+
1245
+ function sHdr(label,col){
1246
+ var arrow="";
1247
+ if(state.sort.col===col)arrow=state.sort.dir==="asc"?" \\u2191":" \\u2193";
1248
+ return'<th class="sortable" data-sort="'+col+'">'+label+arrow+"</th>";
1249
+ }
1250
+
1251
+ function filtered(){
1252
+ var out=[];
1253
+ for(var i=0;i<DATA.length;i++){
1254
+ var r=DATA[i],s=getStatus(r);
1255
+ if(state.filter.status!=="all"&&s!==state.filter.status)continue;
1256
+ if(state.filter.target!=="all"&&r.target!==state.filter.target)continue;
1257
+ if(state.filter.search&&r.testId.toLowerCase().indexOf(state.filter.search.toLowerCase())===-1)continue;
1258
+ out.push(r);
1259
+ }
1260
+ var col=state.sort.col,dir=state.sort.dir==="asc"?1:-1;
1261
+ out.sort(function(a,b){
1262
+ var va=col==="status"?getStatus(a):a[col],vb=col==="status"?getStatus(b):b[col];
1263
+ if(va==null&&vb==null)return 0;if(va==null)return 1;if(vb==null)return-1;
1264
+ if(typeof va==="string")return va.localeCompare(vb)*dir;
1265
+ return(va-vb)*dir;
1266
+ });
1267
+ return out;
1268
+ }
1269
+
1270
+ function renderRows(){
1271
+ var rows=filtered(),evalNames=getEvalNames();
1272
+ var tbody=document.getElementById("test-body");
1273
+ var colSpan=5+evalNames.length+(tgtNames.length>1?1:0);
1274
+ document.getElementById("flt-count").textContent=rows.length+" of "+DATA.length+" tests";
1275
+ var h="";
1276
+ for(var i=0;i<rows.length;i++){
1277
+ var r=rows[i],s=getStatus(r),key=r.testId+":"+r.target,exp=!!state.expanded[key];
1278
+ h+='<tr class="test-row '+s+(exp?" expanded":"")+'" data-key="'+esc(key)+'">';
1279
+ h+='<td class="expand-col"><span class="expand-icon">'+(exp?"\\u25BE":"\\u25B8")+"</span></td>";
1280
+ h+="<td>"+sIcon(s)+"</td>";
1281
+ h+='<td class="fw-medium">'+esc(r.testId)+"</td>";
1282
+ if(tgtNames.length>1)h+="<td>"+esc(r.target)+"</td>";
1283
+ h+='<td class="'+sCls(r.score)+'">'+fmtPct(r.score)+"</td>";
1284
+ for(var j=0;j<evalNames.length;j++){
1285
+ var es=getEvalScore(r,evalNames[j]);
1286
+ h+='<td class="'+sCls(es)+'">'+(es!=null?fmtPct(es):"\\u2014")+"</td>";
1287
+ }
1288
+ h+="<td>"+fmtDur(r.durationMs)+"</td><td>"+fmtCost(r.costUsd)+"</td></tr>";
1289
+ if(exp)h+='<tr class="detail-row"><td colspan="'+colSpan+'">'+renderDetail(r)+"</td></tr>";
1290
+ }
1291
+ if(rows.length===0)h+='<tr><td colspan="'+colSpan+'" class="empty-state">No matching tests</td></tr>';
1292
+ tbody.innerHTML=h;
1293
+
1294
+ /* row click */
1295
+ var trs=tbody.querySelectorAll(".test-row");
1296
+ for(var k=0;k<trs.length;k++){
1297
+ trs[k].addEventListener("click",(function(tr){return function(){
1298
+ var key=tr.getAttribute("data-key");
1299
+ state.expanded[key]=!state.expanded[key];
1300
+ renderRows();
1301
+ };})(trs[k]));
1302
+ }
1303
+ }
1304
+
1305
+ /* ---- detail panel ---- */
1306
+ function renderDetail(r){
1307
+ var h='<div class="detail-panel">';
1308
+
1309
+ /* input / output */
1310
+ h+='<div class="detail-grid">';
1311
+ if(r.input!=null){
1312
+ h+='<div class="detail-block"><h4>Input</h4><pre class="detail-pre">'+esc(typeof r.input==="string"?r.input:JSON.stringify(r.input,null,2))+"</pre></div>";
1313
+ }
1314
+ h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.answer||"")+"</pre></div>";
1315
+ h+="</div>";
1316
+
1317
+ /* evaluator results */
1318
+ if(r.scores&&r.scores.length>0){
1319
+ h+="<h4>Evaluator Results</h4>";
1320
+ h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Reasoning</th></tr></thead><tbody>';
1321
+ for(var i=0;i<r.scores.length;i++){
1322
+ var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
1323
+ h+="<tr><td class=\\"fw-medium\\">"+esc(ev.name)+'</td><td class="'+sCls(ev.score)+'">'+fmtPct(ev.score)+"</td><td>"+sIcon(evS)+'</td><td class="reasoning-cell">'+esc(ev.reasoning||"")+"</td></tr>";
1324
+ }
1325
+ h+="</tbody></table>";
1326
+ }
1327
+
1328
+ /* hits / misses */
1329
+ if(r.hits&&r.hits.length>0){
1330
+ h+='<h4>Passed Expectations</h4><ul class="expect-list pass">';
1331
+ for(var i=0;i<r.hits.length;i++)h+="<li>"+esc(r.hits[i])+"</li>";
1332
+ h+="</ul>";
1333
+ }
1334
+ if(r.misses&&r.misses.length>0){
1335
+ h+='<h4>Failed Expectations</h4><ul class="expect-list fail">';
1336
+ for(var i=0;i<r.misses.length;i++)h+="<li>"+esc(r.misses[i])+"</li>";
1337
+ h+="</ul>";
1338
+ }
1339
+
1340
+ /* error */
1341
+ if(r.error)h+='<div class="error-box"><h4>Error</h4><pre>'+esc(r.error)+"</pre></div>";
1342
+
1343
+ /* metadata */
1344
+ h+='<div class="detail-meta">';
1345
+ var m=[];
1346
+ if(r.tokenUsage)m.push(fmtTok(r.tokenUsage.input)+" in / "+fmtTok(r.tokenUsage.output)+" out tokens");
1347
+ if(r.durationMs)m.push(fmtDur(r.durationMs));
1348
+ if(r.target)m.push(r.target);
1349
+ if(r.costUsd)m.push(fmtCost(r.costUsd));
1350
+ if(r.timestamp)m.push(r.timestamp);
1351
+ h+=esc(m.join(" \\u00B7 "));
1352
+ h+="</div></div>";
1353
+ return h;
1354
+ }
1355
+
1356
+ /* ---- init ---- */
1357
+ render();
1358
+ })();
1359
+ `;
1360
+
1361
+ // src/commands/eval/json-writer.ts
1362
+ import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
1363
+ import path6 from "node:path";
1364
+
1365
+ // src/utils/case-conversion.ts
1366
/**
 * Convert a camelCase identifier to snake_case.
 *
 * A string whose first character is an ASCII uppercase letter is returned
 * unchanged; otherwise every ASCII uppercase letter is replaced by an
 * underscore followed by its lowercase form.
 *
 * @param {string} str
 * @returns {string}
 */
function toSnakeCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  return startsUppercase ? str : str.replace(/[A-Z]/g, (ch) => "_" + ch.toLowerCase());
}
1372
/**
 * Recursively copy a value, renaming every object key with `toSnakeCase`.
 *
 * - `null` / `undefined` and primitives are returned as-is.
 * - Arrays are mapped element by element.
 * - `Date` instances are returned as-is: they have no own enumerable
 *   properties, so walking them like a plain object would yield `{}` and
 *   lose the value.
 * - Any other object is rebuilt from its own enumerable string-keyed entries.
 *
 * @param {*} obj
 * @returns {*} A converted copy (containers are new; leaves are shared).
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  if (obj instanceof Date) {
    return obj;
  }
  if (typeof obj === "object") {
    // Object.fromEntries defines own properties, so a key such as "__proto__"
    // is kept as data instead of going through the prototype setter.
    return Object.fromEntries(
      Object.entries(obj).map(([key, value]) => [toSnakeCase(key), toSnakeCaseDeep(value)])
    );
  }
  return obj;
}
1389
+
1390
+ // src/commands/eval/json-writer.ts
1391
/**
 * Output writer that buffers results in memory and, on close, writes a single
 * JSON document containing summary stats and all results with snake_case keys.
 */
var JsonWriter = class _JsonWriter {
  filePath;
  results = [];
  closed = false;

  /** @param {string} filePath - Destination of the JSON file. */
  constructor(filePath) {
    this.filePath = filePath;
  }

  /**
   * Ensure the parent directory exists and return a new writer.
   * Nothing is written to `filePath` until `close()`.
   */
  static async open(filePath) {
    const parentDir = path6.dirname(filePath);
    await mkdir3(parentDir, { recursive: true });
    return new _JsonWriter(filePath);
  }

  /**
   * Buffer one result.
   * @throws {Error} If the writer has already been closed.
   */
  async append(result) {
    if (this.closed) {
      throw new Error("Cannot write to closed JSON writer");
    }
    this.results.push(result);
  }

  /**
   * Write the report file. A result counts as passed when its `score` is at
   * least 0.5; everything else counts as failed. Idempotent.
   */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    const total = this.results.length;
    let passed = 0;
    for (const entry of this.results) {
      if (entry.score >= 0.5) {
        passed += 1;
      }
    }
    const stats = {
      total,
      passed,
      failed: total - passed,
      passRate: total > 0 ? passed / total : 0
    };
    const payload = toSnakeCaseDeep({ stats, results: this.results });
    const text = JSON.stringify(payload, null, 2) + "\n";
    await writeFile4(this.filePath, text, "utf8");
  }
};
1430
+
1431
+ // src/commands/eval/jsonl-writer.ts
1432
+ import { createWriteStream } from "node:fs";
1433
+ import { mkdir as mkdir4 } from "node:fs/promises";
1434
+ import path7 from "node:path";
1435
+ import { finished } from "node:stream/promises";
1436
/**
 * Output writer that streams one JSON document per line (JSON Lines).
 * Appends are serialized through a mutex and honour stream backpressure.
 */
var JsonlWriter = class _JsonlWriter {
  stream;
  mutex = new Mutex();
  closed = false;

  /** @param stream - Writable stream that receives the lines. */
  constructor(stream) {
    this.stream = stream;
  }

  /**
   * Ensure the parent directory exists and open `filePath` for writing
   * (flag "w", UTF-8).
   */
  static async open(filePath) {
    await mkdir4(path7.dirname(filePath), { recursive: true });
    const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
    return new _JsonlWriter(stream);
  }

  /**
   * Write one record as a single line, with keys converted by toSnakeCaseDeep.
   * When `write()` reports backpressure, waits for the next "drain" event and
   * rejects if the stream emits "error" first.
   * @throws {Error} If the writer has already been closed.
   */
  async append(record) {
    await this.mutex.runExclusive(async () => {
      if (this.closed) {
        throw new Error("Cannot write to closed JSONL writer");
      }
      const snakeCaseRecord = toSnakeCaseDeep(record);
      const line = `${JSON.stringify(snakeCaseRecord)}\n`;
      if (!this.stream.write(line)) {
        await new Promise((resolve, reject) => {
          // Each handler removes its counterpart; otherwise the listener that
          // did not fire would stay attached and pile up on every wait.
          const onDrain = () => {
            this.stream.off("error", onError);
            resolve();
          };
          const onError = (err) => {
            this.stream.off("drain", onDrain);
            reject(err);
          };
          this.stream.once("drain", onDrain);
          this.stream.once("error", onError);
        });
      }
    });
  }

  /** End the stream and wait until it has finished. Idempotent. */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    this.stream.end();
    await finished(this.stream);
  }
};
1473
+
1474
+ // src/commands/eval/junit-writer.ts
1475
+ import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
1476
+ import path8 from "node:path";
1477
/**
 * Escape the five XML special characters (`&`, `<`, `>`, `"`, `'`) so the
 * text can be used in element content or attribute values.
 *
 * @param {string} str
 * @returns {string}
 */
function escapeXml(str) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;"
  };
  return str.replace(/[&<>"']/g, (ch) => entities[ch]);
}
1480
/**
 * Output writer that buffers results in memory and, on close, writes a JUnit
 * XML report. Results are grouped into one `<testsuite>` per `dataset`
 * ("default" when a result has none).
 *
 * Classification, applied identically to the emitted child elements and to
 * the `failures` / `errors` counters:
 *   - error:   the result has a truthy `error`  -> `<error>`
 *   - failure: not an error and `score < 0.5`   -> `<failure>`
 * so an errored test is never also counted as a failure.
 */
var JunitWriter = class _JunitWriter {
  filePath;
  results = [];
  closed = false;

  /** @param {string} filePath - Destination of the XML file. */
  constructor(filePath) {
    this.filePath = filePath;
  }

  /**
   * Ensure the parent directory exists and return a new writer.
   * Nothing is written to `filePath` until `close()`.
   */
  static async open(filePath) {
    await mkdir5(path8.dirname(filePath), { recursive: true });
    return new _JunitWriter(filePath);
  }

  /**
   * Buffer one result.
   * @throws {Error} If the writer has already been closed.
   */
  async append(result) {
    if (this.closed) {
      throw new Error("Cannot write to closed JUnit writer");
    }
    this.results.push(result);
  }

  /** Build the XML document from the buffered results and write it. Idempotent. */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;

    const isError = (r) => Boolean(r.error);
    const isFailure = (r) => !isError(r) && r.score < 0.5;
    const countWhere = (items, predicate) => items.filter(predicate).length;

    // Group by suite name; Map keeps first-seen order.
    const grouped = new Map();
    for (const result of this.results) {
      const suite = result.dataset ?? "default";
      const existing = grouped.get(suite);
      if (existing) {
        existing.push(result);
      } else {
        grouped.set(suite, [result]);
      }
    }

    const suiteXmls = [];
    for (const [suiteName, results] of grouped) {
      const testCases = results.map((r) => {
        const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
        let inner = "";
        if (isError(r)) {
          const text = escapeXml(String(r.error));
          inner = `\n      <error message="${text}">${text}</error>\n    `;
        } else if (isFailure(r)) {
          const message = `score=${r.score.toFixed(3)}`;
          // `misses` may be absent on a result; treat that as no misses.
          const misses = r.misses ?? [];
          const detail = [
            `Score: ${r.score.toFixed(3)}`,
            r.reasoning ? `Reasoning: ${r.reasoning}` : "",
            misses.length > 0 ? `Misses: ${misses.join(", ")}` : ""
          ].filter(Boolean).join("\n");
          inner = `\n      <failure message="${escapeXml(message)}">${escapeXml(detail)}</failure>\n    `;
        }
        return `    <testcase name="${escapeXml(r.testId)}" classname="${escapeXml(suiteName)}" time="${time}">${inner}</testcase>`;
      });
      const failures = countWhere(results, isFailure);
      const errors = countWhere(results, isError);
      suiteXmls.push(
        `  <testsuite name="${escapeXml(suiteName)}" tests="${results.length}" failures="${failures}" errors="${errors}">\n${testCases.join("\n")}\n  </testsuite>`
      );
    }

    const totalTests = this.results.length;
    const totalFailures = countWhere(this.results, isFailure);
    const totalErrors = countWhere(this.results, isError);
    const xml = [
      '<?xml version="1.0" encoding="UTF-8"?>',
      `<testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">`,
      suiteXmls.join("\n"),
      "</testsuites>",
      ""
    ].join("\n");
    await writeFile5(this.filePath, xml, "utf8");
  }
};
1553
+
1554
+ // src/commands/eval/yaml-writer.ts
1555
+ import { createWriteStream as createWriteStream2 } from "node:fs";
1556
+ import { mkdir as mkdir6 } from "node:fs/promises";
1557
+ import path9 from "node:path";
1558
+ import { finished as finished2 } from "node:stream/promises";
1559
+ import { stringify as stringifyYaml } from "yaml";
1560
/**
 * Output writer that streams each record as its own YAML document, separated
 * by `---` markers. Appends are serialized through a mutex and honour stream
 * backpressure.
 */
var YamlWriter = class _YamlWriter {
  stream;
  mutex = new Mutex();
  closed = false;
  // True until the first document is written; selects the separator form.
  isFirst = true;

  /** @param stream - Writable stream that receives the documents. */
  constructor(stream) {
    this.stream = stream;
  }

  /**
   * Ensure the parent directory exists and open `filePath` for writing
   * (flag "w", UTF-8).
   */
  static async open(filePath) {
    await mkdir6(path9.dirname(filePath), { recursive: true });
    const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
    return new _YamlWriter(stream);
  }

  /**
   * Write one record as a YAML document, with keys converted by
   * toSnakeCaseDeep. When `write()` reports backpressure, waits for the next
   * "drain" event and rejects if the stream emits "error" first.
   * @throws {Error} If the writer has already been closed.
   */
  async append(record) {
    await this.mutex.runExclusive(async () => {
      if (this.closed) {
        throw new Error("Cannot write to closed YAML writer");
      }
      const snakeCaseRecord = toSnakeCaseDeep(record);
      const yamlDoc = stringifyYaml(snakeCaseRecord, {
        indent: 2,
        // Disable line wrapping; the string style is left to the YAML library.
        lineWidth: 0
      });
      const normalizedYaml = normalizeLineEndings(yamlDoc);
      const separator = this.isFirst ? "---\n" : "\n---\n";
      this.isFirst = false;
      const content = `${separator}${normalizedYaml}`;
      if (!this.stream.write(content)) {
        await new Promise((resolve, reject) => {
          // Each handler removes its counterpart; otherwise the listener that
          // did not fire would stay attached and pile up on every wait.
          const onDrain = () => {
            this.stream.off("error", onError);
            resolve();
          };
          const onError = (err) => {
            this.stream.off("drain", onDrain);
            reject(err);
          };
          this.stream.once("drain", onDrain);
          this.stream.once("error", onError);
        });
      }
    });
  }

  /** End the stream and wait until it has finished. Idempotent. */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    this.stream.end();
    await finished2(this.stream);
  }
};
1607
+
1608
+ // src/commands/eval/output-writer.ts
1609
/**
 * Opens the result writer that matches an explicit output format name.
 *
 * @param filePath Destination file path handed to the writer's `open`.
 * @param format One of "jsonl", "yaml" or "html".
 * @returns Whatever the selected writer's `open(filePath)` resolves to.
 * @throws Error when `format` is not one of the supported names.
 */
async function createOutputWriter(filePath, format) {
  if (format === "jsonl") {
    return JsonlWriter.open(filePath);
  }
  if (format === "yaml") {
    return YamlWriter.open(filePath);
  }
  if (format === "html") {
    return HtmlWriter.open(filePath);
  }
  // Anything else is a format this function does not know about.
  throw new Error(`Unsupported output format: ${format}`);
}
1623
/**
 * Returns the file extension (including the leading dot) used by default
 * for an output format.
 *
 * @param format One of "jsonl", "yaml" or "html".
 * @returns ".jsonl", ".yaml" or ".html".
 * @throws Error when `format` is not one of the supported names.
 */
function getDefaultExtension(format) {
  const extensionByFormat = /* @__PURE__ */ new Map([
    ["jsonl", ".jsonl"],
    ["yaml", ".yaml"],
    ["html", ".html"]
  ]);
  const extension = extensionByFormat.get(format);
  if (extension === void 0) {
    throw new Error(`Unsupported output format: ${format}`);
  }
  return extension;
}
1637
// Extensions accepted by createWriterFromPath; also listed in its error message.
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
/**
 * Picks a writer from the (case-insensitive) extension of `filePath` and
 * opens it.
 *
 * @param filePath Destination file path.
 * @returns The result of the chosen writer's `open(filePath)`.
 * @throws Error (synchronously) when the extension is not supported.
 */
function createWriterFromPath(filePath) {
  const extension = path10.extname(filePath).toLowerCase();
  if (extension === ".jsonl") {
    return JsonlWriter.open(filePath);
  }
  if (extension === ".json") {
    return JsonWriter.open(filePath);
  }
  if (extension === ".xml") {
    return JunitWriter.open(filePath);
  }
  if (extension === ".yaml" || extension === ".yml") {
    return YamlWriter.open(filePath);
  }
  if (extension === ".html" || extension === ".htm") {
    return HtmlWriter.open(filePath);
  }
  const supportedList = [...SUPPORTED_EXTENSIONS].join(", ");
  throw new Error(
    `Unsupported output file extension "${extension}". Supported: ${supportedList}`
  );
}
1659
/**
 * Opens one writer per path and returns a composite writer that forwards
 * every call to all of them.
 *
 * @param filePaths Output paths; each is opened via `createWriterFromPath`.
 * @returns An object with `append(result)` and `close()`, each of which waits
 *   for the corresponding call on every underlying writer.
 */
async function createMultiWriter(filePaths) {
  const openWriter = (fp) => createWriterFromPath(fp);
  const writers = await Promise.all(filePaths.map(openWriter));
  // Runs `invoke` against every writer and waits for all of the calls.
  const forEachWriter = (invoke) => Promise.all(writers.map(invoke));
  return {
    async append(result) {
      await forEachWriter((w) => w.append(result));
    },
    async close() {
      await forEachWriter((w) => w.close());
    }
  };
}
1670
+
1671
+ // src/commands/eval/progress-display.ts
1672
/**
 * Line-oriented progress reporter: prints one console line per worker state
 * change and a numbered list of provider log paths.
 */
var ProgressDisplay = class {
  workers = /* @__PURE__ */ new Map();
  totalTests = 0;
  completedTests = 0;
  logPaths = [];
  logPathSet = /* @__PURE__ */ new Set();
  hasPrintedLogHeader = false;
  started = false;
  finished = false;
  verbose;
  /**
   * @param _maxWorkers Unused.
   * @param options Optional settings; `options.verbose` defaults to false.
   */
  constructor(_maxWorkers, options) {
    this.verbose = options?.verbose ?? false;
  }
  /** Always reports non-interactive mode. */
  isInteractiveMode() {
    return false;
  }
  /** Marks the display as started and not finished. */
  start() {
    this.started = true;
    this.finished = false;
  }
  /** Sets the denominator used in the "completed/total" prefix. */
  setTotalTests(count) {
    this.totalTests = count;
  }
  /**
   * Records the latest progress of a worker and prints a line for it.
   * "completed" and "failed" updates each increment the completed counter.
   *
   * @param progress Object with `workerId`, `status`, `testId` and optional
   *   `targetLabel` / `error`.
   */
  updateWorker(progress) {
    const { workerId, status, testId, targetLabel, error } = progress;
    const earlier = this.workers.get(workerId);
    this.workers.set(workerId, progress);
    const isTerminal = status === "completed" || status === "failed";
    if (isTerminal) {
      this.completedTests++;
    }
    const suffix = targetLabel ? ` | ${targetLabel}` : "";
    const counter = `${this.completedTests}/${this.totalTests}`;
    if (status === "pending") {
      // Pending lines are only shown in verbose mode, once per worker.
      if (this.verbose && !earlier) {
        console.log(`${counter} \u23F3 ${testId}${suffix}`);
      }
    } else if (status === "running") {
      // Print only on the transition into "running".
      if (!earlier || earlier.status === "pending") {
        console.log(`${counter} \u{1F504} ${testId}${suffix}`);
      }
    } else if (status === "completed") {
      console.log(`${counter} \u2705 ${testId}${suffix}`);
    } else if (status === "failed") {
      const reason = error ? `: ${error}` : "";
      console.log(`${counter} \u274C ${testId}${suffix}${reason}`);
    }
  }
  /**
   * Prints log paths that have not been printed before, numbered
   * consecutively across calls. The header is printed once, labelled by the
   * provider of the first call that adds a new path.
   *
   * @param paths Candidate log paths.
   * @param provider "pi", "copilot", or anything else (labelled "Codex CLI").
   */
  addLogPaths(paths, provider) {
    const fresh = [];
    for (const candidate of paths) {
      if (!this.logPathSet.has(candidate)) {
        this.logPathSet.add(candidate);
        fresh.push(candidate);
      }
    }
    if (fresh.length === 0) {
      return;
    }
    // Numbering continues from the paths already recorded.
    const firstNumber = this.logPaths.length + 1;
    this.logPaths.push(...fresh);
    if (!this.hasPrintedLogHeader) {
      console.log("");
      let label = "Codex CLI";
      if (provider === "pi") {
        label = "Pi Coding Agent";
      } else if (provider === "copilot") {
        label = "Copilot CLI";
      }
      console.log(`${label} logs:`);
      this.hasPrintedLogHeader = true;
    }
    fresh.forEach((logPath, offset) => {
      console.log(`${firstNumber + offset}. ${logPath}`);
    });
  }
  /** Marks the display as finished and prints a blank line. */
  finish() {
    this.finished = true;
    console.log("");
  }
  /** No-op. */
  clear() {
  }
};
1755
+
1756
+ // src/commands/eval/retry-errors.ts
1757
+ import { createReadStream } from "node:fs";
1758
+ import { createInterface } from "node:readline";
1759
/**
 * Reads a JSONL results file and collects the test ids of records whose
 * `executionStatus` is "execution_error".
 *
 * Blank lines and lines that cannot be processed as JSON records are skipped.
 *
 * @param jsonlPath Path of the JSONL file to read.
 * @returns Unique test ids in first-seen order.
 */
async function loadErrorTestIds(jsonlPath) {
  const uniqueIds = /* @__PURE__ */ new Set();
  const reader = createInterface({
    input: createReadStream(jsonlPath),
    crlfDelay: Number.POSITIVE_INFINITY
  });
  for await (const rawLine of reader) {
    const text = rawLine.trim();
    if (text.length === 0) continue;
    try {
      const record = JSON.parse(text);
      // Property access stays inside the try so that a non-object line
      // (for example `null`) is skipped like any other bad line.
      if (record.executionStatus === "execution_error" && record.testId) {
        uniqueIds.add(record.testId);
      }
    } catch {
    }
  }
  return [...uniqueIds];
}
1778
/**
 * Reads a JSONL results file and returns the records that are not execution
 * errors.
 *
 * A record is kept when it has a truthy `testId`, a defined `score`, and an
 * `executionStatus` other than "execution_error". Blank lines and lines that
 * cannot be processed as JSON records are skipped.
 *
 * @param jsonlPath Path of the JSONL file to read.
 * @returns The kept records, in file order.
 */
async function loadNonErrorResults(jsonlPath) {
  const kept = [];
  const reader = createInterface({
    input: createReadStream(jsonlPath),
    crlfDelay: Number.POSITIVE_INFINITY
  });
  for await (const rawLine of reader) {
    const text = rawLine.trim();
    if (text.length === 0) continue;
    try {
      const record = JSON.parse(text);
      const hasIdAndScore = Boolean(record.testId) && record.score !== void 0;
      if (hasIdAndScore && record.executionStatus !== "execution_error") {
        kept.push(record);
      }
    } catch {
    }
  }
  return kept;
}
1798
+
1799
+ // src/commands/eval/statistics.ts
1800
// Bin edges used by buildHistogram: each adjacent pair of values is one bin's [start, end] range.
var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
1801
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param values Numbers to average.
 * @returns The mean, or 0 for an empty list.
 */
function computeMean(values) {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / count;
}
1808
/**
 * Median of a list of numbers; the input is not modified.
 *
 * @param values Numbers to examine.
 * @returns The middle value (the average of the two middle values for an
 *   even count), or 0 for an empty list.
 */
function computeMedian(values) {
  if (values.length === 0) {
    return 0;
  }
  const ascending = Array.from(values);
  ascending.sort((left, right) => left - right);
  const upperMiddle = Math.floor(ascending.length / 2);
  const hasOddCount = ascending.length % 2 !== 0;
  if (hasOddCount) {
    return ascending[upperMiddle];
  }
  return (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
}
1819
/**
 * Sample standard deviation (divides by n - 1) of a list of numbers.
 *
 * @param values Numbers to examine.
 * @returns The standard deviation, or undefined when fewer than two values
 *   are given.
 */
function computeStandardDeviation(values) {
  const sampleSize = values.length;
  if (sampleSize < 2) {
    return void 0;
  }
  const center = computeMean(values);
  let squaredDeviationSum = 0;
  values.forEach((value) => {
    squaredDeviationSum += (value - center) ** 2;
  });
  return Math.sqrt(squaredDeviationSum / (sampleSize - 1));
}
1827
/**
 * Counts values into the bins defined by HISTOGRAM_BREAKPOINTS.
 *
 * Each value is counted in the first bin that accepts it. A non-final bin
 * accepts `start <= value < end + 1e-9`, so a value equal to a bin's upper
 * edge lands in that bin. The final bin accepts `start <= value <= end`.
 * Values that fit no bin are not counted.
 *
 * @param values Numbers to bucket.
 * @returns One `{ range: [start, end], count }` entry per bin, in order.
 */
function buildHistogram(values) {
  const lastBreakpoint = HISTOGRAM_BREAKPOINTS[HISTOGRAM_BREAKPOINTS.length - 1];
  const bins = HISTOGRAM_BREAKPOINTS.slice(0, -1).map((lower, index) => ({
    range: [lower, HISTOGRAM_BREAKPOINTS[index + 1]],
    count: 0
  }));
  for (const value of values) {
    const matchingBin = bins.find(({ range: [lower, upper] }) => {
      if (!(value >= lower)) {
        return false;
      }
      return upper === lastBreakpoint ? value <= upper : value < upper + 1e-9;
    });
    if (matchingBin) {
      matchingBin.count += 1;
    }
  }
  return bins;
}
1848
/**
 * Aggregates evaluation results into summary statistics.
 *
 * Score statistics (mean, median, min, max, standard deviation, histogram,
 * top/bottom results) are computed only over results whose `executionStatus`
 * is not "execution_error". `errors` lists every result with a defined
 * `error` field, and the failure breakdowns count execution errors by
 * `failureStage` and `failureReasonCode`.
 *
 * @param results Evaluation result records.
 * @returns The summary object; all-zero/empty for an empty input.
 */
function calculateEvaluationSummary(results) {
  const total = results.length;
  if (total === 0) {
    return {
      total: 0,
      mean: 0,
      median: 0,
      min: 0,
      max: 0,
      standardDeviation: void 0,
      histogram: buildHistogram([]),
      topResults: [],
      bottomResults: [],
      errorCount: 0,
      errors: [],
      executionErrorCount: 0,
      qualityFailureCount: 0,
      passedCount: 0,
      byFailureStage: {},
      byFailureReason: {}
    };
  }
  const errors = [];
  const executionErrors = [];
  const qualityResults = [];
  let qualityFailureCount = 0;
  let passedCount = 0;
  results.forEach((result) => {
    if (result.error !== void 0) {
      errors.push({ testId: result.testId, error: result.error });
    }
    const status = result.executionStatus;
    if (status === "execution_error") {
      executionErrors.push(result);
    } else {
      qualityResults.push(result);
    }
    if (status === "quality_failure") {
      qualityFailureCount += 1;
    } else if (status === "ok") {
      passedCount += 1;
    }
  });
  const qualityScores = qualityResults.map((result) => result.score);
  const hasScores = qualityScores.length > 0;
  const mean = computeMean(qualityScores);
  const median = computeMedian(qualityScores);
  const min = hasScores ? Math.min(...qualityScores) : 0;
  const max = hasScores ? Math.max(...qualityScores) : 0;
  const standardDeviation = computeStandardDeviation(qualityScores);
  const histogram = buildHistogram(qualityScores);
  // Highest score first; both slices keep that descending order.
  const ranked = [...qualityResults].sort((a, b) => b.score - a.score);
  const topResults = ranked.slice(0, 3);
  const bottomResults = ranked.slice(-3);
  const byFailureStage = {};
  const byFailureReason = {};
  const tally = (counter, key) => {
    if (key) {
      counter[key] = (counter[key] ?? 0) + 1;
    }
  };
  for (const failed of executionErrors) {
    tally(byFailureStage, failed.failureStage);
    tally(byFailureReason, failed.failureReasonCode);
  }
  return {
    total,
    mean,
    median,
    min,
    max,
    standardDeviation,
    histogram,
    topResults,
    bottomResults,
    errorCount: errors.length,
    errors,
    executionErrorCount: executionErrors.length,
    qualityFailureCount,
    passedCount,
    byFailureStage,
    byFailureReason
  };
}
1916
/**
 * Formats a score with exactly three digits after the decimal point.
 *
 * @param value Numeric score.
 * @returns The fixed-point string, e.g. "0.500".
 */
function formatScore(value) {
  const fractionDigits = 3;
  return value.toFixed(fractionDigits);
}
1919
/**
 * Renders an evaluation summary as multi-line text for the console.
 *
 * Sections, in order: execution errors (only when `errorCount > 0`), headline
 * counts and score statistics, the score histogram, the top and lowest
 * performing tests, and the execution-error breakdowns by stage and reason
 * (each only when non-empty).
 *
 * @param summary Summary object as produced by calculateEvaluationSummary.
 * @returns The formatted text; a fixed message when `summary.total` is 0.
 */
function formatEvaluationSummary(summary) {
  if (summary.total === 0) {
    return "\nNo results to summarize";
  }
  const rule = "==================================================";
  const output = [];
  // Section banner: a rule preceded by a blank line, the title, another rule.
  const pushBanner = (title) => {
    output.push(`\n${rule}`, title, rule);
  };
  const pushRanking = (heading, ranked) => {
    output.push(heading);
    ranked.forEach((entry, position) => {
      output.push(` ${position + 1}. ${entry.testId}: ${formatScore(entry.score)}`);
    });
  };
  const pushBreakdown = (heading, counts) => {
    const pairs = Object.entries(counts);
    if (pairs.length === 0) {
      return;
    }
    output.push(heading);
    for (const [label, count] of pairs) {
      output.push(` ${label}: ${count}`);
    }
  };
  if (summary.errorCount > 0) {
    pushBanner("EXECUTION ERRORS");
    for (const failure of summary.errors) {
      output.push(`\n\u274C ${failure.testId}`);
      output.push(` ${failure.error}`);
    }
    output.push("");
  }
  pushBanner("EVALUATION SUMMARY");
  output.push(`Total tests: ${summary.total}`);
  output.push(`Passed: ${summary.passedCount}`);
  if (summary.qualityFailureCount > 0) {
    output.push(`Quality failures: ${summary.qualityFailureCount}`);
  }
  const meanText = formatScore(summary.mean);
  if (summary.executionErrorCount > 0) {
    output.push(`Execution errors: ${summary.executionErrorCount}`);
    const qualityCount = summary.total - summary.executionErrorCount;
    output.push(
      `Mean score: ${meanText} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`
    );
  } else {
    output.push(`Mean score: ${meanText}`);
  }
  output.push(`Median score: ${formatScore(summary.median)}`);
  output.push(`Min score: ${formatScore(summary.min)}`);
  output.push(`Max score: ${formatScore(summary.max)}`);
  if (typeof summary.standardDeviation === "number") {
    output.push(`Std deviation: ${formatScore(summary.standardDeviation)}`);
  }
  output.push("\nScore distribution:");
  for (const { range: [start, end], count } of summary.histogram) {
    output.push(` ${start.toFixed(1)}-${end.toFixed(1)}: ${count}`);
  }
  pushRanking("\nTop performing tests:", summary.topResults);
  pushRanking("\nLowest performing tests:", summary.bottomResults);
  pushBreakdown("\nExecution errors by stage:", summary.byFailureStage);
  pushBreakdown("\nExecution errors by reason:", summary.byFailureReason);
  return output.join("\n");
}
1989
/**
 * Renders a tests-by-targets score table for runs that cover more than one
 * target.
 *
 * Rows are test ids and columns are targets, both sorted. A cell with no
 * result shows "-". The last row averages every result of each target.
 *
 * @param results Result records with `testId`, `target` and `score`.
 * @returns The table text, or "" when fewer than two distinct targets exist.
 */
function formatMatrixSummary(results) {
  const scoresByTest = /* @__PURE__ */ new Map();
  const targetNames = /* @__PURE__ */ new Set();
  for (const { testId, target, score } of results) {
    targetNames.add(target);
    let row = scoresByTest.get(testId);
    if (row === void 0) {
      row = /* @__PURE__ */ new Map();
      scoresByTest.set(testId, row);
    }
    // A later result for the same test/target pair replaces the earlier one.
    row.set(target, score);
  }
  const targets = [...targetNames].sort();
  if (targets.length < 2) {
    return "";
  }
  const testIds = [...scoresByTest.keys()].sort();
  const testIdColWidth = Math.max(7, ...testIds.map((id) => id.length));
  const targetColWidth = Math.max(7, ...targets.map((name) => name.length));
  const padCell = (text) => text.padEnd(targetColWidth);
  const renderRow = (label, cells) => `${label.padEnd(testIdColWidth)} ${cells.join(" ")}`;
  const header = renderRow("Test", targets.map(padCell));
  const separator = "-".repeat(header.length);
  const output = [
    "\n==================================================",
    "MATRIX RESULTS (tests \xD7 targets)",
    "==================================================",
    header,
    separator
  ];
  for (const testId of testIds) {
    const row = scoresByTest.get(testId);
    const cells = targets.map((target) => {
      const score = row?.get(target);
      return padCell(score !== void 0 ? formatScore(score) : "-");
    });
    output.push(renderRow(testId, cells));
  }
  output.push(separator);
  const averageCells = targets.map((target) => {
    let sum = 0;
    let count = 0;
    results.forEach((result) => {
      if (result.target === target) {
        sum += result.score;
        count += 1;
      }
    });
    return padCell(formatScore(count > 0 ? sum / count : 0));
  });
  output.push(renderRow("Average", averageCells));
  return output.join("\n");
}
2033
+
2034
+ // ../../packages/core/dist/evaluation/validation/index.js
2035
+ import { readFile as readFile2 } from "node:fs/promises";
2036
+ import path11 from "node:path";
2037
+ import { parse } from "yaml";
2038
+ import { readFile as readFile22 } from "node:fs/promises";
2039
+ import path22 from "node:path";
2040
+ import { parse as parse2 } from "yaml";
2041
+ import { readFile as readFile3 } from "node:fs/promises";
2042
+ import path32 from "node:path";
2043
+ import { parse as parse3 } from "yaml";
2044
+ import { readFile as readFile4 } from "node:fs/promises";
2045
+ import { parse as parse4 } from "yaml";
2046
+ import { readFile as readFile5 } from "node:fs/promises";
2047
+ import path42 from "node:path";
2048
+ import { parse as parse5 } from "yaml";
2049
// `$schema` values recognised by detectFileType.
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
/**
 * Determines whether a YAML file is an eval, targets or config file.
 *
 * The file's top-level `$schema` string decides when it matches one of the
 * known schema ids. In every other case (unreadable or unparsable file,
 * non-object document, missing or unknown `$schema`) the type is inferred
 * from the file path instead.
 *
 * @param filePath Path of the file to inspect.
 * @returns "eval", "targets" or "config".
 */
async function detectFileType(filePath) {
  let declaredSchema;
  try {
    const text = await readFile2(filePath, "utf8");
    const document = parse(text);
    if (document !== null && typeof document === "object") {
      declaredSchema = document.$schema;
    }
  } catch {
    // Read or parse problems are not fatal; fall back to the path heuristic.
    return inferFileTypeFromPath(filePath);
  }
  const typeBySchema = /* @__PURE__ */ new Map([
    [SCHEMA_EVAL_V2, "eval"],
    [SCHEMA_TARGETS_V2, "targets"],
    [SCHEMA_CONFIG_V2, "config"]
  ]);
  return typeBySchema.get(declaredSchema) ?? inferFileTypeFromPath(filePath);
}
2078
/**
 * Guesses a file's type from its location and name alone.
 *
 * A file named `config.yaml`/`config.yml` or `targets.yaml`/`targets.yml`
 * that sits somewhere below a `.agentv` directory is classified as "config"
 * or "targets". Everything else is treated as an eval file.
 *
 * The `.agentv` check works on whole directory segments, so it also matches
 * relative paths that start with `.agentv/` (for example
 * `.agentv/targets.yaml`), which a plain "/.agentv/" substring test misses.
 *
 * @param filePath Absolute or relative file path; backslashes are treated as
 *   separators.
 * @returns "config", "targets" or "eval".
 */
function inferFileTypeFromPath(filePath) {
  const normalized = path11.normalize(filePath).replace(/\\/g, "/");
  const basename = path11.basename(filePath);
  // Every segment except the last one names a directory on the way to the file.
  const directorySegments = normalized.split("/").slice(0, -1);
  if (directorySegments.includes(".agentv")) {
    if (basename === "config.yaml" || basename === "config.yml") {
      return "config";
    }
    if (basename === "targets.yaml" || basename === "targets.yml") {
      return "targets";
    }
  }
  return "eval";
}
2091
// Assertion types for which validateAssertArray requires `value` to be a string.
var ASSERTION_TYPES_WITH_STRING_VALUE = /* @__PURE__ */ new Set([
  "contains",
  "icontains",
  "starts-with",
  "ends-with",
  "equals",
  "regex"
]);
// Assertion types for which validateAssertArray requires `value` to be a non-empty array.
var ASSERTION_TYPES_WITH_ARRAY_VALUE = /* @__PURE__ */ new Set([
  "contains-any",
  "contains-all",
  "icontains-any",
  "icontains-all"
]);
// Extensions accepted by validateTestsStringPath when `tests` is a file path string.
var VALID_TEST_FILE_EXTENSIONS = /* @__PURE__ */ new Set([".yaml", ".yml", ".jsonl"]);
// Format that validateMetadata expects of the suite `name`: lowercase letters, digits and hyphens.
var NAME_PATTERN = /^[a-z0-9-]+$/;
2107
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param value Any value.
 * @returns true for objects other than null and arrays, false otherwise.
 */
function isObject(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
2110
/**
 * Validates the structure of an eval YAML file.
 *
 * Findings are collected as `{ severity, filePath, location?, message }`
 * entries. The result is valid when no finding has severity "error";
 * warnings (deprecated keys, metadata hints, assertion and workspace
 * findings) do not make it invalid.
 *
 * @param filePath Path of the eval file; it is resolved to an absolute path,
 *   which is what appears in the result and in every finding.
 * @returns `{ valid, filePath, fileType: "eval", errors }`.
 */
async function validateEvalFile(filePath) {
  const errors = [];
  const absolutePath = path22.resolve(filePath);
  let parsed;
  try {
    const content = await readFile22(absolutePath, "utf8");
    parsed = parse2(content);
  } catch (error) {
    // Reading and parsing share this handler, so a read failure is also
    // reported with the "Failed to parse YAML" prefix.
    errors.push({
      severity: "error",
      filePath: absolutePath,
      message: `Failed to parse YAML: ${error.message}`
    });
    return {
      valid: false,
      filePath: absolutePath,
      fileType: "eval",
      errors
    };
  }
  if (!isObject(parsed)) {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      message: "File must contain a YAML object"
    });
    return {
      valid: false,
      filePath: absolutePath,
      fileType: "eval",
      errors
    };
  }
  validateMetadata(parsed, absolutePath, errors);
  // Optional suite-level input: a string or an array of messages.
  const suiteInput = parsed.input;
  if (suiteInput !== void 0) {
    if (typeof suiteInput === "string") {
      // A plain string needs no further checks.
    } else if (Array.isArray(suiteInput)) {
      validateMessages(suiteInput, "input", absolutePath, errors);
    } else {
      errors.push({
        severity: "error",
        filePath: absolutePath,
        location: "input",
        message: "Invalid suite-level 'input' field (must be a string or array of messages)"
      });
    }
  }
  // `tests` is the current key; `eval_cases` and `evalcases` are accepted
  // with a deprecation warning when `tests` is absent.
  let cases = parsed.tests;
  if (cases === void 0 && "eval_cases" in parsed) {
    cases = parsed.eval_cases;
    errors.push({
      severity: "warning",
      filePath: absolutePath,
      location: "eval_cases",
      message: "'eval_cases' is deprecated. Use 'tests' instead."
    });
  }
  if (cases === void 0 && "evalcases" in parsed) {
    cases = parsed.evalcases;
    errors.push({
      severity: "warning",
      filePath: absolutePath,
      location: "evalcases",
      message: "'evalcases' is deprecated. Use 'tests' instead."
    });
  }
  // A string is treated as a path to an external tests file: only the path
  // itself is checked and validation ends here.
  if (typeof cases === "string") {
    validateTestsStringPath(cases, absolutePath, errors);
    return {
      valid: errors.filter((e) => e.severity === "error").length === 0,
      filePath: absolutePath,
      fileType: "eval",
      errors
    };
  }
  if (!Array.isArray(cases)) {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      location: "tests",
      message: "Missing or invalid 'tests' field (must be an array or a file path string)"
    });
    // An error was just pushed, so this is always false here.
    return {
      valid: errors.length === 0,
      filePath: absolutePath,
      fileType: "eval",
      errors
    };
  }
  for (let i = 0; i < cases.length; i++) {
    const evalCase = cases[i];
    // Locations always use the `tests[...]` prefix, even when the cases came
    // from one of the deprecated keys.
    const location = `tests[${i}]`;
    if (!isObject(evalCase)) {
      errors.push({
        severity: "error",
        filePath: absolutePath,
        location,
        message: "Eval case must be an object"
      });
      continue;
    }
    const id = evalCase.id;
    if (typeof id !== "string" || id.trim().length === 0) {
      errors.push({
        severity: "error",
        filePath: absolutePath,
        location: `${location}.id`,
        message: "Missing or invalid 'id' field (must be a non-empty string)"
      });
    }
    // `criteria` is optional; `expected_outcome` is its deprecated alias.
    let criteria = evalCase.criteria;
    if (criteria === void 0 && "expected_outcome" in evalCase) {
      criteria = evalCase.expected_outcome;
      errors.push({
        severity: "warning",
        filePath: absolutePath,
        location: `${location}.expected_outcome`,
        message: "'expected_outcome' is deprecated. Use 'criteria' instead."
      });
    }
    if (criteria !== void 0 && (typeof criteria !== "string" || criteria.trim().length === 0)) {
      errors.push({
        severity: "error",
        filePath: absolutePath,
        location: `${location}.criteria`,
        message: "Invalid 'criteria' field (must be a non-empty string if provided)"
      });
    }
    // `input` is required per case: a string or an array of messages.
    const inputField = evalCase.input;
    if (inputField !== void 0) {
      if (typeof inputField === "string") {
        // A plain string needs no further checks.
      } else if (Array.isArray(inputField)) {
        validateMessages(inputField, `${location}.input`, absolutePath, errors);
      } else {
        errors.push({
          severity: "error",
          filePath: absolutePath,
          location: `${location}.input`,
          message: "Invalid 'input' field (must be a string or array of messages)"
        });
      }
    } else {
      errors.push({
        severity: "error",
        filePath: absolutePath,
        location: `${location}.input`,
        message: "Missing 'input' field (must be a string or array of messages)"
      });
    }
    // `expected_output` is optional: a string, an object, or an array.
    const expectedOutputField = evalCase.expected_output;
    if (expectedOutputField !== void 0) {
      if (typeof expectedOutputField === "string") {
        // A plain string needs no further checks.
      } else if (Array.isArray(expectedOutputField)) {
        // Only arrays whose first element is an object with a `role` key are
        // validated as messages; other arrays are accepted as-is.
        if (expectedOutputField.length > 0 && isObject(expectedOutputField[0]) && "role" in expectedOutputField[0]) {
          validateMessages(
            expectedOutputField,
            `${location}.expected_output`,
            absolutePath,
            errors
          );
        }
      } else if (isObject(expectedOutputField)) {
        // Any object is accepted without further checks.
      } else {
        errors.push({
          severity: "error",
          filePath: absolutePath,
          location: `${location}.expected_output`,
          message: "Invalid 'expected_output' field (must be a string, object, or array)"
        });
      }
    }
    // `assertions` takes precedence over `assert` when both are present.
    const assertField = evalCase.assertions ?? evalCase.assert;
    if (assertField !== void 0) {
      validateAssertArray(assertField, location, absolutePath, errors);
    }
  }
  if (isObject(parsed.workspace)) {
    validateWorkspaceRepoConfig(parsed.workspace, absolutePath, errors);
  }
  return {
    valid: errors.filter((e) => e.severity === "error").length === 0,
    filePath: absolutePath,
    fileType: "eval",
    errors
  };
}
2297
/**
 * Adds warnings for questionable `workspace` settings. It only ever appends
 * findings of severity "warning".
 *
 * Checks performed:
 * - a repo whose numeric `clone.depth` is smaller than `checkout.ancestor + 1`;
 * - `hooks.after_each.reset` set (and not "none") while no repos are configured;
 * - `hooks.after_each.reset` set together with `isolation: per_test`.
 *
 * @param workspace The `workspace` object of the eval file.
 * @param filePath File path recorded on each finding.
 * @param errors Findings array that warnings are appended to.
 */
function validateWorkspaceRepoConfig(workspace, filePath, errors) {
  const { repos, hooks, isolation } = workspace;
  const addWarning = (location, message) => {
    errors.push({ severity: "warning", filePath, location, message });
  };
  const repoList = Array.isArray(repos) ? repos : [];
  for (const repo of repoList) {
    if (!isObject(repo)) continue;
    const { checkout, clone } = repo;
    if (!(isObject(checkout) && isObject(clone))) continue;
    const { ancestor } = checkout;
    const { depth } = clone;
    const bothNumeric = typeof ancestor === "number" && typeof depth === "number";
    if (bothNumeric && depth < ancestor + 1) {
      addWarning(
        `workspace.repos[path=${repo.path}]`,
        `clone.depth (${depth}) may be insufficient for checkout.ancestor (${ancestor}). Recommend depth >= ${ancestor + 1}.`
      );
    }
  }
  const afterEachHook = isObject(hooks) ? hooks.after_each : void 0;
  if (!isObject(afterEachHook) || !afterEachHook.reset) {
    // Both remaining checks only apply when a reset mode is set.
    return;
  }
  const { reset } = afterEachHook;
  if (reset !== "none" && repoList.length === 0) {
    addWarning(
      "workspace.hooks.after_each",
      `hooks.after_each.reset '${reset}' has no effect without repos.`
    );
  }
  if (isolation === "per_test") {
    addWarning(
      "workspace.hooks.after_each",
      "hooks.after_each.reset is redundant with isolation: per_test (each test gets a fresh workspace)."
    );
  }
}
2340
/**
 * Validates an array of chat-style messages and appends findings to `errors`.
 *
 * Each message must be an object with a role of "system", "user" or
 * "assistant", and a `content` that is either a string or an array. Array
 * items may be strings or objects with a string `type`; items of type "text"
 * must also carry a string `value`. Every string of text found is passed to
 * `validateContentForRoleMarkers`.
 *
 * @param messages Array of message values to check.
 * @param location Location prefix used in findings (e.g. "tests[0].input").
 * @param filePath File path recorded on each finding.
 * @param errors Findings array that errors are appended to.
 */
function validateMessages(messages, location, filePath, errors) {
  const allowedRoles = ["system", "user", "assistant"];
  const report = (where, text) => {
    errors.push({ severity: "error", filePath, location: where, message: text });
  };
  // Checks a single entry of an array-valued `content`.
  const checkContentItem = (item, where) => {
    if (typeof item === "string") {
      validateContentForRoleMarkers(item, where, filePath, errors);
      return;
    }
    if (!isObject(item)) {
      report(where, "Content array items must be strings or objects");
      return;
    }
    const { type } = item;
    if (typeof type !== "string") {
      report(`${where}.type`, "Content object must have a 'type' field");
    }
    if (type !== "text") {
      return;
    }
    const { value } = item;
    if (typeof value === "string") {
      validateContentForRoleMarkers(value, `${where}.value`, filePath, errors);
    } else {
      report(`${where}.value`, "Content with type 'text' must have a 'value' field");
    }
  };
  for (const [index, message] of messages.entries()) {
    const here = `${location}[${index}]`;
    if (!isObject(message)) {
      report(here, "Message must be an object");
      continue;
    }
    const { role, content } = message;
    if (!allowedRoles.includes(role)) {
      report(`${here}.role`, `Invalid role '${role}'. Must be one of: ${allowedRoles.join(", ")}`);
    }
    if (typeof content === "string") {
      validateContentForRoleMarkers(content, `${here}.content`, filePath, errors);
    } else if (Array.isArray(content)) {
      for (const [itemIndex, item] of content.entries()) {
        checkContentItem(item, `${here}.content[${itemIndex}]`);
      }
    } else {
      report(`${here}.content`, "Missing or invalid 'content' field (must be a string or array)");
    }
  }
}
2414
/**
 * Checks the suite-level `name`/`description` metadata and appends warnings.
 *
 * Nothing is checked when `name` is absent. Otherwise a string name must
 * match NAME_PATTERN, and a `description` is expected alongside it.
 *
 * @param parsed Parsed eval file object.
 * @param filePath File path recorded on each finding.
 * @param errors Findings array that warnings are appended to.
 */
function validateMetadata(parsed, filePath, errors) {
  const { name } = parsed;
  if (name === void 0) {
    return;
  }
  const warnAboutName = (message) => {
    errors.push({ severity: "warning", filePath, location: "name", message });
  };
  const hasBadFormat = typeof name === "string" && !NAME_PATTERN.test(name);
  if (hasBadFormat) {
    warnAboutName(
      `Invalid 'name' format '${name}'. Must match pattern /^[a-z0-9-]+$/ (lowercase alphanumeric with hyphens).`
    );
  }
  if (parsed.description === void 0) {
    warnAboutName("When 'name' is present, 'description' should also be provided.");
  }
}
2437
/**
 * Warns when a `tests` file path does not end in a supported extension.
 * The extension comparison is case-sensitive.
 *
 * @param testsPath The string value of the `tests` field.
 * @param filePath Path of the eval file, recorded on the finding.
 * @param errors Findings array that the warning is appended to.
 */
function validateTestsStringPath(testsPath, filePath, errors) {
  const extension = path22.extname(testsPath);
  if (VALID_TEST_FILE_EXTENSIONS.has(extension)) {
    return;
  }
  const supported = [...VALID_TEST_FILE_EXTENSIONS].join(", ");
  errors.push({
    severity: "warning",
    filePath,
    location: "tests",
    message: `Unsupported file extension '${extension}' for tests path '${testsPath}'. Supported extensions: ${supported}`
  });
}
2448
/**
 * Validates an `assertions` array and appends a warning for every problem.
 *
 * The work is done in two passes so that findings keep a stable order:
 * the first pass reports empty strings and non-object entries, the second
 * inspects each object entry (type, value, required).
 *
 * @param {unknown} assertField - Raw value of the `assertions` field.
 * @param {string} parentLocation - Location prefix of the owning node.
 * @param {string} filePath - Path of the file being validated.
 * @param {Array<object>} errors - Accumulator that receives the findings.
 */
function validateAssertArray(assertField, parentLocation, filePath, errors) {
  const warn = (where, message) => {
    errors.push({ severity: "warning", filePath, location: where, message });
  };
  if (!Array.isArray(assertField)) {
    warn(`${parentLocation}.assertions`, "'assertions' must be an array of assertion objects.");
    return;
  }

  // Pass 1: classify entries; only object entries go on to pass 2.
  const objectEntries = [];
  assertField.forEach((entry, position) => {
    const where = `${parentLocation}.assertions[${position}]`;
    if (typeof entry === "string") {
      if (entry.trim().length === 0) {
        warn(where, "Empty string assertion item will be ignored.");
      }
      return;
    }
    if (!isObject(entry)) {
      warn(where, "Assertion item must be a string or an object with a type field.");
      return;
    }
    objectEntries.push({ entry, where });
  });

  // Pass 2: per-object checks. The first failed check ends that entry.
  for (const { entry, where } of objectEntries) {
    const declaredType = entry.type;
    if (typeof declaredType !== "string") {
      warn(`${where}.type`, "Assertion item is missing a 'type' field.");
      continue;
    }
    // Underscore and hyphen spellings are treated as the same type.
    const kind = declaredType.replace(/_/g, "-");
    if (!isEvaluatorKind(kind)) {
      warn(`${where}.type`, `Unknown assertion type '${declaredType}'.`);
      continue;
    }
    if (ASSERTION_TYPES_WITH_STRING_VALUE.has(kind)) {
      const text = entry.value;
      if (typeof text !== "string") {
        warn(`${where}.value`, `Assertion type '${kind}' requires a 'value' field (string).`);
        continue;
      }
      if (kind === "regex") {
        try {
          new RegExp(text);
        } catch {
          warn(`${where}.value`, `Invalid regex pattern '${text}': not a valid regular expression.`);
        }
      }
    }
    if (ASSERTION_TYPES_WITH_ARRAY_VALUE.has(kind)) {
      const list = entry.value;
      if (!Array.isArray(list) || list.length === 0) {
        warn(`${where}.value`, `Assertion type '${kind}' requires a 'value' field (non-empty string array).`);
        continue;
      }
    }
    if (entry.required !== void 0) {
      validateRequiredField(entry.required, where, filePath, errors);
    }
  }
}
2547
/**
 * Validates an assertion's `required` field.
 *
 * Accepted values are any boolean, or a number in the half-open range
 * (0, 1]. Anything else yields a warning appended to `errors`.
 *
 * @param {unknown} required - Raw value of the `required` field.
 * @param {string} parentLocation - Location of the owning assertion.
 * @param {string} filePath - Path of the file being validated.
 * @param {Array<object>} errors - Accumulator that receives the finding.
 */
function validateRequiredField(required, parentLocation, filePath, errors) {
  if (typeof required === "boolean") {
    return;
  }
  const location = `${parentLocation}.required`;
  if (typeof required === "number") {
    // Positive range test on purpose: every comparison with NaN is false,
    // so the negated form `required <= 0 || required > 1` lets NaN through.
    const inRange = required > 0 && required <= 1;
    if (!inRange) {
      errors.push({
        severity: "warning",
        filePath,
        location,
        message: `Invalid 'required' value ${required}. When a number, it must be between 0 (exclusive) and 1 (inclusive).`
      });
    }
    return;
  }
  errors.push({
    severity: "warning",
    filePath,
    location,
    message: `Invalid 'required' value. Must be a boolean or a number between 0 (exclusive) and 1 (inclusive).`
  });
}
2569
/**
 * Warns about role-marker look-alikes (e.g. "@[User]:") inside content.
 * Matching is case-insensitive; one warning is emitted per marker found.
 *
 * @param {string} content - Text to scan.
 * @param {string} location - Location reported on each finding.
 * @param {string} filePath - Path of the file being validated.
 * @param {Array<object>} errors - Accumulator that receives the findings.
 */
function validateContentForRoleMarkers(content, location, filePath, errors) {
  const haystack = content.toLowerCase();
  ["@[System]:", "@[User]:", "@[Assistant]:", "@[Tool]:"]
    .filter((marker) => haystack.includes(marker.toLowerCase()))
    .forEach((marker) => {
      errors.push({
        severity: "warning",
        filePath,
        location,
        message: `Content contains potential role marker '${marker}'. This may confuse agentic providers or cause prompt injection.`
      });
    });
}
2582
/**
 * Tells whether `value` is a non-null, non-array object.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isObject2(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
2585
// Setting keys recognised per provider. Most keys are listed in both their
// snake_case and camelCase spelling.

// Accepted by every provider that has a settings set.
var COMMON_SETTINGS = /* @__PURE__ */ new Set(["provider_batching", "providerBatching"]);

// Retry tuning keys, included by the Azure, Anthropic and Gemini sets.
var RETRY_SETTINGS = /* @__PURE__ */ new Set([
  "max_retries", "maxRetries",
  "retry_initial_delay_ms", "retryInitialDelayMs",
  "retry_max_delay_ms", "retryMaxDelayMs",
  "retry_backoff_factor", "retryBackoffFactor",
  "retry_status_codes", "retryStatusCodes"
]);

var AZURE_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  ...RETRY_SETTINGS,
  "endpoint",
  "resource", "resourceName",
  "api_key", "apiKey",
  "deployment", "deploymentName",
  "model",
  "version", "api_version",
  "temperature",
  "max_output_tokens", "maxTokens"
]);

var ANTHROPIC_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  ...RETRY_SETTINGS,
  "api_key", "apiKey",
  "model", "deployment", "variant",
  "temperature",
  "max_output_tokens", "maxTokens",
  "thinking_budget", "thinkingBudget"
]);

var GEMINI_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  ...RETRY_SETTINGS,
  "api_key", "apiKey",
  "model", "deployment", "variant",
  "temperature",
  "max_output_tokens", "maxTokens"
]);

var CODEX_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "model",
  "executable", "command", "binary",
  "args", "arguments",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir",
  "log_directory", "logDirectory",
  "log_format", "logFormat",
  "log_output_format", "logOutputFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate"
]);

var COPILOT_SDK_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "cli_url", "cliUrl",
  "cli_path", "cliPath",
  "github_token", "githubToken",
  "model",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir",
  "log_format", "logFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate"
]);

var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "executable", "command", "binary",
  "args", "arguments",
  "model",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir",
  "log_format", "logFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate"
]);

var VSCODE_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "executable",
  "workspace_template", "workspaceTemplate",
  "wait",
  "dry_run", "dryRun",
  "subagent_root", "subagentRoot",
  "timeout_seconds", "timeoutSeconds"
]);

var MOCK_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "response",
  "delayMs", "delayMinMs", "delayMaxMs",
  // For testing tool-trajectory evaluator
  "trace"
]);

var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
  ...COMMON_SETTINGS,
  "model",
  "cwd",
  "timeout_seconds", "timeoutSeconds",
  "log_dir", "logDir",
  "log_directory", "logDirectory",
  "log_format", "logFormat",
  "log_output_format", "logOutputFormat",
  "system_prompt", "systemPrompt",
  "workspace_template", "workspaceTemplate",
  "max_turns", "maxTurns",
  "max_budget_usd", "maxBudgetUsd"
]);
2751
/**
 * Returns the set of setting keys known for a provider, or `null` when the
 * provider has no such set. The lookup lower-cases the provider name.
 *
 * @param {string} provider - Provider name or alias.
 * @returns {Set<string> | null}
 */
function getKnownSettings(provider) {
  const wanted = provider.toLowerCase();
  const table = [
    [["azure", "azure-openai"], AZURE_SETTINGS],
    [["anthropic"], ANTHROPIC_SETTINGS],
    [["gemini", "google", "google-gemini"], GEMINI_SETTINGS],
    [["codex", "codex-cli"], CODEX_SETTINGS],
    [["copilot-sdk", "copilot_sdk"], COPILOT_SDK_SETTINGS],
    [["copilot", "copilot-cli"], COPILOT_CLI_SETTINGS],
    [["claude", "claude-code", "claude-cli", "claude-sdk"], CLAUDE_SETTINGS],
    [["vscode", "vscode-insiders"], VSCODE_SETTINGS],
    [["mock"], MOCK_SETTINGS]
  ];
  for (const [aliases, settings] of table) {
    if (aliases.includes(wanted)) {
      return settings;
    }
  }
  // "cli" and any unrecognised provider have no settings set.
  return null;
}
2788
/**
 * Reports keys on a target that are removed or not recognised for its
 * provider. Providers without a known settings set are not checked at all.
 *
 * Removed keys produce an error; unrecognised keys produce a warning.
 *
 * @param {object} target - The target definition being validated.
 * @param {string} provider - Provider name as written in the file.
 * @param {string} absolutePath - Path reported on every finding.
 * @param {string} location - Location prefix of the target.
 * @param {Array<object>} errors - Accumulator that receives the findings.
 */
function validateUnknownSettings(target, provider, absolutePath, location, errors) {
  const removedKeys = /* @__PURE__ */ new Set(["workspace_template", "workspaceTemplate"]);
  const providerKeys = getKnownSettings(provider);
  if (!providerKeys) {
    return;
  }
  // Keys valid on any target regardless of provider.
  const sharedKeys = /* @__PURE__ */ new Set([
    "name",
    "provider",
    "grader_target",
    "judge_target",
    "workers",
    "$schema",
    "targets"
  ]);
  Object.keys(target).forEach((key) => {
    const where = `${location}.${key}`;
    // The removed-key check runs first, so it takes precedence over the
    // provider sets.
    if (removedKeys.has(key)) {
      errors.push({
        severity: "error",
        filePath: absolutePath,
        location: where,
        message: "target-level workspace_template has been removed. Use eval-level workspace.template."
      });
      return;
    }
    const recognised = sharedKeys.has(key) || providerKeys.has(key);
    if (!recognised) {
      errors.push({
        severity: "warning",
        filePath: absolutePath,
        location: where,
        message: `Unknown setting '${key}' for ${provider} provider. This property will be ignored.`
      });
    }
  });
}
2823
/**
 * Reads and validates a targets YAML file.
 *
 * Checks that the file parses to an object with a `targets` array, then
 * validates each target: `name`, `provider`, CLI-specific settings,
 * unknown/removed settings and the optional grader target.
 *
 * @param {string} filePath - Path to the targets file (resolved to absolute).
 * @returns {Promise<{valid: boolean, filePath: string, fileType: "targets", errors: Array<object>}>}
 *   `valid` is false when at least one finding has severity "error".
 */
async function validateTargetsFile(filePath) {
  const errors = [];
  const absolutePath = path32.resolve(filePath);

  // Appends one finding; `location` is omitted from the object when absent.
  const report = (severity, message, location) => {
    errors.push({
      severity,
      filePath: absolutePath,
      ...(location === void 0 ? {} : { location }),
      message
    });
  };
  const finish = (valid) => ({
    valid,
    filePath: absolutePath,
    fileType: "targets",
    errors
  });

  // Flags every `{NAME}` placeholder that is not in CLI_PLACEHOLDERS.
  const recordUnknownPlaceholders = (template, location) => {
    for (const match of template.matchAll(/\{([A-Z_]+)\}/g)) {
      const placeholder = match[1];
      if (placeholder && !CLI_PLACEHOLDERS.has(placeholder)) {
        report(
          "error",
          `Unknown CLI placeholder '{${placeholder}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`,
          location
        );
      }
    }
  };

  // Validates the optional `healthcheck` object of a CLI target.
  const validateCliHealthcheck = (healthcheck, location) => {
    if (!isObject2(healthcheck)) {
      report("error", "'healthcheck' must be an object when provided", location);
      return;
    }
    const timeoutSeconds = healthcheck.timeout_seconds ?? healthcheck.timeoutSeconds;
    if (timeoutSeconds !== void 0) {
      const numericTimeout = Number(timeoutSeconds);
      if (!Number.isFinite(numericTimeout) || numericTimeout <= 0) {
        report(
          "error",
          "healthcheck.timeoutSeconds must be a positive number when provided",
          `${location}.timeoutSeconds`
        );
      }
    }
    const hasUrl = typeof healthcheck.url === "string" && healthcheck.url.trim().length > 0;
    const hasCommand = typeof healthcheck.command === "string" && healthcheck.command.trim().length > 0;
    if (!hasUrl && !hasCommand) {
      report("error", "healthcheck must have either 'url' (HTTP) or 'command' (command)", location);
      return;
    }
    // When a URL is present the command-specific checks are skipped.
    if (hasUrl) {
      return;
    }
    recordUnknownPlaceholders(healthcheck.command, `${location}.command`);
    const cwd = healthcheck.cwd;
    if (cwd !== void 0 && typeof cwd !== "string") {
      report("error", "healthcheck.cwd must be a string when provided", `${location}.cwd`);
    }
  };

  // Validates the settings that only apply to `provider: cli` targets.
  const validateCliSettings = (target, location) => {
    const command = target.command;
    if (typeof command !== "string" || command.trim().length === 0) {
      report("error", "CLI provider requires 'command' as a non-empty string", `${location}.command`);
    } else {
      recordUnknownPlaceholders(command, `${location}.command`);
    }
    if (target.healthcheck !== void 0) {
      validateCliHealthcheck(target.healthcheck, `${location}.healthcheck`);
    }
  };

  let parsed;
  try {
    const content = await readFile3(absolutePath, "utf8");
    parsed = parse3(content);
  } catch (error) {
    // A thrown value is not guaranteed to be an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    report("error", `Failed to parse YAML: ${reason}`);
    return finish(false);
  }

  if (!isObject2(parsed)) {
    report("error", "File must contain a YAML object");
    return finish(false);
  }

  const targets = parsed.targets;
  if (!Array.isArray(targets)) {
    report("error", "Missing or invalid 'targets' field (must be an array)", "targets");
    return finish(false);
  }

  const knownProviders = [...KNOWN_PROVIDERS, ...PROVIDER_ALIASES];
  for (let i = 0; i < targets.length; i++) {
    const target = targets[i];
    const location = `targets[${i}]`;
    if (!isObject2(target)) {
      report("error", "Target must be an object", location);
      continue;
    }

    const name = target.name;
    if (typeof name !== "string" || name.trim().length === 0) {
      report("error", "Missing or invalid 'name' field (must be a non-empty string)", `${location}.name`);
    }

    const provider = target.provider;
    const providerValue = typeof provider === "string" ? provider.trim().toLowerCase() : void 0;
    if (typeof provider !== "string" || provider.trim().length === 0) {
      report("error", "Missing or invalid 'provider' field (must be a non-empty string)", `${location}.provider`);
    } else if (!knownProviders.includes(provider)) {
      report(
        "warning",
        `Unknown provider '${provider}'. Known providers: ${knownProviders.join(", ")}`,
        `${location}.provider`
      );
    }

    if (providerValue === "cli") {
      validateCliSettings(target, location);
    }
    if (typeof provider === "string") {
      validateUnknownSettings(target, provider, absolutePath, location, errors);
    }

    // `judge_target` is read as a fallback for `grader_target`.
    const graderTarget = target.grader_target ?? target.judge_target;
    if (graderTarget !== void 0 && typeof graderTarget !== "string") {
      report("error", "Invalid 'grader_target' field (must be a string)", `${location}.grader_target`);
    }
  }

  return finish(errors.every((e) => e.severity !== "error"));
}
3026
/**
 * Reads and validates a config YAML file.
 *
 * The file must parse to a mapping (not a scalar, null or sequence).
 * `guideline_patterns` and `eval_patterns`, when present, must be arrays of
 * strings (an empty array is a warning); `required_version`, when present,
 * must be a non-empty string. Any other top-level key outside the allowed
 * list is reported as a warning.
 *
 * @param {string} filePath - Path to the config file.
 * @returns {Promise<{valid: boolean, filePath: string, fileType: "config", errors: Array<object>}>}
 *   `valid` is false when at least one finding has severity "error".
 */
async function validateConfigFile(filePath) {
  const errors = [];

  // Appends one finding; `location` is omitted from the object when absent.
  const report = (severity, message, location) => {
    errors.push({
      severity,
      filePath,
      ...(location === void 0 ? {} : { location }),
      message
    });
  };
  const finish = (valid) => ({ valid, filePath, fileType: "config", errors });

  // Shared check for the optional "array of strings" pattern fields.
  const checkPatternList = (config, field) => {
    const patterns = config[field];
    if (patterns === void 0) {
      return;
    }
    if (!Array.isArray(patterns)) {
      report("error", `Field '${field}' must be an array`, field);
    } else if (!patterns.every((p) => typeof p === "string")) {
      report("error", `All entries in '${field}' must be strings`, field);
    } else if (patterns.length === 0) {
      report("warning", `Field '${field}' is empty. Consider removing it or adding patterns.`, field);
    }
  };

  try {
    const content = await readFile4(filePath, "utf8");
    const parsed = parse4(content);
    // Arrays are `typeof "object"` too; a top-level YAML sequence is not a
    // valid config and would otherwise surface as "Unexpected fields: 0, 1".
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      report("error", "Config file must contain a valid YAML object");
      return finish(false);
    }
    const config = parsed;

    checkPatternList(config, "guideline_patterns");
    checkPatternList(config, "eval_patterns");

    const requiredVersion = config.required_version;
    if (requiredVersion !== void 0) {
      if (typeof requiredVersion !== "string" || requiredVersion.trim().length === 0) {
        report(
          "error",
          `Field 'required_version' must be a non-empty string (e.g. ">=3.1.0")`,
          "required_version"
        );
      }
    }

    const allowedFields = /* @__PURE__ */ new Set([
      "$schema",
      "guideline_patterns",
      "eval_patterns",
      "required_version",
      "execution"
    ]);
    const unexpectedFields = Object.keys(config).filter((key) => !allowedFields.has(key));
    if (unexpectedFields.length > 0) {
      report("warning", `Unexpected fields: ${unexpectedFields.join(", ")}`);
    }

    return finish(errors.every((e) => e.severity !== "error"));
  } catch (error) {
    // A thrown value is not guaranteed to be an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    report("error", `Failed to parse config file: ${reason}`);
    return finish(false);
  }
}
3131
/**
 * Tells whether `value` is a non-null, non-array object.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isObject3(value) {
  const isObjectLike = typeof value === "object" && value !== null;
  return isObjectLike ? !Array.isArray(value) : false;
}
3134
/**
 * Validates the file references found in the `input` and `expected_output`
 * messages of every test case in an eval file.
 *
 * Returns early with a single error when no git root is found. Returns an
 * empty list when the file cannot be read or parsed, or when it has no
 * array of cases; this function does not report those conditions.
 *
 * @param {string} evalFilePath - Path to the eval file.
 * @returns {Promise<Array<object>>} The collected findings.
 */
async function validateFileReferences(evalFilePath) {
  const errors = [];
  const absolutePath = path42.resolve(evalFilePath);
  const gitRoot = await findGitRoot(absolutePath);
  if (!gitRoot) {
    errors.push({
      severity: "error",
      filePath: absolutePath,
      message: "Cannot validate file references: git repository root not found"
    });
    return errors;
  }
  const searchRoots = buildSearchRoots(absolutePath, gitRoot);

  let parsed;
  try {
    parsed = parse5(await readFile5(absolutePath, "utf8"));
  } catch {
    return errors;
  }
  if (!isObject3(parsed)) {
    return errors;
  }

  // `tests` wins; `eval_cases` and `evalcases` are used only while the
  // earlier keys are undefined.
  const cases = [parsed.tests, parsed.eval_cases, parsed.evalcases].find(
    (candidate) => candidate !== void 0
  );
  if (!Array.isArray(cases)) {
    return errors;
  }

  for (const [index, evalCase] of cases.entries()) {
    if (!isObject3(evalCase)) {
      continue;
    }
    for (const field of ["input", "expected_output"]) {
      const messages = evalCase[field];
      if (Array.isArray(messages)) {
        await validateMessagesFileRefs(
          messages,
          `tests[${index}].${field}`,
          searchRoots,
          absolutePath,
          errors
        );
      }
    }
  }
  return errors;
}
3195
/**
 * Checks every `type: "file"` content item inside a list of messages.
 *
 * For each such item the referenced path must be a string, must resolve via
 * `resolveFileReference`, and must be readable; an empty (whitespace-only)
 * file yields a warning. Messages whose content is not an array are skipped.
 *
 * @param {Array<unknown>} messages - Messages to inspect.
 * @param {string} location - Location prefix of the message list.
 * @param {unknown} searchRoots - Passed through to `resolveFileReference`.
 * @param {string} filePath - Path reported on every finding.
 * @param {Array<object>} errors - Accumulator that receives the findings.
 */
async function validateMessagesFileRefs(messages, location, searchRoots, filePath, errors) {
  const note = (severity, where, message) => {
    errors.push({ severity, filePath, location: where, message });
  };
  for (const [msgIndex, entry] of messages.entries()) {
    if (!isObject3(entry)) {
      continue;
    }
    const parts = entry.content;
    // Plain string content (or anything that is not a list) carries no refs.
    if (!Array.isArray(parts)) {
      continue;
    }
    for (const [partIndex, part] of parts.entries()) {
      if (!isObject3(part) || part.type !== "file") {
        continue;
      }
      const where = `${location}[${msgIndex}].content[${partIndex}]`;
      const reference = part.value;
      if (typeof reference !== "string") {
        note("error", `${where}.value`, "File reference must have a 'value' field with the file path");
        continue;
      }
      const { resolvedPath } = await resolveFileReference(reference, searchRoots);
      if (!resolvedPath) {
        note("error", where, `Referenced file not found: ${reference}`);
        continue;
      }
      try {
        const text = await readFile5(resolvedPath, "utf8");
        if (text.trim().length === 0) {
          note("warning", where, `Referenced file is empty: ${reference}`);
        }
      } catch (error) {
        note("error", where, `Cannot read referenced file: ${reference} (${error.message})`);
      }
    }
  }
}
3258
+
3259
+ // src/commands/eval/targets.ts
3260
// ANSI SGR escape sequences used to colour validation output.
var ANSI_YELLOW2 = "\x1B[33m"; // yellow foreground (warnings)
var ANSI_RED2 = "\x1B[31m"; // red foreground (errors)
var ANSI_RESET2 = "\x1B[0m"; // reset all attributes
3263
/**
 * Reports whether standard output is attached to a terminal.
 *
 * @returns {boolean} `process.stdout.isTTY`, or false when it is unset.
 */
function isTTY() {
  const { isTTY: attached } = process.stdout;
  return attached ?? false;
}
3266
/**
 * Reads the `target` declared in a test suite file's metadata.
 *
 * @param {string} testFilePath - Path to the test suite file.
 * @returns {Promise<unknown>} The `target` property of the metadata.
 */
async function readTestSuiteTarget(testFilePath) {
  const { target } = await readTestSuiteMetadata(testFilePath);
  return target;
}
3270
/**
 * Chooses which target name to use and records where it came from.
 *
 * Precedence: a CLI name other than "default", then a non-empty name from
 * the test file, then the literal "default". Names are trimmed first.
 *
 * @param {{cliTargetName?: string, fileTargetName?: string}} options
 * @returns {{name: string, source: "cli" | "test-file" | "default"}}
 */
function pickTargetName(options) {
  const fromCli = options.cliTargetName?.trim();
  const fromFile = options.fileTargetName?.trim();
  const cliIsExplicit = Boolean(fromCli) && fromCli !== "default";
  if (cliIsExplicit) {
    return { name: fromCli, source: "cli" };
  }
  if (fromFile) {
    return { name: fromFile, source: "test-file" };
  }
  return { name: "default", source: "default" };
}
3281
/**
 * Prints one group of targets-file findings (all warnings or all errors).
 * Does nothing when `issues` is empty.
 *
 * @param {Array<object>} issues - Findings of a single severity.
 * @param {string} heading - Heading line printed before the findings.
 * @param {string} color - ANSI colour sequence used when stdout is a TTY.
 * @param {string} glyph - Marker printed in front of each finding.
 * @param {(line: string) => void} log - Output function.
 */
function printTargetsFileIssues(issues, heading, color, glyph, log) {
  if (issues.length === 0) {
    return;
  }
  const useColors = isTTY();
  log(`\n${heading}`);
  for (const issue of issues) {
    const location = issue.location ? ` [${issue.location}]` : "";
    const prefix = useColors ? `${color} ${glyph}${ANSI_RESET2}` : ` ${glyph}`;
    const message = useColors ? `${color}${issue.message}${ANSI_RESET2}` : issue.message;
    log(`${prefix}${location} ${message}`);
  }
}

/**
 * Locates the targets file, validates it, prints its findings and loads the
 * target definitions. Throws when the file has validation errors.
 *
 * @param {object} options - Needs `explicitTargetsPath`, `testFilePath`,
 *   `repoRoot` and `cwd`.
 * @returns {Promise<{targetsFilePath: string, definitions: Array<object>}>}
 */
async function loadValidatedTargetDefinitions(options) {
  const targetsFilePath = await discoverTargetsFile({
    explicitPath: options.explicitTargetsPath,
    testFilePath: options.testFilePath,
    repoRoot: options.repoRoot,
    cwd: options.cwd
  });
  const validationResult = await validateTargetsFile(targetsFilePath);

  const warnings = validationResult.errors.filter((e) => e.severity === "warning");
  printTargetsFileIssues(
    warnings,
    `Warnings in ${targetsFilePath}:`,
    ANSI_YELLOW2,
    "\u26A0",
    (line) => console.warn(line)
  );
  if (warnings.length > 0) {
    console.warn("");
  }

  const errors = validationResult.errors.filter((e) => e.severity === "error");
  printTargetsFileIssues(
    errors,
    `Errors in ${targetsFilePath}:`,
    ANSI_RED2,
    "\u2717",
    (line) => console.error(line)
  );
  if (errors.length > 0) {
    throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
  }

  const definitions = await readTargetDefinitions(targetsFilePath);
  return { targetsFilePath, definitions };
}

/**
 * Finds a target definition by name or throws listing the available names.
 */
function requireTargetDefinition(definitions, name, targetsFilePath) {
  const targetDefinition = definitions.find((definition) => definition.name === name);
  if (!targetDefinition) {
    const available = listTargetNames(definitions).join(", ");
    throw new Error(
      `Target '${name}' not found in ${targetsFilePath}. Available targets: ${available}`
    );
  }
  return targetDefinition;
}

/**
 * Builds the mock target that stands in for a real one during a dry run.
 */
function createDryRunMockTarget(targetDefinition, options) {
  return {
    kind: "mock",
    name: `${targetDefinition.name}-dry-run`,
    graderTarget: void 0,
    config: {
      response: '{"answer":"Mock dry-run response"}',
      delayMs: options.dryRunDelay,
      delayMinMs: options.dryRunDelayMin,
      delayMaxMs: options.dryRunDelayMax
    }
  };
}

/**
 * Resolves a target definition, or the dry-run mock when `options.dryRun`
 * is set. Resolution failures are rethrown with the target name attached.
 */
function resolveSelectedTarget(targetDefinition, name, options) {
  if (options.dryRun) {
    return createDryRunMockTarget(targetDefinition, options);
  }
  try {
    return resolveTargetDefinition(targetDefinition, options.env, options.testFilePath);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to resolve target '${name}': ${message}`);
  }
}

/**
 * Selects and resolves the single target for an eval run.
 *
 * The target name comes from the CLI, then the test file, then "default"
 * (see `pickTargetName`). With `dryRun` set, a mock target is returned
 * instead of the resolved definition.
 *
 * @param {object} options - `testFilePath`, `repoRoot`, `cwd`,
 *   `explicitTargetsPath`, `cliTargetName`, `dryRun`, `dryRunDelay`,
 *   `dryRunDelayMin`, `dryRunDelayMax`, `env`.
 * @returns {Promise<{definitions: Array<object>, resolvedTarget: object, targetName: string, targetSource: string, targetsFilePath: string}>}
 * @throws {Error} When the targets file is invalid, the target is missing,
 *   or the target cannot be resolved.
 */
async function selectTarget(options) {
  const { targetsFilePath, definitions } = await loadValidatedTargetDefinitions(options);
  const fileTargetName = await readTestSuiteTarget(options.testFilePath);
  const targetChoice = pickTargetName({
    cliTargetName: options.cliTargetName,
    fileTargetName
  });
  const targetDefinition = requireTargetDefinition(definitions, targetChoice.name, targetsFilePath);
  const resolvedTarget = resolveSelectedTarget(targetDefinition, targetChoice.name, options);
  return {
    definitions,
    resolvedTarget,
    targetName: targetChoice.name,
    targetSource: targetChoice.source,
    targetsFilePath
  };
}

/**
 * Selects and resolves several targets named in `options.targetNames`.
 *
 * Targets are processed in order; the first missing or unresolvable target
 * aborts with an error. Every result is tagged with source "cli".
 *
 * @param {object} options - Same fields as `selectTarget` (without
 *   `cliTargetName`) plus `targetNames`.
 * @returns {Promise<Array<{definitions: Array<object>, resolvedTarget: object, targetName: string, targetSource: "cli", targetsFilePath: string}>>}
 * @throws {Error} When the targets file is invalid, a target is missing,
 *   or a target cannot be resolved.
 */
async function selectMultipleTargets(options) {
  const { targetsFilePath, definitions } = await loadValidatedTargetDefinitions(options);
  const results = [];
  for (const name of options.targetNames) {
    const targetDefinition = requireTargetDefinition(definitions, name, targetsFilePath);
    const resolvedTarget = resolveSelectedTarget(targetDefinition, name, options);
    results.push({
      definitions,
      resolvedTarget,
      targetName: name,
      targetSource: "cli",
      targetsFilePath
    });
  }
  return results;
}
3466
+
3467
+ // src/commands/eval/run-eval.ts
3468
// Default number of workers for an eval run.
var DEFAULT_WORKERS = 3;
3469
/**
 * Coerces an option to a strict boolean: only the literal `true` is true.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function normalizeBoolean(value) {
  return Object.is(value, true);
}
3472
/**
 * Returns the trimmed string, or undefined for non-strings and for strings
 * that are empty after trimming.
 *
 * @param {unknown} value
 * @returns {string | undefined}
 */
function normalizeString(value) {
  const cleaned = typeof value === "string" ? value.trim() : "";
  return cleaned.length === 0 ? void 0 : cleaned;
}
3479
/**
 * Replaces every "{timestamp}" in `value` with the current time as an ISO
 * string whose ":" and "." characters are turned into "-". Strings without
 * the placeholder are returned unchanged.
 *
 * @param {string} value
 * @returns {string}
 */
function resolveTimestampPlaceholder(value) {
  const token = "{timestamp}";
  if (value.includes(token)) {
    const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
    return value.split(token).join(stamp);
  }
  return value;
}
3486
/**
 * Reads a numeric option, falling back when the value is unusable.
 *
 * @param {unknown} value - Raw option value: a number, or a string parsed as a base-10 integer.
 * @param {number} fallback - Returned when `value` is neither a finite number
 *   nor a string that `Number.parseInt` can parse.
 * @returns {number} The finite number, the parsed integer, or `fallback`.
 */
function normalizeNumber(value, fallback) {
  switch (typeof value) {
    case "number":
      return Number.isFinite(value) ? value : fallback;
    case "string": {
      const parsed = Number.parseInt(value, 10);
      return Number.isNaN(parsed) ? fallback : parsed;
    }
    default:
      return fallback;
  }
}
3498
/**
 * Reads a numeric option that may legitimately be absent.
 *
 * @param {unknown} value - Raw option value: a number, or a string parsed as a base-10 integer.
 * @returns {number | undefined} The finite number or parsed integer, or
 *   `undefined` when `value` cannot be interpreted as one.
 */
function normalizeOptionalNumber(value) {
  const kind = typeof value;
  if (kind === "number") {
    return Number.isFinite(value) ? value : void 0;
  }
  if (kind !== "string") {
    return void 0;
  }
  const parsed = Number.parseInt(value, 10);
  return Number.isNaN(parsed) ? void 0 : parsed;
}
3510
/**
 * Accepts only the recognized workspace modes.
 *
 * @param {unknown} value - Candidate mode.
 * @returns {"pooled" | "temp" | "static" | undefined} `value` when it is one of
 *   the three known modes, otherwise `undefined`.
 */
function normalizeWorkspaceMode(value) {
  const knownModes = ["pooled", "temp", "static"];
  return knownModes.includes(value) ? value : void 0;
}
3513
/**
 * Merges raw CLI options with the TS config and the YAML `execution` section
 * into the normalized options object consumed by the eval command.
 *
 * For most settings a CLI value wins over configuration. Workspace settings
 * fall back to the YAML execution section; format, workers, timeout, retries,
 * cache and output directory fall back to the TS config.
 *
 * @param {object} rawOptions - Options as parsed from the command line.
 * @param {object | null | undefined} config - Loaded TS config, if any.
 * @param {object | undefined} yamlExecution - `execution` section of the YAML config, if any.
 * @returns {object} Normalized options.
 * @throws {Error} When `--workspace-path` is combined with a `--workspace-mode`
 *   other than `static`.
 */
function normalizeOptions(rawOptions, config, yamlExecution) {
  const isNonBlankString = (v) => typeof v === "string" && v.trim().length > 0;

  // Anything other than an explicit "yaml" collapses to "jsonl".
  const formatStr = normalizeString(rawOptions.outputFormat) ?? config?.output?.format ?? "jsonl";
  const format = formatStr === "yaml" ? "yaml" : "jsonl";

  // 0 stands for "not configured" and is mapped to undefined in the result.
  const workers = normalizeOptionalNumber(rawOptions.workers) ?? config?.execution?.workers ?? 0;

  const rawOutputPaths = rawOptions.output;
  const outputPaths = Array.isArray(rawOutputPaths) ? rawOutputPaths.filter(isNonBlankString) : [];

  const rawTarget = rawOptions.target;
  let cliTargets = [];
  let singleTarget;
  if (Array.isArray(rawTarget)) {
    // Trim entries so the array form matches the single-string form below.
    cliTargets = rawTarget.filter(isNonBlankString).map((v) => v.trim());
    singleTarget = cliTargets.length === 1 ? cliTargets[0] : void 0;
  } else if (typeof rawTarget === "string") {
    const trimmed = rawTarget.trim();
    // The literal "default" is treated the same as passing no target.
    if (trimmed.length > 0 && trimmed !== "default") {
      cliTargets = [trimmed];
      singleTarget = trimmed;
    }
  }

  const cliAgentTimeout = normalizeOptionalNumber(rawOptions.agentTimeout);
  // The config stores milliseconds; the normalized option is in seconds.
  const configAgentTimeoutSeconds =
    config?.execution?.agentTimeoutMs != null ? config.execution.agentTimeoutMs / 1e3 : void 0;

  const cliMaxRetries = normalizeOptionalNumber(rawOptions.maxRetries);
  const configMaxRetries = config?.execution?.maxRetries;

  const cliCache = normalizeBoolean(rawOptions.cache);
  const cliNoCache = normalizeBoolean(rawOptions.noCache);
  const configCacheEnabled = config?.cache?.enabled;
  // --cache always enables; otherwise the config may enable it unless --no-cache was given.
  const resolvedCache = cliCache || (!cliNoCache && configCacheEnabled === true);

  const cliOut = normalizeString(rawOptions.out);
  const configOut = config?.output?.dir;

  const cliWorkspacePath = normalizeString(rawOptions.workspacePath);
  const cliWorkspaceModeRaw = normalizeString(rawOptions.workspaceMode);
  // Validate the trimmed value so surrounding whitespace does not turn a valid
  // mode into an unrecognized one.
  const cliWorkspaceMode = normalizeWorkspaceMode(cliWorkspaceModeRaw);
  if (cliWorkspacePath && cliWorkspaceModeRaw && cliWorkspaceMode !== "static") {
    throw new Error("--workspace-path requires --workspace-mode=static (or omit --workspace-mode)");
  }
  const yamlWorkspaceMode = normalizeWorkspaceMode(yamlExecution?.workspace_mode);
  const yamlWorkspacePath = normalizeString(yamlExecution?.workspace_path);
  const workspacePath = cliWorkspacePath ?? yamlWorkspacePath;
  // An explicit CLI workspace path implies static mode.
  const workspaceMode = cliWorkspacePath ? "static" : cliWorkspaceMode ?? yamlWorkspaceMode;

  return {
    target: singleTarget,
    cliTargets,
    targetsPath: normalizeString(rawOptions.targets),
    filter: normalizeString(rawOptions.filter),
    workers: workers > 0 ? workers : void 0,
    outPath: cliOut ?? configOut,
    outputPaths,
    format,
    dryRun: normalizeBoolean(rawOptions.dryRun),
    dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
    dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0),
    dryRunDelayMax: normalizeNumber(rawOptions.dryRunDelayMax, 0),
    agentTimeoutSeconds: cliAgentTimeout ?? configAgentTimeoutSeconds,
    maxRetries: cliMaxRetries ?? configMaxRetries ?? 2,
    cache: resolvedCache,
    noCache: cliNoCache,
    // Boolean OR: config `true` cannot be overridden to `false` from CLI.
    // Intentional — there are no --no-verbose / --no-keep-workspaces flags.
    verbose:
      normalizeBoolean(rawOptions.verbose) ||
      yamlExecution?.verbose === true ||
      config?.execution?.verbose === true,
    // Precedence: CLI > YAML config > TS config. Only config-provided paths
    // have their {timestamp} placeholder expanded.
    otelFile:
      normalizeString(rawOptions.otelFile) ??
      (yamlExecution?.otel_file ? resolveTimestampPlaceholder(yamlExecution.otel_file) : void 0) ??
      (config?.execution?.otelFile ? resolveTimestampPlaceholder(config.execution.otelFile) : void 0),
    traceFile:
      normalizeString(rawOptions.traceFile) ??
      (yamlExecution?.trace_file ? resolveTimestampPlaceholder(yamlExecution.trace_file) : void 0) ??
      (config?.execution?.traceFile ? resolveTimestampPlaceholder(config.execution.traceFile) : void 0),
    exportOtel: normalizeBoolean(rawOptions.exportOtel),
    otelBackend: normalizeString(rawOptions.otelBackend),
    otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
    otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
    retryErrors: normalizeString(rawOptions.retryErrors),
    workspaceMode,
    workspacePath,
    benchmarkJson: normalizeString(rawOptions.benchmarkJson),
    artifacts: normalizeString(rawOptions.artifacts),
    graderTarget: normalizeString(rawOptions.graderTarget),
    model: normalizeString(rawOptions.model)
  };
}
3595
/**
 * Verifies that a path is reachable on disk.
 *
 * @param {string} filePath - Path to check.
 * @param {string} description - Human-readable label used as the error message prefix.
 * @returns {Promise<void>} Resolves when the existence check succeeds.
 * @throws {Error} `"<description> not found: <filePath>"` when the check fails;
 *   the underlying filesystem error is attached as `cause`.
 */
async function ensureFileExists(filePath, description) {
  try {
    await access4(filePath, constants4.F_OK);
  } catch (cause) {
    // Keep the original error so failures other than a missing file stay diagnosable.
    throw new Error(`${description} not found: ${filePath}`, { cause });
  }
}
3602
/**
 * Builds the default results path `<cwd>/.agentv/results/eval_<timestamp><ext>`.
 *
 * @param {string} cwd - Directory that contains the `.agentv` folder.
 * @param {string} format - Output format passed to `getDefaultExtension` to pick the extension.
 * @returns {string} Path of the results file, stamped with the current time.
 */
function buildDefaultOutputPath(cwd, format) {
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const fileName = `eval_${stamp}${getDefaultExtension(format)}`;
  return path12.join(cwd, ".agentv", "results", fileName);
}
3608
/**
 * Wraps a `ProgressDisplay` in the small reporter interface used by the eval runner.
 *
 * @param {number} maxWorkers - Forwarded to the `ProgressDisplay` constructor.
 * @param {object} options - Forwarded to the `ProgressDisplay` constructor.
 * @returns {object} Reporter exposing `isInteractive`, `start`, `setTotal`,
 *   `update`, `finish` and `addLogPaths`, each delegating to the display.
 */
function createProgressReporter(maxWorkers, options) {
  const display = new ProgressDisplay(maxWorkers, options);
  // Captured once at creation time, not re-evaluated on each access.
  const isInteractive = display.isInteractiveMode();
  return {
    isInteractive,
    start() {
      return display.start();
    },
    setTotal(total) {
      return display.setTotalTests(total);
    },
    update(workerId, progress) {
      // The explicit workerId argument overrides any workerId inside `progress`.
      return display.updateWorker({ ...progress, workerId });
    },
    finish() {
      return display.finish();
    },
    addLogPaths(paths, provider) {
      return display.addLogPaths(paths, provider);
    }
  };
}
3619
/**
 * Builds the key that identifies one eval case within one test file.
 *
 * @param {string} testFilePath - Test file path; resolved to an absolute path.
 * @param {string} evalId - Identifier of the eval case.
 * @returns {string} `<absolute path>::<evalId>`.
 */
function makeEvalKey(testFilePath, evalId) {
  const absolutePath = path12.resolve(testFilePath);
  const separator = "::";
  return `${absolutePath}${separator}${evalId}`;
}
3622
/**
 * Creates a tracker that hands out stable, sequential display ids per eval key.
 *
 * @returns {{ getOrAssign(evalKey: string): number }} Tracker whose
 *   `getOrAssign` returns the id already given to `evalKey`, or assigns the
 *   next id (starting at 1) on first use.
 */
function createDisplayIdTracker() {
  const assignedIds = /* @__PURE__ */ new Map();
  return {
    getOrAssign(evalKey) {
      if (!assignedIds.has(evalKey)) {
        // Entries are never removed, so size + 1 is always the next unused id.
        assignedIds.set(evalKey, assignedIds.size + 1);
      }
      return assignedIds.get(evalKey);
    }
  };
}
3637
/**
 * Applies the verbose flag to a selection whose resolved target is of kind `cli`.
 *
 * @param {object} selection - Target selection containing `resolvedTarget`.
 * @param {boolean} cliVerbose - Verbose flag to write into the target config.
 * @returns {object} The same `selection` object for non-`cli` targets;
 *   otherwise a shallow copy whose `resolvedTarget.config.verbose` is
 *   `cliVerbose`. The input is never mutated.
 */
function applyVerboseOverride(selection, cliVerbose) {
  const target = selection.resolvedTarget;
  if (target.kind === "cli") {
    const config = { ...target.config, verbose: cliVerbose };
    return { ...selection, resolvedTarget: { ...target, config } };
  }
  return selection;
}
3653
/**
 * Loads one eval file and resolves the target(s) it should run against.
 *
 * Target names given on the CLI take precedence over the suite's own
 * `targets`. More than one name resolves through `selectMultipleTargets`;
 * otherwise a single target is resolved through `selectTarget`.
 *
 * @param {object} params
 * @param {string} params.testFilePath - Eval file to load.
 * @param {string} params.repoRoot - Repository root passed to the loaders.
 * @param {string} params.cwd - Working directory passed to target selection.
 * @param {object} params.options - Normalized eval options.
 * @returns {Promise<object>} The suite's eval ids and cases, the labeled
 *   target selections, and the suite-level trials, target, cache, budget and
 *   fail-on-error settings.
 */
async function prepareFileMetadata(params) {
  const { testFilePath, repoRoot, cwd, options } = params;
  const { verbose, dryRun } = options;

  await ensureFileExists(testFilePath, "Test file");
  await loadEnvFromHierarchy({ testFilePath, repoRoot, verbose });
  const suite = await loadTestSuite(testFilePath, repoRoot, { verbose, filter: options.filter });

  const suiteTargets = suite.targets;
  let targetNames = [];
  if (options.cliTargets.length > 0) {
    targetNames = options.cliTargets;
  } else if (suiteTargets && suiteTargets.length > 0) {
    targetNames = suiteTargets;
  }

  // Pairs a resolved selection with the label shown next to it in progress output.
  const withLabel = (selection) => {
    const kind = selection.resolvedTarget.kind;
    const providerLabel = dryRun ? `${kind} (dry-run)` : kind;
    return {
      selection,
      inlineTargetLabel: `${selection.targetName} [provider=${providerLabel}]`
    };
  };

  // Arguments common to both selection helpers.
  const sharedArgs = {
    testFilePath,
    repoRoot,
    cwd,
    explicitTargetsPath: options.targetsPath,
    dryRun,
    dryRunDelay: options.dryRunDelay,
    dryRunDelayMin: options.dryRunDelayMin,
    dryRunDelayMax: options.dryRunDelayMax,
    env: process.env
  };

  let selections;
  if (targetNames.length > 1) {
    const resolved = await selectMultipleTargets({ ...sharedArgs, targetNames });
    selections = resolved.map((sel) => withLabel(sel));
  } else {
    const cliTargetName = targetNames.length === 1 ? targetNames[0] : options.target;
    selections = [withLabel(await selectTarget({ ...sharedArgs, cliTargetName }))];
  }

  return {
    evalIds: suite.tests.map((value) => value.id),
    evalCases: suite.tests,
    selections,
    trialsConfig: suite.trials,
    suiteTargets,
    yamlCache: suite.cacheConfig?.enabled,
    yamlCachePath: suite.cacheConfig?.cachePath,
    totalBudgetUsd: suite.totalBudgetUsd,
    failOnError: suite.failOnError
  };
}
3730
/**
 * Runs `task` over `items` with at most `limit` tasks in flight at once.
 *
 * Items are started in array order. A rejected task rejects the returned
 * promise; workers that are still running keep draining the remaining items.
 *
 * @param {Array<*>} items - Work items.
 * @param {number} limit - Maximum concurrency. Values below 1, `NaN`, or
 *   non-numeric values are treated as 1; values above `items.length`
 *   (including `Infinity`) are clamped to `items.length`.
 * @param {(item: *) => Promise<*>} task - Async function invoked once per item.
 * @returns {Promise<void>} Resolves when every item has been processed.
 */
async function runWithLimit(items, limit, task) {
  if (items.length === 0) {
    return;
  }
  // Math.max(1, NaN) is NaN, which would create zero workers and silently skip all items.
  const numericLimit = Number(limit);
  const requested = Number.isNaN(numericLimit) ? 1 : Math.floor(numericLimit);
  const workerCount = Math.min(items.length, Math.max(1, requested));

  let index = 0;
  const worker = async () => {
    // `index` is shared by all workers; each claims the next item before awaiting.
    while (index < items.length) {
      const current = items[index];
      index += 1;
      await task(current);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
}
3742
/**
 * Runs the eval cases of one test file against one resolved target.
 *
 * Resolves the worker count, caps VSCode providers at a single worker, invokes
 * the evaluation runner, appends each result to the output writer and forwards
 * progress events to the progress reporter.
 *
 * @param {object} params - Test file, normalized options, shared
 *   writers/exporters/trackers, the target selection and its eval cases.
 * @returns {Promise<{ results: Array<object> }>} Copy of the results returned by the runner.
 * @throws {Error} When the test file is missing or the resolved worker count is outside 1..50.
 */
async function runSingleEvalFile(params) {
  const {
    testFilePath,
    repoRoot,
    options,
    outputWriter,
    otelExporter,
    cache,
    evaluationRunner,
    workersOverride,
    progressReporter,
    seenEvalCases,
    displayIdTracker,
    selection,
    inlineTargetLabel,
    evalCases,
    trialsConfig,
    matrixMode,
    totalBudgetUsd,
    failOnError
  } = params;
  const { verbose, dryRun } = options;
  const targetName = selection.targetName;

  await ensureFileExists(testFilePath, "Test file");

  const effective = applyVerboseOverride(selection, verbose);
  const target = effective.resolvedTarget;
  const providerLabel = dryRun ? `${target.kind} (dry-run)` : target.kind;
  // The interactive display already shows the target inline, so only log here
  // when it is off or verbose output was requested.
  if (!progressReporter.isInteractive || verbose) {
    console.log(
      verbose
        ? `Using target (${effective.targetSource}): ${effective.targetName} [provider=${providerLabel}] via ${effective.targetsFilePath}`
        : `Using target: ${inlineTargetLabel}`
    );
  }

  let agentTimeoutMs;
  if (options.agentTimeoutSeconds != null) {
    agentTimeoutMs = Math.max(0, options.agentTimeoutSeconds) * 1e3;
  }

  // Precedence: per-file override > options.workers > target-defined workers > default.
  let resolvedWorkers = workersOverride ?? options.workers ?? target.workers ?? DEFAULT_WORKERS;
  if (resolvedWorkers < 1 || resolvedWorkers > 50) {
    throw new Error(`Workers must be between 1 and 50, got: ${resolvedWorkers}`);
  }

  const isVSCodeProvider = target.kind === "vscode" || target.kind === "vscode-insiders";
  if (isVSCodeProvider) {
    if (resolvedWorkers > 1) {
      console.warn(
        `Warning: VSCode providers require window focus. Limiting workers from ${resolvedWorkers} to 1 to prevent race conditions.`
      );
      resolvedWorkers = 1;
    }
    if (!dryRun) {
      await ensureVSCodeSubagents({
        kind: target.kind,
        count: resolvedWorkers,
        verbose,
        vscodeCmd: target.config.executable
      });
    }
  }

  const streamingObserver = otelExporter?.createStreamingObserver() ?? null;

  // Caching needs a cache instance and is skipped when the temperature check says so.
  let useCache = false;
  if (cache) {
    if (shouldSkipCacheForTemperature(target.config)) {
      if (verbose) {
        console.log("Cache skipped: target temperature > 0");
      }
    } else {
      useCache = true;
    }
  }

  // In matrix mode the same test id runs on several targets, so it is qualified.
  const qualifyTestId = (testId) => (matrixMode ? `${testId}@${targetName}` : testId);

  const handleResult = async (result) => {
    streamingObserver?.finalizeEvalCase(result.score, result.error);
    // `output` is left out of what gets written to the result file.
    const { output: _output, ...resultWithoutTrace } = result;
    await outputWriter.append(resultWithoutTrace);
    if (!otelExporter || streamingObserver) {
      return;
    }
    try {
      await otelExporter.exportResult(result);
    } catch (err) {
      // Export failures never fail the run; they are only reported in verbose mode.
      if (verbose) {
        console.warn(
          `OTel export warning: ${err instanceof Error ? err.message : String(err)}`
        );
      }
    }
  };

  const handleProgress = async (event) => {
    const displayTestId = qualifyTestId(event.testId);
    const evalKey = makeEvalKey(testFilePath, displayTestId);
    if (event.status === "pending" && !seenEvalCases.has(evalKey)) {
      seenEvalCases.add(evalKey);
      progressReporter.setTotal(seenEvalCases.size);
    }
    const displayId = displayIdTracker.getOrAssign(evalKey);
    if (event.status === "running" && streamingObserver) {
      streamingObserver.startEvalCase(event.testId, targetName, testFilePath);
    }
    progressReporter.update(displayId, {
      workerId: displayId,
      testId: displayTestId,
      status: event.status,
      startedAt: event.startedAt,
      completedAt: event.completedAt,
      error: event.error,
      targetLabel: inlineTargetLabel
    });
  };

  const results = await evaluationRunner({
    testFilePath,
    repoRoot,
    target,
    targets: effective.definitions,
    env: process.env,
    maxRetries: Math.max(0, options.maxRetries),
    agentTimeoutMs,
    cache,
    useCache,
    evalCases,
    verbose,
    maxConcurrency: resolvedWorkers,
    workspaceMode: options.workspaceMode,
    workspacePath: options.workspacePath,
    trials: trialsConfig,
    totalBudgetUsd,
    failOnError,
    graderTarget: options.graderTarget,
    model: options.model,
    streamCallbacks: streamingObserver?.getStreamCallbacks(),
    onResult: handleResult,
    onProgress: handleProgress
  });
  return { results: [...results] };
}
3868
/**
 * Entry point of the `eval` command: loads configuration, prepares writers and
 * exporters, runs every eval file against its target(s) and prints a summary.
 *
 * TypeScript/JavaScript eval files are imported for their side effects; all
 * other files go through `prepareFileMetadata` and `runSingleEvalFile`. The
 * output writer and OTel exporter are released on every exit path once
 * created, including early returns and failures while preparing files.
 *
 * @param {object} input
 * @param {string[]} input.testFiles - Eval files to run.
 * @param {object} input.rawOptions - Raw CLI options.
 * @returns {Promise<void>}
 * @throws {Error} On invalid option combinations, missing files, an invalid
 *   workspace path, or when no test matches the filters.
 */
async function runEvalCommand(input) {
  const cwd = process.cwd();
  let config = null;
  try {
    config = await loadTsConfig(cwd);
  } catch (err) {
    // A broken TS config is reported but does not stop the run.
    console.warn(
      `Warning: Failed to load agentv config: ${err instanceof Error ? err.message : String(err)}`
    );
  }
  const repoRoot = await findRepoRoot(cwd);
  // NOTE(review): "_" looks like a placeholder file name so loadConfig starts
  // its lookup from cwd — confirm against loadConfig.
  const yamlConfig = await loadConfig(path12.join(cwd, "_"), repoRoot);
  if (yamlConfig?.required_version) {
    await enforceRequiredVersion(yamlConfig.required_version, {
      strict: normalizeBoolean(input.rawOptions.strict)
    });
  }
  let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
  if (options.graderTarget === "agentv" && !options.model) {
    throw new Error("--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)");
  }

  // --retry-errors: rerun only the tests that errored previously and carry the
  // other results over into the new output.
  let retryNonErrorResults;
  if (options.retryErrors) {
    const retryPath = path12.resolve(options.retryErrors);
    await ensureFileExists(retryPath, "Retry-errors JSONL file");
    const errorIds = await loadErrorTestIds(retryPath);
    if (errorIds.length === 0) {
      console.log("No execution errors found in the previous output. Nothing to retry.");
      return;
    }
    console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(", ")}`);
    const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(",")}}`;
    options = { ...options, filter: filterPattern };
    retryNonErrorResults = await loadNonErrorResults(retryPath);
  }

  if (options.workspacePath) {
    const resolvedWorkspace = path12.resolve(options.workspacePath);
    try {
      const { stat: stat2 } = await import("node:fs/promises");
      const stats = await stat2(resolvedWorkspace);
      if (!stats.isDirectory()) {
        throw new Error(`--workspace-path is not a directory: ${resolvedWorkspace}`);
      }
    } catch (err) {
      if (err.code === "ENOENT") {
        throw new Error(`--workspace-path does not exist: ${resolvedWorkspace}`);
      }
      throw err;
    }
    options = { ...options, workspacePath: resolvedWorkspace };
  }
  if (options.verbose) {
    console.log(`Repository root: ${repoRoot}`);
  }

  // OTel export is best-effort: any setup failure downgrades to "no exporter".
  let otelExporter = null;
  const useFileExport = !!(options.otelFile || options.traceFile);
  if (options.exportOtel || useFileExport) {
    try {
      const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-3BMOAU4X.js");
      let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
      let headers = {};
      if (options.otelBackend) {
        const preset = OTEL_BACKEND_PRESETS[options.otelBackend];
        if (preset) {
          endpoint = preset.endpoint;
          headers = preset.headers(process.env);
        } else {
          console.warn(`Unknown OTel backend preset: ${options.otelBackend}`);
        }
      }
      if (process.env.OTEL_EXPORTER_OTLP_HEADERS) {
        // Comma-separated key=value pairs; values may themselves contain "=".
        for (const pair of process.env.OTEL_EXPORTER_OTLP_HEADERS.split(",")) {
          const [key, ...rest] = pair.split("=");
          if (key) headers[key.trim()] = rest.join("=").trim();
        }
      }
      const captureContent = options.otelCaptureContent || process.env.AGENTV_OTEL_CAPTURE_CONTENT === "true";
      otelExporter = new OtelTraceExporter({
        endpoint,
        headers,
        captureContent,
        groupTurns: options.otelGroupTurns,
        otlpFilePath: options.otelFile ? path12.resolve(options.otelFile) : void 0,
        traceFilePath: options.traceFile ? path12.resolve(options.traceFile) : void 0
      });
      const initialized = await otelExporter.init();
      if (!initialized) {
        console.warn(
          "OTel export requested but @opentelemetry packages not available. Install them to enable export."
        );
        otelExporter = null;
      }
    } catch (err) {
      console.warn(
        `OTel export initialization failed: ${err instanceof Error ? err.message : String(err)}`
      );
      otelExporter = null;
    }
  }

  let outputWriter;
  const unsubscribers = [];
  try {
    const outputPath = options.outPath ? path12.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
    const extraOutputPaths = options.outputPaths.map((p) => path12.resolve(p));
    const uniqueOutputPaths = [...new Set([outputPath, ...extraOutputPaths])];
    if (uniqueOutputPaths.length === 1) {
      outputWriter = await createOutputWriter(outputPath, options.format);
      console.log(`Output path: ${outputPath}`);
    } else {
      outputWriter = await createMultiWriter(uniqueOutputPaths);
      console.log("Output paths:");
      for (const p of uniqueOutputPaths) {
        console.log(`  ${p}`);
      }
    }

    const resolvedTestFiles = input.testFiles.map((file) => path12.resolve(file));
    if (options.otelFile) {
      console.log(`OTLP JSON file: ${path12.resolve(options.otelFile)}`);
    }
    if (options.traceFile) {
      console.log(`Trace file: ${path12.resolve(options.traceFile)}`);
    }
    const evaluationRunner = await resolveEvaluationRunner();
    const allResults = [];
    const seenEvalCases = /* @__PURE__ */ new Set();
    const displayIdTracker = createDisplayIdTracker();
    const totalWorkers = options.workers ?? DEFAULT_WORKERS;

    // Script eval files are imported directly; everything else is a data-driven suite.
    const tsFiles = [];
    const yamlFiles = [];
    for (const testFilePath of resolvedTestFiles) {
      if (/\.(ts|js|mts|mjs)$/.test(testFilePath)) {
        tsFiles.push(testFilePath);
      } else {
        yamlFiles.push(testFilePath);
      }
    }

    // Only suite files go through the runner, so concurrency is derived from
    // their count, not from the count of all input files.
    const fileConcurrency = Math.min(Math.max(1, totalWorkers), Math.max(1, yamlFiles.length));
    const perFileWorkers = options.workers ? Math.max(1, Math.floor(totalWorkers / fileConcurrency)) : void 0;

    for (const tsFile of tsFiles) {
      await ensureFileExists(tsFile, "TypeScript eval file");
      await import(pathToFileURL(tsFile).href);
    }
    if (yamlFiles.length === 0 && tsFiles.length > 0) {
      return;
    }

    const fileMetadata = /* @__PURE__ */ new Map();
    for (const testFilePath of yamlFiles) {
      const meta = await prepareFileMetadata({
        testFilePath,
        repoRoot,
        cwd,
        options
      });
      fileMetadata.set(testFilePath, meta);
    }

    // Suite-level cache settings are taken from the first prepared file only.
    const firstMeta = fileMetadata.values().next().value;
    const yamlCachePath = firstMeta?.yamlCachePath;
    const cacheEnabled = shouldEnableCache({
      cliCache: options.cache,
      cliNoCache: options.noCache,
      yamlCache: firstMeta?.yamlCache
    });
    const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path12.resolve(yamlCachePath) : void 0) : void 0;
    if (cacheEnabled) {
      console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
    }

    const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
    let totalEvalCount = 0;
    for (const meta of fileMetadata.values()) {
      const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
      for (const test of meta.evalCases) {
        const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
        totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
      }
    }
    if (totalEvalCount === 0) {
      throw new Error("No tests matched the provided filters.");
    }

    const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
    progressReporter.start();
    progressReporter.setTotal(totalEvalCount);

    // Report each provider log file once; `seenPaths` may be shared between
    // subscriptions that should de-duplicate against each other.
    const trackLogPaths = (subscribe, provider, seenPaths = /* @__PURE__ */ new Set()) => {
      unsubscribers.push(
        subscribe((entry) => {
          if (!entry.filePath || seenPaths.has(entry.filePath)) {
            return;
          }
          seenPaths.add(entry.filePath);
          progressReporter.addLogPaths([entry.filePath], provider);
        })
      );
      return seenPaths;
    };
    trackLogPaths(subscribeToCodexLogEntries, "codex");
    trackLogPaths(subscribeToPiLogEntries, "pi");
    const seenCopilotLogPaths = trackLogPaths(subscribeToCopilotSdkLogEntries, "copilot");
    trackLogPaths(subscribeToCopilotCliLogEntries, "copilot", seenCopilotLogPaths);

    // Register every (file, target, test) combination as pending before anything runs.
    for (const [testFilePath, meta] of fileMetadata.entries()) {
      const qualify = meta.selections.length > 1;
      for (const { selection, inlineTargetLabel } of meta.selections) {
        for (const testId of meta.evalIds) {
          const displayTestId = qualify ? `${testId}@${selection.targetName}` : testId;
          const evalKey = makeEvalKey(testFilePath, displayTestId);
          seenEvalCases.add(evalKey);
          const displayId = displayIdTracker.getOrAssign(evalKey);
          progressReporter.update(displayId, {
            workerId: displayId,
            testId: displayTestId,
            status: "pending",
            targetLabel: inlineTargetLabel
          });
        }
      }
    }

    // Iterate the suite files only: script files have no prepared metadata.
    await runWithLimit(yamlFiles, fileConcurrency, async (testFilePath) => {
      const targetPrep = fileMetadata.get(testFilePath);
      if (!targetPrep) {
        throw new Error(`Missing metadata for ${testFilePath}`);
      }
      const matrixMode = targetPrep.selections.length > 1;
      const targetResults = await Promise.all(
        targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
          const targetName = selection.targetName;
          // In matrix mode a test that lists targets only runs on those targets.
          const applicableEvalCases = matrixMode ? targetPrep.evalCases.filter((test) => {
            if (test.targets && test.targets.length > 0) {
              return test.targets.includes(targetName);
            }
            return true;
          }) : targetPrep.evalCases;
          if (applicableEvalCases.length === 0) {
            return [];
          }
          const result = await runSingleEvalFile({
            testFilePath,
            cwd,
            repoRoot,
            options,
            outputWriter,
            otelExporter,
            cache,
            evaluationRunner,
            workersOverride: perFileWorkers,
            progressReporter,
            seenEvalCases,
            displayIdTracker,
            selection,
            inlineTargetLabel,
            evalCases: applicableEvalCases,
            trialsConfig: targetPrep.trialsConfig,
            matrixMode,
            totalBudgetUsd: targetPrep.totalBudgetUsd,
            failOnError: targetPrep.failOnError
          });
          return result.results;
        })
      );
      for (const results of targetResults) {
        allResults.push(...results);
      }
    });
    progressReporter.finish();

    if (retryNonErrorResults && retryNonErrorResults.length > 0) {
      for (const preserved of retryNonErrorResults) {
        await outputWriter.append(preserved);
      }
      allResults.push(...retryNonErrorResults);
      console.log(
        `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`
      );
    }

    const summary = calculateEvaluationSummary(allResults);
    console.log(formatEvaluationSummary(summary));
    if (isMatrixMode && allResults.length > 0) {
      console.log(formatMatrixSummary(allResults));
    }
    if (options.benchmarkJson && allResults.length > 0) {
      const benchmarkPath = path12.resolve(options.benchmarkJson);
      await writeBenchmarkJson(benchmarkPath, allResults);
      console.log(`Benchmark written to: ${benchmarkPath}`);
    }
    if (options.artifacts && allResults.length > 0) {
      const artifactsDir = path12.resolve(options.artifacts);
      const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
      const {
        gradingDir,
        timingPath,
        benchmarkPath: abp
      } = await writeArtifactsFromResults(allResults, artifactsDir, { evalFile });
      console.log(`Artifacts written to: ${artifactsDir}`);
      console.log(`  Grading:   ${gradingDir} (${allResults.length} files)`);
      console.log(`  Timing:    ${timingPath}`);
      console.log(`  Benchmark: ${abp}`);
    }

    const failedWithWorkspaces = allResults.filter(
      (r) => r.workspacePath && (r.error || r.score < 0.5)
    );
    if (failedWithWorkspaces.length > 0) {
      console.log("\nWorkspaces preserved for debugging:");
      for (const result of failedWithWorkspaces) {
        console.log(`  ${result.testId}: ${result.workspacePath}`);
      }
    }
    if (allResults.length > 0) {
      if (uniqueOutputPaths.length === 1) {
        console.log(`\nResults written to: ${outputPath}`);
      } else {
        console.log("\nResults written to:");
        for (const p of uniqueOutputPaths) {
          console.log(`  ${p}`);
        }
      }
    }
  } finally {
    for (const unsubscribe of unsubscribers) {
      unsubscribe();
    }
    if (outputWriter) {
      await outputWriter.close().catch(() => void 0);
    }
    if (otelExporter) {
      try {
        await otelExporter.shutdown();
      } catch {
        // Best-effort shutdown: a failing exporter must not mask the run's outcome.
      }
    }
  }
}
4215
/**
 * Picks the evaluation runner implementation.
 *
 * When the `AGENTEVO_CLI_EVAL_RUNNER` environment variable is set, the module
 * at that path (relative paths resolve against the current working directory)
 * is imported and its `runEvaluation` export is used; otherwise the built-in
 * `runEvaluation` is returned.
 *
 * @returns {Promise<Function>} The runner to invoke.
 * @throws {Error} When the override module has no `runEvaluation` function export.
 */
async function resolveEvaluationRunner() {
  const overridePath = process.env.AGENTEVO_CLI_EVAL_RUNNER;
  if (overridePath) {
    let resolved = overridePath;
    if (!path12.isAbsolute(overridePath)) {
      resolved = path12.resolve(process.cwd(), overridePath);
    }
    const { runEvaluation: candidate } = await import(pathToFileURL(resolved).href);
    if (typeof candidate === "function") {
      return candidate;
    }
    throw new Error(
      `Module '${resolved}' must export a 'runEvaluation' function to override the default implementation`
    );
  }
  return runEvaluation;
}
4231
+
4232
+ export {
4233
+ package_default,
4234
+ toSnakeCaseDeep,
4235
+ HtmlWriter,
4236
+ resolveEvalPaths,
4237
+ findRepoRoot,
4238
+ detectFileType,
4239
+ validateEvalFile,
4240
+ validateTargetsFile,
4241
+ validateConfigFile,
4242
+ validateFileReferences,
4243
+ TARGET_FILE_CANDIDATES,
4244
+ fileExists,
4245
+ selectTarget,
4246
+ runEvalCommand
4247
+ };
4248
+ //# sourceMappingURL=chunk-PCQA43SA.js.map