agentv 2.18.4 → 3.0.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. package/README.md +62 -36
  2. package/dist/agentv-provider-5CJVBBGG-2XVZBW7L.js +9 -0
  3. package/dist/{chunk-RMUVJ44Z.js → chunk-5WIB7A27.js} +598 -403
  4. package/dist/chunk-5WIB7A27.js.map +1 -0
  5. package/dist/chunk-6GSYTMXD.js +31520 -0
  6. package/dist/chunk-6GSYTMXD.js.map +1 -0
  7. package/dist/{chunk-KSUL3F3R.js → chunk-DY4ZDTTO.js} +1018 -140
  8. package/dist/chunk-DY4ZDTTO.js.map +1 -0
  9. package/dist/chunk-HF4X7ALN.js +24299 -0
  10. package/dist/chunk-HF4X7ALN.js.map +1 -0
  11. package/dist/{chunk-FV32QHPB.js → chunk-XOSNETAV.js} +1 -1
  12. package/dist/cli.js +5 -4
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{dist-EDQZMZH2.js → dist-WN2QIOQR.js} +27 -11
  15. package/dist/{esm-DX3WQKEN.js → esm-CZAWIY6F.js} +2 -2
  16. package/dist/esm-CZAWIY6F.js.map +1 -0
  17. package/dist/index.js +5 -4
  18. package/dist/{interactive-J4IBXJF7.js → interactive-B432TCRZ.js} +5 -4
  19. package/dist/{interactive-J4IBXJF7.js.map → interactive-B432TCRZ.js.map} +1 -1
  20. package/dist/{src-2N5EJ2N6.js → src-ML4D2MC2.js} +2 -2
  21. package/dist/templates/.agentv/config.yaml +0 -5
  22. package/dist/templates/.agentv/targets.yaml +8 -11
  23. package/package.json +2 -2
  24. package/dist/chunk-KSUL3F3R.js.map +0 -1
  25. package/dist/chunk-RMUVJ44Z.js.map +0 -1
  26. package/dist/chunk-YTHTGLMT.js +0 -49786
  27. package/dist/chunk-YTHTGLMT.js.map +0 -1
  28. /package/dist/{dist-EDQZMZH2.js.map → agentv-provider-5CJVBBGG-2XVZBW7L.js.map} +0 -0
  29. /package/dist/{chunk-FV32QHPB.js.map → chunk-XOSNETAV.js.map} +0 -0
  30. /package/dist/{esm-DX3WQKEN.js.map → dist-WN2QIOQR.js.map} +0 -0
  31. /package/dist/{src-2N5EJ2N6.js.map → src-ML4D2MC2.js.map} +0 -0
@@ -25,12 +25,12 @@ import {
25
25
  subscribeToCopilotCliLogEntries,
26
26
  subscribeToCopilotSdkLogEntries,
27
27
  subscribeToPiLogEntries
28
- } from "./chunk-YTHTGLMT.js";
28
+ } from "./chunk-HF4X7ALN.js";
29
29
 
30
30
  // package.json
31
31
  var package_default = {
32
32
  name: "agentv",
33
- version: "2.18.4",
33
+ version: "3.0.0-next.1",
34
34
  description: "CLI entry point for AgentV",
35
35
  type: "module",
36
36
  repository: {
@@ -95,7 +95,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
95
95
  const candidatePath = path.isAbsolute(pattern) ? path.normalize(pattern) : path.resolve(cwd, pattern);
96
96
  try {
97
97
  const stats = await stat(candidatePath);
98
- if (stats.isFile() && /\.(ya?ml|jsonl)$/i.test(candidatePath)) {
98
+ if (stats.isFile() && /\.(ya?ml|jsonl|json)$/i.test(candidatePath)) {
99
99
  results.add(candidatePath);
100
100
  continue;
101
101
  }
@@ -110,7 +110,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
110
110
  dot: true,
111
111
  followSymbolicLinks: true
112
112
  });
113
- const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl)$/i.test(filePath));
113
+ const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl|json)$/i.test(filePath));
114
114
  if (yamlMatches.length === 0) {
115
115
  unmatched.push(pattern);
116
116
  continue;
@@ -123,7 +123,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
123
123
  throw new Error(
124
124
  `No eval files matched: ${unmatched.join(
125
125
  ", "
126
- )}. Provide YAML or JSONL paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl").`
126
+ )}. Provide YAML, JSONL, or JSON paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl", "evals.json").`
127
127
  );
128
128
  }
129
129
  const sorted = Array.from(results);
@@ -201,7 +201,7 @@ async function discoverTargetsFile(options) {
201
201
  // src/commands/eval/run-eval.ts
202
202
  import { constants as constants4 } from "node:fs";
203
203
  import { access as access4 } from "node:fs/promises";
204
- import path10 from "node:path";
204
+ import path12 from "node:path";
205
205
  import { pathToFileURL } from "node:url";
206
206
 
207
207
  // src/version-check.ts
@@ -258,16 +258,316 @@ async function promptContinue() {
258
258
  return confirm({ message: "Continue anyway?", default: false });
259
259
  }
260
260
 
261
+ // src/commands/eval/artifact-writer.ts
262
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
263
+ import path3 from "node:path";
264
+ var PASS_THRESHOLD = 0.8;
265
+ function computeStats(values) {
266
+ if (values.length === 0) {
267
+ return { mean: 0, stddev: 0 };
268
+ }
269
+ const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
270
+ const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length;
271
+ return {
272
+ mean: Math.round(mean * 1e3) / 1e3,
273
+ stddev: Math.round(Math.sqrt(variance) * 1e3) / 1e3
274
+ };
275
+ }
276
+ function computePassRate(result) {
277
+ const scores = result.scores;
278
+ if (scores && scores.length > 0) {
279
+ const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
280
+ return passed / scores.length;
281
+ }
282
+ return result.score >= PASS_THRESHOLD ? 1 : 0;
283
+ }
284
+ function countToolCalls(result) {
285
+ const toolCalls = {};
286
+ let total = 0;
287
+ const trace = result.trace;
288
+ if (trace?.steps) {
289
+ for (const step of trace.steps) {
290
+ if (step.toolName || step.type === "tool") {
291
+ const name = step.toolName ?? "unknown";
292
+ toolCalls[name] = (toolCalls[name] ?? 0) + 1;
293
+ total += 1;
294
+ }
295
+ }
296
+ }
297
+ return { toolCalls, total };
298
+ }
299
+ function parseWorkspaceChanges(fileChanges) {
300
+ if (!fileChanges) {
301
+ return void 0;
302
+ }
303
+ let filesModified = 0;
304
+ let filesCreated = 0;
305
+ const lines = fileChanges.split("\n");
306
+ for (const line of lines) {
307
+ if (line.startsWith("--- /dev/null")) {
308
+ filesCreated += 1;
309
+ } else if (line.startsWith("--- a/")) {
310
+ filesModified += 1;
311
+ }
312
+ }
313
+ const summaryLines = lines.slice(0, 20);
314
+ const diffSummary = lines.length > 20 ? `${summaryLines.join("\n")}
315
+ ... (${lines.length - 20} more lines)` : fileChanges;
316
+ return {
317
+ files_modified: filesModified,
318
+ files_created: filesCreated,
319
+ diff_summary: diffSummary
320
+ };
321
+ }
322
+ function buildExpectations(result) {
323
+ const expectations = [];
324
+ if (result.scores && result.scores.length > 0) {
325
+ for (const evaluator of result.scores) {
326
+ for (const hit of evaluator.hits) {
327
+ expectations.push({
328
+ text: hit,
329
+ passed: true,
330
+ evidence: evaluator.reasoning ?? ""
331
+ });
332
+ }
333
+ for (const miss of evaluator.misses) {
334
+ expectations.push({
335
+ text: miss,
336
+ passed: false,
337
+ evidence: evaluator.reasoning ?? ""
338
+ });
339
+ }
340
+ }
341
+ } else {
342
+ for (const hit of result.hits) {
343
+ expectations.push({ text: hit, passed: true, evidence: result.reasoning ?? "" });
344
+ }
345
+ for (const miss of result.misses) {
346
+ expectations.push({ text: miss, passed: false, evidence: result.reasoning ?? "" });
347
+ }
348
+ }
349
+ return expectations;
350
+ }
351
+ function buildEvaluators(scores) {
352
+ if (!scores || scores.length === 0) {
353
+ return void 0;
354
+ }
355
+ return scores.map((s) => ({
356
+ name: s.name,
357
+ type: s.type,
358
+ score: s.score,
359
+ reasoning: s.reasoning ?? "",
360
+ weight: s.weight,
361
+ verdict: s.verdict,
362
+ hits: s.hits,
363
+ misses: s.misses,
364
+ details: s.details
365
+ }));
366
+ }
367
+ function buildGradingArtifact(result) {
368
+ const expectations = buildExpectations(result);
369
+ const passed = expectations.filter((e) => e.passed).length;
370
+ const failed = expectations.filter((e) => !e.passed).length;
371
+ const total = expectations.length;
372
+ const { toolCalls, total: totalToolCalls } = countToolCalls(result);
373
+ const errorsEncountered = result.error ? 1 : 0;
374
+ return {
375
+ expectations,
376
+ summary: {
377
+ passed,
378
+ failed,
379
+ total,
380
+ pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
381
+ },
382
+ execution_metrics: {
383
+ tool_calls: toolCalls,
384
+ total_tool_calls: totalToolCalls,
385
+ errors_encountered: errorsEncountered
386
+ },
387
+ evaluators: buildEvaluators(result.scores),
388
+ workspace_changes: parseWorkspaceChanges(result.fileChanges),
389
+ conversation: result.conversationId ? {
390
+ turns: result.trace ? result.trace.steps?.length ?? 0 : 0,
391
+ conversation_id: result.conversationId
392
+ } : void 0
393
+ };
394
+ }
395
+ function buildTimingArtifact(results) {
396
+ let totalInput = 0;
397
+ let totalOutput = 0;
398
+ let totalDurationMs = 0;
399
+ for (const result of results) {
400
+ const usage = result.tokenUsage;
401
+ if (usage) {
402
+ totalInput += usage.input ?? 0;
403
+ totalOutput += usage.output ?? 0;
404
+ }
405
+ if (result.durationMs != null) {
406
+ totalDurationMs += result.durationMs;
407
+ }
408
+ }
409
+ return {
410
+ total_tokens: totalInput + totalOutput,
411
+ duration_ms: totalDurationMs,
412
+ total_duration_seconds: Math.round(totalDurationMs / 1e3 * 1e3) / 1e3,
413
+ token_usage: {
414
+ input: totalInput,
415
+ output: totalOutput
416
+ }
417
+ };
418
+ }
419
+ function buildBenchmarkArtifact(results, evalFile = "") {
420
+ const targetSet = /* @__PURE__ */ new Set();
421
+ const testIdSet = /* @__PURE__ */ new Set();
422
+ for (const result of results) {
423
+ targetSet.add(result.target);
424
+ testIdSet.add(result.testId);
425
+ }
426
+ const targets = [...targetSet].sort();
427
+ const testIds = [...testIdSet].sort();
428
+ const runSummary = {};
429
+ const notes = [];
430
+ for (const target of targets) {
431
+ const targetResults = results.filter((r) => r.target === target);
432
+ const passRates = targetResults.map(computePassRate);
433
+ const timings = targetResults.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
434
+ const tokens = targetResults.filter((r) => r.tokenUsage != null).map((r) => {
435
+ const usage = r.tokenUsage;
436
+ return (usage.input ?? 0) + (usage.output ?? 0);
437
+ });
438
+ const entry = {
439
+ pass_rate: computeStats(passRates),
440
+ time_seconds: computeStats(timings),
441
+ tokens: computeStats(tokens)
442
+ };
443
+ const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
444
+ if (toolCallCounts.some((c) => c > 0)) {
445
+ entry.tool_calls = computeStats(toolCallCounts);
446
+ }
447
+ const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
448
+ if (costs.length > 0) {
449
+ entry.cost_usd = computeStats(costs);
450
+ }
451
+ runSummary[target] = entry;
452
+ }
453
+ const evaluatorScores = /* @__PURE__ */ new Map();
454
+ for (const result of results) {
455
+ if (result.scores) {
456
+ for (const score of result.scores) {
457
+ const key = `${score.name}:${score.type}`;
458
+ if (!evaluatorScores.has(key)) {
459
+ evaluatorScores.set(key, []);
460
+ }
461
+ evaluatorScores.get(key)?.push(score.score);
462
+ }
463
+ }
464
+ }
465
+ let perEvaluatorSummary;
466
+ if (evaluatorScores.size > 0) {
467
+ perEvaluatorSummary = {};
468
+ for (const [key, scores] of evaluatorScores) {
469
+ perEvaluatorSummary[key] = computeStats(scores);
470
+ }
471
+ }
472
+ const errorCount = results.filter((r) => r.executionStatus === "execution_error").length;
473
+ if (errorCount > 0) {
474
+ notes.push(
475
+ `${errorCount} test(s) had execution errors and are included in pass_rate as failures`
476
+ );
477
+ }
478
+ if (results.length === 0) {
479
+ notes.push("No results to summarize");
480
+ }
481
+ const firstResult = results[0];
482
+ const timestamp = firstResult?.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
483
+ return {
484
+ metadata: {
485
+ eval_file: evalFile,
486
+ timestamp,
487
+ targets,
488
+ tests_run: testIds
489
+ },
490
+ run_summary: runSummary,
491
+ per_evaluator_summary: perEvaluatorSummary,
492
+ notes
493
+ };
494
+ }
495
+ async function writeArtifactsFromResults(results, outputDir, options) {
496
+ const gradingDir = path3.join(outputDir, "grading");
497
+ const timingPath = path3.join(outputDir, "timing.json");
498
+ const benchmarkPath = path3.join(outputDir, "benchmark.json");
499
+ await mkdir(gradingDir, { recursive: true });
500
+ for (const result of results) {
501
+ const grading = buildGradingArtifact(result);
502
+ const safeTestId = result.testId.replace(/[/\\:*?"<>|]/g, "_");
503
+ const gradingPath = path3.join(gradingDir, `${safeTestId}.json`);
504
+ await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
505
+ `, "utf8");
506
+ }
507
+ const timing = buildTimingArtifact(results);
508
+ await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}
509
+ `, "utf8");
510
+ const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
511
+ await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
512
+ `, "utf8");
513
+ return { gradingDir, timingPath, benchmarkPath };
514
+ }
515
+
516
+ // src/commands/eval/benchmark-writer.ts
517
+ import { writeFile as writeFile2 } from "node:fs/promises";
518
+ var PASS_THRESHOLD2 = 0.8;
519
+ function computeStats2(values) {
520
+ if (values.length === 0) {
521
+ return { mean: 0, stddev: 0 };
522
+ }
523
+ const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
524
+ const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length;
525
+ return {
526
+ mean: Math.round(mean * 1e3) / 1e3,
527
+ stddev: Math.round(Math.sqrt(variance) * 1e3) / 1e3
528
+ };
529
+ }
530
+ function computePassRate2(result) {
531
+ const scores = result.scores;
532
+ if (scores && scores.length > 0) {
533
+ const passed = scores.filter((s) => s.score >= PASS_THRESHOLD2).length;
534
+ return passed / scores.length;
535
+ }
536
+ return result.score >= PASS_THRESHOLD2 ? 1 : 0;
537
+ }
538
+ function buildBenchmarkJson(results) {
539
+ const passRates = results.map(computePassRate2);
540
+ const timings = results.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
541
+ const tokens = results.filter((r) => r.tokenUsage != null).map((r) => {
542
+ const usage = r.tokenUsage;
543
+ return (usage.input ?? 0) + (usage.output ?? 0);
544
+ });
545
+ return {
546
+ run_summary: {
547
+ with_skill: {
548
+ pass_rate: computeStats2(passRates),
549
+ time_seconds: computeStats2(timings),
550
+ tokens: computeStats2(tokens)
551
+ }
552
+ }
553
+ };
554
+ }
555
+ async function writeBenchmarkJson(outputPath, results) {
556
+ const benchmark = buildBenchmarkJson(results);
557
+ await writeFile2(outputPath, `${JSON.stringify(benchmark, null, 2)}
558
+ `, "utf8");
559
+ }
560
+
261
561
  // src/commands/eval/env.ts
262
562
  import { constants as constants3 } from "node:fs";
263
563
  import { access as access3 } from "node:fs/promises";
264
- import path3 from "node:path";
564
+ import path4 from "node:path";
265
565
  import { config as loadDotenv } from "dotenv";
266
566
  function uniqueDirs(directories) {
267
567
  const seen = /* @__PURE__ */ new Set();
268
568
  const result = [];
269
569
  for (const dir of directories) {
270
- const absolute = path3.resolve(dir);
570
+ const absolute = path4.resolve(dir);
271
571
  if (seen.has(absolute)) {
272
572
  continue;
273
573
  }
@@ -286,14 +586,14 @@ async function fileExists2(filePath) {
286
586
  }
287
587
  function collectAncestorDirectories(start, boundary) {
288
588
  const directories = [];
289
- const boundaryDir = path3.resolve(boundary);
290
- let current = path3.resolve(start);
589
+ const boundaryDir = path4.resolve(boundary);
590
+ let current = path4.resolve(start);
291
591
  while (current !== void 0) {
292
592
  directories.push(current);
293
593
  if (current === boundaryDir) {
294
594
  break;
295
595
  }
296
- const parent = path3.dirname(current);
596
+ const parent = path4.dirname(current);
297
597
  if (parent === current) {
298
598
  break;
299
599
  }
@@ -303,12 +603,12 @@ function collectAncestorDirectories(start, boundary) {
303
603
  }
304
604
  async function loadEnvFromHierarchy(options) {
305
605
  const { testFilePath, repoRoot, verbose } = options;
306
- const testDir = path3.dirname(path3.resolve(testFilePath));
606
+ const testDir = path4.dirname(path4.resolve(testFilePath));
307
607
  const cwd = process.cwd();
308
608
  const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
309
609
  const envFiles = [];
310
610
  for (const dir of searchDirs) {
311
- const candidate = path3.join(dir, ".env");
611
+ const candidate = path4.join(dir, ".env");
312
612
  if (await fileExists2(candidate)) {
313
613
  envFiles.push(candidate);
314
614
  }
@@ -319,7 +619,7 @@ async function loadEnvFromHierarchy(options) {
319
619
  }
320
620
  return void 0;
321
621
  }
322
- for (let i = envFiles.length - 1; i >= 0; i--) {
622
+ for (let i = 0; i < envFiles.length; i++) {
323
623
  const envFile = envFiles[i];
324
624
  loadDotenv({ path: envFile, override: false });
325
625
  if (verbose) {
@@ -330,83 +630,11 @@ async function loadEnvFromHierarchy(options) {
330
630
  }
331
631
 
332
632
  // src/commands/eval/output-writer.ts
333
- import path8 from "node:path";
334
-
335
- // src/commands/eval/json-writer.ts
336
- import { mkdir, writeFile } from "node:fs/promises";
337
- import path4 from "node:path";
338
-
339
- // src/utils/case-conversion.ts
340
- function toSnakeCase(str) {
341
- if (/^[A-Z]/.test(str)) {
342
- return str;
343
- }
344
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
345
- }
346
- function toSnakeCaseDeep(obj) {
347
- if (obj === null || obj === void 0) {
348
- return obj;
349
- }
350
- if (Array.isArray(obj)) {
351
- return obj.map((item) => toSnakeCaseDeep(item));
352
- }
353
- if (typeof obj === "object") {
354
- const result = {};
355
- for (const [key, value] of Object.entries(obj)) {
356
- const snakeKey = toSnakeCase(key);
357
- result[snakeKey] = toSnakeCaseDeep(value);
358
- }
359
- return result;
360
- }
361
- return obj;
362
- }
363
-
364
- // src/commands/eval/json-writer.ts
365
- var JsonWriter = class _JsonWriter {
366
- filePath;
367
- results = [];
368
- closed = false;
369
- constructor(filePath) {
370
- this.filePath = filePath;
371
- }
372
- static async open(filePath) {
373
- await mkdir(path4.dirname(filePath), { recursive: true });
374
- return new _JsonWriter(filePath);
375
- }
376
- async append(result) {
377
- if (this.closed) {
378
- throw new Error("Cannot write to closed JSON writer");
379
- }
380
- this.results.push(result);
381
- }
382
- async close() {
383
- if (this.closed) {
384
- return;
385
- }
386
- this.closed = true;
387
- const passed = this.results.filter((r) => r.score >= 0.5).length;
388
- const failed = this.results.length - passed;
389
- const total = this.results.length;
390
- const output = {
391
- stats: {
392
- total,
393
- passed,
394
- failed,
395
- passRate: total > 0 ? passed / total : 0
396
- },
397
- results: this.results
398
- };
399
- const snakeCaseOutput = toSnakeCaseDeep(output);
400
- await writeFile(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}
401
- `, "utf8");
402
- }
403
- };
633
+ import path10 from "node:path";
404
634
 
405
- // src/commands/eval/jsonl-writer.ts
406
- import { createWriteStream } from "node:fs";
407
- import { mkdir as mkdir2 } from "node:fs/promises";
635
+ // src/commands/eval/html-writer.ts
636
+ import { mkdir as mkdir2, writeFile as writeFile3 } from "node:fs/promises";
408
637
  import path5 from "node:path";
409
- import { finished } from "node:stream/promises";
410
638
 
411
639
  // ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
412
640
  var E_TIMEOUT = new Error("timeout while waiting for mutex to become available");
@@ -614,7 +842,597 @@ var Mutex = class {
614
842
  }
615
843
  };
616
844
 
845
+ // src/commands/eval/html-writer.ts
846
+ var HtmlWriter = class _HtmlWriter {
847
+ filePath;
848
+ results = [];
849
+ mutex = new Mutex();
850
+ closed = false;
851
+ isLive = true;
852
+ constructor(filePath) {
853
+ this.filePath = filePath;
854
+ }
855
+ static async open(filePath) {
856
+ await mkdir2(path5.dirname(filePath), { recursive: true });
857
+ const writer = new _HtmlWriter(filePath);
858
+ await writer.writeHtml();
859
+ return writer;
860
+ }
861
+ async append(result) {
862
+ await this.mutex.runExclusive(async () => {
863
+ if (this.closed) {
864
+ throw new Error("Cannot write to closed HTML writer");
865
+ }
866
+ this.results.push(result);
867
+ await this.writeHtml();
868
+ });
869
+ }
870
+ async close() {
871
+ await this.mutex.runExclusive(async () => {
872
+ if (this.closed) {
873
+ return;
874
+ }
875
+ this.closed = true;
876
+ this.isLive = false;
877
+ await this.writeHtml();
878
+ });
879
+ }
880
+ async writeHtml() {
881
+ const html = generateHtml(this.results, this.isLive);
882
+ await writeFile3(this.filePath, html, "utf8");
883
+ }
884
+ };
885
+ function generateHtml(results, isLive) {
886
+ const lightResults = results.map((r) => {
887
+ const { requests, trace, ...rest } = r;
888
+ return rest;
889
+ });
890
+ const dataJson = JSON.stringify(lightResults).replace(/<\//g, "<\\/");
891
+ const metaRefresh = isLive ? ' <meta http-equiv="refresh" content="2">\n' : "";
892
+ const liveIndicator = isLive ? '<span class="live-badge">\u25CF LIVE</span>' : `<span class="timestamp">${escapeHtml((/* @__PURE__ */ new Date()).toISOString())}</span>`;
893
+ return `<!DOCTYPE html>
894
+ <html lang="en">
895
+ <head>
896
+ <meta charset="utf-8">
897
+ <meta name="viewport" content="width=device-width, initial-scale=1">
898
+ ${metaRefresh} <title>AgentV Evaluation Report</title>
899
+ <style>
900
+ ${STYLES}
901
+ </style>
902
+ </head>
903
+ <body>
904
+ <header class="header">
905
+ <div class="header-left">
906
+ <h1 class="header-title">AgentV</h1>
907
+ <span class="header-subtitle">Evaluation Report</span>
908
+ </div>
909
+ <div class="header-right">${liveIndicator}</div>
910
+ </header>
911
+ <nav class="tabs" id="tabs">
912
+ <button class="tab active" data-tab="overview">Overview</button>
913
+ <button class="tab" data-tab="tests">Test Cases</button>
914
+ </nav>
915
+ <main id="app"></main>
916
+ <script>
917
+ var DATA = ${dataJson};
918
+ var IS_LIVE = ${String(isLive)};
919
+ ${SCRIPT}
920
+ </script>
921
+ </body>
922
+ </html>`;
923
+ }
924
+ function escapeHtml(s) {
925
+ return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
926
+ }
927
+ var STYLES = `
928
+ *{margin:0;padding:0;box-sizing:border-box}
929
+ :root{
930
+ --bg:#f6f8fa;--surface:#fff;--border:#d0d7de;--border-light:#e8ebee;
931
+ --text:#1f2328;--text-muted:#656d76;
932
+ --primary:#0969da;--primary-bg:#ddf4ff;
933
+ --success:#1a7f37;--success-bg:#dafbe1;
934
+ --danger:#cf222e;--danger-bg:#ffebe9;
935
+ --warning:#9a6700;--warning-bg:#fff8c5;
936
+ --radius:6px;
937
+ --shadow:0 1px 3px rgba(31,35,40,.04),0 1px 2px rgba(31,35,40,.06);
938
+ --font:-apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif;
939
+ --mono:ui-monospace,SFMono-Regular,"SF Mono",Menlo,Consolas,monospace;
940
+ }
941
+ body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5;font-size:14px}
942
+
943
+ /* Header */
944
+ .header{background:var(--surface);border-bottom:1px solid var(--border);padding:12px 24px;display:flex;align-items:center;justify-content:space-between}
945
+ .header-left{display:flex;align-items:baseline;gap:12px}
946
+ .header-title{font-size:18px;font-weight:600}
947
+ .header-subtitle{font-size:14px;color:var(--text-muted)}
948
+ .live-badge{color:var(--success);font-size:12px;font-weight:600;animation:pulse 2s infinite}
949
+ @keyframes pulse{0%,100%{opacity:1}50%{opacity:.4}}
950
+ .timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)}
951
+
952
+ /* Tabs */
953
+ .tabs{background:var(--surface);border-bottom:1px solid var(--border);padding:0 24px;display:flex}
954
+ .tab{background:none;border:none;padding:10px 16px;font-size:14px;color:var(--text-muted);cursor:pointer;border-bottom:2px solid transparent;font-family:var(--font);transition:color .15s,border-color .15s}
955
+ .tab:hover{color:var(--text)}
956
+ .tab.active{color:var(--text);font-weight:600;border-bottom-color:var(--primary)}
957
+
958
+ #app{max-width:1280px;margin:0 auto;padding:24px}
959
+
960
+ /* Stat cards */
961
+ .stats-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:12px;margin-bottom:24px}
962
+ .stat-card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;text-align:center;box-shadow:var(--shadow)}
963
+ .stat-card.pass .stat-value{color:var(--success)}
964
+ .stat-card.fail .stat-value{color:var(--danger)}
965
+ .stat-card.error .stat-value{color:var(--danger)}
966
+ .stat-card.warn .stat-value{color:var(--warning)}
967
+ .stat-card.total .stat-value{color:var(--primary)}
968
+ .stat-value{font-size:28px;font-weight:700;line-height:1.2}
969
+ .stat-label{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.5px;margin-top:4px}
970
+
971
+ /* Sections */
972
+ .section{margin-bottom:24px}
973
+ .section-title{font-size:16px;font-weight:600;margin-bottom:12px}
974
+
975
+ /* Tables */
976
+ .table-wrap{overflow-x:auto;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);box-shadow:var(--shadow)}
977
+ .data-table{width:100%;border-collapse:collapse;font-size:13px}
978
+ .data-table th{background:var(--bg);border-bottom:1px solid var(--border);padding:8px 12px;text-align:left;font-weight:600;font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;white-space:nowrap}
979
+ .data-table th.sortable{cursor:pointer;user-select:none}
980
+ .data-table th.sortable:hover{color:var(--text)}
981
+ .data-table td{padding:8px 12px;border-bottom:1px solid var(--border-light);vertical-align:middle}
982
+ .data-table tbody tr:last-child td{border-bottom:none}
983
+
984
+ /* Status icons */
985
+ .status-icon{display:inline-flex;align-items:center;justify-content:center;width:22px;height:22px;border-radius:50%;font-size:12px;font-weight:700}
986
+ .status-icon.pass{background:var(--success-bg);color:var(--success)}
987
+ .status-icon.fail{background:var(--danger-bg);color:var(--danger)}
988
+ .status-icon.error{background:var(--warning-bg);color:var(--warning)}
989
+
990
+ /* Score colors */
991
+ .score-high{color:var(--success);font-weight:600}
992
+ .score-mid{color:var(--warning);font-weight:600}
993
+ .score-low{color:var(--danger);font-weight:600}
994
+
995
+ /* Pass-rate bar */
996
+ .bar-bg{width:100px;height:8px;background:var(--border-light);border-radius:4px;overflow:hidden}
997
+ .bar-fill{height:100%;border-radius:4px;transition:width .3s}
998
+ .bar-fill.score-high{background:var(--success)}
999
+ .bar-fill.score-mid{background:var(--warning)}
1000
+ .bar-fill.score-low{background:var(--danger)}
1001
+
1002
+ /* Histogram */
1003
+ .histogram{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;box-shadow:var(--shadow)}
1004
+ .hist-row{display:flex;align-items:center;gap:12px;margin-bottom:8px}
1005
+ .hist-row:last-child{margin-bottom:0}
1006
+ .hist-label{width:60px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
1007
+ .hist-bar-bg{flex:1;height:20px;background:var(--border-light);border-radius:3px;overflow:hidden}
1008
+ .hist-bar{height:100%;border-radius:3px;transition:width .3s}
1009
+ .hist-count{width:30px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0}
1010
+
1011
+ /* Filters */
1012
+ .filter-bar{display:flex;gap:8px;margin-bottom:16px;align-items:center;flex-wrap:wrap}
1013
+ .filter-select,.filter-search{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font)}
1014
+ .filter-search{flex:1;min-width:200px}
1015
+ .filter-count{font-size:12px;color:var(--text-muted);margin-left:auto}
1016
+
1017
+ /* Test rows */
1018
+ .test-row{cursor:pointer;transition:background .1s}
1019
+ .test-row:hover{background:var(--bg)!important}
1020
+ .test-row.expanded{background:var(--primary-bg)!important}
1021
+ .expand-col{width:32px;text-align:center}
1022
+ .expand-icon{color:var(--text-muted);font-size:12px}
1023
+ .fw-medium{font-weight:500}
1024
+ .text-pass{color:var(--success)}.text-fail{color:var(--danger)}.text-error{color:var(--warning)}
1025
+
1026
+ /* Detail panel */
1027
+ .detail-row td{padding:0!important;background:var(--bg)!important}
1028
+ .detail-panel{padding:16px 24px}
1029
+ .detail-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px}
1030
+ .detail-block h4{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;margin-bottom:6px}
1031
+ .detail-pre{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:12px;font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow-y:auto;line-height:1.6}
1032
+ .detail-panel h4{font-size:13px;font-weight:600;margin:16px 0 8px}
1033
+ .eval-table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);margin-bottom:12px}
1034
+ .eval-table th{background:var(--bg);padding:6px 10px;text-align:left;font-size:11px;font-weight:600;color:var(--text-muted);text-transform:uppercase;border-bottom:1px solid var(--border)}
1035
+ .eval-table td{padding:8px 10px;border-bottom:1px solid var(--border-light)}
1036
+ .reasoning-cell{max-width:500px;font-size:12px;color:var(--text-muted)}
1037
+ .expect-list{list-style:none;padding:0;margin-bottom:12px}
1038
+ .expect-list li{padding:4px 8px 4px 24px;position:relative;font-size:13px}
1039
+ .expect-list.pass li::before{content:"\\2713";position:absolute;left:4px;color:var(--success);font-weight:700}
1040
+ .expect-list.fail li::before{content:"\\2717";position:absolute;left:4px;color:var(--danger);font-weight:700}
1041
+ .error-box{background:var(--danger-bg);border:1px solid var(--danger);border-radius:var(--radius);padding:12px;margin-bottom:12px}
1042
+ .error-box h4{color:var(--danger);margin:0 0 6px}
1043
+ .error-box pre{font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word}
1044
+ .detail-meta{font-size:12px;color:var(--text-muted);margin-top:12px;padding-top:12px;border-top:1px solid var(--border-light)}
1045
+ .empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)}
1046
+ .empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)}
1047
+ `;
1048
+ var SCRIPT = `
1049
+ (function(){
1050
+ /* ---- helpers ---- */
1051
+ function esc(s){
1052
+ if(s==null)return"";
1053
+ return String(s).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;");
1054
+ }
1055
+ function getStatus(r){
1056
+ if(r.executionStatus==="execution_error")return"error";
1057
+ if(r.executionStatus==="quality_failure")return"fail";
1058
+ if(r.executionStatus==="ok")return"pass";
1059
+ if(r.error)return"error";
1060
+ return r.score>=0.5?"pass":"fail";
1061
+ }
1062
+ function sIcon(s){
1063
+ if(s==="pass")return'<span class="status-icon pass">\\u2713</span>';
1064
+ if(s==="fail")return'<span class="status-icon fail">\\u2717</span>';
1065
+ return'<span class="status-icon error">!</span>';
1066
+ }
1067
/* Format a duration in milliseconds as "Nms", "N.Ns" or "Nm Ns".
   null/undefined render as an em dash.
   Seconds are rounded BEFORE splitting into minutes so values never show
   as "1m 60s", and a sub-minute value that rounds to 60.0s rolls over
   to "1m 0s". */
function fmtDur(ms){
  if(ms==null)return"\\u2014";
  if(ms<1000)return ms+"ms";
  if(ms<60000){
    var secs=(ms/1000).toFixed(1);
    if(secs!=="60.0")return secs+"s";
  }
  var total=Math.round(ms/1000);
  return Math.floor(total/60)+"m "+(total%60)+"s";
}
1073
/* Format a token count as a plain number, "N.NK" or "N.NM".
   null/undefined render as an em dash.
   A value whose K form would round up to "1000.0K" is promoted to the
   M form instead. */
function fmtTok(n){
  if(n==null)return"\\u2014";
  if(n>=1e3){
    var k=(n/1e3).toFixed(1);
    if(n<1e6&&k!=="1000.0")return k+"K";
    return(n/1e6).toFixed(1)+"M";
  }
  return String(n);
}
1079
/* Format a USD amount. null/undefined render as an em dash, positive
   amounts below one cent as "<$0.01", everything else with two decimals.
   Zero is shown as "$0.00" rather than "<$0.01", which would suggest a
   small but non-zero spend. */
function fmtCost(u){
  if(u==null)return"\\u2014";
  if(u>0&&u<0.01)return"<$0.01";
  return"$"+u.toFixed(2);
}
1080
/* Format a 0..1 fraction as a percentage with one decimal; null/undefined render as an em dash. */
function fmtPct(v){
  return v==null?"\\u2014":(v*100).toFixed(1)+"%";
}
1081
/* Map a 0..1 score to its CSS class: >=0.9 high, >=0.5 mid, otherwise low; null/undefined give no class. */
function sCls(v){
  if(v==null){return"";}
  return v>=0.9?"score-high":(v>=0.5?"score-mid":"score-low");
}
1082
+
1083
+ /* ---- compute stats ---- */
1084
/* Aggregate run-wide totals over the result list d.
   Returns counts per status, the pass rate over non-error results,
   summed duration / tokens / cost, and the list of scores used by the
   histogram. Only finite numeric scores are collected: a missing score
   would otherwise become a NaN bucket index in the histogram. */
function computeStats(d){
  var t=d.length,p=0,f=0,e=0,dur=0,ti=0,to=0,cost=0,sc=[];
  for(var i=0;i<t;i++){
    var r=d[i],s=getStatus(r);
    if(s==="pass")p++;else if(s==="fail")f++;else e++;
    if(r.durationMs)dur+=r.durationMs;
    if(r.tokenUsage){ti+=(r.tokenUsage.input||0);to+=(r.tokenUsage.output||0);}
    if(r.costUsd)cost+=r.costUsd;
    if(s!=="error"&&typeof r.score==="number"&&isFinite(r.score))sc.push(r.score);
  }
  /* errors are excluded from the pass-rate denominator */
  var g=t-e;
  return{total:t,passed:p,failed:f,errors:e,passRate:g>0?p/g:0,dur:dur,tokens:ti+to,inTok:ti,outTok:to,cost:cost,scores:sc};
}
1097
/* Group results by target (missing target => "unknown") and accumulate
   per-target pass/fail/error counts, score sum and count, duration,
   tokens and cost. Returns the groups as an array in first-seen order.
   The lookup map has no prototype so a target named e.g. "constructor"
   cannot collide with inherited Object members, and only finite numeric
   scores feed the average so one missing score cannot turn it into NaN. */
function computeTargets(d){
  var m=Object.create(null);
  for(var i=0;i<d.length;i++){
    var r=d[i],tgt=r.target||"unknown";
    if(!m[tgt])m[tgt]={target:tgt,results:[],p:0,f:0,e:0,ts:0,sc:0,dur:0,tok:0,cost:0};
    var o=m[tgt];o.results.push(r);
    var s=getStatus(r);
    if(s==="pass")o.p++;else if(s==="fail")o.f++;else o.e++;
    if(s!=="error"&&typeof r.score==="number"&&isFinite(r.score)){o.ts+=r.score;o.sc++;}
    if(r.durationMs)o.dur+=r.durationMs;
    if(r.tokenUsage)o.tok+=(r.tokenUsage.input||0)+(r.tokenUsage.output||0);
    if(r.costUsd)o.cost+=r.costUsd;
  }
  var a=[];
  for(var k in m)a.push(m[k]);
  return a;
}
1112
/* Collect the distinct evaluator names that appear in any result's scores list, in first-seen order. */
function getEvalNames(){
  var seen={};
  for(var i=0;i<DATA.length;i++){
    var list=DATA[i].scores;
    if(!list){continue;}
    for(var j=0;j<list.length;j++){
      seen[list[j].name]=true;
    }
  }
  return Object.keys(seen);
}
1120
/* Return the score of the first evaluator entry called name on result r, or null when r has no such entry. */
function getEvalScore(r,name){
  var list=r.scores;
  if(!list){return null;}
  for(var i=0;i<list.length;i++){
    var entry=list[i];
    if(entry.name===name){return entry.score;}
  }
  return null;
}
1125
+
1126
+ var stats=computeStats(DATA);
1127
+ var tgtStats=computeTargets(DATA);
1128
+ var tgtNames=tgtStats.map(function(t){return t.target;});
1129
+
1130
+ /* ---- state ---- */
1131
+ var state={tab:"overview",filter:{status:"all",target:"all",search:""},sort:{col:"testId",dir:"asc"},expanded:{}};
1132
+
1133
+ /* ---- DOM refs ---- */
1134
+ var app=document.getElementById("app");
1135
+ var tabBtns=document.querySelectorAll(".tab");
1136
+
1137
+ /* ---- tabs ---- */
1138
/* Switch the active tab: remember it in state, mark the matching tab button as active, then re-render. */
function setTab(t){
  state.tab=t;
  for(var i=0;i<tabBtns.length;i++){
    var btn=tabBtns[i];
    btn.classList.toggle("active",btn.getAttribute("data-tab")===t);
  }
  render();
}
1143
+ for(var i=0;i<tabBtns.length;i++){
1144
+ tabBtns[i].addEventListener("click",(function(b){return function(){setTab(b.getAttribute("data-tab"));};})(tabBtns[i]));
1145
+ }
1146
+
1147
+ /* ---- render ---- */
1148
/* Top-level render: show the empty state when there is no data, otherwise draw the tab selected in state. */
function render(){
  if(DATA.length===0){
    var hint=IS_LIVE?"Waiting for evaluation results\\u2026 Page will auto-refresh.":"Run an evaluation to generate results.";
    app.innerHTML='<div class="empty-state"><h3>No results yet</h3><p>'+hint+"</p></div>";
    return;
  }
  if(state.tab==="overview"){
    renderOverview();
  }else{
    renderTests();
  }
}
1152
+
1153
+ /* ---- stat card helper ---- */
1154
/* Build the markup for one stat card; type is appended to the CSS class list, value and label are inserted as-is. */
function card(label,value,type){
  var open='<div class="stat-card '+type+'">';
  var valueHtml='<div class="stat-value">'+value+"</div>";
  var labelHtml='<div class="stat-label">'+label+"</div>";
  return open+valueHtml+labelHtml+"</div>";
}
1157
+
1158
+ /* ---- overview ---- */
1159
/* Render the Overview tab into the app element.
   Output, in order: a grid of stat cards built from the precomputed stats
   object, a per-target comparison table (only when more than one target
   exists), and a five-bucket score histogram (only when at least one
   score was collected). */
+ function renderOverview(){
1160
+ var h='<div class="stats-grid">';
1161
+ h+=card("Total Tests",stats.total,"total");
1162
+ h+=card("Passed",stats.passed,"pass");
1163
+ h+=card("Failed",stats.failed,"fail");
1164
+ h+=card("Errors",stats.errors,"error");
1165
/* pass-rate card colour: >=90% pass, >=50% warn, otherwise fail */
+ var prCls=stats.passRate>=0.9?"pass":stats.passRate>=0.5?"warn":"fail";
1166
+ h+=card("Pass Rate",fmtPct(stats.passRate),prCls);
1167
+ h+=card("Duration",fmtDur(stats.dur),"neutral");
1168
+ h+=card("Tokens",fmtTok(stats.tokens),"neutral");
1169
+ h+=card("Est. Cost",fmtCost(stats.cost),"neutral");
1170
+ h+="</div>";
1171
+
1172
+ /* targets table */
1173
+ if(tgtStats.length>1){
1174
+ h+='<div class="section"><h2 class="section-title">Targets</h2><div class="table-wrap"><table class="data-table">';
1175
+ h+="<thead><tr><th>Target</th><th>Pass Rate</th><th></th><th>Passed</th><th>Failed</th><th>Errors</th><th>Avg Score</th><th>Duration</th><th>Tokens</th><th>Cost</th></tr></thead><tbody>";
1176
+ for(var i=0;i<tgtStats.length;i++){
1177
/* g = graded results (pass + fail, errors excluded); avg = mean of the scores summed in ts over sc entries */
+ var t=tgtStats[i],g=t.p+t.f,pr=g>0?t.p/g:0,avg=t.sc>0?t.ts/t.sc:0;
1178
+ h+="<tr><td class=\\"fw-medium\\">"+esc(t.target)+"</td><td>"+fmtPct(pr)+'</td><td><div class="bar-bg"><div class="bar-fill '+sCls(pr)+'" style="width:'+(pr*100)+'%"></div></div></td>';
1179
+ h+='<td class="text-pass">'+t.p+'</td><td class="text-fail">'+t.f+'</td><td class="text-error">'+t.e+"</td>";
1180
+ h+='<td class="'+sCls(avg)+'">'+fmtPct(avg)+"</td><td>"+fmtDur(t.dur)+"</td><td>"+fmtTok(t.tok)+"</td><td>"+fmtCost(t.cost)+"</td></tr>";
1181
+ }
1182
+ h+="</tbody></table></div></div>";
1183
+ }
1184
+
1185
+ /* histogram */
1186
+ if(stats.scores.length>0){
1187
+ var bk=[0,0,0,0,0];
1188
/* five 20%-wide buckets; Math.min clamps a score of exactly 1 into the last bucket */
+ for(var i=0;i<stats.scores.length;i++){var idx=Math.min(Math.floor(stats.scores[i]*5),4);bk[idx]++;}
1189
+ var mx=Math.max.apply(null,bk);
1190
+ var lb=["0\\u201320%","20\\u201340%","40\\u201360%","60\\u201380%","80\\u2013100%"];
1191
+ h+='<div class="section"><h2 class="section-title">Score Distribution</h2><div class="histogram">';
1192
+ for(var i=0;i<bk.length;i++){
1193
/* bar widths are relative to the fullest bucket, not to the total count */
+ var pct=mx>0?(bk[i]/mx*100):0;
1194
+ h+='<div class="hist-row"><span class="hist-label">'+lb[i]+'</span><div class="hist-bar-bg"><div class="hist-bar '+(i>=4?"score-high":i>=2?"score-mid":"score-low")+'" style="width:'+pct+'%"></div></div><span class="hist-count">'+bk[i]+"</span></div>";
1195
+ }
1196
+ h+="</div></div>";
1197
+ }
1198
+ app.innerHTML=h;
1199
+ }
1200
+
1201
+ /* ---- test cases ---- */
1202
/* Render the Test Cases tab shell into the app element and wire its events.
   Builds the filter bar (status select, target select only when more than
   one target exists, search box, result count) and the table header with
   one extra column per evaluator name, then delegates the body to
   renderRows(). Filter changes only re-render the rows; clicking a
   sortable header re-renders the whole tab so the arrow in the header
   is refreshed. */
+ function renderTests(){
1203
+ var evalNames=getEvalNames();
1204
+ var h='<div class="filter-bar">';
1205
+ h+='<select id="flt-status" class="filter-select"><option value="all">All Status</option><option value="pass">Passed</option><option value="fail">Failed</option><option value="error">Errors</option></select>';
1206
+ if(tgtNames.length>1){
1207
+ h+='<select id="flt-target" class="filter-select"><option value="all">All Targets</option>';
1208
+ for(var i=0;i<tgtNames.length;i++)h+='<option value="'+esc(tgtNames[i])+'">'+esc(tgtNames[i])+"</option>";
1209
+ h+="</select>";
1210
+ }
1211
+ h+='<input type="text" id="flt-search" class="filter-search" placeholder="Search tests..." value="'+esc(state.filter.search)+'">';
1212
+ h+='<span class="filter-count" id="flt-count"></span></div>';
1213
+
1214
+ h+='<div class="table-wrap"><table class="data-table" id="test-tbl"><thead><tr>';
1215
+ h+='<th class="expand-col"></th>';
1216
+ h+=sHdr("Status","status");
1217
+ h+=sHdr("Test ID","testId");
1218
+ if(tgtNames.length>1)h+=sHdr("Target","target");
1219
+ h+=sHdr("Score","score");
1220
/* evaluator columns are plain headers: they carry no data-sort attribute, so they are not sortable */
+ for(var i=0;i<evalNames.length;i++)h+="<th>"+esc(evalNames[i])+"</th>";
1221
+ h+=sHdr("Duration","durationMs");
1222
+ h+=sHdr("Cost","costUsd");
1223
+ h+="</tr></thead><tbody id=\\"test-body\\"></tbody></table></div>";
1224
+ app.innerHTML=h;
1225
+
1226
+ /* wire events */
1227
+ var selS=document.getElementById("flt-status");
1228
+ selS.value=state.filter.status;
1229
+ selS.addEventListener("change",function(e){state.filter.status=e.target.value;renderRows();});
1230
/* the target select exists only when it was emitted above, hence the null check */
+ var selT=document.getElementById("flt-target");
1231
+ if(selT){selT.value=state.filter.target;selT.addEventListener("change",function(e){state.filter.target=e.target.value;renderRows();});}
1232
+ document.getElementById("flt-search").addEventListener("input",function(e){state.filter.search=e.target.value;renderRows();});
1233
+ var ths=document.querySelectorAll("th[data-sort]");
1234
+ for(var i=0;i<ths.length;i++){
1235
+ ths[i].addEventListener("click",(function(th){return function(){
1236
+ var c=th.getAttribute("data-sort");
1237
/* same column: flip direction; different column: switch to it, ascending */
+ if(state.sort.col===c)state.sort.dir=state.sort.dir==="asc"?"desc":"asc";
1238
+ else{state.sort.col=c;state.sort.dir="asc";}
1239
+ renderTests();
1240
+ };})(ths[i]));
1241
+ }
1242
+ renderRows();
1243
+ }
1244
+
1245
/* Build a sortable header cell; the column currently sorted on gets an up (asc) or down (desc) arrow after its label. */
function sHdr(label,col){
  var arrow=state.sort.col!==col?"":(state.sort.dir==="asc"?" \\u2191":" \\u2193");
  return'<th class="sortable" data-sort="'+col+'">'+label+arrow+"</th>";
}
1250
+
1251
/* Return the results that match the current status / target / search
   filters, sorted by the current sort column and direction.
   - Results without a target are grouped as "unknown" elsewhere, so the
     target filter compares against that same fallback; otherwise picking
     "unknown" in the dropdown would match nothing.
   - The search is a case-insensitive substring match on testId; a missing
     testId is treated as an empty string instead of throwing.
   - null/undefined sort values always go last; if either side is a string
     both are compared as strings so localeCompare never sees a
     non-string operand. */
function filtered(){
  var needle=(state.filter.search||"").toLowerCase();
  var out=[];
  for(var i=0;i<DATA.length;i++){
    var r=DATA[i],s=getStatus(r);
    if(state.filter.status!=="all"&&s!==state.filter.status)continue;
    if(state.filter.target!=="all"&&(r.target||"unknown")!==state.filter.target)continue;
    if(needle&&String(r.testId==null?"":r.testId).toLowerCase().indexOf(needle)===-1)continue;
    out.push(r);
  }
  var col=state.sort.col,dir=state.sort.dir==="asc"?1:-1;
  out.sort(function(a,b){
    var va=col==="status"?getStatus(a):a[col],vb=col==="status"?getStatus(b):b[col];
    if(va==null&&vb==null)return 0;
    if(va==null)return 1;
    if(vb==null)return-1;
    if(typeof va==="string"||typeof vb==="string")return String(va).localeCompare(String(vb))*dir;
    return(va-vb)*dir;
  });
  return out;
}
1269
+
1270
/* Re-render only the table body of the Test Cases tab from the current
   filter / sort / expanded state, update the "N of M tests" counter, and
   attach the click handlers that toggle a row's detail panel. */
function renderRows(){
  var rows=filtered(),evalNames=getEvalNames();
  var tbody=document.getElementById("test-body");
  var multiTarget=tgtNames.length>1;
  /* The header has six fixed columns (expand, status, test id, score,
     duration, cost) plus one per evaluator and an optional target column.
     Full-width rows must span all of them; the previous base of 5 left the
     last column uncovered. */
  var colSpan=6+evalNames.length+(multiTarget?1:0);
  document.getElementById("flt-count").textContent=rows.length+" of "+DATA.length+" tests";
  var h="";
  for(var i=0;i<rows.length;i++){
    /* rows are identified by testId + target so the same test on two targets expands independently */
    var r=rows[i],s=getStatus(r),key=r.testId+":"+r.target,exp=!!state.expanded[key];
    h+='<tr class="test-row '+s+(exp?" expanded":"")+'" data-key="'+esc(key)+'">';
    h+='<td class="expand-col"><span class="expand-icon">'+(exp?"\\u25BE":"\\u25B8")+"</span></td>";
    h+="<td>"+sIcon(s)+"</td>";
    h+='<td class="fw-medium">'+esc(r.testId)+"</td>";
    if(multiTarget)h+="<td>"+esc(r.target)+"</td>";
    h+='<td class="'+sCls(r.score)+'">'+fmtPct(r.score)+"</td>";
    for(var j=0;j<evalNames.length;j++){
      var es=getEvalScore(r,evalNames[j]);
      h+='<td class="'+sCls(es)+'">'+(es!=null?fmtPct(es):"\\u2014")+"</td>";
    }
    h+="<td>"+fmtDur(r.durationMs)+"</td><td>"+fmtCost(r.costUsd)+"</td></tr>";
    if(exp)h+='<tr class="detail-row"><td colspan="'+colSpan+'">'+renderDetail(r)+"</td></tr>";
  }
  if(rows.length===0)h+='<tr><td colspan="'+colSpan+'" class="empty-state">No matching tests</td></tr>';
  tbody.innerHTML=h;

  /* row click */
  var trs=tbody.querySelectorAll(".test-row");
  for(var k=0;k<trs.length;k++){
    trs[k].addEventListener("click",(function(tr){return function(){
      var key=tr.getAttribute("data-key");
      state.expanded[key]=!state.expanded[key];
      renderRows();
    };})(trs[k]));
  }
}
1304
+
1305
+ /* ---- detail panel ---- */
1306
/* Return the HTML for the expanded detail panel of result r.
   Sections, each emitted only when the corresponding data is present:
   input (non-string input is pretty-printed as JSON), output (always
   emitted, empty when r.answer is falsy), evaluator results table,
   passed / failed expectation lists, error box, and a metadata footer.
   All result-derived text goes through esc(). */
+ function renderDetail(r){
1307
+ var h='<div class="detail-panel">';
1308
+
1309
+ /* input / output */
1310
+ h+='<div class="detail-grid">';
1311
+ if(r.input!=null){
1312
+ h+='<div class="detail-block"><h4>Input</h4><pre class="detail-pre">'+esc(typeof r.input==="string"?r.input:JSON.stringify(r.input,null,2))+"</pre></div>";
1313
+ }
1314
+ h+='<div class="detail-block"><h4>Output</h4><pre class="detail-pre">'+esc(r.answer||"")+"</pre></div>";
1315
+ h+="</div>";
1316
+
1317
+ /* evaluator results */
1318
+ if(r.scores&&r.scores.length>0){
1319
+ h+="<h4>Evaluator Results</h4>";
1320
+ h+='<table class="eval-table"><thead><tr><th>Evaluator</th><th>Score</th><th>Status</th><th>Reasoning</th></tr></thead><tbody>';
1321
+ for(var i=0;i<r.scores.length;i++){
1322
/* an individual evaluator is shown as passing at score >= 0.5 */
+ var ev=r.scores[i],evS=ev.score>=0.5?"pass":"fail";
1323
+ h+="<tr><td class=\\"fw-medium\\">"+esc(ev.name)+'</td><td class="'+sCls(ev.score)+'">'+fmtPct(ev.score)+"</td><td>"+sIcon(evS)+'</td><td class="reasoning-cell">'+esc(ev.reasoning||"")+"</td></tr>";
1324
+ }
1325
+ h+="</tbody></table>";
1326
+ }
1327
+
1328
+ /* hits / misses */
1329
+ if(r.hits&&r.hits.length>0){
1330
+ h+='<h4>Passed Expectations</h4><ul class="expect-list pass">';
1331
+ for(var i=0;i<r.hits.length;i++)h+="<li>"+esc(r.hits[i])+"</li>";
1332
+ h+="</ul>";
1333
+ }
1334
+ if(r.misses&&r.misses.length>0){
1335
+ h+='<h4>Failed Expectations</h4><ul class="expect-list fail">';
1336
+ for(var i=0;i<r.misses.length;i++)h+="<li>"+esc(r.misses[i])+"</li>";
1337
+ h+="</ul>";
1338
+ }
1339
+
1340
+ /* error */
1341
+ if(r.error)h+='<div class="error-box"><h4>Error</h4><pre>'+esc(r.error)+"</pre></div>";
1342
+
1343
+ /* metadata */
1344
+ h+='<div class="detail-meta">';
1345
/* footer parts are collected first, joined with a middle dot, and escaped once as a whole */
+ var m=[];
1346
+ if(r.tokenUsage)m.push(fmtTok(r.tokenUsage.input)+" in / "+fmtTok(r.tokenUsage.output)+" out tokens");
1347
+ if(r.durationMs)m.push(fmtDur(r.durationMs));
1348
+ if(r.target)m.push(r.target);
1349
+ if(r.costUsd)m.push(fmtCost(r.costUsd));
1350
+ if(r.timestamp)m.push(r.timestamp);
1351
+ h+=esc(m.join(" \\u00B7 "));
1352
+ h+="</div></div>";
1353
+ return h;
1354
+ }
1355
+
1356
+ /* ---- init ---- */
1357
+ render();
1358
+ })();
1359
+ `;
1360
+
1361
+ // src/commands/eval/json-writer.ts
1362
+ import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
1363
+ import path6 from "node:path";
1364
+
1365
+ // src/utils/case-conversion.ts
1366
/**
 * Converts a camelCase key to snake_case.
 * Keys that start with an uppercase letter are returned unchanged.
 *
 * @param {string} str - Key to convert.
 * @returns {string} The snake_case form, or `str` itself when it starts uppercase.
 */
function toSnakeCase(str) {
  if (/^[A-Z]/.test(str)) {
    return str;
  }
  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
}

/**
 * Recursively converts the keys of objects (including objects nested in
 * arrays) to snake_case. Primitives, null and undefined are returned as-is.
 *
 * Objects that define their own JSON representation via `toJSON` (for
 * example `Date`) are passed through untouched: rebuilding them from
 * `Object.entries` would turn them into `{}` and lose the value that
 * `JSON.stringify` would otherwise have emitted.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} A new structure with converted keys; non-container values are returned unchanged.
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  if (typeof obj.toJSON === "function") {
    return obj;
  }
  const result = {};
  for (const [key, value] of Object.entries(obj)) {
    result[toSnakeCase(key)] = toSnakeCaseDeep(value);
  }
  return result;
}
1389
+
1390
+ // src/commands/eval/json-writer.ts
1391
/**
 * Buffers evaluation results in memory and writes them, together with
 * summary stats, as one snake_case JSON document when closed.
 */
var JsonWriter = class _JsonWriter {
  filePath;
  results = [];
  closed = false;

  /** @param {string} filePath - Destination of the JSON document. */
  constructor(filePath) {
    this.filePath = filePath;
  }

  /**
   * Creates the parent directory of `filePath` if needed and returns a writer for it.
   * @param {string} filePath
   */
  static async open(filePath) {
    const parentDir = path6.dirname(filePath);
    await mkdir3(parentDir, { recursive: true });
    return new _JsonWriter(filePath);
  }

  /**
   * Queues one result for the final document.
   * @throws {Error} When the writer has already been closed.
   */
  async append(result) {
    if (this.closed) {
      throw new Error("Cannot write to closed JSON writer");
    }
    this.results.push(result);
  }

  /**
   * Writes the stats and all queued results to disk. Calling it again is a no-op.
   * A result counts as passed when its score is at least 0.5.
   */
  async close() {
    if (this.closed) {
      return;
    }
    this.closed = true;
    const total = this.results.length;
    let passed = 0;
    for (const entry of this.results) {
      if (entry.score >= 0.5) {
        passed += 1;
      }
    }
    const stats = {
      total,
      passed,
      failed: total - passed,
      passRate: total > 0 ? passed / total : 0
    };
    const payload = toSnakeCaseDeep({ stats, results: this.results });
    const text = JSON.stringify(payload, null, 2) + "\n";
    await writeFile4(this.filePath, text, "utf8");
  }
};
1430
+
617
1431
  // src/commands/eval/jsonl-writer.ts
1432
+ import { createWriteStream } from "node:fs";
1433
+ import { mkdir as mkdir4 } from "node:fs/promises";
1434
+ import path7 from "node:path";
1435
+ import { finished } from "node:stream/promises";
618
1436
  var JsonlWriter = class _JsonlWriter {
619
1437
  stream;
620
1438
  mutex = new Mutex();
@@ -623,7 +1441,7 @@ var JsonlWriter = class _JsonlWriter {
623
1441
  this.stream = stream;
624
1442
  }
625
1443
  static async open(filePath) {
626
- await mkdir2(path5.dirname(filePath), { recursive: true });
1444
+ await mkdir4(path7.dirname(filePath), { recursive: true });
627
1445
  const stream = createWriteStream(filePath, { flags: "w", encoding: "utf8" });
628
1446
  return new _JsonlWriter(stream);
629
1447
  }
@@ -654,8 +1472,8 @@ var JsonlWriter = class _JsonlWriter {
654
1472
  };
655
1473
 
656
1474
  // src/commands/eval/junit-writer.ts
657
- import { mkdir as mkdir3, writeFile as writeFile2 } from "node:fs/promises";
658
- import path6 from "node:path";
1475
+ import { mkdir as mkdir5, writeFile as writeFile5 } from "node:fs/promises";
1476
+ import path8 from "node:path";
659
1477
  function escapeXml(str) {
660
1478
  return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
661
1479
  }
@@ -667,7 +1485,7 @@ var JunitWriter = class _JunitWriter {
667
1485
  this.filePath = filePath;
668
1486
  }
669
1487
  static async open(filePath) {
670
- await mkdir3(path6.dirname(filePath), { recursive: true });
1488
+ await mkdir5(path8.dirname(filePath), { recursive: true });
671
1489
  return new _JunitWriter(filePath);
672
1490
  }
673
1491
  async append(result) {
@@ -729,14 +1547,14 @@ ${testCases.join("\n")}
729
1547
  ${suiteXmls.join("\n")}
730
1548
  </testsuites>
731
1549
  `;
732
- await writeFile2(this.filePath, xml, "utf8");
1550
+ await writeFile5(this.filePath, xml, "utf8");
733
1551
  }
734
1552
  };
735
1553
 
736
1554
  // src/commands/eval/yaml-writer.ts
737
1555
  import { createWriteStream as createWriteStream2 } from "node:fs";
738
- import { mkdir as mkdir4 } from "node:fs/promises";
739
- import path7 from "node:path";
1556
+ import { mkdir as mkdir6 } from "node:fs/promises";
1557
+ import path9 from "node:path";
740
1558
  import { finished as finished2 } from "node:stream/promises";
741
1559
  import { stringify as stringifyYaml } from "yaml";
742
1560
  var YamlWriter = class _YamlWriter {
@@ -748,7 +1566,7 @@ var YamlWriter = class _YamlWriter {
748
1566
  this.stream = stream;
749
1567
  }
750
1568
  static async open(filePath) {
751
- await mkdir4(path7.dirname(filePath), { recursive: true });
1569
+ await mkdir6(path9.dirname(filePath), { recursive: true });
752
1570
  const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
753
1571
  return new _YamlWriter(stream);
754
1572
  }
@@ -794,6 +1612,8 @@ async function createOutputWriter(filePath, format) {
794
1612
  return JsonlWriter.open(filePath);
795
1613
  case "yaml":
796
1614
  return YamlWriter.open(filePath);
1615
+ case "html":
1616
+ return HtmlWriter.open(filePath);
797
1617
  default: {
798
1618
  const exhaustiveCheck = format;
799
1619
  throw new Error(`Unsupported output format: ${exhaustiveCheck}`);
@@ -806,15 +1626,17 @@ function getDefaultExtension(format) {
806
1626
  return ".jsonl";
807
1627
  case "yaml":
808
1628
  return ".yaml";
1629
+ case "html":
1630
+ return ".html";
809
1631
  default: {
810
1632
  const exhaustiveCheck = format;
811
1633
  throw new Error(`Unsupported output format: ${exhaustiveCheck}`);
812
1634
  }
813
1635
  }
814
1636
  }
815
- var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml"]);
1637
+ var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
816
1638
  function createWriterFromPath(filePath) {
817
- const ext = path8.extname(filePath).toLowerCase();
1639
+ const ext = path10.extname(filePath).toLowerCase();
818
1640
  switch (ext) {
819
1641
  case ".jsonl":
820
1642
  return JsonlWriter.open(filePath);
@@ -825,6 +1647,9 @@ function createWriterFromPath(filePath) {
825
1647
  case ".yaml":
826
1648
  case ".yml":
827
1649
  return YamlWriter.open(filePath);
1650
+ case ".html":
1651
+ case ".htm":
1652
+ return HtmlWriter.open(filePath);
828
1653
  default:
829
1654
  throw new Error(
830
1655
  `Unsupported output file extension "${ext}". Supported: ${[...SUPPORTED_EXTENSIONS].join(", ")}`
@@ -898,12 +1723,12 @@ var ProgressDisplay = class {
898
1723
  }
899
1724
  addLogPaths(paths, provider) {
900
1725
  const newPaths = [];
901
- for (const path11 of paths) {
902
- if (this.logPathSet.has(path11)) {
1726
+ for (const path13 of paths) {
1727
+ if (this.logPathSet.has(path13)) {
903
1728
  continue;
904
1729
  }
905
- this.logPathSet.add(path11);
906
- newPaths.push(path11);
1730
+ this.logPathSet.add(path13);
1731
+ newPaths.push(path13);
907
1732
  }
908
1733
  if (newPaths.length === 0) {
909
1734
  return;
@@ -916,8 +1741,8 @@ var ProgressDisplay = class {
916
1741
  this.hasPrintedLogHeader = true;
917
1742
  }
918
1743
  const startIndex = this.logPaths.length - newPaths.length;
919
- newPaths.forEach((path11, offset) => {
920
- console.log(`${startIndex + offset + 1}. ${path11}`);
1744
+ newPaths.forEach((path13, offset) => {
1745
+ console.log(`${startIndex + offset + 1}. ${path13}`);
921
1746
  });
922
1747
  }
923
1748
  finish() {
@@ -1207,10 +2032,10 @@ function formatMatrixSummary(results) {
1207
2032
  }
1208
2033
 
1209
2034
  // ../../packages/core/dist/evaluation/validation/index.js
1210
- import { readFile } from "node:fs/promises";
1211
- import path9 from "node:path";
1212
- import { parse } from "yaml";
1213
2035
  import { readFile as readFile2 } from "node:fs/promises";
2036
+ import path11 from "node:path";
2037
+ import { parse } from "yaml";
2038
+ import { readFile as readFile22 } from "node:fs/promises";
1214
2039
  import path22 from "node:path";
1215
2040
  import { parse as parse2 } from "yaml";
1216
2041
  import { readFile as readFile3 } from "node:fs/promises";
@@ -1226,7 +2051,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
1226
2051
  var SCHEMA_CONFIG_V2 = "agentv-config-v2";
1227
2052
  async function detectFileType(filePath) {
1228
2053
  try {
1229
- const content = await readFile(filePath, "utf8");
2054
+ const content = await readFile2(filePath, "utf8");
1230
2055
  const parsed = parse(content);
1231
2056
  if (typeof parsed !== "object" || parsed === null) {
1232
2057
  return inferFileTypeFromPath(filePath);
@@ -1251,8 +2076,8 @@ async function detectFileType(filePath) {
1251
2076
  }
1252
2077
  }
1253
2078
  function inferFileTypeFromPath(filePath) {
1254
- const normalized = path9.normalize(filePath).replace(/\\/g, "/");
1255
- const basename = path9.basename(filePath);
2079
+ const normalized = path11.normalize(filePath).replace(/\\/g, "/");
2080
+ const basename = path11.basename(filePath);
1256
2081
  if (normalized.includes("/.agentv/")) {
1257
2082
  if (basename === "config.yaml" || basename === "config.yml") {
1258
2083
  return "config";
@@ -1287,7 +2112,7 @@ async function validateEvalFile(filePath) {
1287
2112
  const absolutePath = path22.resolve(filePath);
1288
2113
  let parsed;
1289
2114
  try {
1290
- const content = await readFile2(absolutePath, "utf8");
2115
+ const content = await readFile22(absolutePath, "utf8");
1291
2116
  parsed = parse2(content);
1292
2117
  } catch (error) {
1293
2118
  errors.push({
@@ -1454,7 +2279,7 @@ async function validateEvalFile(filePath) {
1454
2279
  });
1455
2280
  }
1456
2281
  }
1457
- const assertField = evalCase.assert;
2282
+ const assertField = evalCase.assertions ?? evalCase.assert;
1458
2283
  if (assertField !== void 0) {
1459
2284
  validateAssertArray(assertField, location, absolutePath, errors);
1460
2285
  }
@@ -1625,14 +2450,14 @@ function validateAssertArray(assertField, parentLocation, filePath, errors) {
1625
2450
  errors.push({
1626
2451
  severity: "warning",
1627
2452
  filePath,
1628
- location: `${parentLocation}.assert`,
1629
- message: "'assert' must be an array of assertion objects."
2453
+ location: `${parentLocation}.assertions`,
2454
+ message: "'assertions' must be an array of assertion objects."
1630
2455
  });
1631
2456
  return;
1632
2457
  }
1633
2458
  for (let i = 0; i < assertField.length; i++) {
1634
2459
  const item = assertField[i];
1635
- const location = `${parentLocation}.assert[${i}]`;
2460
+ const location = `${parentLocation}.assertions[${i}]`;
1636
2461
  if (!isObject(item)) {
1637
2462
  errors.push({
1638
2463
  severity: "warning",
@@ -1931,6 +2756,7 @@ function getKnownSettings(provider) {
1931
2756
  return COPILOT_CLI_SETTINGS;
1932
2757
  case "claude":
1933
2758
  case "claude-code":
2759
+ case "claude-cli":
1934
2760
  case "claude-sdk":
1935
2761
  return CLAUDE_SETTINGS;
1936
2762
  case "vscode":
@@ -1950,7 +2776,15 @@ function validateUnknownSettings(target, provider, absolutePath, location, error
1950
2776
  if (!knownSettings) {
1951
2777
  return;
1952
2778
  }
1953
- const baseFields = /* @__PURE__ */ new Set(["name", "provider", "judge_target", "workers", "$schema", "targets"]);
2779
+ const baseFields = /* @__PURE__ */ new Set([
2780
+ "name",
2781
+ "provider",
2782
+ "grader_target",
2783
+ "judge_target",
2784
+ "workers",
2785
+ "$schema",
2786
+ "targets"
2787
+ ]);
1954
2788
  for (const key of Object.keys(target)) {
1955
2789
  if (removedTargetFields.has(key)) {
1956
2790
  errors.push({
@@ -2157,13 +2991,13 @@ async function validateTargetsFile(filePath) {
2157
2991
  if (typeof provider === "string") {
2158
2992
  validateUnknownSettings(target, provider, absolutePath, location, errors);
2159
2993
  }
2160
- const judgeTarget = target.judge_target;
2161
- if (judgeTarget !== void 0 && typeof judgeTarget !== "string") {
2994
+ const graderTarget = target.grader_target ?? target.judge_target;
2995
+ if (graderTarget !== void 0 && typeof graderTarget !== "string") {
2162
2996
  errors.push({
2163
2997
  severity: "error",
2164
2998
  filePath: absolutePath,
2165
- location: `${location}.judge_target`,
2166
- message: "Invalid 'judge_target' field (must be a string)"
2999
+ location: `${location}.grader_target`,
3000
+ message: "Invalid 'grader_target' field (must be a string)"
2167
3001
  });
2168
3002
  }
2169
3003
  }
@@ -2473,7 +3307,7 @@ Errors in ${targetsFilePath}:`);
2473
3307
  const mockTarget = {
2474
3308
  kind: "mock",
2475
3309
  name: `${targetDefinition.name}-dry-run`,
2476
- judgeTarget: void 0,
3310
+ graderTarget: void 0,
2477
3311
  config: {
2478
3312
  response: '{"answer":"Mock dry-run response"}',
2479
3313
  delayMs: dryRunDelay,
@@ -2564,7 +3398,7 @@ Errors in ${targetsFilePath}:`);
2564
3398
  const mockTarget = {
2565
3399
  kind: "mock",
2566
3400
  name: `${targetDefinition.name}-dry-run`,
2567
- judgeTarget: void 0,
3401
+ graderTarget: void 0,
2568
3402
  config: {
2569
3403
  response: '{"answer":"Mock dry-run response"}',
2570
3404
  delayMs: dryRunDelay,
@@ -2719,7 +3553,11 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
2719
3553
  otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
2720
3554
  retryErrors: normalizeString(rawOptions.retryErrors),
2721
3555
  workspaceMode,
2722
- workspacePath
3556
+ workspacePath,
3557
+ benchmarkJson: normalizeString(rawOptions.benchmarkJson),
3558
+ artifacts: normalizeString(rawOptions.artifacts),
3559
+ graderTarget: normalizeString(rawOptions.graderTarget),
3560
+ model: normalizeString(rawOptions.model)
2723
3561
  };
2724
3562
  }
2725
3563
  async function ensureFileExists(filePath, description) {
@@ -2733,7 +3571,7 @@ function buildDefaultOutputPath(cwd, format) {
2733
3571
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2734
3572
  const baseName = "eval";
2735
3573
  const extension = getDefaultExtension(format);
2736
- return path10.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
3574
+ return path12.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
2737
3575
  }
2738
3576
  function createProgressReporter(maxWorkers, options) {
2739
3577
  const display = new ProgressDisplay(maxWorkers, options);
@@ -2747,7 +3585,7 @@ function createProgressReporter(maxWorkers, options) {
2747
3585
  };
2748
3586
  }
2749
3587
  function makeEvalKey(testFilePath, evalId) {
2750
- return `${path10.resolve(testFilePath)}::${evalId}`;
3588
+ return `${path12.resolve(testFilePath)}::${evalId}`;
2751
3589
  }
2752
3590
  function createDisplayIdTracker() {
2753
3591
  const map = /* @__PURE__ */ new Map();
@@ -2952,6 +3790,8 @@ async function runSingleEvalFile(params) {
2952
3790
  trials: trialsConfig,
2953
3791
  totalBudgetUsd,
2954
3792
  failOnError,
3793
+ graderTarget: options.graderTarget,
3794
+ model: options.model,
2955
3795
  streamCallbacks: streamingObserver?.getStreamCallbacks(),
2956
3796
  onResult: async (result) => {
2957
3797
  streamingObserver?.finalizeEvalCase(result.score, result.error);
@@ -3004,16 +3844,19 @@ async function runEvalCommand(input) {
3004
3844
  );
3005
3845
  }
3006
3846
  const repoRoot = await findRepoRoot(cwd);
3007
- const yamlConfig = await loadConfig(path10.join(cwd, "_"), repoRoot);
3847
+ const yamlConfig = await loadConfig(path12.join(cwd, "_"), repoRoot);
3008
3848
  if (yamlConfig?.required_version) {
3009
3849
  await enforceRequiredVersion(yamlConfig.required_version, {
3010
3850
  strict: normalizeBoolean(input.rawOptions.strict)
3011
3851
  });
3012
3852
  }
3013
3853
  let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
3854
+ if (options.graderTarget === "agentv" && !options.model) {
3855
+ throw new Error("--grader-target agentv requires --model (e.g., --model openai:gpt-5-mini)");
3856
+ }
3014
3857
  let retryNonErrorResults;
3015
3858
  if (options.retryErrors) {
3016
- const retryPath = path10.resolve(options.retryErrors);
3859
+ const retryPath = path12.resolve(options.retryErrors);
3017
3860
  await ensureFileExists(retryPath, "Retry-errors JSONL file");
3018
3861
  const errorIds = await loadErrorTestIds(retryPath);
3019
3862
  if (errorIds.length === 0) {
@@ -3026,7 +3869,7 @@ async function runEvalCommand(input) {
3026
3869
  retryNonErrorResults = await loadNonErrorResults(retryPath);
3027
3870
  }
3028
3871
  if (options.workspacePath) {
3029
- const resolvedWorkspace = path10.resolve(options.workspacePath);
3872
+ const resolvedWorkspace = path12.resolve(options.workspacePath);
3030
3873
  try {
3031
3874
  const { stat: stat2 } = await import("node:fs/promises");
3032
3875
  const stats = await stat2(resolvedWorkspace);
@@ -3048,7 +3891,7 @@ async function runEvalCommand(input) {
3048
3891
  const useFileExport = !!(options.otelFile || options.traceFile);
3049
3892
  if (options.exportOtel || useFileExport) {
3050
3893
  try {
3051
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-EDQZMZH2.js");
3894
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-WN2QIOQR.js");
3052
3895
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
3053
3896
  let headers = {};
3054
3897
  if (options.otelBackend) {
@@ -3072,8 +3915,8 @@ async function runEvalCommand(input) {
3072
3915
  headers,
3073
3916
  captureContent,
3074
3917
  groupTurns: options.otelGroupTurns,
3075
- otlpFilePath: options.otelFile ? path10.resolve(options.otelFile) : void 0,
3076
- traceFilePath: options.traceFile ? path10.resolve(options.traceFile) : void 0
3918
+ otlpFilePath: options.otelFile ? path12.resolve(options.otelFile) : void 0,
3919
+ traceFilePath: options.traceFile ? path12.resolve(options.traceFile) : void 0
3077
3920
  });
3078
3921
  const initialized = await otelExporter.init();
3079
3922
  if (!initialized) {
@@ -3089,8 +3932,8 @@ async function runEvalCommand(input) {
3089
3932
  otelExporter = null;
3090
3933
  }
3091
3934
  }
3092
- const outputPath = options.outPath ? path10.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
3093
- const extraOutputPaths = options.outputPaths.map((p) => path10.resolve(p));
3935
+ const outputPath = options.outPath ? path12.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
3936
+ const extraOutputPaths = options.outputPaths.map((p) => path12.resolve(p));
3094
3937
  const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
3095
3938
  const uniqueOutputPaths = [...new Set(allOutputPaths)];
3096
3939
  let outputWriter;
@@ -3104,12 +3947,12 @@ async function runEvalCommand(input) {
3104
3947
  console.log(` ${p}`);
3105
3948
  }
3106
3949
  }
3107
- const resolvedTestFiles = input.testFiles.map((file) => path10.resolve(file));
3950
+ const resolvedTestFiles = input.testFiles.map((file) => path12.resolve(file));
3108
3951
  if (options.otelFile) {
3109
- console.log(`OTLP JSON file: ${path10.resolve(options.otelFile)}`);
3952
+ console.log(`OTLP JSON file: ${path12.resolve(options.otelFile)}`);
3110
3953
  }
3111
3954
  if (options.traceFile) {
3112
- console.log(`Trace file: ${path10.resolve(options.traceFile)}`);
3955
+ console.log(`Trace file: ${path12.resolve(options.traceFile)}`);
3113
3956
  }
3114
3957
  const evaluationRunner = await resolveEvaluationRunner();
3115
3958
  const allResults = [];
@@ -3122,7 +3965,23 @@ async function runEvalCommand(input) {
3122
3965
  );
3123
3966
  const perFileWorkers = options.workers ? Math.max(1, Math.floor(totalWorkers / fileConcurrency)) : void 0;
3124
3967
  const fileMetadata = /* @__PURE__ */ new Map();
3968
+ const tsFiles = [];
3969
+ const yamlFiles = [];
3125
3970
  for (const testFilePath of resolvedTestFiles) {
3971
+ if (/\.(ts|js|mts|mjs)$/.test(testFilePath)) {
3972
+ tsFiles.push(testFilePath);
3973
+ } else {
3974
+ yamlFiles.push(testFilePath);
3975
+ }
3976
+ }
3977
+ for (const tsFile of tsFiles) {
3978
+ await ensureFileExists(tsFile, "TypeScript eval file");
3979
+ await import(pathToFileURL(tsFile).href);
3980
+ }
3981
+ if (yamlFiles.length === 0 && tsFiles.length > 0) {
3982
+ return;
3983
+ }
3984
+ for (const testFilePath of yamlFiles) {
3126
3985
  const meta = await prepareFileMetadata({
3127
3986
  testFilePath,
3128
3987
  repoRoot,
@@ -3139,7 +3998,7 @@ async function runEvalCommand(input) {
3139
3998
  cliNoCache: options.noCache,
3140
3999
  yamlCache: yamlCacheEnabled
3141
4000
  });
3142
- const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path10.resolve(yamlCachePath) : void 0) : void 0;
4001
+ const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path12.resolve(yamlCachePath) : void 0) : void 0;
3143
4002
  const useCache = cacheEnabled;
3144
4003
  if (cacheEnabled) {
3145
4004
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
@@ -3269,6 +4128,24 @@ async function runEvalCommand(input) {
3269
4128
  if (isMatrixMode && allResults.length > 0) {
3270
4129
  console.log(formatMatrixSummary(allResults));
3271
4130
  }
4131
+ if (options.benchmarkJson && allResults.length > 0) {
4132
+ const benchmarkPath = path12.resolve(options.benchmarkJson);
4133
+ await writeBenchmarkJson(benchmarkPath, allResults);
4134
+ console.log(`Benchmark written to: ${benchmarkPath}`);
4135
+ }
4136
+ if (options.artifacts && allResults.length > 0) {
4137
+ const artifactsDir = path12.resolve(options.artifacts);
4138
+ const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
4139
+ const {
4140
+ gradingDir,
4141
+ timingPath,
4142
+ benchmarkPath: abp
4143
+ } = await writeArtifactsFromResults(allResults, artifactsDir, { evalFile });
4144
+ console.log(`Artifacts written to: ${artifactsDir}`);
4145
+ console.log(` Grading: ${gradingDir} (${allResults.length} files)`);
4146
+ console.log(` Timing: ${timingPath}`);
4147
+ console.log(` Benchmark: ${abp}`);
4148
+ }
3272
4149
  const failedWithWorkspaces = allResults.filter(
3273
4150
  (r) => r.workspacePath && (r.error || r.score < 0.5)
3274
4151
  );
@@ -3308,7 +4185,7 @@ async function resolveEvaluationRunner() {
3308
4185
  if (!overridePath) {
3309
4186
  return runEvaluation;
3310
4187
  }
3311
- const resolved = path10.isAbsolute(overridePath) ? overridePath : path10.resolve(process.cwd(), overridePath);
4188
+ const resolved = path12.isAbsolute(overridePath) ? overridePath : path12.resolve(process.cwd(), overridePath);
3312
4189
  const moduleUrl = pathToFileURL(resolved).href;
3313
4190
  const mod = await import(moduleUrl);
3314
4191
  const candidate = mod.runEvaluation;
@@ -3323,6 +4200,7 @@ async function resolveEvaluationRunner() {
3323
4200
  export {
3324
4201
  package_default,
3325
4202
  toSnakeCaseDeep,
4203
+ HtmlWriter,
3326
4204
  resolveEvalPaths,
3327
4205
  findRepoRoot,
3328
4206
  detectFileType,
@@ -3335,4 +4213,4 @@ export {
3335
4213
  selectTarget,
3336
4214
  runEvalCommand
3337
4215
  };
3338
- //# sourceMappingURL=chunk-KSUL3F3R.js.map
4216
+ //# sourceMappingURL=chunk-DY4ZDTTO.js.map