agentv 3.3.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,12 +25,12 @@ import {
25
25
  subscribeToCopilotCliLogEntries,
26
26
  subscribeToCopilotSdkLogEntries,
27
27
  subscribeToPiLogEntries
28
- } from "./chunk-5M3K2DMV.js";
28
+ } from "./chunk-GOZV2HN2.js";
29
29
 
30
30
  // package.json
31
31
  var package_default = {
32
32
  name: "agentv",
33
- version: "3.3.0",
33
+ version: "3.4.0",
34
34
  description: "CLI entry point for AgentV",
35
35
  type: "module",
36
36
  repository: {
@@ -320,19 +320,19 @@ function parseWorkspaceChanges(fileChanges) {
320
320
  diff_summary: diffSummary
321
321
  };
322
322
  }
323
- function buildExpectations(result) {
324
- const expectations = [];
323
+ function buildAssertions(result) {
324
+ const assertions = [];
325
325
  if (result.scores && result.scores.length > 0) {
326
326
  for (const evaluator of result.scores) {
327
327
  for (const hit of evaluator.hits) {
328
- expectations.push({
328
+ assertions.push({
329
329
  text: hit,
330
330
  passed: true,
331
331
  evidence: evaluator.reasoning ?? ""
332
332
  });
333
333
  }
334
334
  for (const miss of evaluator.misses) {
335
- expectations.push({
335
+ assertions.push({
336
336
  text: miss,
337
337
  passed: false,
338
338
  evidence: evaluator.reasoning ?? ""
@@ -341,13 +341,13 @@ function buildExpectations(result) {
341
341
  }
342
342
  } else {
343
343
  for (const hit of result.hits) {
344
- expectations.push({ text: hit, passed: true, evidence: result.reasoning ?? "" });
344
+ assertions.push({ text: hit, passed: true, evidence: result.reasoning ?? "" });
345
345
  }
346
346
  for (const miss of result.misses) {
347
- expectations.push({ text: miss, passed: false, evidence: result.reasoning ?? "" });
347
+ assertions.push({ text: miss, passed: false, evidence: result.reasoning ?? "" });
348
348
  }
349
349
  }
350
- return expectations;
350
+ return assertions;
351
351
  }
352
352
  function buildEvaluators(scores) {
353
353
  if (!scores || scores.length === 0) {
@@ -366,14 +366,14 @@ function buildEvaluators(scores) {
366
366
  }));
367
367
  }
368
368
  function buildGradingArtifact(result) {
369
- const expectations = buildExpectations(result);
370
- const passed = expectations.filter((e) => e.passed).length;
371
- const failed = expectations.filter((e) => !e.passed).length;
372
- const total = expectations.length;
369
+ const assertions = buildAssertions(result);
370
+ const passed = assertions.filter((e) => e.passed).length;
371
+ const failed = assertions.filter((e) => !e.passed).length;
372
+ const total = assertions.length;
373
373
  const { toolCalls, total: totalToolCalls } = countToolCalls(result);
374
374
  const errorsEncountered = result.error ? 1 : 0;
375
375
  return {
376
- expectations,
376
+ assertions,
377
377
  summary: {
378
378
  passed,
379
379
  failed,
@@ -496,6 +496,42 @@ function buildBenchmarkArtifact(results, evalFile = "") {
496
496
  notes
497
497
  };
498
498
  }
499
/**
 * Converts a snake_case identifier to camelCase.
 *
 * Every underscore that is immediately followed by a lowercase ASCII letter is
 * dropped and that letter is upper-cased. Underscores followed by anything
 * else (digit, uppercase letter, another underscore, end of string) are kept.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier.
 */
function toCamelCase(str) {
  const upperCaseCapturedLetter = (_match, letter) => letter.toUpperCase();
  return str.replace(/_([a-z])/g, upperCaseCapturedLetter);
}
502
/**
 * Recursively rewrites every object key in a value with `toCamelCase`.
 *
 * Arrays are mapped element by element, objects are rebuilt as fresh plain
 * objects with converted keys, and everything else (primitives, `null`,
 * `undefined`) is returned unchanged. The input is never mutated.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} A converted copy for arrays/objects, otherwise `obj` itself.
 */
function toCamelCaseDeep(obj) {
  if (Array.isArray(obj)) {
    return obj.map((element) => toCamelCaseDeep(element));
  }
  // Covers null, undefined and every primitive in one guard.
  if (obj === null || typeof obj !== "object") {
    return obj;
  }
  return Object.entries(obj).reduce((converted, [rawKey, rawValue]) => {
    converted[toCamelCase(rawKey)] = toCamelCaseDeep(rawValue);
    return converted;
  }, {});
}
518
/**
 * Parses JSONL text into an array of result records with camelCase keys.
 *
 * Parsing is best-effort: blank lines and lines that are not valid JSON are
 * skipped rather than aborting the whole file. Lines that hold valid JSON but
 * not a JSON object (`null`, numbers, strings, arrays) are skipped as well,
 * because a result record must be an object for callers to read fields from.
 *
 * @param {string} content - Raw JSONL text, one JSON document per line.
 * @returns {object[]} Parsed records, in file order, passed through
 *   `toCamelCaseDeep`.
 */
function parseJsonlResults(content) {
  const results = [];
  for (const line of content.split("\n")) {
    // trim() also strips a trailing "\r" from CRLF files.
    const trimmed = line.trim();
    if (trimmed.length === 0) {
      continue;
    }
    try {
      const parsed = JSON.parse(trimmed);
      const isRecord = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
      if (!isRecord) {
        continue;
      }
      results.push(toCamelCaseDeep(parsed));
    } catch {
      // Deliberately tolerant: one unreadable line must not discard the rest.
    }
  }
  return results;
}
499
535
  async function writeArtifactsFromResults(results, outputDir, options) {
500
536
  const gradingDir = path3.join(outputDir, "grading");
501
537
  const timingPath = path3.join(outputDir, "timing.json");
@@ -1673,6 +1709,24 @@ async function createMultiWriter(filePaths) {
1673
1709
  }
1674
1710
 
1675
1711
  // src/commands/eval/progress-display.ts
1712
+ var ANSI_BOLD = "\x1B[1m";
1713
+ var ANSI_GREEN = "\x1B[32m";
1714
+ var ANSI_RED2 = "\x1B[31m";
1715
+ var ANSI_YELLOW2 = "\x1B[33m";
1716
+ var ANSI_RESET2 = "\x1B[0m";
1717
/**
 * Decides whether ANSI colour codes should be written to stdout.
 *
 * Colours are off whenever the `NO_COLOR` environment variable is defined
 * (with any value, including the empty string). Otherwise the result is
 * `process.stdout.isTTY`, falling back to `false` when that is not set.
 *
 * @returns {boolean} `true` when coloured output should be used.
 */
function useColors() {
  const noColorRequested = process.env.NO_COLOR !== undefined;
  return noColorRequested ? false : (process.stdout.isTTY ?? false);
}
1721
/**
 * Builds the ` | <score> <verdict>` suffix appended to a progress line.
 *
 * - No verdict: returns an empty string so the line is left unchanged.
 * - Verdict `"ERROR"`: the label is just `ERROR`; the score is not shown.
 * - Any other verdict: the label is the score to three decimals followed by
 *   the verdict. When the score is missing or not a number the label is the
 *   verdict alone, so no stray space is printed and `toFixed` is never called
 *   on a non-number such as `null`.
 *
 * When colours are enabled the label is bold and green for `"PASS"`, red for
 * `"FAIL"`, and yellow for anything else.
 *
 * @param {number|undefined} score - Test score, if one was produced.
 * @param {string|undefined} verdict - Verdict label, e.g. "PASS", "FAIL", "ERROR".
 * @returns {string} The suffix to append, or `""` when there is no verdict.
 */
function formatVerdict(score, verdict) {
  if (verdict === undefined) return "";
  const colors = useColors();
  const scoreStr = typeof score === "number" ? score.toFixed(3) : "";
  const showScore = verdict !== "ERROR" && scoreStr !== "";
  const verdictLabel = showScore ? `${scoreStr} ${verdict}` : `${verdict}`;
  if (!colors) return ` | ${verdictLabel}`;
  const color = verdict === "PASS" ? ANSI_GREEN : verdict === "FAIL" ? ANSI_RED2 : ANSI_YELLOW2;
  return ` | ${color}${ANSI_BOLD}${verdictLabel}${ANSI_RESET2}`;
}
1676
1730
  var ProgressDisplay = class {
1677
1731
  workers = /* @__PURE__ */ new Map();
1678
1732
  totalTests = 0;
@@ -1716,11 +1770,13 @@ var ProgressDisplay = class {
1716
1770
  }
1717
1771
  break;
1718
1772
  case "completed":
1719
- console.log(`${countPrefix} \u2705 ${progress.testId}${targetSuffix}`);
1773
+ console.log(
1774
+ `${countPrefix} \u2705 ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`
1775
+ );
1720
1776
  break;
1721
1777
  case "failed":
1722
1778
  console.log(
1723
- `${countPrefix} \u274C ${progress.testId}${targetSuffix}${progress.error ? `: ${progress.error}` : ""}`
1779
+ `${countPrefix} \u274C ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ""}`
1724
1780
  );
1725
1781
  break;
1726
1782
  }
@@ -1760,6 +1816,22 @@ var ProgressDisplay = class {
1760
1816
  // src/commands/eval/retry-errors.ts
1761
1817
  import { createReadStream } from "node:fs";
1762
1818
  import { createInterface } from "node:readline";
1819
/**
 * Reads the test identifier from a result record in either key style.
 *
 * @param {object} result - Result record using `testId` or `test_id`.
 * @returns {*} `result.testId` unless it is null/undefined, in which case
 *   `result.test_id` (which may itself be undefined).
 */
function getTestId(result) {
  const camelCaseId = result.testId;
  if (camelCaseId !== null && camelCaseId !== undefined) {
    return camelCaseId;
  }
  return result.test_id;
}
1822
/**
 * Reads the execution status from a result record in either key style.
 *
 * @param {object} result - Result record using `executionStatus` or
 *   `execution_status`.
 * @returns {*} `result.executionStatus` unless it is null/undefined, in which
 *   case `result.execution_status` (which may itself be undefined).
 */
function getExecutionStatus(result) {
  const camelCaseStatus = result.executionStatus;
  if (camelCaseStatus !== null && camelCaseStatus !== undefined) {
    return camelCaseStatus;
  }
  return result.execution_status;
}
1825
/**
 * Ensures a result record exposes `testId` and `executionStatus`.
 *
 * A record that already defines both keys is returned as-is (same object).
 * Otherwise a shallow copy is returned with `testId` (defaulting to `""`) and
 * `executionStatus` filled in from whichever key style the record uses; all
 * original properties, including any snake_case ones, are kept on the copy.
 *
 * @param {object} result - Parsed result record.
 * @returns {object} The original record or a normalized shallow copy.
 */
function toEvaluationResult(result) {
  const needsNormalizing = result.testId === undefined || result.executionStatus === undefined;
  if (!needsNormalizing) {
    return result;
  }
  const normalized = { ...result };
  normalized.testId = getTestId(result) ?? "";
  normalized.executionStatus = getExecutionStatus(result);
  return normalized;
}
1763
1835
  async function loadErrorTestIds(jsonlPath) {
1764
1836
  const ids = [];
1765
1837
  const rl = createInterface({
@@ -1771,8 +1843,10 @@ async function loadErrorTestIds(jsonlPath) {
1771
1843
  if (!trimmed) continue;
1772
1844
  try {
1773
1845
  const parsed = JSON.parse(trimmed);
1774
- if (parsed.executionStatus === "execution_error" && parsed.testId) {
1775
- ids.push(parsed.testId);
1846
+ const executionStatus = getExecutionStatus(parsed);
1847
+ const testId = getTestId(parsed);
1848
+ if (executionStatus === "execution_error" && testId) {
1849
+ ids.push(testId);
1776
1850
  }
1777
1851
  } catch {
1778
1852
  }
@@ -1790,9 +1864,11 @@ async function loadNonErrorResults(jsonlPath) {
1790
1864
  if (!trimmed) continue;
1791
1865
  try {
1792
1866
  const parsed = JSON.parse(trimmed);
1793
- if (!parsed.testId || parsed.score === void 0) continue;
1794
- if (parsed.executionStatus !== "execution_error") {
1795
- results.push(parsed);
1867
+ const testId = getTestId(parsed);
1868
+ const executionStatus = getExecutionStatus(parsed);
1869
+ if (!testId || parsed.score === void 0) continue;
1870
+ if (executionStatus !== "execution_error") {
1871
+ results.push(toEvaluationResult(parsed));
1796
1872
  }
1797
1873
  } catch {
1798
1874
  }
@@ -1936,7 +2012,19 @@ function formatEvaluationSummary(summary) {
1936
2012
  }
1937
2013
  lines.push("");
1938
2014
  }
2015
+ const overallPassed = summary.passedCount === summary.total - summary.executionErrorCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0;
2016
+ const overallVerdict = overallPassed ? "PASS" : "FAIL";
2017
+ const useColor = !(process.env.NO_COLOR !== void 0) && (process.stdout.isTTY ?? false);
2018
+ const verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
2019
+ const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} passed, mean score: ${formatScore(summary.mean)})`;
1939
2020
  lines.push("\n==================================================");
2021
+ if (useColor) {
2022
+ lines.push(`\x1B[1m${verdictColor}${verdictText}\x1B[0m`);
2023
+ } else {
2024
+ lines.push(verdictText);
2025
+ }
2026
+ lines.push("==================================================");
2027
+ lines.push("");
1940
2028
  lines.push("EVALUATION SUMMARY");
1941
2029
  lines.push("==================================================");
1942
2030
  lines.push(`Total tests: ${summary.total}`);
@@ -3292,9 +3380,9 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
3292
3380
  }
3293
3381
 
3294
3382
  // src/commands/eval/targets.ts
3295
- var ANSI_YELLOW2 = "\x1B[33m";
3296
- var ANSI_RED2 = "\x1B[31m";
3297
- var ANSI_RESET2 = "\x1B[0m";
3383
+ var ANSI_YELLOW3 = "\x1B[33m";
3384
+ var ANSI_RED3 = "\x1B[31m";
3385
+ var ANSI_RESET3 = "\x1B[0m";
3298
3386
  function isTTY() {
3299
3387
  return process.stdout.isTTY ?? false;
3300
3388
  }
@@ -3334,14 +3422,14 @@ async function selectTarget(options) {
3334
3422
  });
3335
3423
  const validationResult = await validateTargetsFile(targetsFilePath);
3336
3424
  const warnings = validationResult.errors.filter((e) => e.severity === "warning");
3337
- const useColors = isTTY();
3425
+ const useColors2 = isTTY();
3338
3426
  if (warnings.length > 0) {
3339
3427
  console.warn(`
3340
3428
  Warnings in ${targetsFilePath}:`);
3341
3429
  for (const warning of warnings) {
3342
3430
  const location = warning.location ? ` [${warning.location}]` : "";
3343
- const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
3344
- const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
3431
+ const prefix = useColors2 ? `${ANSI_YELLOW3} \u26A0${ANSI_RESET3}` : " \u26A0";
3432
+ const message = useColors2 ? `${ANSI_YELLOW3}${warning.message}${ANSI_RESET3}` : warning.message;
3345
3433
  console.warn(`${prefix}${location} ${message}`);
3346
3434
  }
3347
3435
  console.warn("");
@@ -3352,8 +3440,8 @@ Warnings in ${targetsFilePath}:`);
3352
3440
  Errors in ${targetsFilePath}:`);
3353
3441
  for (const error of errors) {
3354
3442
  const location = error.location ? ` [${error.location}]` : "";
3355
- const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
3356
- const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
3443
+ const prefix = useColors2 ? `${ANSI_RED3} \u2717${ANSI_RESET3}` : " \u2717";
3444
+ const message = useColors2 ? `${ANSI_RED3}${error.message}${ANSI_RESET3}` : error.message;
3357
3445
  console.error(`${prefix}${location} ${message}`);
3358
3446
  }
3359
3447
  throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
@@ -3425,14 +3513,14 @@ async function selectMultipleTargets(options) {
3425
3513
  });
3426
3514
  const validationResult = await validateTargetsFile(targetsFilePath);
3427
3515
  const warnings = validationResult.errors.filter((e) => e.severity === "warning");
3428
- const useColors = isTTY();
3516
+ const useColors2 = isTTY();
3429
3517
  if (warnings.length > 0) {
3430
3518
  console.warn(`
3431
3519
  Warnings in ${targetsFilePath}:`);
3432
3520
  for (const warning of warnings) {
3433
3521
  const location = warning.location ? ` [${warning.location}]` : "";
3434
- const prefix = useColors ? `${ANSI_YELLOW2} \u26A0${ANSI_RESET2}` : " \u26A0";
3435
- const message = useColors ? `${ANSI_YELLOW2}${warning.message}${ANSI_RESET2}` : warning.message;
3522
+ const prefix = useColors2 ? `${ANSI_YELLOW3} \u26A0${ANSI_RESET3}` : " \u26A0";
3523
+ const message = useColors2 ? `${ANSI_YELLOW3}${warning.message}${ANSI_RESET3}` : warning.message;
3436
3524
  console.warn(`${prefix}${location} ${message}`);
3437
3525
  }
3438
3526
  console.warn("");
@@ -3443,8 +3531,8 @@ Warnings in ${targetsFilePath}:`);
3443
3531
  Errors in ${targetsFilePath}:`);
3444
3532
  for (const error of errors) {
3445
3533
  const location = error.location ? ` [${error.location}]` : "";
3446
- const prefix = useColors ? `${ANSI_RED2} \u2717${ANSI_RESET2}` : " \u2717";
3447
- const message = useColors ? `${ANSI_RED2}${error.message}${ANSI_RESET2}` : error.message;
3534
+ const prefix = useColors2 ? `${ANSI_RED3} \u2717${ANSI_RESET3}` : " \u2717";
3535
+ const message = useColors2 ? `${ANSI_RED3}${error.message}${ANSI_RESET3}` : error.message;
3448
3536
  console.error(`${prefix}${location} ${message}`);
3449
3537
  }
3450
3538
  throw new Error(`Targets file validation failed with ${errors.length} error(s)`);
@@ -3737,13 +3825,10 @@ async function prepareFileMetadata(params) {
3737
3825
  env: process.env,
3738
3826
  targetNames
3739
3827
  });
3740
- selections = multiSelections.map((sel) => {
3741
- const providerLabel = options.dryRun ? `${sel.resolvedTarget.kind} (dry-run)` : sel.resolvedTarget.kind;
3742
- return {
3743
- selection: sel,
3744
- inlineTargetLabel: `${sel.targetName} ${buildTargetLabelSuffix(providerLabel, sel.resolvedTarget)}`
3745
- };
3746
- });
3828
+ selections = multiSelections.map((sel) => ({
3829
+ selection: sel,
3830
+ inlineTargetLabel: sel.targetName
3831
+ }));
3747
3832
  } else {
3748
3833
  const selection = await selectTarget({
3749
3834
  testFilePath,
@@ -3757,11 +3842,10 @@ async function prepareFileMetadata(params) {
3757
3842
  dryRunDelayMax: options.dryRunDelayMax,
3758
3843
  env: process.env
3759
3844
  });
3760
- const providerLabel = options.dryRun ? `${selection.resolvedTarget.kind} (dry-run)` : selection.resolvedTarget.kind;
3761
3845
  selections = [
3762
3846
  {
3763
3847
  selection,
3764
- inlineTargetLabel: `${selection.targetName} ${buildTargetLabelSuffix(providerLabel, selection.resolvedTarget)}`
3848
+ inlineTargetLabel: selection.targetName
3765
3849
  }
3766
3850
  ];
3767
3851
  }
@@ -3902,6 +3986,10 @@ async function runSingleEvalFile(params) {
3902
3986
  if (event.status === "running" && streamingObserver) {
3903
3987
  streamingObserver.startEvalCase(event.testId, targetName, testFilePath);
3904
3988
  }
3989
+ let verdict;
3990
+ if (event.executionStatus === "ok") verdict = "PASS";
3991
+ else if (event.executionStatus === "quality_failure") verdict = "FAIL";
3992
+ else if (event.executionStatus === "execution_error") verdict = "ERROR";
3905
3993
  progressReporter.update(displayId, {
3906
3994
  workerId: displayId,
3907
3995
  testId: matrixMode ? `${event.testId}@${targetName}` : event.testId,
@@ -3909,7 +3997,9 @@ async function runSingleEvalFile(params) {
3909
3997
  startedAt: event.startedAt,
3910
3998
  completedAt: event.completedAt,
3911
3999
  error: event.error,
3912
- targetLabel: inlineTargetLabel
4000
+ targetLabel: inlineTargetLabel,
4001
+ score: event.score,
4002
+ verdict
3913
4003
  });
3914
4004
  }
3915
4005
  });
@@ -3973,7 +4063,7 @@ async function runEvalCommand(input) {
3973
4063
  const useFileExport = !!(options.otelFile || options.traceFile);
3974
4064
  if (options.exportOtel || useFileExport) {
3975
4065
  try {
3976
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OC53WD3P.js");
4066
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-AFDYFH6Y.js");
3977
4067
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
3978
4068
  let headers = {};
3979
4069
  if (options.otelBackend) {
@@ -4248,6 +4338,22 @@ Results written to: ${outputPath}`);
4248
4338
  }
4249
4339
  }
4250
4340
  }
4341
+ if (summary.executionErrorCount > 0 && !options.retryErrors) {
4342
+ const evalFileArgs = resolvedTestFiles.map((f) => path12.relative(cwd, f)).join(" ");
4343
+ const targetFlag = options.target ? ` --target ${options.target}` : "";
4344
+ const relativeOutputPath = path12.relative(cwd, outputPath);
4345
+ console.log(
4346
+ `
4347
+ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
4348
+ agentv eval run ${evalFileArgs}${targetFlag} --retry-errors ${relativeOutputPath} -o ${relativeOutputPath}`
4349
+ );
4350
+ }
4351
+ return {
4352
+ executionErrorCount: summary.executionErrorCount,
4353
+ outputPath,
4354
+ testFiles: resolvedTestFiles,
4355
+ target: options.target
4356
+ };
4251
4357
  } finally {
4252
4358
  unsubscribeCodexLogs();
4253
4359
  unsubscribePiLogs();
@@ -4285,6 +4391,10 @@ export {
4285
4391
  HtmlWriter,
4286
4392
  resolveEvalPaths,
4287
4393
  findRepoRoot,
4394
+ buildGradingArtifact,
4395
+ buildTimingArtifact,
4396
+ buildBenchmarkArtifact,
4397
+ parseJsonlResults,
4288
4398
  detectFileType,
4289
4399
  validateEvalFile,
4290
4400
  validateTargetsFile,
@@ -4295,4 +4405,4 @@ export {
4295
4405
  selectTarget,
4296
4406
  runEvalCommand
4297
4407
  };
4298
- //# sourceMappingURL=chunk-4ZMSAQWS.js.map
4408
+ //# sourceMappingURL=chunk-RE5I3U2S.js.map