agentv 4.6.1 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,8 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
2
2
  import {
3
3
  CLI_PLACEHOLDERS,
4
4
  COMMON_TARGET_SETTINGS,
5
+ DEFAULT_EVAL_PATTERNS,
6
+ DEFAULT_THRESHOLD,
5
7
  KNOWN_PROVIDERS,
6
8
  PROVIDER_ALIASES,
7
9
  ResponseCache,
@@ -9,6 +11,7 @@ import {
9
11
  buildSearchRoots,
10
12
  deriveCategory,
11
13
  ensureVSCodeSubagents,
14
+ findDeprecatedCamelCaseTargetWarnings,
12
15
  findGitRoot,
13
16
  interpolateEnv,
14
17
  isEvaluatorKind,
@@ -29,12 +32,12 @@ import {
29
32
  subscribeToCopilotCliLogEntries,
30
33
  subscribeToCopilotSdkLogEntries,
31
34
  subscribeToPiLogEntries
32
- } from "./chunk-YXXD27OK.js";
35
+ } from "./chunk-H4GQXK5M.js";
33
36
 
34
37
  // package.json
35
38
  var package_default = {
36
39
  name: "agentv",
37
- version: "4.6.1",
40
+ version: "4.8.0",
38
41
  description: "CLI entry point for AgentV",
39
42
  type: "module",
40
43
  repository: {
@@ -346,6 +349,9 @@ function buildDefaultRunDir(cwd) {
346
349
  function resolveRunIndexPath(runDir) {
347
350
  return path3.join(runDir, RESULT_INDEX_FILENAME);
348
351
  }
352
+ function isRunManifestPath(filePath) {
353
+ return path3.basename(filePath) === RESULT_INDEX_FILENAME;
354
+ }
349
355
  function resolveExistingRunPrimaryPath(runDir) {
350
356
  const indexPath = resolveRunIndexPath(runDir);
351
357
  if (existsSync(indexPath)) {
@@ -370,9 +376,19 @@ function resolveWorkspaceOrFilePath(filePath) {
370
376
  }
371
377
  return existing;
372
378
  }
379
+ function resolveRunManifestPath(filePath) {
380
+ if (isDirectoryPath(filePath)) {
381
+ return resolveWorkspaceOrFilePath(filePath);
382
+ }
383
+ if (!isRunManifestPath(filePath)) {
384
+ throw new Error(
385
+ `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
386
+ );
387
+ }
388
+ return filePath;
389
+ }
373
390
 
374
391
  // src/commands/eval/artifact-writer.ts
375
- var PASS_THRESHOLD = 0.8;
376
392
  function computeStats(values) {
377
393
  if (values.length === 0) {
378
394
  return { mean: 0, stddev: 0 };
@@ -387,10 +403,10 @@ function computeStats(values) {
387
403
  function computePassRate(result) {
388
404
  const scores = result.scores;
389
405
  if (scores && scores.length > 0) {
390
- const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
406
+ const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
391
407
  return passed / scores.length;
392
408
  }
393
- return (result.score ?? 0) >= PASS_THRESHOLD ? 1 : 0;
409
+ return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
394
410
  }
395
411
  function countToolCalls(result) {
396
412
  const toolCalls = {};
@@ -596,12 +612,12 @@ function safeArtifactPathSegment(value, fallback) {
596
612
  function safeTestId(testId) {
597
613
  return safeArtifactPathSegment(testId, "unknown");
598
614
  }
599
- function getDataset(result) {
600
- return result.dataset;
615
+ function getSuite(result) {
616
+ return result.suite;
601
617
  }
602
618
  function buildArtifactSubdir(result) {
603
619
  const segments = [];
604
- const evalSet = getDataset(result);
620
+ const evalSet = getSuite(result);
605
621
  if (evalSet) {
606
622
  segments.push(safeArtifactPathSegment(evalSet, "default"));
607
623
  }
@@ -628,7 +644,7 @@ function buildResultIndexArtifact(result) {
628
644
  return {
629
645
  timestamp: result.timestamp,
630
646
  test_id: result.testId ?? "unknown",
631
- dataset: getDataset(result),
647
+ suite: getSuite(result),
632
648
  category: result.category,
633
649
  conversation_id: result.conversationId,
634
650
  score: result.score,
@@ -651,42 +667,6 @@ async function writeJsonlFile(filePath, records) {
651
667
  `;
652
668
  await writeFile(filePath, content, "utf8");
653
669
  }
654
- function toCamelCase(str) {
655
- return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
656
- }
657
- function toCamelCaseDeep(obj) {
658
- if (obj === null || obj === void 0) {
659
- return obj;
660
- }
661
- if (Array.isArray(obj)) {
662
- return obj.map((item) => toCamelCaseDeep(item));
663
- }
664
- if (typeof obj === "object") {
665
- const result = {};
666
- for (const [key, value] of Object.entries(obj)) {
667
- result[toCamelCase(key)] = toCamelCaseDeep(value);
668
- }
669
- return result;
670
- }
671
- return obj;
672
- }
673
- function parseJsonlResults(content) {
674
- const results = [];
675
- const lines = content.split("\n");
676
- for (const line of lines) {
677
- const trimmed = line.trim();
678
- if (trimmed.length === 0) {
679
- continue;
680
- }
681
- try {
682
- const parsed = JSON.parse(trimmed);
683
- const camelCased = toCamelCaseDeep(parsed);
684
- results.push(camelCased);
685
- } catch {
686
- }
687
- }
688
- return results;
689
- }
690
670
  async function writeArtifactsFromResults(results, outputDir, options) {
691
671
  const testArtifactDir = outputDir;
692
672
  const timingPath = path4.join(outputDir, "timing.json");
@@ -733,7 +713,6 @@ async function writeArtifactsFromResults(results, outputDir, options) {
733
713
 
734
714
  // src/commands/eval/benchmark-writer.ts
735
715
  import { writeFile as writeFile2 } from "node:fs/promises";
736
- var PASS_THRESHOLD2 = 0.8;
737
716
  function computeStats2(values) {
738
717
  if (values.length === 0) {
739
718
  return { mean: 0, stddev: 0 };
@@ -748,10 +727,10 @@ function computeStats2(values) {
748
727
  function computePassRate2(result) {
749
728
  const scores = result.scores;
750
729
  if (scores && scores.length > 0) {
751
- const passed = scores.filter((s) => s.score >= PASS_THRESHOLD2).length;
730
+ const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
752
731
  return passed / scores.length;
753
732
  }
754
- return result.score >= PASS_THRESHOLD2 ? 1 : 0;
733
+ return result.score >= DEFAULT_THRESHOLD ? 1 : 0;
755
734
  }
756
735
  function buildBenchmarkJson(results) {
757
736
  const passRates = results.map(computePassRate2);
@@ -1698,7 +1677,7 @@ var JunitWriter = class _JunitWriter {
1698
1677
  this.closed = true;
1699
1678
  const grouped = /* @__PURE__ */ new Map();
1700
1679
  for (const result of this.results) {
1701
- const suite = result.dataset ?? "default";
1680
+ const suite = result.suite ?? "default";
1702
1681
  const existing = grouped.get(suite);
1703
1682
  if (existing) {
1704
1683
  existing.push(result);
@@ -1708,14 +1687,17 @@ var JunitWriter = class _JunitWriter {
1708
1687
  }
1709
1688
  const suiteXmls = [];
1710
1689
  for (const [suiteName, results] of grouped) {
1711
- const failures = results.filter((r) => r.score < this.threshold).length;
1712
- const errors = results.filter((r) => r.error !== void 0).length;
1690
+ const errors = results.filter((r) => r.executionStatus === "execution_error").length;
1691
+ const failures = results.filter(
1692
+ (r) => r.executionStatus !== "execution_error" && r.score < this.threshold
1693
+ ).length;
1713
1694
  const testCases = results.map((r) => {
1714
1695
  const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
1715
1696
  let inner = "";
1716
- if (r.error) {
1697
+ if (r.executionStatus === "execution_error") {
1698
+ const errorMsg = r.error ?? "Execution error";
1717
1699
  inner = `
1718
- <error message="${escapeXml(r.error)}">${escapeXml(r.error)}</error>
1700
+ <error message="${escapeXml(errorMsg)}">${escapeXml(errorMsg)}</error>
1719
1701
  `;
1720
1702
  } else if (r.score < this.threshold) {
1721
1703
  const message = `score=${r.score.toFixed(3)}`;
@@ -1730,17 +1712,21 @@ var JunitWriter = class _JunitWriter {
1730
1712
  }
1731
1713
  return ` <testcase name="${escapeXml(r.testId)}" classname="${escapeXml(suiteName)}" time="${time}">${inner}</testcase>`;
1732
1714
  });
1715
+ const suiteTime = results.reduce((sum, r) => sum + (r.durationMs ?? 0), 0) / 1e3;
1733
1716
  suiteXmls.push(
1734
- ` <testsuite name="${escapeXml(suiteName)}" tests="${results.length}" failures="${failures}" errors="${errors}">
1717
+ ` <testsuite name="${escapeXml(suiteName)}" tests="${results.length}" failures="${failures}" errors="${errors}" time="${suiteTime.toFixed(3)}">
1735
1718
  ${testCases.join("\n")}
1736
1719
  </testsuite>`
1737
1720
  );
1738
1721
  }
1739
1722
  const totalTests = this.results.length;
1740
- const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
1741
- const totalErrors = this.results.filter((r) => r.error !== void 0).length;
1723
+ const totalErrors = this.results.filter((r) => r.executionStatus === "execution_error").length;
1724
+ const totalFailures = this.results.filter(
1725
+ (r) => r.executionStatus !== "execution_error" && r.score < this.threshold
1726
+ ).length;
1727
+ const totalTime = this.results.reduce((sum, r) => sum + (r.durationMs ?? 0), 0) / 1e3;
1742
1728
  const xml = `<?xml version="1.0" encoding="UTF-8"?>
1743
- <testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">
1729
+ <testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}" time="${totalTime.toFixed(3)}">
1744
1730
  ${suiteXmls.join("\n")}
1745
1731
  </testsuites>
1746
1732
  `;
@@ -1839,17 +1825,6 @@ function createWriterFromPath(filePath, options) {
1839
1825
  );
1840
1826
  }
1841
1827
  }
1842
- async function createMultiWriter(filePaths, options) {
1843
- const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp, options)));
1844
- return {
1845
- async append(result) {
1846
- await Promise.all(writers.map((w) => w.append(result)));
1847
- },
1848
- async close() {
1849
- await Promise.all(writers.map((w) => w.close()));
1850
- }
1851
- };
1852
- }
1853
1828
 
1854
1829
  // src/commands/eval/progress-display.ts
1855
1830
  var ANSI_BOLD = "\x1B[1m";
@@ -1926,12 +1901,12 @@ var ProgressDisplay = class {
1926
1901
  }
1927
1902
  addLogPaths(paths, provider) {
1928
1903
  const newPaths = [];
1929
- for (const path16 of paths) {
1930
- if (this.logPathSet.has(path16)) {
1904
+ for (const path17 of paths) {
1905
+ if (this.logPathSet.has(path17)) {
1931
1906
  continue;
1932
1907
  }
1933
- this.logPathSet.add(path16);
1934
- newPaths.push(path16);
1908
+ this.logPathSet.add(path17);
1909
+ newPaths.push(path17);
1935
1910
  }
1936
1911
  if (newPaths.length === 0) {
1937
1912
  return;
@@ -1944,8 +1919,8 @@ var ProgressDisplay = class {
1944
1919
  this.hasPrintedLogHeader = true;
1945
1920
  }
1946
1921
  const startIndex = this.logPaths.length - newPaths.length;
1947
- newPaths.forEach((path16, offset) => {
1948
- console.log(`${startIndex + offset + 1}. ${path16}`);
1922
+ newPaths.forEach((path17, offset) => {
1923
+ console.log(`${startIndex + offset + 1}. ${path17}`);
1949
1924
  });
1950
1925
  }
1951
1926
  finish() {
@@ -1962,9 +1937,6 @@ import path12 from "node:path";
1962
1937
  function parseJsonlLines(content) {
1963
1938
  return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
1964
1939
  }
1965
- function isIndexManifestPath(sourceFile) {
1966
- return path12.basename(sourceFile) === RESULT_INDEX_FILENAME;
1967
- }
1968
1940
  function parseMarkdownMessages(content) {
1969
1941
  const trimmed = content.trim();
1970
1942
  if (!trimmed.startsWith("@[")) {
@@ -2022,11 +1994,11 @@ function hydrateOutput(baseDir, record) {
2022
1994
  function hydrateManifestRecord(baseDir, record) {
2023
1995
  const grading = readOptionalJson(baseDir, record.grading_path);
2024
1996
  const timing = readOptionalJson(baseDir, record.timing_path);
2025
- const testId = record.test_id ?? record.eval_id ?? "unknown";
1997
+ const testId = record.test_id ?? "unknown";
2026
1998
  return {
2027
1999
  timestamp: record.timestamp,
2028
2000
  testId,
2029
- dataset: record.dataset,
2001
+ suite: record.suite,
2030
2002
  category: record.category,
2031
2003
  target: record.target,
2032
2004
  score: record.score,
@@ -2066,74 +2038,44 @@ function parseResultManifest(content) {
2066
2038
  }
2067
2039
  function resolveResultSourcePath(source, cwd) {
2068
2040
  const resolved = path12.isAbsolute(source) ? source : path12.resolve(cwd ?? process.cwd(), source);
2069
- return resolveWorkspaceOrFilePath(resolved);
2041
+ if (isDirectoryPath(resolved) || path12.basename(resolved) === RESULT_INDEX_FILENAME) {
2042
+ return resolveRunManifestPath(resolved);
2043
+ }
2044
+ return resolved;
2070
2045
  }
2071
2046
  function loadManifestResults(sourceFile) {
2072
- const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
2073
- if (!isIndexManifestPath(resolvedSourceFile)) {
2074
- return parseJsonlResults(readFileSync(resolvedSourceFile, "utf8"));
2075
- }
2047
+ const resolvedSourceFile = resolveRunManifestPath(sourceFile);
2076
2048
  const content = readFileSync(resolvedSourceFile, "utf8");
2077
2049
  const records = parseResultManifest(content);
2078
2050
  const baseDir = path12.dirname(resolvedSourceFile);
2079
2051
  return records.map((record) => hydrateManifestRecord(baseDir, record));
2080
2052
  }
2081
2053
  function loadLightweightResults(sourceFile) {
2082
- const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
2054
+ const resolvedSourceFile = resolveRunManifestPath(sourceFile);
2083
2055
  const content = readFileSync(resolvedSourceFile, "utf8");
2084
- if (isIndexManifestPath(resolvedSourceFile)) {
2085
- return parseResultManifest(content).map((record) => ({
2086
- testId: record.test_id ?? record.eval_id ?? "unknown",
2087
- target: record.target,
2088
- experiment: record.experiment,
2089
- score: record.score,
2090
- scores: record.scores,
2091
- executionStatus: record.execution_status,
2092
- error: record.error,
2093
- timestamp: record.timestamp
2094
- }));
2095
- }
2096
- const records = [];
2097
- for (const line of content.split(/\r?\n/)) {
2098
- const trimmed = line.trim();
2099
- if (!trimmed) {
2100
- continue;
2101
- }
2102
- let record;
2103
- try {
2104
- record = JSON.parse(trimmed);
2105
- } catch {
2106
- continue;
2107
- }
2108
- const rawTestId = record.test_id ?? record.eval_id ?? record.testId ?? record.evalId;
2109
- if (typeof rawTestId !== "string") {
2110
- throw new Error(`Missing test_id in result: ${trimmed}`);
2111
- }
2112
- if (typeof record.score !== "number") {
2113
- throw new Error(`Missing or invalid score in result: ${trimmed}`);
2114
- }
2115
- records.push({
2116
- testId: rawTestId,
2117
- target: typeof record.target === "string" ? record.target : void 0,
2118
- score: record.score,
2119
- scores: Array.isArray(record.scores) ? record.scores : void 0,
2120
- executionStatus: typeof record.execution_status === "string" ? record.execution_status : typeof record.executionStatus === "string" ? record.executionStatus : void 0,
2121
- error: typeof record.error === "string" ? record.error : void 0,
2122
- timestamp: typeof record.timestamp === "string" ? record.timestamp : void 0
2123
- });
2124
- }
2125
- return records;
2056
+ return parseResultManifest(content).map((record) => ({
2057
+ testId: record.test_id ?? "unknown",
2058
+ suite: record.suite,
2059
+ target: record.target,
2060
+ experiment: record.experiment,
2061
+ score: record.score,
2062
+ scores: record.scores,
2063
+ executionStatus: record.execution_status,
2064
+ error: record.error,
2065
+ timestamp: record.timestamp
2066
+ }));
2126
2067
  }
2127
2068
 
2128
2069
  // src/commands/eval/retry-errors.ts
2070
+ async function loadRetrySourceResults(jsonlPath) {
2071
+ return loadManifestResults(resolveResultSourcePath(jsonlPath));
2072
+ }
2129
2073
  async function loadErrorTestIds(jsonlPath) {
2130
- const resolvedPath = resolveResultSourcePath(jsonlPath);
2131
- const ids = loadLightweightResults(resolvedPath).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
2074
+ const ids = (await loadRetrySourceResults(jsonlPath)).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
2132
2075
  return [...new Set(ids)];
2133
2076
  }
2134
2077
  async function loadNonErrorResults(jsonlPath) {
2135
- const resolvedPath = resolveResultSourcePath(jsonlPath);
2136
- return loadManifestResults(resolvedPath).filter(
2078
+ return (await loadRetrySourceResults(jsonlPath)).filter(
2137
2079
  (result) => result.testId && result.executionStatus !== "execution_error"
2138
2080
  );
2139
2081
  }
@@ -2146,7 +2088,7 @@ function resolveRunCacheFile(cache) {
2146
2088
  if (cache.lastRunDir) {
2147
2089
  return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
2148
2090
  }
2149
- return cache.lastResultFile ?? "";
2091
+ return "";
2150
2092
  }
2151
2093
  function cachePath(cwd) {
2152
2094
  return path13.join(cwd, ".agentv", CACHE_FILENAME);
@@ -2160,15 +2102,14 @@ async function loadRunCache(cwd) {
2160
2102
  }
2161
2103
  }
2162
2104
  async function saveRunCache(cwd, resultPath) {
2105
+ if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
2106
+ return;
2107
+ }
2163
2108
  const dir = path13.join(cwd, ".agentv");
2164
2109
  await mkdir7(dir, { recursive: true });
2165
- const basename = path13.basename(resultPath);
2166
- const cache = basename === RESULT_INDEX_FILENAME ? {
2110
+ const cache = {
2167
2111
  lastRunDir: path13.dirname(resultPath),
2168
2112
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
2169
- } : {
2170
- lastResultFile: resultPath,
2171
- timestamp: (/* @__PURE__ */ new Date()).toISOString()
2172
2113
  };
2173
2114
  await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
2174
2115
  `, "utf-8");
@@ -2313,11 +2254,21 @@ function formatEvaluationSummary(summary, options) {
2313
2254
  }
2314
2255
  const gradedCount = summary.total - summary.executionErrorCount;
2315
2256
  const threshold = options?.threshold ?? 0.8;
2316
- const overallPassed = summary.passedCount === gradedCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0;
2317
- const overallVerdict = overallPassed ? "PASS" : "FAIL";
2257
+ const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
2258
+ const overallPassed = !allExecutionErrors && (summary.passedCount === gradedCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0);
2318
2259
  const useColor = !(process.env.NO_COLOR !== void 0) && (process.stdout.isTTY ?? false);
2319
- const verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
2320
- const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
2260
+ let overallVerdict;
2261
+ let verdictColor;
2262
+ let verdictText;
2263
+ if (allExecutionErrors) {
2264
+ overallVerdict = "INCONCLUSIVE";
2265
+ verdictColor = "\x1B[33m";
2266
+ verdictText = `RESULT: INCONCLUSIVE (all ${summary.total} test(s) had execution errors \u2014 no evaluation was performed)`;
2267
+ } else {
2268
+ overallVerdict = overallPassed ? "PASS" : "FAIL";
2269
+ verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
2270
+ verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
2271
+ }
2321
2272
  lines.push("\n==================================================");
2322
2273
  if (useColor) {
2323
2274
  lines.push(`\x1B[1m${verdictColor}${verdictText}\x1B[0m`);
@@ -2527,7 +2478,7 @@ var KNOWN_TEST_FIELDS = /* @__PURE__ */ new Set([
2527
2478
  "workspace",
2528
2479
  "metadata",
2529
2480
  "conversation_id",
2530
- "dataset",
2481
+ "suite",
2531
2482
  "note"
2532
2483
  ]);
2533
2484
  var NAME_PATTERN = /^[a-z0-9-]+$/;
@@ -3090,87 +3041,68 @@ function isObject2(value) {
3090
3041
  var COMMON_SETTINGS = new Set(COMMON_TARGET_SETTINGS);
3091
3042
  var RETRY_SETTINGS = /* @__PURE__ */ new Set([
3092
3043
  "max_retries",
3093
- "maxRetries",
3094
3044
  "retry_initial_delay_ms",
3095
- "retryInitialDelayMs",
3096
3045
  "retry_max_delay_ms",
3097
- "retryMaxDelayMs",
3098
3046
  "retry_backoff_factor",
3099
- "retryBackoffFactor",
3100
- "retry_status_codes",
3101
- "retryStatusCodes"
3047
+ "retry_status_codes"
3102
3048
  ]);
3103
3049
  var AZURE_SETTINGS = /* @__PURE__ */ new Set([
3104
3050
  ...COMMON_SETTINGS,
3105
3051
  ...RETRY_SETTINGS,
3106
3052
  "endpoint",
3107
3053
  "resource",
3108
- "resourceName",
3109
3054
  "api_key",
3110
- "apiKey",
3111
3055
  "deployment",
3112
- "deploymentName",
3113
3056
  "model",
3114
3057
  "version",
3115
3058
  "api_version",
3059
+ "api_format",
3116
3060
  "temperature",
3117
- "max_output_tokens",
3118
- "maxTokens"
3061
+ "max_output_tokens"
3119
3062
  ]);
3120
3063
  var OPENAI_SETTINGS = /* @__PURE__ */ new Set([
3121
3064
  ...COMMON_SETTINGS,
3122
3065
  ...RETRY_SETTINGS,
3123
3066
  "endpoint",
3124
3067
  "base_url",
3125
- "baseUrl",
3126
3068
  "api_key",
3127
- "apiKey",
3128
3069
  "model",
3129
3070
  "deployment",
3130
3071
  "variant",
3131
3072
  "api_format",
3132
- "apiFormat",
3133
3073
  "temperature",
3134
- "max_output_tokens",
3135
- "maxTokens"
3074
+ "max_output_tokens"
3136
3075
  ]);
3137
3076
  var OPENROUTER_SETTINGS = /* @__PURE__ */ new Set([
3138
3077
  ...COMMON_SETTINGS,
3139
3078
  ...RETRY_SETTINGS,
3140
3079
  "api_key",
3141
- "apiKey",
3142
3080
  "model",
3143
3081
  "deployment",
3144
3082
  "variant",
3145
3083
  "temperature",
3146
- "max_output_tokens",
3147
- "maxTokens"
3084
+ "max_output_tokens"
3148
3085
  ]);
3149
3086
  var ANTHROPIC_SETTINGS = /* @__PURE__ */ new Set([
3150
3087
  ...COMMON_SETTINGS,
3151
3088
  ...RETRY_SETTINGS,
3152
3089
  "api_key",
3153
- "apiKey",
3154
3090
  "model",
3155
3091
  "deployment",
3156
3092
  "variant",
3157
3093
  "temperature",
3158
3094
  "max_output_tokens",
3159
- "maxTokens",
3160
- "thinking_budget",
3161
- "thinkingBudget"
3095
+ "thinking_budget"
3162
3096
  ]);
3163
3097
  var GEMINI_SETTINGS = /* @__PURE__ */ new Set([
3164
3098
  ...COMMON_SETTINGS,
3165
3099
  ...RETRY_SETTINGS,
3166
3100
  "api_key",
3167
- "apiKey",
3168
3101
  "model",
3169
3102
  "deployment",
3170
3103
  "variant",
3171
3104
  "temperature",
3172
- "max_output_tokens",
3173
- "maxTokens"
3105
+ "max_output_tokens"
3174
3106
  ]);
3175
3107
  var CODEX_SETTINGS = /* @__PURE__ */ new Set([
3176
3108
  ...COMMON_SETTINGS,
@@ -3182,40 +3114,26 @@ var CODEX_SETTINGS = /* @__PURE__ */ new Set([
3182
3114
  "arguments",
3183
3115
  "cwd",
3184
3116
  "timeout_seconds",
3185
- "timeoutSeconds",
3186
3117
  "log_dir",
3187
- "logDir",
3188
3118
  "log_directory",
3189
- "logDirectory",
3190
3119
  "log_format",
3191
- "logFormat",
3192
3120
  "log_output_format",
3193
- "logOutputFormat",
3194
3121
  "system_prompt",
3195
- "systemPrompt",
3196
- "workspace_template",
3197
- "workspaceTemplate"
3122
+ "workspace_template"
3198
3123
  ]);
3199
3124
  var COPILOT_SDK_SETTINGS = /* @__PURE__ */ new Set([
3200
3125
  ...COMMON_SETTINGS,
3201
3126
  "cli_url",
3202
- "cliUrl",
3203
3127
  "cli_path",
3204
- "cliPath",
3205
3128
  "github_token",
3206
- "githubToken",
3207
3129
  "model",
3208
3130
  "cwd",
3209
3131
  "timeout_seconds",
3210
- "timeoutSeconds",
3211
3132
  "log_dir",
3212
- "logDir",
3213
3133
  "log_format",
3214
- "logFormat",
3215
3134
  "system_prompt",
3216
- "systemPrompt",
3217
3135
  "workspace_template",
3218
- "workspaceTemplate"
3136
+ "byok"
3219
3137
  ]);
3220
3138
  var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
3221
3139
  ...COMMON_SETTINGS,
@@ -3227,35 +3145,23 @@ var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
3227
3145
  "model",
3228
3146
  "cwd",
3229
3147
  "timeout_seconds",
3230
- "timeoutSeconds",
3231
3148
  "log_dir",
3232
- "logDir",
3233
3149
  "log_format",
3234
- "logFormat",
3235
3150
  "system_prompt",
3236
- "systemPrompt",
3237
- "workspace_template",
3238
- "workspaceTemplate"
3151
+ "workspace_template"
3239
3152
  ]);
3240
3153
  var VSCODE_SETTINGS = /* @__PURE__ */ new Set([
3241
3154
  ...COMMON_SETTINGS,
3242
3155
  "executable",
3243
3156
  "workspace_template",
3244
- "workspaceTemplate",
3245
3157
  "wait",
3246
3158
  "dry_run",
3247
- "dryRun",
3248
3159
  "subagent_root",
3249
- "subagentRoot",
3250
- "timeout_seconds",
3251
- "timeoutSeconds"
3160
+ "timeout_seconds"
3252
3161
  ]);
3253
3162
  var MOCK_SETTINGS = /* @__PURE__ */ new Set([
3254
3163
  ...COMMON_SETTINGS,
3255
3164
  "response",
3256
- "delayMs",
3257
- "delayMinMs",
3258
- "delayMaxMs",
3259
3165
  "trace"
3260
3166
  // For testing tool-trajectory evaluator
3261
3167
  ]);
@@ -3264,23 +3170,14 @@ var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
3264
3170
  "model",
3265
3171
  "cwd",
3266
3172
  "timeout_seconds",
3267
- "timeoutSeconds",
3268
3173
  "log_dir",
3269
- "logDir",
3270
3174
  "log_directory",
3271
- "logDirectory",
3272
3175
  "log_format",
3273
- "logFormat",
3274
3176
  "log_output_format",
3275
- "logOutputFormat",
3276
3177
  "system_prompt",
3277
- "systemPrompt",
3278
3178
  "workspace_template",
3279
- "workspaceTemplate",
3280
3179
  "max_turns",
3281
- "maxTurns",
3282
- "max_budget_usd",
3283
- "maxBudgetUsd"
3180
+ "max_budget_usd"
3284
3181
  ]);
3285
3182
  function getKnownSettings(provider) {
3286
3183
  const normalizedProvider = provider.toLowerCase();
@@ -3405,15 +3302,15 @@ async function validateTargetsFile(filePath) {
3405
3302
  });
3406
3303
  return;
3407
3304
  }
3408
- const timeoutSeconds = healthcheck.timeout_seconds ?? healthcheck.timeoutSeconds;
3305
+ const timeoutSeconds = healthcheck.timeout_seconds;
3409
3306
  if (timeoutSeconds !== void 0) {
3410
3307
  const numericTimeout = Number(timeoutSeconds);
3411
3308
  if (!Number.isFinite(numericTimeout) || numericTimeout <= 0) {
3412
3309
  errors2.push({
3413
3310
  severity: "error",
3414
3311
  filePath: absolutePath2,
3415
- location: `${location}.timeoutSeconds`,
3416
- message: "healthcheck.timeoutSeconds must be a positive number when provided"
3312
+ location: `${location}.timeout_seconds`,
3313
+ message: "healthcheck.timeout_seconds must be a positive number when provided"
3417
3314
  });
3418
3315
  }
3419
3316
  }
@@ -3512,6 +3409,18 @@ async function validateTargetsFile(filePath) {
3512
3409
  });
3513
3410
  continue;
3514
3411
  }
3412
+ for (const warning of findDeprecatedCamelCaseTargetWarnings(target, location)) {
3413
+ const fieldMatch = warning.message.match(/field '([^']+)'/);
3414
+ const replacementMatch = warning.message.match(/Use '([^']+)' instead/);
3415
+ const field = fieldMatch?.[1] ?? "unknown";
3416
+ const replacement = replacementMatch?.[1] ?? "snake_case";
3417
+ errors.push({
3418
+ severity: "error",
3419
+ filePath: absolutePath,
3420
+ location: warning.location,
3421
+ message: `camelCase field '${field}' is no longer supported in targets.yaml. Use '${replacement}' instead.`
3422
+ });
3423
+ }
3515
3424
  const name = target.name;
3516
3425
  if (typeof name !== "string" || name.trim().length === 0) {
3517
3426
  errors.push({
@@ -3891,7 +3800,9 @@ Errors in ${targetsFilePath}:`);
3891
3800
  };
3892
3801
  }
3893
3802
  try {
3894
- const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
3803
+ const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath, {
3804
+ emitDeprecationWarnings: false
3805
+ });
3895
3806
  return {
3896
3807
  definitions,
3897
3808
  resolvedTarget,
@@ -3974,7 +3885,9 @@ Errors in ${targetsFilePath}:`);
3974
3885
  });
3975
3886
  } else {
3976
3887
  try {
3977
- const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
3888
+ const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath, {
3889
+ emitDeprecationWarnings: false
3890
+ });
3978
3891
  results.push({
3979
3892
  definitions,
3980
3893
  resolvedTarget,
@@ -4043,6 +3956,16 @@ function normalizeStringArray(value) {
4043
3956
  }
4044
3957
  return [];
4045
3958
  }
3959
+ function normalizeFilter(value) {
3960
+ if (Array.isArray(value)) {
3961
+ const filters = normalizeStringArray(value);
3962
+ if (filters.length === 0) {
3963
+ return void 0;
3964
+ }
3965
+ return filters.length === 1 ? filters[0] : filters;
3966
+ }
3967
+ return normalizeString(value);
3968
+ }
4046
3969
  function matchesTagFilters(fileTags, includeTags, excludeTags) {
4047
3970
  const tags = new Set(fileTags ?? []);
4048
3971
  if (includeTags.length > 0) {
@@ -4084,15 +4007,12 @@ function trimOutputMessages(output, outputMessages) {
4084
4007
  return sliced.map((m) => ({ role: m.role, content: m.content }));
4085
4008
  }
4086
4009
  function normalizeOptions(rawOptions, config, yamlExecution) {
4087
- const cliFormat = normalizeString(rawOptions.outputFormat);
4088
- const configFormat = config?.output?.format;
4089
- const formatStr = cliFormat ?? configFormat ?? "jsonl";
4090
- const format = formatStr === "yaml" ? "yaml" : "jsonl";
4091
4010
  const cliWorkers = normalizeOptionalNumber(rawOptions.workers);
4092
4011
  const configWorkers = config?.execution?.workers;
4093
4012
  const workers = cliWorkers ?? configWorkers ?? 0;
4094
- const rawOutputPaths = rawOptions.output;
4095
- const outputPaths = Array.isArray(rawOutputPaths) ? rawOutputPaths.filter((v) => typeof v === "string" && v.trim().length > 0) : [];
4013
+ const cliOutputDir = normalizeString(rawOptions.output);
4014
+ const rawExportPaths = rawOptions.export;
4015
+ const exportPaths = Array.isArray(rawExportPaths) ? rawExportPaths.filter((v) => typeof v === "string" && v.trim().length > 0) : [];
4096
4016
  const rawTarget = rawOptions.target;
4097
4017
  let cliTargets = [];
4098
4018
  let singleTarget;
@@ -4132,11 +4052,11 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
4132
4052
  target: singleTarget,
4133
4053
  cliTargets,
4134
4054
  targetsPath: normalizeString(rawOptions.targets),
4135
- filter: normalizeString(rawOptions.filter),
4055
+ filter: normalizeFilter(rawOptions.filter),
4136
4056
  workers: workers > 0 ? workers : void 0,
4057
+ outputDir: cliOutputDir,
4137
4058
  outPath: cliOut ?? configOut,
4138
- outputPaths,
4139
- format,
4059
+ exportPaths,
4140
4060
  dryRun: normalizeBoolean(rawOptions.dryRun),
4141
4061
  dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
4142
4062
  dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0),
@@ -4165,7 +4085,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
4165
4085
  outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
4166
4086
  threshold: normalizeOptionalNumber(rawOptions.threshold),
4167
4087
  tags: normalizeStringArray(rawOptions.tag),
4168
- excludeTags: normalizeStringArray(rawOptions.excludeTag)
4088
+ excludeTags: normalizeStringArray(rawOptions.excludeTag),
4089
+ transcript: normalizeString(rawOptions.transcript)
4169
4090
  };
4170
4091
  }
4171
4092
  async function ensureFileExists(filePath, description) {
@@ -4191,20 +4112,20 @@ function createProgressReporter(maxWorkers, options) {
4191
4112
  addLogPaths: (paths, provider) => display.addLogPaths(paths, provider)
4192
4113
  };
4193
4114
  }
4194
- function makeEvalKey(testFilePath, evalId) {
4195
- return `${path15.resolve(testFilePath)}::${evalId}`;
4115
+ function makeTestCaseKey(testFilePath, testId) {
4116
+ return `${path15.resolve(testFilePath)}::${testId}`;
4196
4117
  }
4197
4118
  function createDisplayIdTracker() {
4198
4119
  const map = /* @__PURE__ */ new Map();
4199
4120
  let nextId = 1;
4200
4121
  return {
4201
- getOrAssign(evalKey) {
4202
- const existing = map.get(evalKey);
4122
+ getOrAssign(testCaseKey) {
4123
+ const existing = map.get(testCaseKey);
4203
4124
  if (existing !== void 0) {
4204
4125
  return existing;
4205
4126
  }
4206
4127
  const assigned = nextId++;
4207
- map.set(evalKey, assigned);
4128
+ map.set(testCaseKey, assigned);
4208
4129
  return assigned;
4209
4130
  }
4210
4131
  };
@@ -4255,58 +4176,79 @@ async function prepareFileMetadata(params) {
4255
4176
  filter: options.filter,
4256
4177
  category
4257
4178
  });
4258
- const filteredIds = suite.tests.map((value) => value.id);
4259
- const cliTargets = options.cliTargets;
4179
+ const testIds = suite.tests.map((value) => value.id);
4260
4180
  const suiteTargets = suite.targets;
4261
- let targetNames;
4262
- if (cliTargets.length > 0) {
4263
- targetNames = cliTargets;
4264
- } else if (suiteTargets && suiteTargets.length > 0) {
4265
- targetNames = suiteTargets;
4266
- } else {
4267
- targetNames = [];
4268
- }
4269
4181
  let selections;
4270
- if (targetNames.length > 1) {
4271
- const multiSelections = await selectMultipleTargets({
4272
- testFilePath,
4273
- repoRoot,
4274
- cwd,
4275
- explicitTargetsPath: options.targetsPath,
4276
- dryRun: options.dryRun,
4277
- dryRunDelay: options.dryRunDelay,
4278
- dryRunDelayMin: options.dryRunDelayMin,
4279
- dryRunDelayMax: options.dryRunDelayMax,
4280
- env: process.env,
4281
- targetNames
4282
- });
4283
- selections = multiSelections.map((sel) => ({
4284
- selection: sel,
4285
- inlineTargetLabel: sel.targetName
4286
- }));
4287
- } else {
4288
- const selection = await selectTarget({
4289
- testFilePath,
4290
- repoRoot,
4291
- cwd,
4292
- explicitTargetsPath: options.targetsPath,
4293
- cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
4294
- dryRun: options.dryRun,
4295
- dryRunDelay: options.dryRunDelay,
4296
- dryRunDelayMin: options.dryRunDelayMin,
4297
- dryRunDelayMax: options.dryRunDelayMax,
4298
- env: process.env
4299
- });
4182
+ if (options.transcript) {
4183
+ const transcriptSelection = {
4184
+ definitions: [],
4185
+ resolvedTarget: {
4186
+ kind: "transcript",
4187
+ name: "transcript",
4188
+ config: {}
4189
+ },
4190
+ targetName: "transcript",
4191
+ targetSource: "cli",
4192
+ targetsFilePath: options.transcript
4193
+ };
4300
4194
  selections = [
4301
4195
  {
4302
- selection,
4303
- inlineTargetLabel: selection.targetName
4196
+ selection: transcriptSelection,
4197
+ inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
4304
4198
  }
4305
4199
  ];
4200
+ } else {
4201
+ const cliTargets = options.cliTargets;
4202
+ const suiteTargets2 = suite.targets;
4203
+ let targetNames;
4204
+ if (cliTargets.length > 0) {
4205
+ targetNames = cliTargets;
4206
+ } else if (suiteTargets2 && suiteTargets2.length > 0) {
4207
+ targetNames = suiteTargets2;
4208
+ } else {
4209
+ targetNames = [];
4210
+ }
4211
+ if (targetNames.length > 1) {
4212
+ const multiSelections = await selectMultipleTargets({
4213
+ testFilePath,
4214
+ repoRoot,
4215
+ cwd,
4216
+ explicitTargetsPath: options.targetsPath,
4217
+ dryRun: options.dryRun,
4218
+ dryRunDelay: options.dryRunDelay,
4219
+ dryRunDelayMin: options.dryRunDelayMin,
4220
+ dryRunDelayMax: options.dryRunDelayMax,
4221
+ env: process.env,
4222
+ targetNames
4223
+ });
4224
+ selections = multiSelections.map((sel) => ({
4225
+ selection: sel,
4226
+ inlineTargetLabel: sel.targetName
4227
+ }));
4228
+ } else {
4229
+ const selection = await selectTarget({
4230
+ testFilePath,
4231
+ repoRoot,
4232
+ cwd,
4233
+ explicitTargetsPath: options.targetsPath,
4234
+ cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
4235
+ dryRun: options.dryRun,
4236
+ dryRunDelay: options.dryRunDelay,
4237
+ dryRunDelayMin: options.dryRunDelayMin,
4238
+ dryRunDelayMax: options.dryRunDelayMax,
4239
+ env: process.env
4240
+ });
4241
+ selections = [
4242
+ {
4243
+ selection,
4244
+ inlineTargetLabel: selection.targetName
4245
+ }
4246
+ ];
4247
+ }
4306
4248
  }
4307
4249
  return {
4308
- evalIds: filteredIds,
4309
- evalCases: suite.tests,
4250
+ testIds,
4251
+ testCases: suite.tests,
4310
4252
  selections,
4311
4253
  trialsConfig: suite.trials,
4312
4254
  suiteTargets,
@@ -4344,15 +4286,16 @@ async function runSingleEvalFile(params) {
4344
4286
  workersOverride,
4345
4287
  yamlWorkers,
4346
4288
  progressReporter,
4347
- seenEvalCases,
4289
+ seenTestCases,
4348
4290
  displayIdTracker,
4349
4291
  selection,
4350
4292
  inlineTargetLabel,
4351
- evalCases,
4293
+ testCases,
4352
4294
  trialsConfig,
4353
4295
  matrixMode,
4354
4296
  totalBudgetUsd,
4355
- failOnError
4297
+ failOnError,
4298
+ providerFactory
4356
4299
  } = params;
4357
4300
  const targetName = selection.targetName;
4358
4301
  await ensureFileExists(testFilePath, "Test file");
@@ -4408,7 +4351,8 @@ async function runSingleEvalFile(params) {
4408
4351
  }
4409
4352
  return true;
4410
4353
  })(),
4411
- evalCases,
4354
+ filter: options.filter,
4355
+ evalCases: testCases,
4412
4356
  verbose: options.verbose,
4413
4357
  maxConcurrency: resolvedWorkers,
4414
4358
  workspaceMode: options.workspaceMode,
@@ -4419,6 +4363,7 @@ async function runSingleEvalFile(params) {
4419
4363
  graderTarget: options.graderTarget,
4420
4364
  model: options.model,
4421
4365
  threshold: options.threshold,
4366
+ providerFactory,
4422
4367
  streamCallbacks: streamingObserver?.getStreamCallbacks(),
4423
4368
  onResult: async (result) => {
4424
4369
  streamingObserver?.completeFromResult?.(result);
@@ -4442,13 +4387,13 @@ async function runSingleEvalFile(params) {
4442
4387
  }
4443
4388
  },
4444
4389
  onProgress: async (event) => {
4445
- const evalKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
4446
- const evalKey = makeEvalKey(testFilePath, evalKeyId);
4447
- if (event.status === "pending" && !seenEvalCases.has(evalKey)) {
4448
- seenEvalCases.add(evalKey);
4449
- progressReporter.setTotal(seenEvalCases.size);
4390
+ const testCaseKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
4391
+ const testCaseKey = makeTestCaseKey(testFilePath, testCaseKeyId);
4392
+ if (event.status === "pending" && !seenTestCases.has(testCaseKey)) {
4393
+ seenTestCases.add(testCaseKey);
4394
+ progressReporter.setTotal(seenTestCases.size);
4450
4395
  }
4451
- const displayId = displayIdTracker.getOrAssign(evalKey);
4396
+ const displayId = displayIdTracker.getOrAssign(testCaseKey);
4452
4397
  if (event.status === "running" && streamingObserver) {
4453
4398
  streamingObserver.startEvalCase(event.testId, targetName, testFilePath);
4454
4399
  }
@@ -4528,13 +4473,48 @@ async function runEvalCommand(input) {
4528
4473
  if (options.verbose) {
4529
4474
  console.log(`Repository root: ${repoRoot}`);
4530
4475
  }
4531
- const usesDefaultArtifactWorkspace = !options.outPath;
4532
- const outputPath = options.outPath ? path15.resolve(options.outPath) : buildDefaultOutputPath(cwd);
4476
+ if (options.outPath) {
4477
+ console.warn("Warning: --out is deprecated. Use --output <dir> to set the artifact directory.");
4478
+ }
4479
+ if (options.artifacts) {
4480
+ console.warn(
4481
+ "Warning: --artifacts is deprecated. Use --output <dir> to set the artifact directory."
4482
+ );
4483
+ }
4484
+ if (options.benchmarkJson) {
4485
+ console.warn(
4486
+ "Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory."
4487
+ );
4488
+ }
4489
+ if (normalizeString(input.rawOptions.outputFormat)) {
4490
+ console.warn(
4491
+ "Warning: --output-format is deprecated. The artifact directory always uses JSONL."
4492
+ );
4493
+ }
4494
+ const explicitDir = options.outputDir ?? options.artifacts;
4495
+ let runDir;
4496
+ let outputPath;
4497
+ let usesDefaultArtifactWorkspace;
4498
+ if (explicitDir) {
4499
+ runDir = path15.resolve(explicitDir);
4500
+ mkdirSync(runDir, { recursive: true });
4501
+ outputPath = path15.join(runDir, "index.jsonl");
4502
+ usesDefaultArtifactWorkspace = true;
4503
+ } else if (options.outPath) {
4504
+ outputPath = path15.resolve(options.outPath);
4505
+ runDir = path15.dirname(outputPath);
4506
+ mkdirSync(runDir, { recursive: true });
4507
+ usesDefaultArtifactWorkspace = false;
4508
+ } else {
4509
+ outputPath = buildDefaultOutputPath(cwd);
4510
+ runDir = path15.dirname(outputPath);
4511
+ usesDefaultArtifactWorkspace = true;
4512
+ }
4533
4513
  let otelExporter = null;
4534
4514
  const useFileExport = !!options.otelFile;
4535
4515
  if (options.exportOtel || useFileExport) {
4536
4516
  try {
4537
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-BN5NUVAB.js");
4517
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-QXVR2ZRH.js");
4538
4518
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
4539
4519
  let headers = {};
4540
4520
  if (options.otelBackend) {
@@ -4575,16 +4555,11 @@ async function runEvalCommand(input) {
4575
4555
  }
4576
4556
  }
4577
4557
  const primaryWritePath = outputPath;
4578
- const extraOutputPaths = options.outputPaths.map((p) => path15.resolve(p));
4579
- const allOutputPaths = extraOutputPaths.length > 0 ? [primaryWritePath, ...extraOutputPaths] : [primaryWritePath];
4580
- const uniqueOutputPaths = [...new Set(allOutputPaths)];
4581
- const reportedOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
4582
- const uniqueReportedOutputPaths = [...new Set(reportedOutputPaths)];
4583
- if (uniqueOutputPaths.length === 1) {
4584
- console.log(`Output path: ${outputPath}`);
4585
- } else {
4586
- console.log("Output paths:");
4587
- for (const p of uniqueReportedOutputPaths) {
4558
+ const resolvedExportPaths = options.exportPaths.map((p) => path15.resolve(p));
4559
+ console.log(`Artifact directory: ${runDir}`);
4560
+ if (resolvedExportPaths.length > 0) {
4561
+ console.log("Export files:");
4562
+ for (const p of resolvedExportPaths) {
4588
4563
  console.log(` ${p}`);
4589
4564
  }
4590
4565
  }
@@ -4594,7 +4569,7 @@ async function runEvalCommand(input) {
4594
4569
  }
4595
4570
  const evaluationRunner = await resolveEvaluationRunner();
4596
4571
  const allResults = [];
4597
- const seenEvalCases = /* @__PURE__ */ new Set();
4572
+ const seenTestCases = /* @__PURE__ */ new Set();
4598
4573
  const displayIdTracker = createDisplayIdTracker();
4599
4574
  const totalWorkers = options.workers ?? DEFAULT_WORKERS;
4600
4575
  const fileConcurrency = Math.min(
@@ -4656,7 +4631,6 @@ async function runEvalCommand(input) {
4656
4631
  yamlCache: yamlCacheEnabled
4657
4632
  });
4658
4633
  const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
4659
- const useCache = cacheEnabled;
4660
4634
  if (cacheEnabled) {
4661
4635
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
4662
4636
  }
@@ -4666,17 +4640,12 @@ async function runEvalCommand(input) {
4666
4640
  throw new Error("--threshold must be between 0 and 1");
4667
4641
  }
4668
4642
  const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
4669
- let outputWriter;
4670
- if (uniqueOutputPaths.length === 1) {
4671
- outputWriter = await createOutputWriter(primaryWritePath, options.format);
4672
- } else {
4673
- outputWriter = await createMultiWriter(uniqueOutputPaths, writerOptions);
4674
- }
4643
+ const outputWriter = await createOutputWriter(primaryWritePath, "jsonl");
4675
4644
  const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
4676
4645
  let totalEvalCount = 0;
4677
4646
  for (const meta of fileMetadata.values()) {
4678
4647
  const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
4679
- for (const test of meta.evalCases) {
4648
+ for (const test of meta.testCases) {
4680
4649
  const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
4681
4650
  totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
4682
4651
  }
@@ -4720,13 +4689,13 @@ async function runEvalCommand(input) {
4720
4689
  });
4721
4690
  for (const [testFilePath, meta] of fileMetadata.entries()) {
4722
4691
  for (const { selection, inlineTargetLabel } of meta.selections) {
4723
- for (const testId of meta.evalIds) {
4724
- const evalKey = makeEvalKey(
4692
+ for (const testId of meta.testIds) {
4693
+ const testCaseKey = makeTestCaseKey(
4725
4694
  testFilePath,
4726
4695
  meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId
4727
4696
  );
4728
- seenEvalCases.add(evalKey);
4729
- const displayId = displayIdTracker.getOrAssign(evalKey);
4697
+ seenTestCases.add(testCaseKey);
4698
+ const displayId = displayIdTracker.getOrAssign(testCaseKey);
4730
4699
  progressReporter.update(displayId, {
4731
4700
  workerId: displayId,
4732
4701
  testId: meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId,
@@ -4737,6 +4706,24 @@ async function runEvalCommand(input) {
4737
4706
  }
4738
4707
  }
4739
4708
  const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
4709
+ let transcriptProviderFactory;
4710
+ if (options.transcript) {
4711
+ const { TranscriptProvider } = await import("./dist-QXVR2ZRH.js");
4712
+ const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
4713
+ const totalTests = [...fileMetadata.values()].reduce(
4714
+ (sum, meta) => sum + meta.testCases.length,
4715
+ 0
4716
+ );
4717
+ if (transcriptProvider.lineCount !== totalTests) {
4718
+ throw new Error(
4719
+ `Transcript has ${transcriptProvider.lineCount} entry(s) but eval defines ${totalTests} test(s). Each transcript line maps positionally to one test case.`
4720
+ );
4721
+ }
4722
+ transcriptProviderFactory = () => transcriptProvider;
4723
+ console.log(
4724
+ `Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`
4725
+ );
4726
+ }
4740
4727
  try {
4741
4728
  await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
4742
4729
  const targetPrep = fileMetadata.get(testFilePath);
@@ -4746,13 +4733,13 @@ async function runEvalCommand(input) {
4746
4733
  const targetResults = await Promise.all(
4747
4734
  targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
4748
4735
  const targetName = selection.targetName;
4749
- const applicableEvalCases = targetPrep.selections.length > 1 ? targetPrep.evalCases.filter((test) => {
4736
+ const applicableTestCases = targetPrep.selections.length > 1 ? targetPrep.testCases.filter((test) => {
4750
4737
  if (test.targets && test.targets.length > 0) {
4751
4738
  return test.targets.includes(targetName);
4752
4739
  }
4753
4740
  return true;
4754
- }) : targetPrep.evalCases;
4755
- if (applicableEvalCases.length === 0) {
4741
+ }) : targetPrep.testCases;
4742
+ if (applicableTestCases.length === 0) {
4756
4743
  return [];
4757
4744
  }
4758
4745
  try {
@@ -4768,16 +4755,17 @@ async function runEvalCommand(input) {
4768
4755
  workersOverride: perFileWorkers,
4769
4756
  yamlWorkers: targetPrep.yamlWorkers,
4770
4757
  progressReporter,
4771
- seenEvalCases,
4758
+ seenTestCases,
4772
4759
  displayIdTracker,
4773
4760
  selection,
4774
4761
  inlineTargetLabel,
4775
- evalCases: applicableEvalCases,
4776
- trialsConfig: targetPrep.trialsConfig,
4762
+ testCases: applicableTestCases,
4763
+ trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
4777
4764
  matrixMode: targetPrep.selections.length > 1,
4778
4765
  totalBudgetUsd: targetPrep.totalBudgetUsd,
4779
4766
  failOnError: targetPrep.failOnError,
4780
- threshold: resolvedThreshold
4767
+ threshold: resolvedThreshold,
4768
+ providerFactory: transcriptProviderFactory
4781
4769
  });
4782
4770
  return result.results;
4783
4771
  } catch (fileError) {
@@ -4785,9 +4773,9 @@ async function runEvalCommand(input) {
4785
4773
  console.error(`
4786
4774
  \u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
4787
4775
  `);
4788
- const errorResults = applicableEvalCases.map((evalCase) => ({
4776
+ const errorResults = applicableTestCases.map((testCase) => ({
4789
4777
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4790
- testId: evalCase.id,
4778
+ testId: testCase.id,
4791
4779
  score: 0,
4792
4780
  assertions: [],
4793
4781
  output: [],
@@ -4824,6 +4812,7 @@ async function runEvalCommand(input) {
4824
4812
  const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
4825
4813
  const summary = calculateEvaluationSummary(allResults, thresholdOpts);
4826
4814
  console.log(formatEvaluationSummary(summary, thresholdOpts));
4815
+ const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
4827
4816
  const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
4828
4817
  if (isMatrixMode && allResults.length > 0) {
4829
4818
  console.log(formatMatrixSummary(allResults));
@@ -4833,18 +4822,17 @@ async function runEvalCommand(input) {
4833
4822
  await writeBenchmarkJson(benchmarkPath, allResults);
4834
4823
  console.log(`Benchmark written to: ${benchmarkPath}`);
4835
4824
  }
4836
- if (usesDefaultArtifactWorkspace) {
4825
+ if (usesDefaultArtifactWorkspace && allResults.length > 0) {
4837
4826
  const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
4838
- const workspaceDir = path15.dirname(outputPath);
4839
4827
  const {
4840
4828
  testArtifactDir,
4841
4829
  timingPath,
4842
4830
  benchmarkPath: workspaceBenchmarkPath,
4843
4831
  indexPath
4844
- } = await writeArtifactsFromResults(allResults, workspaceDir, {
4832
+ } = await writeArtifactsFromResults(allResults, runDir, {
4845
4833
  evalFile
4846
4834
  });
4847
- console.log(`Artifact workspace written to: ${workspaceDir}`);
4835
+ console.log(`Artifact workspace written to: ${runDir}`);
4848
4836
  console.log(` Index: ${indexPath}`);
4849
4837
  console.log(
4850
4838
  ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
@@ -4852,24 +4840,17 @@ async function runEvalCommand(input) {
4852
4840
  console.log(` Timing: ${timingPath}`);
4853
4841
  console.log(` Benchmark: ${workspaceBenchmarkPath}`);
4854
4842
  }
4855
- if (options.artifacts) {
4856
- const artifactsDir = path15.resolve(options.artifacts);
4857
- const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
4858
- const {
4859
- testArtifactDir,
4860
- indexPath,
4861
- timingPath,
4862
- benchmarkPath: abp
4863
- } = await writeArtifactsFromResults(allResults, artifactsDir, {
4864
- evalFile
4865
- });
4866
- console.log(`Artifacts written to: ${artifactsDir}`);
4867
- console.log(` Index: ${indexPath}`);
4843
+ if (resolvedExportPaths.length > 0 && allResults.length > 0) {
4844
+ for (const exportPath of resolvedExportPaths) {
4845
+ const writer = await createWriterFromPath(exportPath, writerOptions);
4846
+ for (const result of allResults) {
4847
+ await writer.append(result);
4848
+ }
4849
+ await writer.close();
4850
+ }
4868
4851
  console.log(
4869
- ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
4852
+ `Export file(s) written: ${resolvedExportPaths.map((p) => path15.relative(cwd, p)).join(", ")}`
4870
4853
  );
4871
- console.log(` Timing: ${timingPath}`);
4872
- console.log(` Benchmark: ${abp}`);
4873
4854
  }
4874
4855
  const failedWithWorkspaces = allResults.filter(
4875
4856
  (r) => r.workspacePath && (r.error || r.score < 0.5)
@@ -4881,15 +4862,8 @@ async function runEvalCommand(input) {
4881
4862
  }
4882
4863
  }
4883
4864
  if (allResults.length > 0) {
4884
- if (uniqueReportedOutputPaths.length === 1) {
4885
- console.log(`
4865
+ console.log(`
4886
4866
  Results written to: ${outputPath}`);
4887
- } else {
4888
- console.log("\nResults written to:");
4889
- for (const p of uniqueReportedOutputPaths) {
4890
- console.log(` ${p}`);
4891
- }
4892
- }
4893
4867
  await saveRunCache(cwd, outputPath).catch(() => void 0);
4894
4868
  }
4895
4869
  if (summary.executionErrorCount > 0 && !options.retryErrors) {
@@ -4907,7 +4881,8 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
4907
4881
  outputPath,
4908
4882
  testFiles: activeTestFiles,
4909
4883
  target: options.target,
4910
- thresholdFailed
4884
+ thresholdFailed,
4885
+ allExecutionErrors
4911
4886
  };
4912
4887
  } finally {
4913
4888
  unsubscribeCodexLogs();
@@ -4940,6 +4915,43 @@ async function resolveEvaluationRunner() {
4940
4915
  return candidate;
4941
4916
  }
4942
4917
 
4918
+ // src/commands/eval/discover.ts
4919
+ import path16 from "node:path";
4920
+ import fg2 from "fast-glob";
4921
+ async function discoverEvalFiles(cwd) {
4922
+ const repoRoot = await findRepoRoot(cwd);
4923
+ const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
4924
+ const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
4925
+ const ignore = ["**/node_modules/**", "**/dist/**"];
4926
+ const matches = await fg2(patterns, {
4927
+ cwd,
4928
+ absolute: true,
4929
+ onlyFiles: true,
4930
+ ignore,
4931
+ followSymbolicLinks: true,
4932
+ caseSensitiveMatch: false
4933
+ });
4934
+ const evalFiles = matches.map((absPath) => {
4935
+ const relativePath = path16.relative(cwd, absPath);
4936
+ const category = deriveCategory(relativePath);
4937
+ return { path: absPath, relativePath, category };
4938
+ });
4939
+ evalFiles.sort((a, b) => a.relativePath.localeCompare(b.relativePath));
4940
+ return evalFiles;
4941
+ }
4942
+ function getCategories(files) {
4943
+ const categories = /* @__PURE__ */ new Set();
4944
+ for (const file of files) {
4945
+ categories.add(file.category);
4946
+ }
4947
+ const sorted = Array.from(categories);
4948
+ sorted.sort();
4949
+ return sorted;
4950
+ }
4951
+ function filterByCategory(files, category) {
4952
+ return files.filter((f) => f.category === category);
4953
+ }
4954
+
4943
4955
  export {
4944
4956
  package_default,
4945
4957
  toSnakeCaseDeep,
@@ -4948,12 +4960,13 @@ export {
4948
4960
  buildDefaultRunDir,
4949
4961
  resolveExistingRunPrimaryPath,
4950
4962
  resolveWorkspaceOrFilePath,
4951
- writeArtifactsFromResults,
4963
+ resolveRunManifestPath,
4952
4964
  parseResultManifest,
4953
4965
  resolveResultSourcePath,
4954
4966
  loadManifestResults,
4955
4967
  loadLightweightResults,
4956
4968
  HtmlWriter,
4969
+ writeArtifactsFromResults,
4957
4970
  resolveRunCacheFile,
4958
4971
  loadRunCache,
4959
4972
  resolveEvalPaths,
@@ -4966,6 +4979,9 @@ export {
4966
4979
  TARGET_FILE_CANDIDATES,
4967
4980
  fileExists,
4968
4981
  selectTarget,
4969
- runEvalCommand
4982
+ runEvalCommand,
4983
+ discoverEvalFiles,
4984
+ getCategories,
4985
+ filterByCategory
4970
4986
  };
4971
- //# sourceMappingURL=chunk-MHWYA4CS.js.map
4987
+ //# sourceMappingURL=chunk-QBZJSQXV.js.map