agentv 4.6.1 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,8 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
2
2
  import {
3
3
  CLI_PLACEHOLDERS,
4
4
  COMMON_TARGET_SETTINGS,
5
+ DEFAULT_EVAL_PATTERNS,
6
+ DEFAULT_THRESHOLD,
5
7
  KNOWN_PROVIDERS,
6
8
  PROVIDER_ALIASES,
7
9
  ResponseCache,
@@ -9,6 +11,7 @@ import {
9
11
  buildSearchRoots,
10
12
  deriveCategory,
11
13
  ensureVSCodeSubagents,
14
+ findDeprecatedCamelCaseTargetWarnings,
12
15
  findGitRoot,
13
16
  interpolateEnv,
14
17
  isEvaluatorKind,
@@ -29,12 +32,12 @@ import {
29
32
  subscribeToCopilotCliLogEntries,
30
33
  subscribeToCopilotSdkLogEntries,
31
34
  subscribeToPiLogEntries
32
- } from "./chunk-YXXD27OK.js";
35
+ } from "./chunk-I6UE4LHZ.js";
33
36
 
34
37
  // package.json
35
38
  var package_default = {
36
39
  name: "agentv",
37
- version: "4.6.1",
40
+ version: "4.7.0",
38
41
  description: "CLI entry point for AgentV",
39
42
  type: "module",
40
43
  repository: {
@@ -346,6 +349,9 @@ function buildDefaultRunDir(cwd) {
346
349
  function resolveRunIndexPath(runDir) {
347
350
  return path3.join(runDir, RESULT_INDEX_FILENAME);
348
351
  }
352
+ function isRunManifestPath(filePath) {
353
+ return path3.basename(filePath) === RESULT_INDEX_FILENAME;
354
+ }
349
355
  function resolveExistingRunPrimaryPath(runDir) {
350
356
  const indexPath = resolveRunIndexPath(runDir);
351
357
  if (existsSync(indexPath)) {
@@ -370,9 +376,19 @@ function resolveWorkspaceOrFilePath(filePath) {
370
376
  }
371
377
  return existing;
372
378
  }
379
+ function resolveRunManifestPath(filePath) {
380
+ if (isDirectoryPath(filePath)) {
381
+ return resolveWorkspaceOrFilePath(filePath);
382
+ }
383
+ if (!isRunManifestPath(filePath)) {
384
+ throw new Error(
385
+ `Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
386
+ );
387
+ }
388
+ return filePath;
389
+ }
373
390
 
374
391
  // src/commands/eval/artifact-writer.ts
375
- var PASS_THRESHOLD = 0.8;
376
392
  function computeStats(values) {
377
393
  if (values.length === 0) {
378
394
  return { mean: 0, stddev: 0 };
@@ -387,10 +403,10 @@ function computeStats(values) {
387
403
  function computePassRate(result) {
388
404
  const scores = result.scores;
389
405
  if (scores && scores.length > 0) {
390
- const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
406
+ const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
391
407
  return passed / scores.length;
392
408
  }
393
- return (result.score ?? 0) >= PASS_THRESHOLD ? 1 : 0;
409
+ return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
394
410
  }
395
411
  function countToolCalls(result) {
396
412
  const toolCalls = {};
@@ -596,12 +612,12 @@ function safeArtifactPathSegment(value, fallback) {
596
612
  function safeTestId(testId) {
597
613
  return safeArtifactPathSegment(testId, "unknown");
598
614
  }
599
- function getDataset(result) {
600
- return result.dataset;
615
+ function getSuite(result) {
616
+ return result.suite;
601
617
  }
602
618
  function buildArtifactSubdir(result) {
603
619
  const segments = [];
604
- const evalSet = getDataset(result);
620
+ const evalSet = getSuite(result);
605
621
  if (evalSet) {
606
622
  segments.push(safeArtifactPathSegment(evalSet, "default"));
607
623
  }
@@ -628,7 +644,7 @@ function buildResultIndexArtifact(result) {
628
644
  return {
629
645
  timestamp: result.timestamp,
630
646
  test_id: result.testId ?? "unknown",
631
- dataset: getDataset(result),
647
+ suite: getSuite(result),
632
648
  category: result.category,
633
649
  conversation_id: result.conversationId,
634
650
  score: result.score,
@@ -651,42 +667,6 @@ async function writeJsonlFile(filePath, records) {
651
667
  `;
652
668
  await writeFile(filePath, content, "utf8");
653
669
  }
654
- function toCamelCase(str) {
655
- return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
656
- }
657
- function toCamelCaseDeep(obj) {
658
- if (obj === null || obj === void 0) {
659
- return obj;
660
- }
661
- if (Array.isArray(obj)) {
662
- return obj.map((item) => toCamelCaseDeep(item));
663
- }
664
- if (typeof obj === "object") {
665
- const result = {};
666
- for (const [key, value] of Object.entries(obj)) {
667
- result[toCamelCase(key)] = toCamelCaseDeep(value);
668
- }
669
- return result;
670
- }
671
- return obj;
672
- }
673
- function parseJsonlResults(content) {
674
- const results = [];
675
- const lines = content.split("\n");
676
- for (const line of lines) {
677
- const trimmed = line.trim();
678
- if (trimmed.length === 0) {
679
- continue;
680
- }
681
- try {
682
- const parsed = JSON.parse(trimmed);
683
- const camelCased = toCamelCaseDeep(parsed);
684
- results.push(camelCased);
685
- } catch {
686
- }
687
- }
688
- return results;
689
- }
690
670
  async function writeArtifactsFromResults(results, outputDir, options) {
691
671
  const testArtifactDir = outputDir;
692
672
  const timingPath = path4.join(outputDir, "timing.json");
@@ -733,7 +713,6 @@ async function writeArtifactsFromResults(results, outputDir, options) {
733
713
 
734
714
  // src/commands/eval/benchmark-writer.ts
735
715
  import { writeFile as writeFile2 } from "node:fs/promises";
736
- var PASS_THRESHOLD2 = 0.8;
737
716
  function computeStats2(values) {
738
717
  if (values.length === 0) {
739
718
  return { mean: 0, stddev: 0 };
@@ -748,10 +727,10 @@ function computeStats2(values) {
748
727
  function computePassRate2(result) {
749
728
  const scores = result.scores;
750
729
  if (scores && scores.length > 0) {
751
- const passed = scores.filter((s) => s.score >= PASS_THRESHOLD2).length;
730
+ const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
752
731
  return passed / scores.length;
753
732
  }
754
- return result.score >= PASS_THRESHOLD2 ? 1 : 0;
733
+ return result.score >= DEFAULT_THRESHOLD ? 1 : 0;
755
734
  }
756
735
  function buildBenchmarkJson(results) {
757
736
  const passRates = results.map(computePassRate2);
@@ -1698,7 +1677,7 @@ var JunitWriter = class _JunitWriter {
1698
1677
  this.closed = true;
1699
1678
  const grouped = /* @__PURE__ */ new Map();
1700
1679
  for (const result of this.results) {
1701
- const suite = result.dataset ?? "default";
1680
+ const suite = result.suite ?? "default";
1702
1681
  const existing = grouped.get(suite);
1703
1682
  if (existing) {
1704
1683
  existing.push(result);
@@ -1708,14 +1687,17 @@ var JunitWriter = class _JunitWriter {
1708
1687
  }
1709
1688
  const suiteXmls = [];
1710
1689
  for (const [suiteName, results] of grouped) {
1711
- const failures = results.filter((r) => r.score < this.threshold).length;
1712
- const errors = results.filter((r) => r.error !== void 0).length;
1690
+ const errors = results.filter((r) => r.executionStatus === "execution_error").length;
1691
+ const failures = results.filter(
1692
+ (r) => r.executionStatus !== "execution_error" && r.score < this.threshold
1693
+ ).length;
1713
1694
  const testCases = results.map((r) => {
1714
1695
  const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
1715
1696
  let inner = "";
1716
- if (r.error) {
1697
+ if (r.executionStatus === "execution_error") {
1698
+ const errorMsg = r.error ?? "Execution error";
1717
1699
  inner = `
1718
- <error message="${escapeXml(r.error)}">${escapeXml(r.error)}</error>
1700
+ <error message="${escapeXml(errorMsg)}">${escapeXml(errorMsg)}</error>
1719
1701
  `;
1720
1702
  } else if (r.score < this.threshold) {
1721
1703
  const message = `score=${r.score.toFixed(3)}`;
@@ -1737,8 +1719,10 @@ ${testCases.join("\n")}
1737
1719
  );
1738
1720
  }
1739
1721
  const totalTests = this.results.length;
1740
- const totalFailures = this.results.filter((r) => r.score < this.threshold).length;
1741
- const totalErrors = this.results.filter((r) => r.error !== void 0).length;
1722
+ const totalErrors = this.results.filter((r) => r.executionStatus === "execution_error").length;
1723
+ const totalFailures = this.results.filter(
1724
+ (r) => r.executionStatus !== "execution_error" && r.score < this.threshold
1725
+ ).length;
1742
1726
  const xml = `<?xml version="1.0" encoding="UTF-8"?>
1743
1727
  <testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">
1744
1728
  ${suiteXmls.join("\n")}
@@ -1926,12 +1910,12 @@ var ProgressDisplay = class {
1926
1910
  }
1927
1911
  addLogPaths(paths, provider) {
1928
1912
  const newPaths = [];
1929
- for (const path16 of paths) {
1930
- if (this.logPathSet.has(path16)) {
1913
+ for (const path17 of paths) {
1914
+ if (this.logPathSet.has(path17)) {
1931
1915
  continue;
1932
1916
  }
1933
- this.logPathSet.add(path16);
1934
- newPaths.push(path16);
1917
+ this.logPathSet.add(path17);
1918
+ newPaths.push(path17);
1935
1919
  }
1936
1920
  if (newPaths.length === 0) {
1937
1921
  return;
@@ -1944,8 +1928,8 @@ var ProgressDisplay = class {
1944
1928
  this.hasPrintedLogHeader = true;
1945
1929
  }
1946
1930
  const startIndex = this.logPaths.length - newPaths.length;
1947
- newPaths.forEach((path16, offset) => {
1948
- console.log(`${startIndex + offset + 1}. ${path16}`);
1931
+ newPaths.forEach((path17, offset) => {
1932
+ console.log(`${startIndex + offset + 1}. ${path17}`);
1949
1933
  });
1950
1934
  }
1951
1935
  finish() {
@@ -1962,9 +1946,6 @@ import path12 from "node:path";
1962
1946
  function parseJsonlLines(content) {
1963
1947
  return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
1964
1948
  }
1965
- function isIndexManifestPath(sourceFile) {
1966
- return path12.basename(sourceFile) === RESULT_INDEX_FILENAME;
1967
- }
1968
1949
  function parseMarkdownMessages(content) {
1969
1950
  const trimmed = content.trim();
1970
1951
  if (!trimmed.startsWith("@[")) {
@@ -2022,11 +2003,11 @@ function hydrateOutput(baseDir, record) {
2022
2003
  function hydrateManifestRecord(baseDir, record) {
2023
2004
  const grading = readOptionalJson(baseDir, record.grading_path);
2024
2005
  const timing = readOptionalJson(baseDir, record.timing_path);
2025
- const testId = record.test_id ?? record.eval_id ?? "unknown";
2006
+ const testId = record.test_id ?? "unknown";
2026
2007
  return {
2027
2008
  timestamp: record.timestamp,
2028
2009
  testId,
2029
- dataset: record.dataset,
2010
+ suite: record.suite,
2030
2011
  category: record.category,
2031
2012
  target: record.target,
2032
2013
  score: record.score,
@@ -2066,74 +2047,44 @@ function parseResultManifest(content) {
2066
2047
  }
2067
2048
  function resolveResultSourcePath(source, cwd) {
2068
2049
  const resolved = path12.isAbsolute(source) ? source : path12.resolve(cwd ?? process.cwd(), source);
2069
- return resolveWorkspaceOrFilePath(resolved);
2050
+ if (isDirectoryPath(resolved) || path12.basename(resolved) === RESULT_INDEX_FILENAME) {
2051
+ return resolveRunManifestPath(resolved);
2052
+ }
2053
+ return resolved;
2070
2054
  }
2071
2055
  function loadManifestResults(sourceFile) {
2072
- const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
2073
- if (!isIndexManifestPath(resolvedSourceFile)) {
2074
- return parseJsonlResults(readFileSync(resolvedSourceFile, "utf8"));
2075
- }
2056
+ const resolvedSourceFile = resolveRunManifestPath(sourceFile);
2076
2057
  const content = readFileSync(resolvedSourceFile, "utf8");
2077
2058
  const records = parseResultManifest(content);
2078
2059
  const baseDir = path12.dirname(resolvedSourceFile);
2079
2060
  return records.map((record) => hydrateManifestRecord(baseDir, record));
2080
2061
  }
2081
2062
  function loadLightweightResults(sourceFile) {
2082
- const resolvedSourceFile = resolveWorkspaceOrFilePath(sourceFile);
2063
+ const resolvedSourceFile = resolveRunManifestPath(sourceFile);
2083
2064
  const content = readFileSync(resolvedSourceFile, "utf8");
2084
- if (isIndexManifestPath(resolvedSourceFile)) {
2085
- return parseResultManifest(content).map((record) => ({
2086
- testId: record.test_id ?? record.eval_id ?? "unknown",
2087
- target: record.target,
2088
- experiment: record.experiment,
2089
- score: record.score,
2090
- scores: record.scores,
2091
- executionStatus: record.execution_status,
2092
- error: record.error,
2093
- timestamp: record.timestamp
2094
- }));
2095
- }
2096
- const records = [];
2097
- for (const line of content.split(/\r?\n/)) {
2098
- const trimmed = line.trim();
2099
- if (!trimmed) {
2100
- continue;
2101
- }
2102
- let record;
2103
- try {
2104
- record = JSON.parse(trimmed);
2105
- } catch {
2106
- continue;
2107
- }
2108
- const rawTestId = record.test_id ?? record.eval_id ?? record.testId ?? record.evalId;
2109
- if (typeof rawTestId !== "string") {
2110
- throw new Error(`Missing test_id in result: ${trimmed}`);
2111
- }
2112
- if (typeof record.score !== "number") {
2113
- throw new Error(`Missing or invalid score in result: ${trimmed}`);
2114
- }
2115
- records.push({
2116
- testId: rawTestId,
2117
- target: typeof record.target === "string" ? record.target : void 0,
2118
- score: record.score,
2119
- scores: Array.isArray(record.scores) ? record.scores : void 0,
2120
- executionStatus: typeof record.execution_status === "string" ? record.execution_status : typeof record.executionStatus === "string" ? record.executionStatus : void 0,
2121
- error: typeof record.error === "string" ? record.error : void 0,
2122
- timestamp: typeof record.timestamp === "string" ? record.timestamp : void 0
2123
- });
2124
- }
2125
- return records;
2065
+ return parseResultManifest(content).map((record) => ({
2066
+ testId: record.test_id ?? "unknown",
2067
+ suite: record.suite,
2068
+ target: record.target,
2069
+ experiment: record.experiment,
2070
+ score: record.score,
2071
+ scores: record.scores,
2072
+ executionStatus: record.execution_status,
2073
+ error: record.error,
2074
+ timestamp: record.timestamp
2075
+ }));
2126
2076
  }
2127
2077
 
2128
2078
  // src/commands/eval/retry-errors.ts
2079
+ async function loadRetrySourceResults(jsonlPath) {
2080
+ return loadManifestResults(resolveResultSourcePath(jsonlPath));
2081
+ }
2129
2082
  async function loadErrorTestIds(jsonlPath) {
2130
- const resolvedPath = resolveResultSourcePath(jsonlPath);
2131
- const ids = loadLightweightResults(resolvedPath).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
2083
+ const ids = (await loadRetrySourceResults(jsonlPath)).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
2132
2084
  return [...new Set(ids)];
2133
2085
  }
2134
2086
  async function loadNonErrorResults(jsonlPath) {
2135
- const resolvedPath = resolveResultSourcePath(jsonlPath);
2136
- return loadManifestResults(resolvedPath).filter(
2087
+ return (await loadRetrySourceResults(jsonlPath)).filter(
2137
2088
  (result) => result.testId && result.executionStatus !== "execution_error"
2138
2089
  );
2139
2090
  }
@@ -2146,7 +2097,7 @@ function resolveRunCacheFile(cache) {
2146
2097
  if (cache.lastRunDir) {
2147
2098
  return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
2148
2099
  }
2149
- return cache.lastResultFile ?? "";
2100
+ return "";
2150
2101
  }
2151
2102
  function cachePath(cwd) {
2152
2103
  return path13.join(cwd, ".agentv", CACHE_FILENAME);
@@ -2160,15 +2111,14 @@ async function loadRunCache(cwd) {
2160
2111
  }
2161
2112
  }
2162
2113
  async function saveRunCache(cwd, resultPath) {
2114
+ if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
2115
+ return;
2116
+ }
2163
2117
  const dir = path13.join(cwd, ".agentv");
2164
2118
  await mkdir7(dir, { recursive: true });
2165
- const basename = path13.basename(resultPath);
2166
- const cache = basename === RESULT_INDEX_FILENAME ? {
2119
+ const cache = {
2167
2120
  lastRunDir: path13.dirname(resultPath),
2168
2121
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
2169
- } : {
2170
- lastResultFile: resultPath,
2171
- timestamp: (/* @__PURE__ */ new Date()).toISOString()
2172
2122
  };
2173
2123
  await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
2174
2124
  `, "utf-8");
@@ -2313,11 +2263,21 @@ function formatEvaluationSummary(summary, options) {
2313
2263
  }
2314
2264
  const gradedCount = summary.total - summary.executionErrorCount;
2315
2265
  const threshold = options?.threshold ?? 0.8;
2316
- const overallPassed = summary.passedCount === gradedCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0;
2317
- const overallVerdict = overallPassed ? "PASS" : "FAIL";
2266
+ const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
2267
+ const overallPassed = !allExecutionErrors && (summary.passedCount === gradedCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0);
2318
2268
  const useColor = !(process.env.NO_COLOR !== void 0) && (process.stdout.isTTY ?? false);
2319
- const verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
2320
- const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
2269
+ let overallVerdict;
2270
+ let verdictColor;
2271
+ let verdictText;
2272
+ if (allExecutionErrors) {
2273
+ overallVerdict = "INCONCLUSIVE";
2274
+ verdictColor = "\x1B[33m";
2275
+ verdictText = `RESULT: INCONCLUSIVE (all ${summary.total} test(s) had execution errors \u2014 no evaluation was performed)`;
2276
+ } else {
2277
+ overallVerdict = overallPassed ? "PASS" : "FAIL";
2278
+ verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
2279
+ verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
2280
+ }
2321
2281
  lines.push("\n==================================================");
2322
2282
  if (useColor) {
2323
2283
  lines.push(`\x1B[1m${verdictColor}${verdictText}\x1B[0m`);
@@ -2527,7 +2487,7 @@ var KNOWN_TEST_FIELDS = /* @__PURE__ */ new Set([
2527
2487
  "workspace",
2528
2488
  "metadata",
2529
2489
  "conversation_id",
2530
- "dataset",
2490
+ "suite",
2531
2491
  "note"
2532
2492
  ]);
2533
2493
  var NAME_PATTERN = /^[a-z0-9-]+$/;
@@ -3090,87 +3050,68 @@ function isObject2(value) {
3090
3050
  var COMMON_SETTINGS = new Set(COMMON_TARGET_SETTINGS);
3091
3051
  var RETRY_SETTINGS = /* @__PURE__ */ new Set([
3092
3052
  "max_retries",
3093
- "maxRetries",
3094
3053
  "retry_initial_delay_ms",
3095
- "retryInitialDelayMs",
3096
3054
  "retry_max_delay_ms",
3097
- "retryMaxDelayMs",
3098
3055
  "retry_backoff_factor",
3099
- "retryBackoffFactor",
3100
- "retry_status_codes",
3101
- "retryStatusCodes"
3056
+ "retry_status_codes"
3102
3057
  ]);
3103
3058
  var AZURE_SETTINGS = /* @__PURE__ */ new Set([
3104
3059
  ...COMMON_SETTINGS,
3105
3060
  ...RETRY_SETTINGS,
3106
3061
  "endpoint",
3107
3062
  "resource",
3108
- "resourceName",
3109
3063
  "api_key",
3110
- "apiKey",
3111
3064
  "deployment",
3112
- "deploymentName",
3113
3065
  "model",
3114
3066
  "version",
3115
3067
  "api_version",
3068
+ "api_format",
3116
3069
  "temperature",
3117
- "max_output_tokens",
3118
- "maxTokens"
3070
+ "max_output_tokens"
3119
3071
  ]);
3120
3072
  var OPENAI_SETTINGS = /* @__PURE__ */ new Set([
3121
3073
  ...COMMON_SETTINGS,
3122
3074
  ...RETRY_SETTINGS,
3123
3075
  "endpoint",
3124
3076
  "base_url",
3125
- "baseUrl",
3126
3077
  "api_key",
3127
- "apiKey",
3128
3078
  "model",
3129
3079
  "deployment",
3130
3080
  "variant",
3131
3081
  "api_format",
3132
- "apiFormat",
3133
3082
  "temperature",
3134
- "max_output_tokens",
3135
- "maxTokens"
3083
+ "max_output_tokens"
3136
3084
  ]);
3137
3085
  var OPENROUTER_SETTINGS = /* @__PURE__ */ new Set([
3138
3086
  ...COMMON_SETTINGS,
3139
3087
  ...RETRY_SETTINGS,
3140
3088
  "api_key",
3141
- "apiKey",
3142
3089
  "model",
3143
3090
  "deployment",
3144
3091
  "variant",
3145
3092
  "temperature",
3146
- "max_output_tokens",
3147
- "maxTokens"
3093
+ "max_output_tokens"
3148
3094
  ]);
3149
3095
  var ANTHROPIC_SETTINGS = /* @__PURE__ */ new Set([
3150
3096
  ...COMMON_SETTINGS,
3151
3097
  ...RETRY_SETTINGS,
3152
3098
  "api_key",
3153
- "apiKey",
3154
3099
  "model",
3155
3100
  "deployment",
3156
3101
  "variant",
3157
3102
  "temperature",
3158
3103
  "max_output_tokens",
3159
- "maxTokens",
3160
- "thinking_budget",
3161
- "thinkingBudget"
3104
+ "thinking_budget"
3162
3105
  ]);
3163
3106
  var GEMINI_SETTINGS = /* @__PURE__ */ new Set([
3164
3107
  ...COMMON_SETTINGS,
3165
3108
  ...RETRY_SETTINGS,
3166
3109
  "api_key",
3167
- "apiKey",
3168
3110
  "model",
3169
3111
  "deployment",
3170
3112
  "variant",
3171
3113
  "temperature",
3172
- "max_output_tokens",
3173
- "maxTokens"
3114
+ "max_output_tokens"
3174
3115
  ]);
3175
3116
  var CODEX_SETTINGS = /* @__PURE__ */ new Set([
3176
3117
  ...COMMON_SETTINGS,
@@ -3182,40 +3123,25 @@ var CODEX_SETTINGS = /* @__PURE__ */ new Set([
3182
3123
  "arguments",
3183
3124
  "cwd",
3184
3125
  "timeout_seconds",
3185
- "timeoutSeconds",
3186
3126
  "log_dir",
3187
- "logDir",
3188
3127
  "log_directory",
3189
- "logDirectory",
3190
3128
  "log_format",
3191
- "logFormat",
3192
3129
  "log_output_format",
3193
- "logOutputFormat",
3194
3130
  "system_prompt",
3195
- "systemPrompt",
3196
- "workspace_template",
3197
- "workspaceTemplate"
3131
+ "workspace_template"
3198
3132
  ]);
3199
3133
  var COPILOT_SDK_SETTINGS = /* @__PURE__ */ new Set([
3200
3134
  ...COMMON_SETTINGS,
3201
3135
  "cli_url",
3202
- "cliUrl",
3203
3136
  "cli_path",
3204
- "cliPath",
3205
3137
  "github_token",
3206
- "githubToken",
3207
3138
  "model",
3208
3139
  "cwd",
3209
3140
  "timeout_seconds",
3210
- "timeoutSeconds",
3211
3141
  "log_dir",
3212
- "logDir",
3213
3142
  "log_format",
3214
- "logFormat",
3215
3143
  "system_prompt",
3216
- "systemPrompt",
3217
- "workspace_template",
3218
- "workspaceTemplate"
3144
+ "workspace_template"
3219
3145
  ]);
3220
3146
  var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
3221
3147
  ...COMMON_SETTINGS,
@@ -3227,35 +3153,23 @@ var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
3227
3153
  "model",
3228
3154
  "cwd",
3229
3155
  "timeout_seconds",
3230
- "timeoutSeconds",
3231
3156
  "log_dir",
3232
- "logDir",
3233
3157
  "log_format",
3234
- "logFormat",
3235
3158
  "system_prompt",
3236
- "systemPrompt",
3237
- "workspace_template",
3238
- "workspaceTemplate"
3159
+ "workspace_template"
3239
3160
  ]);
3240
3161
  var VSCODE_SETTINGS = /* @__PURE__ */ new Set([
3241
3162
  ...COMMON_SETTINGS,
3242
3163
  "executable",
3243
3164
  "workspace_template",
3244
- "workspaceTemplate",
3245
3165
  "wait",
3246
3166
  "dry_run",
3247
- "dryRun",
3248
3167
  "subagent_root",
3249
- "subagentRoot",
3250
- "timeout_seconds",
3251
- "timeoutSeconds"
3168
+ "timeout_seconds"
3252
3169
  ]);
3253
3170
  var MOCK_SETTINGS = /* @__PURE__ */ new Set([
3254
3171
  ...COMMON_SETTINGS,
3255
3172
  "response",
3256
- "delayMs",
3257
- "delayMinMs",
3258
- "delayMaxMs",
3259
3173
  "trace"
3260
3174
  // For testing tool-trajectory evaluator
3261
3175
  ]);
@@ -3264,23 +3178,14 @@ var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
3264
3178
  "model",
3265
3179
  "cwd",
3266
3180
  "timeout_seconds",
3267
- "timeoutSeconds",
3268
3181
  "log_dir",
3269
- "logDir",
3270
3182
  "log_directory",
3271
- "logDirectory",
3272
3183
  "log_format",
3273
- "logFormat",
3274
3184
  "log_output_format",
3275
- "logOutputFormat",
3276
3185
  "system_prompt",
3277
- "systemPrompt",
3278
3186
  "workspace_template",
3279
- "workspaceTemplate",
3280
3187
  "max_turns",
3281
- "maxTurns",
3282
- "max_budget_usd",
3283
- "maxBudgetUsd"
3188
+ "max_budget_usd"
3284
3189
  ]);
3285
3190
  function getKnownSettings(provider) {
3286
3191
  const normalizedProvider = provider.toLowerCase();
@@ -3405,15 +3310,15 @@ async function validateTargetsFile(filePath) {
3405
3310
  });
3406
3311
  return;
3407
3312
  }
3408
- const timeoutSeconds = healthcheck.timeout_seconds ?? healthcheck.timeoutSeconds;
3313
+ const timeoutSeconds = healthcheck.timeout_seconds;
3409
3314
  if (timeoutSeconds !== void 0) {
3410
3315
  const numericTimeout = Number(timeoutSeconds);
3411
3316
  if (!Number.isFinite(numericTimeout) || numericTimeout <= 0) {
3412
3317
  errors2.push({
3413
3318
  severity: "error",
3414
3319
  filePath: absolutePath2,
3415
- location: `${location}.timeoutSeconds`,
3416
- message: "healthcheck.timeoutSeconds must be a positive number when provided"
3320
+ location: `${location}.timeout_seconds`,
3321
+ message: "healthcheck.timeout_seconds must be a positive number when provided"
3417
3322
  });
3418
3323
  }
3419
3324
  }
@@ -3512,6 +3417,18 @@ async function validateTargetsFile(filePath) {
3512
3417
  });
3513
3418
  continue;
3514
3419
  }
3420
+ for (const warning of findDeprecatedCamelCaseTargetWarnings(target, location)) {
3421
+ const fieldMatch = warning.message.match(/field '([^']+)'/);
3422
+ const replacementMatch = warning.message.match(/Use '([^']+)' instead/);
3423
+ const field = fieldMatch?.[1] ?? "unknown";
3424
+ const replacement = replacementMatch?.[1] ?? "snake_case";
3425
+ errors.push({
3426
+ severity: "error",
3427
+ filePath: absolutePath,
3428
+ location: warning.location,
3429
+ message: `camelCase field '${field}' is no longer supported in targets.yaml. Use '${replacement}' instead.`
3430
+ });
3431
+ }
3515
3432
  const name = target.name;
3516
3433
  if (typeof name !== "string" || name.trim().length === 0) {
3517
3434
  errors.push({
@@ -3891,7 +3808,9 @@ Errors in ${targetsFilePath}:`);
3891
3808
  };
3892
3809
  }
3893
3810
  try {
3894
- const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
3811
+ const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath, {
3812
+ emitDeprecationWarnings: false
3813
+ });
3895
3814
  return {
3896
3815
  definitions,
3897
3816
  resolvedTarget,
@@ -3974,7 +3893,9 @@ Errors in ${targetsFilePath}:`);
3974
3893
  });
3975
3894
  } else {
3976
3895
  try {
3977
- const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
3896
+ const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath, {
3897
+ emitDeprecationWarnings: false
3898
+ });
3978
3899
  results.push({
3979
3900
  definitions,
3980
3901
  resolvedTarget,
@@ -4043,6 +3964,16 @@ function normalizeStringArray(value) {
4043
3964
  }
4044
3965
  return [];
4045
3966
  }
3967
+ function normalizeFilter(value) {
3968
+ if (Array.isArray(value)) {
3969
+ const filters = normalizeStringArray(value);
3970
+ if (filters.length === 0) {
3971
+ return void 0;
3972
+ }
3973
+ return filters.length === 1 ? filters[0] : filters;
3974
+ }
3975
+ return normalizeString(value);
3976
+ }
4046
3977
  function matchesTagFilters(fileTags, includeTags, excludeTags) {
4047
3978
  const tags = new Set(fileTags ?? []);
4048
3979
  if (includeTags.length > 0) {
@@ -4132,7 +4063,7 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
4132
4063
  target: singleTarget,
4133
4064
  cliTargets,
4134
4065
  targetsPath: normalizeString(rawOptions.targets),
4135
- filter: normalizeString(rawOptions.filter),
4066
+ filter: normalizeFilter(rawOptions.filter),
4136
4067
  workers: workers > 0 ? workers : void 0,
4137
4068
  outPath: cliOut ?? configOut,
4138
4069
  outputPaths,
@@ -4165,7 +4096,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
4165
4096
  outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
4166
4097
  threshold: normalizeOptionalNumber(rawOptions.threshold),
4167
4098
  tags: normalizeStringArray(rawOptions.tag),
4168
- excludeTags: normalizeStringArray(rawOptions.excludeTag)
4099
+ excludeTags: normalizeStringArray(rawOptions.excludeTag),
4100
+ transcript: normalizeString(rawOptions.transcript)
4169
4101
  };
4170
4102
  }
4171
4103
  async function ensureFileExists(filePath, description) {
@@ -4191,20 +4123,20 @@ function createProgressReporter(maxWorkers, options) {
4191
4123
  addLogPaths: (paths, provider) => display.addLogPaths(paths, provider)
4192
4124
  };
4193
4125
  }
4194
- function makeEvalKey(testFilePath, evalId) {
4195
- return `${path15.resolve(testFilePath)}::${evalId}`;
4126
+ function makeTestCaseKey(testFilePath, testId) {
4127
+ return `${path15.resolve(testFilePath)}::${testId}`;
4196
4128
  }
4197
4129
  function createDisplayIdTracker() {
4198
4130
  const map = /* @__PURE__ */ new Map();
4199
4131
  let nextId = 1;
4200
4132
  return {
4201
- getOrAssign(evalKey) {
4202
- const existing = map.get(evalKey);
4133
+ getOrAssign(testCaseKey) {
4134
+ const existing = map.get(testCaseKey);
4203
4135
  if (existing !== void 0) {
4204
4136
  return existing;
4205
4137
  }
4206
4138
  const assigned = nextId++;
4207
- map.set(evalKey, assigned);
4139
+ map.set(testCaseKey, assigned);
4208
4140
  return assigned;
4209
4141
  }
4210
4142
  };
@@ -4255,58 +4187,79 @@ async function prepareFileMetadata(params) {
4255
4187
  filter: options.filter,
4256
4188
  category
4257
4189
  });
4258
- const filteredIds = suite.tests.map((value) => value.id);
4259
- const cliTargets = options.cliTargets;
4190
+ const testIds = suite.tests.map((value) => value.id);
4260
4191
  const suiteTargets = suite.targets;
4261
- let targetNames;
4262
- if (cliTargets.length > 0) {
4263
- targetNames = cliTargets;
4264
- } else if (suiteTargets && suiteTargets.length > 0) {
4265
- targetNames = suiteTargets;
4266
- } else {
4267
- targetNames = [];
4268
- }
4269
4192
  let selections;
4270
- if (targetNames.length > 1) {
4271
- const multiSelections = await selectMultipleTargets({
4272
- testFilePath,
4273
- repoRoot,
4274
- cwd,
4275
- explicitTargetsPath: options.targetsPath,
4276
- dryRun: options.dryRun,
4277
- dryRunDelay: options.dryRunDelay,
4278
- dryRunDelayMin: options.dryRunDelayMin,
4279
- dryRunDelayMax: options.dryRunDelayMax,
4280
- env: process.env,
4281
- targetNames
4282
- });
4283
- selections = multiSelections.map((sel) => ({
4284
- selection: sel,
4285
- inlineTargetLabel: sel.targetName
4286
- }));
4287
- } else {
4288
- const selection = await selectTarget({
4289
- testFilePath,
4290
- repoRoot,
4291
- cwd,
4292
- explicitTargetsPath: options.targetsPath,
4293
- cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
4294
- dryRun: options.dryRun,
4295
- dryRunDelay: options.dryRunDelay,
4296
- dryRunDelayMin: options.dryRunDelayMin,
4297
- dryRunDelayMax: options.dryRunDelayMax,
4298
- env: process.env
4299
- });
4193
+ if (options.transcript) {
4194
+ const transcriptSelection = {
4195
+ definitions: [],
4196
+ resolvedTarget: {
4197
+ kind: "transcript",
4198
+ name: "transcript",
4199
+ config: {}
4200
+ },
4201
+ targetName: "transcript",
4202
+ targetSource: "cli",
4203
+ targetsFilePath: options.transcript
4204
+ };
4300
4205
  selections = [
4301
4206
  {
4302
- selection,
4303
- inlineTargetLabel: selection.targetName
4207
+ selection: transcriptSelection,
4208
+ inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
4304
4209
  }
4305
4210
  ];
4211
+ } else {
4212
+ const cliTargets = options.cliTargets;
4213
+ const suiteTargets2 = suite.targets;
4214
+ let targetNames;
4215
+ if (cliTargets.length > 0) {
4216
+ targetNames = cliTargets;
4217
+ } else if (suiteTargets2 && suiteTargets2.length > 0) {
4218
+ targetNames = suiteTargets2;
4219
+ } else {
4220
+ targetNames = [];
4221
+ }
4222
+ if (targetNames.length > 1) {
4223
+ const multiSelections = await selectMultipleTargets({
4224
+ testFilePath,
4225
+ repoRoot,
4226
+ cwd,
4227
+ explicitTargetsPath: options.targetsPath,
4228
+ dryRun: options.dryRun,
4229
+ dryRunDelay: options.dryRunDelay,
4230
+ dryRunDelayMin: options.dryRunDelayMin,
4231
+ dryRunDelayMax: options.dryRunDelayMax,
4232
+ env: process.env,
4233
+ targetNames
4234
+ });
4235
+ selections = multiSelections.map((sel) => ({
4236
+ selection: sel,
4237
+ inlineTargetLabel: sel.targetName
4238
+ }));
4239
+ } else {
4240
+ const selection = await selectTarget({
4241
+ testFilePath,
4242
+ repoRoot,
4243
+ cwd,
4244
+ explicitTargetsPath: options.targetsPath,
4245
+ cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
4246
+ dryRun: options.dryRun,
4247
+ dryRunDelay: options.dryRunDelay,
4248
+ dryRunDelayMin: options.dryRunDelayMin,
4249
+ dryRunDelayMax: options.dryRunDelayMax,
4250
+ env: process.env
4251
+ });
4252
+ selections = [
4253
+ {
4254
+ selection,
4255
+ inlineTargetLabel: selection.targetName
4256
+ }
4257
+ ];
4258
+ }
4306
4259
  }
4307
4260
  return {
4308
- evalIds: filteredIds,
4309
- evalCases: suite.tests,
4261
+ testIds,
4262
+ testCases: suite.tests,
4310
4263
  selections,
4311
4264
  trialsConfig: suite.trials,
4312
4265
  suiteTargets,
@@ -4344,15 +4297,16 @@ async function runSingleEvalFile(params) {
4344
4297
  workersOverride,
4345
4298
  yamlWorkers,
4346
4299
  progressReporter,
4347
- seenEvalCases,
4300
+ seenTestCases,
4348
4301
  displayIdTracker,
4349
4302
  selection,
4350
4303
  inlineTargetLabel,
4351
- evalCases,
4304
+ testCases,
4352
4305
  trialsConfig,
4353
4306
  matrixMode,
4354
4307
  totalBudgetUsd,
4355
- failOnError
4308
+ failOnError,
4309
+ providerFactory
4356
4310
  } = params;
4357
4311
  const targetName = selection.targetName;
4358
4312
  await ensureFileExists(testFilePath, "Test file");
@@ -4408,7 +4362,8 @@ async function runSingleEvalFile(params) {
4408
4362
  }
4409
4363
  return true;
4410
4364
  })(),
4411
- evalCases,
4365
+ filter: options.filter,
4366
+ evalCases: testCases,
4412
4367
  verbose: options.verbose,
4413
4368
  maxConcurrency: resolvedWorkers,
4414
4369
  workspaceMode: options.workspaceMode,
@@ -4419,6 +4374,7 @@ async function runSingleEvalFile(params) {
4419
4374
  graderTarget: options.graderTarget,
4420
4375
  model: options.model,
4421
4376
  threshold: options.threshold,
4377
+ providerFactory,
4422
4378
  streamCallbacks: streamingObserver?.getStreamCallbacks(),
4423
4379
  onResult: async (result) => {
4424
4380
  streamingObserver?.completeFromResult?.(result);
@@ -4442,13 +4398,13 @@ async function runSingleEvalFile(params) {
4442
4398
  }
4443
4399
  },
4444
4400
  onProgress: async (event) => {
4445
- const evalKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
4446
- const evalKey = makeEvalKey(testFilePath, evalKeyId);
4447
- if (event.status === "pending" && !seenEvalCases.has(evalKey)) {
4448
- seenEvalCases.add(evalKey);
4449
- progressReporter.setTotal(seenEvalCases.size);
4401
+ const testCaseKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
4402
+ const testCaseKey = makeTestCaseKey(testFilePath, testCaseKeyId);
4403
+ if (event.status === "pending" && !seenTestCases.has(testCaseKey)) {
4404
+ seenTestCases.add(testCaseKey);
4405
+ progressReporter.setTotal(seenTestCases.size);
4450
4406
  }
4451
- const displayId = displayIdTracker.getOrAssign(evalKey);
4407
+ const displayId = displayIdTracker.getOrAssign(testCaseKey);
4452
4408
  if (event.status === "running" && streamingObserver) {
4453
4409
  streamingObserver.startEvalCase(event.testId, targetName, testFilePath);
4454
4410
  }
@@ -4534,7 +4490,7 @@ async function runEvalCommand(input) {
4534
4490
  const useFileExport = !!options.otelFile;
4535
4491
  if (options.exportOtel || useFileExport) {
4536
4492
  try {
4537
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-BN5NUVAB.js");
4493
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-XRVHRBJF.js");
4538
4494
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
4539
4495
  let headers = {};
4540
4496
  if (options.otelBackend) {
@@ -4594,7 +4550,7 @@ async function runEvalCommand(input) {
4594
4550
  }
4595
4551
  const evaluationRunner = await resolveEvaluationRunner();
4596
4552
  const allResults = [];
4597
- const seenEvalCases = /* @__PURE__ */ new Set();
4553
+ const seenTestCases = /* @__PURE__ */ new Set();
4598
4554
  const displayIdTracker = createDisplayIdTracker();
4599
4555
  const totalWorkers = options.workers ?? DEFAULT_WORKERS;
4600
4556
  const fileConcurrency = Math.min(
@@ -4656,7 +4612,6 @@ async function runEvalCommand(input) {
4656
4612
  yamlCache: yamlCacheEnabled
4657
4613
  });
4658
4614
  const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
4659
- const useCache = cacheEnabled;
4660
4615
  if (cacheEnabled) {
4661
4616
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
4662
4617
  }
@@ -4676,7 +4631,7 @@ async function runEvalCommand(input) {
4676
4631
  let totalEvalCount = 0;
4677
4632
  for (const meta of fileMetadata.values()) {
4678
4633
  const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
4679
- for (const test of meta.evalCases) {
4634
+ for (const test of meta.testCases) {
4680
4635
  const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
4681
4636
  totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
4682
4637
  }
@@ -4720,13 +4675,13 @@ async function runEvalCommand(input) {
4720
4675
  });
4721
4676
  for (const [testFilePath, meta] of fileMetadata.entries()) {
4722
4677
  for (const { selection, inlineTargetLabel } of meta.selections) {
4723
- for (const testId of meta.evalIds) {
4724
- const evalKey = makeEvalKey(
4678
+ for (const testId of meta.testIds) {
4679
+ const testCaseKey = makeTestCaseKey(
4725
4680
  testFilePath,
4726
4681
  meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId
4727
4682
  );
4728
- seenEvalCases.add(evalKey);
4729
- const displayId = displayIdTracker.getOrAssign(evalKey);
4683
+ seenTestCases.add(testCaseKey);
4684
+ const displayId = displayIdTracker.getOrAssign(testCaseKey);
4730
4685
  progressReporter.update(displayId, {
4731
4686
  workerId: displayId,
4732
4687
  testId: meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId,
@@ -4737,6 +4692,24 @@ async function runEvalCommand(input) {
4737
4692
  }
4738
4693
  }
4739
4694
  const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
4695
+ let transcriptProviderFactory;
4696
+ if (options.transcript) {
4697
+ const { TranscriptProvider } = await import("./dist-XRVHRBJF.js");
4698
+ const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
4699
+ const totalTests = [...fileMetadata.values()].reduce(
4700
+ (sum, meta) => sum + meta.testCases.length,
4701
+ 0
4702
+ );
4703
+ if (transcriptProvider.lineCount !== totalTests) {
4704
+ throw new Error(
4705
+ `Transcript has ${transcriptProvider.lineCount} entry(s) but eval defines ${totalTests} test(s). Each transcript line maps positionally to one test case.`
4706
+ );
4707
+ }
4708
+ transcriptProviderFactory = () => transcriptProvider;
4709
+ console.log(
4710
+ `Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`
4711
+ );
4712
+ }
4740
4713
  try {
4741
4714
  await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
4742
4715
  const targetPrep = fileMetadata.get(testFilePath);
@@ -4746,13 +4719,13 @@ async function runEvalCommand(input) {
4746
4719
  const targetResults = await Promise.all(
4747
4720
  targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
4748
4721
  const targetName = selection.targetName;
4749
- const applicableEvalCases = targetPrep.selections.length > 1 ? targetPrep.evalCases.filter((test) => {
4722
+ const applicableTestCases = targetPrep.selections.length > 1 ? targetPrep.testCases.filter((test) => {
4750
4723
  if (test.targets && test.targets.length > 0) {
4751
4724
  return test.targets.includes(targetName);
4752
4725
  }
4753
4726
  return true;
4754
- }) : targetPrep.evalCases;
4755
- if (applicableEvalCases.length === 0) {
4727
+ }) : targetPrep.testCases;
4728
+ if (applicableTestCases.length === 0) {
4756
4729
  return [];
4757
4730
  }
4758
4731
  try {
@@ -4768,16 +4741,17 @@ async function runEvalCommand(input) {
4768
4741
  workersOverride: perFileWorkers,
4769
4742
  yamlWorkers: targetPrep.yamlWorkers,
4770
4743
  progressReporter,
4771
- seenEvalCases,
4744
+ seenTestCases,
4772
4745
  displayIdTracker,
4773
4746
  selection,
4774
4747
  inlineTargetLabel,
4775
- evalCases: applicableEvalCases,
4776
- trialsConfig: targetPrep.trialsConfig,
4748
+ testCases: applicableTestCases,
4749
+ trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
4777
4750
  matrixMode: targetPrep.selections.length > 1,
4778
4751
  totalBudgetUsd: targetPrep.totalBudgetUsd,
4779
4752
  failOnError: targetPrep.failOnError,
4780
- threshold: resolvedThreshold
4753
+ threshold: resolvedThreshold,
4754
+ providerFactory: transcriptProviderFactory
4781
4755
  });
4782
4756
  return result.results;
4783
4757
  } catch (fileError) {
@@ -4785,9 +4759,9 @@ async function runEvalCommand(input) {
4785
4759
  console.error(`
4786
4760
  \u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
4787
4761
  `);
4788
- const errorResults = applicableEvalCases.map((evalCase) => ({
4762
+ const errorResults = applicableTestCases.map((testCase) => ({
4789
4763
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4790
- testId: evalCase.id,
4764
+ testId: testCase.id,
4791
4765
  score: 0,
4792
4766
  assertions: [],
4793
4767
  output: [],
@@ -4824,6 +4798,7 @@ async function runEvalCommand(input) {
4824
4798
  const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
4825
4799
  const summary = calculateEvaluationSummary(allResults, thresholdOpts);
4826
4800
  console.log(formatEvaluationSummary(summary, thresholdOpts));
4801
+ const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
4827
4802
  const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
4828
4803
  if (isMatrixMode && allResults.length > 0) {
4829
4804
  console.log(formatMatrixSummary(allResults));
@@ -4907,7 +4882,8 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
4907
4882
  outputPath,
4908
4883
  testFiles: activeTestFiles,
4909
4884
  target: options.target,
4910
- thresholdFailed
4885
+ thresholdFailed,
4886
+ allExecutionErrors
4911
4887
  };
4912
4888
  } finally {
4913
4889
  unsubscribeCodexLogs();
@@ -4940,6 +4916,43 @@ async function resolveEvaluationRunner() {
4940
4916
  return candidate;
4941
4917
  }
4942
4918
 
4919
+ // src/commands/eval/discover.ts
4920
+ import path16 from "node:path";
4921
+ import fg2 from "fast-glob";
4922
+ async function discoverEvalFiles(cwd) {
4923
+ const repoRoot = await findRepoRoot(cwd);
4924
+ const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
4925
+ const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
4926
+ const ignore = ["**/node_modules/**", "**/dist/**"];
4927
+ const matches = await fg2(patterns, {
4928
+ cwd,
4929
+ absolute: true,
4930
+ onlyFiles: true,
4931
+ ignore,
4932
+ followSymbolicLinks: true,
4933
+ caseSensitiveMatch: false
4934
+ });
4935
+ const evalFiles = matches.map((absPath) => {
4936
+ const relativePath = path16.relative(cwd, absPath);
4937
+ const category = deriveCategory(relativePath);
4938
+ return { path: absPath, relativePath, category };
4939
+ });
4940
+ evalFiles.sort((a, b) => a.relativePath.localeCompare(b.relativePath));
4941
+ return evalFiles;
4942
+ }
4943
+ function getCategories(files) {
4944
+ const categories = /* @__PURE__ */ new Set();
4945
+ for (const file of files) {
4946
+ categories.add(file.category);
4947
+ }
4948
+ const sorted = Array.from(categories);
4949
+ sorted.sort();
4950
+ return sorted;
4951
+ }
4952
+ function filterByCategory(files, category) {
4953
+ return files.filter((f) => f.category === category);
4954
+ }
4955
+
4943
4956
  export {
4944
4957
  package_default,
4945
4958
  toSnakeCaseDeep,
@@ -4948,12 +4961,13 @@ export {
4948
4961
  buildDefaultRunDir,
4949
4962
  resolveExistingRunPrimaryPath,
4950
4963
  resolveWorkspaceOrFilePath,
4951
- writeArtifactsFromResults,
4964
+ resolveRunManifestPath,
4952
4965
  parseResultManifest,
4953
4966
  resolveResultSourcePath,
4954
4967
  loadManifestResults,
4955
4968
  loadLightweightResults,
4956
4969
  HtmlWriter,
4970
+ writeArtifactsFromResults,
4957
4971
  resolveRunCacheFile,
4958
4972
  loadRunCache,
4959
4973
  resolveEvalPaths,
@@ -4966,6 +4980,9 @@ export {
4966
4980
  TARGET_FILE_CANDIDATES,
4967
4981
  fileExists,
4968
4982
  selectTarget,
4969
- runEvalCommand
4983
+ runEvalCommand,
4984
+ discoverEvalFiles,
4985
+ getCategories,
4986
+ filterByCategory
4970
4987
  };
4971
- //# sourceMappingURL=chunk-MHWYA4CS.js.map
4988
+ //# sourceMappingURL=chunk-AX4CQS45.js.map