agentv 4.6.1 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{chunk-MHWYA4CS.js → chunk-AX4CQS45.js} +300 -283
- package/dist/chunk-AX4CQS45.js.map +1 -0
- package/dist/{chunk-YXXD27OK.js → chunk-I6UE4LHZ.js} +1232 -439
- package/dist/chunk-I6UE4LHZ.js.map +1 -0
- package/dist/{chunk-NSVFUL27.js → chunk-VEAOMKNS.js} +4420 -3603
- package/dist/chunk-VEAOMKNS.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-BN5NUVAB.js → dist-XRVHRBJF.js} +16 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-DMSVE6CS.js → interactive-UBEMNJZG.js} +10 -47
- package/dist/interactive-UBEMNJZG.js.map +1 -0
- package/dist/studio/assets/index-DHxVz6M9.css +1 -0
- package/dist/studio/assets/{index-C7TnyYee.js → index-DcwjOyrk.js} +1 -1
- package/dist/studio/assets/index-Y5InSvcS.js +65 -0
- package/dist/studio/index.html +2 -2
- package/package.json +1 -1
- package/dist/chunk-MHWYA4CS.js.map +0 -1
- package/dist/chunk-NSVFUL27.js.map +0 -1
- package/dist/chunk-YXXD27OK.js.map +0 -1
- package/dist/interactive-DMSVE6CS.js.map +0 -1
- package/dist/studio/assets/index-jJVIJh8b.css +0 -1
- package/dist/studio/assets/index-vn54AYtS.js +0 -65
- /package/dist/{dist-BN5NUVAB.js.map → dist-XRVHRBJF.js.map} +0 -0
|
@@ -2,6 +2,8 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
|
|
|
2
2
|
import {
|
|
3
3
|
CLI_PLACEHOLDERS,
|
|
4
4
|
COMMON_TARGET_SETTINGS,
|
|
5
|
+
DEFAULT_EVAL_PATTERNS,
|
|
6
|
+
DEFAULT_THRESHOLD,
|
|
5
7
|
KNOWN_PROVIDERS,
|
|
6
8
|
PROVIDER_ALIASES,
|
|
7
9
|
ResponseCache,
|
|
@@ -9,6 +11,7 @@ import {
|
|
|
9
11
|
buildSearchRoots,
|
|
10
12
|
deriveCategory,
|
|
11
13
|
ensureVSCodeSubagents,
|
|
14
|
+
findDeprecatedCamelCaseTargetWarnings,
|
|
12
15
|
findGitRoot,
|
|
13
16
|
interpolateEnv,
|
|
14
17
|
isEvaluatorKind,
|
|
@@ -29,12 +32,12 @@ import {
|
|
|
29
32
|
subscribeToCopilotCliLogEntries,
|
|
30
33
|
subscribeToCopilotSdkLogEntries,
|
|
31
34
|
subscribeToPiLogEntries
|
|
32
|
-
} from "./chunk-YXXD27OK.js";
|
|
35
|
+
} from "./chunk-I6UE4LHZ.js";
|
|
33
36
|
|
|
34
37
|
// package.json
|
|
35
38
|
var package_default = {
|
|
36
39
|
name: "agentv",
|
|
37
|
-
version: "4.6.1",
|
|
40
|
+
version: "4.7.0",
|
|
38
41
|
description: "CLI entry point for AgentV",
|
|
39
42
|
type: "module",
|
|
40
43
|
repository: {
|
|
@@ -346,6 +349,9 @@ function buildDefaultRunDir(cwd) {
|
|
|
346
349
|
function resolveRunIndexPath(runDir) {
|
|
347
350
|
return path3.join(runDir, RESULT_INDEX_FILENAME);
|
|
348
351
|
}
|
|
352
|
+
function isRunManifestPath(filePath) {
|
|
353
|
+
return path3.basename(filePath) === RESULT_INDEX_FILENAME;
|
|
354
|
+
}
|
|
349
355
|
function resolveExistingRunPrimaryPath(runDir) {
|
|
350
356
|
const indexPath = resolveRunIndexPath(runDir);
|
|
351
357
|
if (existsSync(indexPath)) {
|
|
@@ -370,9 +376,19 @@ function resolveWorkspaceOrFilePath(filePath) {
|
|
|
370
376
|
}
|
|
371
377
|
return existing;
|
|
372
378
|
}
|
|
379
|
+
function resolveRunManifestPath(filePath) {
|
|
380
|
+
if (isDirectoryPath(filePath)) {
|
|
381
|
+
return resolveWorkspaceOrFilePath(filePath);
|
|
382
|
+
}
|
|
383
|
+
if (!isRunManifestPath(filePath)) {
|
|
384
|
+
throw new Error(
|
|
385
|
+
`Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
|
|
386
|
+
);
|
|
387
|
+
}
|
|
388
|
+
return filePath;
|
|
389
|
+
}
|
|
373
390
|
|
|
374
391
|
// src/commands/eval/artifact-writer.ts
|
|
375
|
-
var PASS_THRESHOLD = 0.8;
|
|
376
392
|
function computeStats(values) {
|
|
377
393
|
if (values.length === 0) {
|
|
378
394
|
return { mean: 0, stddev: 0 };
|
|
@@ -387,10 +403,10 @@ function computeStats(values) {
|
|
|
387
403
|
function computePassRate(result) {
|
|
388
404
|
const scores = result.scores;
|
|
389
405
|
if (scores && scores.length > 0) {
|
|
390
|
-
const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
|
|
406
|
+
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
|
|
391
407
|
return passed / scores.length;
|
|
392
408
|
}
|
|
393
|
-
return (result.score ?? 0) >= PASS_THRESHOLD ? 1 : 0;
|
|
409
|
+
return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
|
|
394
410
|
}
|
|
395
411
|
function countToolCalls(result) {
|
|
396
412
|
const toolCalls = {};
|
|
@@ -596,12 +612,12 @@ function safeArtifactPathSegment(value, fallback) {
|
|
|
596
612
|
function safeTestId(testId) {
|
|
597
613
|
return safeArtifactPathSegment(testId, "unknown");
|
|
598
614
|
}
|
|
599
|
-
function
|
|
600
|
-
return result.
|
|
615
|
+
function getSuite(result) {
|
|
616
|
+
return result.suite;
|
|
601
617
|
}
|
|
602
618
|
function buildArtifactSubdir(result) {
|
|
603
619
|
const segments = [];
|
|
604
|
-
const evalSet =
|
|
620
|
+
const evalSet = getSuite(result);
|
|
605
621
|
if (evalSet) {
|
|
606
622
|
segments.push(safeArtifactPathSegment(evalSet, "default"));
|
|
607
623
|
}
|
|
@@ -628,7 +644,7 @@ function buildResultIndexArtifact(result) {
|
|
|
628
644
|
return {
|
|
629
645
|
timestamp: result.timestamp,
|
|
630
646
|
test_id: result.testId ?? "unknown",
|
|
631
|
-
|
|
647
|
+
suite: getSuite(result),
|
|
632
648
|
category: result.category,
|
|
633
649
|
conversation_id: result.conversationId,
|
|
634
650
|
score: result.score,
|
|
@@ -651,42 +667,6 @@ async function writeJsonlFile(filePath, records) {
|
|
|
651
667
|
`;
|
|
652
668
|
await writeFile(filePath, content, "utf8");
|
|
653
669
|
}
|
|
654
|
-
function toCamelCase(str) {
|
|
655
|
-
return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
|
|
656
|
-
}
|
|
657
|
-
function toCamelCaseDeep(obj) {
|
|
658
|
-
if (obj === null || obj === void 0) {
|
|
659
|
-
return obj;
|
|
660
|
-
}
|
|
661
|
-
if (Array.isArray(obj)) {
|
|
662
|
-
return obj.map((item) => toCamelCaseDeep(item));
|
|
663
|
-
}
|
|
664
|
-
if (typeof obj === "object") {
|
|
665
|
-
const result = {};
|
|
666
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
667
|
-
result[toCamelCase(key)] = toCamelCaseDeep(value);
|
|
668
|
-
}
|
|
669
|
-
return result;
|
|
670
|
-
}
|
|
671
|
-
return obj;
|
|
672
|
-
}
|
|
673
|
-
function parseJsonlResults(content) {
|
|
674
|
-
const results = [];
|
|
675
|
-
const lines = content.split("\n");
|
|
676
|
-
for (const line of lines) {
|
|
677
|
-
const trimmed = line.trim();
|
|
678
|
-
if (trimmed.length === 0) {
|
|
679
|
-
continue;
|
|
680
|
-
}
|
|
681
|
-
try {
|
|
682
|
-
const parsed = JSON.parse(trimmed);
|
|
683
|
-
const camelCased = toCamelCaseDeep(parsed);
|
|
684
|
-
results.push(camelCased);
|
|
685
|
-
} catch {
|
|
686
|
-
}
|
|
687
|
-
}
|
|
688
|
-
return results;
|
|
689
|
-
}
|
|
690
670
|
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
691
671
|
const testArtifactDir = outputDir;
|
|
692
672
|
const timingPath = path4.join(outputDir, "timing.json");
|
|
@@ -733,7 +713,6 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
733
713
|
|
|
734
714
|
// src/commands/eval/benchmark-writer.ts
|
|
735
715
|
import { writeFile as writeFile2 } from "node:fs/promises";
|
|
736
|
-
var PASS_THRESHOLD2 = 0.8;
|
|
737
716
|
function computeStats2(values) {
|
|
738
717
|
if (values.length === 0) {
|
|
739
718
|
return { mean: 0, stddev: 0 };
|
|
@@ -748,10 +727,10 @@ function computeStats2(values) {
|
|
|
748
727
|
function computePassRate2(result) {
|
|
749
728
|
const scores = result.scores;
|
|
750
729
|
if (scores && scores.length > 0) {
|
|
751
|
-
const passed = scores.filter((s) => s.score >= PASS_THRESHOLD2).length;
|
|
730
|
+
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
|
|
752
731
|
return passed / scores.length;
|
|
753
732
|
}
|
|
754
|
-
return result.score >= PASS_THRESHOLD2 ? 1 : 0;
|
|
733
|
+
return result.score >= DEFAULT_THRESHOLD ? 1 : 0;
|
|
755
734
|
}
|
|
756
735
|
function buildBenchmarkJson(results) {
|
|
757
736
|
const passRates = results.map(computePassRate2);
|
|
@@ -1698,7 +1677,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
1698
1677
|
this.closed = true;
|
|
1699
1678
|
const grouped = /* @__PURE__ */ new Map();
|
|
1700
1679
|
for (const result of this.results) {
|
|
1701
|
-
const suite = result.
|
|
1680
|
+
const suite = result.suite ?? "default";
|
|
1702
1681
|
const existing = grouped.get(suite);
|
|
1703
1682
|
if (existing) {
|
|
1704
1683
|
existing.push(result);
|
|
@@ -1708,14 +1687,17 @@ var JunitWriter = class _JunitWriter {
|
|
|
1708
1687
|
}
|
|
1709
1688
|
const suiteXmls = [];
|
|
1710
1689
|
for (const [suiteName, results] of grouped) {
|
|
1711
|
-
const
|
|
1712
|
-
const
|
|
1690
|
+
const errors = results.filter((r) => r.executionStatus === "execution_error").length;
|
|
1691
|
+
const failures = results.filter(
|
|
1692
|
+
(r) => r.executionStatus !== "execution_error" && r.score < this.threshold
|
|
1693
|
+
).length;
|
|
1713
1694
|
const testCases = results.map((r) => {
|
|
1714
1695
|
const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
|
|
1715
1696
|
let inner = "";
|
|
1716
|
-
if (r.
|
|
1697
|
+
if (r.executionStatus === "execution_error") {
|
|
1698
|
+
const errorMsg = r.error ?? "Execution error";
|
|
1717
1699
|
inner = `
|
|
1718
|
-
<error message="${escapeXml(
|
|
1700
|
+
<error message="${escapeXml(errorMsg)}">${escapeXml(errorMsg)}</error>
|
|
1719
1701
|
`;
|
|
1720
1702
|
} else if (r.score < this.threshold) {
|
|
1721
1703
|
const message = `score=${r.score.toFixed(3)}`;
|
|
@@ -1737,8 +1719,10 @@ ${testCases.join("\n")}
|
|
|
1737
1719
|
);
|
|
1738
1720
|
}
|
|
1739
1721
|
const totalTests = this.results.length;
|
|
1740
|
-
const
|
|
1741
|
-
const
|
|
1722
|
+
const totalErrors = this.results.filter((r) => r.executionStatus === "execution_error").length;
|
|
1723
|
+
const totalFailures = this.results.filter(
|
|
1724
|
+
(r) => r.executionStatus !== "execution_error" && r.score < this.threshold
|
|
1725
|
+
).length;
|
|
1742
1726
|
const xml = `<?xml version="1.0" encoding="UTF-8"?>
|
|
1743
1727
|
<testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">
|
|
1744
1728
|
${suiteXmls.join("\n")}
|
|
@@ -1926,12 +1910,12 @@ var ProgressDisplay = class {
|
|
|
1926
1910
|
}
|
|
1927
1911
|
addLogPaths(paths, provider) {
|
|
1928
1912
|
const newPaths = [];
|
|
1929
|
-
for (const
|
|
1930
|
-
if (this.logPathSet.has(
|
|
1913
|
+
for (const path17 of paths) {
|
|
1914
|
+
if (this.logPathSet.has(path17)) {
|
|
1931
1915
|
continue;
|
|
1932
1916
|
}
|
|
1933
|
-
this.logPathSet.add(
|
|
1934
|
-
newPaths.push(
|
|
1917
|
+
this.logPathSet.add(path17);
|
|
1918
|
+
newPaths.push(path17);
|
|
1935
1919
|
}
|
|
1936
1920
|
if (newPaths.length === 0) {
|
|
1937
1921
|
return;
|
|
@@ -1944,8 +1928,8 @@ var ProgressDisplay = class {
|
|
|
1944
1928
|
this.hasPrintedLogHeader = true;
|
|
1945
1929
|
}
|
|
1946
1930
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
1947
|
-
newPaths.forEach((
|
|
1948
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
1931
|
+
newPaths.forEach((path17, offset) => {
|
|
1932
|
+
console.log(`${startIndex + offset + 1}. ${path17}`);
|
|
1949
1933
|
});
|
|
1950
1934
|
}
|
|
1951
1935
|
finish() {
|
|
@@ -1962,9 +1946,6 @@ import path12 from "node:path";
|
|
|
1962
1946
|
function parseJsonlLines(content) {
|
|
1963
1947
|
return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
|
|
1964
1948
|
}
|
|
1965
|
-
function isIndexManifestPath(sourceFile) {
|
|
1966
|
-
return path12.basename(sourceFile) === RESULT_INDEX_FILENAME;
|
|
1967
|
-
}
|
|
1968
1949
|
function parseMarkdownMessages(content) {
|
|
1969
1950
|
const trimmed = content.trim();
|
|
1970
1951
|
if (!trimmed.startsWith("@[")) {
|
|
@@ -2022,11 +2003,11 @@ function hydrateOutput(baseDir, record) {
|
|
|
2022
2003
|
function hydrateManifestRecord(baseDir, record) {
|
|
2023
2004
|
const grading = readOptionalJson(baseDir, record.grading_path);
|
|
2024
2005
|
const timing = readOptionalJson(baseDir, record.timing_path);
|
|
2025
|
-
const testId = record.test_id ??
|
|
2006
|
+
const testId = record.test_id ?? "unknown";
|
|
2026
2007
|
return {
|
|
2027
2008
|
timestamp: record.timestamp,
|
|
2028
2009
|
testId,
|
|
2029
|
-
|
|
2010
|
+
suite: record.suite,
|
|
2030
2011
|
category: record.category,
|
|
2031
2012
|
target: record.target,
|
|
2032
2013
|
score: record.score,
|
|
@@ -2066,74 +2047,44 @@ function parseResultManifest(content) {
|
|
|
2066
2047
|
}
|
|
2067
2048
|
function resolveResultSourcePath(source, cwd) {
|
|
2068
2049
|
const resolved = path12.isAbsolute(source) ? source : path12.resolve(cwd ?? process.cwd(), source);
|
|
2069
|
-
|
|
2050
|
+
if (isDirectoryPath(resolved) || path12.basename(resolved) === RESULT_INDEX_FILENAME) {
|
|
2051
|
+
return resolveRunManifestPath(resolved);
|
|
2052
|
+
}
|
|
2053
|
+
return resolved;
|
|
2070
2054
|
}
|
|
2071
2055
|
function loadManifestResults(sourceFile) {
|
|
2072
|
-
const resolvedSourceFile =
|
|
2073
|
-
if (!isIndexManifestPath(resolvedSourceFile)) {
|
|
2074
|
-
return parseJsonlResults(readFileSync(resolvedSourceFile, "utf8"));
|
|
2075
|
-
}
|
|
2056
|
+
const resolvedSourceFile = resolveRunManifestPath(sourceFile);
|
|
2076
2057
|
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
2077
2058
|
const records = parseResultManifest(content);
|
|
2078
2059
|
const baseDir = path12.dirname(resolvedSourceFile);
|
|
2079
2060
|
return records.map((record) => hydrateManifestRecord(baseDir, record));
|
|
2080
2061
|
}
|
|
2081
2062
|
function loadLightweightResults(sourceFile) {
|
|
2082
|
-
const resolvedSourceFile =
|
|
2063
|
+
const resolvedSourceFile = resolveRunManifestPath(sourceFile);
|
|
2083
2064
|
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
}
|
|
2096
|
-
const records = [];
|
|
2097
|
-
for (const line of content.split(/\r?\n/)) {
|
|
2098
|
-
const trimmed = line.trim();
|
|
2099
|
-
if (!trimmed) {
|
|
2100
|
-
continue;
|
|
2101
|
-
}
|
|
2102
|
-
let record;
|
|
2103
|
-
try {
|
|
2104
|
-
record = JSON.parse(trimmed);
|
|
2105
|
-
} catch {
|
|
2106
|
-
continue;
|
|
2107
|
-
}
|
|
2108
|
-
const rawTestId = record.test_id ?? record.eval_id ?? record.testId ?? record.evalId;
|
|
2109
|
-
if (typeof rawTestId !== "string") {
|
|
2110
|
-
throw new Error(`Missing test_id in result: ${trimmed}`);
|
|
2111
|
-
}
|
|
2112
|
-
if (typeof record.score !== "number") {
|
|
2113
|
-
throw new Error(`Missing or invalid score in result: ${trimmed}`);
|
|
2114
|
-
}
|
|
2115
|
-
records.push({
|
|
2116
|
-
testId: rawTestId,
|
|
2117
|
-
target: typeof record.target === "string" ? record.target : void 0,
|
|
2118
|
-
score: record.score,
|
|
2119
|
-
scores: Array.isArray(record.scores) ? record.scores : void 0,
|
|
2120
|
-
executionStatus: typeof record.execution_status === "string" ? record.execution_status : typeof record.executionStatus === "string" ? record.executionStatus : void 0,
|
|
2121
|
-
error: typeof record.error === "string" ? record.error : void 0,
|
|
2122
|
-
timestamp: typeof record.timestamp === "string" ? record.timestamp : void 0
|
|
2123
|
-
});
|
|
2124
|
-
}
|
|
2125
|
-
return records;
|
|
2065
|
+
return parseResultManifest(content).map((record) => ({
|
|
2066
|
+
testId: record.test_id ?? "unknown",
|
|
2067
|
+
suite: record.suite,
|
|
2068
|
+
target: record.target,
|
|
2069
|
+
experiment: record.experiment,
|
|
2070
|
+
score: record.score,
|
|
2071
|
+
scores: record.scores,
|
|
2072
|
+
executionStatus: record.execution_status,
|
|
2073
|
+
error: record.error,
|
|
2074
|
+
timestamp: record.timestamp
|
|
2075
|
+
}));
|
|
2126
2076
|
}
|
|
2127
2077
|
|
|
2128
2078
|
// src/commands/eval/retry-errors.ts
|
|
2079
|
+
async function loadRetrySourceResults(jsonlPath) {
|
|
2080
|
+
return loadManifestResults(resolveResultSourcePath(jsonlPath));
|
|
2081
|
+
}
|
|
2129
2082
|
async function loadErrorTestIds(jsonlPath) {
|
|
2130
|
-
const
|
|
2131
|
-
const ids = loadLightweightResults(resolvedPath).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
|
|
2083
|
+
const ids = (await loadRetrySourceResults(jsonlPath)).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
|
|
2132
2084
|
return [...new Set(ids)];
|
|
2133
2085
|
}
|
|
2134
2086
|
async function loadNonErrorResults(jsonlPath) {
|
|
2135
|
-
|
|
2136
|
-
return loadManifestResults(resolvedPath).filter(
|
|
2087
|
+
return (await loadRetrySourceResults(jsonlPath)).filter(
|
|
2137
2088
|
(result) => result.testId && result.executionStatus !== "execution_error"
|
|
2138
2089
|
);
|
|
2139
2090
|
}
|
|
@@ -2146,7 +2097,7 @@ function resolveRunCacheFile(cache) {
|
|
|
2146
2097
|
if (cache.lastRunDir) {
|
|
2147
2098
|
return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
|
|
2148
2099
|
}
|
|
2149
|
-
return
|
|
2100
|
+
return "";
|
|
2150
2101
|
}
|
|
2151
2102
|
function cachePath(cwd) {
|
|
2152
2103
|
return path13.join(cwd, ".agentv", CACHE_FILENAME);
|
|
@@ -2160,15 +2111,14 @@ async function loadRunCache(cwd) {
|
|
|
2160
2111
|
}
|
|
2161
2112
|
}
|
|
2162
2113
|
async function saveRunCache(cwd, resultPath) {
|
|
2114
|
+
if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
|
|
2115
|
+
return;
|
|
2116
|
+
}
|
|
2163
2117
|
const dir = path13.join(cwd, ".agentv");
|
|
2164
2118
|
await mkdir7(dir, { recursive: true });
|
|
2165
|
-
const
|
|
2166
|
-
const cache = basename === RESULT_INDEX_FILENAME ? {
|
|
2119
|
+
const cache = {
|
|
2167
2120
|
lastRunDir: path13.dirname(resultPath),
|
|
2168
2121
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2169
|
-
} : {
|
|
2170
|
-
lastResultFile: resultPath,
|
|
2171
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2172
2122
|
};
|
|
2173
2123
|
await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
|
|
2174
2124
|
`, "utf-8");
|
|
@@ -2313,11 +2263,21 @@ function formatEvaluationSummary(summary, options) {
|
|
|
2313
2263
|
}
|
|
2314
2264
|
const gradedCount = summary.total - summary.executionErrorCount;
|
|
2315
2265
|
const threshold = options?.threshold ?? 0.8;
|
|
2316
|
-
const
|
|
2317
|
-
const
|
|
2266
|
+
const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
|
|
2267
|
+
const overallPassed = !allExecutionErrors && (summary.passedCount === gradedCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0);
|
|
2318
2268
|
const useColor = !(process.env.NO_COLOR !== void 0) && (process.stdout.isTTY ?? false);
|
|
2319
|
-
|
|
2320
|
-
|
|
2269
|
+
let overallVerdict;
|
|
2270
|
+
let verdictColor;
|
|
2271
|
+
let verdictText;
|
|
2272
|
+
if (allExecutionErrors) {
|
|
2273
|
+
overallVerdict = "INCONCLUSIVE";
|
|
2274
|
+
verdictColor = "\x1B[33m";
|
|
2275
|
+
verdictText = `RESULT: INCONCLUSIVE (all ${summary.total} test(s) had execution errors \u2014 no evaluation was performed)`;
|
|
2276
|
+
} else {
|
|
2277
|
+
overallVerdict = overallPassed ? "PASS" : "FAIL";
|
|
2278
|
+
verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
|
|
2279
|
+
verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
|
|
2280
|
+
}
|
|
2321
2281
|
lines.push("\n==================================================");
|
|
2322
2282
|
if (useColor) {
|
|
2323
2283
|
lines.push(`\x1B[1m${verdictColor}${verdictText}\x1B[0m`);
|
|
@@ -2527,7 +2487,7 @@ var KNOWN_TEST_FIELDS = /* @__PURE__ */ new Set([
|
|
|
2527
2487
|
"workspace",
|
|
2528
2488
|
"metadata",
|
|
2529
2489
|
"conversation_id",
|
|
2530
|
-
"
|
|
2490
|
+
"suite",
|
|
2531
2491
|
"note"
|
|
2532
2492
|
]);
|
|
2533
2493
|
var NAME_PATTERN = /^[a-z0-9-]+$/;
|
|
@@ -3090,87 +3050,68 @@ function isObject2(value) {
|
|
|
3090
3050
|
var COMMON_SETTINGS = new Set(COMMON_TARGET_SETTINGS);
|
|
3091
3051
|
var RETRY_SETTINGS = /* @__PURE__ */ new Set([
|
|
3092
3052
|
"max_retries",
|
|
3093
|
-
"maxRetries",
|
|
3094
3053
|
"retry_initial_delay_ms",
|
|
3095
|
-
"retryInitialDelayMs",
|
|
3096
3054
|
"retry_max_delay_ms",
|
|
3097
|
-
"retryMaxDelayMs",
|
|
3098
3055
|
"retry_backoff_factor",
|
|
3099
|
-
"
|
|
3100
|
-
"retry_status_codes",
|
|
3101
|
-
"retryStatusCodes"
|
|
3056
|
+
"retry_status_codes"
|
|
3102
3057
|
]);
|
|
3103
3058
|
var AZURE_SETTINGS = /* @__PURE__ */ new Set([
|
|
3104
3059
|
...COMMON_SETTINGS,
|
|
3105
3060
|
...RETRY_SETTINGS,
|
|
3106
3061
|
"endpoint",
|
|
3107
3062
|
"resource",
|
|
3108
|
-
"resourceName",
|
|
3109
3063
|
"api_key",
|
|
3110
|
-
"apiKey",
|
|
3111
3064
|
"deployment",
|
|
3112
|
-
"deploymentName",
|
|
3113
3065
|
"model",
|
|
3114
3066
|
"version",
|
|
3115
3067
|
"api_version",
|
|
3068
|
+
"api_format",
|
|
3116
3069
|
"temperature",
|
|
3117
|
-
"max_output_tokens"
|
|
3118
|
-
"maxTokens"
|
|
3070
|
+
"max_output_tokens"
|
|
3119
3071
|
]);
|
|
3120
3072
|
var OPENAI_SETTINGS = /* @__PURE__ */ new Set([
|
|
3121
3073
|
...COMMON_SETTINGS,
|
|
3122
3074
|
...RETRY_SETTINGS,
|
|
3123
3075
|
"endpoint",
|
|
3124
3076
|
"base_url",
|
|
3125
|
-
"baseUrl",
|
|
3126
3077
|
"api_key",
|
|
3127
|
-
"apiKey",
|
|
3128
3078
|
"model",
|
|
3129
3079
|
"deployment",
|
|
3130
3080
|
"variant",
|
|
3131
3081
|
"api_format",
|
|
3132
|
-
"apiFormat",
|
|
3133
3082
|
"temperature",
|
|
3134
|
-
"max_output_tokens"
|
|
3135
|
-
"maxTokens"
|
|
3083
|
+
"max_output_tokens"
|
|
3136
3084
|
]);
|
|
3137
3085
|
var OPENROUTER_SETTINGS = /* @__PURE__ */ new Set([
|
|
3138
3086
|
...COMMON_SETTINGS,
|
|
3139
3087
|
...RETRY_SETTINGS,
|
|
3140
3088
|
"api_key",
|
|
3141
|
-
"apiKey",
|
|
3142
3089
|
"model",
|
|
3143
3090
|
"deployment",
|
|
3144
3091
|
"variant",
|
|
3145
3092
|
"temperature",
|
|
3146
|
-
"max_output_tokens"
|
|
3147
|
-
"maxTokens"
|
|
3093
|
+
"max_output_tokens"
|
|
3148
3094
|
]);
|
|
3149
3095
|
var ANTHROPIC_SETTINGS = /* @__PURE__ */ new Set([
|
|
3150
3096
|
...COMMON_SETTINGS,
|
|
3151
3097
|
...RETRY_SETTINGS,
|
|
3152
3098
|
"api_key",
|
|
3153
|
-
"apiKey",
|
|
3154
3099
|
"model",
|
|
3155
3100
|
"deployment",
|
|
3156
3101
|
"variant",
|
|
3157
3102
|
"temperature",
|
|
3158
3103
|
"max_output_tokens",
|
|
3159
|
-
"
|
|
3160
|
-
"thinking_budget",
|
|
3161
|
-
"thinkingBudget"
|
|
3104
|
+
"thinking_budget"
|
|
3162
3105
|
]);
|
|
3163
3106
|
var GEMINI_SETTINGS = /* @__PURE__ */ new Set([
|
|
3164
3107
|
...COMMON_SETTINGS,
|
|
3165
3108
|
...RETRY_SETTINGS,
|
|
3166
3109
|
"api_key",
|
|
3167
|
-
"apiKey",
|
|
3168
3110
|
"model",
|
|
3169
3111
|
"deployment",
|
|
3170
3112
|
"variant",
|
|
3171
3113
|
"temperature",
|
|
3172
|
-
"max_output_tokens"
|
|
3173
|
-
"maxTokens"
|
|
3114
|
+
"max_output_tokens"
|
|
3174
3115
|
]);
|
|
3175
3116
|
var CODEX_SETTINGS = /* @__PURE__ */ new Set([
|
|
3176
3117
|
...COMMON_SETTINGS,
|
|
@@ -3182,40 +3123,25 @@ var CODEX_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3182
3123
|
"arguments",
|
|
3183
3124
|
"cwd",
|
|
3184
3125
|
"timeout_seconds",
|
|
3185
|
-
"timeoutSeconds",
|
|
3186
3126
|
"log_dir",
|
|
3187
|
-
"logDir",
|
|
3188
3127
|
"log_directory",
|
|
3189
|
-
"logDirectory",
|
|
3190
3128
|
"log_format",
|
|
3191
|
-
"logFormat",
|
|
3192
3129
|
"log_output_format",
|
|
3193
|
-
"logOutputFormat",
|
|
3194
3130
|
"system_prompt",
|
|
3195
|
-
"
|
|
3196
|
-
"workspace_template",
|
|
3197
|
-
"workspaceTemplate"
|
|
3131
|
+
"workspace_template"
|
|
3198
3132
|
]);
|
|
3199
3133
|
var COPILOT_SDK_SETTINGS = /* @__PURE__ */ new Set([
|
|
3200
3134
|
...COMMON_SETTINGS,
|
|
3201
3135
|
"cli_url",
|
|
3202
|
-
"cliUrl",
|
|
3203
3136
|
"cli_path",
|
|
3204
|
-
"cliPath",
|
|
3205
3137
|
"github_token",
|
|
3206
|
-
"githubToken",
|
|
3207
3138
|
"model",
|
|
3208
3139
|
"cwd",
|
|
3209
3140
|
"timeout_seconds",
|
|
3210
|
-
"timeoutSeconds",
|
|
3211
3141
|
"log_dir",
|
|
3212
|
-
"logDir",
|
|
3213
3142
|
"log_format",
|
|
3214
|
-
"logFormat",
|
|
3215
3143
|
"system_prompt",
|
|
3216
|
-
"
|
|
3217
|
-
"workspace_template",
|
|
3218
|
-
"workspaceTemplate"
|
|
3144
|
+
"workspace_template"
|
|
3219
3145
|
]);
|
|
3220
3146
|
var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
3221
3147
|
...COMMON_SETTINGS,
|
|
@@ -3227,35 +3153,23 @@ var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3227
3153
|
"model",
|
|
3228
3154
|
"cwd",
|
|
3229
3155
|
"timeout_seconds",
|
|
3230
|
-
"timeoutSeconds",
|
|
3231
3156
|
"log_dir",
|
|
3232
|
-
"logDir",
|
|
3233
3157
|
"log_format",
|
|
3234
|
-
"logFormat",
|
|
3235
3158
|
"system_prompt",
|
|
3236
|
-
"
|
|
3237
|
-
"workspace_template",
|
|
3238
|
-
"workspaceTemplate"
|
|
3159
|
+
"workspace_template"
|
|
3239
3160
|
]);
|
|
3240
3161
|
var VSCODE_SETTINGS = /* @__PURE__ */ new Set([
|
|
3241
3162
|
...COMMON_SETTINGS,
|
|
3242
3163
|
"executable",
|
|
3243
3164
|
"workspace_template",
|
|
3244
|
-
"workspaceTemplate",
|
|
3245
3165
|
"wait",
|
|
3246
3166
|
"dry_run",
|
|
3247
|
-
"dryRun",
|
|
3248
3167
|
"subagent_root",
|
|
3249
|
-
"
|
|
3250
|
-
"timeout_seconds",
|
|
3251
|
-
"timeoutSeconds"
|
|
3168
|
+
"timeout_seconds"
|
|
3252
3169
|
]);
|
|
3253
3170
|
var MOCK_SETTINGS = /* @__PURE__ */ new Set([
|
|
3254
3171
|
...COMMON_SETTINGS,
|
|
3255
3172
|
"response",
|
|
3256
|
-
"delayMs",
|
|
3257
|
-
"delayMinMs",
|
|
3258
|
-
"delayMaxMs",
|
|
3259
3173
|
"trace"
|
|
3260
3174
|
// For testing tool-trajectory evaluator
|
|
3261
3175
|
]);
|
|
@@ -3264,23 +3178,14 @@ var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3264
3178
|
"model",
|
|
3265
3179
|
"cwd",
|
|
3266
3180
|
"timeout_seconds",
|
|
3267
|
-
"timeoutSeconds",
|
|
3268
3181
|
"log_dir",
|
|
3269
|
-
"logDir",
|
|
3270
3182
|
"log_directory",
|
|
3271
|
-
"logDirectory",
|
|
3272
3183
|
"log_format",
|
|
3273
|
-
"logFormat",
|
|
3274
3184
|
"log_output_format",
|
|
3275
|
-
"logOutputFormat",
|
|
3276
3185
|
"system_prompt",
|
|
3277
|
-
"systemPrompt",
|
|
3278
3186
|
"workspace_template",
|
|
3279
|
-
"workspaceTemplate",
|
|
3280
3187
|
"max_turns",
|
|
3281
|
-
"
|
|
3282
|
-
"max_budget_usd",
|
|
3283
|
-
"maxBudgetUsd"
|
|
3188
|
+
"max_budget_usd"
|
|
3284
3189
|
]);
|
|
3285
3190
|
function getKnownSettings(provider) {
|
|
3286
3191
|
const normalizedProvider = provider.toLowerCase();
|
|
@@ -3405,15 +3310,15 @@ async function validateTargetsFile(filePath) {
|
|
|
3405
3310
|
});
|
|
3406
3311
|
return;
|
|
3407
3312
|
}
|
|
3408
|
-
const timeoutSeconds = healthcheck.timeout_seconds
|
|
3313
|
+
const timeoutSeconds = healthcheck.timeout_seconds;
|
|
3409
3314
|
if (timeoutSeconds !== void 0) {
|
|
3410
3315
|
const numericTimeout = Number(timeoutSeconds);
|
|
3411
3316
|
if (!Number.isFinite(numericTimeout) || numericTimeout <= 0) {
|
|
3412
3317
|
errors2.push({
|
|
3413
3318
|
severity: "error",
|
|
3414
3319
|
filePath: absolutePath2,
|
|
3415
|
-
location: `${location}.
|
|
3416
|
-
message: "healthcheck.
|
|
3320
|
+
location: `${location}.timeout_seconds`,
|
|
3321
|
+
message: "healthcheck.timeout_seconds must be a positive number when provided"
|
|
3417
3322
|
});
|
|
3418
3323
|
}
|
|
3419
3324
|
}
|
|
@@ -3512,6 +3417,18 @@ async function validateTargetsFile(filePath) {
|
|
|
3512
3417
|
});
|
|
3513
3418
|
continue;
|
|
3514
3419
|
}
|
|
3420
|
+
for (const warning of findDeprecatedCamelCaseTargetWarnings(target, location)) {
|
|
3421
|
+
const fieldMatch = warning.message.match(/field '([^']+)'/);
|
|
3422
|
+
const replacementMatch = warning.message.match(/Use '([^']+)' instead/);
|
|
3423
|
+
const field = fieldMatch?.[1] ?? "unknown";
|
|
3424
|
+
const replacement = replacementMatch?.[1] ?? "snake_case";
|
|
3425
|
+
errors.push({
|
|
3426
|
+
severity: "error",
|
|
3427
|
+
filePath: absolutePath,
|
|
3428
|
+
location: warning.location,
|
|
3429
|
+
message: `camelCase field '${field}' is no longer supported in targets.yaml. Use '${replacement}' instead.`
|
|
3430
|
+
});
|
|
3431
|
+
}
|
|
3515
3432
|
const name = target.name;
|
|
3516
3433
|
if (typeof name !== "string" || name.trim().length === 0) {
|
|
3517
3434
|
errors.push({
|
|
@@ -3891,7 +3808,9 @@ Errors in ${targetsFilePath}:`);
|
|
|
3891
3808
|
};
|
|
3892
3809
|
}
|
|
3893
3810
|
try {
|
|
3894
|
-
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath
|
|
3811
|
+
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath, {
|
|
3812
|
+
emitDeprecationWarnings: false
|
|
3813
|
+
});
|
|
3895
3814
|
return {
|
|
3896
3815
|
definitions,
|
|
3897
3816
|
resolvedTarget,
|
|
@@ -3974,7 +3893,9 @@ Errors in ${targetsFilePath}:`);
|
|
|
3974
3893
|
});
|
|
3975
3894
|
} else {
|
|
3976
3895
|
try {
|
|
3977
|
-
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath
|
|
3896
|
+
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath, {
|
|
3897
|
+
emitDeprecationWarnings: false
|
|
3898
|
+
});
|
|
3978
3899
|
results.push({
|
|
3979
3900
|
definitions,
|
|
3980
3901
|
resolvedTarget,
|
|
@@ -4043,6 +3964,16 @@ function normalizeStringArray(value) {
|
|
|
4043
3964
|
}
|
|
4044
3965
|
return [];
|
|
4045
3966
|
}
|
|
3967
|
+
function normalizeFilter(value) {
|
|
3968
|
+
if (Array.isArray(value)) {
|
|
3969
|
+
const filters = normalizeStringArray(value);
|
|
3970
|
+
if (filters.length === 0) {
|
|
3971
|
+
return void 0;
|
|
3972
|
+
}
|
|
3973
|
+
return filters.length === 1 ? filters[0] : filters;
|
|
3974
|
+
}
|
|
3975
|
+
return normalizeString(value);
|
|
3976
|
+
}
|
|
4046
3977
|
function matchesTagFilters(fileTags, includeTags, excludeTags) {
|
|
4047
3978
|
const tags = new Set(fileTags ?? []);
|
|
4048
3979
|
if (includeTags.length > 0) {
|
|
@@ -4132,7 +4063,7 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4132
4063
|
target: singleTarget,
|
|
4133
4064
|
cliTargets,
|
|
4134
4065
|
targetsPath: normalizeString(rawOptions.targets),
|
|
4135
|
-
filter:
|
|
4066
|
+
filter: normalizeFilter(rawOptions.filter),
|
|
4136
4067
|
workers: workers > 0 ? workers : void 0,
|
|
4137
4068
|
outPath: cliOut ?? configOut,
|
|
4138
4069
|
outputPaths,
|
|
@@ -4165,7 +4096,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4165
4096
|
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
|
|
4166
4097
|
threshold: normalizeOptionalNumber(rawOptions.threshold),
|
|
4167
4098
|
tags: normalizeStringArray(rawOptions.tag),
|
|
4168
|
-
excludeTags: normalizeStringArray(rawOptions.excludeTag)
|
|
4099
|
+
excludeTags: normalizeStringArray(rawOptions.excludeTag),
|
|
4100
|
+
transcript: normalizeString(rawOptions.transcript)
|
|
4169
4101
|
};
|
|
4170
4102
|
}
|
|
4171
4103
|
async function ensureFileExists(filePath, description) {
|
|
@@ -4191,20 +4123,20 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
4191
4123
|
addLogPaths: (paths, provider) => display.addLogPaths(paths, provider)
|
|
4192
4124
|
};
|
|
4193
4125
|
}
|
|
4194
|
-
function
|
|
4195
|
-
return `${path15.resolve(testFilePath)}::${
|
|
4126
|
+
function makeTestCaseKey(testFilePath, testId) {
|
|
4127
|
+
return `${path15.resolve(testFilePath)}::${testId}`;
|
|
4196
4128
|
}
|
|
4197
4129
|
function createDisplayIdTracker() {
|
|
4198
4130
|
const map = /* @__PURE__ */ new Map();
|
|
4199
4131
|
let nextId = 1;
|
|
4200
4132
|
return {
|
|
4201
|
-
getOrAssign(
|
|
4202
|
-
const existing = map.get(
|
|
4133
|
+
getOrAssign(testCaseKey) {
|
|
4134
|
+
const existing = map.get(testCaseKey);
|
|
4203
4135
|
if (existing !== void 0) {
|
|
4204
4136
|
return existing;
|
|
4205
4137
|
}
|
|
4206
4138
|
const assigned = nextId++;
|
|
4207
|
-
map.set(
|
|
4139
|
+
map.set(testCaseKey, assigned);
|
|
4208
4140
|
return assigned;
|
|
4209
4141
|
}
|
|
4210
4142
|
};
|
|
@@ -4255,58 +4187,79 @@ async function prepareFileMetadata(params) {
|
|
|
4255
4187
|
filter: options.filter,
|
|
4256
4188
|
category
|
|
4257
4189
|
});
|
|
4258
|
-
const
|
|
4259
|
-
const cliTargets = options.cliTargets;
|
|
4190
|
+
const testIds = suite.tests.map((value) => value.id);
|
|
4260
4191
|
const suiteTargets = suite.targets;
|
|
4261
|
-
let targetNames;
|
|
4262
|
-
if (cliTargets.length > 0) {
|
|
4263
|
-
targetNames = cliTargets;
|
|
4264
|
-
} else if (suiteTargets && suiteTargets.length > 0) {
|
|
4265
|
-
targetNames = suiteTargets;
|
|
4266
|
-
} else {
|
|
4267
|
-
targetNames = [];
|
|
4268
|
-
}
|
|
4269
4192
|
let selections;
|
|
4270
|
-
if (
|
|
4271
|
-
const
|
|
4272
|
-
|
|
4273
|
-
|
|
4274
|
-
|
|
4275
|
-
|
|
4276
|
-
|
|
4277
|
-
|
|
4278
|
-
|
|
4279
|
-
|
|
4280
|
-
|
|
4281
|
-
|
|
4282
|
-
});
|
|
4283
|
-
selections = multiSelections.map((sel) => ({
|
|
4284
|
-
selection: sel,
|
|
4285
|
-
inlineTargetLabel: sel.targetName
|
|
4286
|
-
}));
|
|
4287
|
-
} else {
|
|
4288
|
-
const selection = await selectTarget({
|
|
4289
|
-
testFilePath,
|
|
4290
|
-
repoRoot,
|
|
4291
|
-
cwd,
|
|
4292
|
-
explicitTargetsPath: options.targetsPath,
|
|
4293
|
-
cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
|
|
4294
|
-
dryRun: options.dryRun,
|
|
4295
|
-
dryRunDelay: options.dryRunDelay,
|
|
4296
|
-
dryRunDelayMin: options.dryRunDelayMin,
|
|
4297
|
-
dryRunDelayMax: options.dryRunDelayMax,
|
|
4298
|
-
env: process.env
|
|
4299
|
-
});
|
|
4193
|
+
if (options.transcript) {
|
|
4194
|
+
const transcriptSelection = {
|
|
4195
|
+
definitions: [],
|
|
4196
|
+
resolvedTarget: {
|
|
4197
|
+
kind: "transcript",
|
|
4198
|
+
name: "transcript",
|
|
4199
|
+
config: {}
|
|
4200
|
+
},
|
|
4201
|
+
targetName: "transcript",
|
|
4202
|
+
targetSource: "cli",
|
|
4203
|
+
targetsFilePath: options.transcript
|
|
4204
|
+
};
|
|
4300
4205
|
selections = [
|
|
4301
4206
|
{
|
|
4302
|
-
selection,
|
|
4303
|
-
inlineTargetLabel:
|
|
4207
|
+
selection: transcriptSelection,
|
|
4208
|
+
inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
|
|
4304
4209
|
}
|
|
4305
4210
|
];
|
|
4211
|
+
} else {
|
|
4212
|
+
const cliTargets = options.cliTargets;
|
|
4213
|
+
const suiteTargets2 = suite.targets;
|
|
4214
|
+
let targetNames;
|
|
4215
|
+
if (cliTargets.length > 0) {
|
|
4216
|
+
targetNames = cliTargets;
|
|
4217
|
+
} else if (suiteTargets2 && suiteTargets2.length > 0) {
|
|
4218
|
+
targetNames = suiteTargets2;
|
|
4219
|
+
} else {
|
|
4220
|
+
targetNames = [];
|
|
4221
|
+
}
|
|
4222
|
+
if (targetNames.length > 1) {
|
|
4223
|
+
const multiSelections = await selectMultipleTargets({
|
|
4224
|
+
testFilePath,
|
|
4225
|
+
repoRoot,
|
|
4226
|
+
cwd,
|
|
4227
|
+
explicitTargetsPath: options.targetsPath,
|
|
4228
|
+
dryRun: options.dryRun,
|
|
4229
|
+
dryRunDelay: options.dryRunDelay,
|
|
4230
|
+
dryRunDelayMin: options.dryRunDelayMin,
|
|
4231
|
+
dryRunDelayMax: options.dryRunDelayMax,
|
|
4232
|
+
env: process.env,
|
|
4233
|
+
targetNames
|
|
4234
|
+
});
|
|
4235
|
+
selections = multiSelections.map((sel) => ({
|
|
4236
|
+
selection: sel,
|
|
4237
|
+
inlineTargetLabel: sel.targetName
|
|
4238
|
+
}));
|
|
4239
|
+
} else {
|
|
4240
|
+
const selection = await selectTarget({
|
|
4241
|
+
testFilePath,
|
|
4242
|
+
repoRoot,
|
|
4243
|
+
cwd,
|
|
4244
|
+
explicitTargetsPath: options.targetsPath,
|
|
4245
|
+
cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
|
|
4246
|
+
dryRun: options.dryRun,
|
|
4247
|
+
dryRunDelay: options.dryRunDelay,
|
|
4248
|
+
dryRunDelayMin: options.dryRunDelayMin,
|
|
4249
|
+
dryRunDelayMax: options.dryRunDelayMax,
|
|
4250
|
+
env: process.env
|
|
4251
|
+
});
|
|
4252
|
+
selections = [
|
|
4253
|
+
{
|
|
4254
|
+
selection,
|
|
4255
|
+
inlineTargetLabel: selection.targetName
|
|
4256
|
+
}
|
|
4257
|
+
];
|
|
4258
|
+
}
|
|
4306
4259
|
}
|
|
4307
4260
|
return {
|
|
4308
|
-
|
|
4309
|
-
|
|
4261
|
+
testIds,
|
|
4262
|
+
testCases: suite.tests,
|
|
4310
4263
|
selections,
|
|
4311
4264
|
trialsConfig: suite.trials,
|
|
4312
4265
|
suiteTargets,
|
|
@@ -4344,15 +4297,16 @@ async function runSingleEvalFile(params) {
|
|
|
4344
4297
|
workersOverride,
|
|
4345
4298
|
yamlWorkers,
|
|
4346
4299
|
progressReporter,
|
|
4347
|
-
|
|
4300
|
+
seenTestCases,
|
|
4348
4301
|
displayIdTracker,
|
|
4349
4302
|
selection,
|
|
4350
4303
|
inlineTargetLabel,
|
|
4351
|
-
|
|
4304
|
+
testCases,
|
|
4352
4305
|
trialsConfig,
|
|
4353
4306
|
matrixMode,
|
|
4354
4307
|
totalBudgetUsd,
|
|
4355
|
-
failOnError
|
|
4308
|
+
failOnError,
|
|
4309
|
+
providerFactory
|
|
4356
4310
|
} = params;
|
|
4357
4311
|
const targetName = selection.targetName;
|
|
4358
4312
|
await ensureFileExists(testFilePath, "Test file");
|
|
@@ -4408,7 +4362,8 @@ async function runSingleEvalFile(params) {
|
|
|
4408
4362
|
}
|
|
4409
4363
|
return true;
|
|
4410
4364
|
})(),
|
|
4411
|
-
|
|
4365
|
+
filter: options.filter,
|
|
4366
|
+
evalCases: testCases,
|
|
4412
4367
|
verbose: options.verbose,
|
|
4413
4368
|
maxConcurrency: resolvedWorkers,
|
|
4414
4369
|
workspaceMode: options.workspaceMode,
|
|
@@ -4419,6 +4374,7 @@ async function runSingleEvalFile(params) {
|
|
|
4419
4374
|
graderTarget: options.graderTarget,
|
|
4420
4375
|
model: options.model,
|
|
4421
4376
|
threshold: options.threshold,
|
|
4377
|
+
providerFactory,
|
|
4422
4378
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
4423
4379
|
onResult: async (result) => {
|
|
4424
4380
|
streamingObserver?.completeFromResult?.(result);
|
|
@@ -4442,13 +4398,13 @@ async function runSingleEvalFile(params) {
|
|
|
4442
4398
|
}
|
|
4443
4399
|
},
|
|
4444
4400
|
onProgress: async (event) => {
|
|
4445
|
-
const
|
|
4446
|
-
const
|
|
4447
|
-
if (event.status === "pending" && !
|
|
4448
|
-
|
|
4449
|
-
progressReporter.setTotal(
|
|
4401
|
+
const testCaseKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
|
|
4402
|
+
const testCaseKey = makeTestCaseKey(testFilePath, testCaseKeyId);
|
|
4403
|
+
if (event.status === "pending" && !seenTestCases.has(testCaseKey)) {
|
|
4404
|
+
seenTestCases.add(testCaseKey);
|
|
4405
|
+
progressReporter.setTotal(seenTestCases.size);
|
|
4450
4406
|
}
|
|
4451
|
-
const displayId = displayIdTracker.getOrAssign(
|
|
4407
|
+
const displayId = displayIdTracker.getOrAssign(testCaseKey);
|
|
4452
4408
|
if (event.status === "running" && streamingObserver) {
|
|
4453
4409
|
streamingObserver.startEvalCase(event.testId, targetName, testFilePath);
|
|
4454
4410
|
}
|
|
@@ -4534,7 +4490,7 @@ async function runEvalCommand(input) {
|
|
|
4534
4490
|
const useFileExport = !!options.otelFile;
|
|
4535
4491
|
if (options.exportOtel || useFileExport) {
|
|
4536
4492
|
try {
|
|
4537
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
4493
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-XRVHRBJF.js");
|
|
4538
4494
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4539
4495
|
let headers = {};
|
|
4540
4496
|
if (options.otelBackend) {
|
|
@@ -4594,7 +4550,7 @@ async function runEvalCommand(input) {
|
|
|
4594
4550
|
}
|
|
4595
4551
|
const evaluationRunner = await resolveEvaluationRunner();
|
|
4596
4552
|
const allResults = [];
|
|
4597
|
-
const
|
|
4553
|
+
const seenTestCases = /* @__PURE__ */ new Set();
|
|
4598
4554
|
const displayIdTracker = createDisplayIdTracker();
|
|
4599
4555
|
const totalWorkers = options.workers ?? DEFAULT_WORKERS;
|
|
4600
4556
|
const fileConcurrency = Math.min(
|
|
@@ -4656,7 +4612,6 @@ async function runEvalCommand(input) {
|
|
|
4656
4612
|
yamlCache: yamlCacheEnabled
|
|
4657
4613
|
});
|
|
4658
4614
|
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
|
|
4659
|
-
const useCache = cacheEnabled;
|
|
4660
4615
|
if (cacheEnabled) {
|
|
4661
4616
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
4662
4617
|
}
|
|
@@ -4676,7 +4631,7 @@ async function runEvalCommand(input) {
|
|
|
4676
4631
|
let totalEvalCount = 0;
|
|
4677
4632
|
for (const meta of fileMetadata.values()) {
|
|
4678
4633
|
const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
|
|
4679
|
-
for (const test of meta.
|
|
4634
|
+
for (const test of meta.testCases) {
|
|
4680
4635
|
const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
|
|
4681
4636
|
totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
|
|
4682
4637
|
}
|
|
@@ -4720,13 +4675,13 @@ async function runEvalCommand(input) {
|
|
|
4720
4675
|
});
|
|
4721
4676
|
for (const [testFilePath, meta] of fileMetadata.entries()) {
|
|
4722
4677
|
for (const { selection, inlineTargetLabel } of meta.selections) {
|
|
4723
|
-
for (const testId of meta.
|
|
4724
|
-
const
|
|
4678
|
+
for (const testId of meta.testIds) {
|
|
4679
|
+
const testCaseKey = makeTestCaseKey(
|
|
4725
4680
|
testFilePath,
|
|
4726
4681
|
meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId
|
|
4727
4682
|
);
|
|
4728
|
-
|
|
4729
|
-
const displayId = displayIdTracker.getOrAssign(
|
|
4683
|
+
seenTestCases.add(testCaseKey);
|
|
4684
|
+
const displayId = displayIdTracker.getOrAssign(testCaseKey);
|
|
4730
4685
|
progressReporter.update(displayId, {
|
|
4731
4686
|
workerId: displayId,
|
|
4732
4687
|
testId: meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId,
|
|
@@ -4737,6 +4692,24 @@ async function runEvalCommand(input) {
|
|
|
4737
4692
|
}
|
|
4738
4693
|
}
|
|
4739
4694
|
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
|
|
4695
|
+
let transcriptProviderFactory;
|
|
4696
|
+
if (options.transcript) {
|
|
4697
|
+
const { TranscriptProvider } = await import("./dist-XRVHRBJF.js");
|
|
4698
|
+
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
|
|
4699
|
+
const totalTests = [...fileMetadata.values()].reduce(
|
|
4700
|
+
(sum, meta) => sum + meta.testCases.length,
|
|
4701
|
+
0
|
|
4702
|
+
);
|
|
4703
|
+
if (transcriptProvider.lineCount !== totalTests) {
|
|
4704
|
+
throw new Error(
|
|
4705
|
+
`Transcript has ${transcriptProvider.lineCount} entry(s) but eval defines ${totalTests} test(s). Each transcript line maps positionally to one test case.`
|
|
4706
|
+
);
|
|
4707
|
+
}
|
|
4708
|
+
transcriptProviderFactory = () => transcriptProvider;
|
|
4709
|
+
console.log(
|
|
4710
|
+
`Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`
|
|
4711
|
+
);
|
|
4712
|
+
}
|
|
4740
4713
|
try {
|
|
4741
4714
|
await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
|
|
4742
4715
|
const targetPrep = fileMetadata.get(testFilePath);
|
|
@@ -4746,13 +4719,13 @@ async function runEvalCommand(input) {
|
|
|
4746
4719
|
const targetResults = await Promise.all(
|
|
4747
4720
|
targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
|
|
4748
4721
|
const targetName = selection.targetName;
|
|
4749
|
-
const
|
|
4722
|
+
const applicableTestCases = targetPrep.selections.length > 1 ? targetPrep.testCases.filter((test) => {
|
|
4750
4723
|
if (test.targets && test.targets.length > 0) {
|
|
4751
4724
|
return test.targets.includes(targetName);
|
|
4752
4725
|
}
|
|
4753
4726
|
return true;
|
|
4754
|
-
}) : targetPrep.
|
|
4755
|
-
if (
|
|
4727
|
+
}) : targetPrep.testCases;
|
|
4728
|
+
if (applicableTestCases.length === 0) {
|
|
4756
4729
|
return [];
|
|
4757
4730
|
}
|
|
4758
4731
|
try {
|
|
@@ -4768,16 +4741,17 @@ async function runEvalCommand(input) {
|
|
|
4768
4741
|
workersOverride: perFileWorkers,
|
|
4769
4742
|
yamlWorkers: targetPrep.yamlWorkers,
|
|
4770
4743
|
progressReporter,
|
|
4771
|
-
|
|
4744
|
+
seenTestCases,
|
|
4772
4745
|
displayIdTracker,
|
|
4773
4746
|
selection,
|
|
4774
4747
|
inlineTargetLabel,
|
|
4775
|
-
|
|
4776
|
-
trialsConfig: targetPrep.trialsConfig,
|
|
4748
|
+
testCases: applicableTestCases,
|
|
4749
|
+
trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
|
|
4777
4750
|
matrixMode: targetPrep.selections.length > 1,
|
|
4778
4751
|
totalBudgetUsd: targetPrep.totalBudgetUsd,
|
|
4779
4752
|
failOnError: targetPrep.failOnError,
|
|
4780
|
-
threshold: resolvedThreshold
|
|
4753
|
+
threshold: resolvedThreshold,
|
|
4754
|
+
providerFactory: transcriptProviderFactory
|
|
4781
4755
|
});
|
|
4782
4756
|
return result.results;
|
|
4783
4757
|
} catch (fileError) {
|
|
@@ -4785,9 +4759,9 @@ async function runEvalCommand(input) {
|
|
|
4785
4759
|
console.error(`
|
|
4786
4760
|
\u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
|
|
4787
4761
|
`);
|
|
4788
|
-
const errorResults =
|
|
4762
|
+
const errorResults = applicableTestCases.map((testCase) => ({
|
|
4789
4763
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4790
|
-
testId:
|
|
4764
|
+
testId: testCase.id,
|
|
4791
4765
|
score: 0,
|
|
4792
4766
|
assertions: [],
|
|
4793
4767
|
output: [],
|
|
@@ -4824,6 +4798,7 @@ async function runEvalCommand(input) {
|
|
|
4824
4798
|
const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
4825
4799
|
const summary = calculateEvaluationSummary(allResults, thresholdOpts);
|
|
4826
4800
|
console.log(formatEvaluationSummary(summary, thresholdOpts));
|
|
4801
|
+
const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
|
|
4827
4802
|
const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
|
|
4828
4803
|
if (isMatrixMode && allResults.length > 0) {
|
|
4829
4804
|
console.log(formatMatrixSummary(allResults));
|
|
@@ -4907,7 +4882,8 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
4907
4882
|
outputPath,
|
|
4908
4883
|
testFiles: activeTestFiles,
|
|
4909
4884
|
target: options.target,
|
|
4910
|
-
thresholdFailed
|
|
4885
|
+
thresholdFailed,
|
|
4886
|
+
allExecutionErrors
|
|
4911
4887
|
};
|
|
4912
4888
|
} finally {
|
|
4913
4889
|
unsubscribeCodexLogs();
|
|
@@ -4940,6 +4916,43 @@ async function resolveEvaluationRunner() {
|
|
|
4940
4916
|
return candidate;
|
|
4941
4917
|
}
|
|
4942
4918
|
|
|
4919
|
+
// src/commands/eval/discover.ts
|
|
4920
|
+
import path16 from "node:path";
|
|
4921
|
+
import fg2 from "fast-glob";
|
|
4922
|
+
async function discoverEvalFiles(cwd) {
|
|
4923
|
+
const repoRoot = await findRepoRoot(cwd);
|
|
4924
|
+
const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
|
|
4925
|
+
const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
|
|
4926
|
+
const ignore = ["**/node_modules/**", "**/dist/**"];
|
|
4927
|
+
const matches = await fg2(patterns, {
|
|
4928
|
+
cwd,
|
|
4929
|
+
absolute: true,
|
|
4930
|
+
onlyFiles: true,
|
|
4931
|
+
ignore,
|
|
4932
|
+
followSymbolicLinks: true,
|
|
4933
|
+
caseSensitiveMatch: false
|
|
4934
|
+
});
|
|
4935
|
+
const evalFiles = matches.map((absPath) => {
|
|
4936
|
+
const relativePath = path16.relative(cwd, absPath);
|
|
4937
|
+
const category = deriveCategory(relativePath);
|
|
4938
|
+
return { path: absPath, relativePath, category };
|
|
4939
|
+
});
|
|
4940
|
+
evalFiles.sort((a, b) => a.relativePath.localeCompare(b.relativePath));
|
|
4941
|
+
return evalFiles;
|
|
4942
|
+
}
|
|
4943
|
+
function getCategories(files) {
|
|
4944
|
+
const categories = /* @__PURE__ */ new Set();
|
|
4945
|
+
for (const file of files) {
|
|
4946
|
+
categories.add(file.category);
|
|
4947
|
+
}
|
|
4948
|
+
const sorted = Array.from(categories);
|
|
4949
|
+
sorted.sort();
|
|
4950
|
+
return sorted;
|
|
4951
|
+
}
|
|
4952
|
+
function filterByCategory(files, category) {
|
|
4953
|
+
return files.filter((f) => f.category === category);
|
|
4954
|
+
}
|
|
4955
|
+
|
|
4943
4956
|
export {
|
|
4944
4957
|
package_default,
|
|
4945
4958
|
toSnakeCaseDeep,
|
|
@@ -4948,12 +4961,13 @@ export {
|
|
|
4948
4961
|
buildDefaultRunDir,
|
|
4949
4962
|
resolveExistingRunPrimaryPath,
|
|
4950
4963
|
resolveWorkspaceOrFilePath,
|
|
4951
|
-
|
|
4964
|
+
resolveRunManifestPath,
|
|
4952
4965
|
parseResultManifest,
|
|
4953
4966
|
resolveResultSourcePath,
|
|
4954
4967
|
loadManifestResults,
|
|
4955
4968
|
loadLightweightResults,
|
|
4956
4969
|
HtmlWriter,
|
|
4970
|
+
writeArtifactsFromResults,
|
|
4957
4971
|
resolveRunCacheFile,
|
|
4958
4972
|
loadRunCache,
|
|
4959
4973
|
resolveEvalPaths,
|
|
@@ -4966,6 +4980,9 @@ export {
|
|
|
4966
4980
|
TARGET_FILE_CANDIDATES,
|
|
4967
4981
|
fileExists,
|
|
4968
4982
|
selectTarget,
|
|
4969
|
-
runEvalCommand
|
|
4983
|
+
runEvalCommand,
|
|
4984
|
+
discoverEvalFiles,
|
|
4985
|
+
getCategories,
|
|
4986
|
+
filterByCategory
|
|
4970
4987
|
};
|
|
4971
|
-
//# sourceMappingURL=chunk-
|
|
4988
|
+
//# sourceMappingURL=chunk-AX4CQS45.js.map
|