agentv 4.6.1 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{chunk-NSVFUL27.js → chunk-A6W3KOCS.js} +4428 -3605
- package/dist/chunk-A6W3KOCS.js.map +1 -0
- package/dist/{chunk-YXXD27OK.js → chunk-H4GQXK5M.js} +1314 -440
- package/dist/chunk-H4GQXK5M.js.map +1 -0
- package/dist/{chunk-MHWYA4CS.js → chunk-QBZJSQXV.js} +365 -349
- package/dist/chunk-QBZJSQXV.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-BN5NUVAB.js → dist-QXVR2ZRH.js} +16 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-DMSVE6CS.js → interactive-IRYNIFCY.js} +10 -47
- package/dist/interactive-IRYNIFCY.js.map +1 -0
- package/dist/studio/assets/index-DHxVz6M9.css +1 -0
- package/dist/studio/assets/{index-C7TnyYee.js → index-DcwjOyrk.js} +1 -1
- package/dist/studio/assets/index-Y5InSvcS.js +65 -0
- package/dist/studio/index.html +2 -2
- package/package.json +1 -1
- package/dist/chunk-MHWYA4CS.js.map +0 -1
- package/dist/chunk-NSVFUL27.js.map +0 -1
- package/dist/chunk-YXXD27OK.js.map +0 -1
- package/dist/interactive-DMSVE6CS.js.map +0 -1
- package/dist/studio/assets/index-jJVIJh8b.css +0 -1
- package/dist/studio/assets/index-vn54AYtS.js +0 -65
- package/dist/{dist-BN5NUVAB.js.map → dist-QXVR2ZRH.js.map} +0 -0
|
@@ -2,6 +2,8 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
|
|
|
2
2
|
import {
|
|
3
3
|
CLI_PLACEHOLDERS,
|
|
4
4
|
COMMON_TARGET_SETTINGS,
|
|
5
|
+
DEFAULT_EVAL_PATTERNS,
|
|
6
|
+
DEFAULT_THRESHOLD,
|
|
5
7
|
KNOWN_PROVIDERS,
|
|
6
8
|
PROVIDER_ALIASES,
|
|
7
9
|
ResponseCache,
|
|
@@ -9,6 +11,7 @@ import {
|
|
|
9
11
|
buildSearchRoots,
|
|
10
12
|
deriveCategory,
|
|
11
13
|
ensureVSCodeSubagents,
|
|
14
|
+
findDeprecatedCamelCaseTargetWarnings,
|
|
12
15
|
findGitRoot,
|
|
13
16
|
interpolateEnv,
|
|
14
17
|
isEvaluatorKind,
|
|
@@ -29,12 +32,12 @@ import {
|
|
|
29
32
|
subscribeToCopilotCliLogEntries,
|
|
30
33
|
subscribeToCopilotSdkLogEntries,
|
|
31
34
|
subscribeToPiLogEntries
|
|
32
|
-
} from "./chunk-YXXD27OK.js";
|
|
35
|
+
} from "./chunk-H4GQXK5M.js";
|
|
33
36
|
|
|
34
37
|
// package.json
|
|
35
38
|
var package_default = {
|
|
36
39
|
name: "agentv",
|
|
37
|
-
version: "4.6.1",
|
|
40
|
+
version: "4.8.0",
|
|
38
41
|
description: "CLI entry point for AgentV",
|
|
39
42
|
type: "module",
|
|
40
43
|
repository: {
|
|
@@ -346,6 +349,9 @@ function buildDefaultRunDir(cwd) {
|
|
|
346
349
|
function resolveRunIndexPath(runDir) {
|
|
347
350
|
return path3.join(runDir, RESULT_INDEX_FILENAME);
|
|
348
351
|
}
|
|
352
|
+
function isRunManifestPath(filePath) {
|
|
353
|
+
return path3.basename(filePath) === RESULT_INDEX_FILENAME;
|
|
354
|
+
}
|
|
349
355
|
function resolveExistingRunPrimaryPath(runDir) {
|
|
350
356
|
const indexPath = resolveRunIndexPath(runDir);
|
|
351
357
|
if (existsSync(indexPath)) {
|
|
@@ -370,9 +376,19 @@ function resolveWorkspaceOrFilePath(filePath) {
|
|
|
370
376
|
}
|
|
371
377
|
return existing;
|
|
372
378
|
}
|
|
379
|
+
function resolveRunManifestPath(filePath) {
|
|
380
|
+
if (isDirectoryPath(filePath)) {
|
|
381
|
+
return resolveWorkspaceOrFilePath(filePath);
|
|
382
|
+
}
|
|
383
|
+
if (!isRunManifestPath(filePath)) {
|
|
384
|
+
throw new Error(
|
|
385
|
+
`Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`
|
|
386
|
+
);
|
|
387
|
+
}
|
|
388
|
+
return filePath;
|
|
389
|
+
}
|
|
373
390
|
|
|
374
391
|
// src/commands/eval/artifact-writer.ts
|
|
375
|
-
var PASS_THRESHOLD = 0.8;
|
|
376
392
|
function computeStats(values) {
|
|
377
393
|
if (values.length === 0) {
|
|
378
394
|
return { mean: 0, stddev: 0 };
|
|
@@ -387,10 +403,10 @@ function computeStats(values) {
|
|
|
387
403
|
function computePassRate(result) {
|
|
388
404
|
const scores = result.scores;
|
|
389
405
|
if (scores && scores.length > 0) {
|
|
390
|
-
const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
|
|
406
|
+
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
|
|
391
407
|
return passed / scores.length;
|
|
392
408
|
}
|
|
393
|
-
return (result.score ?? 0) >= PASS_THRESHOLD ? 1 : 0;
|
|
409
|
+
return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
|
|
394
410
|
}
|
|
395
411
|
function countToolCalls(result) {
|
|
396
412
|
const toolCalls = {};
|
|
@@ -596,12 +612,12 @@ function safeArtifactPathSegment(value, fallback) {
|
|
|
596
612
|
function safeTestId(testId) {
|
|
597
613
|
return safeArtifactPathSegment(testId, "unknown");
|
|
598
614
|
}
|
|
599
|
-
function
|
|
600
|
-
return result.
|
|
615
|
+
function getSuite(result) {
|
|
616
|
+
return result.suite;
|
|
601
617
|
}
|
|
602
618
|
function buildArtifactSubdir(result) {
|
|
603
619
|
const segments = [];
|
|
604
|
-
const evalSet =
|
|
620
|
+
const evalSet = getSuite(result);
|
|
605
621
|
if (evalSet) {
|
|
606
622
|
segments.push(safeArtifactPathSegment(evalSet, "default"));
|
|
607
623
|
}
|
|
@@ -628,7 +644,7 @@ function buildResultIndexArtifact(result) {
|
|
|
628
644
|
return {
|
|
629
645
|
timestamp: result.timestamp,
|
|
630
646
|
test_id: result.testId ?? "unknown",
|
|
631
|
-
|
|
647
|
+
suite: getSuite(result),
|
|
632
648
|
category: result.category,
|
|
633
649
|
conversation_id: result.conversationId,
|
|
634
650
|
score: result.score,
|
|
@@ -651,42 +667,6 @@ async function writeJsonlFile(filePath, records) {
|
|
|
651
667
|
`;
|
|
652
668
|
await writeFile(filePath, content, "utf8");
|
|
653
669
|
}
|
|
654
|
-
function toCamelCase(str) {
|
|
655
|
-
return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
|
|
656
|
-
}
|
|
657
|
-
function toCamelCaseDeep(obj) {
|
|
658
|
-
if (obj === null || obj === void 0) {
|
|
659
|
-
return obj;
|
|
660
|
-
}
|
|
661
|
-
if (Array.isArray(obj)) {
|
|
662
|
-
return obj.map((item) => toCamelCaseDeep(item));
|
|
663
|
-
}
|
|
664
|
-
if (typeof obj === "object") {
|
|
665
|
-
const result = {};
|
|
666
|
-
for (const [key, value] of Object.entries(obj)) {
|
|
667
|
-
result[toCamelCase(key)] = toCamelCaseDeep(value);
|
|
668
|
-
}
|
|
669
|
-
return result;
|
|
670
|
-
}
|
|
671
|
-
return obj;
|
|
672
|
-
}
|
|
673
|
-
function parseJsonlResults(content) {
|
|
674
|
-
const results = [];
|
|
675
|
-
const lines = content.split("\n");
|
|
676
|
-
for (const line of lines) {
|
|
677
|
-
const trimmed = line.trim();
|
|
678
|
-
if (trimmed.length === 0) {
|
|
679
|
-
continue;
|
|
680
|
-
}
|
|
681
|
-
try {
|
|
682
|
-
const parsed = JSON.parse(trimmed);
|
|
683
|
-
const camelCased = toCamelCaseDeep(parsed);
|
|
684
|
-
results.push(camelCased);
|
|
685
|
-
} catch {
|
|
686
|
-
}
|
|
687
|
-
}
|
|
688
|
-
return results;
|
|
689
|
-
}
|
|
690
670
|
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
691
671
|
const testArtifactDir = outputDir;
|
|
692
672
|
const timingPath = path4.join(outputDir, "timing.json");
|
|
@@ -733,7 +713,6 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
733
713
|
|
|
734
714
|
// src/commands/eval/benchmark-writer.ts
|
|
735
715
|
import { writeFile as writeFile2 } from "node:fs/promises";
|
|
736
|
-
var PASS_THRESHOLD2 = 0.8;
|
|
737
716
|
function computeStats2(values) {
|
|
738
717
|
if (values.length === 0) {
|
|
739
718
|
return { mean: 0, stddev: 0 };
|
|
@@ -748,10 +727,10 @@ function computeStats2(values) {
|
|
|
748
727
|
function computePassRate2(result) {
|
|
749
728
|
const scores = result.scores;
|
|
750
729
|
if (scores && scores.length > 0) {
|
|
751
|
-
const passed = scores.filter((s) => s.score >= PASS_THRESHOLD2).length;
|
|
730
|
+
const passed = scores.filter((s) => s.score >= DEFAULT_THRESHOLD).length;
|
|
752
731
|
return passed / scores.length;
|
|
753
732
|
}
|
|
754
|
-
return result.score >= PASS_THRESHOLD2 ? 1 : 0;
|
|
733
|
+
return result.score >= DEFAULT_THRESHOLD ? 1 : 0;
|
|
755
734
|
}
|
|
756
735
|
function buildBenchmarkJson(results) {
|
|
757
736
|
const passRates = results.map(computePassRate2);
|
|
@@ -1698,7 +1677,7 @@ var JunitWriter = class _JunitWriter {
|
|
|
1698
1677
|
this.closed = true;
|
|
1699
1678
|
const grouped = /* @__PURE__ */ new Map();
|
|
1700
1679
|
for (const result of this.results) {
|
|
1701
|
-
const suite = result.
|
|
1680
|
+
const suite = result.suite ?? "default";
|
|
1702
1681
|
const existing = grouped.get(suite);
|
|
1703
1682
|
if (existing) {
|
|
1704
1683
|
existing.push(result);
|
|
@@ -1708,14 +1687,17 @@ var JunitWriter = class _JunitWriter {
|
|
|
1708
1687
|
}
|
|
1709
1688
|
const suiteXmls = [];
|
|
1710
1689
|
for (const [suiteName, results] of grouped) {
|
|
1711
|
-
const
|
|
1712
|
-
const
|
|
1690
|
+
const errors = results.filter((r) => r.executionStatus === "execution_error").length;
|
|
1691
|
+
const failures = results.filter(
|
|
1692
|
+
(r) => r.executionStatus !== "execution_error" && r.score < this.threshold
|
|
1693
|
+
).length;
|
|
1713
1694
|
const testCases = results.map((r) => {
|
|
1714
1695
|
const time = r.durationMs ? (r.durationMs / 1e3).toFixed(3) : "0.000";
|
|
1715
1696
|
let inner = "";
|
|
1716
|
-
if (r.
|
|
1697
|
+
if (r.executionStatus === "execution_error") {
|
|
1698
|
+
const errorMsg = r.error ?? "Execution error";
|
|
1717
1699
|
inner = `
|
|
1718
|
-
<error message="${escapeXml(
|
|
1700
|
+
<error message="${escapeXml(errorMsg)}">${escapeXml(errorMsg)}</error>
|
|
1719
1701
|
`;
|
|
1720
1702
|
} else if (r.score < this.threshold) {
|
|
1721
1703
|
const message = `score=${r.score.toFixed(3)}`;
|
|
@@ -1730,17 +1712,21 @@ var JunitWriter = class _JunitWriter {
|
|
|
1730
1712
|
}
|
|
1731
1713
|
return ` <testcase name="${escapeXml(r.testId)}" classname="${escapeXml(suiteName)}" time="${time}">${inner}</testcase>`;
|
|
1732
1714
|
});
|
|
1715
|
+
const suiteTime = results.reduce((sum, r) => sum + (r.durationMs ?? 0), 0) / 1e3;
|
|
1733
1716
|
suiteXmls.push(
|
|
1734
|
-
` <testsuite name="${escapeXml(suiteName)}" tests="${results.length}" failures="${failures}" errors="${errors}">
|
|
1717
|
+
` <testsuite name="${escapeXml(suiteName)}" tests="${results.length}" failures="${failures}" errors="${errors}" time="${suiteTime.toFixed(3)}">
|
|
1735
1718
|
${testCases.join("\n")}
|
|
1736
1719
|
</testsuite>`
|
|
1737
1720
|
);
|
|
1738
1721
|
}
|
|
1739
1722
|
const totalTests = this.results.length;
|
|
1740
|
-
const
|
|
1741
|
-
const
|
|
1723
|
+
const totalErrors = this.results.filter((r) => r.executionStatus === "execution_error").length;
|
|
1724
|
+
const totalFailures = this.results.filter(
|
|
1725
|
+
(r) => r.executionStatus !== "execution_error" && r.score < this.threshold
|
|
1726
|
+
).length;
|
|
1727
|
+
const totalTime = this.results.reduce((sum, r) => sum + (r.durationMs ?? 0), 0) / 1e3;
|
|
1742
1728
|
const xml = `<?xml version="1.0" encoding="UTF-8"?>
|
|
1743
|
-
<testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}">
|
|
1729
|
+
<testsuites tests="${totalTests}" failures="${totalFailures}" errors="${totalErrors}" time="${totalTime.toFixed(3)}">
|
|
1744
1730
|
${suiteXmls.join("\n")}
|
|
1745
1731
|
</testsuites>
|
|
1746
1732
|
`;
|
|
@@ -1839,17 +1825,6 @@ function createWriterFromPath(filePath, options) {
|
|
|
1839
1825
|
);
|
|
1840
1826
|
}
|
|
1841
1827
|
}
|
|
1842
|
-
async function createMultiWriter(filePaths, options) {
|
|
1843
|
-
const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp, options)));
|
|
1844
|
-
return {
|
|
1845
|
-
async append(result) {
|
|
1846
|
-
await Promise.all(writers.map((w) => w.append(result)));
|
|
1847
|
-
},
|
|
1848
|
-
async close() {
|
|
1849
|
-
await Promise.all(writers.map((w) => w.close()));
|
|
1850
|
-
}
|
|
1851
|
-
};
|
|
1852
|
-
}
|
|
1853
1828
|
|
|
1854
1829
|
// src/commands/eval/progress-display.ts
|
|
1855
1830
|
var ANSI_BOLD = "\x1B[1m";
|
|
@@ -1926,12 +1901,12 @@ var ProgressDisplay = class {
|
|
|
1926
1901
|
}
|
|
1927
1902
|
addLogPaths(paths, provider) {
|
|
1928
1903
|
const newPaths = [];
|
|
1929
|
-
for (const
|
|
1930
|
-
if (this.logPathSet.has(
|
|
1904
|
+
for (const path17 of paths) {
|
|
1905
|
+
if (this.logPathSet.has(path17)) {
|
|
1931
1906
|
continue;
|
|
1932
1907
|
}
|
|
1933
|
-
this.logPathSet.add(
|
|
1934
|
-
newPaths.push(
|
|
1908
|
+
this.logPathSet.add(path17);
|
|
1909
|
+
newPaths.push(path17);
|
|
1935
1910
|
}
|
|
1936
1911
|
if (newPaths.length === 0) {
|
|
1937
1912
|
return;
|
|
@@ -1944,8 +1919,8 @@ var ProgressDisplay = class {
|
|
|
1944
1919
|
this.hasPrintedLogHeader = true;
|
|
1945
1920
|
}
|
|
1946
1921
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
1947
|
-
newPaths.forEach((
|
|
1948
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
1922
|
+
newPaths.forEach((path17, offset) => {
|
|
1923
|
+
console.log(`${startIndex + offset + 1}. ${path17}`);
|
|
1949
1924
|
});
|
|
1950
1925
|
}
|
|
1951
1926
|
finish() {
|
|
@@ -1962,9 +1937,6 @@ import path12 from "node:path";
|
|
|
1962
1937
|
function parseJsonlLines(content) {
|
|
1963
1938
|
return content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0).map((line) => JSON.parse(line));
|
|
1964
1939
|
}
|
|
1965
|
-
function isIndexManifestPath(sourceFile) {
|
|
1966
|
-
return path12.basename(sourceFile) === RESULT_INDEX_FILENAME;
|
|
1967
|
-
}
|
|
1968
1940
|
function parseMarkdownMessages(content) {
|
|
1969
1941
|
const trimmed = content.trim();
|
|
1970
1942
|
if (!trimmed.startsWith("@[")) {
|
|
@@ -2022,11 +1994,11 @@ function hydrateOutput(baseDir, record) {
|
|
|
2022
1994
|
function hydrateManifestRecord(baseDir, record) {
|
|
2023
1995
|
const grading = readOptionalJson(baseDir, record.grading_path);
|
|
2024
1996
|
const timing = readOptionalJson(baseDir, record.timing_path);
|
|
2025
|
-
const testId = record.test_id ??
|
|
1997
|
+
const testId = record.test_id ?? "unknown";
|
|
2026
1998
|
return {
|
|
2027
1999
|
timestamp: record.timestamp,
|
|
2028
2000
|
testId,
|
|
2029
|
-
|
|
2001
|
+
suite: record.suite,
|
|
2030
2002
|
category: record.category,
|
|
2031
2003
|
target: record.target,
|
|
2032
2004
|
score: record.score,
|
|
@@ -2066,74 +2038,44 @@ function parseResultManifest(content) {
|
|
|
2066
2038
|
}
|
|
2067
2039
|
function resolveResultSourcePath(source, cwd) {
|
|
2068
2040
|
const resolved = path12.isAbsolute(source) ? source : path12.resolve(cwd ?? process.cwd(), source);
|
|
2069
|
-
|
|
2041
|
+
if (isDirectoryPath(resolved) || path12.basename(resolved) === RESULT_INDEX_FILENAME) {
|
|
2042
|
+
return resolveRunManifestPath(resolved);
|
|
2043
|
+
}
|
|
2044
|
+
return resolved;
|
|
2070
2045
|
}
|
|
2071
2046
|
function loadManifestResults(sourceFile) {
|
|
2072
|
-
const resolvedSourceFile =
|
|
2073
|
-
if (!isIndexManifestPath(resolvedSourceFile)) {
|
|
2074
|
-
return parseJsonlResults(readFileSync(resolvedSourceFile, "utf8"));
|
|
2075
|
-
}
|
|
2047
|
+
const resolvedSourceFile = resolveRunManifestPath(sourceFile);
|
|
2076
2048
|
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
2077
2049
|
const records = parseResultManifest(content);
|
|
2078
2050
|
const baseDir = path12.dirname(resolvedSourceFile);
|
|
2079
2051
|
return records.map((record) => hydrateManifestRecord(baseDir, record));
|
|
2080
2052
|
}
|
|
2081
2053
|
function loadLightweightResults(sourceFile) {
|
|
2082
|
-
const resolvedSourceFile =
|
|
2054
|
+
const resolvedSourceFile = resolveRunManifestPath(sourceFile);
|
|
2083
2055
|
const content = readFileSync(resolvedSourceFile, "utf8");
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
}
|
|
2096
|
-
const records = [];
|
|
2097
|
-
for (const line of content.split(/\r?\n/)) {
|
|
2098
|
-
const trimmed = line.trim();
|
|
2099
|
-
if (!trimmed) {
|
|
2100
|
-
continue;
|
|
2101
|
-
}
|
|
2102
|
-
let record;
|
|
2103
|
-
try {
|
|
2104
|
-
record = JSON.parse(trimmed);
|
|
2105
|
-
} catch {
|
|
2106
|
-
continue;
|
|
2107
|
-
}
|
|
2108
|
-
const rawTestId = record.test_id ?? record.eval_id ?? record.testId ?? record.evalId;
|
|
2109
|
-
if (typeof rawTestId !== "string") {
|
|
2110
|
-
throw new Error(`Missing test_id in result: ${trimmed}`);
|
|
2111
|
-
}
|
|
2112
|
-
if (typeof record.score !== "number") {
|
|
2113
|
-
throw new Error(`Missing or invalid score in result: ${trimmed}`);
|
|
2114
|
-
}
|
|
2115
|
-
records.push({
|
|
2116
|
-
testId: rawTestId,
|
|
2117
|
-
target: typeof record.target === "string" ? record.target : void 0,
|
|
2118
|
-
score: record.score,
|
|
2119
|
-
scores: Array.isArray(record.scores) ? record.scores : void 0,
|
|
2120
|
-
executionStatus: typeof record.execution_status === "string" ? record.execution_status : typeof record.executionStatus === "string" ? record.executionStatus : void 0,
|
|
2121
|
-
error: typeof record.error === "string" ? record.error : void 0,
|
|
2122
|
-
timestamp: typeof record.timestamp === "string" ? record.timestamp : void 0
|
|
2123
|
-
});
|
|
2124
|
-
}
|
|
2125
|
-
return records;
|
|
2056
|
+
return parseResultManifest(content).map((record) => ({
|
|
2057
|
+
testId: record.test_id ?? "unknown",
|
|
2058
|
+
suite: record.suite,
|
|
2059
|
+
target: record.target,
|
|
2060
|
+
experiment: record.experiment,
|
|
2061
|
+
score: record.score,
|
|
2062
|
+
scores: record.scores,
|
|
2063
|
+
executionStatus: record.execution_status,
|
|
2064
|
+
error: record.error,
|
|
2065
|
+
timestamp: record.timestamp
|
|
2066
|
+
}));
|
|
2126
2067
|
}
|
|
2127
2068
|
|
|
2128
2069
|
// src/commands/eval/retry-errors.ts
|
|
2070
|
+
async function loadRetrySourceResults(jsonlPath) {
|
|
2071
|
+
return loadManifestResults(resolveResultSourcePath(jsonlPath));
|
|
2072
|
+
}
|
|
2129
2073
|
async function loadErrorTestIds(jsonlPath) {
|
|
2130
|
-
const
|
|
2131
|
-
const ids = loadLightweightResults(resolvedPath).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
|
|
2074
|
+
const ids = (await loadRetrySourceResults(jsonlPath)).filter((result) => result.executionStatus === "execution_error").map((result) => result.testId);
|
|
2132
2075
|
return [...new Set(ids)];
|
|
2133
2076
|
}
|
|
2134
2077
|
async function loadNonErrorResults(jsonlPath) {
|
|
2135
|
-
|
|
2136
|
-
return loadManifestResults(resolvedPath).filter(
|
|
2078
|
+
return (await loadRetrySourceResults(jsonlPath)).filter(
|
|
2137
2079
|
(result) => result.testId && result.executionStatus !== "execution_error"
|
|
2138
2080
|
);
|
|
2139
2081
|
}
|
|
@@ -2146,7 +2088,7 @@ function resolveRunCacheFile(cache) {
|
|
|
2146
2088
|
if (cache.lastRunDir) {
|
|
2147
2089
|
return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
|
|
2148
2090
|
}
|
|
2149
|
-
return
|
|
2091
|
+
return "";
|
|
2150
2092
|
}
|
|
2151
2093
|
function cachePath(cwd) {
|
|
2152
2094
|
return path13.join(cwd, ".agentv", CACHE_FILENAME);
|
|
@@ -2160,15 +2102,14 @@ async function loadRunCache(cwd) {
|
|
|
2160
2102
|
}
|
|
2161
2103
|
}
|
|
2162
2104
|
async function saveRunCache(cwd, resultPath) {
|
|
2105
|
+
if (path13.basename(resultPath) !== RESULT_INDEX_FILENAME) {
|
|
2106
|
+
return;
|
|
2107
|
+
}
|
|
2163
2108
|
const dir = path13.join(cwd, ".agentv");
|
|
2164
2109
|
await mkdir7(dir, { recursive: true });
|
|
2165
|
-
const
|
|
2166
|
-
const cache = basename === RESULT_INDEX_FILENAME ? {
|
|
2110
|
+
const cache = {
|
|
2167
2111
|
lastRunDir: path13.dirname(resultPath),
|
|
2168
2112
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2169
|
-
} : {
|
|
2170
|
-
lastResultFile: resultPath,
|
|
2171
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
2172
2113
|
};
|
|
2173
2114
|
await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
|
|
2174
2115
|
`, "utf-8");
|
|
@@ -2313,11 +2254,21 @@ function formatEvaluationSummary(summary, options) {
|
|
|
2313
2254
|
}
|
|
2314
2255
|
const gradedCount = summary.total - summary.executionErrorCount;
|
|
2315
2256
|
const threshold = options?.threshold ?? 0.8;
|
|
2316
|
-
const
|
|
2317
|
-
const
|
|
2257
|
+
const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
|
|
2258
|
+
const overallPassed = !allExecutionErrors && (summary.passedCount === gradedCount || summary.qualityFailureCount === 0 && summary.executionErrorCount === 0);
|
|
2318
2259
|
const useColor = !(process.env.NO_COLOR !== void 0) && (process.stdout.isTTY ?? false);
|
|
2319
|
-
|
|
2320
|
-
|
|
2260
|
+
let overallVerdict;
|
|
2261
|
+
let verdictColor;
|
|
2262
|
+
let verdictText;
|
|
2263
|
+
if (allExecutionErrors) {
|
|
2264
|
+
overallVerdict = "INCONCLUSIVE";
|
|
2265
|
+
verdictColor = "\x1B[33m";
|
|
2266
|
+
verdictText = `RESULT: INCONCLUSIVE (all ${summary.total} test(s) had execution errors \u2014 no evaluation was performed)`;
|
|
2267
|
+
} else {
|
|
2268
|
+
overallVerdict = overallPassed ? "PASS" : "FAIL";
|
|
2269
|
+
verdictColor = overallPassed ? "\x1B[32m" : "\x1B[31m";
|
|
2270
|
+
verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`;
|
|
2271
|
+
}
|
|
2321
2272
|
lines.push("\n==================================================");
|
|
2322
2273
|
if (useColor) {
|
|
2323
2274
|
lines.push(`\x1B[1m${verdictColor}${verdictText}\x1B[0m`);
|
|
@@ -2527,7 +2478,7 @@ var KNOWN_TEST_FIELDS = /* @__PURE__ */ new Set([
|
|
|
2527
2478
|
"workspace",
|
|
2528
2479
|
"metadata",
|
|
2529
2480
|
"conversation_id",
|
|
2530
|
-
"
|
|
2481
|
+
"suite",
|
|
2531
2482
|
"note"
|
|
2532
2483
|
]);
|
|
2533
2484
|
var NAME_PATTERN = /^[a-z0-9-]+$/;
|
|
@@ -3090,87 +3041,68 @@ function isObject2(value) {
|
|
|
3090
3041
|
var COMMON_SETTINGS = new Set(COMMON_TARGET_SETTINGS);
|
|
3091
3042
|
var RETRY_SETTINGS = /* @__PURE__ */ new Set([
|
|
3092
3043
|
"max_retries",
|
|
3093
|
-
"maxRetries",
|
|
3094
3044
|
"retry_initial_delay_ms",
|
|
3095
|
-
"retryInitialDelayMs",
|
|
3096
3045
|
"retry_max_delay_ms",
|
|
3097
|
-
"retryMaxDelayMs",
|
|
3098
3046
|
"retry_backoff_factor",
|
|
3099
|
-
"
|
|
3100
|
-
"retry_status_codes",
|
|
3101
|
-
"retryStatusCodes"
|
|
3047
|
+
"retry_status_codes"
|
|
3102
3048
|
]);
|
|
3103
3049
|
var AZURE_SETTINGS = /* @__PURE__ */ new Set([
|
|
3104
3050
|
...COMMON_SETTINGS,
|
|
3105
3051
|
...RETRY_SETTINGS,
|
|
3106
3052
|
"endpoint",
|
|
3107
3053
|
"resource",
|
|
3108
|
-
"resourceName",
|
|
3109
3054
|
"api_key",
|
|
3110
|
-
"apiKey",
|
|
3111
3055
|
"deployment",
|
|
3112
|
-
"deploymentName",
|
|
3113
3056
|
"model",
|
|
3114
3057
|
"version",
|
|
3115
3058
|
"api_version",
|
|
3059
|
+
"api_format",
|
|
3116
3060
|
"temperature",
|
|
3117
|
-
"max_output_tokens"
|
|
3118
|
-
"maxTokens"
|
|
3061
|
+
"max_output_tokens"
|
|
3119
3062
|
]);
|
|
3120
3063
|
var OPENAI_SETTINGS = /* @__PURE__ */ new Set([
|
|
3121
3064
|
...COMMON_SETTINGS,
|
|
3122
3065
|
...RETRY_SETTINGS,
|
|
3123
3066
|
"endpoint",
|
|
3124
3067
|
"base_url",
|
|
3125
|
-
"baseUrl",
|
|
3126
3068
|
"api_key",
|
|
3127
|
-
"apiKey",
|
|
3128
3069
|
"model",
|
|
3129
3070
|
"deployment",
|
|
3130
3071
|
"variant",
|
|
3131
3072
|
"api_format",
|
|
3132
|
-
"apiFormat",
|
|
3133
3073
|
"temperature",
|
|
3134
|
-
"max_output_tokens"
|
|
3135
|
-
"maxTokens"
|
|
3074
|
+
"max_output_tokens"
|
|
3136
3075
|
]);
|
|
3137
3076
|
var OPENROUTER_SETTINGS = /* @__PURE__ */ new Set([
|
|
3138
3077
|
...COMMON_SETTINGS,
|
|
3139
3078
|
...RETRY_SETTINGS,
|
|
3140
3079
|
"api_key",
|
|
3141
|
-
"apiKey",
|
|
3142
3080
|
"model",
|
|
3143
3081
|
"deployment",
|
|
3144
3082
|
"variant",
|
|
3145
3083
|
"temperature",
|
|
3146
|
-
"max_output_tokens"
|
|
3147
|
-
"maxTokens"
|
|
3084
|
+
"max_output_tokens"
|
|
3148
3085
|
]);
|
|
3149
3086
|
var ANTHROPIC_SETTINGS = /* @__PURE__ */ new Set([
|
|
3150
3087
|
...COMMON_SETTINGS,
|
|
3151
3088
|
...RETRY_SETTINGS,
|
|
3152
3089
|
"api_key",
|
|
3153
|
-
"apiKey",
|
|
3154
3090
|
"model",
|
|
3155
3091
|
"deployment",
|
|
3156
3092
|
"variant",
|
|
3157
3093
|
"temperature",
|
|
3158
3094
|
"max_output_tokens",
|
|
3159
|
-
"
|
|
3160
|
-
"thinking_budget",
|
|
3161
|
-
"thinkingBudget"
|
|
3095
|
+
"thinking_budget"
|
|
3162
3096
|
]);
|
|
3163
3097
|
var GEMINI_SETTINGS = /* @__PURE__ */ new Set([
|
|
3164
3098
|
...COMMON_SETTINGS,
|
|
3165
3099
|
...RETRY_SETTINGS,
|
|
3166
3100
|
"api_key",
|
|
3167
|
-
"apiKey",
|
|
3168
3101
|
"model",
|
|
3169
3102
|
"deployment",
|
|
3170
3103
|
"variant",
|
|
3171
3104
|
"temperature",
|
|
3172
|
-
"max_output_tokens"
|
|
3173
|
-
"maxTokens"
|
|
3105
|
+
"max_output_tokens"
|
|
3174
3106
|
]);
|
|
3175
3107
|
var CODEX_SETTINGS = /* @__PURE__ */ new Set([
|
|
3176
3108
|
...COMMON_SETTINGS,
|
|
@@ -3182,40 +3114,26 @@ var CODEX_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3182
3114
|
"arguments",
|
|
3183
3115
|
"cwd",
|
|
3184
3116
|
"timeout_seconds",
|
|
3185
|
-
"timeoutSeconds",
|
|
3186
3117
|
"log_dir",
|
|
3187
|
-
"logDir",
|
|
3188
3118
|
"log_directory",
|
|
3189
|
-
"logDirectory",
|
|
3190
3119
|
"log_format",
|
|
3191
|
-
"logFormat",
|
|
3192
3120
|
"log_output_format",
|
|
3193
|
-
"logOutputFormat",
|
|
3194
3121
|
"system_prompt",
|
|
3195
|
-
"
|
|
3196
|
-
"workspace_template",
|
|
3197
|
-
"workspaceTemplate"
|
|
3122
|
+
"workspace_template"
|
|
3198
3123
|
]);
|
|
3199
3124
|
var COPILOT_SDK_SETTINGS = /* @__PURE__ */ new Set([
|
|
3200
3125
|
...COMMON_SETTINGS,
|
|
3201
3126
|
"cli_url",
|
|
3202
|
-
"cliUrl",
|
|
3203
3127
|
"cli_path",
|
|
3204
|
-
"cliPath",
|
|
3205
3128
|
"github_token",
|
|
3206
|
-
"githubToken",
|
|
3207
3129
|
"model",
|
|
3208
3130
|
"cwd",
|
|
3209
3131
|
"timeout_seconds",
|
|
3210
|
-
"timeoutSeconds",
|
|
3211
3132
|
"log_dir",
|
|
3212
|
-
"logDir",
|
|
3213
3133
|
"log_format",
|
|
3214
|
-
"logFormat",
|
|
3215
3134
|
"system_prompt",
|
|
3216
|
-
"systemPrompt",
|
|
3217
3135
|
"workspace_template",
|
|
3218
|
-
"
|
|
3136
|
+
"byok"
|
|
3219
3137
|
]);
|
|
3220
3138
|
var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
3221
3139
|
...COMMON_SETTINGS,
|
|
@@ -3227,35 +3145,23 @@ var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3227
3145
|
"model",
|
|
3228
3146
|
"cwd",
|
|
3229
3147
|
"timeout_seconds",
|
|
3230
|
-
"timeoutSeconds",
|
|
3231
3148
|
"log_dir",
|
|
3232
|
-
"logDir",
|
|
3233
3149
|
"log_format",
|
|
3234
|
-
"logFormat",
|
|
3235
3150
|
"system_prompt",
|
|
3236
|
-
"
|
|
3237
|
-
"workspace_template",
|
|
3238
|
-
"workspaceTemplate"
|
|
3151
|
+
"workspace_template"
|
|
3239
3152
|
]);
|
|
3240
3153
|
var VSCODE_SETTINGS = /* @__PURE__ */ new Set([
|
|
3241
3154
|
...COMMON_SETTINGS,
|
|
3242
3155
|
"executable",
|
|
3243
3156
|
"workspace_template",
|
|
3244
|
-
"workspaceTemplate",
|
|
3245
3157
|
"wait",
|
|
3246
3158
|
"dry_run",
|
|
3247
|
-
"dryRun",
|
|
3248
3159
|
"subagent_root",
|
|
3249
|
-
"
|
|
3250
|
-
"timeout_seconds",
|
|
3251
|
-
"timeoutSeconds"
|
|
3160
|
+
"timeout_seconds"
|
|
3252
3161
|
]);
|
|
3253
3162
|
var MOCK_SETTINGS = /* @__PURE__ */ new Set([
|
|
3254
3163
|
...COMMON_SETTINGS,
|
|
3255
3164
|
"response",
|
|
3256
|
-
"delayMs",
|
|
3257
|
-
"delayMinMs",
|
|
3258
|
-
"delayMaxMs",
|
|
3259
3165
|
"trace"
|
|
3260
3166
|
// For testing tool-trajectory evaluator
|
|
3261
3167
|
]);
|
|
@@ -3264,23 +3170,14 @@ var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3264
3170
|
"model",
|
|
3265
3171
|
"cwd",
|
|
3266
3172
|
"timeout_seconds",
|
|
3267
|
-
"timeoutSeconds",
|
|
3268
3173
|
"log_dir",
|
|
3269
|
-
"logDir",
|
|
3270
3174
|
"log_directory",
|
|
3271
|
-
"logDirectory",
|
|
3272
3175
|
"log_format",
|
|
3273
|
-
"logFormat",
|
|
3274
3176
|
"log_output_format",
|
|
3275
|
-
"logOutputFormat",
|
|
3276
3177
|
"system_prompt",
|
|
3277
|
-
"systemPrompt",
|
|
3278
3178
|
"workspace_template",
|
|
3279
|
-
"workspaceTemplate",
|
|
3280
3179
|
"max_turns",
|
|
3281
|
-
"
|
|
3282
|
-
"max_budget_usd",
|
|
3283
|
-
"maxBudgetUsd"
|
|
3180
|
+
"max_budget_usd"
|
|
3284
3181
|
]);
|
|
3285
3182
|
function getKnownSettings(provider) {
|
|
3286
3183
|
const normalizedProvider = provider.toLowerCase();
|
|
@@ -3405,15 +3302,15 @@ async function validateTargetsFile(filePath) {
|
|
|
3405
3302
|
});
|
|
3406
3303
|
return;
|
|
3407
3304
|
}
|
|
3408
|
-
const timeoutSeconds = healthcheck.timeout_seconds
|
|
3305
|
+
const timeoutSeconds = healthcheck.timeout_seconds;
|
|
3409
3306
|
if (timeoutSeconds !== void 0) {
|
|
3410
3307
|
const numericTimeout = Number(timeoutSeconds);
|
|
3411
3308
|
if (!Number.isFinite(numericTimeout) || numericTimeout <= 0) {
|
|
3412
3309
|
errors2.push({
|
|
3413
3310
|
severity: "error",
|
|
3414
3311
|
filePath: absolutePath2,
|
|
3415
|
-
location: `${location}.
|
|
3416
|
-
message: "healthcheck.
|
|
3312
|
+
location: `${location}.timeout_seconds`,
|
|
3313
|
+
message: "healthcheck.timeout_seconds must be a positive number when provided"
|
|
3417
3314
|
});
|
|
3418
3315
|
}
|
|
3419
3316
|
}
|
|
@@ -3512,6 +3409,18 @@ async function validateTargetsFile(filePath) {
|
|
|
3512
3409
|
});
|
|
3513
3410
|
continue;
|
|
3514
3411
|
}
|
|
3412
|
+
for (const warning of findDeprecatedCamelCaseTargetWarnings(target, location)) {
|
|
3413
|
+
const fieldMatch = warning.message.match(/field '([^']+)'/);
|
|
3414
|
+
const replacementMatch = warning.message.match(/Use '([^']+)' instead/);
|
|
3415
|
+
const field = fieldMatch?.[1] ?? "unknown";
|
|
3416
|
+
const replacement = replacementMatch?.[1] ?? "snake_case";
|
|
3417
|
+
errors.push({
|
|
3418
|
+
severity: "error",
|
|
3419
|
+
filePath: absolutePath,
|
|
3420
|
+
location: warning.location,
|
|
3421
|
+
message: `camelCase field '${field}' is no longer supported in targets.yaml. Use '${replacement}' instead.`
|
|
3422
|
+
});
|
|
3423
|
+
}
|
|
3515
3424
|
const name = target.name;
|
|
3516
3425
|
if (typeof name !== "string" || name.trim().length === 0) {
|
|
3517
3426
|
errors.push({
|
|
@@ -3891,7 +3800,9 @@ Errors in ${targetsFilePath}:`);
|
|
|
3891
3800
|
};
|
|
3892
3801
|
}
|
|
3893
3802
|
try {
|
|
3894
|
-
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
|
|
3803
|
+
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath, {
|
|
3804
|
+
emitDeprecationWarnings: false
|
|
3805
|
+
});
|
|
3895
3806
|
return {
|
|
3896
3807
|
definitions,
|
|
3897
3808
|
resolvedTarget,
|
|
@@ -3974,7 +3885,9 @@ Errors in ${targetsFilePath}:`);
|
|
|
3974
3885
|
});
|
|
3975
3886
|
} else {
|
|
3976
3887
|
try {
|
|
3977
|
-
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath);
|
|
3888
|
+
const resolvedTarget = resolveTargetDefinition(targetDefinition, env, testFilePath, {
|
|
3889
|
+
emitDeprecationWarnings: false
|
|
3890
|
+
});
|
|
3978
3891
|
results.push({
|
|
3979
3892
|
definitions,
|
|
3980
3893
|
resolvedTarget,
|
|
@@ -4043,6 +3956,16 @@ function normalizeStringArray(value) {
|
|
|
4043
3956
|
}
|
|
4044
3957
|
return [];
|
|
4045
3958
|
}
|
|
3959
|
+
function normalizeFilter(value) {
|
|
3960
|
+
if (Array.isArray(value)) {
|
|
3961
|
+
const filters = normalizeStringArray(value);
|
|
3962
|
+
if (filters.length === 0) {
|
|
3963
|
+
return void 0;
|
|
3964
|
+
}
|
|
3965
|
+
return filters.length === 1 ? filters[0] : filters;
|
|
3966
|
+
}
|
|
3967
|
+
return normalizeString(value);
|
|
3968
|
+
}
|
|
4046
3969
|
function matchesTagFilters(fileTags, includeTags, excludeTags) {
|
|
4047
3970
|
const tags = new Set(fileTags ?? []);
|
|
4048
3971
|
if (includeTags.length > 0) {
|
|
@@ -4084,15 +4007,12 @@ function trimOutputMessages(output, outputMessages) {
|
|
|
4084
4007
|
return sliced.map((m) => ({ role: m.role, content: m.content }));
|
|
4085
4008
|
}
|
|
4086
4009
|
function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
4087
|
-
const cliFormat = normalizeString(rawOptions.outputFormat);
|
|
4088
|
-
const configFormat = config?.output?.format;
|
|
4089
|
-
const formatStr = cliFormat ?? configFormat ?? "jsonl";
|
|
4090
|
-
const format = formatStr === "yaml" ? "yaml" : "jsonl";
|
|
4091
4010
|
const cliWorkers = normalizeOptionalNumber(rawOptions.workers);
|
|
4092
4011
|
const configWorkers = config?.execution?.workers;
|
|
4093
4012
|
const workers = cliWorkers ?? configWorkers ?? 0;
|
|
4094
|
-
const
|
|
4095
|
-
const
|
|
4013
|
+
const cliOutputDir = normalizeString(rawOptions.output);
|
|
4014
|
+
const rawExportPaths = rawOptions.export;
|
|
4015
|
+
const exportPaths = Array.isArray(rawExportPaths) ? rawExportPaths.filter((v) => typeof v === "string" && v.trim().length > 0) : [];
|
|
4096
4016
|
const rawTarget = rawOptions.target;
|
|
4097
4017
|
let cliTargets = [];
|
|
4098
4018
|
let singleTarget;
|
|
@@ -4132,11 +4052,11 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4132
4052
|
target: singleTarget,
|
|
4133
4053
|
cliTargets,
|
|
4134
4054
|
targetsPath: normalizeString(rawOptions.targets),
|
|
4135
|
-
filter:
|
|
4055
|
+
filter: normalizeFilter(rawOptions.filter),
|
|
4136
4056
|
workers: workers > 0 ? workers : void 0,
|
|
4057
|
+
outputDir: cliOutputDir,
|
|
4137
4058
|
outPath: cliOut ?? configOut,
|
|
4138
|
-
|
|
4139
|
-
format,
|
|
4059
|
+
exportPaths,
|
|
4140
4060
|
dryRun: normalizeBoolean(rawOptions.dryRun),
|
|
4141
4061
|
dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0),
|
|
4142
4062
|
dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0),
|
|
@@ -4165,7 +4085,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4165
4085
|
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
|
|
4166
4086
|
threshold: normalizeOptionalNumber(rawOptions.threshold),
|
|
4167
4087
|
tags: normalizeStringArray(rawOptions.tag),
|
|
4168
|
-
excludeTags: normalizeStringArray(rawOptions.excludeTag)
|
|
4088
|
+
excludeTags: normalizeStringArray(rawOptions.excludeTag),
|
|
4089
|
+
transcript: normalizeString(rawOptions.transcript)
|
|
4169
4090
|
};
|
|
4170
4091
|
}
|
|
4171
4092
|
async function ensureFileExists(filePath, description) {
|
|
@@ -4191,20 +4112,20 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
4191
4112
|
addLogPaths: (paths, provider) => display.addLogPaths(paths, provider)
|
|
4192
4113
|
};
|
|
4193
4114
|
}
|
|
4194
|
-
function
|
|
4195
|
-
return `${path15.resolve(testFilePath)}::${
|
|
4115
|
+
function makeTestCaseKey(testFilePath, testId) {
|
|
4116
|
+
return `${path15.resolve(testFilePath)}::${testId}`;
|
|
4196
4117
|
}
|
|
4197
4118
|
function createDisplayIdTracker() {
|
|
4198
4119
|
const map = /* @__PURE__ */ new Map();
|
|
4199
4120
|
let nextId = 1;
|
|
4200
4121
|
return {
|
|
4201
|
-
getOrAssign(
|
|
4202
|
-
const existing = map.get(
|
|
4122
|
+
getOrAssign(testCaseKey) {
|
|
4123
|
+
const existing = map.get(testCaseKey);
|
|
4203
4124
|
if (existing !== void 0) {
|
|
4204
4125
|
return existing;
|
|
4205
4126
|
}
|
|
4206
4127
|
const assigned = nextId++;
|
|
4207
|
-
map.set(
|
|
4128
|
+
map.set(testCaseKey, assigned);
|
|
4208
4129
|
return assigned;
|
|
4209
4130
|
}
|
|
4210
4131
|
};
|
|
@@ -4255,58 +4176,79 @@ async function prepareFileMetadata(params) {
|
|
|
4255
4176
|
filter: options.filter,
|
|
4256
4177
|
category
|
|
4257
4178
|
});
|
|
4258
|
-
const
|
|
4259
|
-
const cliTargets = options.cliTargets;
|
|
4179
|
+
const testIds = suite.tests.map((value) => value.id);
|
|
4260
4180
|
const suiteTargets = suite.targets;
|
|
4261
|
-
let targetNames;
|
|
4262
|
-
if (cliTargets.length > 0) {
|
|
4263
|
-
targetNames = cliTargets;
|
|
4264
|
-
} else if (suiteTargets && suiteTargets.length > 0) {
|
|
4265
|
-
targetNames = suiteTargets;
|
|
4266
|
-
} else {
|
|
4267
|
-
targetNames = [];
|
|
4268
|
-
}
|
|
4269
4181
|
let selections;
|
|
4270
|
-
if (
|
|
4271
|
-
const
|
|
4272
|
-
|
|
4273
|
-
|
|
4274
|
-
|
|
4275
|
-
|
|
4276
|
-
|
|
4277
|
-
|
|
4278
|
-
|
|
4279
|
-
|
|
4280
|
-
|
|
4281
|
-
|
|
4282
|
-
});
|
|
4283
|
-
selections = multiSelections.map((sel) => ({
|
|
4284
|
-
selection: sel,
|
|
4285
|
-
inlineTargetLabel: sel.targetName
|
|
4286
|
-
}));
|
|
4287
|
-
} else {
|
|
4288
|
-
const selection = await selectTarget({
|
|
4289
|
-
testFilePath,
|
|
4290
|
-
repoRoot,
|
|
4291
|
-
cwd,
|
|
4292
|
-
explicitTargetsPath: options.targetsPath,
|
|
4293
|
-
cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
|
|
4294
|
-
dryRun: options.dryRun,
|
|
4295
|
-
dryRunDelay: options.dryRunDelay,
|
|
4296
|
-
dryRunDelayMin: options.dryRunDelayMin,
|
|
4297
|
-
dryRunDelayMax: options.dryRunDelayMax,
|
|
4298
|
-
env: process.env
|
|
4299
|
-
});
|
|
4182
|
+
if (options.transcript) {
|
|
4183
|
+
const transcriptSelection = {
|
|
4184
|
+
definitions: [],
|
|
4185
|
+
resolvedTarget: {
|
|
4186
|
+
kind: "transcript",
|
|
4187
|
+
name: "transcript",
|
|
4188
|
+
config: {}
|
|
4189
|
+
},
|
|
4190
|
+
targetName: "transcript",
|
|
4191
|
+
targetSource: "cli",
|
|
4192
|
+
targetsFilePath: options.transcript
|
|
4193
|
+
};
|
|
4300
4194
|
selections = [
|
|
4301
4195
|
{
|
|
4302
|
-
selection,
|
|
4303
|
-
inlineTargetLabel:
|
|
4196
|
+
selection: transcriptSelection,
|
|
4197
|
+
inlineTargetLabel: `transcript (${path15.basename(options.transcript)})`
|
|
4304
4198
|
}
|
|
4305
4199
|
];
|
|
4200
|
+
} else {
|
|
4201
|
+
const cliTargets = options.cliTargets;
|
|
4202
|
+
const suiteTargets2 = suite.targets;
|
|
4203
|
+
let targetNames;
|
|
4204
|
+
if (cliTargets.length > 0) {
|
|
4205
|
+
targetNames = cliTargets;
|
|
4206
|
+
} else if (suiteTargets2 && suiteTargets2.length > 0) {
|
|
4207
|
+
targetNames = suiteTargets2;
|
|
4208
|
+
} else {
|
|
4209
|
+
targetNames = [];
|
|
4210
|
+
}
|
|
4211
|
+
if (targetNames.length > 1) {
|
|
4212
|
+
const multiSelections = await selectMultipleTargets({
|
|
4213
|
+
testFilePath,
|
|
4214
|
+
repoRoot,
|
|
4215
|
+
cwd,
|
|
4216
|
+
explicitTargetsPath: options.targetsPath,
|
|
4217
|
+
dryRun: options.dryRun,
|
|
4218
|
+
dryRunDelay: options.dryRunDelay,
|
|
4219
|
+
dryRunDelayMin: options.dryRunDelayMin,
|
|
4220
|
+
dryRunDelayMax: options.dryRunDelayMax,
|
|
4221
|
+
env: process.env,
|
|
4222
|
+
targetNames
|
|
4223
|
+
});
|
|
4224
|
+
selections = multiSelections.map((sel) => ({
|
|
4225
|
+
selection: sel,
|
|
4226
|
+
inlineTargetLabel: sel.targetName
|
|
4227
|
+
}));
|
|
4228
|
+
} else {
|
|
4229
|
+
const selection = await selectTarget({
|
|
4230
|
+
testFilePath,
|
|
4231
|
+
repoRoot,
|
|
4232
|
+
cwd,
|
|
4233
|
+
explicitTargetsPath: options.targetsPath,
|
|
4234
|
+
cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
|
|
4235
|
+
dryRun: options.dryRun,
|
|
4236
|
+
dryRunDelay: options.dryRunDelay,
|
|
4237
|
+
dryRunDelayMin: options.dryRunDelayMin,
|
|
4238
|
+
dryRunDelayMax: options.dryRunDelayMax,
|
|
4239
|
+
env: process.env
|
|
4240
|
+
});
|
|
4241
|
+
selections = [
|
|
4242
|
+
{
|
|
4243
|
+
selection,
|
|
4244
|
+
inlineTargetLabel: selection.targetName
|
|
4245
|
+
}
|
|
4246
|
+
];
|
|
4247
|
+
}
|
|
4306
4248
|
}
|
|
4307
4249
|
return {
|
|
4308
|
-
|
|
4309
|
-
|
|
4250
|
+
testIds,
|
|
4251
|
+
testCases: suite.tests,
|
|
4310
4252
|
selections,
|
|
4311
4253
|
trialsConfig: suite.trials,
|
|
4312
4254
|
suiteTargets,
|
|
@@ -4344,15 +4286,16 @@ async function runSingleEvalFile(params) {
|
|
|
4344
4286
|
workersOverride,
|
|
4345
4287
|
yamlWorkers,
|
|
4346
4288
|
progressReporter,
|
|
4347
|
-
|
|
4289
|
+
seenTestCases,
|
|
4348
4290
|
displayIdTracker,
|
|
4349
4291
|
selection,
|
|
4350
4292
|
inlineTargetLabel,
|
|
4351
|
-
|
|
4293
|
+
testCases,
|
|
4352
4294
|
trialsConfig,
|
|
4353
4295
|
matrixMode,
|
|
4354
4296
|
totalBudgetUsd,
|
|
4355
|
-
failOnError
|
|
4297
|
+
failOnError,
|
|
4298
|
+
providerFactory
|
|
4356
4299
|
} = params;
|
|
4357
4300
|
const targetName = selection.targetName;
|
|
4358
4301
|
await ensureFileExists(testFilePath, "Test file");
|
|
@@ -4408,7 +4351,8 @@ async function runSingleEvalFile(params) {
|
|
|
4408
4351
|
}
|
|
4409
4352
|
return true;
|
|
4410
4353
|
})(),
|
|
4411
|
-
|
|
4354
|
+
filter: options.filter,
|
|
4355
|
+
evalCases: testCases,
|
|
4412
4356
|
verbose: options.verbose,
|
|
4413
4357
|
maxConcurrency: resolvedWorkers,
|
|
4414
4358
|
workspaceMode: options.workspaceMode,
|
|
@@ -4419,6 +4363,7 @@ async function runSingleEvalFile(params) {
|
|
|
4419
4363
|
graderTarget: options.graderTarget,
|
|
4420
4364
|
model: options.model,
|
|
4421
4365
|
threshold: options.threshold,
|
|
4366
|
+
providerFactory,
|
|
4422
4367
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
4423
4368
|
onResult: async (result) => {
|
|
4424
4369
|
streamingObserver?.completeFromResult?.(result);
|
|
@@ -4442,13 +4387,13 @@ async function runSingleEvalFile(params) {
|
|
|
4442
4387
|
}
|
|
4443
4388
|
},
|
|
4444
4389
|
onProgress: async (event) => {
|
|
4445
|
-
const
|
|
4446
|
-
const
|
|
4447
|
-
if (event.status === "pending" && !
|
|
4448
|
-
|
|
4449
|
-
progressReporter.setTotal(
|
|
4390
|
+
const testCaseKeyId = matrixMode ? `${event.testId}@${targetName}` : event.testId;
|
|
4391
|
+
const testCaseKey = makeTestCaseKey(testFilePath, testCaseKeyId);
|
|
4392
|
+
if (event.status === "pending" && !seenTestCases.has(testCaseKey)) {
|
|
4393
|
+
seenTestCases.add(testCaseKey);
|
|
4394
|
+
progressReporter.setTotal(seenTestCases.size);
|
|
4450
4395
|
}
|
|
4451
|
-
const displayId = displayIdTracker.getOrAssign(
|
|
4396
|
+
const displayId = displayIdTracker.getOrAssign(testCaseKey);
|
|
4452
4397
|
if (event.status === "running" && streamingObserver) {
|
|
4453
4398
|
streamingObserver.startEvalCase(event.testId, targetName, testFilePath);
|
|
4454
4399
|
}
|
|
@@ -4528,13 +4473,48 @@ async function runEvalCommand(input) {
|
|
|
4528
4473
|
if (options.verbose) {
|
|
4529
4474
|
console.log(`Repository root: ${repoRoot}`);
|
|
4530
4475
|
}
|
|
4531
|
-
|
|
4532
|
-
|
|
4476
|
+
if (options.outPath) {
|
|
4477
|
+
console.warn("Warning: --out is deprecated. Use --output <dir> to set the artifact directory.");
|
|
4478
|
+
}
|
|
4479
|
+
if (options.artifacts) {
|
|
4480
|
+
console.warn(
|
|
4481
|
+
"Warning: --artifacts is deprecated. Use --output <dir> to set the artifact directory."
|
|
4482
|
+
);
|
|
4483
|
+
}
|
|
4484
|
+
if (options.benchmarkJson) {
|
|
4485
|
+
console.warn(
|
|
4486
|
+
"Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory."
|
|
4487
|
+
);
|
|
4488
|
+
}
|
|
4489
|
+
if (normalizeString(input.rawOptions.outputFormat)) {
|
|
4490
|
+
console.warn(
|
|
4491
|
+
"Warning: --output-format is deprecated. The artifact directory always uses JSONL."
|
|
4492
|
+
);
|
|
4493
|
+
}
|
|
4494
|
+
const explicitDir = options.outputDir ?? options.artifacts;
|
|
4495
|
+
let runDir;
|
|
4496
|
+
let outputPath;
|
|
4497
|
+
let usesDefaultArtifactWorkspace;
|
|
4498
|
+
if (explicitDir) {
|
|
4499
|
+
runDir = path15.resolve(explicitDir);
|
|
4500
|
+
mkdirSync(runDir, { recursive: true });
|
|
4501
|
+
outputPath = path15.join(runDir, "index.jsonl");
|
|
4502
|
+
usesDefaultArtifactWorkspace = true;
|
|
4503
|
+
} else if (options.outPath) {
|
|
4504
|
+
outputPath = path15.resolve(options.outPath);
|
|
4505
|
+
runDir = path15.dirname(outputPath);
|
|
4506
|
+
mkdirSync(runDir, { recursive: true });
|
|
4507
|
+
usesDefaultArtifactWorkspace = false;
|
|
4508
|
+
} else {
|
|
4509
|
+
outputPath = buildDefaultOutputPath(cwd);
|
|
4510
|
+
runDir = path15.dirname(outputPath);
|
|
4511
|
+
usesDefaultArtifactWorkspace = true;
|
|
4512
|
+
}
|
|
4533
4513
|
let otelExporter = null;
|
|
4534
4514
|
const useFileExport = !!options.otelFile;
|
|
4535
4515
|
if (options.exportOtel || useFileExport) {
|
|
4536
4516
|
try {
|
|
4537
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
4517
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-QXVR2ZRH.js");
|
|
4538
4518
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4539
4519
|
let headers = {};
|
|
4540
4520
|
if (options.otelBackend) {
|
|
@@ -4575,16 +4555,11 @@ async function runEvalCommand(input) {
|
|
|
4575
4555
|
}
|
|
4576
4556
|
}
|
|
4577
4557
|
const primaryWritePath = outputPath;
|
|
4578
|
-
const
|
|
4579
|
-
|
|
4580
|
-
|
|
4581
|
-
|
|
4582
|
-
|
|
4583
|
-
if (uniqueOutputPaths.length === 1) {
|
|
4584
|
-
console.log(`Output path: ${outputPath}`);
|
|
4585
|
-
} else {
|
|
4586
|
-
console.log("Output paths:");
|
|
4587
|
-
for (const p of uniqueReportedOutputPaths) {
|
|
4558
|
+
const resolvedExportPaths = options.exportPaths.map((p) => path15.resolve(p));
|
|
4559
|
+
console.log(`Artifact directory: ${runDir}`);
|
|
4560
|
+
if (resolvedExportPaths.length > 0) {
|
|
4561
|
+
console.log("Export files:");
|
|
4562
|
+
for (const p of resolvedExportPaths) {
|
|
4588
4563
|
console.log(` ${p}`);
|
|
4589
4564
|
}
|
|
4590
4565
|
}
|
|
@@ -4594,7 +4569,7 @@ async function runEvalCommand(input) {
|
|
|
4594
4569
|
}
|
|
4595
4570
|
const evaluationRunner = await resolveEvaluationRunner();
|
|
4596
4571
|
const allResults = [];
|
|
4597
|
-
const
|
|
4572
|
+
const seenTestCases = /* @__PURE__ */ new Set();
|
|
4598
4573
|
const displayIdTracker = createDisplayIdTracker();
|
|
4599
4574
|
const totalWorkers = options.workers ?? DEFAULT_WORKERS;
|
|
4600
4575
|
const fileConcurrency = Math.min(
|
|
@@ -4656,7 +4631,6 @@ async function runEvalCommand(input) {
|
|
|
4656
4631
|
yamlCache: yamlCacheEnabled
|
|
4657
4632
|
});
|
|
4658
4633
|
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path15.resolve(yamlCachePath) : void 0) : void 0;
|
|
4659
|
-
const useCache = cacheEnabled;
|
|
4660
4634
|
if (cacheEnabled) {
|
|
4661
4635
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
4662
4636
|
}
|
|
@@ -4666,17 +4640,12 @@ async function runEvalCommand(input) {
|
|
|
4666
4640
|
throw new Error("--threshold must be between 0 and 1");
|
|
4667
4641
|
}
|
|
4668
4642
|
const writerOptions = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
4669
|
-
|
|
4670
|
-
if (uniqueOutputPaths.length === 1) {
|
|
4671
|
-
outputWriter = await createOutputWriter(primaryWritePath, options.format);
|
|
4672
|
-
} else {
|
|
4673
|
-
outputWriter = await createMultiWriter(uniqueOutputPaths, writerOptions);
|
|
4674
|
-
}
|
|
4643
|
+
const outputWriter = await createOutputWriter(primaryWritePath, "jsonl");
|
|
4675
4644
|
const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1);
|
|
4676
4645
|
let totalEvalCount = 0;
|
|
4677
4646
|
for (const meta of fileMetadata.values()) {
|
|
4678
4647
|
const suiteTargetNames = meta.selections.map((s) => s.selection.targetName);
|
|
4679
|
-
for (const test of meta.
|
|
4648
|
+
for (const test of meta.testCases) {
|
|
4680
4649
|
const testTargetNames = test.targets && test.targets.length > 0 ? test.targets.filter((t) => suiteTargetNames.includes(t)) : suiteTargetNames;
|
|
4681
4650
|
totalEvalCount += testTargetNames.length > 0 ? testTargetNames.length : 1;
|
|
4682
4651
|
}
|
|
@@ -4720,13 +4689,13 @@ async function runEvalCommand(input) {
|
|
|
4720
4689
|
});
|
|
4721
4690
|
for (const [testFilePath, meta] of fileMetadata.entries()) {
|
|
4722
4691
|
for (const { selection, inlineTargetLabel } of meta.selections) {
|
|
4723
|
-
for (const testId of meta.
|
|
4724
|
-
const
|
|
4692
|
+
for (const testId of meta.testIds) {
|
|
4693
|
+
const testCaseKey = makeTestCaseKey(
|
|
4725
4694
|
testFilePath,
|
|
4726
4695
|
meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId
|
|
4727
4696
|
);
|
|
4728
|
-
|
|
4729
|
-
const displayId = displayIdTracker.getOrAssign(
|
|
4697
|
+
seenTestCases.add(testCaseKey);
|
|
4698
|
+
const displayId = displayIdTracker.getOrAssign(testCaseKey);
|
|
4730
4699
|
progressReporter.update(displayId, {
|
|
4731
4700
|
workerId: displayId,
|
|
4732
4701
|
testId: meta.selections.length > 1 ? `${testId}@${selection.targetName}` : testId,
|
|
@@ -4737,6 +4706,24 @@ async function runEvalCommand(input) {
|
|
|
4737
4706
|
}
|
|
4738
4707
|
}
|
|
4739
4708
|
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
|
|
4709
|
+
let transcriptProviderFactory;
|
|
4710
|
+
if (options.transcript) {
|
|
4711
|
+
const { TranscriptProvider } = await import("./dist-QXVR2ZRH.js");
|
|
4712
|
+
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
|
|
4713
|
+
const totalTests = [...fileMetadata.values()].reduce(
|
|
4714
|
+
(sum, meta) => sum + meta.testCases.length,
|
|
4715
|
+
0
|
|
4716
|
+
);
|
|
4717
|
+
if (transcriptProvider.lineCount !== totalTests) {
|
|
4718
|
+
throw new Error(
|
|
4719
|
+
`Transcript has ${transcriptProvider.lineCount} entry(s) but eval defines ${totalTests} test(s). Each transcript line maps positionally to one test case.`
|
|
4720
|
+
);
|
|
4721
|
+
}
|
|
4722
|
+
transcriptProviderFactory = () => transcriptProvider;
|
|
4723
|
+
console.log(
|
|
4724
|
+
`Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`
|
|
4725
|
+
);
|
|
4726
|
+
}
|
|
4740
4727
|
try {
|
|
4741
4728
|
await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
|
|
4742
4729
|
const targetPrep = fileMetadata.get(testFilePath);
|
|
@@ -4746,13 +4733,13 @@ async function runEvalCommand(input) {
|
|
|
4746
4733
|
const targetResults = await Promise.all(
|
|
4747
4734
|
targetPrep.selections.map(async ({ selection, inlineTargetLabel }) => {
|
|
4748
4735
|
const targetName = selection.targetName;
|
|
4749
|
-
const
|
|
4736
|
+
const applicableTestCases = targetPrep.selections.length > 1 ? targetPrep.testCases.filter((test) => {
|
|
4750
4737
|
if (test.targets && test.targets.length > 0) {
|
|
4751
4738
|
return test.targets.includes(targetName);
|
|
4752
4739
|
}
|
|
4753
4740
|
return true;
|
|
4754
|
-
}) : targetPrep.
|
|
4755
|
-
if (
|
|
4741
|
+
}) : targetPrep.testCases;
|
|
4742
|
+
if (applicableTestCases.length === 0) {
|
|
4756
4743
|
return [];
|
|
4757
4744
|
}
|
|
4758
4745
|
try {
|
|
@@ -4768,16 +4755,17 @@ async function runEvalCommand(input) {
|
|
|
4768
4755
|
workersOverride: perFileWorkers,
|
|
4769
4756
|
yamlWorkers: targetPrep.yamlWorkers,
|
|
4770
4757
|
progressReporter,
|
|
4771
|
-
|
|
4758
|
+
seenTestCases,
|
|
4772
4759
|
displayIdTracker,
|
|
4773
4760
|
selection,
|
|
4774
4761
|
inlineTargetLabel,
|
|
4775
|
-
|
|
4776
|
-
trialsConfig: targetPrep.trialsConfig,
|
|
4762
|
+
testCases: applicableTestCases,
|
|
4763
|
+
trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
|
|
4777
4764
|
matrixMode: targetPrep.selections.length > 1,
|
|
4778
4765
|
totalBudgetUsd: targetPrep.totalBudgetUsd,
|
|
4779
4766
|
failOnError: targetPrep.failOnError,
|
|
4780
|
-
threshold: resolvedThreshold
|
|
4767
|
+
threshold: resolvedThreshold,
|
|
4768
|
+
providerFactory: transcriptProviderFactory
|
|
4781
4769
|
});
|
|
4782
4770
|
return result.results;
|
|
4783
4771
|
} catch (fileError) {
|
|
@@ -4785,9 +4773,9 @@ async function runEvalCommand(input) {
|
|
|
4785
4773
|
console.error(`
|
|
4786
4774
|
\u26A0 Eval file failed: ${path15.basename(testFilePath)} \u2014 ${message}
|
|
4787
4775
|
`);
|
|
4788
|
-
const errorResults =
|
|
4776
|
+
const errorResults = applicableTestCases.map((testCase) => ({
|
|
4789
4777
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4790
|
-
testId:
|
|
4778
|
+
testId: testCase.id,
|
|
4791
4779
|
score: 0,
|
|
4792
4780
|
assertions: [],
|
|
4793
4781
|
output: [],
|
|
@@ -4824,6 +4812,7 @@ async function runEvalCommand(input) {
|
|
|
4824
4812
|
const thresholdOpts = resolvedThreshold !== void 0 ? { threshold: resolvedThreshold } : void 0;
|
|
4825
4813
|
const summary = calculateEvaluationSummary(allResults, thresholdOpts);
|
|
4826
4814
|
console.log(formatEvaluationSummary(summary, thresholdOpts));
|
|
4815
|
+
const allExecutionErrors = summary.total > 0 && summary.executionErrorCount === summary.total;
|
|
4827
4816
|
const thresholdFailed = resolvedThreshold !== void 0 && summary.qualityFailureCount > 0;
|
|
4828
4817
|
if (isMatrixMode && allResults.length > 0) {
|
|
4829
4818
|
console.log(formatMatrixSummary(allResults));
|
|
@@ -4833,18 +4822,17 @@ async function runEvalCommand(input) {
|
|
|
4833
4822
|
await writeBenchmarkJson(benchmarkPath, allResults);
|
|
4834
4823
|
console.log(`Benchmark written to: ${benchmarkPath}`);
|
|
4835
4824
|
}
|
|
4836
|
-
if (usesDefaultArtifactWorkspace) {
|
|
4825
|
+
if (usesDefaultArtifactWorkspace && allResults.length > 0) {
|
|
4837
4826
|
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
|
|
4838
|
-
const workspaceDir = path15.dirname(outputPath);
|
|
4839
4827
|
const {
|
|
4840
4828
|
testArtifactDir,
|
|
4841
4829
|
timingPath,
|
|
4842
4830
|
benchmarkPath: workspaceBenchmarkPath,
|
|
4843
4831
|
indexPath
|
|
4844
|
-
} = await writeArtifactsFromResults(allResults,
|
|
4832
|
+
} = await writeArtifactsFromResults(allResults, runDir, {
|
|
4845
4833
|
evalFile
|
|
4846
4834
|
});
|
|
4847
|
-
console.log(`Artifact workspace written to: ${
|
|
4835
|
+
console.log(`Artifact workspace written to: ${runDir}`);
|
|
4848
4836
|
console.log(` Index: ${indexPath}`);
|
|
4849
4837
|
console.log(
|
|
4850
4838
|
` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`
|
|
@@ -4852,24 +4840,17 @@ async function runEvalCommand(input) {
|
|
|
4852
4840
|
console.log(` Timing: ${timingPath}`);
|
|
4853
4841
|
console.log(` Benchmark: ${workspaceBenchmarkPath}`);
|
|
4854
4842
|
}
|
|
4855
|
-
if (
|
|
4856
|
-
const
|
|
4857
|
-
|
|
4858
|
-
|
|
4859
|
-
|
|
4860
|
-
|
|
4861
|
-
|
|
4862
|
-
|
|
4863
|
-
} = await writeArtifactsFromResults(allResults, artifactsDir, {
|
|
4864
|
-
evalFile
|
|
4865
|
-
});
|
|
4866
|
-
console.log(`Artifacts written to: ${artifactsDir}`);
|
|
4867
|
-
console.log(` Index: ${indexPath}`);
|
|
4843
|
+
if (resolvedExportPaths.length > 0 && allResults.length > 0) {
|
|
4844
|
+
for (const exportPath of resolvedExportPaths) {
|
|
4845
|
+
const writer = await createWriterFromPath(exportPath, writerOptions);
|
|
4846
|
+
for (const result of allResults) {
|
|
4847
|
+
await writer.append(result);
|
|
4848
|
+
}
|
|
4849
|
+
await writer.close();
|
|
4850
|
+
}
|
|
4868
4851
|
console.log(
|
|
4869
|
-
`
|
|
4852
|
+
`Export file(s) written: ${resolvedExportPaths.map((p) => path15.relative(cwd, p)).join(", ")}`
|
|
4870
4853
|
);
|
|
4871
|
-
console.log(` Timing: ${timingPath}`);
|
|
4872
|
-
console.log(` Benchmark: ${abp}`);
|
|
4873
4854
|
}
|
|
4874
4855
|
const failedWithWorkspaces = allResults.filter(
|
|
4875
4856
|
(r) => r.workspacePath && (r.error || r.score < 0.5)
|
|
@@ -4881,15 +4862,8 @@ async function runEvalCommand(input) {
|
|
|
4881
4862
|
}
|
|
4882
4863
|
}
|
|
4883
4864
|
if (allResults.length > 0) {
|
|
4884
|
-
|
|
4885
|
-
console.log(`
|
|
4865
|
+
console.log(`
|
|
4886
4866
|
Results written to: ${outputPath}`);
|
|
4887
|
-
} else {
|
|
4888
|
-
console.log("\nResults written to:");
|
|
4889
|
-
for (const p of uniqueReportedOutputPaths) {
|
|
4890
|
-
console.log(` ${p}`);
|
|
4891
|
-
}
|
|
4892
|
-
}
|
|
4893
4867
|
await saveRunCache(cwd, outputPath).catch(() => void 0);
|
|
4894
4868
|
}
|
|
4895
4869
|
if (summary.executionErrorCount > 0 && !options.retryErrors) {
|
|
@@ -4907,7 +4881,8 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
4907
4881
|
outputPath,
|
|
4908
4882
|
testFiles: activeTestFiles,
|
|
4909
4883
|
target: options.target,
|
|
4910
|
-
thresholdFailed
|
|
4884
|
+
thresholdFailed,
|
|
4885
|
+
allExecutionErrors
|
|
4911
4886
|
};
|
|
4912
4887
|
} finally {
|
|
4913
4888
|
unsubscribeCodexLogs();
|
|
@@ -4940,6 +4915,43 @@ async function resolveEvaluationRunner() {
|
|
|
4940
4915
|
return candidate;
|
|
4941
4916
|
}
|
|
4942
4917
|
|
|
4918
|
+
// src/commands/eval/discover.ts
|
|
4919
|
+
import path16 from "node:path";
|
|
4920
|
+
import fg2 from "fast-glob";
|
|
4921
|
+
async function discoverEvalFiles(cwd) {
|
|
4922
|
+
const repoRoot = await findRepoRoot(cwd);
|
|
4923
|
+
const config = await loadConfig(path16.join(cwd, "_"), repoRoot);
|
|
4924
|
+
const patterns = config?.eval_patterns && config.eval_patterns.length > 0 ? config.eval_patterns : DEFAULT_EVAL_PATTERNS;
|
|
4925
|
+
const ignore = ["**/node_modules/**", "**/dist/**"];
|
|
4926
|
+
const matches = await fg2(patterns, {
|
|
4927
|
+
cwd,
|
|
4928
|
+
absolute: true,
|
|
4929
|
+
onlyFiles: true,
|
|
4930
|
+
ignore,
|
|
4931
|
+
followSymbolicLinks: true,
|
|
4932
|
+
caseSensitiveMatch: false
|
|
4933
|
+
});
|
|
4934
|
+
const evalFiles = matches.map((absPath) => {
|
|
4935
|
+
const relativePath = path16.relative(cwd, absPath);
|
|
4936
|
+
const category = deriveCategory(relativePath);
|
|
4937
|
+
return { path: absPath, relativePath, category };
|
|
4938
|
+
});
|
|
4939
|
+
evalFiles.sort((a, b) => a.relativePath.localeCompare(b.relativePath));
|
|
4940
|
+
return evalFiles;
|
|
4941
|
+
}
|
|
4942
|
+
function getCategories(files) {
|
|
4943
|
+
const categories = /* @__PURE__ */ new Set();
|
|
4944
|
+
for (const file of files) {
|
|
4945
|
+
categories.add(file.category);
|
|
4946
|
+
}
|
|
4947
|
+
const sorted = Array.from(categories);
|
|
4948
|
+
sorted.sort();
|
|
4949
|
+
return sorted;
|
|
4950
|
+
}
|
|
4951
|
+
function filterByCategory(files, category) {
|
|
4952
|
+
return files.filter((f) => f.category === category);
|
|
4953
|
+
}
|
|
4954
|
+
|
|
4943
4955
|
export {
|
|
4944
4956
|
package_default,
|
|
4945
4957
|
toSnakeCaseDeep,
|
|
@@ -4948,12 +4960,13 @@ export {
|
|
|
4948
4960
|
buildDefaultRunDir,
|
|
4949
4961
|
resolveExistingRunPrimaryPath,
|
|
4950
4962
|
resolveWorkspaceOrFilePath,
|
|
4951
|
-
|
|
4963
|
+
resolveRunManifestPath,
|
|
4952
4964
|
parseResultManifest,
|
|
4953
4965
|
resolveResultSourcePath,
|
|
4954
4966
|
loadManifestResults,
|
|
4955
4967
|
loadLightweightResults,
|
|
4956
4968
|
HtmlWriter,
|
|
4969
|
+
writeArtifactsFromResults,
|
|
4957
4970
|
resolveRunCacheFile,
|
|
4958
4971
|
loadRunCache,
|
|
4959
4972
|
resolveEvalPaths,
|
|
@@ -4966,6 +4979,9 @@ export {
|
|
|
4966
4979
|
TARGET_FILE_CANDIDATES,
|
|
4967
4980
|
fileExists,
|
|
4968
4981
|
selectTarget,
|
|
4969
|
-
runEvalCommand
|
|
4982
|
+
runEvalCommand,
|
|
4983
|
+
discoverEvalFiles,
|
|
4984
|
+
getCategories,
|
|
4985
|
+
filterByCategory
|
|
4970
4986
|
};
|
|
4971
|
-
//# sourceMappingURL=chunk-
|
|
4987
|
+
//# sourceMappingURL=chunk-QBZJSQXV.js.map
|