@agentv/core 1.3.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-4A6L2F6L.js → chunk-E2VSU4WZ.js} +282 -81
- package/dist/chunk-E2VSU4WZ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +82 -67
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -68
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1668 -489
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +279 -77
- package/dist/index.d.ts +279 -77
- package/dist/index.js +1334 -356
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
- package/dist/chunk-4A6L2F6L.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
buildDirectoryChain,
|
|
3
3
|
buildSearchRoots,
|
|
4
|
+
extractLastAssistantContent,
|
|
4
5
|
fileExists,
|
|
5
6
|
findGitRoot,
|
|
6
7
|
isAgentProvider,
|
|
@@ -9,7 +10,7 @@ import {
|
|
|
9
10
|
readTextFile,
|
|
10
11
|
resolveFileReference,
|
|
11
12
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-4A6L2F6L.js";
|
|
13
|
+
} from "./chunk-E2VSU4WZ.js";
|
|
13
14
|
|
|
14
15
|
// src/evaluation/types.ts
|
|
15
16
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -74,33 +75,69 @@ function getHitCount(result) {
|
|
|
74
75
|
}
|
|
75
76
|
|
|
76
77
|
// src/evaluation/trace.ts
|
|
77
|
-
function isTraceEventType(value) {
|
|
78
|
-
return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
|
|
79
|
-
}
|
|
80
|
-
function isTraceEvent(value) {
|
|
81
|
-
if (typeof value !== "object" || value === null) {
|
|
82
|
-
return false;
|
|
83
|
-
}
|
|
84
|
-
const candidate = value;
|
|
85
|
-
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
86
|
-
}
|
|
87
|
-
function computeTraceSummary(trace) {
|
|
78
|
+
function computeTraceSummary(messages) {
|
|
88
79
|
const toolCallCounts = {};
|
|
89
|
-
let errorCount = 0;
|
|
90
|
-
for (const
|
|
91
|
-
if (
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
errorCount++;
|
|
80
|
+
let totalToolCalls = 0;
|
|
81
|
+
for (const message of messages) {
|
|
82
|
+
if (!message.toolCalls) continue;
|
|
83
|
+
for (const toolCall of message.toolCalls) {
|
|
84
|
+
toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
|
|
85
|
+
totalToolCalls++;
|
|
96
86
|
}
|
|
97
87
|
}
|
|
98
88
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
99
89
|
return {
|
|
100
|
-
eventCount:
|
|
90
|
+
eventCount: totalToolCalls,
|
|
101
91
|
toolNames,
|
|
102
92
|
toolCallsByName: toolCallCounts,
|
|
103
|
-
errorCount
|
|
93
|
+
errorCount: 0
|
|
94
|
+
};
|
|
95
|
+
}
|
|
96
|
+
var DEFAULT_EXPLORATION_TOOLS = [
|
|
97
|
+
"read",
|
|
98
|
+
"grep",
|
|
99
|
+
"glob",
|
|
100
|
+
"search",
|
|
101
|
+
"list",
|
|
102
|
+
"Read",
|
|
103
|
+
"Grep",
|
|
104
|
+
"Glob",
|
|
105
|
+
"WebSearch",
|
|
106
|
+
"WebFetch"
|
|
107
|
+
];
|
|
108
|
+
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
|
|
109
|
+
if (summary.eventCount === 0) return void 0;
|
|
110
|
+
const explorationCalls = explorationTools.reduce(
|
|
111
|
+
(sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0),
|
|
112
|
+
0
|
|
113
|
+
);
|
|
114
|
+
return explorationCalls / summary.eventCount;
|
|
115
|
+
}
|
|
116
|
+
function tokensPerTool(summary) {
|
|
117
|
+
if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
|
|
118
|
+
const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
|
|
119
|
+
return totalTokens / summary.eventCount;
|
|
120
|
+
}
|
|
121
|
+
function avgToolDurationMs(summary) {
|
|
122
|
+
if (!summary.toolDurations) return void 0;
|
|
123
|
+
let totalDuration = 0;
|
|
124
|
+
let totalCalls = 0;
|
|
125
|
+
for (const durations of Object.values(summary.toolDurations)) {
|
|
126
|
+
for (const duration of durations) {
|
|
127
|
+
totalDuration += duration;
|
|
128
|
+
totalCalls++;
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
if (totalCalls === 0) return void 0;
|
|
132
|
+
return totalDuration / totalCalls;
|
|
133
|
+
}
|
|
134
|
+
function mergeExecutionMetrics(summary, metrics) {
|
|
135
|
+
if (!metrics) return summary;
|
|
136
|
+
return {
|
|
137
|
+
...summary,
|
|
138
|
+
tokenUsage: metrics.tokenUsage,
|
|
139
|
+
costUsd: metrics.costUsd,
|
|
140
|
+
durationMs: metrics.durationMs
|
|
104
141
|
};
|
|
105
142
|
}
|
|
106
143
|
|
|
@@ -376,7 +413,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
376
413
|
QUESTION: "question",
|
|
377
414
|
EXPECTED_OUTCOME: "expected_outcome",
|
|
378
415
|
REFERENCE_ANSWER: "reference_answer",
|
|
379
|
-
INPUT_MESSAGES: "input_messages"
|
|
416
|
+
INPUT_MESSAGES: "input_messages",
|
|
417
|
+
OUTPUT_MESSAGES: "output_messages"
|
|
380
418
|
};
|
|
381
419
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
382
420
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
@@ -616,7 +654,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
616
654
|
expected = [];
|
|
617
655
|
for (const item of rawExpected) {
|
|
618
656
|
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
619
|
-
|
|
657
|
+
let args;
|
|
658
|
+
if (item.args === "any") {
|
|
659
|
+
args = "any";
|
|
660
|
+
} else if (isJsonObject2(item.args)) {
|
|
661
|
+
args = item.args;
|
|
662
|
+
}
|
|
663
|
+
expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
|
|
620
664
|
}
|
|
621
665
|
}
|
|
622
666
|
}
|
|
@@ -1259,16 +1303,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1259
1303
|
}) : [];
|
|
1260
1304
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1261
1305
|
let referenceAnswer = "";
|
|
1262
|
-
if (outputSegments.length >
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
const
|
|
1266
|
-
if (typeof
|
|
1267
|
-
referenceAnswer =
|
|
1268
|
-
} else if (
|
|
1269
|
-
referenceAnswer = JSON.stringify(
|
|
1270
|
-
} else if (
|
|
1271
|
-
referenceAnswer = JSON.stringify(
|
|
1306
|
+
if (outputSegments.length > 0) {
|
|
1307
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1308
|
+
const content = lastMessage.content;
|
|
1309
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1310
|
+
if (typeof content === "string") {
|
|
1311
|
+
referenceAnswer = content;
|
|
1312
|
+
} else if (content !== void 0 && content !== null) {
|
|
1313
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1314
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1315
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1272
1316
|
}
|
|
1273
1317
|
}
|
|
1274
1318
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
@@ -1596,11 +1640,11 @@ async function invokeModel(options) {
|
|
|
1596
1640
|
return mapResponse(result);
|
|
1597
1641
|
}
|
|
1598
1642
|
function mapResponse(result) {
|
|
1643
|
+
const content = result.text ?? "";
|
|
1599
1644
|
return {
|
|
1600
|
-
text: result.text ?? "",
|
|
1601
|
-
reasoning: result.reasoningText ?? void 0,
|
|
1602
1645
|
raw: result,
|
|
1603
|
-
usage: toJsonObject(result.totalUsage ?? result.usage)
|
|
1646
|
+
usage: toJsonObject(result.totalUsage ?? result.usage),
|
|
1647
|
+
outputMessages: [{ role: "assistant", content }]
|
|
1604
1648
|
};
|
|
1605
1649
|
}
|
|
1606
1650
|
function toJsonObject(value) {
|
|
@@ -1753,6 +1797,7 @@ var CliProvider = class {
|
|
|
1753
1797
|
config;
|
|
1754
1798
|
runCommand;
|
|
1755
1799
|
verbose;
|
|
1800
|
+
keepTempFiles;
|
|
1756
1801
|
healthcheckPromise;
|
|
1757
1802
|
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1758
1803
|
this.targetName = targetName;
|
|
@@ -1760,6 +1805,7 @@ var CliProvider = class {
|
|
|
1760
1805
|
this.config = config;
|
|
1761
1806
|
this.runCommand = runner;
|
|
1762
1807
|
this.verbose = config.verbose ?? false;
|
|
1808
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
1763
1809
|
}
|
|
1764
1810
|
async invoke(request) {
|
|
1765
1811
|
if (request.signal?.aborted) {
|
|
@@ -1774,12 +1820,14 @@ var CliProvider = class {
|
|
|
1774
1820
|
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1775
1821
|
);
|
|
1776
1822
|
}
|
|
1823
|
+
const startTime = Date.now();
|
|
1777
1824
|
const result = await this.runCommand(renderedCommand, {
|
|
1778
1825
|
cwd: this.config.cwd,
|
|
1779
1826
|
env: process.env,
|
|
1780
1827
|
timeoutMs: this.config.timeoutMs,
|
|
1781
1828
|
signal: request.signal
|
|
1782
1829
|
});
|
|
1830
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
1783
1831
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
1784
1832
|
if (request.signal?.aborted) {
|
|
1785
1833
|
throw new Error("CLI provider request was aborted");
|
|
@@ -1797,8 +1845,10 @@ var CliProvider = class {
|
|
|
1797
1845
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1798
1846
|
const parsed = this.parseOutputContent(responseContent);
|
|
1799
1847
|
return {
|
|
1800
|
-
|
|
1801
|
-
|
|
1848
|
+
outputMessages: parsed.outputMessages,
|
|
1849
|
+
tokenUsage: parsed.tokenUsage,
|
|
1850
|
+
costUsd: parsed.costUsd,
|
|
1851
|
+
durationMs: parsed.durationMs ?? measuredDurationMs,
|
|
1802
1852
|
raw: {
|
|
1803
1853
|
command: renderedCommand,
|
|
1804
1854
|
stderr: result.stderr,
|
|
@@ -1846,12 +1896,14 @@ var CliProvider = class {
|
|
|
1846
1896
|
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1847
1897
|
);
|
|
1848
1898
|
}
|
|
1899
|
+
const startTime = Date.now();
|
|
1849
1900
|
const result = await this.runCommand(renderedCommand, {
|
|
1850
1901
|
cwd: this.config.cwd,
|
|
1851
1902
|
env: process.env,
|
|
1852
1903
|
timeoutMs: this.config.timeoutMs,
|
|
1853
1904
|
signal: controller.signal
|
|
1854
1905
|
});
|
|
1906
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
1855
1907
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
1856
1908
|
if (controller.signal.aborted) {
|
|
1857
1909
|
throw new Error("CLI provider request was aborted");
|
|
@@ -1873,11 +1925,13 @@ var CliProvider = class {
|
|
|
1873
1925
|
if (missingIds.length > 0) {
|
|
1874
1926
|
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
1875
1927
|
}
|
|
1928
|
+
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
1876
1929
|
const responses = requests.map((request) => {
|
|
1877
1930
|
const evalCaseId = request.evalCaseId;
|
|
1878
1931
|
if (!evalCaseId) {
|
|
1879
1932
|
return {
|
|
1880
|
-
|
|
1933
|
+
outputMessages: [],
|
|
1934
|
+
durationMs: perRequestFallbackMs,
|
|
1881
1935
|
raw: {
|
|
1882
1936
|
command: renderedCommand,
|
|
1883
1937
|
stderr: result.stderr,
|
|
@@ -1890,7 +1944,8 @@ var CliProvider = class {
|
|
|
1890
1944
|
const parsed = recordsById.get(evalCaseId);
|
|
1891
1945
|
if (!parsed) {
|
|
1892
1946
|
return {
|
|
1893
|
-
|
|
1947
|
+
outputMessages: [],
|
|
1948
|
+
durationMs: perRequestFallbackMs,
|
|
1894
1949
|
raw: {
|
|
1895
1950
|
command: renderedCommand,
|
|
1896
1951
|
stderr: result.stderr,
|
|
@@ -1901,9 +1956,10 @@ var CliProvider = class {
|
|
|
1901
1956
|
};
|
|
1902
1957
|
}
|
|
1903
1958
|
return {
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1959
|
+
outputMessages: parsed.outputMessages,
|
|
1960
|
+
tokenUsage: parsed.tokenUsage,
|
|
1961
|
+
costUsd: parsed.costUsd,
|
|
1962
|
+
durationMs: parsed.durationMs ?? perRequestFallbackMs,
|
|
1907
1963
|
raw: {
|
|
1908
1964
|
command: renderedCommand,
|
|
1909
1965
|
stderr: result.stderr,
|
|
@@ -1918,28 +1974,111 @@ var CliProvider = class {
|
|
|
1918
1974
|
}
|
|
1919
1975
|
/**
|
|
1920
1976
|
* Parse output content from CLI.
|
|
1921
|
-
* If the content is valid JSON with
|
|
1922
|
-
*
|
|
1977
|
+
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
1978
|
+
* If only 'text' is provided, wrap it in outputMessages.
|
|
1979
|
+
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
1980
|
+
*
|
|
1981
|
+
* Also extracts optional execution metrics:
|
|
1982
|
+
* - token_usage: { input, output, cached? }
|
|
1983
|
+
* - cost_usd: number
|
|
1984
|
+
* - duration_ms: number
|
|
1923
1985
|
*/
|
|
1924
1986
|
parseOutputContent(content) {
|
|
1925
1987
|
try {
|
|
1926
1988
|
const parsed = JSON.parse(content);
|
|
1927
|
-
if (typeof parsed === "object" && parsed !== null
|
|
1989
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
1928
1990
|
const obj = parsed;
|
|
1929
|
-
const
|
|
1930
|
-
const
|
|
1931
|
-
|
|
1991
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
1992
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
1993
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
1994
|
+
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
1995
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
1996
|
+
return { outputMessages, tokenUsage, costUsd, durationMs };
|
|
1997
|
+
}
|
|
1998
|
+
if ("text" in obj) {
|
|
1999
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2000
|
+
return {
|
|
2001
|
+
outputMessages: [{ role: "assistant", content: text }],
|
|
2002
|
+
tokenUsage,
|
|
2003
|
+
costUsd,
|
|
2004
|
+
durationMs
|
|
2005
|
+
};
|
|
2006
|
+
}
|
|
1932
2007
|
}
|
|
1933
2008
|
} catch {
|
|
1934
2009
|
}
|
|
1935
|
-
return {
|
|
2010
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2011
|
+
}
|
|
2012
|
+
/**
|
|
2013
|
+
* Parse token_usage from CLI output.
|
|
2014
|
+
*/
|
|
2015
|
+
parseTokenUsage(tokenUsage) {
|
|
2016
|
+
if (typeof tokenUsage !== "object" || tokenUsage === null) {
|
|
2017
|
+
return void 0;
|
|
2018
|
+
}
|
|
2019
|
+
const obj = tokenUsage;
|
|
2020
|
+
if (typeof obj.input !== "number" || typeof obj.output !== "number") {
|
|
2021
|
+
return void 0;
|
|
2022
|
+
}
|
|
2023
|
+
return {
|
|
2024
|
+
input: obj.input,
|
|
2025
|
+
output: obj.output,
|
|
2026
|
+
cached: typeof obj.cached === "number" ? obj.cached : void 0
|
|
2027
|
+
};
|
|
2028
|
+
}
|
|
2029
|
+
/**
|
|
2030
|
+
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
2031
|
+
*/
|
|
2032
|
+
parseOutputMessages(outputMessages) {
|
|
2033
|
+
if (!Array.isArray(outputMessages)) {
|
|
2034
|
+
return void 0;
|
|
2035
|
+
}
|
|
2036
|
+
const messages = [];
|
|
2037
|
+
for (const msg of outputMessages) {
|
|
2038
|
+
if (typeof msg !== "object" || msg === null) {
|
|
2039
|
+
continue;
|
|
2040
|
+
}
|
|
2041
|
+
const rawMsg = msg;
|
|
2042
|
+
if (typeof rawMsg.role !== "string") {
|
|
2043
|
+
continue;
|
|
2044
|
+
}
|
|
2045
|
+
const message = {
|
|
2046
|
+
role: rawMsg.role,
|
|
2047
|
+
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
2048
|
+
content: rawMsg.content,
|
|
2049
|
+
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
2050
|
+
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
2051
|
+
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
2052
|
+
};
|
|
2053
|
+
messages.push(message);
|
|
2054
|
+
}
|
|
2055
|
+
return messages.length > 0 ? messages : void 0;
|
|
1936
2056
|
}
|
|
1937
|
-
|
|
1938
|
-
|
|
2057
|
+
/**
|
|
2058
|
+
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
2059
|
+
*/
|
|
2060
|
+
parseToolCalls(toolCalls) {
|
|
2061
|
+
if (!Array.isArray(toolCalls)) {
|
|
1939
2062
|
return void 0;
|
|
1940
2063
|
}
|
|
1941
|
-
const
|
|
1942
|
-
|
|
2064
|
+
const calls = [];
|
|
2065
|
+
for (const call of toolCalls) {
|
|
2066
|
+
if (typeof call !== "object" || call === null) {
|
|
2067
|
+
continue;
|
|
2068
|
+
}
|
|
2069
|
+
const rawCall = call;
|
|
2070
|
+
if (typeof rawCall.tool !== "string") {
|
|
2071
|
+
continue;
|
|
2072
|
+
}
|
|
2073
|
+
calls.push({
|
|
2074
|
+
tool: rawCall.tool,
|
|
2075
|
+
input: rawCall.input,
|
|
2076
|
+
output: rawCall.output,
|
|
2077
|
+
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
2078
|
+
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
2079
|
+
});
|
|
2080
|
+
}
|
|
2081
|
+
return calls.length > 0 ? calls : void 0;
|
|
1943
2082
|
}
|
|
1944
2083
|
parseJsonlBatchOutput(content) {
|
|
1945
2084
|
const records = /* @__PURE__ */ new Map();
|
|
@@ -1963,12 +2102,22 @@ var CliProvider = class {
|
|
|
1963
2102
|
if (records.has(id)) {
|
|
1964
2103
|
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
1965
2104
|
}
|
|
1966
|
-
const
|
|
1967
|
-
const
|
|
2105
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2106
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2107
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2108
|
+
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2109
|
+
let outputMessages;
|
|
2110
|
+
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2111
|
+
outputMessages = parsedOutputMessages;
|
|
2112
|
+
} else {
|
|
2113
|
+
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2114
|
+
outputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
2115
|
+
}
|
|
1968
2116
|
records.set(id, {
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
2117
|
+
outputMessages,
|
|
2118
|
+
tokenUsage,
|
|
2119
|
+
costUsd,
|
|
2120
|
+
durationMs
|
|
1972
2121
|
});
|
|
1973
2122
|
}
|
|
1974
2123
|
return records;
|
|
@@ -1981,8 +2130,10 @@ var CliProvider = class {
|
|
|
1981
2130
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
1982
2131
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
1983
2132
|
} finally {
|
|
1984
|
-
|
|
1985
|
-
|
|
2133
|
+
if (!this.keepTempFiles) {
|
|
2134
|
+
await fs.unlink(filePath).catch(() => {
|
|
2135
|
+
});
|
|
2136
|
+
}
|
|
1986
2137
|
}
|
|
1987
2138
|
}
|
|
1988
2139
|
async ensureHealthy(signal) {
|
|
@@ -2282,6 +2433,11 @@ var execAsync2 = promisify2(execCallback);
|
|
|
2282
2433
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
2283
2434
|
var PROMPT_FILENAME = "prompt.md";
|
|
2284
2435
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
2436
|
+
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2437
|
+
- Do NOT create any additional output files in the workspace.
|
|
2438
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
2439
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
2440
|
+
This is required for evaluation scoring.`;
|
|
2285
2441
|
var CodexProvider = class {
|
|
2286
2442
|
id;
|
|
2287
2443
|
kind = "codex";
|
|
@@ -2306,7 +2462,11 @@ var CodexProvider = class {
|
|
|
2306
2462
|
const workspaceRoot = await this.createWorkspace();
|
|
2307
2463
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2308
2464
|
try {
|
|
2309
|
-
const promptContent = buildPromptDocument(request, inputFiles);
|
|
2465
|
+
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
2466
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
|
|
2467
|
+
const promptContent = `${systemPrompt}
|
|
2468
|
+
|
|
2469
|
+
${basePrompt}`;
|
|
2310
2470
|
const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
|
|
2311
2471
|
await writeFile(promptFile, promptContent, "utf8");
|
|
2312
2472
|
const args = this.buildCodexArgs();
|
|
@@ -2325,7 +2485,6 @@ var CodexProvider = class {
|
|
|
2325
2485
|
const parsed = parseCodexJson(result.stdout);
|
|
2326
2486
|
const assistantText = extractAssistantText(parsed);
|
|
2327
2487
|
return {
|
|
2328
|
-
text: assistantText,
|
|
2329
2488
|
raw: {
|
|
2330
2489
|
response: parsed,
|
|
2331
2490
|
stdout: result.stdout,
|
|
@@ -2337,7 +2496,8 @@ var CodexProvider = class {
|
|
|
2337
2496
|
workspace: workspaceRoot,
|
|
2338
2497
|
inputFiles,
|
|
2339
2498
|
logFile: logger?.filePath
|
|
2340
|
-
}
|
|
2499
|
+
},
|
|
2500
|
+
outputMessages: [{ role: "assistant", content: assistantText }]
|
|
2341
2501
|
};
|
|
2342
2502
|
} finally {
|
|
2343
2503
|
await logger?.close();
|
|
@@ -2959,7 +3119,6 @@ var MockProvider = class {
|
|
|
2959
3119
|
delayMs;
|
|
2960
3120
|
delayMinMs;
|
|
2961
3121
|
delayMaxMs;
|
|
2962
|
-
trace;
|
|
2963
3122
|
constructor(targetName, config) {
|
|
2964
3123
|
this.id = `mock:${targetName}`;
|
|
2965
3124
|
this.targetName = targetName;
|
|
@@ -2967,7 +3126,6 @@ var MockProvider = class {
|
|
|
2967
3126
|
this.delayMs = config.delayMs ?? 0;
|
|
2968
3127
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
2969
3128
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
2970
|
-
this.trace = config.trace;
|
|
2971
3129
|
}
|
|
2972
3130
|
async invoke(request) {
|
|
2973
3131
|
const delay = this.calculateDelay();
|
|
@@ -2975,12 +3133,11 @@ var MockProvider = class {
|
|
|
2975
3133
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
2976
3134
|
}
|
|
2977
3135
|
return {
|
|
2978
|
-
|
|
3136
|
+
outputMessages: [{ role: "assistant", content: this.cannedResponse }],
|
|
2979
3137
|
raw: {
|
|
2980
3138
|
question: request.question,
|
|
2981
3139
|
guidelines: request.guidelines
|
|
2982
|
-
}
|
|
2983
|
-
trace: this.trace
|
|
3140
|
+
}
|
|
2984
3141
|
};
|
|
2985
3142
|
}
|
|
2986
3143
|
calculateDelay() {
|
|
@@ -2993,163 +3150,842 @@ var MockProvider = class {
|
|
|
2993
3150
|
}
|
|
2994
3151
|
};
|
|
2995
3152
|
|
|
2996
|
-
// src/evaluation/providers/
|
|
3153
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
3154
|
+
import { spawn as spawn2 } from "node:child_process";
|
|
3155
|
+
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
3156
|
+
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
3157
|
+
import { mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
|
|
3158
|
+
import { tmpdir as tmpdir2 } from "node:os";
|
|
2997
3159
|
import path10 from "node:path";
|
|
2998
|
-
import {
|
|
2999
|
-
dispatchAgentSession,
|
|
3000
|
-
dispatchBatchAgent,
|
|
3001
|
-
getSubagentRoot,
|
|
3002
|
-
provisionSubagents
|
|
3003
|
-
} from "subagent";
|
|
3004
|
-
|
|
3005
|
-
// src/evaluation/providers/vscode-templates.ts
|
|
3006
|
-
var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
3007
|
-
|
|
3008
|
-
{{userQuery}}
|
|
3009
|
-
|
|
3010
|
-
[[ ## system_instructions ## ]]
|
|
3011
|
-
|
|
3012
|
-
**IMPORTANT**: Follow these exact steps:
|
|
3013
|
-
1. Create and write your complete response to: {{responseFileTmp}}
|
|
3014
|
-
- Do NOT create any additional output files in the workspace.
|
|
3015
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
3016
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3017
|
-
2. When completely finished, run these PowerShell commands to signal completion:
|
|
3018
|
-
\`\`\`
|
|
3019
|
-
Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
|
|
3020
|
-
if (Test-Path subagent.lock) { del subagent.lock }
|
|
3021
|
-
\`\`\`
|
|
3022
|
-
|
|
3023
|
-
Do not proceed to step 2 until your response is completely written to the temporary file.
|
|
3024
|
-
`;
|
|
3025
|
-
var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
3026
|
-
|
|
3027
|
-
{{userQuery}}
|
|
3028
|
-
|
|
3029
|
-
[[ ## system_instructions ## ]]
|
|
3030
3160
|
|
|
3031
|
-
|
|
3032
|
-
|
|
3033
|
-
|
|
3034
|
-
|
|
3035
|
-
|
|
3036
|
-
|
|
3037
|
-
|
|
3038
|
-
|
|
3161
|
+
// src/evaluation/providers/pi-log-tracker.ts
|
|
3162
|
+
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
|
|
3163
|
+
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
|
|
3164
|
+
function getPiLogStore() {
|
|
3165
|
+
const globalObject = globalThis;
|
|
3166
|
+
const existing = globalObject[GLOBAL_LOGS_KEY2];
|
|
3167
|
+
if (existing) {
|
|
3168
|
+
return existing;
|
|
3169
|
+
}
|
|
3170
|
+
const created = [];
|
|
3171
|
+
globalObject[GLOBAL_LOGS_KEY2] = created;
|
|
3172
|
+
return created;
|
|
3173
|
+
}
|
|
3174
|
+
function getSubscriberStore2() {
|
|
3175
|
+
const globalObject = globalThis;
|
|
3176
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
|
|
3177
|
+
if (existing) {
|
|
3178
|
+
return existing;
|
|
3179
|
+
}
|
|
3180
|
+
const created = /* @__PURE__ */ new Set();
|
|
3181
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
|
|
3182
|
+
return created;
|
|
3183
|
+
}
|
|
3184
|
+
function notifySubscribers2(entry) {
|
|
3185
|
+
const subscribers = Array.from(getSubscriberStore2());
|
|
3186
|
+
for (const listener of subscribers) {
|
|
3187
|
+
try {
|
|
3188
|
+
listener(entry);
|
|
3189
|
+
} catch (error) {
|
|
3190
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3191
|
+
console.warn(`Pi log subscriber failed: ${message}`);
|
|
3192
|
+
}
|
|
3193
|
+
}
|
|
3194
|
+
}
|
|
3195
|
+
function recordPiLogEntry(entry) {
|
|
3196
|
+
getPiLogStore().push(entry);
|
|
3197
|
+
notifySubscribers2(entry);
|
|
3198
|
+
}
|
|
3199
|
+
function consumePiLogEntries() {
|
|
3200
|
+
const store = getPiLogStore();
|
|
3201
|
+
if (store.length === 0) {
|
|
3202
|
+
return [];
|
|
3203
|
+
}
|
|
3204
|
+
return store.splice(0, store.length);
|
|
3205
|
+
}
|
|
3206
|
+
function subscribeToPiLogEntries(listener) {
|
|
3207
|
+
const store = getSubscriberStore2();
|
|
3208
|
+
store.add(listener);
|
|
3209
|
+
return () => {
|
|
3210
|
+
store.delete(listener);
|
|
3211
|
+
};
|
|
3212
|
+
}
|
|
3039
3213
|
|
|
3040
|
-
// src/evaluation/providers/
|
|
3041
|
-
var
|
|
3214
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
3215
|
+
var WORKSPACE_PREFIX2 = "agentv-pi-";
|
|
3216
|
+
var PROMPT_FILENAME2 = "prompt.md";
|
|
3217
|
+
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
3218
|
+
- Do NOT create any additional output files in the workspace.
|
|
3219
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
3220
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3221
|
+
This is required for evaluation scoring.`;
|
|
3222
|
+
var PiCodingAgentProvider = class {
|
|
3042
3223
|
id;
|
|
3043
|
-
kind;
|
|
3224
|
+
kind = "pi-coding-agent";
|
|
3044
3225
|
targetName;
|
|
3045
|
-
supportsBatch = true;
|
|
3226
|
+
supportsBatch = false;
|
|
3046
3227
|
config;
|
|
3047
|
-
|
|
3048
|
-
|
|
3049
|
-
this.
|
|
3228
|
+
runPi;
|
|
3229
|
+
constructor(targetName, config, runner = defaultPiRunner) {
|
|
3230
|
+
this.id = `pi-coding-agent:${targetName}`;
|
|
3050
3231
|
this.targetName = targetName;
|
|
3051
3232
|
this.config = config;
|
|
3233
|
+
this.runPi = runner;
|
|
3052
3234
|
}
|
|
3053
3235
|
async invoke(request) {
|
|
3054
3236
|
if (request.signal?.aborted) {
|
|
3055
|
-
throw new Error("
|
|
3056
|
-
}
|
|
3057
|
-
const inputFiles = normalizeAttachments(request.inputFiles);
|
|
3058
|
-
const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
|
|
3059
|
-
const session = await dispatchAgentSession({
|
|
3060
|
-
userQuery: promptContent,
|
|
3061
|
-
extraAttachments: inputFiles,
|
|
3062
|
-
requestTemplate: AGENTV_REQUEST_TEMPLATE,
|
|
3063
|
-
wait: this.config.waitForResponse,
|
|
3064
|
-
dryRun: this.config.dryRun,
|
|
3065
|
-
vscodeCmd: this.config.command,
|
|
3066
|
-
subagentRoot: this.config.subagentRoot,
|
|
3067
|
-
workspaceTemplate: this.config.workspaceTemplate,
|
|
3068
|
-
silent: true
|
|
3069
|
-
});
|
|
3070
|
-
if (session.exitCode !== 0 || !session.responseFile) {
|
|
3071
|
-
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
3072
|
-
throw new Error(failure);
|
|
3237
|
+
throw new Error("Pi coding agent request was aborted before execution");
|
|
3073
3238
|
}
|
|
3074
|
-
|
|
3239
|
+
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
3240
|
+
const workspaceRoot = await this.createWorkspace();
|
|
3241
|
+
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
3242
|
+
try {
|
|
3243
|
+
const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
|
|
3244
|
+
await writeFile2(promptFile, request.question, "utf8");
|
|
3245
|
+
const args = this.buildPiArgs(request.question, inputFiles);
|
|
3246
|
+
const cwd = this.resolveCwd(workspaceRoot);
|
|
3247
|
+
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
3248
|
+
if (result.timedOut) {
|
|
3249
|
+
throw new Error(
|
|
3250
|
+
`Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
|
|
3251
|
+
);
|
|
3252
|
+
}
|
|
3253
|
+
if (result.exitCode !== 0) {
|
|
3254
|
+
const detail = pickDetail2(result.stderr, result.stdout);
|
|
3255
|
+
const prefix = `Pi coding agent exited with code ${result.exitCode}`;
|
|
3256
|
+
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
3257
|
+
}
|
|
3258
|
+
const parsed = parsePiJsonl(result.stdout);
|
|
3259
|
+
const outputMessages = extractOutputMessages(parsed);
|
|
3260
|
+
const assistantText = extractAssistantText2(outputMessages);
|
|
3075
3261
|
return {
|
|
3076
|
-
text: "",
|
|
3077
3262
|
raw: {
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
|
|
3263
|
+
response: parsed,
|
|
3264
|
+
stdout: result.stdout,
|
|
3265
|
+
stderr: result.stderr,
|
|
3266
|
+
exitCode: result.exitCode,
|
|
3267
|
+
args,
|
|
3268
|
+
executable: this.config.executable,
|
|
3269
|
+
promptFile,
|
|
3270
|
+
workspace: workspaceRoot,
|
|
3271
|
+
inputFiles,
|
|
3272
|
+
logFile: logger?.filePath
|
|
3273
|
+
},
|
|
3274
|
+
outputMessages
|
|
3081
3275
|
};
|
|
3276
|
+
} finally {
|
|
3277
|
+
await logger?.close();
|
|
3278
|
+
await this.cleanupWorkspace(workspaceRoot);
|
|
3082
3279
|
}
|
|
3083
|
-
const responseText = await readTextFile(session.responseFile);
|
|
3084
|
-
return {
|
|
3085
|
-
text: responseText,
|
|
3086
|
-
raw: {
|
|
3087
|
-
session,
|
|
3088
|
-
inputFiles
|
|
3089
|
-
}
|
|
3090
|
-
};
|
|
3091
3280
|
}
|
|
3092
|
-
|
|
3093
|
-
if (
|
|
3094
|
-
return
|
|
3281
|
+
resolveCwd(workspaceRoot) {
|
|
3282
|
+
if (!this.config.cwd) {
|
|
3283
|
+
return workspaceRoot;
|
|
3095
3284
|
}
|
|
3096
|
-
|
|
3097
|
-
|
|
3098
|
-
|
|
3099
|
-
|
|
3100
|
-
|
|
3101
|
-
|
|
3102
|
-
);
|
|
3103
|
-
const userQueries = normalizedRequests.map(
|
|
3104
|
-
({ request, inputFiles }) => buildPromptDocument2(request, inputFiles, request.guideline_patterns)
|
|
3105
|
-
);
|
|
3106
|
-
const session = await dispatchBatchAgent({
|
|
3107
|
-
userQueries,
|
|
3108
|
-
extraAttachments: combinedInputFiles,
|
|
3109
|
-
requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
|
|
3110
|
-
wait: this.config.waitForResponse,
|
|
3111
|
-
dryRun: this.config.dryRun,
|
|
3112
|
-
vscodeCmd: this.config.command,
|
|
3113
|
-
subagentRoot: this.config.subagentRoot,
|
|
3114
|
-
workspaceTemplate: this.config.workspaceTemplate,
|
|
3115
|
-
silent: true
|
|
3116
|
-
});
|
|
3117
|
-
if (session.exitCode !== 0 || !session.responseFiles) {
|
|
3118
|
-
const failure = session.error ?? "VS Code subagent did not produce batch responses";
|
|
3119
|
-
throw new Error(failure);
|
|
3285
|
+
return path10.resolve(this.config.cwd);
|
|
3286
|
+
}
|
|
3287
|
+
buildPiArgs(prompt, inputFiles) {
|
|
3288
|
+
const args = [];
|
|
3289
|
+
if (this.config.provider) {
|
|
3290
|
+
args.push("--provider", this.config.provider);
|
|
3120
3291
|
}
|
|
3121
|
-
if (this.config.
|
|
3122
|
-
|
|
3123
|
-
text: "",
|
|
3124
|
-
raw: {
|
|
3125
|
-
session,
|
|
3126
|
-
inputFiles,
|
|
3127
|
-
allInputFiles: combinedInputFiles
|
|
3128
|
-
}
|
|
3129
|
-
}));
|
|
3292
|
+
if (this.config.model) {
|
|
3293
|
+
args.push("--model", this.config.model);
|
|
3130
3294
|
}
|
|
3131
|
-
if (
|
|
3132
|
-
|
|
3133
|
-
`VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
|
|
3134
|
-
);
|
|
3295
|
+
if (this.config.apiKey) {
|
|
3296
|
+
args.push("--api-key", this.config.apiKey);
|
|
3135
3297
|
}
|
|
3136
|
-
|
|
3137
|
-
|
|
3138
|
-
|
|
3139
|
-
|
|
3140
|
-
|
|
3141
|
-
raw: {
|
|
3142
|
-
session,
|
|
3143
|
-
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
3144
|
-
allInputFiles: combinedInputFiles,
|
|
3145
|
-
responseFile
|
|
3146
|
-
}
|
|
3147
|
-
});
|
|
3298
|
+
args.push("--mode", "json");
|
|
3299
|
+
args.push("--print");
|
|
3300
|
+
args.push("--no-session");
|
|
3301
|
+
if (this.config.tools) {
|
|
3302
|
+
args.push("--tools", this.config.tools);
|
|
3148
3303
|
}
|
|
3149
|
-
|
|
3150
|
-
|
|
3151
|
-
}
|
|
3152
|
-
|
|
3304
|
+
if (this.config.thinking) {
|
|
3305
|
+
args.push("--thinking", this.config.thinking);
|
|
3306
|
+
}
|
|
3307
|
+
if (this.config.args && this.config.args.length > 0) {
|
|
3308
|
+
args.push(...this.config.args);
|
|
3309
|
+
}
|
|
3310
|
+
if (inputFiles && inputFiles.length > 0) {
|
|
3311
|
+
for (const file of inputFiles) {
|
|
3312
|
+
args.push(`@${file}`);
|
|
3313
|
+
}
|
|
3314
|
+
}
|
|
3315
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
|
|
3316
|
+
const fullPrompt = `${systemPrompt}
|
|
3317
|
+
|
|
3318
|
+
${prompt}`;
|
|
3319
|
+
const escapedPrompt = escapeAtSymbols(fullPrompt);
|
|
3320
|
+
args.push(escapedPrompt);
|
|
3321
|
+
return args;
|
|
3322
|
+
}
|
|
3323
|
+
async executePi(args, cwd, signal, logger) {
|
|
3324
|
+
try {
|
|
3325
|
+
return await this.runPi({
|
|
3326
|
+
executable: this.config.executable,
|
|
3327
|
+
args,
|
|
3328
|
+
cwd,
|
|
3329
|
+
timeoutMs: this.config.timeoutMs,
|
|
3330
|
+
env: this.buildEnv(),
|
|
3331
|
+
signal,
|
|
3332
|
+
onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
|
|
3333
|
+
onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
|
|
3334
|
+
});
|
|
3335
|
+
} catch (error) {
|
|
3336
|
+
const err = error;
|
|
3337
|
+
if (err.code === "ENOENT") {
|
|
3338
|
+
throw new Error(
|
|
3339
|
+
`Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
|
|
3340
|
+
);
|
|
3341
|
+
}
|
|
3342
|
+
throw error;
|
|
3343
|
+
}
|
|
3344
|
+
}
|
|
3345
|
+
buildEnv() {
|
|
3346
|
+
const env = { ...process.env };
|
|
3347
|
+
if (this.config.apiKey) {
|
|
3348
|
+
const provider = this.config.provider?.toLowerCase() ?? "google";
|
|
3349
|
+
switch (provider) {
|
|
3350
|
+
case "google":
|
|
3351
|
+
case "gemini":
|
|
3352
|
+
env.GEMINI_API_KEY = this.config.apiKey;
|
|
3353
|
+
break;
|
|
3354
|
+
case "anthropic":
|
|
3355
|
+
env.ANTHROPIC_API_KEY = this.config.apiKey;
|
|
3356
|
+
break;
|
|
3357
|
+
case "openai":
|
|
3358
|
+
env.OPENAI_API_KEY = this.config.apiKey;
|
|
3359
|
+
break;
|
|
3360
|
+
case "groq":
|
|
3361
|
+
env.GROQ_API_KEY = this.config.apiKey;
|
|
3362
|
+
break;
|
|
3363
|
+
case "xai":
|
|
3364
|
+
env.XAI_API_KEY = this.config.apiKey;
|
|
3365
|
+
break;
|
|
3366
|
+
case "openrouter":
|
|
3367
|
+
env.OPENROUTER_API_KEY = this.config.apiKey;
|
|
3368
|
+
break;
|
|
3369
|
+
}
|
|
3370
|
+
}
|
|
3371
|
+
return env;
|
|
3372
|
+
}
|
|
3373
|
+
async createWorkspace() {
|
|
3374
|
+
return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
|
|
3375
|
+
}
|
|
3376
|
+
async cleanupWorkspace(workspaceRoot) {
|
|
3377
|
+
try {
|
|
3378
|
+
await rm2(workspaceRoot, { recursive: true, force: true });
|
|
3379
|
+
} catch {
|
|
3380
|
+
}
|
|
3381
|
+
}
|
|
3382
|
+
resolveLogDirectory() {
|
|
3383
|
+
if (this.config.logDir) {
|
|
3384
|
+
return path10.resolve(this.config.logDir);
|
|
3385
|
+
}
|
|
3386
|
+
return path10.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
3387
|
+
}
|
|
3388
|
+
async createStreamLogger(request) {
|
|
3389
|
+
const logDir = this.resolveLogDirectory();
|
|
3390
|
+
if (!logDir) {
|
|
3391
|
+
return void 0;
|
|
3392
|
+
}
|
|
3393
|
+
try {
|
|
3394
|
+
await mkdir2(logDir, { recursive: true });
|
|
3395
|
+
} catch (error) {
|
|
3396
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3397
|
+
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
3398
|
+
return void 0;
|
|
3399
|
+
}
|
|
3400
|
+
const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
|
|
3401
|
+
try {
|
|
3402
|
+
const logger = await PiStreamLogger.create({
|
|
3403
|
+
filePath,
|
|
3404
|
+
targetName: this.targetName,
|
|
3405
|
+
evalCaseId: request.evalCaseId,
|
|
3406
|
+
attempt: request.attempt,
|
|
3407
|
+
format: this.config.logFormat ?? "summary"
|
|
3408
|
+
});
|
|
3409
|
+
recordPiLogEntry({
|
|
3410
|
+
filePath,
|
|
3411
|
+
targetName: this.targetName,
|
|
3412
|
+
evalCaseId: request.evalCaseId,
|
|
3413
|
+
attempt: request.attempt
|
|
3414
|
+
});
|
|
3415
|
+
return logger;
|
|
3416
|
+
} catch (error) {
|
|
3417
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3418
|
+
console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
|
|
3419
|
+
return void 0;
|
|
3420
|
+
}
|
|
3421
|
+
}
|
|
3422
|
+
};
|
|
3423
|
+
/**
 * Mirrors the stdout/stderr of a Pi coding agent run into an append-mode log file.
 *
 * Incoming chunks are buffered per stream and written one complete line at a
 * time, each prefixed with the elapsed time and the stream name. Lines are
 * rendered with `formatPiJsonLog` when `format === "json"`, otherwise with
 * `formatPiLogMessage`.
 *
 * Logging is best-effort: once the write stream reports an error, the error is
 * remembered in `streamError`, a warning is printed, and later output is dropped.
 */
var PiStreamLogger = class _PiStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // First error reported by the write stream; undefined while the stream is healthy.
  streamError;
  /**
   * @param {string} filePath - Log file to append to.
   * @param {string} format - "json" selects pretty-printed JSON lines; anything else the summary form.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream2(filePath, { flags: "a" });
    // A permanent listener is required: an "error" event emitted while no
    // listener is attached (e.g. the file cannot be opened) would otherwise be
    // thrown as an uncaught exception.
    this.stream.on("error", (error) => {
      if (this.streamError !== void 0) {
        return;
      }
      this.streamError = error;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi stream logging to ${filePath} failed: ${message}`);
    });
  }
  /**
   * Creates a logger and writes the header block followed by a blank separator line.
   *
   * @param {{ filePath: string, format: string, targetName: string, evalCaseId?: string, attempt?: number }} options
   * @returns {Promise<_PiStreamLogger>}
   */
  static async create(options) {
    const logger = new _PiStreamLogger(options.filePath, options.format);
    const header = [
      "# Pi Coding Agent stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional entries; the trailing "" must survive so
      // the header is followed by a blank line.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes every complete line it now contains. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes every complete line it now contains. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Flushes all buffered text and ends the stream.
   *
   * Resolves immediately when the stream has already failed. Rejects if the
   * stream reports an error while it is being ended.
   *
   * @returns {Promise<void>}
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    if (this.streamError !== void 0) {
      // The stream is already broken; there is nothing left to finish.
      return;
    }
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Writes each entry of `lines` followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.writeLine(line);
    }
  }
  /** Writes `text` plus a newline unless the stream has already failed. */
  writeLine(text) {
    if (this.streamError !== void 0) {
      return;
    }
    this.stream.write(`${text}\n`);
  }
  /**
   * Writes every complete line buffered for `source` and keeps the trailing
   * partial line (the text after the last newline) in the buffer.
   *
   * @param {"stdout" | "stderr"} source
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      const formatted = this.formatLine(line, source);
      if (formatted) {
        this.writeLine(formatted);
      }
    }
  }
  /**
   * Renders one raw line as `[+elapsed] [source] message`.
   *
   * @returns {string | undefined} `undefined` for lines that are empty after trimming.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes any unterminated final line of each stream (stdout first), then clears both buffers. */
  flushRemainder() {
    for (const source of ["stdout", "stderr"]) {
      const pending = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
      const formatted = this.formatLine(pending, source);
      if (formatted) {
        this.writeLine(formatted);
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
|
|
3517
|
+
/**
 * Builds a log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 chars of a UUID>.log`.
 *
 * The timestamp is the current ISO time with ":" and "." replaced by "-".
 * The eval id falls back to "pi" when the request has none, and the attempt
 * suffix is 1-based and only present when `request.attempt` is defined.
 *
 * @param {{ evalCaseId?: string, attempt?: number }} request
 * @param {string} targetName
 * @returns {string}
 */
function buildLogFilename2(request, targetName) {
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename2(request.evalCaseId ?? "pi");
  const attemptPart = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
  const targetPart = sanitizeForFilename2(targetName);
  const uniquePart = randomUUID2().slice(0, 8);
  const stem = [stamp, targetPart, `${evalPart}${attemptPart}`, uniquePart].join("_");
  return `${stem}.log`;
}
|
|
3524
|
+
/**
 * Makes a string safe to embed in a file name.
 *
 * Every run of characters other than ASCII letters, digits, ".", "_" and "-"
 * is collapsed into a single "_". An empty result becomes "pi".
 *
 * @param {string} value
 * @returns {string}
 */
function sanitizeForFilename2(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length === 0) {
    return "pi";
  }
  return cleaned;
}
|
|
3528
|
+
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or as
 * `HH:MM:SS` once at least one hour has passed. Each field is zero-padded to
 * two digits.
 *
 * @param {number} startedAt - Start time in epoch milliseconds.
 * @returns {string}
 */
function formatElapsed2(startedAt) {
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const twoDigits = (amount) => amount.toString().padStart(2, "0");
  const hours = Math.floor(totalSeconds / 3600);
  const fields = [Math.floor(totalSeconds % 3600 / 60), totalSeconds % 60];
  if (hours > 0) {
    fields.unshift(hours);
  }
  return fields.map(twoDigits).join(":");
}
|
|
3538
|
+
/**
 * Produces the summary-format text for one log line.
 *
 * When the line parses as JSON and `summarizePiEvent` yields a summary, that
 * summary is used. Otherwise the raw line is returned, prefixed with
 * "stderr: " for lines from stderr.
 *
 * @param {string} rawLine
 * @param {"stdout" | "stderr"} source
 * @returns {string}
 */
function formatPiLogMessage(rawLine, source) {
  const event = tryParseJsonValue2(rawLine);
  const summary = event ? summarizePiEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
3551
|
+
/**
 * Produces the JSON-format text for one log line: the parsed value
 * pretty-printed with two-space indentation.
 *
 * The raw line is returned unchanged when it does not parse as JSON, when the
 * parsed value is falsy, or when re-serialising throws.
 *
 * @param {string} rawLine
 * @returns {string}
 */
function formatPiJsonLog(rawLine) {
  const value = tryParseJsonValue2(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through to the raw line.
    }
  }
  return rawLine;
}
|
|
3562
|
+
/**
 * Produces a one-line summary of a parsed Pi event for the summary log format.
 *
 * - `message_start` / `message_end`: `"<type>: <role>"`.
 * - `message_update`: `"text_delta: <first 50 chars>..."` for string text deltas,
 *   otherwise `"message_update: <assistantMessageEvent.type>"`.
 * - any other event: the event type itself.
 *
 * A missing role or update type is reported as "unknown" rather than the
 * literal text "undefined".
 *
 * @param {unknown} event - Parsed JSON value.
 * @returns {string | undefined} The summary, or `undefined` when `event` is not
 *   an object with a non-empty string `type`.
 */
function summarizePiEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const type = typeof event.type === "string" ? event.type : void 0;
  if (!type) {
    return void 0;
  }
  switch (type) {
    case "message_start":
    case "message_end": {
      const role = event.message?.role;
      return `${type}: ${role ?? "unknown"}`;
    }
    case "message_update": {
      const update = event.assistantMessageEvent;
      const updateType = update?.type;
      if (updateType === "text_delta" && typeof update.delta === "string") {
        const { delta } = update;
        // Keep log lines short: only the first 50 characters of a delta are shown.
        const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
        return `text_delta: ${preview}`;
      }
      return `message_update: ${updateType ?? "unknown"}`;
    }
    default:
      // Covers agent_start/agent_end/turn_start/turn_end and any unrecognised type.
      return type;
  }
}
|
|
3602
|
+
/**
 * Parses `rawLine` as JSON without throwing.
 *
 * @param {string} rawLine
 * @returns {unknown} The parsed value, or `undefined` when parsing fails.
 */
function tryParseJsonValue2(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
3609
|
+
/**
 * Parses newline-delimited JSON output.
 *
 * Blank lines are ignored and lines that are not valid JSON are skipped, so
 * non-JSON noise in the output does not fail the whole parse.
 *
 * @param {string} output - Raw output text.
 * @returns {unknown[]} The parsed values, in order.
 * @throws {Error} When the output is empty or contains no valid JSON line.
 */
function parsePiJsonl(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Pi coding agent produced no output");
  }

  const events = [];
  for (const candidate of text.split(/\r?\n/)) {
    const line = candidate.trim();
    if (line.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(line));
    } catch {
      // Not JSON: skip this line.
    }
  }

  if (events.length === 0) {
    throw new Error("Pi coding agent produced no valid JSON output");
  }
  return events;
}
|
|
3627
|
+
/**
 * Extracts the conversation messages from a list of parsed Pi events.
 *
 * The last `agent_end` event carrying a `messages` array wins: its messages
 * are converted with `convertPiMessage` and unconvertible ones dropped. When no
 * such event exists, the `message` of every `turn_end` event is converted
 * instead, in event order.
 *
 * @param {unknown[]} events
 * @returns {object[]} Converted messages (possibly empty).
 */
function extractOutputMessages(events) {
  const isRecord = (candidate) => Boolean(candidate) && typeof candidate === "object";

  let index = events.length;
  while (index-- > 0) {
    const candidate = events[index];
    if (isRecord(candidate) && candidate.type === "agent_end" && Array.isArray(candidate.messages)) {
      return candidate.messages.map(convertPiMessage).filter((m) => m !== void 0);
    }
  }

  const collected = [];
  for (const candidate of events) {
    if (!isRecord(candidate) || candidate.type !== "turn_end") {
      continue;
    }
    const converted = convertPiMessage(candidate.message);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
|
|
3659
|
+
/**
 * Converts one raw Pi message into the output-message shape
 * `{ role, content, toolCalls, timestamp, metadata }`.
 *
 * - `content` comes from `extractTextContent`, `toolCalls` from
 *   `extractToolCalls` (omitted when empty).
 * - A numeric `timestamp` is converted to an ISO string; a string one is kept.
 * - `metadata` collects the truthy `api`, `provider`, `model`, `usage` and
 *   `stopReason` fields and is omitted when none are present.
 *
 * @param {unknown} message
 * @returns {object | undefined} `undefined` when `message` is not an object
 *   with a string `role`.
 */
function convertPiMessage(message) {
  if (!message || typeof message !== "object") {
    return void 0;
  }
  const { role } = message;
  if (typeof role !== "string") {
    return void 0;
  }

  const content = extractTextContent(message.content);
  const toolCalls = extractToolCalls(message.content);

  let timestamp;
  if (typeof message.timestamp === "number") {
    timestamp = new Date(message.timestamp).toISOString();
  } else if (typeof message.timestamp === "string") {
    timestamp = message.timestamp;
  }

  const metadata = {};
  for (const key of ["api", "provider", "model", "usage", "stopReason"]) {
    if (message[key]) {
      metadata[key] = message[key];
    }
  }

  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
    timestamp,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0
  };
}
|
|
3685
|
+
/**
 * Extracts the plain text of a message's content.
 *
 * A string is returned as is. For an array, the `text` of every
 * `{ type: "text", text: string }` part is joined with newlines.
 *
 * @param {unknown} content
 * @returns {string | undefined} `undefined` when content is neither a string
 *   nor an array, or when the array holds no text parts.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const texts = content
    .filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string")
    .map((part) => part.text);
  if (texts.length === 0) {
    return void 0;
  }
  return texts.join("\n");
}
|
|
3704
|
+
/**
 * Collects tool calls from a message's content parts.
 *
 * Each `{ type: "tool_use", name }` part becomes `{ tool, input, id }`. A
 * `{ type: "tool_result", tool_use_id }` part attaches its `content` as
 * `output` to the first previously collected call with that id; results
 * without a matching call are ignored.
 *
 * @param {unknown} content
 * @returns {object[]} Tool calls in order of appearance; empty when `content`
 *   is not an array.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    if (part.type === "tool_use" && typeof part.name === "string") {
      const id = typeof part.id === "string" ? part.id : void 0;
      calls.push({ tool: part.name, input: part.input, id });
    } else if (part.type === "tool_result" && typeof part.tool_use_id === "string") {
      const position = calls.findIndex((call) => call.id === part.tool_use_id);
      if (position !== -1) {
        calls[position] = { ...calls[position], output: part.content };
      }
    }
  }
  return calls;
}
|
|
3734
|
+
/**
 * Returns the content of the last assistant message that has (truthy) content.
 *
 * String content is returned as is; any other content is JSON-serialised.
 *
 * @param {Array<{ role: string, content?: unknown }>} messages
 * @returns {string} The text, or "" when no assistant message has content.
 */
function extractAssistantText2(messages) {
  let index = messages.length;
  while (index-- > 0) {
    const { role, content } = messages[index];
    if (role !== "assistant" || !content) {
      continue;
    }
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}
|
|
3746
|
+
/**
 * Rewrites every `@[label]:` marker in the prompt to `[[label]]:`, removing
 * the leading "@" from those markers. Other "@" characters are left alone.
 *
 * @param {string} prompt
 * @returns {string}
 */
function escapeAtSymbols(prompt) {
  const atLabelMarker = /@\[([^\]]+)\]:/g;
  const escaped = prompt.replace(atLabelMarker, "[[$1]]:");
  return escaped;
}
|
|
3749
|
+
/**
 * Picks the most useful text to describe a failed run: trimmed stderr when it
 * is non-empty, otherwise trimmed stdout.
 *
 * @param {string} stderr
 * @param {string} stdout
 * @returns {string | undefined} `undefined` when both are blank.
 */
function pickDetail2(stderr, stdout) {
  for (const text of [stderr.trim(), stdout.trim()]) {
    if (text.length > 0) {
      return text;
    }
  }
  return void 0;
}
|
|
3757
|
+
/**
 * Builds the " after <N>s" suffix for timeout messages, rounding the timeout
 * up to whole seconds.
 *
 * @param {number | undefined} timeoutMs
 * @returns {string} "" when no positive timeout is given.
 */
function formatTimeoutSuffix3(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && timeoutMs > 0;
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
|
|
3764
|
+
/**
 * Spawns the Pi executable (without a shell) and collects its output.
 *
 * `options.executable` is split on whitespace: the first token is the command
 * and the remaining tokens are placed before `options.args`. stdin is closed
 * immediately. The child receives SIGTERM when `options.signal` aborts or when
 * `options.timeoutMs` (if positive) elapses.
 *
 * @param {object} options
 * @param {string} options.executable - Command, optionally followed by leading arguments.
 * @param {string[]} options.args
 * @param {string} [options.cwd]
 * @param {object} [options.env]
 * @param {number} [options.timeoutMs]
 * @param {AbortSignal} [options.signal]
 * @param {(chunk: string) => void} [options.onStdoutChunk]
 * @param {(chunk: string) => void} [options.onStderrChunk]
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number, timedOut: boolean }>}
 *   Resolves when the child closes; `exitCode` is -1 when no numeric code is
 *   reported. Rejects when the child emits "error" (e.g. the command is missing).
 */
async function defaultPiRunner(options) {
  return await new Promise((resolve, reject) => {
    const [command, ...leadingArgs] = options.executable.split(/\s+/);
    const child = spawn2(command, [...leadingArgs, ...options.args], {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: false
    });

    const captured = { stdout: "", stderr: "" };
    let timedOut = false;
    let timeoutHandle;

    const terminate = () => {
      child.kill("SIGTERM");
    };

    const { signal } = options;
    if (signal) {
      if (signal.aborted) {
        terminate();
      } else {
        signal.addEventListener("abort", terminate, { once: true });
      }
    }

    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let the pending timer keep the process alive on its own.
      timeoutHandle.unref?.();
    }

    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      captured.stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      captured.stderr += chunk;
      options.onStderrChunk?.(chunk);
    });
    child.stdin.end();

    // Releases the timer and the abort listener once the child has settled.
    const release = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", terminate);
      }
    };

    child.on("error", (error) => {
      release();
      reject(error);
    });
    child.on("close", (code) => {
      release();
      resolve({
        stdout: captured.stdout,
        stderr: captured.stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
|
|
3831
|
+
|
|
3832
|
+
// src/evaluation/providers/vscode.ts
|
|
3833
|
+
import path11 from "node:path";
|
|
3834
|
+
import {
|
|
3835
|
+
dispatchAgentSession,
|
|
3836
|
+
dispatchBatchAgent,
|
|
3837
|
+
getSubagentRoot,
|
|
3838
|
+
provisionSubagents
|
|
3839
|
+
} from "subagent";
|
|
3840
|
+
|
|
3841
|
+
// src/evaluation/providers/vscode-templates.ts
|
|
3842
|
+
var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
3843
|
+
|
|
3844
|
+
{{userQuery}}
|
|
3845
|
+
|
|
3846
|
+
[[ ## system_instructions ## ]]
|
|
3847
|
+
|
|
3848
|
+
**IMPORTANT**: Follow these exact steps:
|
|
3849
|
+
1. Create and write your complete response to: {{responseFileTmp}}
|
|
3850
|
+
- Do NOT create any additional output files in the workspace.
|
|
3851
|
+
- All intended file outputs/changes MUST be written in your response file.
|
|
3852
|
+
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3853
|
+
2. When completely finished, run these PowerShell commands to signal completion:
|
|
3854
|
+
\`\`\`
|
|
3855
|
+
Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
|
|
3856
|
+
if (Test-Path subagent.lock) { del subagent.lock }
|
|
3857
|
+
\`\`\`
|
|
3858
|
+
|
|
3859
|
+
Do not proceed to step 2 until your response is completely written to the temporary file.
|
|
3860
|
+
`;
|
|
3861
|
+
var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
3862
|
+
|
|
3863
|
+
{{userQuery}}
|
|
3864
|
+
|
|
3865
|
+
[[ ## system_instructions ## ]]
|
|
3866
|
+
|
|
3867
|
+
**IMPORTANT**: Follow these exact steps:
|
|
3868
|
+
1. Create and write your complete response to: {{responseFileTmp}}
|
|
3869
|
+
- Do NOT create any additional output files in the workspace.
|
|
3870
|
+
- All intended file outputs/changes MUST be written in your response file.
|
|
3871
|
+
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3872
|
+
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
3873
|
+
3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
|
|
3874
|
+
`;
|
|
3875
|
+
|
|
3876
|
+
// src/evaluation/providers/vscode.ts
|
|
3877
|
+
/**
 * Provider that hands requests to a VS Code subagent and reads the answer from
 * the response file(s) the subagent session reports.
 *
 * `invoke` handles one request via `dispatchAgentSession`; `invokeBatch` sends
 * several requests in one `dispatchBatchAgent` call. In dry-run mode
 * (`config.dryRun`) no response file is read and `outputMessages` is empty.
 */
var VSCodeProvider = class {
  id;
  kind;
  targetName;
  supportsBatch = true;
  config;
  /**
   * @param {string} targetName
   * @param {object} config - Reads `waitForResponse`, `dryRun`, `command`, `subagentRoot`, `workspaceTemplate`.
   * @param {string} kind - Provider kind; combined with the target name into `id` as `<kind>:<targetName>`.
   */
  constructor(targetName, config, kind) {
    this.targetName = targetName;
    this.kind = kind;
    this.config = config;
    this.id = `${kind}:${targetName}`;
  }
  /**
   * Dispatches a single request.
   *
   * @param {object} request - Reads `signal`, `inputFiles`, `guideline_patterns` (and whatever `buildPromptDocument2` uses).
   * @returns {Promise<{ outputMessages: object[], raw: object }>}
   * @throws {Error} When the request is already aborted, or the session exits
   *   non-zero / reports no response file.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("VS Code provider request was aborted before dispatch");
    }
    const attachments = normalizeAttachments(request.inputFiles);
    const userQuery = buildPromptDocument2(request, attachments, request.guideline_patterns);
    const settings = this.config;
    const session = await dispatchAgentSession({
      userQuery,
      extraAttachments: attachments,
      requestTemplate: AGENTV_REQUEST_TEMPLATE,
      wait: settings.waitForResponse,
      dryRun: settings.dryRun,
      vscodeCmd: settings.command,
      subagentRoot: settings.subagentRoot,
      workspaceTemplate: settings.workspaceTemplate,
      silent: true
    });

    const dispatchFailed = session.exitCode !== 0 || !session.responseFile;
    if (dispatchFailed) {
      throw new Error(session.error ?? "VS Code subagent did not produce a response");
    }

    const raw = { session, inputFiles: attachments };
    if (settings.dryRun) {
      return { outputMessages: [], raw };
    }
    const content = await readTextFile(session.responseFile);
    return { outputMessages: [{ role: "assistant", content }], raw };
  }
  /**
   * Dispatches several requests as one batch; results are returned in request order.
   *
   * @param {object[]} requests
   * @returns {Promise<Array<{ outputMessages: object[], raw: object }>>}
   * @throws {Error} When the session exits non-zero, reports no response files,
   *   or returns a different number of responses than requests.
   */
  async invokeBatch(requests) {
    if (requests.length === 0) {
      return [];
    }
    const prepared = requests.map((request) => ({
      request,
      inputFiles: normalizeAttachments(request.inputFiles)
    }));
    const allInputFiles = mergeAttachments(prepared.map((entry) => entry.inputFiles));
    const userQueries = prepared.map(
      (entry) => buildPromptDocument2(entry.request, entry.inputFiles, entry.request.guideline_patterns)
    );
    const settings = this.config;
    const session = await dispatchBatchAgent({
      userQueries,
      extraAttachments: allInputFiles,
      requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
      wait: settings.waitForResponse,
      dryRun: settings.dryRun,
      vscodeCmd: settings.command,
      subagentRoot: settings.subagentRoot,
      workspaceTemplate: settings.workspaceTemplate,
      silent: true
    });

    if (session.exitCode !== 0 || !session.responseFiles) {
      throw new Error(session.error ?? "VS Code subagent did not produce batch responses");
    }
    if (settings.dryRun) {
      return prepared.map((entry) => ({
        outputMessages: [],
        raw: { session, inputFiles: entry.inputFiles, allInputFiles }
      }));
    }

    const { responseFiles } = session;
    if (responseFiles.length !== requests.length) {
      throw new Error(
        `VS Code batch returned ${responseFiles.length} responses for ${requests.length} requests`
      );
    }

    const responses = [];
    for (let index = 0; index < responseFiles.length; index += 1) {
      const responseFile = responseFiles[index];
      const content = await readTextFile(responseFile);
      responses.push({
        outputMessages: [{ role: "assistant", content }],
        raw: {
          session,
          inputFiles: prepared[index]?.inputFiles,
          allInputFiles,
          responseFile
        }
      });
    }
    return responses;
  }
};
|
|
3988
|
+
function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
3153
3989
|
const parts = [];
|
|
3154
3990
|
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
3155
3991
|
parts.push(request.systemPrompt.trim());
|
|
@@ -3169,7 +4005,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
3169
4005
|
return "";
|
|
3170
4006
|
}
|
|
3171
4007
|
const buildList = (files) => files.map((absolutePath) => {
|
|
3172
|
-
const fileName =
|
|
4008
|
+
const fileName = path11.basename(absolutePath);
|
|
3173
4009
|
const fileUri = pathToFileUri2(absolutePath);
|
|
3174
4010
|
return `* [${fileName}](${fileUri})`;
|
|
3175
4011
|
});
|
|
@@ -3194,8 +4030,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3194
4030
|
}
|
|
3195
4031
|
const unique = /* @__PURE__ */ new Map();
|
|
3196
4032
|
for (const attachment of attachments) {
|
|
3197
|
-
const absolutePath =
|
|
3198
|
-
const normalized = absolutePath.split(
|
|
4033
|
+
const absolutePath = path11.resolve(attachment);
|
|
4034
|
+
const normalized = absolutePath.split(path11.sep).join("/");
|
|
3199
4035
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3200
4036
|
if (!unique.has(absolutePath)) {
|
|
3201
4037
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3210,7 +4046,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3210
4046
|
}
|
|
3211
4047
|
const unique = /* @__PURE__ */ new Map();
|
|
3212
4048
|
for (const attachment of attachments) {
|
|
3213
|
-
const absolutePath =
|
|
4049
|
+
const absolutePath = path11.resolve(attachment);
|
|
3214
4050
|
if (!unique.has(absolutePath)) {
|
|
3215
4051
|
unique.set(absolutePath, absolutePath);
|
|
3216
4052
|
}
|
|
@@ -3218,7 +4054,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3218
4054
|
return Array.from(unique.values());
|
|
3219
4055
|
}
|
|
3220
4056
|
function pathToFileUri2(filePath) {
|
|
3221
|
-
const absolutePath =
|
|
4057
|
+
const absolutePath = path11.isAbsolute(filePath) ? filePath : path11.resolve(filePath);
|
|
3222
4058
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3223
4059
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3224
4060
|
return `file:///${normalizedPath}`;
|
|
@@ -3231,7 +4067,7 @@ function normalizeAttachments(attachments) {
|
|
|
3231
4067
|
}
|
|
3232
4068
|
const deduped = /* @__PURE__ */ new Set();
|
|
3233
4069
|
for (const attachment of attachments) {
|
|
3234
|
-
deduped.add(
|
|
4070
|
+
deduped.add(path11.resolve(attachment));
|
|
3235
4071
|
}
|
|
3236
4072
|
return Array.from(deduped);
|
|
3237
4073
|
}
|
|
@@ -3240,7 +4076,7 @@ function mergeAttachments(all) {
|
|
|
3240
4076
|
for (const list of all) {
|
|
3241
4077
|
if (!list) continue;
|
|
3242
4078
|
for (const inputFile of list) {
|
|
3243
|
-
deduped.add(
|
|
4079
|
+
deduped.add(path11.resolve(inputFile));
|
|
3244
4080
|
}
|
|
3245
4081
|
}
|
|
3246
4082
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -3289,7 +4125,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3289
4125
|
// src/evaluation/providers/targets-file.ts
|
|
3290
4126
|
import { constants as constants3 } from "node:fs";
|
|
3291
4127
|
import { access as access3, readFile as readFile6 } from "node:fs/promises";
|
|
3292
|
-
import
|
|
4128
|
+
import path12 from "node:path";
|
|
3293
4129
|
import { parse as parse3 } from "yaml";
|
|
3294
4130
|
function isRecord(value) {
|
|
3295
4131
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -3326,7 +4162,7 @@ async function fileExists3(filePath) {
|
|
|
3326
4162
|
}
|
|
3327
4163
|
}
|
|
3328
4164
|
async function readTargetDefinitions(filePath) {
|
|
3329
|
-
const absolutePath =
|
|
4165
|
+
const absolutePath = path12.resolve(filePath);
|
|
3330
4166
|
if (!await fileExists3(absolutePath)) {
|
|
3331
4167
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3332
4168
|
}
|
|
@@ -3358,6 +4194,8 @@ function createProvider(target) {
|
|
|
3358
4194
|
return new CliProvider(target.name, target.config);
|
|
3359
4195
|
case "codex":
|
|
3360
4196
|
return new CodexProvider(target.name, target.config);
|
|
4197
|
+
case "pi-coding-agent":
|
|
4198
|
+
return new PiCodingAgentProvider(target.name, target.config);
|
|
3361
4199
|
case "mock":
|
|
3362
4200
|
return new MockProvider(target.name, target.config);
|
|
3363
4201
|
case "vscode":
|
|
@@ -3377,6 +4215,74 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
3377
4215
|
// src/evaluation/evaluators.ts
|
|
3378
4216
|
import { generateText as generateText2 } from "ai";
|
|
3379
4217
|
import { z } from "zod";
|
|
4218
|
+
|
|
4219
|
+
// src/runtime/exec.ts
|
|
4220
|
+
/**
 * Look up the Bun runtime's process spawner, if one is exposed globally.
 *
 * @returns {Function | undefined} `globalThis.Bun.spawn` when it exists and is
 *   callable; `undefined` otherwise (no `Bun` global, or `spawn` is not a function).
 */
function getBunSpawn() {
  const candidate = globalThis.Bun?.spawn;
  if (typeof candidate !== "function") {
    return void 0;
  }
  return candidate;
}
|
|
4224
|
+
/**
 * Run `command` through a shell, write `stdinPayload` to its stdin, and collect
 * everything it prints.
 *
 * When getBunSpawn() returns a spawner it is used (`sh -c <command>`); otherwise
 * the command is run with `node:child_process` `spawn` and `shell: true`.
 *
 * @param {string} command - Shell command line to execute.
 * @param {string} stdinPayload - Text sent to the child's stdin, which is then closed.
 * @param {{ cwd?: string, timeoutMs?: number }} [options] - Working directory and an
 *   optional timeout in milliseconds (a falsy value disables the timeout).
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number }>} Captured
 *   output and exit status. In the Node branch a child terminated by a signal is
 *   reported with exit code 1, never 0.
 * @throws {Error} If the process cannot be spawned, if writing to stdin fails for a
 *   reason other than the child closing the pipe early, or if `timeoutMs` elapses
 *   (`Process timed out after <n>ms`, in both runtime branches).
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const { cwd, timeoutMs } = options;
  const timeoutMessage = `Process timed out after ${timeoutMs}ms`;
  const bunSpawn = getBunSpawn();
  if (bunSpawn) {
    const proc = bunSpawn({
      cmd: ["sh", "-c", command],
      cwd,
      stdin: new TextEncoder().encode(stdinPayload),
      stdout: "pipe",
      stderr: "pipe"
    });
    let timedOut = false;
    const timer = timeoutMs ? setTimeout(() => {
      timedOut = true;
      proc.kill();
    }, timeoutMs) : void 0;
    try {
      // Drain both pipes concurrently: reading stdout to completion before touching
      // stderr can stall a child that is blocked writing to a full stderr pipe.
      const [stdout, stderr, exitCode] = await Promise.all([
        new Response(proc.stdout).text(),
        new Response(proc.stderr).text(),
        proc.exited
      ]);
      if (timedOut) {
        // Match the Node branch: a timeout is an error, not a normal result.
        throw new Error(timeoutMessage);
      }
      return { stdout, stderr, exitCode };
    } finally {
      if (timer !== void 0) {
        clearTimeout(timer);
      }
    }
  }
  const { spawn: spawn3 } = await import("node:child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn3(command, {
      shell: true,
      cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    let stdout = "";
    let stderr = "";
    const timer = timeoutMs ? setTimeout(() => {
      child.kill();
      reject(new Error(timeoutMessage));
    }, timeoutMs) : void 0;
    const stopTimer = () => {
      if (timer !== void 0) {
        clearTimeout(timer);
      }
    };
    // Decode through the stream so multi-byte UTF-8 sequences that straddle two
    // chunks are not corrupted by per-chunk toString().
    child.stdout?.setEncoding("utf8");
    child.stderr?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr?.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", (error) => {
      stopTimer();
      reject(error);
    });
    // "close" fires only after the stdio streams have ended, so no trailing
    // output is lost; "exit" can fire while data is still buffered in the pipes.
    child.on("close", (code, signal) => {
      stopTimer();
      // `code` is null when the child was terminated by a signal; that must not
      // be reported as a successful exit.
      resolve({ stdout, stderr, exitCode: code ?? (signal ? 1 : 0) });
    });
    child.stdin?.on("error", (error) => {
      // A child that exits without reading stdin makes the write fail with EPIPE;
      // that is not a failure of the command itself, so only surface other errors.
      if (error?.code !== "EPIPE") {
        stopTimer();
        reject(error);
      }
    });
    child.stdin?.end(stdinPayload);
  });
}
|
|
4284
|
+
|
|
4285
|
+
// src/evaluation/evaluators.ts
|
|
3380
4286
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3381
4287
|
|
|
3382
4288
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -3441,6 +4347,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3441
4347
|
null,
|
|
3442
4348
|
2
|
|
3443
4349
|
),
|
|
4350
|
+
[TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
|
|
3444
4351
|
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
3445
4352
|
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
3446
4353
|
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
@@ -3465,7 +4372,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3465
4372
|
const score = clampScore(data.score);
|
|
3466
4373
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3467
4374
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3468
|
-
const reasoning = data.reasoning
|
|
4375
|
+
const reasoning = data.reasoning;
|
|
3469
4376
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
3470
4377
|
return {
|
|
3471
4378
|
score,
|
|
@@ -3567,7 +4474,9 @@ var LlmJudgeEvaluator = class {
|
|
|
3567
4474
|
maxOutputTokens: this.maxOutputTokens,
|
|
3568
4475
|
temperature: this.temperature
|
|
3569
4476
|
});
|
|
3570
|
-
const data = schema.parse(
|
|
4477
|
+
const data = schema.parse(
|
|
4478
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
4479
|
+
);
|
|
3571
4480
|
return { data, providerResponse: response };
|
|
3572
4481
|
} catch (e) {
|
|
3573
4482
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
@@ -3649,17 +4558,17 @@ var CodeEvaluator = class {
|
|
|
3649
4558
|
const inputPayload = JSON.stringify(
|
|
3650
4559
|
{
|
|
3651
4560
|
question: context.evalCase.question,
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
3655
|
-
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
|
|
4561
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
4562
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
4563
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
4564
|
+
candidateAnswer: context.candidate,
|
|
4565
|
+
outputMessages: context.outputMessages ?? null,
|
|
4566
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
4567
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
4568
|
+
(path14) => !context.evalCase.guideline_paths.includes(path14)
|
|
3659
4569
|
),
|
|
3660
|
-
|
|
3661
|
-
|
|
3662
|
-
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
4570
|
+
inputMessages: context.evalCase.input_messages,
|
|
4571
|
+
traceSummary: context.traceSummary ?? null
|
|
3663
4572
|
},
|
|
3664
4573
|
null,
|
|
3665
4574
|
2
|
|
@@ -3729,43 +4638,17 @@ function calculateRubricScore(result, rubrics) {
|
|
|
3729
4638
|
return { score, verdict, hits, misses };
|
|
3730
4639
|
}
|
|
3731
4640
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
3732
|
-
const {
|
|
3733
|
-
|
|
3734
|
-
|
|
3735
|
-
shell: true,
|
|
3736
|
-
cwd
|
|
3737
|
-
});
|
|
3738
|
-
let stdout = "";
|
|
3739
|
-
let stderr = "";
|
|
3740
|
-
const timeout = agentTimeoutMs ? setTimeout(() => {
|
|
3741
|
-
child.kill();
|
|
3742
|
-
reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
|
|
3743
|
-
}, agentTimeoutMs) : void 0;
|
|
3744
|
-
child.stdout?.on("data", (data) => {
|
|
3745
|
-
stdout += data.toString();
|
|
3746
|
-
});
|
|
3747
|
-
child.stderr?.on("data", (data) => {
|
|
3748
|
-
stderr += data.toString();
|
|
3749
|
-
});
|
|
3750
|
-
child.on("error", (error) => {
|
|
3751
|
-
if (timeout !== void 0) {
|
|
3752
|
-
clearTimeout(timeout);
|
|
3753
|
-
}
|
|
3754
|
-
reject(error);
|
|
3755
|
-
});
|
|
3756
|
-
child.on("exit", (code) => {
|
|
3757
|
-
if (timeout !== void 0) {
|
|
3758
|
-
clearTimeout(timeout);
|
|
3759
|
-
}
|
|
3760
|
-
if (code && code !== 0 && stderr.length > 0) {
|
|
3761
|
-
reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
|
|
3762
|
-
return;
|
|
3763
|
-
}
|
|
3764
|
-
resolve(stdout.trim());
|
|
3765
|
-
});
|
|
3766
|
-
child.stdin?.write(input);
|
|
3767
|
-
child.stdin?.end();
|
|
4641
|
+
const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
|
|
4642
|
+
cwd,
|
|
4643
|
+
timeoutMs: agentTimeoutMs
|
|
3768
4644
|
});
|
|
4645
|
+
if (exitCode !== 0) {
|
|
4646
|
+
const trimmedErr = stderr.trim();
|
|
4647
|
+
throw new Error(
|
|
4648
|
+
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
4649
|
+
);
|
|
4650
|
+
}
|
|
4651
|
+
return stdout.trim();
|
|
3769
4652
|
}
|
|
3770
4653
|
function parseJsonSafe(payload) {
|
|
3771
4654
|
try {
|
|
@@ -3779,6 +4662,33 @@ function substituteVariables(template, variables) {
|
|
|
3779
4662
|
return variables[varName] ?? match;
|
|
3780
4663
|
});
|
|
3781
4664
|
}
|
|
4665
|
+
/**
 * Recursive structural equality for plain values, arrays and objects.
 *
 * Primitives are compared with `===` (so `NaN` never equals `NaN`). Arrays must
 * have the same length and pairwise-equal elements. Objects must have the same
 * number of own enumerable keys, and every key of `a` must be an own key of `b`
 * with an equal value. An array never equals a non-array.
 *
 * @param {unknown} a - First value.
 * @param {unknown} b - Second value.
 * @returns {boolean} `true` when the two values are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) {
    return true;
  }
  // Past this point the values are not identical, so only two non-null objects
  // can still turn out to be equal.
  if (a === null || b === null || typeof a !== "object" || typeof b !== "object") {
    return false;
  }
  const leftIsArray = Array.isArray(a);
  if (leftIsArray !== Array.isArray(b)) {
    return false;
  }
  if (leftIsArray) {
    if (a.length !== b.length) {
      return false;
    }
    const hasDifference = a.some((item, index) => !deepEqual(item, b[index]));
    return !hasDifference;
  }
  const leftKeys = Object.keys(a);
  if (leftKeys.length !== Object.keys(b).length) {
    return false;
  }
  for (const key of leftKeys) {
    if (!Object.hasOwn(b, key) || !deepEqual(a[key], b[key])) {
      return false;
    }
  }
  return true;
}
|
|
4682
|
+
/**
 * Check whether a tool call's actual arguments satisfy the expected arguments.
 *
 * Matching is a subset check: every key listed in `expected` must be an own
 * property of `actual` with a deeply equal value (via deepEqual); extra keys in
 * `actual` are ignored.
 *
 * @param {Record<string, unknown> | "any" | null | undefined} expected - Expected
 *   arguments. `undefined`, `null` and the string `"any"` place no constraint on
 *   the call. (`null` is treated like `undefined` here rather than throwing.)
 * @param {Record<string, unknown> | null | undefined} actual - Arguments the tool
 *   was actually called with.
 * @returns {boolean} `true` when the call satisfies the expectation. A call with
 *   no arguments (`undefined` or `null`) never satisfies a constrained expectation.
 */
function argsMatch(expected, actual) {
  if (expected == null || expected === "any") return true;
  // `null` must be rejected alongside `undefined`: Object.hasOwn(null, key) throws.
  if (actual == null) return false;
  return Object.keys(expected).every(
    (key) => Object.hasOwn(actual, key) && deepEqual(expected[key], actual[key])
  );
}
|
|
3782
4692
|
var ToolTrajectoryEvaluator = class {
|
|
3783
4693
|
kind = "tool_trajectory";
|
|
3784
4694
|
config;
|
|
@@ -3786,8 +4696,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3786
4696
|
this.config = options.config;
|
|
3787
4697
|
}
|
|
3788
4698
|
evaluate(context) {
|
|
3789
|
-
const {
|
|
3790
|
-
|
|
4699
|
+
const { outputMessages, traceSummary } = context;
|
|
4700
|
+
const toolCalls = this.extractToolCallsFromMessages(outputMessages);
|
|
4701
|
+
if (toolCalls.length === 0 && !traceSummary) {
|
|
4702
|
+
return {
|
|
4703
|
+
score: 0,
|
|
4704
|
+
verdict: "fail",
|
|
4705
|
+
hits: [],
|
|
4706
|
+
misses: ["No trace available for evaluation"],
|
|
4707
|
+
expectedAspectCount: 1
|
|
4708
|
+
};
|
|
4709
|
+
}
|
|
4710
|
+
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
4711
|
+
if (!summary) {
|
|
3791
4712
|
return {
|
|
3792
4713
|
score: 0,
|
|
3793
4714
|
verdict: "fail",
|
|
@@ -3798,11 +4719,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3798
4719
|
}
|
|
3799
4720
|
switch (this.config.mode) {
|
|
3800
4721
|
case "any_order":
|
|
3801
|
-
return this.evaluateAnyOrder(
|
|
4722
|
+
return this.evaluateAnyOrder(summary);
|
|
3802
4723
|
case "in_order":
|
|
3803
|
-
return this.evaluateInOrder(
|
|
4724
|
+
return this.evaluateInOrder(toolCalls);
|
|
3804
4725
|
case "exact":
|
|
3805
|
-
return this.evaluateExact(
|
|
4726
|
+
return this.evaluateExact(toolCalls);
|
|
3806
4727
|
default:
|
|
3807
4728
|
return {
|
|
3808
4729
|
score: 0,
|
|
@@ -3813,6 +4734,42 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3813
4734
|
};
|
|
3814
4735
|
}
|
|
3815
4736
|
}
|
|
4737
|
+
/**
|
|
4738
|
+
* Extract tool calls from output messages.
|
|
4739
|
+
*/
|
|
4740
|
+
extractToolCallsFromMessages(messages) {
|
|
4741
|
+
if (!messages) {
|
|
4742
|
+
return [];
|
|
4743
|
+
}
|
|
4744
|
+
const toolCalls = [];
|
|
4745
|
+
for (const message of messages) {
|
|
4746
|
+
if (message.toolCalls) {
|
|
4747
|
+
for (const call of message.toolCalls) {
|
|
4748
|
+
toolCalls.push({
|
|
4749
|
+
name: call.tool,
|
|
4750
|
+
args: call.input
|
|
4751
|
+
});
|
|
4752
|
+
}
|
|
4753
|
+
}
|
|
4754
|
+
}
|
|
4755
|
+
return toolCalls;
|
|
4756
|
+
}
|
|
4757
|
+
/**
|
|
4758
|
+
* Build a summary from extracted tool calls.
|
|
4759
|
+
*/
|
|
4760
|
+
buildSummary(toolCalls) {
|
|
4761
|
+
const toolCallsByName = {};
|
|
4762
|
+
for (const call of toolCalls) {
|
|
4763
|
+
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
4764
|
+
}
|
|
4765
|
+
const toolNames = Object.keys(toolCallsByName).sort();
|
|
4766
|
+
return {
|
|
4767
|
+
eventCount: toolCalls.length,
|
|
4768
|
+
toolNames,
|
|
4769
|
+
toolCallsByName,
|
|
4770
|
+
errorCount: 0
|
|
4771
|
+
};
|
|
4772
|
+
}
|
|
3816
4773
|
evaluateAnyOrder(summary) {
|
|
3817
4774
|
const minimums = this.config.minimums ?? {};
|
|
3818
4775
|
const toolNames = Object.keys(minimums);
|
|
@@ -3845,7 +4802,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3845
4802
|
expectedAspectCount: toolNames.length
|
|
3846
4803
|
};
|
|
3847
4804
|
}
|
|
3848
|
-
evaluateInOrder(
|
|
4805
|
+
evaluateInOrder(toolCalls) {
|
|
3849
4806
|
const expected = this.config.expected ?? [];
|
|
3850
4807
|
if (expected.length === 0) {
|
|
3851
4808
|
return {
|
|
@@ -3856,23 +4813,33 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3856
4813
|
expectedAspectCount: 0
|
|
3857
4814
|
};
|
|
3858
4815
|
}
|
|
3859
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
3860
4816
|
const hits = [];
|
|
3861
4817
|
const misses = [];
|
|
3862
4818
|
let actualIndex = 0;
|
|
3863
4819
|
for (let i = 0; i < expected.length; i++) {
|
|
3864
|
-
const
|
|
4820
|
+
const expectedItem = expected[i];
|
|
4821
|
+
const expectedTool = expectedItem.tool;
|
|
3865
4822
|
let found = false;
|
|
3866
|
-
|
|
3867
|
-
|
|
3868
|
-
|
|
4823
|
+
let argsMismatch = false;
|
|
4824
|
+
while (actualIndex < toolCalls.length) {
|
|
4825
|
+
const actualCall = toolCalls[actualIndex];
|
|
4826
|
+
if (actualCall.name === expectedTool) {
|
|
4827
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
4828
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
4829
|
+
actualIndex++;
|
|
4830
|
+
found = true;
|
|
4831
|
+
break;
|
|
4832
|
+
}
|
|
4833
|
+
misses.push(
|
|
4834
|
+
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
4835
|
+
);
|
|
3869
4836
|
actualIndex++;
|
|
3870
|
-
|
|
4837
|
+
argsMismatch = true;
|
|
3871
4838
|
break;
|
|
3872
4839
|
}
|
|
3873
4840
|
actualIndex++;
|
|
3874
4841
|
}
|
|
3875
|
-
if (!found) {
|
|
4842
|
+
if (!found && !argsMismatch) {
|
|
3876
4843
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
3877
4844
|
}
|
|
3878
4845
|
}
|
|
@@ -3885,7 +4852,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3885
4852
|
expectedAspectCount: expected.length
|
|
3886
4853
|
};
|
|
3887
4854
|
}
|
|
3888
|
-
evaluateExact(
|
|
4855
|
+
evaluateExact(toolCalls) {
|
|
3889
4856
|
const expected = this.config.expected ?? [];
|
|
3890
4857
|
if (expected.length === 0) {
|
|
3891
4858
|
return {
|
|
@@ -3896,18 +4863,23 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3896
4863
|
expectedAspectCount: 0
|
|
3897
4864
|
};
|
|
3898
4865
|
}
|
|
3899
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
3900
4866
|
const hits = [];
|
|
3901
4867
|
const misses = [];
|
|
3902
|
-
if (
|
|
3903
|
-
misses.push(`Expected ${expected.length} tool calls, got ${
|
|
4868
|
+
if (toolCalls.length !== expected.length) {
|
|
4869
|
+
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
3904
4870
|
}
|
|
3905
|
-
const checkLength = Math.min(expected.length,
|
|
4871
|
+
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
3906
4872
|
for (let i = 0; i < checkLength; i++) {
|
|
3907
|
-
const
|
|
3908
|
-
const
|
|
4873
|
+
const expectedItem = expected[i];
|
|
4874
|
+
const expectedTool = expectedItem.tool;
|
|
4875
|
+
const actualCall = toolCalls[i];
|
|
4876
|
+
const actualTool = actualCall.name;
|
|
3909
4877
|
if (actualTool === expectedTool) {
|
|
3910
|
-
|
|
4878
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
4879
|
+
hits.push(`Position ${i}: ${expectedTool}`);
|
|
4880
|
+
} else {
|
|
4881
|
+
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
4882
|
+
}
|
|
3911
4883
|
} else {
|
|
3912
4884
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
3913
4885
|
}
|
|
@@ -4119,11 +5091,13 @@ var CompositeEvaluator = class {
|
|
|
4119
5091
|
evalCaseId: context.evalCase.id,
|
|
4120
5092
|
attempt: context.attempt
|
|
4121
5093
|
});
|
|
4122
|
-
const data = freeformEvaluationSchema.parse(
|
|
5094
|
+
const data = freeformEvaluationSchema.parse(
|
|
5095
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
5096
|
+
);
|
|
4123
5097
|
const score = clampScore(data.score);
|
|
4124
5098
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4125
5099
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4126
|
-
const reasoning = data.reasoning
|
|
5100
|
+
const reasoning = data.reasoning;
|
|
4127
5101
|
return {
|
|
4128
5102
|
score,
|
|
4129
5103
|
verdict: scoreToVerdict(score),
|
|
@@ -4149,9 +5123,9 @@ var CompositeEvaluator = class {
|
|
|
4149
5123
|
};
|
|
4150
5124
|
|
|
4151
5125
|
// src/evaluation/orchestrator.ts
|
|
4152
|
-
import { createHash, randomUUID as
|
|
4153
|
-
import { mkdir as
|
|
4154
|
-
import
|
|
5126
|
+
import { createHash, randomUUID as randomUUID3 } from "node:crypto";
|
|
5127
|
+
import { mkdir as mkdir3, writeFile as writeFile3 } from "node:fs/promises";
|
|
5128
|
+
import path13 from "node:path";
|
|
4155
5129
|
|
|
4156
5130
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
4157
5131
|
var Node = class {
|
|
@@ -4546,11 +5520,19 @@ async function runBatchEvaluation(options) {
|
|
|
4546
5520
|
const evalCase = evalCases[i];
|
|
4547
5521
|
const promptInputs = promptInputsList[i];
|
|
4548
5522
|
const providerResponse = batchResponse[i];
|
|
5523
|
+
const outputMessages = providerResponse.outputMessages;
|
|
5524
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
5525
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
5526
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
5527
|
+
costUsd: providerResponse.costUsd,
|
|
5528
|
+
durationMs: providerResponse.durationMs
|
|
5529
|
+
}) : void 0;
|
|
5530
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
4549
5531
|
let result;
|
|
4550
5532
|
try {
|
|
4551
5533
|
result = await evaluateCandidate({
|
|
4552
5534
|
evalCase,
|
|
4553
|
-
candidate
|
|
5535
|
+
candidate,
|
|
4554
5536
|
target,
|
|
4555
5537
|
provider,
|
|
4556
5538
|
evaluators: evaluatorRegistry,
|
|
@@ -4558,7 +5540,9 @@ async function runBatchEvaluation(options) {
|
|
|
4558
5540
|
nowFn,
|
|
4559
5541
|
attempt: 0,
|
|
4560
5542
|
judgeProvider: await resolveJudgeProvider(target),
|
|
4561
|
-
agentTimeoutMs
|
|
5543
|
+
agentTimeoutMs,
|
|
5544
|
+
outputMessages,
|
|
5545
|
+
traceSummary
|
|
4562
5546
|
});
|
|
4563
5547
|
} catch (error) {
|
|
4564
5548
|
const errorResult = buildErrorResult(
|
|
@@ -4662,21 +5646,18 @@ async function runEvalCase(options) {
|
|
|
4662
5646
|
if (cacheKey && cache && !cachedResponse) {
|
|
4663
5647
|
await cache.set(cacheKey, providerResponse);
|
|
4664
5648
|
}
|
|
4665
|
-
|
|
4666
|
-
|
|
4667
|
-
|
|
4668
|
-
|
|
4669
|
-
|
|
4670
|
-
|
|
4671
|
-
|
|
4672
|
-
|
|
4673
|
-
}
|
|
4674
|
-
}
|
|
4675
|
-
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
5649
|
+
const outputMessages = providerResponse.outputMessages;
|
|
5650
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
5651
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
5652
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
5653
|
+
costUsd: providerResponse.costUsd,
|
|
5654
|
+
durationMs: providerResponse.durationMs
|
|
5655
|
+
}) : void 0;
|
|
5656
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
4676
5657
|
try {
|
|
4677
5658
|
return await evaluateCandidate({
|
|
4678
5659
|
evalCase,
|
|
4679
|
-
candidate
|
|
5660
|
+
candidate,
|
|
4680
5661
|
target,
|
|
4681
5662
|
provider,
|
|
4682
5663
|
evaluators,
|
|
@@ -4685,9 +5666,8 @@ async function runEvalCase(options) {
|
|
|
4685
5666
|
attempt,
|
|
4686
5667
|
judgeProvider,
|
|
4687
5668
|
agentTimeoutMs,
|
|
4688
|
-
|
|
4689
|
-
|
|
4690
|
-
candidateTraceSummary
|
|
5669
|
+
outputMessages,
|
|
5670
|
+
traceSummary
|
|
4691
5671
|
});
|
|
4692
5672
|
} catch (error) {
|
|
4693
5673
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -4705,9 +5685,8 @@ async function evaluateCandidate(options) {
|
|
|
4705
5685
|
attempt,
|
|
4706
5686
|
judgeProvider,
|
|
4707
5687
|
agentTimeoutMs,
|
|
4708
|
-
|
|
4709
|
-
|
|
4710
|
-
candidateTraceSummary
|
|
5688
|
+
outputMessages,
|
|
5689
|
+
traceSummary
|
|
4711
5690
|
} = options;
|
|
4712
5691
|
const gradeTimestamp = nowFn();
|
|
4713
5692
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -4721,9 +5700,8 @@ async function evaluateCandidate(options) {
|
|
|
4721
5700
|
now: gradeTimestamp,
|
|
4722
5701
|
judgeProvider,
|
|
4723
5702
|
agentTimeoutMs,
|
|
4724
|
-
|
|
4725
|
-
|
|
4726
|
-
candidateTraceSummary
|
|
5703
|
+
outputMessages,
|
|
5704
|
+
traceSummary
|
|
4727
5705
|
});
|
|
4728
5706
|
const completedAt = nowFn();
|
|
4729
5707
|
let agentProviderRequest;
|
|
@@ -4747,21 +5725,21 @@ async function evaluateCandidate(options) {
|
|
|
4747
5725
|
}
|
|
4748
5726
|
return {
|
|
4749
5727
|
timestamp: completedAt.toISOString(),
|
|
4750
|
-
|
|
5728
|
+
evalId: evalCase.id,
|
|
4751
5729
|
dataset: evalCase.dataset,
|
|
4752
|
-
|
|
5730
|
+
conversationId: evalCase.conversation_id,
|
|
4753
5731
|
score: score.score,
|
|
4754
5732
|
hits: score.hits,
|
|
4755
5733
|
misses: score.misses,
|
|
4756
|
-
|
|
5734
|
+
candidateAnswer: candidate,
|
|
4757
5735
|
target: target.name,
|
|
4758
5736
|
reasoning: score.reasoning,
|
|
4759
|
-
|
|
4760
|
-
|
|
4761
|
-
|
|
4762
|
-
|
|
4763
|
-
|
|
4764
|
-
|
|
5737
|
+
rawAspects: score.rawAspects,
|
|
5738
|
+
agentProviderRequest,
|
|
5739
|
+
lmProviderRequest,
|
|
5740
|
+
evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
5741
|
+
evaluatorResults,
|
|
5742
|
+
traceSummary
|
|
4765
5743
|
};
|
|
4766
5744
|
}
|
|
4767
5745
|
async function runEvaluatorsForCase(options) {
|
|
@@ -4776,9 +5754,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
4776
5754
|
now,
|
|
4777
5755
|
judgeProvider,
|
|
4778
5756
|
agentTimeoutMs,
|
|
4779
|
-
|
|
4780
|
-
|
|
4781
|
-
candidateTraceSummary
|
|
5757
|
+
outputMessages,
|
|
5758
|
+
traceSummary
|
|
4782
5759
|
} = options;
|
|
4783
5760
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4784
5761
|
return runEvaluatorList({
|
|
@@ -4793,9 +5770,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
4793
5770
|
now,
|
|
4794
5771
|
judgeProvider,
|
|
4795
5772
|
agentTimeoutMs,
|
|
4796
|
-
|
|
4797
|
-
|
|
4798
|
-
candidateTraceSummary
|
|
5773
|
+
outputMessages,
|
|
5774
|
+
traceSummary
|
|
4799
5775
|
});
|
|
4800
5776
|
}
|
|
4801
5777
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -4812,9 +5788,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
4812
5788
|
promptInputs,
|
|
4813
5789
|
now,
|
|
4814
5790
|
judgeProvider,
|
|
4815
|
-
|
|
4816
|
-
|
|
4817
|
-
candidateTraceSummary
|
|
5791
|
+
outputMessages,
|
|
5792
|
+
traceSummary
|
|
4818
5793
|
});
|
|
4819
5794
|
return { score };
|
|
4820
5795
|
}
|
|
@@ -4831,9 +5806,8 @@ async function runEvaluatorList(options) {
|
|
|
4831
5806
|
now,
|
|
4832
5807
|
judgeProvider,
|
|
4833
5808
|
agentTimeoutMs,
|
|
4834
|
-
|
|
4835
|
-
|
|
4836
|
-
candidateTraceSummary
|
|
5809
|
+
outputMessages,
|
|
5810
|
+
traceSummary
|
|
4837
5811
|
} = options;
|
|
4838
5812
|
const scored = [];
|
|
4839
5813
|
const evaluatorResults = [];
|
|
@@ -4863,7 +5837,7 @@ async function runEvaluatorList(options) {
|
|
|
4863
5837
|
hits: score2.hits,
|
|
4864
5838
|
misses: score2.misses,
|
|
4865
5839
|
reasoning: score2.reasoning,
|
|
4866
|
-
|
|
5840
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
4867
5841
|
});
|
|
4868
5842
|
}
|
|
4869
5843
|
if (evaluator.type === "code") {
|
|
@@ -4880,8 +5854,8 @@ async function runEvaluatorList(options) {
|
|
|
4880
5854
|
attempt,
|
|
4881
5855
|
promptInputs,
|
|
4882
5856
|
now,
|
|
4883
|
-
|
|
4884
|
-
|
|
5857
|
+
outputMessages,
|
|
5858
|
+
traceSummary
|
|
4885
5859
|
});
|
|
4886
5860
|
const weight = evaluator.weight ?? 1;
|
|
4887
5861
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -4894,11 +5868,11 @@ async function runEvaluatorList(options) {
|
|
|
4894
5868
|
hits: score2.hits,
|
|
4895
5869
|
misses: score2.misses,
|
|
4896
5870
|
reasoning: score2.reasoning,
|
|
4897
|
-
|
|
5871
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
4898
5872
|
});
|
|
4899
5873
|
}
|
|
4900
5874
|
if (evaluator.type === "composite") {
|
|
4901
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
5875
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path13.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
4902
5876
|
const createEvaluator = (memberConfig) => {
|
|
4903
5877
|
switch (memberConfig.type) {
|
|
4904
5878
|
case "llm_judge":
|
|
@@ -4951,8 +5925,8 @@ async function runEvaluatorList(options) {
|
|
|
4951
5925
|
hits: score2.hits,
|
|
4952
5926
|
misses: score2.misses,
|
|
4953
5927
|
reasoning: score2.reasoning,
|
|
4954
|
-
|
|
4955
|
-
|
|
5928
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
5929
|
+
evaluatorResults: mapChildResults(score2.evaluatorResults)
|
|
4956
5930
|
});
|
|
4957
5931
|
}
|
|
4958
5932
|
if (evaluator.type === "tool_trajectory") {
|
|
@@ -4967,9 +5941,8 @@ async function runEvaluatorList(options) {
|
|
|
4967
5941
|
attempt,
|
|
4968
5942
|
promptInputs,
|
|
4969
5943
|
now,
|
|
4970
|
-
|
|
4971
|
-
|
|
4972
|
-
candidateTraceSummary
|
|
5944
|
+
outputMessages,
|
|
5945
|
+
traceSummary
|
|
4973
5946
|
});
|
|
4974
5947
|
const weight = evaluator.weight ?? 1;
|
|
4975
5948
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -5111,22 +6084,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
5111
6084
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
5112
6085
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
5113
6086
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
5114
|
-
const filePath =
|
|
5115
|
-
await
|
|
6087
|
+
const filePath = path13.resolve(directory, filename);
|
|
6088
|
+
await mkdir3(path13.dirname(filePath), { recursive: true });
|
|
5116
6089
|
const payload = {
|
|
5117
6090
|
eval_id: evalCase.id,
|
|
5118
6091
|
question: promptInputs.question,
|
|
5119
6092
|
guidelines: promptInputs.guidelines,
|
|
5120
6093
|
guideline_paths: evalCase.guideline_paths
|
|
5121
6094
|
};
|
|
5122
|
-
await
|
|
6095
|
+
await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
5123
6096
|
}
|
|
5124
6097
|
function sanitizeFilename(value) {
|
|
5125
6098
|
if (!value) {
|
|
5126
6099
|
return "prompt";
|
|
5127
6100
|
}
|
|
5128
6101
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
5129
|
-
return sanitized.length > 0 ? sanitized :
|
|
6102
|
+
return sanitized.length > 0 ? sanitized : randomUUID3();
|
|
5130
6103
|
}
|
|
5131
6104
|
async function invokeProvider(provider, options) {
|
|
5132
6105
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -5183,17 +6156,17 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
5183
6156
|
}
|
|
5184
6157
|
return {
|
|
5185
6158
|
timestamp: timestamp.toISOString(),
|
|
5186
|
-
|
|
6159
|
+
evalId: evalCase.id,
|
|
5187
6160
|
dataset: evalCase.dataset,
|
|
5188
|
-
|
|
6161
|
+
conversationId: evalCase.conversation_id,
|
|
5189
6162
|
score: 0,
|
|
5190
6163
|
hits: [],
|
|
5191
6164
|
misses: [`Error: ${message}`],
|
|
5192
|
-
|
|
6165
|
+
candidateAnswer: `Error occurred: ${message}`,
|
|
5193
6166
|
target: targetName,
|
|
5194
|
-
|
|
5195
|
-
|
|
5196
|
-
|
|
6167
|
+
rawAspects: [],
|
|
6168
|
+
agentProviderRequest,
|
|
6169
|
+
lmProviderRequest,
|
|
5197
6170
|
error: message
|
|
5198
6171
|
};
|
|
5199
6172
|
}
|
|
@@ -5238,8 +6211,8 @@ function mapChildResults(children) {
|
|
|
5238
6211
|
hits: child.hits,
|
|
5239
6212
|
misses: child.misses,
|
|
5240
6213
|
reasoning: child.reasoning,
|
|
5241
|
-
|
|
5242
|
-
|
|
6214
|
+
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
6215
|
+
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
5243
6216
|
}));
|
|
5244
6217
|
}
|
|
5245
6218
|
function computeWeightedMean(entries) {
|
|
@@ -5340,17 +6313,21 @@ function createAgentKernel() {
|
|
|
5340
6313
|
export {
|
|
5341
6314
|
CodeEvaluator,
|
|
5342
6315
|
CompositeEvaluator,
|
|
6316
|
+
DEFAULT_EXPLORATION_TOOLS,
|
|
5343
6317
|
LlmJudgeEvaluator,
|
|
5344
6318
|
TEST_MESSAGE_ROLES,
|
|
5345
6319
|
ToolTrajectoryEvaluator,
|
|
6320
|
+
avgToolDurationMs,
|
|
5346
6321
|
buildDirectoryChain,
|
|
5347
6322
|
buildPromptInputs,
|
|
5348
6323
|
buildSearchRoots,
|
|
5349
6324
|
computeTraceSummary,
|
|
5350
6325
|
consumeCodexLogEntries,
|
|
6326
|
+
consumePiLogEntries,
|
|
5351
6327
|
createAgentKernel,
|
|
5352
6328
|
createProvider,
|
|
5353
6329
|
ensureVSCodeSubagents,
|
|
6330
|
+
explorationRatio,
|
|
5354
6331
|
extractCodeBlocks,
|
|
5355
6332
|
fileExists,
|
|
5356
6333
|
findGitRoot,
|
|
@@ -5362,10 +6339,9 @@ export {
|
|
|
5362
6339
|
isJsonValue,
|
|
5363
6340
|
isTestMessage,
|
|
5364
6341
|
isTestMessageRole,
|
|
5365
|
-
isTraceEvent,
|
|
5366
|
-
isTraceEventType,
|
|
5367
6342
|
listTargetNames,
|
|
5368
6343
|
loadEvalCases,
|
|
6344
|
+
mergeExecutionMetrics,
|
|
5369
6345
|
normalizeLineEndings,
|
|
5370
6346
|
readJsonFile,
|
|
5371
6347
|
readTargetDefinitions,
|
|
@@ -5376,6 +6352,8 @@ export {
|
|
|
5376
6352
|
resolveTargetDefinition,
|
|
5377
6353
|
runEvalCase,
|
|
5378
6354
|
runEvaluation,
|
|
5379
|
-
subscribeToCodexLogEntries
|
|
6355
|
+
subscribeToCodexLogEntries,
|
|
6356
|
+
subscribeToPiLogEntries,
|
|
6357
|
+
tokensPerTool
|
|
5380
6358
|
};
|
|
5381
6359
|
//# sourceMappingURL=index.js.map
|