@agentv/core 1.3.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-4A6L2F6L.js → chunk-E2VSU4WZ.js} +282 -81
- package/dist/chunk-E2VSU4WZ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +82 -67
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -68
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1668 -489
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +279 -77
- package/dist/index.d.ts +279 -77
- package/dist/index.js +1334 -356
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
- package/dist/chunk-4A6L2F6L.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -32,17 +32,21 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
35
|
+
DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
|
|
35
36
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
36
37
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
37
38
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
39
|
+
avgToolDurationMs: () => avgToolDurationMs,
|
|
38
40
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
39
41
|
buildPromptInputs: () => buildPromptInputs,
|
|
40
42
|
buildSearchRoots: () => buildSearchRoots2,
|
|
41
43
|
computeTraceSummary: () => computeTraceSummary,
|
|
42
44
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
45
|
+
consumePiLogEntries: () => consumePiLogEntries,
|
|
43
46
|
createAgentKernel: () => createAgentKernel,
|
|
44
47
|
createProvider: () => createProvider,
|
|
45
48
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
49
|
+
explorationRatio: () => explorationRatio,
|
|
46
50
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
47
51
|
fileExists: () => fileExists2,
|
|
48
52
|
findGitRoot: () => findGitRoot,
|
|
@@ -54,10 +58,9 @@ __export(index_exports, {
|
|
|
54
58
|
isJsonValue: () => isJsonValue,
|
|
55
59
|
isTestMessage: () => isTestMessage,
|
|
56
60
|
isTestMessageRole: () => isTestMessageRole,
|
|
57
|
-
isTraceEvent: () => isTraceEvent,
|
|
58
|
-
isTraceEventType: () => isTraceEventType,
|
|
59
61
|
listTargetNames: () => listTargetNames,
|
|
60
62
|
loadEvalCases: () => loadEvalCases,
|
|
63
|
+
mergeExecutionMetrics: () => mergeExecutionMetrics,
|
|
61
64
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
62
65
|
readJsonFile: () => readJsonFile,
|
|
63
66
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
@@ -68,7 +71,9 @@ __export(index_exports, {
|
|
|
68
71
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
69
72
|
runEvalCase: () => runEvalCase,
|
|
70
73
|
runEvaluation: () => runEvaluation,
|
|
71
|
-
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
|
|
74
|
+
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
|
|
75
|
+
subscribeToPiLogEntries: () => subscribeToPiLogEntries,
|
|
76
|
+
tokensPerTool: () => tokensPerTool
|
|
72
77
|
});
|
|
73
78
|
module.exports = __toCommonJS(index_exports);
|
|
74
79
|
|
|
@@ -135,33 +140,69 @@ function getHitCount(result) {
|
|
|
135
140
|
}
|
|
136
141
|
|
|
137
142
|
// src/evaluation/trace.ts
|
|
138
|
-
function
|
|
139
|
-
return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
|
|
140
|
-
}
|
|
141
|
-
function isTraceEvent(value) {
|
|
142
|
-
if (typeof value !== "object" || value === null) {
|
|
143
|
-
return false;
|
|
144
|
-
}
|
|
145
|
-
const candidate = value;
|
|
146
|
-
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
147
|
-
}
|
|
148
|
-
function computeTraceSummary(trace) {
|
|
143
|
+
function computeTraceSummary(messages) {
|
|
149
144
|
const toolCallCounts = {};
|
|
150
|
-
let
|
|
151
|
-
for (const
|
|
152
|
-
if (
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
errorCount++;
|
|
145
|
+
let totalToolCalls = 0;
|
|
146
|
+
for (const message of messages) {
|
|
147
|
+
if (!message.toolCalls) continue;
|
|
148
|
+
for (const toolCall of message.toolCalls) {
|
|
149
|
+
toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
|
|
150
|
+
totalToolCalls++;
|
|
157
151
|
}
|
|
158
152
|
}
|
|
159
153
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
160
154
|
return {
|
|
161
|
-
eventCount:
|
|
155
|
+
eventCount: totalToolCalls,
|
|
162
156
|
toolNames,
|
|
163
157
|
toolCallsByName: toolCallCounts,
|
|
164
|
-
errorCount
|
|
158
|
+
errorCount: 0
|
|
159
|
+
};
|
|
160
|
+
}
|
|
161
|
+
var DEFAULT_EXPLORATION_TOOLS = [
|
|
162
|
+
"read",
|
|
163
|
+
"grep",
|
|
164
|
+
"glob",
|
|
165
|
+
"search",
|
|
166
|
+
"list",
|
|
167
|
+
"Read",
|
|
168
|
+
"Grep",
|
|
169
|
+
"Glob",
|
|
170
|
+
"WebSearch",
|
|
171
|
+
"WebFetch"
|
|
172
|
+
];
|
|
173
|
+
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
|
|
174
|
+
if (summary.eventCount === 0) return void 0;
|
|
175
|
+
const explorationCalls = explorationTools.reduce(
|
|
176
|
+
(sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0),
|
|
177
|
+
0
|
|
178
|
+
);
|
|
179
|
+
return explorationCalls / summary.eventCount;
|
|
180
|
+
}
|
|
181
|
+
function tokensPerTool(summary) {
|
|
182
|
+
if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
|
|
183
|
+
const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
|
|
184
|
+
return totalTokens / summary.eventCount;
|
|
185
|
+
}
|
|
186
|
+
function avgToolDurationMs(summary) {
|
|
187
|
+
if (!summary.toolDurations) return void 0;
|
|
188
|
+
let totalDuration = 0;
|
|
189
|
+
let totalCalls = 0;
|
|
190
|
+
for (const durations of Object.values(summary.toolDurations)) {
|
|
191
|
+
for (const duration of durations) {
|
|
192
|
+
totalDuration += duration;
|
|
193
|
+
totalCalls++;
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
if (totalCalls === 0) return void 0;
|
|
197
|
+
return totalDuration / totalCalls;
|
|
198
|
+
}
|
|
199
|
+
function mergeExecutionMetrics(summary, metrics) {
|
|
200
|
+
if (!metrics) return summary;
|
|
201
|
+
return {
|
|
202
|
+
...summary,
|
|
203
|
+
tokenUsage: metrics.tokenUsage,
|
|
204
|
+
costUsd: metrics.costUsd,
|
|
205
|
+
durationMs: metrics.durationMs
|
|
165
206
|
};
|
|
166
207
|
}
|
|
167
208
|
|
|
@@ -437,7 +478,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
437
478
|
QUESTION: "question",
|
|
438
479
|
EXPECTED_OUTCOME: "expected_outcome",
|
|
439
480
|
REFERENCE_ANSWER: "reference_answer",
|
|
440
|
-
INPUT_MESSAGES: "input_messages"
|
|
481
|
+
INPUT_MESSAGES: "input_messages",
|
|
482
|
+
OUTPUT_MESSAGES: "output_messages"
|
|
441
483
|
};
|
|
442
484
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
443
485
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
@@ -677,7 +719,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
677
719
|
expected = [];
|
|
678
720
|
for (const item of rawExpected) {
|
|
679
721
|
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
680
|
-
|
|
722
|
+
let args;
|
|
723
|
+
if (item.args === "any") {
|
|
724
|
+
args = "any";
|
|
725
|
+
} else if (isJsonObject2(item.args)) {
|
|
726
|
+
args = item.args;
|
|
727
|
+
}
|
|
728
|
+
expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
|
|
681
729
|
}
|
|
682
730
|
}
|
|
683
731
|
}
|
|
@@ -1320,16 +1368,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1320
1368
|
}) : [];
|
|
1321
1369
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1322
1370
|
let referenceAnswer = "";
|
|
1323
|
-
if (outputSegments.length >
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
const
|
|
1327
|
-
if (typeof
|
|
1328
|
-
referenceAnswer =
|
|
1329
|
-
} else if (
|
|
1330
|
-
referenceAnswer = JSON.stringify(
|
|
1331
|
-
} else if (
|
|
1332
|
-
referenceAnswer = JSON.stringify(
|
|
1371
|
+
if (outputSegments.length > 0) {
|
|
1372
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1373
|
+
const content = lastMessage.content;
|
|
1374
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1375
|
+
if (typeof content === "string") {
|
|
1376
|
+
referenceAnswer = content;
|
|
1377
|
+
} else if (content !== void 0 && content !== null) {
|
|
1378
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1379
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1380
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1333
1381
|
}
|
|
1334
1382
|
}
|
|
1335
1383
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
@@ -1772,11 +1820,11 @@ async function invokeModel(options) {
|
|
|
1772
1820
|
return mapResponse(result);
|
|
1773
1821
|
}
|
|
1774
1822
|
function mapResponse(result) {
|
|
1823
|
+
const content = result.text ?? "";
|
|
1775
1824
|
return {
|
|
1776
|
-
text: result.text ?? "",
|
|
1777
|
-
reasoning: result.reasoningText ?? void 0,
|
|
1778
1825
|
raw: result,
|
|
1779
|
-
usage: toJsonObject(result.totalUsage ?? result.usage)
|
|
1826
|
+
usage: toJsonObject(result.totalUsage ?? result.usage),
|
|
1827
|
+
outputMessages: [{ role: "assistant", content }]
|
|
1780
1828
|
};
|
|
1781
1829
|
}
|
|
1782
1830
|
function toJsonObject(value) {
|
|
@@ -1929,6 +1977,7 @@ var CliProvider = class {
|
|
|
1929
1977
|
config;
|
|
1930
1978
|
runCommand;
|
|
1931
1979
|
verbose;
|
|
1980
|
+
keepTempFiles;
|
|
1932
1981
|
healthcheckPromise;
|
|
1933
1982
|
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1934
1983
|
this.targetName = targetName;
|
|
@@ -1936,6 +1985,7 @@ var CliProvider = class {
|
|
|
1936
1985
|
this.config = config;
|
|
1937
1986
|
this.runCommand = runner;
|
|
1938
1987
|
this.verbose = config.verbose ?? false;
|
|
1988
|
+
this.keepTempFiles = config.keepTempFiles ?? false;
|
|
1939
1989
|
}
|
|
1940
1990
|
async invoke(request) {
|
|
1941
1991
|
if (request.signal?.aborted) {
|
|
@@ -1950,12 +2000,14 @@ var CliProvider = class {
|
|
|
1950
2000
|
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1951
2001
|
);
|
|
1952
2002
|
}
|
|
2003
|
+
const startTime = Date.now();
|
|
1953
2004
|
const result = await this.runCommand(renderedCommand, {
|
|
1954
2005
|
cwd: this.config.cwd,
|
|
1955
2006
|
env: process.env,
|
|
1956
2007
|
timeoutMs: this.config.timeoutMs,
|
|
1957
2008
|
signal: request.signal
|
|
1958
2009
|
});
|
|
2010
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
1959
2011
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
1960
2012
|
if (request.signal?.aborted) {
|
|
1961
2013
|
throw new Error("CLI provider request was aborted");
|
|
@@ -1973,8 +2025,10 @@ var CliProvider = class {
|
|
|
1973
2025
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
1974
2026
|
const parsed = this.parseOutputContent(responseContent);
|
|
1975
2027
|
return {
|
|
1976
|
-
|
|
1977
|
-
|
|
2028
|
+
outputMessages: parsed.outputMessages,
|
|
2029
|
+
tokenUsage: parsed.tokenUsage,
|
|
2030
|
+
costUsd: parsed.costUsd,
|
|
2031
|
+
durationMs: parsed.durationMs ?? measuredDurationMs,
|
|
1978
2032
|
raw: {
|
|
1979
2033
|
command: renderedCommand,
|
|
1980
2034
|
stderr: result.stderr,
|
|
@@ -2022,12 +2076,14 @@ var CliProvider = class {
|
|
|
2022
2076
|
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
2023
2077
|
);
|
|
2024
2078
|
}
|
|
2079
|
+
const startTime = Date.now();
|
|
2025
2080
|
const result = await this.runCommand(renderedCommand, {
|
|
2026
2081
|
cwd: this.config.cwd,
|
|
2027
2082
|
env: process.env,
|
|
2028
2083
|
timeoutMs: this.config.timeoutMs,
|
|
2029
2084
|
signal: controller.signal
|
|
2030
2085
|
});
|
|
2086
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
2031
2087
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
2032
2088
|
if (controller.signal.aborted) {
|
|
2033
2089
|
throw new Error("CLI provider request was aborted");
|
|
@@ -2049,11 +2105,13 @@ var CliProvider = class {
|
|
|
2049
2105
|
if (missingIds.length > 0) {
|
|
2050
2106
|
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
2051
2107
|
}
|
|
2108
|
+
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
2052
2109
|
const responses = requests.map((request) => {
|
|
2053
2110
|
const evalCaseId = request.evalCaseId;
|
|
2054
2111
|
if (!evalCaseId) {
|
|
2055
2112
|
return {
|
|
2056
|
-
|
|
2113
|
+
outputMessages: [],
|
|
2114
|
+
durationMs: perRequestFallbackMs,
|
|
2057
2115
|
raw: {
|
|
2058
2116
|
command: renderedCommand,
|
|
2059
2117
|
stderr: result.stderr,
|
|
@@ -2066,7 +2124,8 @@ var CliProvider = class {
|
|
|
2066
2124
|
const parsed = recordsById.get(evalCaseId);
|
|
2067
2125
|
if (!parsed) {
|
|
2068
2126
|
return {
|
|
2069
|
-
|
|
2127
|
+
outputMessages: [],
|
|
2128
|
+
durationMs: perRequestFallbackMs,
|
|
2070
2129
|
raw: {
|
|
2071
2130
|
command: renderedCommand,
|
|
2072
2131
|
stderr: result.stderr,
|
|
@@ -2077,9 +2136,10 @@ var CliProvider = class {
|
|
|
2077
2136
|
};
|
|
2078
2137
|
}
|
|
2079
2138
|
return {
|
|
2080
|
-
|
|
2081
|
-
|
|
2082
|
-
|
|
2139
|
+
outputMessages: parsed.outputMessages,
|
|
2140
|
+
tokenUsage: parsed.tokenUsage,
|
|
2141
|
+
costUsd: parsed.costUsd,
|
|
2142
|
+
durationMs: parsed.durationMs ?? perRequestFallbackMs,
|
|
2083
2143
|
raw: {
|
|
2084
2144
|
command: renderedCommand,
|
|
2085
2145
|
stderr: result.stderr,
|
|
@@ -2094,28 +2154,111 @@ var CliProvider = class {
|
|
|
2094
2154
|
}
|
|
2095
2155
|
/**
|
|
2096
2156
|
* Parse output content from CLI.
|
|
2097
|
-
* If the content is valid JSON with
|
|
2098
|
-
*
|
|
2157
|
+
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
2158
|
+
* If only 'text' is provided, wrap it in outputMessages.
|
|
2159
|
+
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
2160
|
+
*
|
|
2161
|
+
* Also extracts optional execution metrics:
|
|
2162
|
+
* - token_usage: { input, output, cached? }
|
|
2163
|
+
* - cost_usd: number
|
|
2164
|
+
* - duration_ms: number
|
|
2099
2165
|
*/
|
|
2100
2166
|
parseOutputContent(content) {
|
|
2101
2167
|
try {
|
|
2102
2168
|
const parsed = JSON.parse(content);
|
|
2103
|
-
if (typeof parsed === "object" && parsed !== null
|
|
2169
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
2104
2170
|
const obj = parsed;
|
|
2105
|
-
const
|
|
2106
|
-
const
|
|
2107
|
-
|
|
2171
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2172
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2173
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2174
|
+
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2175
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
2176
|
+
return { outputMessages, tokenUsage, costUsd, durationMs };
|
|
2177
|
+
}
|
|
2178
|
+
if ("text" in obj) {
|
|
2179
|
+
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2180
|
+
return {
|
|
2181
|
+
outputMessages: [{ role: "assistant", content: text }],
|
|
2182
|
+
tokenUsage,
|
|
2183
|
+
costUsd,
|
|
2184
|
+
durationMs
|
|
2185
|
+
};
|
|
2186
|
+
}
|
|
2108
2187
|
}
|
|
2109
2188
|
} catch {
|
|
2110
2189
|
}
|
|
2111
|
-
return {
|
|
2190
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
2191
|
+
}
|
|
2192
|
+
/**
|
|
2193
|
+
* Parse token_usage from CLI output.
|
|
2194
|
+
*/
|
|
2195
|
+
parseTokenUsage(tokenUsage) {
|
|
2196
|
+
if (typeof tokenUsage !== "object" || tokenUsage === null) {
|
|
2197
|
+
return void 0;
|
|
2198
|
+
}
|
|
2199
|
+
const obj = tokenUsage;
|
|
2200
|
+
if (typeof obj.input !== "number" || typeof obj.output !== "number") {
|
|
2201
|
+
return void 0;
|
|
2202
|
+
}
|
|
2203
|
+
return {
|
|
2204
|
+
input: obj.input,
|
|
2205
|
+
output: obj.output,
|
|
2206
|
+
cached: typeof obj.cached === "number" ? obj.cached : void 0
|
|
2207
|
+
};
|
|
2208
|
+
}
|
|
2209
|
+
/**
|
|
2210
|
+
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
2211
|
+
*/
|
|
2212
|
+
parseOutputMessages(outputMessages) {
|
|
2213
|
+
if (!Array.isArray(outputMessages)) {
|
|
2214
|
+
return void 0;
|
|
2215
|
+
}
|
|
2216
|
+
const messages = [];
|
|
2217
|
+
for (const msg of outputMessages) {
|
|
2218
|
+
if (typeof msg !== "object" || msg === null) {
|
|
2219
|
+
continue;
|
|
2220
|
+
}
|
|
2221
|
+
const rawMsg = msg;
|
|
2222
|
+
if (typeof rawMsg.role !== "string") {
|
|
2223
|
+
continue;
|
|
2224
|
+
}
|
|
2225
|
+
const message = {
|
|
2226
|
+
role: rawMsg.role,
|
|
2227
|
+
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
2228
|
+
content: rawMsg.content,
|
|
2229
|
+
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
2230
|
+
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
2231
|
+
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
2232
|
+
};
|
|
2233
|
+
messages.push(message);
|
|
2234
|
+
}
|
|
2235
|
+
return messages.length > 0 ? messages : void 0;
|
|
2112
2236
|
}
|
|
2113
|
-
|
|
2114
|
-
|
|
2237
|
+
/**
|
|
2238
|
+
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
2239
|
+
*/
|
|
2240
|
+
parseToolCalls(toolCalls) {
|
|
2241
|
+
if (!Array.isArray(toolCalls)) {
|
|
2115
2242
|
return void 0;
|
|
2116
2243
|
}
|
|
2117
|
-
const
|
|
2118
|
-
|
|
2244
|
+
const calls = [];
|
|
2245
|
+
for (const call of toolCalls) {
|
|
2246
|
+
if (typeof call !== "object" || call === null) {
|
|
2247
|
+
continue;
|
|
2248
|
+
}
|
|
2249
|
+
const rawCall = call;
|
|
2250
|
+
if (typeof rawCall.tool !== "string") {
|
|
2251
|
+
continue;
|
|
2252
|
+
}
|
|
2253
|
+
calls.push({
|
|
2254
|
+
tool: rawCall.tool,
|
|
2255
|
+
input: rawCall.input,
|
|
2256
|
+
output: rawCall.output,
|
|
2257
|
+
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
2258
|
+
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
2259
|
+
});
|
|
2260
|
+
}
|
|
2261
|
+
return calls.length > 0 ? calls : void 0;
|
|
2119
2262
|
}
|
|
2120
2263
|
parseJsonlBatchOutput(content) {
|
|
2121
2264
|
const records = /* @__PURE__ */ new Map();
|
|
@@ -2139,12 +2282,22 @@ var CliProvider = class {
|
|
|
2139
2282
|
if (records.has(id)) {
|
|
2140
2283
|
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
2141
2284
|
}
|
|
2142
|
-
const
|
|
2143
|
-
const
|
|
2285
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2286
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2287
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2288
|
+
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2289
|
+
let outputMessages;
|
|
2290
|
+
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
2291
|
+
outputMessages = parsedOutputMessages;
|
|
2292
|
+
} else {
|
|
2293
|
+
const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
2294
|
+
outputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
2295
|
+
}
|
|
2144
2296
|
records.set(id, {
|
|
2145
|
-
|
|
2146
|
-
|
|
2147
|
-
|
|
2297
|
+
outputMessages,
|
|
2298
|
+
tokenUsage,
|
|
2299
|
+
costUsd,
|
|
2300
|
+
durationMs
|
|
2148
2301
|
});
|
|
2149
2302
|
}
|
|
2150
2303
|
return records;
|
|
@@ -2157,8 +2310,10 @@ var CliProvider = class {
|
|
|
2157
2310
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
2158
2311
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
2159
2312
|
} finally {
|
|
2160
|
-
|
|
2161
|
-
|
|
2313
|
+
if (!this.keepTempFiles) {
|
|
2314
|
+
await import_promises8.default.unlink(filePath).catch(() => {
|
|
2315
|
+
});
|
|
2316
|
+
}
|
|
2162
2317
|
}
|
|
2163
2318
|
}
|
|
2164
2319
|
async ensureHealthy(signal) {
|
|
@@ -2458,6 +2613,11 @@ var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exe
|
|
|
2458
2613
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
2459
2614
|
var PROMPT_FILENAME = "prompt.md";
|
|
2460
2615
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
2616
|
+
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2617
|
+
- Do NOT create any additional output files in the workspace.
|
|
2618
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
2619
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
2620
|
+
This is required for evaluation scoring.`;
|
|
2461
2621
|
var CodexProvider = class {
|
|
2462
2622
|
id;
|
|
2463
2623
|
kind = "codex";
|
|
@@ -2482,7 +2642,11 @@ var CodexProvider = class {
|
|
|
2482
2642
|
const workspaceRoot = await this.createWorkspace();
|
|
2483
2643
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2484
2644
|
try {
|
|
2485
|
-
const
|
|
2645
|
+
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
2646
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
|
|
2647
|
+
const promptContent = `${systemPrompt}
|
|
2648
|
+
|
|
2649
|
+
${basePrompt}`;
|
|
2486
2650
|
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
2487
2651
|
await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
|
|
2488
2652
|
const args = this.buildCodexArgs();
|
|
@@ -2501,7 +2665,6 @@ var CodexProvider = class {
|
|
|
2501
2665
|
const parsed = parseCodexJson(result.stdout);
|
|
2502
2666
|
const assistantText = extractAssistantText(parsed);
|
|
2503
2667
|
return {
|
|
2504
|
-
text: assistantText,
|
|
2505
2668
|
raw: {
|
|
2506
2669
|
response: parsed,
|
|
2507
2670
|
stdout: result.stdout,
|
|
@@ -2513,7 +2676,8 @@ var CodexProvider = class {
|
|
|
2513
2676
|
workspace: workspaceRoot,
|
|
2514
2677
|
inputFiles,
|
|
2515
2678
|
logFile: logger?.filePath
|
|
2516
|
-
}
|
|
2679
|
+
},
|
|
2680
|
+
outputMessages: [{ role: "assistant", content: assistantText }]
|
|
2517
2681
|
};
|
|
2518
2682
|
} finally {
|
|
2519
2683
|
await logger?.close();
|
|
@@ -3135,7 +3299,6 @@ var MockProvider = class {
|
|
|
3135
3299
|
delayMs;
|
|
3136
3300
|
delayMinMs;
|
|
3137
3301
|
delayMaxMs;
|
|
3138
|
-
trace;
|
|
3139
3302
|
constructor(targetName, config) {
|
|
3140
3303
|
this.id = `mock:${targetName}`;
|
|
3141
3304
|
this.targetName = targetName;
|
|
@@ -3143,7 +3306,6 @@ var MockProvider = class {
|
|
|
3143
3306
|
this.delayMs = config.delayMs ?? 0;
|
|
3144
3307
|
this.delayMinMs = config.delayMinMs ?? 0;
|
|
3145
3308
|
this.delayMaxMs = config.delayMaxMs ?? 0;
|
|
3146
|
-
this.trace = config.trace;
|
|
3147
3309
|
}
|
|
3148
3310
|
async invoke(request) {
|
|
3149
3311
|
const delay = this.calculateDelay();
|
|
@@ -3151,12 +3313,11 @@ var MockProvider = class {
|
|
|
3151
3313
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
3152
3314
|
}
|
|
3153
3315
|
return {
|
|
3154
|
-
|
|
3316
|
+
outputMessages: [{ role: "assistant", content: this.cannedResponse }],
|
|
3155
3317
|
raw: {
|
|
3156
3318
|
question: request.question,
|
|
3157
3319
|
guidelines: request.guidelines
|
|
3158
|
-
}
|
|
3159
|
-
trace: this.trace
|
|
3320
|
+
}
|
|
3160
3321
|
};
|
|
3161
3322
|
}
|
|
3162
3323
|
calculateDelay() {
|
|
@@ -3169,182 +3330,1026 @@ var MockProvider = class {
|
|
|
3169
3330
|
}
|
|
3170
3331
|
};
|
|
3171
3332
|
|
|
3172
|
-
// src/evaluation/providers/
|
|
3333
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
3334
|
+
var import_node_child_process3 = require("child_process");
|
|
3335
|
+
var import_node_crypto2 = require("crypto");
|
|
3336
|
+
var import_node_fs4 = require("fs");
|
|
3337
|
+
var import_promises10 = require("fs/promises");
|
|
3338
|
+
var import_node_os3 = require("os");
|
|
3173
3339
|
var import_node_path11 = __toESM(require("path"), 1);
|
|
3174
|
-
|
|
3175
|
-
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
|
|
3180
|
-
|
|
3181
|
-
|
|
3182
|
-
|
|
3183
|
-
var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
3184
|
-
name: import_zod.z.string().min(1, "target name is required"),
|
|
3185
|
-
provider: import_zod.z.string().min(1, "provider is required"),
|
|
3186
|
-
judge_target: import_zod.z.string().optional(),
|
|
3187
|
-
workers: import_zod.z.number().int().min(1).optional()
|
|
3188
|
-
}).passthrough();
|
|
3189
|
-
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
3190
|
-
function normalizeAzureApiVersion(value) {
|
|
3191
|
-
if (!value) {
|
|
3192
|
-
return DEFAULT_AZURE_API_VERSION;
|
|
3340
|
+
|
|
3341
|
+
// src/evaluation/providers/pi-log-tracker.ts
|
|
3342
|
+
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
|
|
3343
|
+
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
|
|
3344
|
+
function getPiLogStore() {
|
|
3345
|
+
const globalObject = globalThis;
|
|
3346
|
+
const existing = globalObject[GLOBAL_LOGS_KEY2];
|
|
3347
|
+
if (existing) {
|
|
3348
|
+
return existing;
|
|
3193
3349
|
}
|
|
3194
|
-
const
|
|
3195
|
-
|
|
3196
|
-
|
|
3350
|
+
const created = [];
|
|
3351
|
+
globalObject[GLOBAL_LOGS_KEY2] = created;
|
|
3352
|
+
return created;
|
|
3353
|
+
}
|
|
3354
|
+
function getSubscriberStore2() {
|
|
3355
|
+
const globalObject = globalThis;
|
|
3356
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
|
|
3357
|
+
if (existing) {
|
|
3358
|
+
return existing;
|
|
3197
3359
|
}
|
|
3198
|
-
const
|
|
3199
|
-
|
|
3360
|
+
const created = /* @__PURE__ */ new Set();
|
|
3361
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
|
|
3362
|
+
return created;
|
|
3200
3363
|
}
|
|
3201
|
-
function
|
|
3202
|
-
const
|
|
3203
|
-
|
|
3204
|
-
|
|
3205
|
-
|
|
3206
|
-
|
|
3207
|
-
|
|
3208
|
-
|
|
3209
|
-
|
|
3210
|
-
const maxDelayMs = resolveOptionalNumber(
|
|
3211
|
-
target.retry_max_delay_ms ?? target.retryMaxDelayMs,
|
|
3212
|
-
`${target.name} retry max delay`
|
|
3213
|
-
);
|
|
3214
|
-
const backoffFactor = resolveOptionalNumber(
|
|
3215
|
-
target.retry_backoff_factor ?? target.retryBackoffFactor,
|
|
3216
|
-
`${target.name} retry backoff factor`
|
|
3217
|
-
);
|
|
3218
|
-
const retryableStatusCodes = resolveOptionalNumberArray(
|
|
3219
|
-
target.retry_status_codes ?? target.retryStatusCodes,
|
|
3220
|
-
`${target.name} retry status codes`
|
|
3221
|
-
);
|
|
3222
|
-
if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
|
|
3223
|
-
return void 0;
|
|
3364
|
+
function notifySubscribers2(entry) {
|
|
3365
|
+
const subscribers = Array.from(getSubscriberStore2());
|
|
3366
|
+
for (const listener of subscribers) {
|
|
3367
|
+
try {
|
|
3368
|
+
listener(entry);
|
|
3369
|
+
} catch (error) {
|
|
3370
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3371
|
+
console.warn(`Pi log subscriber failed: ${message}`);
|
|
3372
|
+
}
|
|
3224
3373
|
}
|
|
3225
|
-
return {
|
|
3226
|
-
maxRetries,
|
|
3227
|
-
initialDelayMs,
|
|
3228
|
-
maxDelayMs,
|
|
3229
|
-
backoffFactor,
|
|
3230
|
-
retryableStatusCodes
|
|
3231
|
-
};
|
|
3232
3374
|
}
|
|
3233
|
-
function
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
|
|
3237
|
-
|
|
3238
|
-
);
|
|
3239
|
-
|
|
3240
|
-
|
|
3241
|
-
case "azure-openai":
|
|
3242
|
-
return {
|
|
3243
|
-
kind: "azure",
|
|
3244
|
-
name: parsed.name,
|
|
3245
|
-
judgeTarget: parsed.judge_target,
|
|
3246
|
-
workers: parsed.workers,
|
|
3247
|
-
providerBatching,
|
|
3248
|
-
config: resolveAzureConfig(parsed, env)
|
|
3249
|
-
};
|
|
3250
|
-
case "anthropic":
|
|
3251
|
-
return {
|
|
3252
|
-
kind: "anthropic",
|
|
3253
|
-
name: parsed.name,
|
|
3254
|
-
judgeTarget: parsed.judge_target,
|
|
3255
|
-
workers: parsed.workers,
|
|
3256
|
-
providerBatching,
|
|
3257
|
-
config: resolveAnthropicConfig(parsed, env)
|
|
3258
|
-
};
|
|
3259
|
-
case "gemini":
|
|
3260
|
-
case "google":
|
|
3261
|
-
case "google-gemini":
|
|
3262
|
-
return {
|
|
3263
|
-
kind: "gemini",
|
|
3264
|
-
name: parsed.name,
|
|
3265
|
-
judgeTarget: parsed.judge_target,
|
|
3266
|
-
workers: parsed.workers,
|
|
3267
|
-
providerBatching,
|
|
3268
|
-
config: resolveGeminiConfig(parsed, env)
|
|
3269
|
-
};
|
|
3270
|
-
case "codex":
|
|
3271
|
-
case "codex-cli":
|
|
3272
|
-
return {
|
|
3273
|
-
kind: "codex",
|
|
3274
|
-
name: parsed.name,
|
|
3275
|
-
judgeTarget: parsed.judge_target,
|
|
3276
|
-
workers: parsed.workers,
|
|
3277
|
-
providerBatching,
|
|
3278
|
-
config: resolveCodexConfig(parsed, env)
|
|
3279
|
-
};
|
|
3280
|
-
case "mock":
|
|
3281
|
-
return {
|
|
3282
|
-
kind: "mock",
|
|
3283
|
-
name: parsed.name,
|
|
3284
|
-
judgeTarget: parsed.judge_target,
|
|
3285
|
-
workers: parsed.workers,
|
|
3286
|
-
providerBatching,
|
|
3287
|
-
config: resolveMockConfig(parsed)
|
|
3288
|
-
};
|
|
3289
|
-
case "vscode":
|
|
3290
|
-
case "vscode-insiders":
|
|
3291
|
-
return {
|
|
3292
|
-
kind: provider,
|
|
3293
|
-
name: parsed.name,
|
|
3294
|
-
judgeTarget: parsed.judge_target,
|
|
3295
|
-
workers: parsed.workers,
|
|
3296
|
-
providerBatching,
|
|
3297
|
-
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
3298
|
-
};
|
|
3299
|
-
case "cli":
|
|
3300
|
-
return {
|
|
3301
|
-
kind: "cli",
|
|
3302
|
-
name: parsed.name,
|
|
3303
|
-
judgeTarget: parsed.judge_target,
|
|
3304
|
-
workers: parsed.workers,
|
|
3305
|
-
providerBatching,
|
|
3306
|
-
config: resolveCliConfig(parsed, env, evalFilePath)
|
|
3307
|
-
};
|
|
3308
|
-
default:
|
|
3309
|
-
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
3375
|
+
function recordPiLogEntry(entry) {
|
|
3376
|
+
getPiLogStore().push(entry);
|
|
3377
|
+
notifySubscribers2(entry);
|
|
3378
|
+
}
|
|
3379
|
+
function consumePiLogEntries() {
|
|
3380
|
+
const store = getPiLogStore();
|
|
3381
|
+
if (store.length === 0) {
|
|
3382
|
+
return [];
|
|
3310
3383
|
}
|
|
3384
|
+
return store.splice(0, store.length);
|
|
3311
3385
|
}
|
|
3312
|
-
function
|
|
3313
|
-
const
|
|
3314
|
-
|
|
3315
|
-
|
|
3316
|
-
|
|
3317
|
-
const temperatureSource = target.temperature;
|
|
3318
|
-
const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
|
|
3319
|
-
const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
|
|
3320
|
-
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
3321
|
-
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
3322
|
-
const version = normalizeAzureApiVersion(
|
|
3323
|
-
resolveOptionalString(versionSource, env, `${target.name} api version`, {
|
|
3324
|
-
allowLiteral: true,
|
|
3325
|
-
optionalEnv: true
|
|
3326
|
-
})
|
|
3327
|
-
);
|
|
3328
|
-
const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
|
|
3329
|
-
const maxOutputTokens = resolveOptionalNumber(
|
|
3330
|
-
maxTokensSource,
|
|
3331
|
-
`${target.name} max output tokens`
|
|
3332
|
-
);
|
|
3333
|
-
const retry = resolveRetryConfig(target);
|
|
3334
|
-
return {
|
|
3335
|
-
resourceName,
|
|
3336
|
-
deploymentName,
|
|
3337
|
-
apiKey,
|
|
3338
|
-
version,
|
|
3339
|
-
temperature,
|
|
3340
|
-
maxOutputTokens,
|
|
3341
|
-
retry
|
|
3386
|
+
/**
 * Registers a listener in the subscriber store returned by getSubscriberStore2().
 *
 * @param listener - Callback to register.
 * @returns A function that removes the listener from that same store.
 */
function subscribeToPiLogEntries(listener) {
  const listeners = getSubscriberStore2();
  listeners.add(listener);
  const unsubscribe = () => {
    listeners.delete(listener);
  };
  return unsubscribe;
}
|
|
3344
|
-
|
|
3345
|
-
|
|
3346
|
-
|
|
3347
|
-
|
|
3393
|
+
|
|
3394
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
3395
|
+
// Name prefix handed to mkdtemp() (joined with the OS temp dir) when a Pi workspace is created.
var WORKSPACE_PREFIX2 = "agentv-pi-";
|
|
3396
|
+
// File name, inside the workspace directory, that the request question is written to.
var PROMPT_FILENAME2 = "prompt.md";
|
|
3397
|
+
// Instructions prepended to the user prompt when the target config supplies no systemPrompt.
var DEFAULT_SYSTEM_PROMPT3 = [
  "**IMPORTANT**: Follow these instructions for your response:",
  "- Do NOT create any additional output files in the workspace.",
  "- All intended file outputs/changes MUST be written in your response.",
  "- For each intended file, include the relative path and unified git diff following the convention `diff --git ...`.",
  "This is required for evaluation scoring."
].join("\n");
|
|
3402
|
+
/**
 * Provider that runs the Pi coding agent CLI once per request (`--mode json
 * --print --no-session`) inside a throw-away temp workspace and converts the
 * JSONL it prints on stdout into output messages.
 */
var PiCodingAgentProvider = class {
  id;
  kind = "pi-coding-agent";
  targetName;
  supportsBatch = false;
  config;
  runPi;
  /**
   * @param targetName - Target name; used in the provider id and in log file names.
   * @param config - Resolved target config (executable, provider, model, apiKey, tools,
   *   thinking, args, cwd, timeoutMs, logDir, logFormat, systemPrompt are read here).
   * @param runner - Process runner; defaults to defaultPiRunner.
   */
  constructor(targetName, config, runner = defaultPiRunner) {
    this.id = `pi-coding-agent:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runPi = runner;
  }
  /**
   * Runs the agent for one request.
   *
   * @param request - Reads `question`, `inputFiles`, `signal`, `evalCaseId`, `attempt`.
   * @returns `{ raw, outputMessages }`; `raw` carries diagnostics (stdout, stderr, args, ...).
   * @throws Error if the request is already aborted, the run times out, the process exits
   *   non-zero, or stdout contains no parseable JSON.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Pi coding agent request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: a logger that cannot be created must not fail the run.
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
      await (0, import_promises10.writeFile)(promptFile, request.question, "utf8");
      const args = this.buildPiArgs(request.question, inputFiles);
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executePi(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail2(result.stderr, result.stdout);
        const prefix = `Pi coding agent exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parsePiJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          // The real argv contains the API key (see buildPiArgs); never echo it into results.
          args: this.redactArgs(args),
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages
      };
    } finally {
      // A failure while closing the log must neither skip workspace cleanup nor
      // replace the result/error produced by the try block above.
      try {
        await logger?.close();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to close Pi stream log: ${message}`);
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }
  /** Returns the configured cwd resolved to an absolute path, or the workspace when none is set. */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return import_node_path11.default.resolve(this.config.cwd);
  }
  /**
   * Builds the CLI argument list: optional provider/model/api-key flags, the fixed
   * JSON/print/no-session flags, optional tools/thinking flags, extra configured args,
   * one `@file` argument per input file, and finally the prompt text.
   *
   * NOTE(review): `--api-key` places the secret on the child's command line, where other
   * local users may be able to read it; buildEnv() also exports it for known providers.
   */
  buildPiArgs(prompt, inputFiles) {
    const args = [];
    if (this.config.provider) {
      args.push("--provider", this.config.provider);
    }
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.apiKey) {
      args.push("--api-key", this.config.apiKey);
    }
    args.push("--mode", "json");
    args.push("--print");
    args.push("--no-session");
    if (this.config.tools) {
      args.push("--tools", this.config.tools);
    }
    if (this.config.thinking) {
      args.push("--thinking", this.config.thinking);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    if (inputFiles && inputFiles.length > 0) {
      for (const file of inputFiles) {
        args.push(`@${file}`);
      }
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
    const fullPrompt = `${systemPrompt}\n\n${prompt}`;
    // Rewrites `@[label]:` so it no longer starts with `@`, the prefix used for file args above.
    args.push(escapeAtSymbols(fullPrompt));
    return args;
  }
  /** Returns a copy of `args` with any argument equal to the configured API key masked. */
  redactArgs(args) {
    const secret = this.config.apiKey;
    if (!secret) {
      return args;
    }
    return args.map((arg) => arg === secret ? "[REDACTED]" : arg);
  }
  /**
   * Invokes the runner and turns a missing executable (ENOENT) into an actionable error.
   * Any other failure is rethrown unchanged.
   */
  async executePi(args, cwd, signal, logger) {
    try {
      return await this.runPi({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /**
   * Copies process.env and, when an API key is configured, exports it under the variable
   * matching the configured provider ("google" is assumed when no provider is set).
   * Providers not listed below leave the environment untouched.
   */
  buildEnv() {
    const env = { ...process.env };
    if (this.config.apiKey) {
      const provider = this.config.provider?.toLowerCase() ?? "google";
      switch (provider) {
        case "google":
        case "gemini":
          env.GEMINI_API_KEY = this.config.apiKey;
          break;
        case "anthropic":
          env.ANTHROPIC_API_KEY = this.config.apiKey;
          break;
        case "openai":
          env.OPENAI_API_KEY = this.config.apiKey;
          break;
        case "groq":
          env.GROQ_API_KEY = this.config.apiKey;
          break;
        case "xai":
          env.XAI_API_KEY = this.config.apiKey;
          break;
        case "openrouter":
          env.OPENROUTER_API_KEY = this.config.apiKey;
          break;
      }
    }
    return env;
  }
  /** Creates a unique temp directory (prefix WORKSPACE_PREFIX2) under the OS temp dir. */
  async createWorkspace() {
    return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
  }
  /** Recursively removes the workspace; failures are ignored because cleanup is best-effort. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
  /** Returns the configured log directory, or `<cwd>/.agentv/logs/pi-coding-agent` by default. */
  resolveLogDirectory() {
    if (this.config.logDir) {
      return import_node_path11.default.resolve(this.config.logDir);
    }
    return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
  }
  /**
   * Creates the stream logger for a request and records its log entry.
   *
   * @returns The logger, or undefined (after a console warning) when the log directory or
   *   the logger itself cannot be created.
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    try {
      await (0, import_promises10.mkdir)(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
    try {
      const logger = await PiStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordPiLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
|
|
3603
|
+
/**
 * Line-oriented logger that appends the Pi process's stdout/stderr to a log file,
 * prefixing each line with the elapsed time and its source stream.
 */
var PiStreamLogger = class _PiStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // Set once the underlying write stream has reported an error; later writes are skipped.
  streamFailed = false;
  /**
   * @param filePath - Log file to append to.
   * @param format - "json" pretty-prints JSON lines; any other value writes event summaries.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
    // The file is opened asynchronously. Without a listener, an open/write failure would be
    // an unhandled 'error' event and take down the whole process; logging is best-effort.
    this.stream.on("error", (error) => {
      if (this.streamFailed) {
        return;
      }
      this.streamFailed = true;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi stream logging to ${filePath} failed: ${message}`);
    });
  }
  /**
   * Creates a logger and writes the header lines (title, target, optional eval id,
   * optional 1-based attempt, start time).
   */
  static async create(options) {
    const logger = new _PiStreamLogger(options.filePath, options.format);
    const header = [
      "# Pi Coding Agent stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`
    ].filter((line) => Boolean(line));
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes every complete line it finishes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes every complete line it finishes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Writes anything still buffered and ends the stream.
   * Resolves once the stream has finished; rejects if ending the stream errors.
   * Resolves immediately when the stream had already failed.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    if (this.streamFailed) {
      return;
    }
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Writes each given line followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.writeLine(line);
    }
  }
  /** Writes one newline-terminated line unless the stream has already failed. */
  writeLine(text) {
    if (this.streamFailed) {
      return;
    }
    this.stream.write(`${text}\n`);
  }
  /** Formats a raw line and writes it; blank lines are dropped by formatLine. */
  writeFormatted(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.writeLine(formatted);
    }
  }
  /** Writes all complete lines buffered for `source`, keeping the trailing partial line. */
  flushBuffer(source) {
    const key = source === "stdout" ? "stdoutBuffer" : "stderrBuffer";
    const lines = this[key].split(/\r?\n/);
    this[key] = lines.pop() ?? "";
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /** Returns the `[+elapsed] [source] message` log line, or undefined for a blank line. */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes the unterminated tail of each buffer (stdout first) and clears both buffers. */
  flushRemainder() {
    const sources = [
      ["stdout", "stdoutBuffer"],
      ["stderr", "stderrBuffer"]
    ];
    for (const [source, key] of sources) {
      const remainder = this[key].trim();
      if (remainder.length > 0) {
        this.writeFormatted(remainder, source);
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
|
|
3697
|
+
/**
 * Builds a log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 *
 * @param request - Reads `evalCaseId` (falls back to "pi") and `attempt` (0-based; written 1-based).
 * @param targetName - Target name; sanitized before use.
 */
function buildLogFilename2(request, targetName) {
  // ISO timestamp with ":" and "." replaced by "-".
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename2(request.evalCaseId ?? "pi");
  let attemptPart = "";
  if (request.attempt !== void 0) {
    attemptPart = `_attempt-${request.attempt + 1}`;
  }
  const targetPart = sanitizeForFilename2(targetName);
  const uniqueSuffix = (0, import_node_crypto2.randomUUID)().slice(0, 8);
  return [stamp, targetPart, `${evalPart}${attemptPart}`, uniqueSuffix].join("_") + ".log";
}
|
|
3704
|
+
/**
 * Replaces every run of characters outside `A-Z a-z 0-9 . _ -` with a single "_".
 *
 * @param value - Text to sanitize.
 * @returns The sanitized text, or "pi" when the result is empty.
 */
function sanitizeForFilename2(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length === 0) {
    return "pi";
  }
  return cleaned;
}
|
|
3708
|
+
/**
 * Formats the time elapsed since `startedAt` as `MM:SS`, or `HH:MM:SS` once at
 * least one hour has passed. Each field is zero-padded to two digits.
 *
 * @param startedAt - Start time in epoch milliseconds.
 */
function formatElapsed2(startedAt) {
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const twoDigits = (value) => value.toString().padStart(2, "0");
  const fields = [Math.floor(totalSeconds % 3600 / 60), totalSeconds % 60];
  const wholeHours = Math.floor(totalSeconds / 3600);
  if (wholeHours > 0) {
    fields.unshift(wholeHours);
  }
  return fields.map(twoDigits).join(":");
}
|
|
3718
|
+
/**
 * Produces the summary-format text for one raw output line.
 * A line that parses as JSON and yields an event summary is replaced by that summary;
 * otherwise the raw line is returned, prefixed with "stderr: " for stderr lines.
 *
 * @param rawLine - Trimmed output line.
 * @param source - "stdout" or "stderr".
 */
function formatPiLogMessage(rawLine, source) {
  const event = tryParseJsonValue2(rawLine);
  const summary = event ? summarizePiEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
3731
|
+
/**
 * Produces the json-format text for one raw output line: the line re-serialized
 * with two-space indentation when it parses to a truthy JSON value, otherwise
 * the raw line unchanged.
 *
 * @param rawLine - Trimmed output line.
 */
function formatPiJsonLog(rawLine) {
  const value = tryParseJsonValue2(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through to the raw line.
    }
  }
  return rawLine;
}
|
|
3742
|
+
/**
 * Produces a one-line summary of a parsed Pi JSON event for the stream log.
 *
 * - `message_start` / `message_end`: `<type>: <role>`
 * - `message_update`: `text_delta: <first 50 chars>...` for string text deltas,
 *   otherwise `message_update: <assistant event type>`
 * - any other event: its `type`
 *
 * A missing role or assistant event type is reported as "unknown" rather than
 * being interpolated as the literal text "undefined".
 *
 * @param event - Parsed JSON value.
 * @returns The summary, or undefined when `event` is not an object with a non-empty string `type`.
 */
function summarizePiEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const type = typeof event.type === "string" ? event.type : void 0;
  if (!type) {
    return void 0;
  }
  switch (type) {
    case "message_start":
    case "message_end": {
      const role = event.message?.role;
      return `${type}: ${role ?? "unknown"}`;
    }
    case "message_update": {
      const update = event.assistantMessageEvent;
      const updateType = update?.type;
      if (updateType === "text_delta" && typeof update.delta === "string") {
        const { delta } = update;
        const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
        return `text_delta: ${preview}`;
      }
      return `message_update: ${updateType ?? "unknown"}`;
    }
    default:
      // Lifecycle events (agent_start, agent_end, turn_start, turn_end, ...) are logged by name.
      return type;
  }
}
|
|
3782
|
+
/**
 * Parses a line as JSON without throwing.
 *
 * @param rawLine - Text to parse.
 * @returns The parsed value, or undefined when the text is not valid JSON.
 */
function tryParseJsonValue2(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
3789
|
+
/**
 * Parses the agent's stdout as JSON Lines. Blank lines and lines that are not
 * valid JSON are skipped.
 *
 * @param output - Complete stdout text.
 * @returns The parsed values in input order.
 * @throws Error when the output is blank or contains no valid JSON line.
 */
function parsePiJsonl(output) {
  const body = output.trim();
  if (body.length === 0) {
    throw new Error("Pi coding agent produced no output");
  }
  const events = body.split(/\r?\n/).flatMap((rawLine) => {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      return [];
    }
    try {
      // Wrapped in an array so a parsed JSON array stays a single element after flattening.
      return [JSON.parse(candidate)];
    } catch {
      return [];
    }
  });
  if (events.length === 0) {
    throw new Error("Pi coding agent produced no valid JSON output");
  }
  return events;
}
|
|
3807
|
+
/**
 * Derives the output messages from the parsed event stream.
 *
 * The last `agent_end` event that carries a `messages` array wins: its messages are
 * converted and returned. If there is none, the `message` of every `turn_end` event is
 * converted instead, in stream order. Messages that cannot be converted are dropped.
 *
 * @param events - Parsed JSONL values.
 * @returns Converted messages (possibly empty).
 */
function extractOutputMessages(events) {
  const isRecord = (value) => Boolean(value) && typeof value === "object";
  for (let index = events.length - 1; index >= 0; index -= 1) {
    const candidate = events[index];
    if (isRecord(candidate) && candidate.type === "agent_end" && Array.isArray(candidate.messages)) {
      return candidate.messages.map(convertPiMessage).filter((converted) => converted !== void 0);
    }
  }
  return events
    .filter((event) => isRecord(event) && event.type === "turn_end")
    .map((event) => convertPiMessage(event.message))
    .filter((converted) => Boolean(converted));
}
|
|
3839
|
+
/**
 * Converts one Pi message into the output-message shape
 * `{ role, content, toolCalls, timestamp, metadata }`.
 *
 * `toolCalls` and `metadata` are undefined when empty. A numeric timestamp is converted
 * to an ISO string; a string timestamp is passed through. Metadata copies the truthy
 * `api`, `provider`, `model`, `usage` and `stopReason` fields.
 *
 * @param message - Raw message value.
 * @returns The converted message, or undefined when `message` is not an object with a string `role`.
 */
function convertPiMessage(message) {
  if (!message || typeof message !== "object") {
    return void 0;
  }
  const { role } = message;
  if (typeof role !== "string") {
    return void 0;
  }
  const content = extractTextContent(message.content);
  const toolCalls = extractToolCalls(message.content);
  let timestamp;
  if (typeof message.timestamp === "number") {
    timestamp = new Date(message.timestamp).toISOString();
  } else if (typeof message.timestamp === "string") {
    timestamp = message.timestamp;
  }
  const metadata = {};
  for (const key of ["api", "provider", "model", "usage", "stopReason"]) {
    if (message[key]) {
      metadata[key] = message[key];
    }
  }
  const hasMetadata = Object.keys(metadata).length > 0;
  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
    timestamp,
    metadata: hasMetadata ? metadata : void 0
  };
}
|
|
3865
|
+
/**
 * Extracts the plain text of a message's content.
 *
 * @param content - A string, or an array of content parts.
 * @returns The string itself; for an array, the `text` of every `{ type: "text" }` part
 *   joined with newlines; undefined when there is no text or the content is neither.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const pieces = content
    .filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string")
    .map((part) => part.text);
  return pieces.length === 0 ? void 0 : pieces.join("\n");
}
|
|
3884
|
+
/**
 * Collects tool calls from a message's content parts.
 *
 * Every `tool_use` part with a string `name` becomes `{ tool, input, id }`. A later
 * `tool_result` part whose `tool_use_id` matches an already collected call attaches
 * its `content` to that call as `output`.
 *
 * @param content - Message content; anything other than an array yields no calls.
 * @returns Tool calls in order of appearance.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    switch (part.type) {
      case "tool_use":
        if (typeof part.name === "string") {
          calls.push({
            tool: part.name,
            input: part.input,
            id: typeof part.id === "string" ? part.id : void 0
          });
        }
        break;
      case "tool_result":
        if (typeof part.tool_use_id === "string") {
          const position = calls.findIndex((call) => call.id === part.tool_use_id);
          if (position !== -1) {
            calls[position] = { ...calls[position], output: part.content };
          }
        }
        break;
      default:
        break;
    }
  }
  return calls;
}
|
|
3914
|
+
/**
 * Returns the content of the last assistant message that has truthy content.
 * String content is returned as-is; any other content is JSON-serialized.
 *
 * @param messages - Converted output messages.
 * @returns The text, or "" when no assistant message has content.
 */
function extractAssistantText2(messages) {
  const newestFirst = [...messages].reverse();
  for (const message of newestFirst) {
    if (message.role !== "assistant" || !message.content) {
      continue;
    }
    return typeof message.content === "string" ? message.content : JSON.stringify(message.content);
  }
  return "";
}
|
|
3926
|
+
/**
 * Rewrites every `@[label]:` marker in the prompt to `[[label]]:`.
 * Other uses of `@` are left untouched.
 *
 * @param prompt - Prompt text.
 * @returns The prompt with the markers rewritten.
 */
function escapeAtSymbols(prompt) {
  return prompt.replace(/@\[([^\]]+)\]:/g, (_marker, label) => `[[${label}]]:`);
}
|
|
3929
|
+
/**
 * Picks the text to attach to a failure message: trimmed stderr when it is
 * non-empty, otherwise trimmed stdout.
 *
 * @param stderr - Captured stderr text.
 * @param stdout - Captured stdout text.
 * @returns The chosen text, or undefined when both are blank.
 */
function pickDetail2(stderr, stdout) {
  const fromStderr = stderr.trim();
  if (fromStderr.length > 0) {
    return fromStderr;
  }
  return stdout.trim() || void 0;
}
|
|
3937
|
+
/**
 * Formats the " after Ns" suffix for timeout errors, rounding up to whole seconds.
 *
 * @param timeoutMs - Timeout in milliseconds; may be undefined.
 * @returns The suffix, or "" when no positive timeout is given.
 */
function formatTimeoutSuffix3(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && timeoutMs > 0;
  if (!hasTimeout) {
    return "";
  }
  return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
}
|
|
3944
|
+
/**
 * Default process runner: spawns the Pi executable (no shell), captures stdout and
 * stderr as UTF-8 text, and resolves when the process closes.
 *
 * `options.executable` is split on whitespace so a value such as "npx pi" yields a
 * command plus leading arguments.
 * NOTE(review): this split also means an executable path containing spaces cannot be
 * expressed — confirm whether that needs to be supported.
 *
 * On timeout or abort the child receives SIGTERM; if it has not exited after a grace
 * period it receives SIGKILL, so a child that ignores SIGTERM cannot hang the caller.
 *
 * @param options - `{ executable, args, cwd, env, timeoutMs?, signal?, onStdoutChunk?, onStderrChunk? }`.
 * @returns `{ stdout, stderr, exitCode, timedOut }`; `exitCode` is -1 when the process
 *   ended without a numeric exit code (for example, killed by a signal).
 * @throws Rejects with the spawn error (for example, code "ENOENT"), or with an Error when
 *   the executable string is blank.
 */
async function defaultPiRunner(options) {
  const KILL_GRACE_MS = 5e3;
  return await new Promise((resolve, reject) => {
    // trim() first: leading whitespace would otherwise produce an empty command name.
    const [executable, ...executableArgs] = options.executable.trim().split(/\s+/);
    if (!executable) {
      reject(new Error("Pi coding agent executable is empty"));
      return;
    }
    const child = (0, import_node_child_process3.spawn)(executable, [...executableArgs, ...options.args], {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: false
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    let settled = false;
    let timeoutHandle;
    let killHandle;
    const terminate = () => {
      child.kill("SIGTERM");
      if (killHandle === void 0) {
        killHandle = setTimeout(() => {
          child.kill("SIGKILL");
        }, KILL_GRACE_MS);
        // Never keep the event loop alive just for the escalation timer.
        killHandle.unref?.();
      }
    };
    const onAbort = () => {
      terminate();
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        terminate();
      }, options.timeoutMs);
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
      options.onStderrChunk?.(chunk);
    });
    // The prompt travels in argv; close stdin so the child never waits for input.
    child.stdin.end();
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (killHandle) {
        clearTimeout(killHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    // 'error' and 'close' can both fire for one child; only the first one settles the promise.
    child.on("error", (error) => {
      if (settled) {
        return;
      }
      settled = true;
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      if (settled) {
        return;
      }
      settled = true;
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
|
|
4011
|
+
|
|
4012
|
+
// src/evaluation/providers/targets.ts
|
|
4013
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
4014
|
+
var import_zod = require("zod");
|
|
4015
|
+
var CliHealthcheckHttpInputSchema = import_zod.z.object({
|
|
4016
|
+
type: import_zod.z.literal("http"),
|
|
4017
|
+
url: import_zod.z.string().min(1, "healthcheck URL is required"),
|
|
4018
|
+
timeout_seconds: import_zod.z.number().positive().optional(),
|
|
4019
|
+
timeoutSeconds: import_zod.z.number().positive().optional()
|
|
4020
|
+
});
|
|
4021
|
+
var CliHealthcheckCommandInputSchema = import_zod.z.object({
|
|
4022
|
+
type: import_zod.z.literal("command"),
|
|
4023
|
+
command_template: import_zod.z.string().optional(),
|
|
4024
|
+
commandTemplate: import_zod.z.string().optional(),
|
|
4025
|
+
cwd: import_zod.z.string().optional(),
|
|
4026
|
+
timeout_seconds: import_zod.z.number().positive().optional(),
|
|
4027
|
+
timeoutSeconds: import_zod.z.number().positive().optional()
|
|
4028
|
+
});
|
|
4029
|
+
var CliHealthcheckInputSchema = import_zod.z.discriminatedUnion("type", [
|
|
4030
|
+
CliHealthcheckHttpInputSchema,
|
|
4031
|
+
CliHealthcheckCommandInputSchema
|
|
4032
|
+
]);
|
|
4033
|
+
var CliTargetInputSchema = import_zod.z.object({
|
|
4034
|
+
name: import_zod.z.string().min(1, "target name is required"),
|
|
4035
|
+
provider: import_zod.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
|
|
4036
|
+
// Command template - required (accept both naming conventions)
|
|
4037
|
+
command_template: import_zod.z.string().optional(),
|
|
4038
|
+
commandTemplate: import_zod.z.string().optional(),
|
|
4039
|
+
// Files format - optional
|
|
4040
|
+
files_format: import_zod.z.string().optional(),
|
|
4041
|
+
filesFormat: import_zod.z.string().optional(),
|
|
4042
|
+
attachments_format: import_zod.z.string().optional(),
|
|
4043
|
+
attachmentsFormat: import_zod.z.string().optional(),
|
|
4044
|
+
// Working directory - optional
|
|
4045
|
+
cwd: import_zod.z.string().optional(),
|
|
4046
|
+
// Timeout in seconds - optional
|
|
4047
|
+
timeout_seconds: import_zod.z.number().positive().optional(),
|
|
4048
|
+
timeoutSeconds: import_zod.z.number().positive().optional(),
|
|
4049
|
+
// Healthcheck configuration - optional
|
|
4050
|
+
healthcheck: CliHealthcheckInputSchema.optional(),
|
|
4051
|
+
// Verbose mode - optional
|
|
4052
|
+
verbose: import_zod.z.boolean().optional(),
|
|
4053
|
+
cli_verbose: import_zod.z.boolean().optional(),
|
|
4054
|
+
cliVerbose: import_zod.z.boolean().optional(),
|
|
4055
|
+
// Keep temp files - optional
|
|
4056
|
+
keep_temp_files: import_zod.z.boolean().optional(),
|
|
4057
|
+
keepTempFiles: import_zod.z.boolean().optional(),
|
|
4058
|
+
keep_output_files: import_zod.z.boolean().optional(),
|
|
4059
|
+
keepOutputFiles: import_zod.z.boolean().optional(),
|
|
4060
|
+
// Common target fields
|
|
4061
|
+
judge_target: import_zod.z.string().optional(),
|
|
4062
|
+
workers: import_zod.z.number().int().min(1).optional(),
|
|
4063
|
+
provider_batching: import_zod.z.boolean().optional(),
|
|
4064
|
+
providerBatching: import_zod.z.boolean().optional()
|
|
4065
|
+
}).refine((data) => data.command_template !== void 0 || data.commandTemplate !== void 0, {
|
|
4066
|
+
message: "Either command_template or commandTemplate is required"
|
|
4067
|
+
});
|
|
4068
|
+
var CliHealthcheckHttpSchema = import_zod.z.object({
|
|
4069
|
+
type: import_zod.z.literal("http"),
|
|
4070
|
+
url: import_zod.z.string().min(1),
|
|
4071
|
+
timeoutMs: import_zod.z.number().positive().optional()
|
|
4072
|
+
}).strict();
|
|
4073
|
+
var CliHealthcheckCommandSchema = import_zod.z.object({
|
|
4074
|
+
type: import_zod.z.literal("command"),
|
|
4075
|
+
commandTemplate: import_zod.z.string().min(1),
|
|
4076
|
+
cwd: import_zod.z.string().optional(),
|
|
4077
|
+
timeoutMs: import_zod.z.number().positive().optional()
|
|
4078
|
+
}).strict();
|
|
4079
|
+
var CliHealthcheckSchema = import_zod.z.discriminatedUnion("type", [
|
|
4080
|
+
CliHealthcheckHttpSchema,
|
|
4081
|
+
CliHealthcheckCommandSchema
|
|
4082
|
+
]);
|
|
4083
|
+
var CliTargetConfigSchema = import_zod.z.object({
|
|
4084
|
+
commandTemplate: import_zod.z.string().min(1),
|
|
4085
|
+
filesFormat: import_zod.z.string().optional(),
|
|
4086
|
+
cwd: import_zod.z.string().optional(),
|
|
4087
|
+
timeoutMs: import_zod.z.number().positive().optional(),
|
|
4088
|
+
healthcheck: CliHealthcheckSchema.optional(),
|
|
4089
|
+
verbose: import_zod.z.boolean().optional(),
|
|
4090
|
+
keepTempFiles: import_zod.z.boolean().optional()
|
|
4091
|
+
}).strict();
|
|
4092
|
+
/**
 * Normalizes a CLI healthcheck definition (snake_case or camelCase keys) into the
 * resolved config shape.
 *
 * @param input - Healthcheck input with `type` "http" or "command".
 * @param env - Environment passed to the string resolvers.
 * @param targetName - Target name used in error and description strings.
 * @param evalFilePath - Optional eval file path; a relative `cwd` is resolved against its directory.
 * @returns `{ type: "http", url, timeoutMs }` or `{ type: "command", commandTemplate, cwd, timeoutMs }`.
 * @throws Error when a command healthcheck has no command template.
 */
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
  const rawTimeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
  // Seconds to whole milliseconds; undefined stays undefined.
  const timeoutMs = rawTimeoutSeconds === void 0 ? void 0 : Math.floor(rawTimeoutSeconds * 1e3);
  if (input.type === "http") {
    return {
      type: "http",
      url: resolveString(input.url, env, `${targetName} healthcheck URL`),
      timeoutMs
    };
  }
  const templateSource = input.command_template ?? input.commandTemplate;
  if (templateSource === void 0) {
    throw new Error(
      `${targetName} healthcheck: Either command_template or commandTemplate is required for command healthcheck`
    );
  }
  const commandTemplate = resolveString(
    templateSource,
    env,
    `${targetName} healthcheck command template`,
    true
  );
  const configuredCwd = resolveOptionalString(input.cwd, env, `${targetName} healthcheck cwd`, {
    allowLiteral: true,
    optionalEnv: true
  });
  const isRelativeToEvalFile = configuredCwd && evalFilePath && !import_node_path12.default.isAbsolute(configuredCwd);
  const cwd = isRelativeToEvalFile
    ? import_node_path12.default.resolve(import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath)), configuredCwd)
    : configuredCwd;
  return {
    type: "command",
    commandTemplate,
    cwd,
    timeoutMs
  };
}
|
|
4129
|
+
/**
 * Normalizes a CLI target definition (snake_case or camelCase keys, plus the legacy
 * `attachments_format` / `keep_output_files` aliases) into the resolved CLI config.
 *
 * The working directory defaults to the eval file's directory, and a relative `cwd`
 * is resolved against it, whenever `evalFilePath` is provided.
 *
 * @param input - Validated CLI target input.
 * @param env - Environment passed to the string resolvers.
 * @param evalFilePath - Optional path of the eval file that references this target.
 * @returns `{ commandTemplate, filesFormat, cwd, timeoutMs, healthcheck, verbose, keepTempFiles }`.
 * @throws Error when no command template is provided.
 */
function normalizeCliTargetInput(input, env, evalFilePath) {
  const { name: targetName } = input;
  const templateSource = input.command_template ?? input.commandTemplate;
  if (templateSource === void 0) {
    throw new Error(`${targetName}: Either command_template or commandTemplate is required`);
  }
  const commandTemplate = resolveString(
    templateSource,
    env,
    `${targetName} CLI command template`,
    true
  );
  const filesFormat = resolveOptionalLiteralString(
    input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat
  );
  let cwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
    allowLiteral: true,
    optionalEnv: true
  });
  if (evalFilePath) {
    const evalDir = import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath));
    if (!cwd) {
      cwd = evalDir;
    } else if (!import_node_path12.default.isAbsolute(cwd)) {
      cwd = import_node_path12.default.resolve(evalDir, cwd);
    }
  }
  const rawTimeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
  // Seconds to whole milliseconds; undefined stays undefined.
  const timeoutMs = rawTimeoutSeconds === void 0 ? void 0 : Math.floor(rawTimeoutSeconds * 1e3);
  const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
  const keepTempFiles = resolveOptionalBoolean(
    input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
  );
  let healthcheck;
  if (input.healthcheck) {
    healthcheck = normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath);
  }
  return {
    commandTemplate,
    filesFormat,
    cwd,
    timeoutMs,
    healthcheck,
    verbose,
    keepTempFiles
  };
}
|
|
4170
|
+
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
4171
|
+
"PROMPT",
|
|
4172
|
+
"GUIDELINES",
|
|
4173
|
+
"EVAL_ID",
|
|
4174
|
+
"ATTEMPT",
|
|
4175
|
+
"FILES",
|
|
4176
|
+
"OUTPUT_FILE"
|
|
4177
|
+
]);
|
|
4178
|
+
var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
4179
|
+
name: import_zod.z.string().min(1, "target name is required"),
|
|
4180
|
+
provider: import_zod.z.string().min(1, "provider is required"),
|
|
4181
|
+
judge_target: import_zod.z.string().optional(),
|
|
4182
|
+
workers: import_zod.z.number().int().min(1).optional()
|
|
4183
|
+
}).passthrough();
|
|
4184
|
+
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
4185
|
+
function normalizeAzureApiVersion(value) {
|
|
4186
|
+
if (!value) {
|
|
4187
|
+
return DEFAULT_AZURE_API_VERSION;
|
|
4188
|
+
}
|
|
4189
|
+
const trimmed = value.trim();
|
|
4190
|
+
if (trimmed.length === 0) {
|
|
4191
|
+
return DEFAULT_AZURE_API_VERSION;
|
|
4192
|
+
}
|
|
4193
|
+
const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
|
|
4194
|
+
return withoutPrefix.length > 0 ? withoutPrefix : DEFAULT_AZURE_API_VERSION;
|
|
4195
|
+
}
|
|
4196
|
+
function resolveRetryConfig(target) {
|
|
4197
|
+
const maxRetries = resolveOptionalNumber(
|
|
4198
|
+
target.max_retries ?? target.maxRetries,
|
|
4199
|
+
`${target.name} max retries`
|
|
4200
|
+
);
|
|
4201
|
+
const initialDelayMs = resolveOptionalNumber(
|
|
4202
|
+
target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
|
|
4203
|
+
`${target.name} retry initial delay`
|
|
4204
|
+
);
|
|
4205
|
+
const maxDelayMs = resolveOptionalNumber(
|
|
4206
|
+
target.retry_max_delay_ms ?? target.retryMaxDelayMs,
|
|
4207
|
+
`${target.name} retry max delay`
|
|
4208
|
+
);
|
|
4209
|
+
const backoffFactor = resolveOptionalNumber(
|
|
4210
|
+
target.retry_backoff_factor ?? target.retryBackoffFactor,
|
|
4211
|
+
`${target.name} retry backoff factor`
|
|
4212
|
+
);
|
|
4213
|
+
const retryableStatusCodes = resolveOptionalNumberArray(
|
|
4214
|
+
target.retry_status_codes ?? target.retryStatusCodes,
|
|
4215
|
+
`${target.name} retry status codes`
|
|
4216
|
+
);
|
|
4217
|
+
if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
|
|
4218
|
+
return void 0;
|
|
4219
|
+
}
|
|
4220
|
+
return {
|
|
4221
|
+
maxRetries,
|
|
4222
|
+
initialDelayMs,
|
|
4223
|
+
maxDelayMs,
|
|
4224
|
+
backoffFactor,
|
|
4225
|
+
retryableStatusCodes
|
|
4226
|
+
};
|
|
4227
|
+
}
|
|
4228
|
+
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
4229
|
+
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
4230
|
+
const provider = parsed.provider.toLowerCase();
|
|
4231
|
+
const providerBatching = resolveOptionalBoolean(
|
|
4232
|
+
parsed.provider_batching ?? parsed.providerBatching
|
|
4233
|
+
);
|
|
4234
|
+
switch (provider) {
|
|
4235
|
+
case "azure":
|
|
4236
|
+
case "azure-openai":
|
|
4237
|
+
return {
|
|
4238
|
+
kind: "azure",
|
|
4239
|
+
name: parsed.name,
|
|
4240
|
+
judgeTarget: parsed.judge_target,
|
|
4241
|
+
workers: parsed.workers,
|
|
4242
|
+
providerBatching,
|
|
4243
|
+
config: resolveAzureConfig(parsed, env)
|
|
4244
|
+
};
|
|
4245
|
+
case "anthropic":
|
|
4246
|
+
return {
|
|
4247
|
+
kind: "anthropic",
|
|
4248
|
+
name: parsed.name,
|
|
4249
|
+
judgeTarget: parsed.judge_target,
|
|
4250
|
+
workers: parsed.workers,
|
|
4251
|
+
providerBatching,
|
|
4252
|
+
config: resolveAnthropicConfig(parsed, env)
|
|
4253
|
+
};
|
|
4254
|
+
case "gemini":
|
|
4255
|
+
case "google":
|
|
4256
|
+
case "google-gemini":
|
|
4257
|
+
return {
|
|
4258
|
+
kind: "gemini",
|
|
4259
|
+
name: parsed.name,
|
|
4260
|
+
judgeTarget: parsed.judge_target,
|
|
4261
|
+
workers: parsed.workers,
|
|
4262
|
+
providerBatching,
|
|
4263
|
+
config: resolveGeminiConfig(parsed, env)
|
|
4264
|
+
};
|
|
4265
|
+
case "codex":
|
|
4266
|
+
case "codex-cli":
|
|
4267
|
+
return {
|
|
4268
|
+
kind: "codex",
|
|
4269
|
+
name: parsed.name,
|
|
4270
|
+
judgeTarget: parsed.judge_target,
|
|
4271
|
+
workers: parsed.workers,
|
|
4272
|
+
providerBatching,
|
|
4273
|
+
config: resolveCodexConfig(parsed, env)
|
|
4274
|
+
};
|
|
4275
|
+
case "pi":
|
|
4276
|
+
case "pi-coding-agent":
|
|
4277
|
+
return {
|
|
4278
|
+
kind: "pi-coding-agent",
|
|
4279
|
+
name: parsed.name,
|
|
4280
|
+
judgeTarget: parsed.judge_target,
|
|
4281
|
+
workers: parsed.workers,
|
|
4282
|
+
providerBatching,
|
|
4283
|
+
config: resolvePiCodingAgentConfig(parsed, env)
|
|
4284
|
+
};
|
|
4285
|
+
case "mock":
|
|
4286
|
+
return {
|
|
4287
|
+
kind: "mock",
|
|
4288
|
+
name: parsed.name,
|
|
4289
|
+
judgeTarget: parsed.judge_target,
|
|
4290
|
+
workers: parsed.workers,
|
|
4291
|
+
providerBatching,
|
|
4292
|
+
config: resolveMockConfig(parsed)
|
|
4293
|
+
};
|
|
4294
|
+
case "vscode":
|
|
4295
|
+
case "vscode-insiders":
|
|
4296
|
+
return {
|
|
4297
|
+
kind: provider,
|
|
4298
|
+
name: parsed.name,
|
|
4299
|
+
judgeTarget: parsed.judge_target,
|
|
4300
|
+
workers: parsed.workers,
|
|
4301
|
+
providerBatching,
|
|
4302
|
+
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
4303
|
+
};
|
|
4304
|
+
case "cli":
|
|
4305
|
+
return {
|
|
4306
|
+
kind: "cli",
|
|
4307
|
+
name: parsed.name,
|
|
4308
|
+
judgeTarget: parsed.judge_target,
|
|
4309
|
+
workers: parsed.workers,
|
|
4310
|
+
providerBatching,
|
|
4311
|
+
config: resolveCliConfig(parsed, env, evalFilePath)
|
|
4312
|
+
};
|
|
4313
|
+
default:
|
|
4314
|
+
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
4315
|
+
}
|
|
4316
|
+
}
|
|
4317
|
+
function resolveAzureConfig(target, env) {
|
|
4318
|
+
const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
|
|
4319
|
+
const apiKeySource = target.api_key ?? target.apiKey;
|
|
4320
|
+
const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
|
|
4321
|
+
const versionSource = target.version ?? target.api_version;
|
|
4322
|
+
const temperatureSource = target.temperature;
|
|
4323
|
+
const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
|
|
4324
|
+
const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
|
|
4325
|
+
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
4326
|
+
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
4327
|
+
const version = normalizeAzureApiVersion(
|
|
4328
|
+
resolveOptionalString(versionSource, env, `${target.name} api version`, {
|
|
4329
|
+
allowLiteral: true,
|
|
4330
|
+
optionalEnv: true
|
|
4331
|
+
})
|
|
4332
|
+
);
|
|
4333
|
+
const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
|
|
4334
|
+
const maxOutputTokens = resolveOptionalNumber(
|
|
4335
|
+
maxTokensSource,
|
|
4336
|
+
`${target.name} max output tokens`
|
|
4337
|
+
);
|
|
4338
|
+
const retry = resolveRetryConfig(target);
|
|
4339
|
+
return {
|
|
4340
|
+
resourceName,
|
|
4341
|
+
deploymentName,
|
|
4342
|
+
apiKey,
|
|
4343
|
+
version,
|
|
4344
|
+
temperature,
|
|
4345
|
+
maxOutputTokens,
|
|
4346
|
+
retry
|
|
4347
|
+
};
|
|
4348
|
+
}
|
|
4349
|
+
function resolveAnthropicConfig(target, env) {
|
|
4350
|
+
const apiKeySource = target.api_key ?? target.apiKey;
|
|
4351
|
+
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
4352
|
+
const temperatureSource = target.temperature;
|
|
3348
4353
|
const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
|
|
3349
4354
|
const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
|
|
3350
4355
|
const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
|
|
@@ -3385,6 +4390,7 @@ function resolveCodexConfig(target, env) {
|
|
|
3385
4390
|
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
3386
4391
|
const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
|
|
3387
4392
|
const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
4393
|
+
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
3388
4394
|
const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
|
|
3389
4395
|
allowLiteral: true,
|
|
3390
4396
|
optionalEnv: true
|
|
@@ -3400,13 +4406,15 @@ function resolveCodexConfig(target, env) {
|
|
|
3400
4406
|
optionalEnv: true
|
|
3401
4407
|
});
|
|
3402
4408
|
const logFormat = normalizeCodexLogFormat(logFormatSource);
|
|
4409
|
+
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
3403
4410
|
return {
|
|
3404
4411
|
executable,
|
|
3405
4412
|
args,
|
|
3406
4413
|
cwd,
|
|
3407
4414
|
timeoutMs,
|
|
3408
4415
|
logDir,
|
|
3409
|
-
logFormat
|
|
4416
|
+
logFormat,
|
|
4417
|
+
systemPrompt
|
|
3410
4418
|
};
|
|
3411
4419
|
}
|
|
3412
4420
|
function normalizeCodexLogFormat(value) {
|
|
@@ -3422,10 +4430,73 @@ function normalizeCodexLogFormat(value) {
|
|
|
3422
4430
|
}
|
|
3423
4431
|
throw new Error("codex log format must be 'summary' or 'json'");
|
|
3424
4432
|
}
|
|
4433
|
+
function resolvePiCodingAgentConfig(target, env) {
|
|
4434
|
+
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
4435
|
+
const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
|
|
4436
|
+
const modelSource = target.model ?? target.pi_model ?? target.piModel;
|
|
4437
|
+
const apiKeySource = target.api_key ?? target.apiKey;
|
|
4438
|
+
const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
|
|
4439
|
+
const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
|
|
4440
|
+
const argsSource = target.args ?? target.arguments;
|
|
4441
|
+
const cwdSource = target.cwd;
|
|
4442
|
+
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
4443
|
+
const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
|
|
4444
|
+
const logFormatSource = target.log_format ?? target.logFormat;
|
|
4445
|
+
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
4446
|
+
const executable = resolveOptionalString(executableSource, env, `${target.name} pi executable`, {
|
|
4447
|
+
allowLiteral: true,
|
|
4448
|
+
optionalEnv: true
|
|
4449
|
+
}) ?? "pi";
|
|
4450
|
+
const provider = resolveOptionalString(providerSource, env, `${target.name} pi provider`, {
|
|
4451
|
+
allowLiteral: true,
|
|
4452
|
+
optionalEnv: true
|
|
4453
|
+
});
|
|
4454
|
+
const model = resolveOptionalString(modelSource, env, `${target.name} pi model`, {
|
|
4455
|
+
allowLiteral: true,
|
|
4456
|
+
optionalEnv: true
|
|
4457
|
+
});
|
|
4458
|
+
const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} pi api key`, {
|
|
4459
|
+
allowLiteral: false,
|
|
4460
|
+
optionalEnv: true
|
|
4461
|
+
});
|
|
4462
|
+
const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
|
|
4463
|
+
allowLiteral: true,
|
|
4464
|
+
optionalEnv: true
|
|
4465
|
+
});
|
|
4466
|
+
const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, {
|
|
4467
|
+
allowLiteral: true,
|
|
4468
|
+
optionalEnv: true
|
|
4469
|
+
});
|
|
4470
|
+
const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
|
|
4471
|
+
const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
|
|
4472
|
+
allowLiteral: true,
|
|
4473
|
+
optionalEnv: true
|
|
4474
|
+
});
|
|
4475
|
+
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);
|
|
4476
|
+
const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
|
|
4477
|
+
allowLiteral: true,
|
|
4478
|
+
optionalEnv: true
|
|
4479
|
+
});
|
|
4480
|
+
const logFormat = logFormatSource === "json" || logFormatSource === "summary" ? logFormatSource : void 0;
|
|
4481
|
+
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
4482
|
+
return {
|
|
4483
|
+
executable,
|
|
4484
|
+
provider,
|
|
4485
|
+
model,
|
|
4486
|
+
apiKey,
|
|
4487
|
+
tools,
|
|
4488
|
+
thinking,
|
|
4489
|
+
args,
|
|
4490
|
+
cwd,
|
|
4491
|
+
timeoutMs,
|
|
4492
|
+
logDir,
|
|
4493
|
+
logFormat,
|
|
4494
|
+
systemPrompt
|
|
4495
|
+
};
|
|
4496
|
+
}
|
|
3425
4497
|
function resolveMockConfig(target) {
|
|
3426
4498
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
3427
|
-
|
|
3428
|
-
return { response, trace };
|
|
4499
|
+
return { response };
|
|
3429
4500
|
}
|
|
3430
4501
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
3431
4502
|
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
@@ -3457,42 +4528,35 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
3457
4528
|
workspaceTemplate
|
|
3458
4529
|
};
|
|
3459
4530
|
}
|
|
3460
|
-
|
|
3461
|
-
|
|
3462
|
-
|
|
3463
|
-
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
3464
|
-
);
|
|
3465
|
-
const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
|
|
3466
|
-
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
3467
|
-
allowLiteral: true,
|
|
3468
|
-
optionalEnv: true
|
|
3469
|
-
});
|
|
3470
|
-
if (cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd)) {
|
|
3471
|
-
cwd = import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd);
|
|
4531
|
+
var cliErrorMap = (issue, ctx) => {
|
|
4532
|
+
if (issue.code === import_zod.z.ZodIssueCode.unrecognized_keys) {
|
|
4533
|
+
return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
|
|
3472
4534
|
}
|
|
3473
|
-
if (
|
|
3474
|
-
|
|
4535
|
+
if (issue.code === import_zod.z.ZodIssueCode.invalid_union_discriminator) {
|
|
4536
|
+
return { message: "healthcheck type must be 'http' or 'command'" };
|
|
3475
4537
|
}
|
|
3476
|
-
|
|
3477
|
-
|
|
3478
|
-
|
|
3479
|
-
|
|
3480
|
-
|
|
3481
|
-
|
|
3482
|
-
|
|
3483
|
-
|
|
3484
|
-
|
|
3485
|
-
|
|
3486
|
-
|
|
3487
|
-
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
|
|
3493
|
-
|
|
3494
|
-
|
|
3495
|
-
|
|
4538
|
+
if (issue.code === import_zod.z.ZodIssueCode.invalid_type && issue.expected === "string") {
|
|
4539
|
+
return { message: `${ctx.defaultError} (expected a string value)` };
|
|
4540
|
+
}
|
|
4541
|
+
return { message: ctx.defaultError };
|
|
4542
|
+
};
|
|
4543
|
+
function resolveCliConfig(target, env, evalFilePath) {
|
|
4544
|
+
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
4545
|
+
if (!parseResult.success) {
|
|
4546
|
+
const firstError = parseResult.error.errors[0];
|
|
4547
|
+
const path16 = firstError?.path.join(".") || "";
|
|
4548
|
+
const prefix = path16 ? `${target.name} ${path16}: ` : `${target.name}: `;
|
|
4549
|
+
throw new Error(`${prefix}${firstError?.message}`);
|
|
4550
|
+
}
|
|
4551
|
+
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
4552
|
+
assertSupportedCliPlaceholders(normalized.commandTemplate, `${target.name} CLI command template`);
|
|
4553
|
+
if (normalized.healthcheck?.type === "command") {
|
|
4554
|
+
assertSupportedCliPlaceholders(
|
|
4555
|
+
normalized.healthcheck.commandTemplate,
|
|
4556
|
+
`${target.name} healthcheck command template`
|
|
4557
|
+
);
|
|
4558
|
+
}
|
|
4559
|
+
return normalized;
|
|
3496
4560
|
}
|
|
3497
4561
|
function resolveTimeoutMs(source, description) {
|
|
3498
4562
|
const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
|
|
@@ -3504,49 +4568,6 @@ function resolveTimeoutMs(source, description) {
|
|
|
3504
4568
|
}
|
|
3505
4569
|
return Math.floor(seconds * 1e3);
|
|
3506
4570
|
}
|
|
3507
|
-
function resolveCliHealthcheck(source, env, targetName, evalFilePath) {
|
|
3508
|
-
if (source === void 0 || source === null) {
|
|
3509
|
-
return void 0;
|
|
3510
|
-
}
|
|
3511
|
-
if (typeof source !== "object" || Array.isArray(source)) {
|
|
3512
|
-
throw new Error(`${targetName} healthcheck must be an object`);
|
|
3513
|
-
}
|
|
3514
|
-
const candidate = source;
|
|
3515
|
-
const type = candidate.type;
|
|
3516
|
-
const timeoutMs = resolveTimeoutMs(
|
|
3517
|
-
candidate.timeout_seconds ?? candidate.timeoutSeconds,
|
|
3518
|
-
`${targetName} healthcheck timeout`
|
|
3519
|
-
);
|
|
3520
|
-
if (type === "http") {
|
|
3521
|
-
const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
|
|
3522
|
-
return {
|
|
3523
|
-
type: "http",
|
|
3524
|
-
url,
|
|
3525
|
-
timeoutMs
|
|
3526
|
-
};
|
|
3527
|
-
}
|
|
3528
|
-
if (type === "command") {
|
|
3529
|
-
const commandTemplate = resolveString(
|
|
3530
|
-
candidate.command_template ?? candidate.commandTemplate,
|
|
3531
|
-
env,
|
|
3532
|
-
`${targetName} healthcheck command template`,
|
|
3533
|
-
true
|
|
3534
|
-
);
|
|
3535
|
-
assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
|
|
3536
|
-
const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
|
|
3537
|
-
allowLiteral: true,
|
|
3538
|
-
optionalEnv: true
|
|
3539
|
-
});
|
|
3540
|
-
const resolvedCwd = cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd) ? import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd) : cwd;
|
|
3541
|
-
return {
|
|
3542
|
-
type: "command",
|
|
3543
|
-
commandTemplate,
|
|
3544
|
-
timeoutMs,
|
|
3545
|
-
cwd: resolvedCwd
|
|
3546
|
-
};
|
|
3547
|
-
}
|
|
3548
|
-
throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
|
|
3549
|
-
}
|
|
3550
4571
|
function assertSupportedCliPlaceholders(template, description) {
|
|
3551
4572
|
const placeholders = extractCliPlaceholders(template);
|
|
3552
4573
|
for (const placeholder of placeholders) {
|
|
@@ -3712,7 +4733,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
3712
4733
|
}
|
|
3713
4734
|
|
|
3714
4735
|
// src/evaluation/providers/vscode.ts
|
|
3715
|
-
var
|
|
4736
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
3716
4737
|
var import_subagent = require("subagent");
|
|
3717
4738
|
|
|
3718
4739
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -3786,7 +4807,7 @@ var VSCodeProvider = class {
|
|
|
3786
4807
|
}
|
|
3787
4808
|
if (this.config.dryRun) {
|
|
3788
4809
|
return {
|
|
3789
|
-
|
|
4810
|
+
outputMessages: [],
|
|
3790
4811
|
raw: {
|
|
3791
4812
|
session,
|
|
3792
4813
|
inputFiles
|
|
@@ -3795,7 +4816,7 @@ var VSCodeProvider = class {
|
|
|
3795
4816
|
}
|
|
3796
4817
|
const responseText = await readTextFile(session.responseFile);
|
|
3797
4818
|
return {
|
|
3798
|
-
|
|
4819
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
3799
4820
|
raw: {
|
|
3800
4821
|
session,
|
|
3801
4822
|
inputFiles
|
|
@@ -3833,7 +4854,7 @@ var VSCodeProvider = class {
|
|
|
3833
4854
|
}
|
|
3834
4855
|
if (this.config.dryRun) {
|
|
3835
4856
|
return normalizedRequests.map(({ inputFiles }) => ({
|
|
3836
|
-
|
|
4857
|
+
outputMessages: [],
|
|
3837
4858
|
raw: {
|
|
3838
4859
|
session,
|
|
3839
4860
|
inputFiles,
|
|
@@ -3850,7 +4871,7 @@ var VSCodeProvider = class {
|
|
|
3850
4871
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
3851
4872
|
const responseText = await readTextFile(responseFile);
|
|
3852
4873
|
responses.push({
|
|
3853
|
-
|
|
4874
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
3854
4875
|
raw: {
|
|
3855
4876
|
session,
|
|
3856
4877
|
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
@@ -3882,7 +4903,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
3882
4903
|
return "";
|
|
3883
4904
|
}
|
|
3884
4905
|
const buildList = (files) => files.map((absolutePath) => {
|
|
3885
|
-
const fileName =
|
|
4906
|
+
const fileName = import_node_path13.default.basename(absolutePath);
|
|
3886
4907
|
const fileUri = pathToFileUri2(absolutePath);
|
|
3887
4908
|
return `* [${fileName}](${fileUri})`;
|
|
3888
4909
|
});
|
|
@@ -3907,8 +4928,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3907
4928
|
}
|
|
3908
4929
|
const unique = /* @__PURE__ */ new Map();
|
|
3909
4930
|
for (const attachment of attachments) {
|
|
3910
|
-
const absolutePath =
|
|
3911
|
-
const normalized = absolutePath.split(
|
|
4931
|
+
const absolutePath = import_node_path13.default.resolve(attachment);
|
|
4932
|
+
const normalized = absolutePath.split(import_node_path13.default.sep).join("/");
|
|
3912
4933
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3913
4934
|
if (!unique.has(absolutePath)) {
|
|
3914
4935
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3923,7 +4944,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3923
4944
|
}
|
|
3924
4945
|
const unique = /* @__PURE__ */ new Map();
|
|
3925
4946
|
for (const attachment of attachments) {
|
|
3926
|
-
const absolutePath =
|
|
4947
|
+
const absolutePath = import_node_path13.default.resolve(attachment);
|
|
3927
4948
|
if (!unique.has(absolutePath)) {
|
|
3928
4949
|
unique.set(absolutePath, absolutePath);
|
|
3929
4950
|
}
|
|
@@ -3931,7 +4952,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3931
4952
|
return Array.from(unique.values());
|
|
3932
4953
|
}
|
|
3933
4954
|
function pathToFileUri2(filePath) {
|
|
3934
|
-
const absolutePath =
|
|
4955
|
+
const absolutePath = import_node_path13.default.isAbsolute(filePath) ? filePath : import_node_path13.default.resolve(filePath);
|
|
3935
4956
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3936
4957
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3937
4958
|
return `file:///${normalizedPath}`;
|
|
@@ -3944,7 +4965,7 @@ function normalizeAttachments(attachments) {
|
|
|
3944
4965
|
}
|
|
3945
4966
|
const deduped = /* @__PURE__ */ new Set();
|
|
3946
4967
|
for (const attachment of attachments) {
|
|
3947
|
-
deduped.add(
|
|
4968
|
+
deduped.add(import_node_path13.default.resolve(attachment));
|
|
3948
4969
|
}
|
|
3949
4970
|
return Array.from(deduped);
|
|
3950
4971
|
}
|
|
@@ -3953,7 +4974,7 @@ function mergeAttachments(all) {
|
|
|
3953
4974
|
for (const list of all) {
|
|
3954
4975
|
if (!list) continue;
|
|
3955
4976
|
for (const inputFile of list) {
|
|
3956
|
-
deduped.add(
|
|
4977
|
+
deduped.add(import_node_path13.default.resolve(inputFile));
|
|
3957
4978
|
}
|
|
3958
4979
|
}
|
|
3959
4980
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -4000,9 +5021,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
4000
5021
|
}
|
|
4001
5022
|
|
|
4002
5023
|
// src/evaluation/providers/targets-file.ts
|
|
4003
|
-
var
|
|
4004
|
-
var
|
|
4005
|
-
var
|
|
5024
|
+
var import_node_fs5 = require("fs");
|
|
5025
|
+
var import_promises11 = require("fs/promises");
|
|
5026
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
4006
5027
|
var import_yaml3 = require("yaml");
|
|
4007
5028
|
function isRecord(value) {
|
|
4008
5029
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -4032,18 +5053,18 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
4032
5053
|
}
|
|
4033
5054
|
async function fileExists3(filePath) {
|
|
4034
5055
|
try {
|
|
4035
|
-
await (0,
|
|
5056
|
+
await (0, import_promises11.access)(filePath, import_node_fs5.constants.F_OK);
|
|
4036
5057
|
return true;
|
|
4037
5058
|
} catch {
|
|
4038
5059
|
return false;
|
|
4039
5060
|
}
|
|
4040
5061
|
}
|
|
4041
5062
|
async function readTargetDefinitions(filePath) {
|
|
4042
|
-
const absolutePath =
|
|
5063
|
+
const absolutePath = import_node_path14.default.resolve(filePath);
|
|
4043
5064
|
if (!await fileExists3(absolutePath)) {
|
|
4044
5065
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
4045
5066
|
}
|
|
4046
|
-
const raw = await (0,
|
|
5067
|
+
const raw = await (0, import_promises11.readFile)(absolutePath, "utf8");
|
|
4047
5068
|
const parsed = (0, import_yaml3.parse)(raw);
|
|
4048
5069
|
if (!isRecord(parsed)) {
|
|
4049
5070
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -4071,6 +5092,8 @@ function createProvider(target) {
|
|
|
4071
5092
|
return new CliProvider(target.name, target.config);
|
|
4072
5093
|
case "codex":
|
|
4073
5094
|
return new CodexProvider(target.name, target.config);
|
|
5095
|
+
case "pi-coding-agent":
|
|
5096
|
+
return new PiCodingAgentProvider(target.name, target.config);
|
|
4074
5097
|
case "mock":
|
|
4075
5098
|
return new MockProvider(target.name, target.config);
|
|
4076
5099
|
case "vscode":
|
|
@@ -4090,6 +5113,100 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
4090
5113
|
// src/evaluation/evaluators.ts
|
|
4091
5114
|
var import_ai2 = require("ai");
|
|
4092
5115
|
var import_zod2 = require("zod");
|
|
5116
|
+
|
|
5117
|
+
// src/runtime/exec.ts
|
|
5118
|
+
function getBunSpawn() {
|
|
5119
|
+
const bunSpawn = globalThis.Bun?.spawn;
|
|
5120
|
+
return typeof bunSpawn === "function" ? bunSpawn : void 0;
|
|
5121
|
+
}
|
|
5122
|
+
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
5123
|
+
const bunSpawn = getBunSpawn();
|
|
5124
|
+
if (bunSpawn) {
|
|
5125
|
+
const encoder = new TextEncoder();
|
|
5126
|
+
const proc = bunSpawn({
|
|
5127
|
+
cmd: ["sh", "-c", command],
|
|
5128
|
+
cwd: options.cwd,
|
|
5129
|
+
stdin: encoder.encode(stdinPayload),
|
|
5130
|
+
stdout: "pipe",
|
|
5131
|
+
stderr: "pipe"
|
|
5132
|
+
});
|
|
5133
|
+
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
5134
|
+
proc.kill();
|
|
5135
|
+
}, options.timeoutMs) : void 0;
|
|
5136
|
+
try {
|
|
5137
|
+
const stdout = await new Response(proc.stdout).text();
|
|
5138
|
+
const stderr = await new Response(proc.stderr).text();
|
|
5139
|
+
const exitCode = await proc.exited;
|
|
5140
|
+
return { stdout, stderr, exitCode };
|
|
5141
|
+
} finally {
|
|
5142
|
+
if (timeout !== void 0) {
|
|
5143
|
+
clearTimeout(timeout);
|
|
5144
|
+
}
|
|
5145
|
+
}
|
|
5146
|
+
}
|
|
5147
|
+
const { spawn: spawn3 } = await import("child_process");
|
|
5148
|
+
return await new Promise((resolve, reject) => {
|
|
5149
|
+
const child = spawn3(command, {
|
|
5150
|
+
shell: true,
|
|
5151
|
+
cwd: options.cwd,
|
|
5152
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
5153
|
+
});
|
|
5154
|
+
let stdout = "";
|
|
5155
|
+
let stderr = "";
|
|
5156
|
+
const timeout = options.timeoutMs ? setTimeout(() => {
|
|
5157
|
+
child.kill();
|
|
5158
|
+
reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
|
|
5159
|
+
}, options.timeoutMs) : void 0;
|
|
5160
|
+
child.stdout?.on("data", (data) => {
|
|
5161
|
+
stdout += data.toString();
|
|
5162
|
+
});
|
|
5163
|
+
child.stderr?.on("data", (data) => {
|
|
5164
|
+
stderr += data.toString();
|
|
5165
|
+
});
|
|
5166
|
+
child.on("error", (error) => {
|
|
5167
|
+
if (timeout !== void 0) {
|
|
5168
|
+
clearTimeout(timeout);
|
|
5169
|
+
}
|
|
5170
|
+
reject(error);
|
|
5171
|
+
});
|
|
5172
|
+
child.on("exit", (code) => {
|
|
5173
|
+
if (timeout !== void 0) {
|
|
5174
|
+
clearTimeout(timeout);
|
|
5175
|
+
}
|
|
5176
|
+
resolve({ stdout, stderr, exitCode: code ?? 0 });
|
|
5177
|
+
});
|
|
5178
|
+
child.stdin?.write(stdinPayload);
|
|
5179
|
+
child.stdin?.end();
|
|
5180
|
+
});
|
|
5181
|
+
}
|
|
5182
|
+
|
|
5183
|
+
// src/evaluation/providers/types.ts
|
|
5184
|
+
var AGENT_PROVIDER_KINDS = [
|
|
5185
|
+
"codex",
|
|
5186
|
+
"pi-coding-agent",
|
|
5187
|
+
"vscode",
|
|
5188
|
+
"vscode-insiders"
|
|
5189
|
+
];
|
|
5190
|
+
function extractLastAssistantContent(messages) {
|
|
5191
|
+
if (!messages || messages.length === 0) {
|
|
5192
|
+
return "";
|
|
5193
|
+
}
|
|
5194
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
5195
|
+
const msg = messages[i];
|
|
5196
|
+
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
5197
|
+
if (typeof msg.content === "string") {
|
|
5198
|
+
return msg.content;
|
|
5199
|
+
}
|
|
5200
|
+
return JSON.stringify(msg.content);
|
|
5201
|
+
}
|
|
5202
|
+
}
|
|
5203
|
+
return "";
|
|
5204
|
+
}
|
|
5205
|
+
function isAgentProvider(provider) {
|
|
5206
|
+
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
5207
|
+
}
|
|
5208
|
+
|
|
5209
|
+
// src/evaluation/evaluators.ts
|
|
4093
5210
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
4094
5211
|
|
|
4095
5212
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -4154,6 +5271,7 @@ var LlmJudgeEvaluator = class {
|
|
|
4154
5271
|
null,
|
|
4155
5272
|
2
|
|
4156
5273
|
),
|
|
5274
|
+
[TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
|
|
4157
5275
|
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
4158
5276
|
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
4159
5277
|
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
@@ -4178,7 +5296,7 @@ var LlmJudgeEvaluator = class {
|
|
|
4178
5296
|
const score = clampScore(data.score);
|
|
4179
5297
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4180
5298
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4181
|
-
const reasoning = data.reasoning
|
|
5299
|
+
const reasoning = data.reasoning;
|
|
4182
5300
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
4183
5301
|
return {
|
|
4184
5302
|
score,
|
|
@@ -4280,7 +5398,9 @@ var LlmJudgeEvaluator = class {
|
|
|
4280
5398
|
maxOutputTokens: this.maxOutputTokens,
|
|
4281
5399
|
temperature: this.temperature
|
|
4282
5400
|
});
|
|
4283
|
-
const data = schema.parse(
|
|
5401
|
+
const data = schema.parse(
|
|
5402
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
5403
|
+
);
|
|
4284
5404
|
return { data, providerResponse: response };
|
|
4285
5405
|
} catch (e) {
|
|
4286
5406
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
@@ -4362,17 +5482,17 @@ var CodeEvaluator = class {
|
|
|
4362
5482
|
const inputPayload = JSON.stringify(
|
|
4363
5483
|
{
|
|
4364
5484
|
question: context.evalCase.question,
|
|
4365
|
-
|
|
4366
|
-
|
|
4367
|
-
|
|
4368
|
-
|
|
4369
|
-
|
|
4370
|
-
|
|
4371
|
-
|
|
5485
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
5486
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
5487
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
5488
|
+
candidateAnswer: context.candidate,
|
|
5489
|
+
outputMessages: context.outputMessages ?? null,
|
|
5490
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
5491
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
5492
|
+
(path16) => !context.evalCase.guideline_paths.includes(path16)
|
|
4372
5493
|
),
|
|
4373
|
-
|
|
4374
|
-
|
|
4375
|
-
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
5494
|
+
inputMessages: context.evalCase.input_messages,
|
|
5495
|
+
traceSummary: context.traceSummary ?? null
|
|
4376
5496
|
},
|
|
4377
5497
|
null,
|
|
4378
5498
|
2
|
|
@@ -4442,43 +5562,17 @@ function calculateRubricScore(result, rubrics) {
|
|
|
4442
5562
|
return { score, verdict, hits, misses };
|
|
4443
5563
|
}
|
|
4444
5564
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
4445
|
-
const {
|
|
4446
|
-
|
|
4447
|
-
|
|
4448
|
-
shell: true,
|
|
4449
|
-
cwd
|
|
4450
|
-
});
|
|
4451
|
-
let stdout = "";
|
|
4452
|
-
let stderr = "";
|
|
4453
|
-
const timeout = agentTimeoutMs ? setTimeout(() => {
|
|
4454
|
-
child.kill();
|
|
4455
|
-
reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
|
|
4456
|
-
}, agentTimeoutMs) : void 0;
|
|
4457
|
-
child.stdout?.on("data", (data) => {
|
|
4458
|
-
stdout += data.toString();
|
|
4459
|
-
});
|
|
4460
|
-
child.stderr?.on("data", (data) => {
|
|
4461
|
-
stderr += data.toString();
|
|
4462
|
-
});
|
|
4463
|
-
child.on("error", (error) => {
|
|
4464
|
-
if (timeout !== void 0) {
|
|
4465
|
-
clearTimeout(timeout);
|
|
4466
|
-
}
|
|
4467
|
-
reject(error);
|
|
4468
|
-
});
|
|
4469
|
-
child.on("exit", (code) => {
|
|
4470
|
-
if (timeout !== void 0) {
|
|
4471
|
-
clearTimeout(timeout);
|
|
4472
|
-
}
|
|
4473
|
-
if (code && code !== 0 && stderr.length > 0) {
|
|
4474
|
-
reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
|
|
4475
|
-
return;
|
|
4476
|
-
}
|
|
4477
|
-
resolve(stdout.trim());
|
|
4478
|
-
});
|
|
4479
|
-
child.stdin?.write(input);
|
|
4480
|
-
child.stdin?.end();
|
|
5565
|
+
const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
|
|
5566
|
+
cwd,
|
|
5567
|
+
timeoutMs: agentTimeoutMs
|
|
4481
5568
|
});
|
|
5569
|
+
if (exitCode !== 0) {
|
|
5570
|
+
const trimmedErr = stderr.trim();
|
|
5571
|
+
throw new Error(
|
|
5572
|
+
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
5573
|
+
);
|
|
5574
|
+
}
|
|
5575
|
+
return stdout.trim();
|
|
4482
5576
|
}
|
|
4483
5577
|
function parseJsonSafe(payload) {
|
|
4484
5578
|
try {
|
|
@@ -4492,6 +5586,33 @@ function substituteVariables(template, variables) {
|
|
|
4492
5586
|
return variables[varName] ?? match;
|
|
4493
5587
|
});
|
|
4494
5588
|
}
|
|
5589
|
+
function deepEqual(a, b) {
|
|
5590
|
+
if (a === b) return true;
|
|
5591
|
+
if (a === null || b === null) return a === b;
|
|
5592
|
+
if (typeof a !== typeof b) return false;
|
|
5593
|
+
if (typeof a !== "object") return a === b;
|
|
5594
|
+
if (Array.isArray(a) !== Array.isArray(b)) return false;
|
|
5595
|
+
if (Array.isArray(a) && Array.isArray(b)) {
|
|
5596
|
+
if (a.length !== b.length) return false;
|
|
5597
|
+
return a.every((val, i) => deepEqual(val, b[i]));
|
|
5598
|
+
}
|
|
5599
|
+
const aObj = a;
|
|
5600
|
+
const bObj = b;
|
|
5601
|
+
const aKeys = Object.keys(aObj);
|
|
5602
|
+
const bKeys = Object.keys(bObj);
|
|
5603
|
+
if (aKeys.length !== bKeys.length) return false;
|
|
5604
|
+
return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
|
|
5605
|
+
}
|
|
5606
|
+
function argsMatch(expected, actual) {
|
|
5607
|
+
if (expected === void 0) return true;
|
|
5608
|
+
if (expected === "any") return true;
|
|
5609
|
+
if (actual === void 0) return false;
|
|
5610
|
+
for (const key of Object.keys(expected)) {
|
|
5611
|
+
if (!Object.hasOwn(actual, key)) return false;
|
|
5612
|
+
if (!deepEqual(expected[key], actual[key])) return false;
|
|
5613
|
+
}
|
|
5614
|
+
return true;
|
|
5615
|
+
}
|
|
4495
5616
|
var ToolTrajectoryEvaluator = class {
|
|
4496
5617
|
kind = "tool_trajectory";
|
|
4497
5618
|
config;
|
|
@@ -4499,8 +5620,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4499
5620
|
this.config = options.config;
|
|
4500
5621
|
}
|
|
4501
5622
|
evaluate(context) {
|
|
4502
|
-
const {
|
|
4503
|
-
|
|
5623
|
+
const { outputMessages, traceSummary } = context;
|
|
5624
|
+
const toolCalls = this.extractToolCallsFromMessages(outputMessages);
|
|
5625
|
+
if (toolCalls.length === 0 && !traceSummary) {
|
|
5626
|
+
return {
|
|
5627
|
+
score: 0,
|
|
5628
|
+
verdict: "fail",
|
|
5629
|
+
hits: [],
|
|
5630
|
+
misses: ["No trace available for evaluation"],
|
|
5631
|
+
expectedAspectCount: 1
|
|
5632
|
+
};
|
|
5633
|
+
}
|
|
5634
|
+
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
5635
|
+
if (!summary) {
|
|
4504
5636
|
return {
|
|
4505
5637
|
score: 0,
|
|
4506
5638
|
verdict: "fail",
|
|
@@ -4511,11 +5643,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4511
5643
|
}
|
|
4512
5644
|
switch (this.config.mode) {
|
|
4513
5645
|
case "any_order":
|
|
4514
|
-
return this.evaluateAnyOrder(
|
|
5646
|
+
return this.evaluateAnyOrder(summary);
|
|
4515
5647
|
case "in_order":
|
|
4516
|
-
return this.evaluateInOrder(
|
|
5648
|
+
return this.evaluateInOrder(toolCalls);
|
|
4517
5649
|
case "exact":
|
|
4518
|
-
return this.evaluateExact(
|
|
5650
|
+
return this.evaluateExact(toolCalls);
|
|
4519
5651
|
default:
|
|
4520
5652
|
return {
|
|
4521
5653
|
score: 0,
|
|
@@ -4526,6 +5658,42 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4526
5658
|
};
|
|
4527
5659
|
}
|
|
4528
5660
|
}
|
|
5661
|
+
/**
|
|
5662
|
+
* Extract tool calls from output messages.
|
|
5663
|
+
*/
|
|
5664
|
+
extractToolCallsFromMessages(messages) {
|
|
5665
|
+
if (!messages) {
|
|
5666
|
+
return [];
|
|
5667
|
+
}
|
|
5668
|
+
const toolCalls = [];
|
|
5669
|
+
for (const message of messages) {
|
|
5670
|
+
if (message.toolCalls) {
|
|
5671
|
+
for (const call of message.toolCalls) {
|
|
5672
|
+
toolCalls.push({
|
|
5673
|
+
name: call.tool,
|
|
5674
|
+
args: call.input
|
|
5675
|
+
});
|
|
5676
|
+
}
|
|
5677
|
+
}
|
|
5678
|
+
}
|
|
5679
|
+
return toolCalls;
|
|
5680
|
+
}
|
|
5681
|
+
/**
|
|
5682
|
+
* Build a summary from extracted tool calls.
|
|
5683
|
+
*/
|
|
5684
|
+
buildSummary(toolCalls) {
|
|
5685
|
+
const toolCallsByName = {};
|
|
5686
|
+
for (const call of toolCalls) {
|
|
5687
|
+
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
5688
|
+
}
|
|
5689
|
+
const toolNames = Object.keys(toolCallsByName).sort();
|
|
5690
|
+
return {
|
|
5691
|
+
eventCount: toolCalls.length,
|
|
5692
|
+
toolNames,
|
|
5693
|
+
toolCallsByName,
|
|
5694
|
+
errorCount: 0
|
|
5695
|
+
};
|
|
5696
|
+
}
|
|
4529
5697
|
evaluateAnyOrder(summary) {
|
|
4530
5698
|
const minimums = this.config.minimums ?? {};
|
|
4531
5699
|
const toolNames = Object.keys(minimums);
|
|
@@ -4558,7 +5726,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4558
5726
|
expectedAspectCount: toolNames.length
|
|
4559
5727
|
};
|
|
4560
5728
|
}
|
|
4561
|
-
evaluateInOrder(
|
|
5729
|
+
evaluateInOrder(toolCalls) {
|
|
4562
5730
|
const expected = this.config.expected ?? [];
|
|
4563
5731
|
if (expected.length === 0) {
|
|
4564
5732
|
return {
|
|
@@ -4569,23 +5737,33 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4569
5737
|
expectedAspectCount: 0
|
|
4570
5738
|
};
|
|
4571
5739
|
}
|
|
4572
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4573
5740
|
const hits = [];
|
|
4574
5741
|
const misses = [];
|
|
4575
5742
|
let actualIndex = 0;
|
|
4576
5743
|
for (let i = 0; i < expected.length; i++) {
|
|
4577
|
-
const
|
|
5744
|
+
const expectedItem = expected[i];
|
|
5745
|
+
const expectedTool = expectedItem.tool;
|
|
4578
5746
|
let found = false;
|
|
4579
|
-
|
|
4580
|
-
|
|
4581
|
-
|
|
5747
|
+
let argsMismatch = false;
|
|
5748
|
+
while (actualIndex < toolCalls.length) {
|
|
5749
|
+
const actualCall = toolCalls[actualIndex];
|
|
5750
|
+
if (actualCall.name === expectedTool) {
|
|
5751
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
5752
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
5753
|
+
actualIndex++;
|
|
5754
|
+
found = true;
|
|
5755
|
+
break;
|
|
5756
|
+
}
|
|
5757
|
+
misses.push(
|
|
5758
|
+
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
5759
|
+
);
|
|
4582
5760
|
actualIndex++;
|
|
4583
|
-
|
|
5761
|
+
argsMismatch = true;
|
|
4584
5762
|
break;
|
|
4585
5763
|
}
|
|
4586
5764
|
actualIndex++;
|
|
4587
5765
|
}
|
|
4588
|
-
if (!found) {
|
|
5766
|
+
if (!found && !argsMismatch) {
|
|
4589
5767
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
4590
5768
|
}
|
|
4591
5769
|
}
|
|
@@ -4598,7 +5776,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4598
5776
|
expectedAspectCount: expected.length
|
|
4599
5777
|
};
|
|
4600
5778
|
}
|
|
4601
|
-
evaluateExact(
|
|
5779
|
+
evaluateExact(toolCalls) {
|
|
4602
5780
|
const expected = this.config.expected ?? [];
|
|
4603
5781
|
if (expected.length === 0) {
|
|
4604
5782
|
return {
|
|
@@ -4609,18 +5787,23 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4609
5787
|
expectedAspectCount: 0
|
|
4610
5788
|
};
|
|
4611
5789
|
}
|
|
4612
|
-
const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
|
|
4613
5790
|
const hits = [];
|
|
4614
5791
|
const misses = [];
|
|
4615
|
-
if (
|
|
4616
|
-
misses.push(`Expected ${expected.length} tool calls, got ${
|
|
5792
|
+
if (toolCalls.length !== expected.length) {
|
|
5793
|
+
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
4617
5794
|
}
|
|
4618
|
-
const checkLength = Math.min(expected.length,
|
|
5795
|
+
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
4619
5796
|
for (let i = 0; i < checkLength; i++) {
|
|
4620
|
-
const
|
|
4621
|
-
const
|
|
5797
|
+
const expectedItem = expected[i];
|
|
5798
|
+
const expectedTool = expectedItem.tool;
|
|
5799
|
+
const actualCall = toolCalls[i];
|
|
5800
|
+
const actualTool = actualCall.name;
|
|
4622
5801
|
if (actualTool === expectedTool) {
|
|
4623
|
-
|
|
5802
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
5803
|
+
hits.push(`Position ${i}: ${expectedTool}`);
|
|
5804
|
+
} else {
|
|
5805
|
+
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
5806
|
+
}
|
|
4624
5807
|
} else {
|
|
4625
5808
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
4626
5809
|
}
|
|
@@ -4832,11 +6015,13 @@ var CompositeEvaluator = class {
|
|
|
4832
6015
|
evalCaseId: context.evalCase.id,
|
|
4833
6016
|
attempt: context.attempt
|
|
4834
6017
|
});
|
|
4835
|
-
const data = freeformEvaluationSchema.parse(
|
|
6018
|
+
const data = freeformEvaluationSchema.parse(
|
|
6019
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
6020
|
+
);
|
|
4836
6021
|
const score = clampScore(data.score);
|
|
4837
6022
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4838
6023
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
4839
|
-
const reasoning = data.reasoning
|
|
6024
|
+
const reasoning = data.reasoning;
|
|
4840
6025
|
return {
|
|
4841
6026
|
score,
|
|
4842
6027
|
verdict: scoreToVerdict(score),
|
|
@@ -4862,9 +6047,9 @@ var CompositeEvaluator = class {
|
|
|
4862
6047
|
};
|
|
4863
6048
|
|
|
4864
6049
|
// src/evaluation/orchestrator.ts
|
|
4865
|
-
var
|
|
4866
|
-
var
|
|
4867
|
-
var
|
|
6050
|
+
var import_node_crypto3 = require("crypto");
|
|
6051
|
+
var import_promises12 = require("fs/promises");
|
|
6052
|
+
var import_node_path15 = __toESM(require("path"), 1);
|
|
4868
6053
|
|
|
4869
6054
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
4870
6055
|
var Node = class {
|
|
@@ -5005,16 +6190,6 @@ function validateConcurrency(concurrency) {
|
|
|
5005
6190
|
}
|
|
5006
6191
|
}
|
|
5007
6192
|
|
|
5008
|
-
// src/evaluation/providers/types.ts
|
|
5009
|
-
var AGENT_PROVIDER_KINDS = [
|
|
5010
|
-
"codex",
|
|
5011
|
-
"vscode",
|
|
5012
|
-
"vscode-insiders"
|
|
5013
|
-
];
|
|
5014
|
-
function isAgentProvider(provider) {
|
|
5015
|
-
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
5016
|
-
}
|
|
5017
|
-
|
|
5018
6193
|
// src/evaluation/orchestrator.ts
|
|
5019
6194
|
async function runEvaluation(options) {
|
|
5020
6195
|
const {
|
|
@@ -5269,11 +6444,19 @@ async function runBatchEvaluation(options) {
|
|
|
5269
6444
|
const evalCase = evalCases[i];
|
|
5270
6445
|
const promptInputs = promptInputsList[i];
|
|
5271
6446
|
const providerResponse = batchResponse[i];
|
|
6447
|
+
const outputMessages = providerResponse.outputMessages;
|
|
6448
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
6449
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
6450
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
6451
|
+
costUsd: providerResponse.costUsd,
|
|
6452
|
+
durationMs: providerResponse.durationMs
|
|
6453
|
+
}) : void 0;
|
|
6454
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
5272
6455
|
let result;
|
|
5273
6456
|
try {
|
|
5274
6457
|
result = await evaluateCandidate({
|
|
5275
6458
|
evalCase,
|
|
5276
|
-
candidate
|
|
6459
|
+
candidate,
|
|
5277
6460
|
target,
|
|
5278
6461
|
provider,
|
|
5279
6462
|
evaluators: evaluatorRegistry,
|
|
@@ -5281,7 +6464,9 @@ async function runBatchEvaluation(options) {
|
|
|
5281
6464
|
nowFn,
|
|
5282
6465
|
attempt: 0,
|
|
5283
6466
|
judgeProvider: await resolveJudgeProvider(target),
|
|
5284
|
-
agentTimeoutMs
|
|
6467
|
+
agentTimeoutMs,
|
|
6468
|
+
outputMessages,
|
|
6469
|
+
traceSummary
|
|
5285
6470
|
});
|
|
5286
6471
|
} catch (error) {
|
|
5287
6472
|
const errorResult = buildErrorResult(
|
|
@@ -5385,21 +6570,18 @@ async function runEvalCase(options) {
|
|
|
5385
6570
|
if (cacheKey && cache && !cachedResponse) {
|
|
5386
6571
|
await cache.set(cacheKey, providerResponse);
|
|
5387
6572
|
}
|
|
5388
|
-
|
|
5389
|
-
|
|
5390
|
-
|
|
5391
|
-
|
|
5392
|
-
|
|
5393
|
-
|
|
5394
|
-
|
|
5395
|
-
|
|
5396
|
-
}
|
|
5397
|
-
}
|
|
5398
|
-
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
6573
|
+
const outputMessages = providerResponse.outputMessages;
|
|
6574
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
6575
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
6576
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
6577
|
+
costUsd: providerResponse.costUsd,
|
|
6578
|
+
durationMs: providerResponse.durationMs
|
|
6579
|
+
}) : void 0;
|
|
6580
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
5399
6581
|
try {
|
|
5400
6582
|
return await evaluateCandidate({
|
|
5401
6583
|
evalCase,
|
|
5402
|
-
candidate
|
|
6584
|
+
candidate,
|
|
5403
6585
|
target,
|
|
5404
6586
|
provider,
|
|
5405
6587
|
evaluators,
|
|
@@ -5408,9 +6590,8 @@ async function runEvalCase(options) {
|
|
|
5408
6590
|
attempt,
|
|
5409
6591
|
judgeProvider,
|
|
5410
6592
|
agentTimeoutMs,
|
|
5411
|
-
|
|
5412
|
-
|
|
5413
|
-
candidateTraceSummary
|
|
6593
|
+
outputMessages,
|
|
6594
|
+
traceSummary
|
|
5414
6595
|
});
|
|
5415
6596
|
} catch (error) {
|
|
5416
6597
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
@@ -5428,9 +6609,8 @@ async function evaluateCandidate(options) {
|
|
|
5428
6609
|
attempt,
|
|
5429
6610
|
judgeProvider,
|
|
5430
6611
|
agentTimeoutMs,
|
|
5431
|
-
|
|
5432
|
-
|
|
5433
|
-
candidateTraceSummary
|
|
6612
|
+
outputMessages,
|
|
6613
|
+
traceSummary
|
|
5434
6614
|
} = options;
|
|
5435
6615
|
const gradeTimestamp = nowFn();
|
|
5436
6616
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -5444,9 +6624,8 @@ async function evaluateCandidate(options) {
|
|
|
5444
6624
|
now: gradeTimestamp,
|
|
5445
6625
|
judgeProvider,
|
|
5446
6626
|
agentTimeoutMs,
|
|
5447
|
-
|
|
5448
|
-
|
|
5449
|
-
candidateTraceSummary
|
|
6627
|
+
outputMessages,
|
|
6628
|
+
traceSummary
|
|
5450
6629
|
});
|
|
5451
6630
|
const completedAt = nowFn();
|
|
5452
6631
|
let agentProviderRequest;
|
|
@@ -5470,21 +6649,21 @@ async function evaluateCandidate(options) {
|
|
|
5470
6649
|
}
|
|
5471
6650
|
return {
|
|
5472
6651
|
timestamp: completedAt.toISOString(),
|
|
5473
|
-
|
|
6652
|
+
evalId: evalCase.id,
|
|
5474
6653
|
dataset: evalCase.dataset,
|
|
5475
|
-
|
|
6654
|
+
conversationId: evalCase.conversation_id,
|
|
5476
6655
|
score: score.score,
|
|
5477
6656
|
hits: score.hits,
|
|
5478
6657
|
misses: score.misses,
|
|
5479
|
-
|
|
6658
|
+
candidateAnswer: candidate,
|
|
5480
6659
|
target: target.name,
|
|
5481
6660
|
reasoning: score.reasoning,
|
|
5482
|
-
|
|
5483
|
-
|
|
5484
|
-
|
|
5485
|
-
|
|
5486
|
-
|
|
5487
|
-
|
|
6661
|
+
rawAspects: score.rawAspects,
|
|
6662
|
+
agentProviderRequest,
|
|
6663
|
+
lmProviderRequest,
|
|
6664
|
+
evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
6665
|
+
evaluatorResults,
|
|
6666
|
+
traceSummary
|
|
5488
6667
|
};
|
|
5489
6668
|
}
|
|
5490
6669
|
async function runEvaluatorsForCase(options) {
|
|
@@ -5499,9 +6678,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
5499
6678
|
now,
|
|
5500
6679
|
judgeProvider,
|
|
5501
6680
|
agentTimeoutMs,
|
|
5502
|
-
|
|
5503
|
-
|
|
5504
|
-
candidateTraceSummary
|
|
6681
|
+
outputMessages,
|
|
6682
|
+
traceSummary
|
|
5505
6683
|
} = options;
|
|
5506
6684
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
5507
6685
|
return runEvaluatorList({
|
|
@@ -5516,9 +6694,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
5516
6694
|
now,
|
|
5517
6695
|
judgeProvider,
|
|
5518
6696
|
agentTimeoutMs,
|
|
5519
|
-
|
|
5520
|
-
|
|
5521
|
-
candidateTraceSummary
|
|
6697
|
+
outputMessages,
|
|
6698
|
+
traceSummary
|
|
5522
6699
|
});
|
|
5523
6700
|
}
|
|
5524
6701
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -5535,9 +6712,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
5535
6712
|
promptInputs,
|
|
5536
6713
|
now,
|
|
5537
6714
|
judgeProvider,
|
|
5538
|
-
|
|
5539
|
-
|
|
5540
|
-
candidateTraceSummary
|
|
6715
|
+
outputMessages,
|
|
6716
|
+
traceSummary
|
|
5541
6717
|
});
|
|
5542
6718
|
return { score };
|
|
5543
6719
|
}
|
|
@@ -5554,9 +6730,8 @@ async function runEvaluatorList(options) {
|
|
|
5554
6730
|
now,
|
|
5555
6731
|
judgeProvider,
|
|
5556
6732
|
agentTimeoutMs,
|
|
5557
|
-
|
|
5558
|
-
|
|
5559
|
-
candidateTraceSummary
|
|
6733
|
+
outputMessages,
|
|
6734
|
+
traceSummary
|
|
5560
6735
|
} = options;
|
|
5561
6736
|
const scored = [];
|
|
5562
6737
|
const evaluatorResults = [];
|
|
@@ -5586,7 +6761,7 @@ async function runEvaluatorList(options) {
|
|
|
5586
6761
|
hits: score2.hits,
|
|
5587
6762
|
misses: score2.misses,
|
|
5588
6763
|
reasoning: score2.reasoning,
|
|
5589
|
-
|
|
6764
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
5590
6765
|
});
|
|
5591
6766
|
}
|
|
5592
6767
|
if (evaluator.type === "code") {
|
|
@@ -5603,8 +6778,8 @@ async function runEvaluatorList(options) {
|
|
|
5603
6778
|
attempt,
|
|
5604
6779
|
promptInputs,
|
|
5605
6780
|
now,
|
|
5606
|
-
|
|
5607
|
-
|
|
6781
|
+
outputMessages,
|
|
6782
|
+
traceSummary
|
|
5608
6783
|
});
|
|
5609
6784
|
const weight = evaluator.weight ?? 1;
|
|
5610
6785
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -5617,11 +6792,11 @@ async function runEvaluatorList(options) {
|
|
|
5617
6792
|
hits: score2.hits,
|
|
5618
6793
|
misses: score2.misses,
|
|
5619
6794
|
reasoning: score2.reasoning,
|
|
5620
|
-
|
|
6795
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
5621
6796
|
});
|
|
5622
6797
|
}
|
|
5623
6798
|
if (evaluator.type === "composite") {
|
|
5624
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
6799
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path15.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
5625
6800
|
const createEvaluator = (memberConfig) => {
|
|
5626
6801
|
switch (memberConfig.type) {
|
|
5627
6802
|
case "llm_judge":
|
|
@@ -5674,8 +6849,8 @@ async function runEvaluatorList(options) {
|
|
|
5674
6849
|
hits: score2.hits,
|
|
5675
6850
|
misses: score2.misses,
|
|
5676
6851
|
reasoning: score2.reasoning,
|
|
5677
|
-
|
|
5678
|
-
|
|
6852
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
6853
|
+
evaluatorResults: mapChildResults(score2.evaluatorResults)
|
|
5679
6854
|
});
|
|
5680
6855
|
}
|
|
5681
6856
|
if (evaluator.type === "tool_trajectory") {
|
|
@@ -5690,9 +6865,8 @@ async function runEvaluatorList(options) {
|
|
|
5690
6865
|
attempt,
|
|
5691
6866
|
promptInputs,
|
|
5692
6867
|
now,
|
|
5693
|
-
|
|
5694
|
-
|
|
5695
|
-
candidateTraceSummary
|
|
6868
|
+
outputMessages,
|
|
6869
|
+
traceSummary
|
|
5696
6870
|
});
|
|
5697
6871
|
const weight = evaluator.weight ?? 1;
|
|
5698
6872
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -5834,22 +7008,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
5834
7008
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
5835
7009
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
5836
7010
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
5837
|
-
const filePath =
|
|
5838
|
-
await (0,
|
|
7011
|
+
const filePath = import_node_path15.default.resolve(directory, filename);
|
|
7012
|
+
await (0, import_promises12.mkdir)(import_node_path15.default.dirname(filePath), { recursive: true });
|
|
5839
7013
|
const payload = {
|
|
5840
7014
|
eval_id: evalCase.id,
|
|
5841
7015
|
question: promptInputs.question,
|
|
5842
7016
|
guidelines: promptInputs.guidelines,
|
|
5843
7017
|
guideline_paths: evalCase.guideline_paths
|
|
5844
7018
|
};
|
|
5845
|
-
await (0,
|
|
7019
|
+
await (0, import_promises12.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
5846
7020
|
}
|
|
5847
7021
|
function sanitizeFilename(value) {
|
|
5848
7022
|
if (!value) {
|
|
5849
7023
|
return "prompt";
|
|
5850
7024
|
}
|
|
5851
7025
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
5852
|
-
return sanitized.length > 0 ? sanitized : (0,
|
|
7026
|
+
return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
|
|
5853
7027
|
}
|
|
5854
7028
|
async function invokeProvider(provider, options) {
|
|
5855
7029
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -5906,22 +7080,22 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
5906
7080
|
}
|
|
5907
7081
|
return {
|
|
5908
7082
|
timestamp: timestamp.toISOString(),
|
|
5909
|
-
|
|
7083
|
+
evalId: evalCase.id,
|
|
5910
7084
|
dataset: evalCase.dataset,
|
|
5911
|
-
|
|
7085
|
+
conversationId: evalCase.conversation_id,
|
|
5912
7086
|
score: 0,
|
|
5913
7087
|
hits: [],
|
|
5914
7088
|
misses: [`Error: ${message}`],
|
|
5915
|
-
|
|
7089
|
+
candidateAnswer: `Error occurred: ${message}`,
|
|
5916
7090
|
target: targetName,
|
|
5917
|
-
|
|
5918
|
-
|
|
5919
|
-
|
|
7091
|
+
rawAspects: [],
|
|
7092
|
+
agentProviderRequest,
|
|
7093
|
+
lmProviderRequest,
|
|
5920
7094
|
error: message
|
|
5921
7095
|
};
|
|
5922
7096
|
}
|
|
5923
7097
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
5924
|
-
const hash = (0,
|
|
7098
|
+
const hash = (0, import_node_crypto3.createHash)("sha256");
|
|
5925
7099
|
hash.update(provider.id);
|
|
5926
7100
|
hash.update(target.name);
|
|
5927
7101
|
hash.update(evalCase.id);
|
|
@@ -5961,8 +7135,8 @@ function mapChildResults(children) {
|
|
|
5961
7135
|
hits: child.hits,
|
|
5962
7136
|
misses: child.misses,
|
|
5963
7137
|
reasoning: child.reasoning,
|
|
5964
|
-
|
|
5965
|
-
|
|
7138
|
+
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
7139
|
+
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
5966
7140
|
}));
|
|
5967
7141
|
}
|
|
5968
7142
|
function computeWeightedMean(entries) {
|
|
@@ -6064,17 +7238,21 @@ function createAgentKernel() {
|
|
|
6064
7238
|
0 && (module.exports = {
|
|
6065
7239
|
CodeEvaluator,
|
|
6066
7240
|
CompositeEvaluator,
|
|
7241
|
+
DEFAULT_EXPLORATION_TOOLS,
|
|
6067
7242
|
LlmJudgeEvaluator,
|
|
6068
7243
|
TEST_MESSAGE_ROLES,
|
|
6069
7244
|
ToolTrajectoryEvaluator,
|
|
7245
|
+
avgToolDurationMs,
|
|
6070
7246
|
buildDirectoryChain,
|
|
6071
7247
|
buildPromptInputs,
|
|
6072
7248
|
buildSearchRoots,
|
|
6073
7249
|
computeTraceSummary,
|
|
6074
7250
|
consumeCodexLogEntries,
|
|
7251
|
+
consumePiLogEntries,
|
|
6075
7252
|
createAgentKernel,
|
|
6076
7253
|
createProvider,
|
|
6077
7254
|
ensureVSCodeSubagents,
|
|
7255
|
+
explorationRatio,
|
|
6078
7256
|
extractCodeBlocks,
|
|
6079
7257
|
fileExists,
|
|
6080
7258
|
findGitRoot,
|
|
@@ -6086,10 +7264,9 @@ function createAgentKernel() {
|
|
|
6086
7264
|
isJsonValue,
|
|
6087
7265
|
isTestMessage,
|
|
6088
7266
|
isTestMessageRole,
|
|
6089
|
-
isTraceEvent,
|
|
6090
|
-
isTraceEventType,
|
|
6091
7267
|
listTargetNames,
|
|
6092
7268
|
loadEvalCases,
|
|
7269
|
+
mergeExecutionMetrics,
|
|
6093
7270
|
normalizeLineEndings,
|
|
6094
7271
|
readJsonFile,
|
|
6095
7272
|
readTargetDefinitions,
|
|
@@ -6100,6 +7277,8 @@ function createAgentKernel() {
|
|
|
6100
7277
|
resolveTargetDefinition,
|
|
6101
7278
|
runEvalCase,
|
|
6102
7279
|
runEvaluation,
|
|
6103
|
-
subscribeToCodexLogEntries
|
|
7280
|
+
subscribeToCodexLogEntries,
|
|
7281
|
+
subscribeToPiLogEntries,
|
|
7282
|
+
tokensPerTool
|
|
6104
7283
|
});
|
|
6105
7284
|
//# sourceMappingURL=index.cjs.map
|