@agentv/core 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-KPHTMTZ3.js → chunk-E2VSU4WZ.js} +265 -83
- package/dist/chunk-E2VSU4WZ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +82 -71
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -72
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1475 -393
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +227 -33
- package/dist/index.d.ts +227 -33
- package/dist/index.js +1142 -244
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-KPHTMTZ3.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -32,17 +32,21 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
35
|
+
DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
|
|
35
36
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
36
37
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
37
38
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
39
|
+
avgToolDurationMs: () => avgToolDurationMs,
|
|
38
40
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
39
41
|
buildPromptInputs: () => buildPromptInputs,
|
|
40
42
|
buildSearchRoots: () => buildSearchRoots2,
|
|
41
43
|
computeTraceSummary: () => computeTraceSummary,
|
|
42
44
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
45
|
+
consumePiLogEntries: () => consumePiLogEntries,
|
|
43
46
|
createAgentKernel: () => createAgentKernel,
|
|
44
47
|
createProvider: () => createProvider,
|
|
45
48
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
49
|
+
explorationRatio: () => explorationRatio,
|
|
46
50
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
47
51
|
fileExists: () => fileExists2,
|
|
48
52
|
findGitRoot: () => findGitRoot,
|
|
@@ -56,6 +60,7 @@ __export(index_exports, {
|
|
|
56
60
|
isTestMessageRole: () => isTestMessageRole,
|
|
57
61
|
listTargetNames: () => listTargetNames,
|
|
58
62
|
loadEvalCases: () => loadEvalCases,
|
|
63
|
+
mergeExecutionMetrics: () => mergeExecutionMetrics,
|
|
59
64
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
60
65
|
readJsonFile: () => readJsonFile,
|
|
61
66
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
@@ -66,7 +71,9 @@ __export(index_exports, {
|
|
|
66
71
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
67
72
|
runEvalCase: () => runEvalCase,
|
|
68
73
|
runEvaluation: () => runEvaluation,
|
|
69
|
-
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
|
|
74
|
+
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
|
|
75
|
+
subscribeToPiLogEntries: () => subscribeToPiLogEntries,
|
|
76
|
+
tokensPerTool: () => tokensPerTool
|
|
70
77
|
});
|
|
71
78
|
module.exports = __toCommonJS(index_exports);
|
|
72
79
|
|
|
@@ -151,6 +158,53 @@ function computeTraceSummary(messages) {
|
|
|
151
158
|
errorCount: 0
|
|
152
159
|
};
|
|
153
160
|
}
|
|
161
|
+
/**
 * Tool names counted as "exploration" when no explicit list is supplied.
 * Both lower-case and PascalCase spellings are listed because
 * `explorationRatio` looks names up with an exact, case-sensitive match.
 */
var DEFAULT_EXPLORATION_TOOLS = [
  "read",
  "grep",
  "glob",
  "search",
  "list",
  "Read",
  "Grep",
  "Glob",
  "WebSearch",
  "WebFetch"
];
/**
 * Fraction of trace events that were calls to exploration tools.
 *
 * @param {{ eventCount: number, toolCallsByName?: Record<string, number> }} summary
 *   Trace summary; `toolCallsByName` maps a tool name to its call count.
 * @param {Iterable<string>} [explorationTools=DEFAULT_EXPLORATION_TOOLS]
 *   Tool names to count as exploration.
 * @returns {number | undefined} `explorationCalls / eventCount`, or
 *   `undefined` when the summary reports zero events.
 */
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
  if (summary.eventCount === 0) return void 0;
  const callsByName = summary.toolCallsByName ?? {};
  let explorationCalls = 0;
  // A Set keeps a tool listed twice from being counted twice, which could
  // otherwise push the ratio above 1.
  for (const tool of new Set(explorationTools)) {
    // Own-property check: names such as "constructor" must not resolve to
    // members inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(callsByName, tool)) continue;
    explorationCalls += callsByName[tool] ?? 0;
  }
  return explorationCalls / summary.eventCount;
}
|
|
181
|
+
/**
 * Average number of tokens (input + output) per trace event.
 *
 * @param {{ eventCount: number, tokenUsage?: { input: number, output: number } }} summary
 * @returns {number | undefined} `(input + output) / eventCount`, or
 *   `undefined` when token usage is absent, the summary has zero events, or
 *   the counts do not produce a finite number.
 */
function tokensPerTool(summary) {
  if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
  const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
  const perEvent = totalTokens / summary.eventCount;
  // A partially populated tokenUsage (e.g. missing `output`) would otherwise
  // surface as NaN; report "unknown" instead.
  return Number.isFinite(perEvent) ? perEvent : void 0;
}
|
|
186
|
+
/**
 * Mean duration across every recorded tool call.
 *
 * @param {{ toolDurations?: Record<string, number[]> }} summary
 *   `toolDurations` maps a tool name to the list of its call durations.
 * @returns {number | undefined} The mean of all finite samples, or
 *   `undefined` when there is no `toolDurations` or no usable sample.
 */
function avgToolDurationMs(summary) {
  if (!summary.toolDurations) return void 0;
  let totalDuration = 0;
  let totalCalls = 0;
  for (const durations of Object.values(summary.toolDurations)) {
    // Skip malformed buckets instead of throwing on a non-iterable value.
    if (!Array.isArray(durations)) continue;
    for (const duration of durations) {
      // One NaN/undefined sample must not turn the whole average into NaN.
      if (!Number.isFinite(duration)) continue;
      totalDuration += duration;
      totalCalls++;
    }
  }
  return totalCalls === 0 ? void 0 : totalDuration / totalCalls;
}
|
|
199
|
+
/**
 * Return a copy of `summary` enriched with provider-reported metrics.
 *
 * @param {object} summary Trace summary to enrich; never mutated.
 * @param {{ tokenUsage?: object, costUsd?: number, durationMs?: number } | undefined} metrics
 *   Metrics reported by the provider, if any.
 * @returns {object} `summary` itself when `metrics` is falsy; otherwise a new
 *   object carrying `tokenUsage`, `costUsd` and `durationMs`.
 */
function mergeExecutionMetrics(summary, metrics) {
  if (!metrics) return summary;
  return {
    ...summary,
    // `??` keeps a value already on the summary when the provider did not
    // report that metric, instead of overwriting it with undefined.
    tokenUsage: metrics.tokenUsage ?? summary.tokenUsage,
    costUsd: metrics.costUsd ?? summary.costUsd,
    durationMs: metrics.durationMs ?? summary.durationMs
  };
}
|
|
154
208
|
|
|
155
209
|
// src/evaluation/yaml-parser.ts
|
|
156
210
|
var import_promises6 = require("fs/promises");
|
|
@@ -665,7 +719,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
665
719
|
expected = [];
|
|
666
720
|
for (const item of rawExpected) {
|
|
667
721
|
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
668
|
-
|
|
722
|
+
let args;
|
|
723
|
+
if (item.args === "any") {
|
|
724
|
+
args = "any";
|
|
725
|
+
} else if (isJsonObject2(item.args)) {
|
|
726
|
+
args = item.args;
|
|
727
|
+
}
|
|
728
|
+
expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
|
|
669
729
|
}
|
|
670
730
|
}
|
|
671
731
|
}
|
|
@@ -1940,12 +2000,14 @@ var CliProvider = class {
|
|
|
1940
2000
|
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1941
2001
|
);
|
|
1942
2002
|
}
|
|
2003
|
+
const startTime = Date.now();
|
|
1943
2004
|
const result = await this.runCommand(renderedCommand, {
|
|
1944
2005
|
cwd: this.config.cwd,
|
|
1945
2006
|
env: process.env,
|
|
1946
2007
|
timeoutMs: this.config.timeoutMs,
|
|
1947
2008
|
signal: request.signal
|
|
1948
2009
|
});
|
|
2010
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
1949
2011
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
1950
2012
|
if (request.signal?.aborted) {
|
|
1951
2013
|
throw new Error("CLI provider request was aborted");
|
|
@@ -1964,6 +2026,9 @@ var CliProvider = class {
|
|
|
1964
2026
|
const parsed = this.parseOutputContent(responseContent);
|
|
1965
2027
|
return {
|
|
1966
2028
|
outputMessages: parsed.outputMessages,
|
|
2029
|
+
tokenUsage: parsed.tokenUsage,
|
|
2030
|
+
costUsd: parsed.costUsd,
|
|
2031
|
+
durationMs: parsed.durationMs ?? measuredDurationMs,
|
|
1967
2032
|
raw: {
|
|
1968
2033
|
command: renderedCommand,
|
|
1969
2034
|
stderr: result.stderr,
|
|
@@ -2011,12 +2076,14 @@ var CliProvider = class {
|
|
|
2011
2076
|
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
2012
2077
|
);
|
|
2013
2078
|
}
|
|
2079
|
+
const startTime = Date.now();
|
|
2014
2080
|
const result = await this.runCommand(renderedCommand, {
|
|
2015
2081
|
cwd: this.config.cwd,
|
|
2016
2082
|
env: process.env,
|
|
2017
2083
|
timeoutMs: this.config.timeoutMs,
|
|
2018
2084
|
signal: controller.signal
|
|
2019
2085
|
});
|
|
2086
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
2020
2087
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
2021
2088
|
if (controller.signal.aborted) {
|
|
2022
2089
|
throw new Error("CLI provider request was aborted");
|
|
@@ -2038,11 +2105,13 @@ var CliProvider = class {
|
|
|
2038
2105
|
if (missingIds.length > 0) {
|
|
2039
2106
|
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
2040
2107
|
}
|
|
2108
|
+
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
2041
2109
|
const responses = requests.map((request) => {
|
|
2042
2110
|
const evalCaseId = request.evalCaseId;
|
|
2043
2111
|
if (!evalCaseId) {
|
|
2044
2112
|
return {
|
|
2045
2113
|
outputMessages: [],
|
|
2114
|
+
durationMs: perRequestFallbackMs,
|
|
2046
2115
|
raw: {
|
|
2047
2116
|
command: renderedCommand,
|
|
2048
2117
|
stderr: result.stderr,
|
|
@@ -2056,6 +2125,7 @@ var CliProvider = class {
|
|
|
2056
2125
|
if (!parsed) {
|
|
2057
2126
|
return {
|
|
2058
2127
|
outputMessages: [],
|
|
2128
|
+
durationMs: perRequestFallbackMs,
|
|
2059
2129
|
raw: {
|
|
2060
2130
|
command: renderedCommand,
|
|
2061
2131
|
stderr: result.stderr,
|
|
@@ -2067,6 +2137,9 @@ var CliProvider = class {
|
|
|
2067
2137
|
}
|
|
2068
2138
|
return {
|
|
2069
2139
|
outputMessages: parsed.outputMessages,
|
|
2140
|
+
tokenUsage: parsed.tokenUsage,
|
|
2141
|
+
costUsd: parsed.costUsd,
|
|
2142
|
+
durationMs: parsed.durationMs ?? perRequestFallbackMs,
|
|
2070
2143
|
raw: {
|
|
2071
2144
|
command: renderedCommand,
|
|
2072
2145
|
stderr: result.stderr,
|
|
@@ -2084,25 +2157,55 @@ var CliProvider = class {
|
|
|
2084
2157
|
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
2085
2158
|
* If only 'text' is provided, wrap it in outputMessages.
|
|
2086
2159
|
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
2160
|
+
*
|
|
2161
|
+
* Also extracts optional execution metrics:
|
|
2162
|
+
* - token_usage: { input, output, cached? }
|
|
2163
|
+
* - cost_usd: number
|
|
2164
|
+
* - duration_ms: number
|
|
2087
2165
|
*/
|
|
2088
2166
|
parseOutputContent(content) {
|
|
2089
2167
|
try {
|
|
2090
2168
|
const parsed = JSON.parse(content);
|
|
2091
2169
|
if (typeof parsed === "object" && parsed !== null) {
|
|
2092
2170
|
const obj = parsed;
|
|
2171
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2172
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2173
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2093
2174
|
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2094
2175
|
if (outputMessages && outputMessages.length > 0) {
|
|
2095
|
-
return { outputMessages };
|
|
2176
|
+
return { outputMessages, tokenUsage, costUsd, durationMs };
|
|
2096
2177
|
}
|
|
2097
2178
|
if ("text" in obj) {
|
|
2098
2179
|
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
2099
|
-
return {
|
|
2180
|
+
return {
|
|
2181
|
+
outputMessages: [{ role: "assistant", content: text }],
|
|
2182
|
+
tokenUsage,
|
|
2183
|
+
costUsd,
|
|
2184
|
+
durationMs
|
|
2185
|
+
};
|
|
2100
2186
|
}
|
|
2101
2187
|
}
|
|
2102
2188
|
} catch {
|
|
2103
2189
|
}
|
|
2104
2190
|
return { outputMessages: [{ role: "assistant", content }] };
|
|
2105
2191
|
}
|
|
2192
|
+
/**
|
|
2193
|
+
* Parse token_usage from CLI output.
|
|
2194
|
+
*/
|
|
2195
|
+
parseTokenUsage(tokenUsage) {
|
|
2196
|
+
if (typeof tokenUsage !== "object" || tokenUsage === null) {
|
|
2197
|
+
return void 0;
|
|
2198
|
+
}
|
|
2199
|
+
const obj = tokenUsage;
|
|
2200
|
+
if (typeof obj.input !== "number" || typeof obj.output !== "number") {
|
|
2201
|
+
return void 0;
|
|
2202
|
+
}
|
|
2203
|
+
return {
|
|
2204
|
+
input: obj.input,
|
|
2205
|
+
output: obj.output,
|
|
2206
|
+
cached: typeof obj.cached === "number" ? obj.cached : void 0
|
|
2207
|
+
};
|
|
2208
|
+
}
|
|
2106
2209
|
/**
|
|
2107
2210
|
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
2108
2211
|
*/
|
|
@@ -2179,6 +2282,9 @@ var CliProvider = class {
|
|
|
2179
2282
|
if (records.has(id)) {
|
|
2180
2283
|
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
2181
2284
|
}
|
|
2285
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2286
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2287
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2182
2288
|
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2183
2289
|
let outputMessages;
|
|
2184
2290
|
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
@@ -2188,7 +2294,10 @@ var CliProvider = class {
|
|
|
2188
2294
|
outputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
2189
2295
|
}
|
|
2190
2296
|
records.set(id, {
|
|
2191
|
-
outputMessages
|
|
2297
|
+
outputMessages,
|
|
2298
|
+
tokenUsage,
|
|
2299
|
+
costUsd,
|
|
2300
|
+
durationMs
|
|
2192
2301
|
});
|
|
2193
2302
|
}
|
|
2194
2303
|
return records;
|
|
@@ -2504,6 +2613,11 @@ var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exe
|
|
|
2504
2613
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
2505
2614
|
var PROMPT_FILENAME = "prompt.md";
|
|
2506
2615
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
2616
|
+
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2617
|
+
- Do NOT create any additional output files in the workspace.
|
|
2618
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
2619
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
2620
|
+
This is required for evaluation scoring.`;
|
|
2507
2621
|
var CodexProvider = class {
|
|
2508
2622
|
id;
|
|
2509
2623
|
kind = "codex";
|
|
@@ -2528,7 +2642,11 @@ var CodexProvider = class {
|
|
|
2528
2642
|
const workspaceRoot = await this.createWorkspace();
|
|
2529
2643
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2530
2644
|
try {
|
|
2531
|
-
const
|
|
2645
|
+
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
2646
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
|
|
2647
|
+
const promptContent = `${systemPrompt}
|
|
2648
|
+
|
|
2649
|
+
${basePrompt}`;
|
|
2532
2650
|
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
2533
2651
|
await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
|
|
2534
2652
|
const args = this.buildCodexArgs();
|
|
@@ -3212,222 +3330,1067 @@ var MockProvider = class {
|
|
|
3212
3330
|
}
|
|
3213
3331
|
};
|
|
3214
3332
|
|
|
3215
|
-
// src/evaluation/providers/
|
|
3333
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
3334
|
+
var import_node_child_process3 = require("child_process");
|
|
3335
|
+
var import_node_crypto2 = require("crypto");
|
|
3336
|
+
var import_node_fs4 = require("fs");
|
|
3337
|
+
var import_promises10 = require("fs/promises");
|
|
3338
|
+
var import_node_os3 = require("os");
|
|
3216
3339
|
var import_node_path11 = __toESM(require("path"), 1);
|
|
3217
|
-
|
|
3218
|
-
|
|
3219
|
-
|
|
3220
|
-
|
|
3221
|
-
|
|
3222
|
-
|
|
3223
|
-
|
|
3224
|
-
|
|
3225
|
-
|
|
3226
|
-
var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
3227
|
-
name: import_zod.z.string().min(1, "target name is required"),
|
|
3228
|
-
provider: import_zod.z.string().min(1, "provider is required"),
|
|
3229
|
-
judge_target: import_zod.z.string().optional(),
|
|
3230
|
-
workers: import_zod.z.number().int().min(1).optional()
|
|
3231
|
-
}).passthrough();
|
|
3232
|
-
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
3233
|
-
function normalizeAzureApiVersion(value) {
|
|
3234
|
-
if (!value) {
|
|
3235
|
-
return DEFAULT_AZURE_API_VERSION;
|
|
3236
|
-
}
|
|
3237
|
-
const trimmed = value.trim();
|
|
3238
|
-
if (trimmed.length === 0) {
|
|
3239
|
-
return DEFAULT_AZURE_API_VERSION;
|
|
3340
|
+
|
|
3341
|
+
// src/evaluation/providers/pi-log-tracker.ts
|
|
3342
|
+
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
|
|
3343
|
+
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
|
|
3344
|
+
/**
 * Return the array of Pi log entries stored on `globalThis` under
 * `GLOBAL_LOGS_KEY2`, creating and registering an empty array on first use.
 */
function getPiLogStore() {
  const registry = globalThis;
  const current = registry[GLOBAL_LOGS_KEY2];
  if (!current) {
    const fresh = [];
    registry[GLOBAL_LOGS_KEY2] = fresh;
    return fresh;
  }
  return current;
}
|
|
3244
|
-
function
|
|
3245
|
-
const
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
|
|
3249
|
-
const initialDelayMs = resolveOptionalNumber(
|
|
3250
|
-
target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
|
|
3251
|
-
`${target.name} retry initial delay`
|
|
3252
|
-
);
|
|
3253
|
-
const maxDelayMs = resolveOptionalNumber(
|
|
3254
|
-
target.retry_max_delay_ms ?? target.retryMaxDelayMs,
|
|
3255
|
-
`${target.name} retry max delay`
|
|
3256
|
-
);
|
|
3257
|
-
const backoffFactor = resolveOptionalNumber(
|
|
3258
|
-
target.retry_backoff_factor ?? target.retryBackoffFactor,
|
|
3259
|
-
`${target.name} retry backoff factor`
|
|
3260
|
-
);
|
|
3261
|
-
const retryableStatusCodes = resolveOptionalNumberArray(
|
|
3262
|
-
target.retry_status_codes ?? target.retryStatusCodes,
|
|
3263
|
-
`${target.name} retry status codes`
|
|
3264
|
-
);
|
|
3265
|
-
if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
|
|
3266
|
-
return void 0;
|
|
3354
|
+
/**
 * Return the Set of Pi log listeners stored on `globalThis` under
 * `GLOBAL_SUBSCRIBERS_KEY2`, creating and registering an empty Set on first use.
 */
function getSubscriberStore2() {
  const registry = globalThis;
  const current = registry[GLOBAL_SUBSCRIBERS_KEY2];
  if (!current) {
    const fresh = /* @__PURE__ */ new Set();
    registry[GLOBAL_SUBSCRIBERS_KEY2] = fresh;
    return fresh;
  }
  return current;
}
|
|
3276
|
-
function
|
|
3277
|
-
const
|
|
3278
|
-
const
|
|
3279
|
-
|
|
3280
|
-
|
|
3281
|
-
|
|
3282
|
-
|
|
3283
|
-
|
|
3284
|
-
|
|
3285
|
-
return {
|
|
3286
|
-
kind: "azure",
|
|
3287
|
-
name: parsed.name,
|
|
3288
|
-
judgeTarget: parsed.judge_target,
|
|
3289
|
-
workers: parsed.workers,
|
|
3290
|
-
providerBatching,
|
|
3291
|
-
config: resolveAzureConfig(parsed, env)
|
|
3292
|
-
};
|
|
3293
|
-
case "anthropic":
|
|
3294
|
-
return {
|
|
3295
|
-
kind: "anthropic",
|
|
3296
|
-
name: parsed.name,
|
|
3297
|
-
judgeTarget: parsed.judge_target,
|
|
3298
|
-
workers: parsed.workers,
|
|
3299
|
-
providerBatching,
|
|
3300
|
-
config: resolveAnthropicConfig(parsed, env)
|
|
3301
|
-
};
|
|
3302
|
-
case "gemini":
|
|
3303
|
-
case "google":
|
|
3304
|
-
case "google-gemini":
|
|
3305
|
-
return {
|
|
3306
|
-
kind: "gemini",
|
|
3307
|
-
name: parsed.name,
|
|
3308
|
-
judgeTarget: parsed.judge_target,
|
|
3309
|
-
workers: parsed.workers,
|
|
3310
|
-
providerBatching,
|
|
3311
|
-
config: resolveGeminiConfig(parsed, env)
|
|
3312
|
-
};
|
|
3313
|
-
case "codex":
|
|
3314
|
-
case "codex-cli":
|
|
3315
|
-
return {
|
|
3316
|
-
kind: "codex",
|
|
3317
|
-
name: parsed.name,
|
|
3318
|
-
judgeTarget: parsed.judge_target,
|
|
3319
|
-
workers: parsed.workers,
|
|
3320
|
-
providerBatching,
|
|
3321
|
-
config: resolveCodexConfig(parsed, env)
|
|
3322
|
-
};
|
|
3323
|
-
case "mock":
|
|
3324
|
-
return {
|
|
3325
|
-
kind: "mock",
|
|
3326
|
-
name: parsed.name,
|
|
3327
|
-
judgeTarget: parsed.judge_target,
|
|
3328
|
-
workers: parsed.workers,
|
|
3329
|
-
providerBatching,
|
|
3330
|
-
config: resolveMockConfig(parsed)
|
|
3331
|
-
};
|
|
3332
|
-
case "vscode":
|
|
3333
|
-
case "vscode-insiders":
|
|
3334
|
-
return {
|
|
3335
|
-
kind: provider,
|
|
3336
|
-
name: parsed.name,
|
|
3337
|
-
judgeTarget: parsed.judge_target,
|
|
3338
|
-
workers: parsed.workers,
|
|
3339
|
-
providerBatching,
|
|
3340
|
-
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
3341
|
-
};
|
|
3342
|
-
case "cli":
|
|
3343
|
-
return {
|
|
3344
|
-
kind: "cli",
|
|
3345
|
-
name: parsed.name,
|
|
3346
|
-
judgeTarget: parsed.judge_target,
|
|
3347
|
-
workers: parsed.workers,
|
|
3348
|
-
providerBatching,
|
|
3349
|
-
config: resolveCliConfig(parsed, env, evalFilePath)
|
|
3350
|
-
};
|
|
3351
|
-
default:
|
|
3352
|
-
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
3364
|
+
/**
 * Deliver `entry` to every registered Pi log listener.
 * The listener set is copied first, so listeners added or removed during
 * delivery do not affect this round. A throwing listener is reported with
 * console.warn and does not stop the remaining listeners.
 */
function notifySubscribers2(entry) {
  [...getSubscriberStore2()].forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Pi log subscriber failed: ${reason}`);
    }
  });
}
|
|
3355
|
-
function
|
|
3356
|
-
|
|
3357
|
-
|
|
3358
|
-
const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
|
|
3359
|
-
const versionSource = target.version ?? target.api_version;
|
|
3360
|
-
const temperatureSource = target.temperature;
|
|
3361
|
-
const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
|
|
3362
|
-
const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
|
|
3363
|
-
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
3364
|
-
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
3365
|
-
const version = normalizeAzureApiVersion(
|
|
3366
|
-
resolveOptionalString(versionSource, env, `${target.name} api version`, {
|
|
3367
|
-
allowLiteral: true,
|
|
3368
|
-
optionalEnv: true
|
|
3369
|
-
})
|
|
3370
|
-
);
|
|
3371
|
-
const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
|
|
3372
|
-
const maxOutputTokens = resolveOptionalNumber(
|
|
3373
|
-
maxTokensSource,
|
|
3374
|
-
`${target.name} max output tokens`
|
|
3375
|
-
);
|
|
3376
|
-
const retry = resolveRetryConfig(target);
|
|
3377
|
-
return {
|
|
3378
|
-
resourceName,
|
|
3379
|
-
deploymentName,
|
|
3380
|
-
apiKey,
|
|
3381
|
-
version,
|
|
3382
|
-
temperature,
|
|
3383
|
-
maxOutputTokens,
|
|
3384
|
-
retry
|
|
3385
|
-
};
|
|
3375
|
+
/**
 * Append `entry` to the Pi log store, then notify subscribers about it.
 */
function recordPiLogEntry(entry) {
  const store = getPiLogStore();
  store.push(entry);
  notifySubscribers2(entry);
}
|
|
3387
|
-
function
|
|
3388
|
-
const
|
|
3389
|
-
|
|
3390
|
-
|
|
3391
|
-
|
|
3392
|
-
|
|
3393
|
-
const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
|
|
3394
|
-
const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
|
|
3395
|
-
const retry = resolveRetryConfig(target);
|
|
3396
|
-
return {
|
|
3397
|
-
apiKey,
|
|
3398
|
-
model,
|
|
3399
|
-
temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
|
|
3400
|
-
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
|
|
3401
|
-
thinkingBudget: resolveOptionalNumber(thinkingBudgetSource, `${target.name} thinking budget`),
|
|
3402
|
-
retry
|
|
3403
|
-
};
|
|
3379
|
+
/**
 * Remove and return every entry currently in the Pi log store.
 * Returns a new empty array when the store is empty.
 */
function consumePiLogEntries() {
  const pending = getPiLogStore();
  return pending.length > 0 ? pending.splice(0) : [];
}
|
|
3405
|
-
function
|
|
3406
|
-
const
|
|
3407
|
-
|
|
3408
|
-
|
|
3409
|
-
|
|
3410
|
-
const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
|
|
3411
|
-
const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
|
|
3412
|
-
allowLiteral: true,
|
|
3413
|
-
optionalEnv: true
|
|
3414
|
-
}) ?? "gemini-2.5-flash";
|
|
3415
|
-
const retry = resolveRetryConfig(target);
|
|
3416
|
-
return {
|
|
3417
|
-
apiKey,
|
|
3418
|
-
model,
|
|
3419
|
-
temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
|
|
3420
|
-
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
|
|
3421
|
-
retry
|
|
3386
|
+
/**
 * Register `listener` for Pi log entries.
 *
 * @returns {() => void} Function that removes the listener again.
 */
function subscribeToPiLogEntries(listener) {
  const listeners = getSubscriberStore2();
  listeners.add(listener);
  return function unsubscribe() {
    listeners.delete(listener);
  };
}
|
|
3424
|
-
|
|
3425
|
-
|
|
3426
|
-
|
|
3427
|
-
|
|
3428
|
-
|
|
3393
|
+
|
|
3394
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
3395
|
+
var WORKSPACE_PREFIX2 = "agentv-pi-";
|
|
3396
|
+
var PROMPT_FILENAME2 = "prompt.md";
|
|
3397
|
+
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
3398
|
+
- Do NOT create any additional output files in the workspace.
|
|
3399
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
3400
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3401
|
+
This is required for evaluation scoring.`;
|
|
3402
|
+
/**
 * Provider that runs the Pi coding agent CLI once per request inside a
 * throwaway temp workspace and parses its JSONL output into output messages.
 */
var PiCodingAgentProvider = class {
  id;
  kind = "pi-coding-agent";
  targetName;
  supportsBatch = false;
  config;
  runPi;
  /**
   * @param {string} targetName Target name; used in the provider id and in logs.
   * @param {object} config Resolved Pi settings (executable, provider, model,
   *   apiKey, tools, thinking, args, cwd, timeoutMs, logDir, logFormat, systemPrompt).
   * @param {Function} [runner=defaultPiRunner] Process runner; injectable for tests.
   */
  constructor(targetName, config, runner = defaultPiRunner) {
    this.id = `pi-coding-agent:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runPi = runner;
  }
  /**
   * Run the agent for one request.
   *
   * @param {object} request Reads `question`, `inputFiles`, `signal`,
   *   `evalCaseId` and `attempt`.
   * @returns {Promise<{ raw: object, outputMessages: unknown }>}
   * @throws {Error} When the request is already aborted, the agent times out,
   *   or it exits with a non-zero code.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Pi coding agent request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: a logger failure must not fail the run.
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
      await (0, import_promises10.writeFile)(promptFile, request.question, "utf8");
      const args = this.buildPiArgs(request.question, inputFiles);
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executePi(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail2(result.stderr, result.stdout);
        const prefix = `Pi coding agent exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parsePiJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      // NOTE(review): the extracted text is not used below; the call is kept
      // in case it validates the messages — confirm before removing.
      extractAssistantText2(outputMessages);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          // NOTE(review): when config.apiKey is set, `args` contains the key
          // in clear text (see buildPiArgs) — consider redacting before persisting.
          args,
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages
      };
    } finally {
      // Nested finally: the temp workspace is removed even if closing the
      // logger throws, so a failing close cannot leak the directory.
      try {
        await logger?.close();
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }
  /** Working directory for the agent: the resolved `config.cwd` if set, else the temp workspace. */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return import_node_path11.default.resolve(this.config.cwd);
  }
  /**
   * Build the CLI argument list: provider/model/key flags, the fixed
   * JSON/print/no-session flags, optional tools/thinking/extra args, one
   * `@file` argument per input file, and finally the prompt itself.
   */
  buildPiArgs(prompt, inputFiles) {
    const args = [];
    if (this.config.provider) {
      args.push("--provider", this.config.provider);
    }
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.apiKey) {
      args.push("--api-key", this.config.apiKey);
    }
    args.push("--mode", "json");
    args.push("--print");
    args.push("--no-session");
    if (this.config.tools) {
      args.push("--tools", this.config.tools);
    }
    if (this.config.thinking) {
      args.push("--thinking", this.config.thinking);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    if (inputFiles && inputFiles.length > 0) {
      for (const file of inputFiles) {
        args.push(`@${file}`);
      }
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
    const fullPrompt = `${systemPrompt}\n\n${prompt}`;
    // The prompt text is passed through escapeAtSymbols before being added;
    // input files above are passed as `@file` arguments.
    args.push(escapeAtSymbols(fullPrompt));
    return args;
  }
  /**
   * Invoke the runner with this provider's settings, turning a missing
   * executable (ENOENT) into an actionable error message.
   */
  async executePi(args, cwd, signal, logger) {
    try {
      return await this.runPi({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      // `?.` tolerates a runner that rejects with null/undefined.
      if (error?.code === "ENOENT") {
        throw new Error(
          `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`,
          { cause: error }
        );
      }
      throw error;
    }
  }
  /**
   * Copy of `process.env` with the configured API key exported under the
   * variable for the configured provider. The provider name is lower-cased
   * and defaults to "google"; unknown providers leave the environment unchanged.
   */
  buildEnv() {
    const env = { ...process.env };
    if (!this.config.apiKey) {
      return env;
    }
    const envVarByProvider = /* @__PURE__ */ new Map([
      ["google", "GEMINI_API_KEY"],
      ["gemini", "GEMINI_API_KEY"],
      ["anthropic", "ANTHROPIC_API_KEY"],
      ["openai", "OPENAI_API_KEY"],
      ["groq", "GROQ_API_KEY"],
      ["xai", "XAI_API_KEY"],
      ["openrouter", "OPENROUTER_API_KEY"]
    ]);
    const provider = this.config.provider?.toLowerCase() ?? "google";
    const envVar = envVarByProvider.get(provider);
    if (envVar !== void 0) {
      env[envVar] = this.config.apiKey;
    }
    return env;
  }
  /** Create a fresh temp directory (prefix `WORKSPACE_PREFIX2`) for one run. */
  async createWorkspace() {
    return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
  }
  /** Best-effort recursive removal of the workspace; failures are deliberately ignored. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
      // Cleanup is best-effort; the run's own outcome takes precedence.
    }
  }
  /** Log directory: resolved `config.logDir`, else `<cwd>/.agentv/logs/pi-coding-agent`. */
  resolveLogDirectory() {
    if (this.config.logDir) {
      return import_node_path11.default.resolve(this.config.logDir);
    }
    return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
  }
  /**
   * Create the per-request stream logger and record its file in the Pi log
   * store. Returns `undefined`, after a console warning, when the directory
   * or the logger cannot be created.
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return void 0;
    }
    try {
      await (0, import_promises10.mkdir)(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
    try {
      const logger = await PiStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordPiLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
|
|
3603
|
+
/**
 * Line-buffered writer that appends the Pi process's stdout/stderr to a log
 * file, prefixing each non-empty line with elapsed time and source.
 *
 * Logging is best-effort: a failure of the underlying write stream is recorded
 * in `streamError`, reported once with `console.warn`, and all later writes are
 * dropped instead of raising an unhandled `'error'` event.
 */
var PiStreamLogger = class _PiStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // First error emitted by the write stream, or undefined while it is healthy.
  streamError = void 0;
  /**
   * @param {string} filePath Log file to append to.
   * @param {string} format "json" pretty-prints JSON lines; anything else summarizes.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
    // Without a listener, an async open/write failure would crash the process.
    this.stream.on("error", (error) => {
      if (this.streamError !== void 0) {
        return;
      }
      this.streamError = error;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi stream logging to ${filePath} failed: ${message}`);
    });
  }
  /**
   * Creates a logger and writes the header block followed by a blank line.
   *
   * @param {{filePath: string, targetName: string, evalCaseId?: string, attempt?: number, format: string}} options
   * @returns {Promise<_PiStreamLogger>}
   */
  static async create(options) {
    const logger = new _PiStreamLogger(options.filePath, options.format);
    const header = [
      "# Pi Coding Agent stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted entries; the trailing "" is the separator line.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes every complete line it finishes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes every complete line it finishes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Flushes buffered output (including unterminated trailing lines) and ends
   * the stream. Resolves immediately if the stream has already failed.
   *
   * @returns {Promise<void>}
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    if (!this.canWrite()) {
      return;
    }
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Writes each entry followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.writeRaw(`${line}
`);
    }
  }
  /** Writes all complete lines of one buffer and keeps the unterminated tail. */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /**
   * Formats one raw line, or returns undefined for a blank line.
   *
   * @param {string} rawLine
   * @param {"stdout" | "stderr"} source
   * @returns {string | undefined}
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes whatever unterminated text remains in both buffers, then clears them. */
  flushRemainder() {
    const stdoutRemainder = this.stdoutBuffer.trim();
    if (stdoutRemainder.length > 0) {
      this.writeFormatted(stdoutRemainder, "stdout");
    }
    const stderrRemainder = this.stderrBuffer.trim();
    if (stderrRemainder.length > 0) {
      this.writeFormatted(stderrRemainder, "stderr");
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
  /** True while the stream has neither failed nor been destroyed. */
  canWrite() {
    return this.streamError === void 0 && !this.stream.destroyed;
  }
  /** Writes text to the stream unless it is no longer usable. */
  writeRaw(text) {
    if (this.canWrite()) {
      this.stream.write(text);
    }
  }
  /** Formats a line and writes it with a trailing newline; blank lines are skipped. */
  writeFormatted(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.writeRaw(`${formatted}\n`);
    }
  }
};
|
|
3697
|
+
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 hex chars>.log`.
 *
 * @param {{evalCaseId?: string, attempt?: number}} request
 * @param {string} targetName
 * @returns {string}
 */
function buildLogFilename2(request, targetName) {
  // ":" and "." are replaced so the ISO timestamp is safe in file names.
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename2(request.evalCaseId ?? "pi");
  let attemptPart = "";
  if (request.attempt !== void 0) {
    attemptPart = `_attempt-${request.attempt + 1}`;
  }
  const targetPart = sanitizeForFilename2(targetName);
  const uniqueSuffix = (0, import_node_crypto2.randomUUID)().slice(0, 8);
  return `${stamp}_${targetPart}_${evalPart}${attemptPart}_${uniqueSuffix}.log`;
}
|
|
3704
|
+
/**
 * Replaces every run of characters outside `[A-Za-z0-9._-]` with a single
 * underscore. Falls back to "pi" when the result is empty.
 *
 * @param {string} value
 * @returns {string}
 */
function sanitizeForFilename2(value) {
  const safe = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (safe.length === 0) {
    return "pi";
  }
  return safe;
}
|
|
3708
|
+
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or as
 * `HH:MM:SS` once at least one hour has passed.
 *
 * @param {number} startedAt Epoch milliseconds of the start time.
 * @returns {string}
 */
function formatElapsed2(startedAt) {
  const twoDigits = (value) => value.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const fields = hours > 0 ? [hours, minutes, seconds] : [minutes, seconds];
  return fields.map(twoDigits).join(":");
}
|
|
3718
|
+
/**
 * Produces the summary-format text for one output line: the event summary when
 * the line is JSON that `summarizePiEvent` recognizes, otherwise the raw line
 * (prefixed with "stderr: " for stderr).
 *
 * @param {string} rawLine
 * @param {"stdout" | "stderr"} source
 * @returns {string}
 */
function formatPiLogMessage(rawLine, source) {
  const event = tryParseJsonValue2(rawLine);
  const summary = event ? summarizePiEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
3731
|
+
/**
 * Produces the json-format text for one output line: the parsed value
 * re-serialized with two-space indentation. Lines that do not parse, or parse
 * to a falsy value, are returned unchanged.
 *
 * @param {string} rawLine
 * @returns {string}
 */
function formatPiJsonLog(rawLine) {
  const value = tryParseJsonValue2(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through to the raw line below.
    }
  }
  return rawLine;
}
|
|
3742
|
+
/**
 * Returns a one-line summary of a parsed Pi event for the summary log format.
 *
 * - `message_start` / `message_end`: `<type>: <role>`
 * - `message_update` with a string `text_delta`: `text_delta: <first 50 chars>`
 * - other `message_update`: `message_update: <inner event type>`
 * - any other event: its `type`
 *
 * A missing or non-string role / inner event type is shown as "unknown" rather
 * than the literal text "undefined".
 *
 * @param {unknown} event Parsed JSON value.
 * @returns {string | undefined} Summary, or undefined when `event` is not an
 *   object with a non-empty string `type`.
 */
function summarizePiEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const { type } = event;
  if (typeof type !== "string" || type.length === 0) {
    return void 0;
  }
  const labelOf = (value) => typeof value === "string" && value.length > 0 ? value : "unknown";
  switch (type) {
    case "message_start":
    case "message_end":
      return `${type}: ${labelOf(event.message?.role)}`;
    case "message_update": {
      const update = event.assistantMessageEvent;
      if (update?.type === "text_delta" && typeof update.delta === "string") {
        const { delta } = update;
        // Keep log lines short: show at most the first 50 characters.
        const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
        return `text_delta: ${preview}`;
      }
      return `message_update: ${labelOf(update?.type)}`;
    }
    default:
      // agent_start, agent_end, turn_start, turn_end and unknown types.
      return type;
  }
}
|
|
3782
|
+
/**
 * Parses a line as JSON without throwing.
 *
 * @param {string} rawLine
 * @returns {unknown} The parsed value, or undefined when the text is not valid JSON.
 */
function tryParseJsonValue2(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
3789
|
+
/**
 * Parses JSON-lines output into an array of values. Blank lines and lines that
 * are not valid JSON are skipped.
 *
 * @param {string} output Raw stdout text.
 * @returns {unknown[]} Parsed values in input order.
 * @throws {Error} When the output is empty or contains no valid JSON line.
 */
function parsePiJsonl(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Pi coding agent produced no output");
  }
  const events = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Non-JSON lines are ignored.
    }
  }
  if (events.length === 0) {
    throw new Error("Pi coding agent produced no valid JSON output");
  }
  return events;
}
|
|
3807
|
+
/**
 * Extracts the conversation from parsed Pi events.
 *
 * Uses the `messages` array of the last `agent_end` event that has one. If no
 * such event exists, falls back to the `message` of every `turn_end` event, in
 * order. Messages that `convertPiMessage` rejects are dropped.
 *
 * @param {unknown[]} events Parsed events.
 * @returns {object[]} Converted messages.
 */
function extractOutputMessages(events) {
  const isRecord = (value) => Boolean(value) && typeof value === "object";
  // Search from the end so the most recent agent_end wins.
  let index = events.length;
  while (index-- > 0) {
    const candidate = events[index];
    if (isRecord(candidate) && candidate.type === "agent_end" && Array.isArray(candidate.messages)) {
      return candidate.messages.map(convertPiMessage).filter((m) => m !== void 0);
    }
  }
  const collected = [];
  for (const candidate of events) {
    if (!isRecord(candidate) || candidate.type !== "turn_end") {
      continue;
    }
    const converted = convertPiMessage(candidate.message);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
|
|
3839
|
+
/**
 * Normalizes a message timestamp to a string.
 *
 * @param {unknown} value Number or string.
 * @returns {string | undefined} The string unchanged, the ISO form of a
 *   representable numeric date, or undefined otherwise.
 */
function piTimestampToIso(value) {
  if (typeof value === "string") {
    return value;
  }
  if (typeof value !== "number") {
    return void 0;
  }
  const date = new Date(value);
  // toISOString() throws RangeError on an invalid Date (e.g. 1e20), so such
  // values are treated as "no timestamp" instead of aborting the conversion.
  return Number.isNaN(date.getTime()) ? void 0 : date.toISOString();
}
/**
 * Converts one raw Pi message into the output-message shape.
 *
 * @param {unknown} message Raw message object.
 * @returns {{role: string, content: (string|undefined), toolCalls: (object[]|undefined),
 *   timestamp: (string|undefined), metadata: (object|undefined)} | undefined}
 *   The converted message, or undefined when `message` is not an object with a
 *   string `role`.
 */
function convertPiMessage(message) {
  if (!message || typeof message !== "object") {
    return void 0;
  }
  const msg = message;
  const role = msg.role;
  if (typeof role !== "string") {
    return void 0;
  }
  const content = extractTextContent(msg.content);
  const toolCalls = extractToolCalls(msg.content);
  const timestamp = piTimestampToIso(msg.timestamp);
  // Copy only the truthy provenance fields, in a fixed order.
  const metadata = {};
  for (const key of ["api", "provider", "model", "usage", "stopReason"]) {
    if (msg[key]) {
      metadata[key] = msg[key];
    }
  }
  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
    timestamp,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0
  };
}
|
|
3865
|
+
/**
 * Extracts plain text from message content.
 *
 * @param {unknown} content A string, or an array of content parts.
 * @returns {string | undefined} The string itself; or the `text` of every
 *   `{type: "text"}` part joined with newlines; or undefined when there is no
 *   text or the content has another shape.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const isTextPart = (part) => Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string";
  const texts = content.filter(isTextPart).map((part) => part.text);
  if (texts.length === 0) {
    return void 0;
  }
  return texts.join("\n");
}
|
|
3884
|
+
/**
 * Collects tool calls from an array of content parts.
 *
 * Each `tool_use` part with a string `name` becomes `{tool, input, id}`. A
 * later `tool_result` part whose `tool_use_id` matches an already collected
 * call adds its `content` as that call's `output`.
 *
 * @param {unknown} content Message content.
 * @returns {object[]} Tool calls in order of appearance; empty when `content`
 *   is not an array.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    switch (part.type) {
      case "tool_use":
        if (typeof part.name === "string") {
          calls.push({
            tool: part.name,
            input: part.input,
            id: typeof part.id === "string" ? part.id : void 0
          });
        }
        break;
      case "tool_result": {
        if (typeof part.tool_use_id !== "string") {
          break;
        }
        const position = calls.findIndex((call) => call.id === part.tool_use_id);
        if (position !== -1) {
          calls[position] = { ...calls[position], output: part.content };
        }
        break;
      }
    }
  }
  return calls;
}
|
|
3914
|
+
/**
 * Returns the content of the last assistant message that has truthy content.
 * Non-string content is returned as its JSON serialization.
 *
 * @param {Array<{role: string, content?: unknown}>} messages
 * @returns {string} The text, or "" when no such message exists.
 */
function extractAssistantText2(messages) {
  let index = messages.length;
  while (index-- > 0) {
    const { role, content } = messages[index];
    if (role !== "assistant" || !content) {
      continue;
    }
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}
|
|
3926
|
+
/**
 * Rewrites every `@[label]:` marker in the prompt as `[[label]]:`.
 *
 * @param {string} prompt
 * @returns {string}
 */
function escapeAtSymbols(prompt) {
  const markerPattern = /@\[([^\]]+)\]:/g;
  return prompt.replace(markerPattern, (_match, label) => `[[${label}]]:`);
}
|
|
3929
|
+
/**
 * Picks the text used to describe a failure: trimmed stderr when non-empty,
 * otherwise trimmed stdout.
 *
 * @param {string} stderr
 * @param {string} stdout
 * @returns {string | undefined} Undefined when both are blank.
 */
function pickDetail2(stderr, stdout) {
  return stderr.trim() || stdout.trim() || void 0;
}
|
|
3937
|
+
/**
 * Builds the " after Ns" suffix for timeout messages, rounding up to whole
 * seconds.
 *
 * @param {number | undefined} timeoutMs
 * @returns {string} The suffix, or "" when no positive timeout is set.
 */
function formatTimeoutSuffix3(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && !(timeoutMs <= 0);
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
|
|
3944
|
+
/**
 * Runs the Pi CLI as a child process and collects its output.
 *
 * `options.executable` may contain extra whitespace-separated arguments (for
 * example "npx pi"); they are placed before `options.args`. Surrounding
 * whitespace is ignored. No shell is used, so an executable path that itself
 * contains spaces is not supported.
 *
 * @param {object} options
 * @param {string} options.executable Command, optionally followed by arguments.
 * @param {string[]} options.args Arguments appended after the executable's own.
 * @param {string} [options.cwd] Working directory for the child.
 * @param {object} [options.env] Environment for the child.
 * @param {AbortSignal} [options.signal] Sends SIGTERM to the child when aborted.
 * @param {number} [options.timeoutMs] Sends SIGTERM after this many milliseconds.
 * @param {(chunk: string) => void} [options.onStdoutChunk]
 * @param {(chunk: string) => void} [options.onStderrChunk]
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   Resolves when the child closes; `exitCode` is -1 when the child exited
 *   without a numeric code. Rejects when the executable is empty or the
 *   process cannot be spawned.
 */
async function defaultPiRunner(options) {
  return await new Promise((resolve, reject) => {
    // Trim first: leading whitespace would otherwise yield "" as the command.
    const parts = options.executable.trim().split(/\s+/).filter((part) => part.length > 0);
    if (parts.length === 0) {
      reject(new Error("Pi executable is empty"));
      return;
    }
    const [executable, ...executableArgs] = parts;
    const allArgs = [...executableArgs, ...options.args];
    const child = (0, import_node_child_process3.spawn)(executable, allArgs, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: false
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let the pending timer keep the event loop alive.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
      options.onStderrChunk?.(chunk);
    });
    // Nothing is sent on stdin; close it so the child sees end-of-input.
    child.stdin.end();
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
|
|
4011
|
+
|
|
4012
|
+
// src/evaluation/providers/targets.ts
|
|
4013
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
4014
|
+
var import_zod = require("zod");
|
|
4015
|
+
// Builders for the optional field shapes repeated throughout the CLI schemas.
// Each call returns a fresh schema instance.
var cliOptionalStringField = () => import_zod.z.string().optional();
var cliOptionalBooleanField = () => import_zod.z.boolean().optional();
var cliOptionalPositiveNumberField = () => import_zod.z.number().positive().optional();

// ---- Input schemas: user-facing config; snake_case and camelCase both accepted ----

// HTTP healthcheck as written in a targets file.
var CliHealthcheckHttpInputSchema = import_zod.z.object({
  type: import_zod.z.literal("http"),
  url: import_zod.z.string().min(1, "healthcheck URL is required"),
  timeout_seconds: cliOptionalPositiveNumberField(),
  timeoutSeconds: cliOptionalPositiveNumberField()
});
// Command healthcheck as written in a targets file.
var CliHealthcheckCommandInputSchema = import_zod.z.object({
  type: import_zod.z.literal("command"),
  command_template: cliOptionalStringField(),
  commandTemplate: cliOptionalStringField(),
  cwd: cliOptionalStringField(),
  timeout_seconds: cliOptionalPositiveNumberField(),
  timeoutSeconds: cliOptionalPositiveNumberField()
});
var CliHealthcheckInputSchema = import_zod.z.discriminatedUnion("type", [
  CliHealthcheckHttpInputSchema,
  CliHealthcheckCommandInputSchema
]);
// Full CLI target definition as written in a targets file.
var CliTargetInputSchema = import_zod.z.object({
  name: import_zod.z.string().min(1, "target name is required"),
  provider: import_zod.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
  // Command template: each spelling is optional on its own; the refine below
  // requires at least one of them.
  command_template: cliOptionalStringField(),
  commandTemplate: cliOptionalStringField(),
  // Files format (optional)
  files_format: cliOptionalStringField(),
  filesFormat: cliOptionalStringField(),
  attachments_format: cliOptionalStringField(),
  attachmentsFormat: cliOptionalStringField(),
  // Working directory (optional)
  cwd: cliOptionalStringField(),
  // Timeout in seconds (optional)
  timeout_seconds: cliOptionalPositiveNumberField(),
  timeoutSeconds: cliOptionalPositiveNumberField(),
  // Healthcheck configuration (optional)
  healthcheck: CliHealthcheckInputSchema.optional(),
  // Verbose mode (optional)
  verbose: cliOptionalBooleanField(),
  cli_verbose: cliOptionalBooleanField(),
  cliVerbose: cliOptionalBooleanField(),
  // Keep temp files (optional)
  keep_temp_files: cliOptionalBooleanField(),
  keepTempFiles: cliOptionalBooleanField(),
  keep_output_files: cliOptionalBooleanField(),
  keepOutputFiles: cliOptionalBooleanField(),
  // Fields common to all targets
  judge_target: cliOptionalStringField(),
  workers: import_zod.z.number().int().min(1).optional(),
  provider_batching: cliOptionalBooleanField(),
  providerBatching: cliOptionalBooleanField()
}).refine((data) => data.command_template !== void 0 || data.commandTemplate !== void 0, {
  message: "Either command_template or commandTemplate is required"
});

// ---- Normalized schemas: strict, camelCase only, timeouts in milliseconds ----

var CliHealthcheckHttpSchema = import_zod.z.object({
  type: import_zod.z.literal("http"),
  url: import_zod.z.string().min(1),
  timeoutMs: cliOptionalPositiveNumberField()
}).strict();
var CliHealthcheckCommandSchema = import_zod.z.object({
  type: import_zod.z.literal("command"),
  commandTemplate: import_zod.z.string().min(1),
  cwd: cliOptionalStringField(),
  timeoutMs: cliOptionalPositiveNumberField()
}).strict();
var CliHealthcheckSchema = import_zod.z.discriminatedUnion("type", [
  CliHealthcheckHttpSchema,
  CliHealthcheckCommandSchema
]);
var CliTargetConfigSchema = import_zod.z.object({
  commandTemplate: import_zod.z.string().min(1),
  filesFormat: cliOptionalStringField(),
  cwd: cliOptionalStringField(),
  timeoutMs: cliOptionalPositiveNumberField(),
  healthcheck: CliHealthcheckSchema.optional(),
  verbose: cliOptionalBooleanField(),
  keepTempFiles: cliOptionalBooleanField()
}).strict();
|
|
4092
|
+
/**
 * Converts a healthcheck from its input form (snake_case or camelCase, timeout
 * in seconds) to the normalized form (camelCase, timeout in milliseconds).
 *
 * @param {object} input Parsed healthcheck input; `type` is "http" or "command".
 * @param {object} env Environment passed to the string resolvers.
 * @param {string} targetName Used in error descriptions.
 * @param {string} [evalFilePath] When given, a relative `cwd` is resolved
 *   against this file's directory.
 * @returns {object} `{type: "http", url, timeoutMs}` or
 *   `{type: "command", commandTemplate, cwd, timeoutMs}`.
 * @throws {Error} When a command healthcheck has no command template.
 */
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
  const seconds = input.timeout_seconds ?? input.timeoutSeconds;
  const timeoutMs = seconds === void 0 ? void 0 : Math.floor(seconds * 1e3);
  if (input.type === "http") {
    return {
      type: "http",
      url: resolveString(input.url, env, `${targetName} healthcheck URL`),
      timeoutMs
    };
  }
  const rawTemplate = input.command_template ?? input.commandTemplate;
  if (rawTemplate === void 0) {
    throw new Error(
      `${targetName} healthcheck: Either command_template or commandTemplate is required for command healthcheck`
    );
  }
  const commandTemplate = resolveString(
    rawTemplate,
    env,
    `${targetName} healthcheck command template`,
    true
  );
  const resolvedCwd = resolveOptionalString(input.cwd, env, `${targetName} healthcheck cwd`, {
    allowLiteral: true,
    optionalEnv: true
  });
  const { default: pathApi } = import_node_path12;
  const needsAnchoring = resolvedCwd && evalFilePath && !pathApi.isAbsolute(resolvedCwd);
  const cwd = needsAnchoring
    ? pathApi.resolve(pathApi.dirname(pathApi.resolve(evalFilePath)), resolvedCwd)
    : resolvedCwd;
  return { type: "command", commandTemplate, cwd, timeoutMs };
}
|
|
4129
|
+
/**
 * Converts a CLI target from its input form (snake_case or camelCase, timeout
 * in seconds) to the normalized config (camelCase, timeout in milliseconds).
 *
 * The working directory defaults to the eval file's directory when
 * `evalFilePath` is given; a relative `cwd` is resolved against that directory.
 *
 * @param {object} input Parsed CLI target input.
 * @param {object} env Environment passed to the string resolvers.
 * @param {string} [evalFilePath] Path of the eval file being run.
 * @returns {{commandTemplate: string, filesFormat: *, cwd: *, timeoutMs: *,
 *   healthcheck: *, verbose: *, keepTempFiles: *}}
 * @throws {Error} When no command template is provided.
 */
function normalizeCliTargetInput(input, env, evalFilePath) {
  const targetName = input.name;
  const rawTemplate = input.command_template ?? input.commandTemplate;
  if (rawTemplate === void 0) {
    throw new Error(`${targetName}: Either command_template or commandTemplate is required`);
  }
  const commandTemplate = resolveString(
    rawTemplate,
    env,
    `${targetName} CLI command template`,
    true
  );
  const filesFormat = resolveOptionalLiteralString(
    input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat
  );
  const { default: pathApi } = import_node_path12;
  // Computed lazily so evalFilePath is only touched when it is actually needed.
  const evalDirectory = () => pathApi.dirname(pathApi.resolve(evalFilePath));
  let cwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
    allowLiteral: true,
    optionalEnv: true
  });
  if (evalFilePath) {
    if (!cwd) {
      cwd = evalDirectory();
    } else if (!pathApi.isAbsolute(cwd)) {
      cwd = pathApi.resolve(evalDirectory(), cwd);
    }
  }
  const seconds = input.timeout_seconds ?? input.timeoutSeconds;
  const timeoutMs = seconds === void 0 ? void 0 : Math.floor(seconds * 1e3);
  const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
  const keepTempFiles = resolveOptionalBoolean(
    input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
  );
  let healthcheck;
  if (input.healthcheck) {
    healthcheck = normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath);
  }
  return { commandTemplate, filesFormat, cwd, timeoutMs, healthcheck, verbose, keepTempFiles };
}
|
|
4170
|
+
// Placeholder names known for CLI command templates.
// NOTE(review): how these are substituted or validated is not visible here.
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(
  ["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES", "OUTPUT_FILE"]
);
|
|
4178
|
+
// Minimal shape every target definition must satisfy. `passthrough()` keeps
// all other keys so the provider-specific resolvers can read them.
var BASE_TARGET_SCHEMA = import_zod.z
  .object({
    name: import_zod.z.string().min(1, "target name is required"),
    provider: import_zod.z.string().min(1, "provider is required"),
    judge_target: import_zod.z.string().optional(),
    workers: import_zod.z.number().int().min(1).optional()
  })
  .passthrough();
|
|
4184
|
+
// API version used when a target does not specify one.
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
/**
 * Normalizes a configured Azure API version. Surrounding whitespace and an
 * optional leading `api-version=` / `api_version=` / `apiversion=` prefix
 * (case-insensitive) are removed.
 *
 * @param {string | undefined} value Configured value.
 * @returns {string} The cleaned version, or DEFAULT_AZURE_API_VERSION when the
 *   value is missing or empty after cleaning.
 */
function normalizeAzureApiVersion(value) {
  if (!value) {
    return DEFAULT_AZURE_API_VERSION;
  }
  const candidate = value.trim().replace(/^api[-_]?version\s*=\s*/i, "").trim();
  return candidate.length > 0 ? candidate : DEFAULT_AZURE_API_VERSION;
}
|
|
4196
|
+
/**
 * Reads the retry settings of a target, accepting snake_case and camelCase keys.
 *
 * @param {object} target Parsed target definition.
 * @returns {{maxRetries: *, initialDelayMs: *, maxDelayMs: *, backoffFactor: *,
 *   retryableStatusCodes: *} | undefined} Undefined when no retry setting is present.
 */
function resolveRetryConfig(target) {
  const describe = (what) => `${target.name} ${what}`;
  const retry = {
    maxRetries: resolveOptionalNumber(
      target.max_retries ?? target.maxRetries,
      describe("max retries")
    ),
    initialDelayMs: resolveOptionalNumber(
      target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
      describe("retry initial delay")
    ),
    maxDelayMs: resolveOptionalNumber(
      target.retry_max_delay_ms ?? target.retryMaxDelayMs,
      describe("retry max delay")
    ),
    backoffFactor: resolveOptionalNumber(
      target.retry_backoff_factor ?? target.retryBackoffFactor,
      describe("retry backoff factor")
    ),
    retryableStatusCodes: resolveOptionalNumberArray(
      target.retry_status_codes ?? target.retryStatusCodes,
      describe("retry status codes")
    )
  };
  const hasAnySetting = Object.values(retry).some((setting) => setting !== void 0);
  return hasAnySetting ? retry : void 0;
}
|
|
4228
|
+
/**
 * Validates a raw target definition and resolves it into a typed target:
 * `{kind, name, judgeTarget, workers, providerBatching, config}`.
 *
 * The provider name is matched case-insensitively; several aliases map to the
 * same kind (for example "azure-openai" -> "azure", "pi" -> "pi-coding-agent").
 *
 * @param {unknown} definition Raw target definition.
 * @param {object} [env=process.env] Environment passed to the config resolvers.
 * @param {string} [evalFilePath] Eval file path, passed to the CLI resolver.
 * @returns {object} The resolved target.
 * @throws {Error} When the provider is not supported, or when validation or a
 *   provider-specific resolver fails.
 */
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
  const provider = parsed.provider.toLowerCase();
  const providerBatching = resolveOptionalBoolean(
    parsed.provider_batching ?? parsed.providerBatching
  );
  // Every kind shares these fields; only `kind` and `config` vary.
  const asTarget = (kind, config) => ({
    kind,
    name: parsed.name,
    judgeTarget: parsed.judge_target,
    workers: parsed.workers,
    providerBatching,
    config
  });
  switch (provider) {
    case "azure":
    case "azure-openai":
      return asTarget("azure", resolveAzureConfig(parsed, env));
    case "anthropic":
      return asTarget("anthropic", resolveAnthropicConfig(parsed, env));
    case "gemini":
    case "google":
    case "google-gemini":
      return asTarget("gemini", resolveGeminiConfig(parsed, env));
    case "codex":
    case "codex-cli":
      return asTarget("codex", resolveCodexConfig(parsed, env));
    case "pi":
    case "pi-coding-agent":
      return asTarget("pi-coding-agent", resolvePiCodingAgentConfig(parsed, env));
    case "mock":
      return asTarget("mock", resolveMockConfig(parsed));
    case "vscode":
    case "vscode-insiders":
      // The kind is the provider name itself for the two VS Code variants.
      return asTarget(provider, resolveVSCodeConfig(parsed, env, provider === "vscode-insiders"));
    case "cli":
      return asTarget("cli", resolveCliConfig(parsed, env, evalFilePath));
    default:
      throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
  }
}
|
|
4317
|
+
/**
 * Resolves the Azure provider config from a parsed target definition.
 *
 * Key fallbacks: endpoint <- `endpoint` | `resource` | `resourceName`;
 * deployment <- `deployment` | `deploymentName` | `model`;
 * version <- `version` | `api_version`;
 * max tokens <- `max_output_tokens` | `maxTokens`.
 *
 * @param {object} target Parsed target definition.
 * @param {object} env Environment passed to the string resolvers.
 * @returns {{resourceName: *, deploymentName: *, apiKey: *, version: string,
 *   temperature: *, maxOutputTokens: *, retry: *}}
 */
function resolveAzureConfig(target, env) {
  const describe = (what) => `${target.name} ${what}`;
  const resourceName = resolveString(
    target.endpoint ?? target.resource ?? target.resourceName,
    env,
    describe("endpoint")
  );
  const apiKey = resolveString(target.api_key ?? target.apiKey, env, describe("api key"));
  const deploymentName = resolveString(
    target.deployment ?? target.deploymentName ?? target.model,
    env,
    describe("deployment")
  );
  const configuredVersion = resolveOptionalString(
    target.version ?? target.api_version,
    env,
    describe("api version"),
    { allowLiteral: true, optionalEnv: true }
  );
  const version = normalizeAzureApiVersion(configuredVersion);
  const temperature = resolveOptionalNumber(target.temperature, describe("temperature"));
  const maxOutputTokens = resolveOptionalNumber(
    target.max_output_tokens ?? target.maxTokens,
    describe("max output tokens")
  );
  const retry = resolveRetryConfig(target);
  return { resourceName, deploymentName, apiKey, version, temperature, maxOutputTokens, retry };
}
|
|
4349
|
+
/**
 * Builds the Anthropic provider settings from a raw target definition.
 *
 * The api key and model are mandatory (resolved with resolveString); the
 * numeric settings are optional. The retry configuration is resolved before
 * the numeric settings, matching the order in which failures surface.
 *
 * @param {object} target - Raw target definition (must carry `name`).
 * @param {object} env - Environment lookup passed through to the string resolvers.
 * @returns {{apiKey, model, temperature, maxOutputTokens, thinkingBudget, retry}}
 */
function resolveAnthropicConfig(target, env) {
  const describe = (suffix) => `${target.name} ${suffix}`;

  const apiKey = resolveString(target.api_key ?? target.apiKey, env, describe("Anthropic api key"));
  const model = resolveString(
    target.model ?? target.deployment ?? target.variant,
    env,
    describe("Anthropic model")
  );
  const retry = resolveRetryConfig(target);

  const temperature = resolveOptionalNumber(target.temperature, describe("temperature"));
  const maxOutputTokens = resolveOptionalNumber(
    target.max_output_tokens ?? target.maxTokens,
    describe("max output tokens")
  );
  const thinkingBudget = resolveOptionalNumber(
    target.thinking_budget ?? target.thinkingBudget,
    describe("thinking budget")
  );

  return { apiKey, model, temperature, maxOutputTokens, thinkingBudget, retry };
}
|
|
4367
|
+
/**
 * Builds the Gemini provider settings from a raw target definition.
 *
 * The api key is mandatory. The model is optional and falls back to
 * "gemini-2.5-flash" when the resolver yields null/undefined.
 *
 * @param {object} target - Raw target definition (must carry `name`).
 * @param {object} env - Environment lookup passed through to the string resolvers.
 * @returns {{apiKey, model, temperature, maxOutputTokens, retry}}
 */
function resolveGeminiConfig(target, env) {
  const describe = (suffix) => `${target.name} ${suffix}`;

  const apiKey = resolveString(target.api_key ?? target.apiKey, env, describe("Google API key"));

  const configuredModel = resolveOptionalString(
    target.model ?? target.deployment ?? target.variant,
    env,
    describe("Gemini model"),
    { allowLiteral: true, optionalEnv: true }
  );
  const model = configuredModel ?? "gemini-2.5-flash";

  const retry = resolveRetryConfig(target);
  const temperature = resolveOptionalNumber(target.temperature, describe("temperature"));
  const maxOutputTokens = resolveOptionalNumber(
    target.max_output_tokens ?? target.maxTokens,
    describe("max output tokens")
  );

  return { apiKey, model, temperature, maxOutputTokens, retry };
}
|
|
4386
|
+
function resolveCodexConfig(target, env) {
|
|
4387
|
+
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
4388
|
+
const argsSource = target.args ?? target.arguments;
|
|
4389
|
+
const cwdSource = target.cwd;
|
|
4390
|
+
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
3429
4391
|
const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
|
|
3430
4392
|
const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
4393
|
+
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
3431
4394
|
const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
|
|
3432
4395
|
allowLiteral: true,
|
|
3433
4396
|
optionalEnv: true
|
|
@@ -3443,13 +4406,15 @@ function resolveCodexConfig(target, env) {
|
|
|
3443
4406
|
optionalEnv: true
|
|
3444
4407
|
});
|
|
3445
4408
|
const logFormat = normalizeCodexLogFormat(logFormatSource);
|
|
4409
|
+
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
3446
4410
|
return {
|
|
3447
4411
|
executable,
|
|
3448
4412
|
args,
|
|
3449
4413
|
cwd,
|
|
3450
4414
|
timeoutMs,
|
|
3451
4415
|
logDir,
|
|
3452
|
-
logFormat
|
|
4416
|
+
logFormat,
|
|
4417
|
+
systemPrompt
|
|
3453
4418
|
};
|
|
3454
4419
|
}
|
|
3455
4420
|
function normalizeCodexLogFormat(value) {
|
|
@@ -3465,6 +4430,70 @@ function normalizeCodexLogFormat(value) {
|
|
|
3465
4430
|
}
|
|
3466
4431
|
throw new Error("codex log format must be 'summary' or 'json'");
|
|
3467
4432
|
}
|
|
4433
|
+
/**
 * Builds the settings for the "pi" coding-agent provider from a raw target
 * definition.
 *
 * Every setting accepts several alias keys; the first non-nullish alias wins.
 * All string settings are optional. The executable defaults to "pi". The api
 * key is the only string resolved with `allowLiteral: false`.
 *
 * `log_format` is kept only when it is exactly "json" or "summary"; any other
 * value is silently dropped. `system_prompt` is kept only when it is a string
 * that is non-empty after trimming, and the trimmed text is what is returned.
 *
 * @param {object} target - Raw target definition (must carry `name`).
 * @param {object} env - Environment lookup passed through to the resolvers.
 * @returns {object} Resolved executable, provider, model, apiKey, tools,
 *   thinking, args, cwd, timeoutMs, logDir, logFormat and systemPrompt.
 */
function resolvePiCodingAgentConfig(target, env) {
  const describe = (suffix) => `${target.name} pi ${suffix}`;
  // Optional string lookup; literal values are accepted unless stated otherwise.
  const optional = (source, suffix, allowLiteral = true) =>
    resolveOptionalString(source, env, describe(suffix), { allowLiteral, optionalEnv: true });

  const rawLogFormat = target.log_format ?? target.logFormat;
  const rawSystemPrompt = target.system_prompt ?? target.systemPrompt;

  const executable =
    optional(target.executable ?? target.command ?? target.binary, "executable") ?? "pi";
  const provider = optional(
    target.pi_provider ?? target.piProvider ?? target.llm_provider,
    "provider"
  );
  const model = optional(target.model ?? target.pi_model ?? target.piModel, "model");
  const apiKey = optional(target.api_key ?? target.apiKey, "api key", false);
  const tools = optional(target.tools ?? target.pi_tools ?? target.piTools, "tools");
  const thinking = optional(
    target.thinking ?? target.pi_thinking ?? target.piThinking,
    "thinking"
  );
  const args = resolveOptionalStringArray(target.args ?? target.arguments, env, describe("args"));
  const cwd = optional(target.cwd, "cwd");
  const timeoutMs = resolveTimeoutMs(
    target.timeout_seconds ?? target.timeoutSeconds,
    describe("timeout")
  );
  const logDir = optional(
    target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory,
    "log directory"
  );

  let logFormat;
  if (rawLogFormat === "json" || rawLogFormat === "summary") {
    logFormat = rawLogFormat;
  }

  let systemPrompt;
  if (typeof rawSystemPrompt === "string" && rawSystemPrompt.trim().length > 0) {
    systemPrompt = rawSystemPrompt.trim();
  }

  return {
    executable,
    provider,
    model,
    apiKey,
    tools,
    thinking,
    args,
    cwd,
    timeoutMs,
    logDir,
    logFormat,
    systemPrompt
  };
}
|
|
3468
4497
|
function resolveMockConfig(target) {
|
|
3469
4498
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
3470
4499
|
return { response };
|
|
@@ -3499,46 +4528,35 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
3499
4528
|
workspaceTemplate
|
|
3500
4529
|
};
|
|
3501
4530
|
}
|
|
3502
|
-
|
|
3503
|
-
|
|
3504
|
-
|
|
3505
|
-
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
3506
|
-
);
|
|
3507
|
-
const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
|
|
3508
|
-
const keepTempFiles = resolveOptionalBoolean(
|
|
3509
|
-
target.keep_temp_files ?? target.keepTempFiles ?? target.keep_output_files ?? target.keepOutputFiles
|
|
3510
|
-
);
|
|
3511
|
-
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
3512
|
-
allowLiteral: true,
|
|
3513
|
-
optionalEnv: true
|
|
3514
|
-
});
|
|
3515
|
-
if (cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd)) {
|
|
3516
|
-
cwd = import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd);
|
|
4531
|
+
/**
 * Zod error map used when validating CLI provider settings.
 *
 * Rewrites three issue kinds into friendlier messages and leaves every other
 * issue with Zod's default message:
 *  - unrecognized keys  -> lists the unknown setting names
 *  - invalid union discriminator -> fixed healthcheck-type message
 *  - invalid type where a string was expected -> default message plus a hint
 *
 * @param {object} issue - The Zod issue being reported.
 * @param {{defaultError: string}} ctx - Zod context carrying the default message.
 * @returns {{message: string}}
 */
var cliErrorMap = (issue, ctx) => {
  const codes = import_zod.z.ZodIssueCode;
  switch (issue.code) {
    case codes.unrecognized_keys:
      return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
    case codes.invalid_union_discriminator:
      return { message: "healthcheck type must be 'http' or 'command'" };
    case codes.invalid_type:
      if (issue.expected === "string") {
        return { message: `${ctx.defaultError} (expected a string value)` };
      }
      break;
    default:
      break;
  }
  return { message: ctx.defaultError };
};
|
|
4543
|
+
/**
 * Validates and normalizes the settings of a CLI provider target.
 *
 * The raw target is parsed with CliTargetInputSchema (using cliErrorMap for
 * messages). On failure only the first reported issue is surfaced, prefixed
 * with the target name and, when present, the dotted path of the offending
 * setting. On success the parsed data is normalized and the command template
 * (plus the healthcheck command template, for "command" healthchecks) is
 * checked for supported placeholders.
 *
 * @param {object} target - Raw target definition (must carry `name`).
 * @param {object} env - Environment lookup passed to the normalizer.
 * @param {string} [evalFilePath] - Passed through to the normalizer.
 * @returns {object} The normalized CLI configuration.
 * @throws {Error} When schema validation fails (placeholder checks may also throw).
 */
function resolveCliConfig(target, env, evalFilePath) {
  const outcome = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
  if (!outcome.success) {
    const [issue] = outcome.error.errors;
    const location = issue?.path.join(".") || "";
    const subject = location ? `${target.name} ${location}` : `${target.name}`;
    throw new Error(`${subject}: ${issue?.message}`);
  }

  const config = normalizeCliTargetInput(outcome.data, env, evalFilePath);
  assertSupportedCliPlaceholders(config.commandTemplate, `${target.name} CLI command template`);

  const { healthcheck } = config;
  if (healthcheck?.type === "command") {
    assertSupportedCliPlaceholders(
      healthcheck.commandTemplate,
      `${target.name} healthcheck command template`
    );
  }
  return config;
}
|
|
3543
4561
|
function resolveTimeoutMs(source, description) {
|
|
3544
4562
|
const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
|
|
@@ -3550,49 +4568,6 @@ function resolveTimeoutMs(source, description) {
|
|
|
3550
4568
|
}
|
|
3551
4569
|
return Math.floor(seconds * 1e3);
|
|
3552
4570
|
}
|
|
3553
|
-
function resolveCliHealthcheck(source, env, targetName, evalFilePath) {
|
|
3554
|
-
if (source === void 0 || source === null) {
|
|
3555
|
-
return void 0;
|
|
3556
|
-
}
|
|
3557
|
-
if (typeof source !== "object" || Array.isArray(source)) {
|
|
3558
|
-
throw new Error(`${targetName} healthcheck must be an object`);
|
|
3559
|
-
}
|
|
3560
|
-
const candidate = source;
|
|
3561
|
-
const type = candidate.type;
|
|
3562
|
-
const timeoutMs = resolveTimeoutMs(
|
|
3563
|
-
candidate.timeout_seconds ?? candidate.timeoutSeconds,
|
|
3564
|
-
`${targetName} healthcheck timeout`
|
|
3565
|
-
);
|
|
3566
|
-
if (type === "http") {
|
|
3567
|
-
const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
|
|
3568
|
-
return {
|
|
3569
|
-
type: "http",
|
|
3570
|
-
url,
|
|
3571
|
-
timeoutMs
|
|
3572
|
-
};
|
|
3573
|
-
}
|
|
3574
|
-
if (type === "command") {
|
|
3575
|
-
const commandTemplate = resolveString(
|
|
3576
|
-
candidate.command_template ?? candidate.commandTemplate,
|
|
3577
|
-
env,
|
|
3578
|
-
`${targetName} healthcheck command template`,
|
|
3579
|
-
true
|
|
3580
|
-
);
|
|
3581
|
-
assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
|
|
3582
|
-
const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
|
|
3583
|
-
allowLiteral: true,
|
|
3584
|
-
optionalEnv: true
|
|
3585
|
-
});
|
|
3586
|
-
const resolvedCwd = cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd) ? import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd) : cwd;
|
|
3587
|
-
return {
|
|
3588
|
-
type: "command",
|
|
3589
|
-
commandTemplate,
|
|
3590
|
-
timeoutMs,
|
|
3591
|
-
cwd: resolvedCwd
|
|
3592
|
-
};
|
|
3593
|
-
}
|
|
3594
|
-
throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
|
|
3595
|
-
}
|
|
3596
4571
|
function assertSupportedCliPlaceholders(template, description) {
|
|
3597
4572
|
const placeholders = extractCliPlaceholders(template);
|
|
3598
4573
|
for (const placeholder of placeholders) {
|
|
@@ -3758,7 +4733,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
3758
4733
|
}
|
|
3759
4734
|
|
|
3760
4735
|
// src/evaluation/providers/vscode.ts
|
|
3761
|
-
var
|
|
4736
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
3762
4737
|
var import_subagent = require("subagent");
|
|
3763
4738
|
|
|
3764
4739
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -3928,7 +4903,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
3928
4903
|
return "";
|
|
3929
4904
|
}
|
|
3930
4905
|
const buildList = (files) => files.map((absolutePath) => {
|
|
3931
|
-
const fileName =
|
|
4906
|
+
const fileName = import_node_path13.default.basename(absolutePath);
|
|
3932
4907
|
const fileUri = pathToFileUri2(absolutePath);
|
|
3933
4908
|
return `* [${fileName}](${fileUri})`;
|
|
3934
4909
|
});
|
|
@@ -3953,8 +4928,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3953
4928
|
}
|
|
3954
4929
|
const unique = /* @__PURE__ */ new Map();
|
|
3955
4930
|
for (const attachment of attachments) {
|
|
3956
|
-
const absolutePath =
|
|
3957
|
-
const normalized = absolutePath.split(
|
|
4931
|
+
const absolutePath = import_node_path13.default.resolve(attachment);
|
|
4932
|
+
const normalized = absolutePath.split(import_node_path13.default.sep).join("/");
|
|
3958
4933
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3959
4934
|
if (!unique.has(absolutePath)) {
|
|
3960
4935
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3969,7 +4944,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3969
4944
|
}
|
|
3970
4945
|
const unique = /* @__PURE__ */ new Map();
|
|
3971
4946
|
for (const attachment of attachments) {
|
|
3972
|
-
const absolutePath =
|
|
4947
|
+
const absolutePath = import_node_path13.default.resolve(attachment);
|
|
3973
4948
|
if (!unique.has(absolutePath)) {
|
|
3974
4949
|
unique.set(absolutePath, absolutePath);
|
|
3975
4950
|
}
|
|
@@ -3977,7 +4952,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3977
4952
|
return Array.from(unique.values());
|
|
3978
4953
|
}
|
|
3979
4954
|
function pathToFileUri2(filePath) {
|
|
3980
|
-
const absolutePath =
|
|
4955
|
+
const absolutePath = import_node_path13.default.isAbsolute(filePath) ? filePath : import_node_path13.default.resolve(filePath);
|
|
3981
4956
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3982
4957
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3983
4958
|
return `file:///${normalizedPath}`;
|
|
@@ -3990,7 +4965,7 @@ function normalizeAttachments(attachments) {
|
|
|
3990
4965
|
}
|
|
3991
4966
|
const deduped = /* @__PURE__ */ new Set();
|
|
3992
4967
|
for (const attachment of attachments) {
|
|
3993
|
-
deduped.add(
|
|
4968
|
+
deduped.add(import_node_path13.default.resolve(attachment));
|
|
3994
4969
|
}
|
|
3995
4970
|
return Array.from(deduped);
|
|
3996
4971
|
}
|
|
@@ -3999,7 +4974,7 @@ function mergeAttachments(all) {
|
|
|
3999
4974
|
for (const list of all) {
|
|
4000
4975
|
if (!list) continue;
|
|
4001
4976
|
for (const inputFile of list) {
|
|
4002
|
-
deduped.add(
|
|
4977
|
+
deduped.add(import_node_path13.default.resolve(inputFile));
|
|
4003
4978
|
}
|
|
4004
4979
|
}
|
|
4005
4980
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -4046,9 +5021,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
4046
5021
|
}
|
|
4047
5022
|
|
|
4048
5023
|
// src/evaluation/providers/targets-file.ts
|
|
4049
|
-
var
|
|
4050
|
-
var
|
|
4051
|
-
var
|
|
5024
|
+
var import_node_fs5 = require("fs");
|
|
5025
|
+
var import_promises11 = require("fs/promises");
|
|
5026
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
4052
5027
|
var import_yaml3 = require("yaml");
|
|
4053
5028
|
function isRecord(value) {
|
|
4054
5029
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -4078,18 +5053,18 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
4078
5053
|
}
|
|
4079
5054
|
/**
 * Reports whether `filePath` exists, by probing it with fs `access` and the
 * F_OK flag. Any failure of that probe is treated as "does not exist".
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} true when the access check succeeds.
 */
async function fileExists3(filePath) {
  let exists = true;
  try {
    await (0, import_promises11.access)(filePath, import_node_fs5.constants.F_OK);
  } catch {
    exists = false;
  }
  return exists;
}
|
|
4087
5062
|
async function readTargetDefinitions(filePath) {
|
|
4088
|
-
const absolutePath =
|
|
5063
|
+
const absolutePath = import_node_path14.default.resolve(filePath);
|
|
4089
5064
|
if (!await fileExists3(absolutePath)) {
|
|
4090
5065
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
4091
5066
|
}
|
|
4092
|
-
const raw = await (0,
|
|
5067
|
+
const raw = await (0, import_promises11.readFile)(absolutePath, "utf8");
|
|
4093
5068
|
const parsed = (0, import_yaml3.parse)(raw);
|
|
4094
5069
|
if (!isRecord(parsed)) {
|
|
4095
5070
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -4117,6 +5092,8 @@ function createProvider(target) {
|
|
|
4117
5092
|
return new CliProvider(target.name, target.config);
|
|
4118
5093
|
case "codex":
|
|
4119
5094
|
return new CodexProvider(target.name, target.config);
|
|
5095
|
+
case "pi-coding-agent":
|
|
5096
|
+
return new PiCodingAgentProvider(target.name, target.config);
|
|
4120
5097
|
case "mock":
|
|
4121
5098
|
return new MockProvider(target.name, target.config);
|
|
4122
5099
|
case "vscode":
|
|
@@ -4137,9 +5114,76 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
4137
5114
|
var import_ai2 = require("ai");
|
|
4138
5115
|
var import_zod2 = require("zod");
|
|
4139
5116
|
|
|
5117
|
+
// src/runtime/exec.ts
|
|
5118
|
+
/**
 * Returns the global `Bun.spawn` function when one is present, otherwise
 * undefined. Used to pick the Bun process API over Node's child_process.
 *
 * @returns {Function|undefined}
 */
function getBunSpawn() {
  const candidate = globalThis.Bun?.spawn;
  if (typeof candidate !== "function") {
    return void 0;
  }
  return candidate;
}
|
|
5122
|
+
/**
 * Runs `command` through a shell, feeds `stdinPayload` to its stdin and
 * collects its output.
 *
 * Uses `Bun.spawn` when running under Bun, otherwise Node's `child_process`.
 * Both paths behave the same way from the caller's point of view:
 *  - a timeout kills the process and rejects with "Process timed out after Nms";
 *  - stdout and stderr are collected completely before the result is returned;
 *  - a non-zero exit is NOT an error here - it is reported through `exitCode`.
 *
 * @param {string} command - Shell command line to execute.
 * @param {string} stdinPayload - Text written to the process's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} [options] - Working directory and
 *   optional timeout (a falsy timeout disables it).
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 * @throws {Error} On timeout, or when the process cannot be spawned.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const { cwd, timeoutMs } = options;
  const timeoutError = () => new Error(`Process timed out after ${timeoutMs}ms`);

  const bunSpawn = getBunSpawn();
  if (bunSpawn) {
    const proc = bunSpawn({
      cmd: ["sh", "-c", command],
      cwd,
      stdin: new TextEncoder().encode(stdinPayload),
      stdout: "pipe",
      stderr: "pipe"
    });
    let timedOut = false;
    const timer = timeoutMs ? setTimeout(() => {
      timedOut = true;
      proc.kill();
    }, timeoutMs) : void 0;
    try {
      // Drain both pipes concurrently: reading stdout to completion first can
      // stall a process that is blocked writing to a full stderr pipe.
      const [stdout, stderr, exitCode] = await Promise.all([
        new Response(proc.stdout).text(),
        new Response(proc.stderr).text(),
        proc.exited
      ]);
      if (timedOut) {
        // Match the Node path: a killed-by-timeout run is an error, not a result.
        throw timeoutError();
      }
      return { stdout, stderr, exitCode };
    } finally {
      if (timer !== void 0) {
        clearTimeout(timer);
      }
    }
  }

  const { spawn: spawn3 } = await import("child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn3(command, {
      shell: true,
      cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    let stdout = "";
    let stderr = "";
    let timer;
    let settled = false;
    // Settle the promise exactly once and always release the timer.
    const settle = (finish, value) => {
      if (settled) {
        return;
      }
      settled = true;
      if (timer !== void 0) {
        clearTimeout(timer);
      }
      finish(value);
    };
    if (timeoutMs) {
      timer = setTimeout(() => {
        child.kill();
        settle(reject, timeoutError());
      }, timeoutMs);
    }
    child.stdout?.on("data", (data) => {
      stdout += data.toString();
    });
    child.stderr?.on("data", (data) => {
      stderr += data.toString();
    });
    child.on("error", (error) => settle(reject, error));
    // "close" fires after the stdio streams have ended, so all output has been
    // received; "exit" may fire while data is still buffered in the pipes.
    child.on("close", (code) => {
      // `code` is null when the process was terminated by a signal; report
      // that as a failure rather than as exit code 0.
      settle(resolve, { stdout, stderr, exitCode: code ?? 1 });
    });
    // A process that exits without reading stdin makes the write fail (EPIPE).
    // That is not an error of this call - the exit code and output still tell
    // the caller what happened - but without a listener it would be thrown as
    // an unhandled stream error.
    child.stdin?.on("error", () => {
    });
    child.stdin?.end(stdinPayload);
  });
}
|
|
5182
|
+
|
|
4140
5183
|
// src/evaluation/providers/types.ts
|
|
4141
5184
|
var AGENT_PROVIDER_KINDS = [
|
|
4142
5185
|
"codex",
|
|
5186
|
+
"pi-coding-agent",
|
|
4143
5187
|
"vscode",
|
|
4144
5188
|
"vscode-insiders"
|
|
4145
5189
|
];
|
|
@@ -4438,17 +5482,17 @@ var CodeEvaluator = class {
|
|
|
4438
5482
|
const inputPayload = JSON.stringify(
|
|
4439
5483
|
{
|
|
4440
5484
|
question: context.evalCase.question,
|
|
4441
|
-
|
|
4442
|
-
|
|
4443
|
-
|
|
4444
|
-
|
|
4445
|
-
|
|
4446
|
-
|
|
4447
|
-
|
|
4448
|
-
(
|
|
5485
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
5486
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
5487
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
5488
|
+
candidateAnswer: context.candidate,
|
|
5489
|
+
outputMessages: context.outputMessages ?? null,
|
|
5490
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
5491
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
5492
|
+
(path16) => !context.evalCase.guideline_paths.includes(path16)
|
|
4449
5493
|
),
|
|
4450
|
-
|
|
4451
|
-
|
|
5494
|
+
inputMessages: context.evalCase.input_messages,
|
|
5495
|
+
traceSummary: context.traceSummary ?? null
|
|
4452
5496
|
},
|
|
4453
5497
|
null,
|
|
4454
5498
|
2
|
|
@@ -4518,43 +5562,17 @@ function calculateRubricScore(result, rubrics) {
|
|
|
4518
5562
|
return { score, verdict, hits, misses };
|
|
4519
5563
|
}
|
|
4520
5564
|
/**
 * Runs a code-evaluator script as a shell command, passing `input` on stdin,
 * and returns its trimmed stdout.
 *
 * @param {string} scriptPath - Command line of the evaluator script.
 * @param {string} input - Payload written to the script's stdin.
 * @param {number} [agentTimeoutMs] - Timeout forwarded to execShellWithStdin.
 * @param {string} [cwd] - Working directory for the script.
 * @returns {Promise<string>} Trimmed stdout of the script.
 * @throws {Error} When the script exits with a non-zero code; stderr is
 *   appended to the message when it is non-empty after trimming.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  const run = await execShellWithStdin(scriptPath, input, {
    cwd,
    timeoutMs: agentTimeoutMs
  });
  if (run.exitCode === 0) {
    return run.stdout.trim();
  }
  const detail = run.stderr.trim();
  if (detail.length > 0) {
    throw new Error(`Code evaluator exited with code ${run.exitCode}: ${detail}`);
  }
  throw new Error(`Code evaluator exited with code ${run.exitCode}`);
}
|
|
4559
5577
|
function parseJsonSafe(payload) {
|
|
4560
5578
|
try {
|
|
@@ -4568,6 +5586,33 @@ function substituteVariables(template, variables) {
|
|
|
4568
5586
|
return variables[varName] ?? match;
|
|
4569
5587
|
});
|
|
4570
5588
|
}
|
|
5589
|
+
/**
 * Structural equality for plain JSON-like values.
 *
 * Primitives are compared with `===`. Arrays must have the same length and
 * pairwise-equal elements. Other objects must have the same number of own
 * enumerable keys, and every key of `a` must be an own key of `b` with a
 * deeply equal value. An array never equals a non-array object.
 *
 * @param {*} a
 * @param {*} b
 * @returns {boolean}
 */
function deepEqual(a, b) {
  if (a === b) {
    return true;
  }
  // From here on the values are not identical, so anything that is not a pair
  // of non-null objects can only be unequal.
  if (a === null || b === null) {
    return false;
  }
  if (typeof a !== "object" || typeof b !== "object") {
    return false;
  }

  const leftIsArray = Array.isArray(a);
  if (leftIsArray !== Array.isArray(b)) {
    return false;
  }
  if (leftIsArray) {
    if (a.length !== b.length) {
      return false;
    }
    return !a.some((item, index) => !deepEqual(item, b[index]));
  }

  const leftKeys = Object.keys(a);
  if (leftKeys.length !== Object.keys(b).length) {
    return false;
  }
  for (const key of leftKeys) {
    if (!Object.hasOwn(b, key)) {
      return false;
    }
    if (!deepEqual(a[key], b[key])) {
      return false;
    }
  }
  return true;
}
|
|
5606
|
+
/**
 * Checks whether a tool call's actual arguments satisfy the expected ones.
 *
 * No expectation (`undefined`) or the wildcard string "any" always matches.
 * Otherwise the match is partial: every key listed in `expected` must be an
 * own key of `actual` with a deeply equal value; extra keys in `actual` are
 * ignored.
 *
 * @param {object|"any"|undefined} expected - Expected arguments or wildcard.
 * @param {object|undefined} actual - Arguments recorded for the tool call.
 * @returns {boolean}
 */
function argsMatch(expected, actual) {
  const unconstrained = expected === void 0 || expected === "any";
  if (unconstrained) {
    return true;
  }
  if (actual === void 0) {
    return false;
  }
  return Object.keys(expected).every(
    (key) => Object.hasOwn(actual, key) && deepEqual(expected[key], actual[key])
  );
}
|
|
4571
5616
|
var ToolTrajectoryEvaluator = class {
|
|
4572
5617
|
kind = "tool_trajectory";
|
|
4573
5618
|
config;
|
|
@@ -4624,7 +5669,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4624
5669
|
for (const message of messages) {
|
|
4625
5670
|
if (message.toolCalls) {
|
|
4626
5671
|
for (const call of message.toolCalls) {
|
|
4627
|
-
toolCalls.push({
|
|
5672
|
+
toolCalls.push({
|
|
5673
|
+
name: call.tool,
|
|
5674
|
+
args: call.input
|
|
5675
|
+
});
|
|
4628
5676
|
}
|
|
4629
5677
|
}
|
|
4630
5678
|
}
|
|
@@ -4693,18 +5741,29 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4693
5741
|
const misses = [];
|
|
4694
5742
|
let actualIndex = 0;
|
|
4695
5743
|
for (let i = 0; i < expected.length; i++) {
|
|
4696
|
-
const
|
|
5744
|
+
const expectedItem = expected[i];
|
|
5745
|
+
const expectedTool = expectedItem.tool;
|
|
4697
5746
|
let found = false;
|
|
5747
|
+
let argsMismatch = false;
|
|
4698
5748
|
while (actualIndex < toolCalls.length) {
|
|
4699
|
-
|
|
4700
|
-
|
|
5749
|
+
const actualCall = toolCalls[actualIndex];
|
|
5750
|
+
if (actualCall.name === expectedTool) {
|
|
5751
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
5752
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
5753
|
+
actualIndex++;
|
|
5754
|
+
found = true;
|
|
5755
|
+
break;
|
|
5756
|
+
}
|
|
5757
|
+
misses.push(
|
|
5758
|
+
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
5759
|
+
);
|
|
4701
5760
|
actualIndex++;
|
|
4702
|
-
|
|
5761
|
+
argsMismatch = true;
|
|
4703
5762
|
break;
|
|
4704
5763
|
}
|
|
4705
5764
|
actualIndex++;
|
|
4706
5765
|
}
|
|
4707
|
-
if (!found) {
|
|
5766
|
+
if (!found && !argsMismatch) {
|
|
4708
5767
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
4709
5768
|
}
|
|
4710
5769
|
}
|
|
@@ -4735,10 +5794,16 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4735
5794
|
}
|
|
4736
5795
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
4737
5796
|
for (let i = 0; i < checkLength; i++) {
|
|
4738
|
-
const
|
|
4739
|
-
const
|
|
5797
|
+
const expectedItem = expected[i];
|
|
5798
|
+
const expectedTool = expectedItem.tool;
|
|
5799
|
+
const actualCall = toolCalls[i];
|
|
5800
|
+
const actualTool = actualCall.name;
|
|
4740
5801
|
if (actualTool === expectedTool) {
|
|
4741
|
-
|
|
5802
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
5803
|
+
hits.push(`Position ${i}: ${expectedTool}`);
|
|
5804
|
+
} else {
|
|
5805
|
+
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
5806
|
+
}
|
|
4742
5807
|
} else {
|
|
4743
5808
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
4744
5809
|
}
|
|
@@ -4982,9 +6047,9 @@ var CompositeEvaluator = class {
|
|
|
4982
6047
|
};
|
|
4983
6048
|
|
|
4984
6049
|
// src/evaluation/orchestrator.ts
|
|
4985
|
-
var
|
|
4986
|
-
var
|
|
4987
|
-
var
|
|
6050
|
+
var import_node_crypto3 = require("crypto");
|
|
6051
|
+
var import_promises12 = require("fs/promises");
|
|
6052
|
+
var import_node_path15 = __toESM(require("path"), 1);
|
|
4988
6053
|
|
|
4989
6054
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
4990
6055
|
var Node = class {
|
|
@@ -5380,7 +6445,12 @@ async function runBatchEvaluation(options) {
|
|
|
5380
6445
|
const promptInputs = promptInputsList[i];
|
|
5381
6446
|
const providerResponse = batchResponse[i];
|
|
5382
6447
|
const outputMessages = providerResponse.outputMessages;
|
|
5383
|
-
const
|
|
6448
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
6449
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
6450
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
6451
|
+
costUsd: providerResponse.costUsd,
|
|
6452
|
+
durationMs: providerResponse.durationMs
|
|
6453
|
+
}) : void 0;
|
|
5384
6454
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
5385
6455
|
let result;
|
|
5386
6456
|
try {
|
|
@@ -5501,7 +6571,12 @@ async function runEvalCase(options) {
|
|
|
5501
6571
|
await cache.set(cacheKey, providerResponse);
|
|
5502
6572
|
}
|
|
5503
6573
|
const outputMessages = providerResponse.outputMessages;
|
|
5504
|
-
const
|
|
6574
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
6575
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
6576
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
6577
|
+
costUsd: providerResponse.costUsd,
|
|
6578
|
+
durationMs: providerResponse.durationMs
|
|
6579
|
+
}) : void 0;
|
|
5505
6580
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
5506
6581
|
try {
|
|
5507
6582
|
return await evaluateCandidate({
|
|
@@ -5574,21 +6649,21 @@ async function evaluateCandidate(options) {
|
|
|
5574
6649
|
}
|
|
5575
6650
|
return {
|
|
5576
6651
|
timestamp: completedAt.toISOString(),
|
|
5577
|
-
|
|
6652
|
+
evalId: evalCase.id,
|
|
5578
6653
|
dataset: evalCase.dataset,
|
|
5579
|
-
|
|
6654
|
+
conversationId: evalCase.conversation_id,
|
|
5580
6655
|
score: score.score,
|
|
5581
6656
|
hits: score.hits,
|
|
5582
6657
|
misses: score.misses,
|
|
5583
|
-
|
|
6658
|
+
candidateAnswer: candidate,
|
|
5584
6659
|
target: target.name,
|
|
5585
6660
|
reasoning: score.reasoning,
|
|
5586
|
-
|
|
5587
|
-
|
|
5588
|
-
|
|
5589
|
-
|
|
5590
|
-
|
|
5591
|
-
|
|
6661
|
+
rawAspects: score.rawAspects,
|
|
6662
|
+
agentProviderRequest,
|
|
6663
|
+
lmProviderRequest,
|
|
6664
|
+
evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
6665
|
+
evaluatorResults,
|
|
6666
|
+
traceSummary
|
|
5592
6667
|
};
|
|
5593
6668
|
}
|
|
5594
6669
|
async function runEvaluatorsForCase(options) {
|
|
@@ -5686,7 +6761,7 @@ async function runEvaluatorList(options) {
|
|
|
5686
6761
|
hits: score2.hits,
|
|
5687
6762
|
misses: score2.misses,
|
|
5688
6763
|
reasoning: score2.reasoning,
|
|
5689
|
-
|
|
6764
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
5690
6765
|
});
|
|
5691
6766
|
}
|
|
5692
6767
|
if (evaluator.type === "code") {
|
|
@@ -5717,11 +6792,11 @@ async function runEvaluatorList(options) {
|
|
|
5717
6792
|
hits: score2.hits,
|
|
5718
6793
|
misses: score2.misses,
|
|
5719
6794
|
reasoning: score2.reasoning,
|
|
5720
|
-
|
|
6795
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
5721
6796
|
});
|
|
5722
6797
|
}
|
|
5723
6798
|
if (evaluator.type === "composite") {
|
|
5724
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
6799
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path15.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
5725
6800
|
const createEvaluator = (memberConfig) => {
|
|
5726
6801
|
switch (memberConfig.type) {
|
|
5727
6802
|
case "llm_judge":
|
|
@@ -5774,8 +6849,8 @@ async function runEvaluatorList(options) {
|
|
|
5774
6849
|
hits: score2.hits,
|
|
5775
6850
|
misses: score2.misses,
|
|
5776
6851
|
reasoning: score2.reasoning,
|
|
5777
|
-
|
|
5778
|
-
|
|
6852
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
6853
|
+
evaluatorResults: mapChildResults(score2.evaluatorResults)
|
|
5779
6854
|
});
|
|
5780
6855
|
}
|
|
5781
6856
|
if (evaluator.type === "tool_trajectory") {
|
|
@@ -5933,22 +7008,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
5933
7008
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
5934
7009
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
5935
7010
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
5936
|
-
const filePath =
|
|
5937
|
-
await (0,
|
|
7011
|
+
const filePath = import_node_path15.default.resolve(directory, filename);
|
|
7012
|
+
await (0, import_promises12.mkdir)(import_node_path15.default.dirname(filePath), { recursive: true });
|
|
5938
7013
|
const payload = {
|
|
5939
7014
|
eval_id: evalCase.id,
|
|
5940
7015
|
question: promptInputs.question,
|
|
5941
7016
|
guidelines: promptInputs.guidelines,
|
|
5942
7017
|
guideline_paths: evalCase.guideline_paths
|
|
5943
7018
|
};
|
|
5944
|
-
await (0,
|
|
7019
|
+
await (0, import_promises12.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
5945
7020
|
}
|
|
5946
7021
|
function sanitizeFilename(value) {
|
|
5947
7022
|
if (!value) {
|
|
5948
7023
|
return "prompt";
|
|
5949
7024
|
}
|
|
5950
7025
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
5951
|
-
return sanitized.length > 0 ? sanitized : (0,
|
|
7026
|
+
return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
|
|
5952
7027
|
}
|
|
5953
7028
|
async function invokeProvider(provider, options) {
|
|
5954
7029
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -6005,22 +7080,22 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
6005
7080
|
}
|
|
6006
7081
|
return {
|
|
6007
7082
|
timestamp: timestamp.toISOString(),
|
|
6008
|
-
|
|
7083
|
+
evalId: evalCase.id,
|
|
6009
7084
|
dataset: evalCase.dataset,
|
|
6010
|
-
|
|
7085
|
+
conversationId: evalCase.conversation_id,
|
|
6011
7086
|
score: 0,
|
|
6012
7087
|
hits: [],
|
|
6013
7088
|
misses: [`Error: ${message}`],
|
|
6014
|
-
|
|
7089
|
+
candidateAnswer: `Error occurred: ${message}`,
|
|
6015
7090
|
target: targetName,
|
|
6016
|
-
|
|
6017
|
-
|
|
6018
|
-
|
|
7091
|
+
rawAspects: [],
|
|
7092
|
+
agentProviderRequest,
|
|
7093
|
+
lmProviderRequest,
|
|
6019
7094
|
error: message
|
|
6020
7095
|
};
|
|
6021
7096
|
}
|
|
6022
7097
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
6023
|
-
const hash = (0,
|
|
7098
|
+
const hash = (0, import_node_crypto3.createHash)("sha256");
|
|
6024
7099
|
hash.update(provider.id);
|
|
6025
7100
|
hash.update(target.name);
|
|
6026
7101
|
hash.update(evalCase.id);
|
|
@@ -6060,8 +7135,8 @@ function mapChildResults(children) {
|
|
|
6060
7135
|
hits: child.hits,
|
|
6061
7136
|
misses: child.misses,
|
|
6062
7137
|
reasoning: child.reasoning,
|
|
6063
|
-
|
|
6064
|
-
|
|
7138
|
+
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
7139
|
+
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
6065
7140
|
}));
|
|
6066
7141
|
}
|
|
6067
7142
|
function computeWeightedMean(entries) {
|
|
@@ -6163,17 +7238,21 @@ function createAgentKernel() {
|
|
|
6163
7238
|
0 && (module.exports = {
|
|
6164
7239
|
CodeEvaluator,
|
|
6165
7240
|
CompositeEvaluator,
|
|
7241
|
+
DEFAULT_EXPLORATION_TOOLS,
|
|
6166
7242
|
LlmJudgeEvaluator,
|
|
6167
7243
|
TEST_MESSAGE_ROLES,
|
|
6168
7244
|
ToolTrajectoryEvaluator,
|
|
7245
|
+
avgToolDurationMs,
|
|
6169
7246
|
buildDirectoryChain,
|
|
6170
7247
|
buildPromptInputs,
|
|
6171
7248
|
buildSearchRoots,
|
|
6172
7249
|
computeTraceSummary,
|
|
6173
7250
|
consumeCodexLogEntries,
|
|
7251
|
+
consumePiLogEntries,
|
|
6174
7252
|
createAgentKernel,
|
|
6175
7253
|
createProvider,
|
|
6176
7254
|
ensureVSCodeSubagents,
|
|
7255
|
+
explorationRatio,
|
|
6177
7256
|
extractCodeBlocks,
|
|
6178
7257
|
fileExists,
|
|
6179
7258
|
findGitRoot,
|
|
@@ -6187,6 +7266,7 @@ function createAgentKernel() {
|
|
|
6187
7266
|
isTestMessageRole,
|
|
6188
7267
|
listTargetNames,
|
|
6189
7268
|
loadEvalCases,
|
|
7269
|
+
mergeExecutionMetrics,
|
|
6190
7270
|
normalizeLineEndings,
|
|
6191
7271
|
readJsonFile,
|
|
6192
7272
|
readTargetDefinitions,
|
|
@@ -6197,6 +7277,8 @@ function createAgentKernel() {
|
|
|
6197
7277
|
resolveTargetDefinition,
|
|
6198
7278
|
runEvalCase,
|
|
6199
7279
|
runEvaluation,
|
|
6200
|
-
subscribeToCodexLogEntries
|
|
7280
|
+
subscribeToCodexLogEntries,
|
|
7281
|
+
subscribeToPiLogEntries,
|
|
7282
|
+
tokensPerTool
|
|
6201
7283
|
});
|
|
6202
7284
|
//# sourceMappingURL=index.cjs.map
|