@agentv/core 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-KPHTMTZ3.js → chunk-E2VSU4WZ.js} +265 -83
- package/dist/chunk-E2VSU4WZ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +82 -71
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -72
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1475 -393
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +227 -33
- package/dist/index.d.ts +227 -33
- package/dist/index.js +1142 -244
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-KPHTMTZ3.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -10,7 +10,7 @@ import {
|
|
|
10
10
|
readTextFile,
|
|
11
11
|
resolveFileReference,
|
|
12
12
|
resolveTargetDefinition
|
|
13
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-E2VSU4WZ.js";
|
|
14
14
|
|
|
15
15
|
// src/evaluation/types.ts
|
|
16
16
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -93,6 +93,53 @@ function computeTraceSummary(messages) {
|
|
|
93
93
|
errorCount: 0
|
|
94
94
|
};
|
|
95
95
|
}
|
|
96
|
+
/**
 * Tool names counted as "exploration" calls by default. Both lower-case and
 * capitalized spellings are listed because tool names are matched exactly.
 */
var DEFAULT_EXPLORATION_TOOLS = [
  "read",
  "grep",
  "glob",
  "search",
  "list",
  "Read",
  "Grep",
  "Glob",
  "WebSearch",
  "WebFetch"
];

/**
 * Fraction of all trace events that were calls to exploration tools.
 *
 * @param {{ eventCount: number, toolCallsByName: Record<string, number> }} summary
 *   Trace summary holding the total event count and per-tool call counts.
 * @param {string[]} [explorationTools] Tool names to count; matched exactly.
 * @returns {number | undefined} `explorationCalls / eventCount`, or
 *   `undefined` when the summary has no events.
 */
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
  if (summary.eventCount === 0) return void 0;
  const callsByName = summary.toolCallsByName;
  let explorationCalls = 0;
  for (const tool of explorationTools) {
    // Only own, numeric entries count. A plain property read would also pick up
    // inherited members (e.g. "constructor", "toString"), which would turn the
    // sum into a string and the ratio into NaN.
    if (!Object.hasOwn(callsByName, tool)) continue;
    const calls = callsByName[tool];
    if (typeof calls === "number") {
      explorationCalls += calls;
    }
  }
  return explorationCalls / summary.eventCount;
}
|
|
116
|
+
/**
 * Average number of tokens (input + output) spent per trace event.
 *
 * @param {{ eventCount: number, tokenUsage?: { input: number, output: number } }} summary
 * @returns {number | undefined} Tokens per event, or `undefined` when token
 *   usage is missing or the summary has no events.
 */
function tokensPerTool(summary) {
  const { tokenUsage, eventCount } = summary;
  if (tokenUsage && eventCount !== 0) {
    return (tokenUsage.input + tokenUsage.output) / eventCount;
  }
  return void 0;
}
|
|
121
|
+
/**
 * Mean duration across every recorded tool call, regardless of tool name.
 *
 * @param {{ toolDurations?: Record<string, Iterable<number>> }} summary
 * @returns {number | undefined} Mean of all recorded durations, or `undefined`
 *   when no durations were recorded at all.
 */
function avgToolDurationMs(summary) {
  const durationsByTool = summary.toolDurations;
  if (!durationsByTool) return void 0;
  let sum = 0;
  let sampleCount = 0;
  Object.values(durationsByTool).forEach((samples) => {
    for (const sample of samples) {
      sum += sample;
      sampleCount += 1;
    }
  });
  return sampleCount === 0 ? void 0 : sum / sampleCount;
}
|
|
134
|
+
/**
 * Overlay provider-reported execution metrics onto a trace summary.
 *
 * Only metrics that are actually present are copied. A metric missing from
 * `metrics` leaves the summary's existing value intact instead of replacing it
 * with `undefined`.
 *
 * @param {object} summary Trace summary to extend; it is not mutated.
 * @param {{ tokenUsage?: object, costUsd?: number, durationMs?: number } | undefined | null} metrics
 * @returns {object} `summary` itself when `metrics` is falsy, otherwise a
 *   shallow copy carrying the provided metrics.
 */
function mergeExecutionMetrics(summary, metrics) {
  if (!metrics) return summary;
  const merged = { ...summary };
  for (const key of ["tokenUsage", "costUsd", "durationMs"]) {
    if (metrics[key] !== void 0) {
      merged[key] = metrics[key];
    }
  }
  return merged;
}
|
|
96
143
|
|
|
97
144
|
// src/evaluation/yaml-parser.ts
|
|
98
145
|
import { readFile as readFile5 } from "node:fs/promises";
|
|
@@ -607,7 +654,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
607
654
|
expected = [];
|
|
608
655
|
for (const item of rawExpected) {
|
|
609
656
|
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
610
|
-
|
|
657
|
+
let args;
|
|
658
|
+
if (item.args === "any") {
|
|
659
|
+
args = "any";
|
|
660
|
+
} else if (isJsonObject2(item.args)) {
|
|
661
|
+
args = item.args;
|
|
662
|
+
}
|
|
663
|
+
expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
|
|
611
664
|
}
|
|
612
665
|
}
|
|
613
666
|
}
|
|
@@ -1767,12 +1820,14 @@ var CliProvider = class {
|
|
|
1767
1820
|
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1768
1821
|
);
|
|
1769
1822
|
}
|
|
1823
|
+
const startTime = Date.now();
|
|
1770
1824
|
const result = await this.runCommand(renderedCommand, {
|
|
1771
1825
|
cwd: this.config.cwd,
|
|
1772
1826
|
env: process.env,
|
|
1773
1827
|
timeoutMs: this.config.timeoutMs,
|
|
1774
1828
|
signal: request.signal
|
|
1775
1829
|
});
|
|
1830
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
1776
1831
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
1777
1832
|
if (request.signal?.aborted) {
|
|
1778
1833
|
throw new Error("CLI provider request was aborted");
|
|
@@ -1791,6 +1846,9 @@ var CliProvider = class {
|
|
|
1791
1846
|
const parsed = this.parseOutputContent(responseContent);
|
|
1792
1847
|
return {
|
|
1793
1848
|
outputMessages: parsed.outputMessages,
|
|
1849
|
+
tokenUsage: parsed.tokenUsage,
|
|
1850
|
+
costUsd: parsed.costUsd,
|
|
1851
|
+
durationMs: parsed.durationMs ?? measuredDurationMs,
|
|
1794
1852
|
raw: {
|
|
1795
1853
|
command: renderedCommand,
|
|
1796
1854
|
stderr: result.stderr,
|
|
@@ -1838,12 +1896,14 @@ var CliProvider = class {
|
|
|
1838
1896
|
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1839
1897
|
);
|
|
1840
1898
|
}
|
|
1899
|
+
const startTime = Date.now();
|
|
1841
1900
|
const result = await this.runCommand(renderedCommand, {
|
|
1842
1901
|
cwd: this.config.cwd,
|
|
1843
1902
|
env: process.env,
|
|
1844
1903
|
timeoutMs: this.config.timeoutMs,
|
|
1845
1904
|
signal: controller.signal
|
|
1846
1905
|
});
|
|
1906
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
1847
1907
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
1848
1908
|
if (controller.signal.aborted) {
|
|
1849
1909
|
throw new Error("CLI provider request was aborted");
|
|
@@ -1865,11 +1925,13 @@ var CliProvider = class {
|
|
|
1865
1925
|
if (missingIds.length > 0) {
|
|
1866
1926
|
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
1867
1927
|
}
|
|
1928
|
+
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
1868
1929
|
const responses = requests.map((request) => {
|
|
1869
1930
|
const evalCaseId = request.evalCaseId;
|
|
1870
1931
|
if (!evalCaseId) {
|
|
1871
1932
|
return {
|
|
1872
1933
|
outputMessages: [],
|
|
1934
|
+
durationMs: perRequestFallbackMs,
|
|
1873
1935
|
raw: {
|
|
1874
1936
|
command: renderedCommand,
|
|
1875
1937
|
stderr: result.stderr,
|
|
@@ -1883,6 +1945,7 @@ var CliProvider = class {
|
|
|
1883
1945
|
if (!parsed) {
|
|
1884
1946
|
return {
|
|
1885
1947
|
outputMessages: [],
|
|
1948
|
+
durationMs: perRequestFallbackMs,
|
|
1886
1949
|
raw: {
|
|
1887
1950
|
command: renderedCommand,
|
|
1888
1951
|
stderr: result.stderr,
|
|
@@ -1894,6 +1957,9 @@ var CliProvider = class {
|
|
|
1894
1957
|
}
|
|
1895
1958
|
return {
|
|
1896
1959
|
outputMessages: parsed.outputMessages,
|
|
1960
|
+
tokenUsage: parsed.tokenUsage,
|
|
1961
|
+
costUsd: parsed.costUsd,
|
|
1962
|
+
durationMs: parsed.durationMs ?? perRequestFallbackMs,
|
|
1897
1963
|
raw: {
|
|
1898
1964
|
command: renderedCommand,
|
|
1899
1965
|
stderr: result.stderr,
|
|
@@ -1911,25 +1977,55 @@ var CliProvider = class {
|
|
|
1911
1977
|
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
1912
1978
|
* If only 'text' is provided, wrap it in outputMessages.
|
|
1913
1979
|
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
1980
|
+
*
|
|
1981
|
+
* Also extracts optional execution metrics:
|
|
1982
|
+
* - token_usage: { input, output, cached? }
|
|
1983
|
+
* - cost_usd: number
|
|
1984
|
+
* - duration_ms: number
|
|
1914
1985
|
*/
|
|
1915
1986
|
parseOutputContent(content) {
|
|
1916
1987
|
try {
|
|
1917
1988
|
const parsed = JSON.parse(content);
|
|
1918
1989
|
if (typeof parsed === "object" && parsed !== null) {
|
|
1919
1990
|
const obj = parsed;
|
|
1991
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
1992
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
1993
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
1920
1994
|
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
1921
1995
|
if (outputMessages && outputMessages.length > 0) {
|
|
1922
|
-
return { outputMessages };
|
|
1996
|
+
return { outputMessages, tokenUsage, costUsd, durationMs };
|
|
1923
1997
|
}
|
|
1924
1998
|
if ("text" in obj) {
|
|
1925
1999
|
const text = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
1926
|
-
return {
|
|
2000
|
+
return {
|
|
2001
|
+
outputMessages: [{ role: "assistant", content: text }],
|
|
2002
|
+
tokenUsage,
|
|
2003
|
+
costUsd,
|
|
2004
|
+
durationMs
|
|
2005
|
+
};
|
|
1927
2006
|
}
|
|
1928
2007
|
}
|
|
1929
2008
|
} catch {
|
|
1930
2009
|
}
|
|
1931
2010
|
return { outputMessages: [{ role: "assistant", content }] };
|
|
1932
2011
|
}
|
|
2012
|
+
/**
|
|
2013
|
+
* Parse token_usage from CLI output.
|
|
2014
|
+
*/
|
|
2015
|
+
parseTokenUsage(tokenUsage) {
|
|
2016
|
+
if (typeof tokenUsage !== "object" || tokenUsage === null) {
|
|
2017
|
+
return void 0;
|
|
2018
|
+
}
|
|
2019
|
+
const obj = tokenUsage;
|
|
2020
|
+
if (typeof obj.input !== "number" || typeof obj.output !== "number") {
|
|
2021
|
+
return void 0;
|
|
2022
|
+
}
|
|
2023
|
+
return {
|
|
2024
|
+
input: obj.input,
|
|
2025
|
+
output: obj.output,
|
|
2026
|
+
cached: typeof obj.cached === "number" ? obj.cached : void 0
|
|
2027
|
+
};
|
|
2028
|
+
}
|
|
1933
2029
|
/**
|
|
1934
2030
|
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
1935
2031
|
*/
|
|
@@ -2006,6 +2102,9 @@ var CliProvider = class {
|
|
|
2006
2102
|
if (records.has(id)) {
|
|
2007
2103
|
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
2008
2104
|
}
|
|
2105
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
2106
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
2107
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
2009
2108
|
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
2010
2109
|
let outputMessages;
|
|
2011
2110
|
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
@@ -2015,7 +2114,10 @@ var CliProvider = class {
|
|
|
2015
2114
|
outputMessages = text ? [{ role: "assistant", content: text }] : [];
|
|
2016
2115
|
}
|
|
2017
2116
|
records.set(id, {
|
|
2018
|
-
outputMessages
|
|
2117
|
+
outputMessages,
|
|
2118
|
+
tokenUsage,
|
|
2119
|
+
costUsd,
|
|
2120
|
+
durationMs
|
|
2019
2121
|
});
|
|
2020
2122
|
}
|
|
2021
2123
|
return records;
|
|
@@ -2331,6 +2433,11 @@ var execAsync2 = promisify2(execCallback);
|
|
|
2331
2433
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
2332
2434
|
var PROMPT_FILENAME = "prompt.md";
|
|
2333
2435
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
2436
|
+
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
2437
|
+
- Do NOT create any additional output files in the workspace.
|
|
2438
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
2439
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
2440
|
+
This is required for evaluation scoring.`;
|
|
2334
2441
|
var CodexProvider = class {
|
|
2335
2442
|
id;
|
|
2336
2443
|
kind = "codex";
|
|
@@ -2355,7 +2462,11 @@ var CodexProvider = class {
|
|
|
2355
2462
|
const workspaceRoot = await this.createWorkspace();
|
|
2356
2463
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2357
2464
|
try {
|
|
2358
|
-
const
|
|
2465
|
+
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
2466
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
|
|
2467
|
+
const promptContent = `${systemPrompt}
|
|
2468
|
+
|
|
2469
|
+
${basePrompt}`;
|
|
2359
2470
|
const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
|
|
2360
2471
|
await writeFile(promptFile, promptContent, "utf8");
|
|
2361
2472
|
const args = this.buildCodexArgs();
|
|
@@ -3039,172 +3150,851 @@ var MockProvider = class {
|
|
|
3039
3150
|
}
|
|
3040
3151
|
};
|
|
3041
3152
|
|
|
3042
|
-
// src/evaluation/providers/
|
|
3153
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
3154
|
+
import { spawn as spawn2 } from "node:child_process";
|
|
3155
|
+
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
3156
|
+
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
3157
|
+
import { mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
|
|
3158
|
+
import { tmpdir as tmpdir2 } from "node:os";
|
|
3043
3159
|
import path10 from "node:path";
|
|
3044
|
-
import {
|
|
3045
|
-
dispatchAgentSession,
|
|
3046
|
-
dispatchBatchAgent,
|
|
3047
|
-
getSubagentRoot,
|
|
3048
|
-
provisionSubagents
|
|
3049
|
-
} from "subagent";
|
|
3050
|
-
|
|
3051
|
-
// src/evaluation/providers/vscode-templates.ts
|
|
3052
|
-
var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
3053
|
-
|
|
3054
|
-
{{userQuery}}
|
|
3055
|
-
|
|
3056
|
-
[[ ## system_instructions ## ]]
|
|
3057
|
-
|
|
3058
|
-
**IMPORTANT**: Follow these exact steps:
|
|
3059
|
-
1. Create and write your complete response to: {{responseFileTmp}}
|
|
3060
|
-
- Do NOT create any additional output files in the workspace.
|
|
3061
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
3062
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3063
|
-
2. When completely finished, run these PowerShell commands to signal completion:
|
|
3064
|
-
\`\`\`
|
|
3065
|
-
Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
|
|
3066
|
-
if (Test-Path subagent.lock) { del subagent.lock }
|
|
3067
|
-
\`\`\`
|
|
3068
|
-
|
|
3069
|
-
Do not proceed to step 2 until your response is completely written to the temporary file.
|
|
3070
|
-
`;
|
|
3071
|
-
var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
3072
|
-
|
|
3073
|
-
{{userQuery}}
|
|
3074
|
-
|
|
3075
|
-
[[ ## system_instructions ## ]]
|
|
3076
3160
|
|
|
3077
|
-
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3161
|
+
// src/evaluation/providers/pi-log-tracker.ts
|
|
3162
|
+
// Keys under which the shared log list and subscriber set are kept on globalThis.
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");

/**
 * Return the array of recorded Pi log entries stored on globalThis,
 * creating it on first use.
 */
function getPiLogStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_LOGS_KEY2]) {
    registry[GLOBAL_LOGS_KEY2] = [];
  }
  return registry[GLOBAL_LOGS_KEY2];
}

/**
 * Return the Set of Pi log listeners stored on globalThis, creating it on
 * first use.
 */
function getSubscriberStore2() {
  const registry = globalThis;
  if (!registry[GLOBAL_SUBSCRIBERS_KEY2]) {
    registry[GLOBAL_SUBSCRIBERS_KEY2] = /* @__PURE__ */ new Set();
  }
  return registry[GLOBAL_SUBSCRIBERS_KEY2];
}

/**
 * Deliver an entry to every current listener. A listener that throws is
 * reported with a warning and does not stop delivery to the others.
 */
function notifySubscribers2(entry) {
  // Iterate over a snapshot so listeners may subscribe/unsubscribe while being notified.
  [...getSubscriberStore2()].forEach((notify) => {
    try {
      notify(entry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi log subscriber failed: ${message}`);
    }
  });
}

/** Append an entry to the shared store, then notify all listeners. */
function recordPiLogEntry(entry) {
  const entries = getPiLogStore();
  entries.push(entry);
  notifySubscribers2(entry);
}

/** Remove and return every entry recorded so far (empty array when none). */
function consumePiLogEntries() {
  const entries = getPiLogStore();
  return entries.length === 0 ? [] : entries.splice(0, entries.length);
}

/**
 * Register a listener for future log entries.
 *
 * @returns {() => void} Function that removes the listener again.
 */
function subscribeToPiLogEntries(listener) {
  const listeners = getSubscriberStore2();
  listeners.add(listener);
  const unsubscribe = () => {
    listeners.delete(listener);
  };
  return unsubscribe;
}
|
|
3085
3213
|
|
|
3086
|
-
// src/evaluation/providers/
|
|
3087
|
-
var
|
|
3214
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
3215
|
+
var WORKSPACE_PREFIX2 = "agentv-pi-";
|
|
3216
|
+
var PROMPT_FILENAME2 = "prompt.md";
|
|
3217
|
+
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
3218
|
+
- Do NOT create any additional output files in the workspace.
|
|
3219
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
3220
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3221
|
+
This is required for evaluation scoring.`;
|
|
3222
|
+
/**
 * Provider that runs the Pi coding agent CLI once per request inside a
 * throwaway temp workspace and converts its JSONL stdout into output messages.
 */
var PiCodingAgentProvider = class {
  id;
  kind = "pi-coding-agent";
  targetName;
  supportsBatch = false;
  config;
  runPi;
  /**
   * @param {string} targetName Target name; used in the provider id and log names.
   * @param {object} config Target settings (executable, provider, model, apiKey, ...).
   * @param {Function} [runner] Function that actually spawns the process; injectable for tests.
   */
  constructor(targetName, config, runner = defaultPiRunner) {
    this.id = `pi-coding-agent:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runPi = runner;
  }
  /**
   * Run the agent for one request.
   *
   * @param {object} request Request with `question`, optional `inputFiles`, `signal`, `evalCaseId`, `attempt`.
   * @returns {Promise<{ raw: object, outputMessages: unknown }>}
   * @throws {Error} When the request is already aborted, the process times out,
   *   or it exits with a non-zero code.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Pi coding agent request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: a logger that cannot be created is skipped.
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
      await writeFile2(promptFile, request.question, "utf8");
      const args = this.buildPiArgs(request.question, inputFiles);
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executePi(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail2(result.stderr, result.stdout);
        const prefix = `Pi coding agent exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parsePiJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      // NOTE(review): the result is unused; the call is kept in case the helper
      // validates the messages (it is defined outside this block) — confirm.
      const assistantText = extractAssistantText2(outputMessages);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          // Never hand the API key back to callers via the recorded arguments.
          args: this.redactArgs(args),
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages
      };
    } finally {
      // A failing log close must neither skip workspace cleanup nor replace the
      // real result/error of the run.
      try {
        await logger?.close();
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to close Pi stream log: ${message}`);
      }
      await this.cleanupWorkspace(workspaceRoot);
    }
  }
  /** Working directory for the process: the configured `cwd` (resolved) or the temp workspace. */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return path10.resolve(this.config.cwd);
  }
  /**
   * Return a copy of `args` with the value following each `--api-key` flag masked.
   *
   * @param {string[]} args
   * @returns {string[]}
   */
  redactArgs(args) {
    return args.map((arg, index) => index > 0 && args[index - 1] === "--api-key" ? "[REDACTED]" : arg);
  }
  /**
   * Build the CLI argument list: provider/model/key flags, fixed JSON/print/no-session
   * flags, optional tools/thinking/extra args, `@file` attachments, and finally the
   * prompt (system prompt, blank line, user prompt) with `@` symbols escaped.
   */
  buildPiArgs(prompt, inputFiles) {
    const args = [];
    if (this.config.provider) {
      args.push("--provider", this.config.provider);
    }
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.apiKey) {
      args.push("--api-key", this.config.apiKey);
    }
    args.push("--mode", "json");
    args.push("--print");
    args.push("--no-session");
    if (this.config.tools) {
      args.push("--tools", this.config.tools);
    }
    if (this.config.thinking) {
      args.push("--thinking", this.config.thinking);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    if (inputFiles && inputFiles.length > 0) {
      for (const file of inputFiles) {
        args.push(`@${file}`);
      }
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
    const fullPrompt = `${systemPrompt}

${prompt}`;
    const escapedPrompt = escapeAtSymbols(fullPrompt);
    args.push(escapedPrompt);
    return args;
  }
  /**
   * Invoke the runner, translating a missing executable (ENOENT) into an
   * actionable error. Any other error is rethrown unchanged.
   */
  async executePi(args, cwd, signal, logger) {
    try {
      return await this.runPi({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /**
   * Copy of `process.env` with the configured API key exported under the
   * variable matching the provider ("google" when none is configured).
   * Providers not listed here get no extra variable.
   */
  buildEnv() {
    const env = { ...process.env };
    if (this.config.apiKey) {
      const provider = this.config.provider?.toLowerCase() ?? "google";
      switch (provider) {
        case "google":
        case "gemini":
          env.GEMINI_API_KEY = this.config.apiKey;
          break;
        case "anthropic":
          env.ANTHROPIC_API_KEY = this.config.apiKey;
          break;
        case "openai":
          env.OPENAI_API_KEY = this.config.apiKey;
          break;
        case "groq":
          env.GROQ_API_KEY = this.config.apiKey;
          break;
        case "xai":
          env.XAI_API_KEY = this.config.apiKey;
          break;
        case "openrouter":
          env.OPENROUTER_API_KEY = this.config.apiKey;
          break;
      }
    }
    return env;
  }
  /** Create a fresh temp directory for this run. */
  async createWorkspace() {
    return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
  }
  /** Remove the temp workspace; failures are deliberately ignored (best-effort cleanup). */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await rm2(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
  /** Directory for stream logs: configured `logDir` or `.agentv/logs/pi-coding-agent` under the process cwd. */
  resolveLogDirectory() {
    if (this.config.logDir) {
      return path10.resolve(this.config.logDir);
    }
    return path10.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
  }
  /**
   * Create the per-request stream logger and register its file with the log
   * tracker. Returns `undefined` (after a warning) when the log directory or
   * the logger cannot be created.
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return void 0;
    }
    try {
      await mkdir2(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
    try {
      const logger = await PiStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordPiLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
|
|
3423
|
+
/**
 * Appends a timestamped, line-oriented log of the Pi process' stdout/stderr
 * to a file. Chunks are buffered per stream and written one complete line at
 * a time; partial trailing lines are flushed on close().
 */
var PiStreamLogger = class _PiStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  /**
   * @param {string} filePath Log file to append to.
   * @param {string} format "json" selects formatPiJsonLog; anything else formatPiLogMessage.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream2(filePath, { flags: "a" });
    // Without a listener, an asynchronous write failure would surface as an
    // unhandled 'error' event. Logging is best-effort, so warn once instead.
    let warned = false;
    this.stream.on("error", (error) => {
      if (warned) return;
      warned = true;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi stream log write failed for ${filePath}: ${message}`);
    });
  }
  /**
   * Create a logger and write the header block (title, target, optional eval
   * id and 1-based attempt, start time) followed by one blank line.
   *
   * @param {{ filePath: string, format: string, targetName: string, evalCaseId?: string, attempt?: number }} options
   * @returns {Promise<_PiStreamLogger>}
   */
  static async create(options) {
    const logger = new _PiStreamLogger(options.filePath, options.format);
    const header = [
      "# Pi Coding Agent stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional lines; the trailing "" is the blank
      // separator and must survive (a truthiness filter would remove it).
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffer a stdout chunk and write out any complete lines. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffer a stderr chunk and write out any complete lines. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Flush everything still buffered and end the file stream.
   *
   * @returns {Promise<void>} Resolves once the stream has ended; rejects if the
   *   stream emits an error while closing.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Write each given line followed by a newline, without any formatting. */
  writeLines(lines) {
    for (const line of lines) {
      this.stream.write(`${line}
`);
    }
  }
  /** Format one raw line and write it with a trailing newline; blank lines are skipped. */
  writeFormatted(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.stream.write(formatted);
      this.stream.write("\n");
    }
  }
  /** Write all complete lines buffered for `source`, keeping the unterminated tail buffered. */
  flushBuffer(source) {
    const isStdout = source === "stdout";
    const lines = (isStdout ? this.stdoutBuffer : this.stderrBuffer).split(/\r?\n/);
    // The last element is whatever follows the final newline: an incomplete line.
    const remainder = lines.pop() ?? "";
    if (isStdout) {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /**
   * Render one line as `[+elapsed] [source] message`.
   *
   * @returns {string | undefined} `undefined` for lines that are blank after trimming.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
  }
  /** Write any unterminated trailing text (stdout first, then stderr) and clear both buffers. */
  flushRemainder() {
    const pending = [
      ["stdout", this.stdoutBuffer],
      ["stderr", this.stderrBuffer]
    ];
    for (const [source, buffered] of pending) {
      const remainder = buffered.trim();
      if (remainder.length > 0) {
        this.writeFormatted(remainder, source);
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
|
|
3517
|
+
/**
 * Builds a log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 *
 * The timestamp is the current ISO time with ":" and "." replaced by "-".
 * `request.evalCaseId` falls back to "pi" when nullish, and `request.attempt`
 * is rendered one-based when it is defined.
 */
function buildLogFilename2(request, targetName) {
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const caseId = sanitizeForFilename2(request.evalCaseId ?? "pi");
  const attemptPart = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
  const safeTarget = sanitizeForFilename2(targetName);
  const nonce = randomUUID2().slice(0, 8);
  return [stamp, safeTarget, `${caseId}${attemptPart}`, `${nonce}.log`].join("_");
}
|
|
3524
|
+
/**
 * Replaces each run of characters outside `A-Z a-z 0-9 . _ -` with a single
 * underscore. Returns "pi" when the result is empty.
 */
function sanitizeForFilename2(value) {
  const safe = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (safe.length === 0) {
    return "pi";
  }
  return safe;
}
|
|
3528
|
+
/**
 * Formats the whole seconds elapsed since `startedAt` (epoch milliseconds)
 * as `MM:SS`, or as `HH:MM:SS` once at least one hour has passed.
 */
function formatElapsed2(startedAt) {
  const pad = (value) => value.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const seconds = totalSeconds % 60;
  const fields = hours > 0 ? [hours, minutes, seconds] : [minutes, seconds];
  return fields.map((field) => pad(field)).join(":");
}
|
|
3538
|
+
/**
 * Produces the human-readable log text for one raw output line.
 *
 * When the line parses as JSON and `summarizePiEvent` yields a summary, that
 * summary is returned. Otherwise the raw line is returned, prefixed with
 * "stderr: " when `source` is "stderr".
 */
function formatPiLogMessage(rawLine, source) {
  const event = tryParseJsonValue2(rawLine);
  const summary = event ? summarizePiEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
3551
|
+
/**
 * Pretty-prints a JSON line with two-space indentation.
 *
 * The raw line is returned unchanged when it does not parse, parses to a
 * falsy value, or cannot be serialized again.
 */
function formatPiJsonLog(rawLine) {
  const value = tryParseJsonValue2(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      return rawLine;
    }
  }
  return rawLine;
}
|
|
3562
|
+
/**
 * Produces a one-line summary of a parsed Pi event for the stream log.
 *
 * @param {unknown} event - A value parsed from one output line.
 * @returns {string | undefined} `undefined` when `event` is not an object or
 *   has no string `type`; otherwise:
 *   - `message_start` / `message_end`: `"<type>: <role>"`, or just the type
 *     when the message carries no role;
 *   - `message_update` with a string `text_delta`: `"text_delta: <preview>"`,
 *     where the preview is cut to 50 characters plus "...";
 *   - any other `message_update`: `"message_update: <eventType>"`, or just
 *     "message_update" when the nested event has no type;
 *   - every other event type: the type itself.
 */
function summarizePiEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const record = event;
  const type = typeof record.type === "string" ? record.type : void 0;
  if (!type) {
    return void 0;
  }
  switch (type) {
    case "message_start":
    case "message_end": {
      const role = record.message?.role;
      // Avoid writing the literal text "undefined" when the role is absent.
      return role === void 0 || role === null ? type : `${type}: ${role}`;
    }
    case "message_update": {
      const update = record.assistantMessageEvent;
      const eventType = update?.type;
      if (eventType === "text_delta" && typeof update.delta === "string") {
        const { delta } = update;
        const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
        return `text_delta: ${preview}`;
      }
      // Same rule as above: omit the suffix rather than print "undefined".
      return eventType === void 0 || eventType === null ? type : `message_update: ${eventType}`;
    }
    default:
      // agent_start, agent_end, turn_start, turn_end and unknown types are
      // all summarized by their type name.
      return type;
  }
}
|
|
3602
|
+
/**
 * Parses `rawLine` as JSON, returning `undefined` instead of throwing when
 * the text is not valid JSON.
 */
function tryParseJsonValue2(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
3609
|
+
/**
 * Parses JSON-lines output into an array of values.
 *
 * Blank lines and lines that are not valid JSON are skipped.
 *
 * @throws {Error} When the output is empty or whitespace only, or when no
 *   line could be parsed.
 */
function parsePiJsonl(output) {
  const body = output.trim();
  if (body.length === 0) {
    throw new Error("Pi coding agent produced no output");
  }
  const events = [];
  for (const rawLine of body.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Non-JSON lines are deliberately ignored.
    }
  }
  if (events.length === 0) {
    throw new Error("Pi coding agent produced no valid JSON output");
  }
  return events;
}
|
|
3627
|
+
/**
 * Collects the converted output messages from a list of parsed events.
 *
 * The last `agent_end` event whose `messages` is an array wins: its messages
 * are converted with `convertPiMessage` and unconvertible ones are dropped.
 * When no such event exists, the `message` of every `turn_end` event is
 * converted instead, in event order.
 */
function extractOutputMessages(events) {
  const isObject = (value) => Boolean(value) && typeof value === "object";
  const finalEvent = events
    .slice()
    .reverse()
    .find(
      (event) => isObject(event) && event.type === "agent_end" && Array.isArray(event.messages)
    );
  if (finalEvent) {
    return finalEvent.messages
      .map((message) => convertPiMessage(message))
      .filter((converted) => converted !== void 0);
  }
  return events
    .filter((event) => isObject(event) && event.type === "turn_end")
    .map((event) => convertPiMessage(event.message))
    .filter(Boolean);
}
|
|
3659
|
+
/**
 * Converts one message object into `{ role, content, toolCalls, timestamp,
 * metadata }`.
 *
 * Returns `undefined` when `message` is not an object or has no string role.
 * A numeric timestamp becomes an ISO string, a string timestamp is passed
 * through, and anything else becomes `undefined`. `toolCalls` and `metadata`
 * are `undefined` when empty; metadata copies the truthy `api`, `provider`,
 * `model`, `usage` and `stopReason` fields.
 */
function convertPiMessage(message) {
  if (!message || typeof message !== "object") {
    return void 0;
  }
  const source = message;
  const { role } = source;
  if (typeof role !== "string") {
    return void 0;
  }
  const content = extractTextContent(source.content);
  const toolCalls = extractToolCalls(source.content);
  let timestamp;
  if (typeof source.timestamp === "number") {
    timestamp = new Date(source.timestamp).toISOString();
  } else if (typeof source.timestamp === "string") {
    timestamp = source.timestamp;
  }
  const metadata = {};
  for (const key of ["api", "provider", "model", "usage", "stopReason"]) {
    if (source[key]) {
      metadata[key] = source[key];
    }
  }
  const hasMetadata = Object.keys(metadata).length > 0;
  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
    timestamp,
    metadata: hasMetadata ? metadata : void 0
  };
}
|
|
3685
|
+
/**
 * Extracts the text of a message's content.
 *
 * A string is returned as is. For an array, the `text` of every
 * `{ type: "text", text: string }` part is joined with newlines. Returns
 * `undefined` for any other input or when the array has no text parts.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const texts = content
    .filter(
      (part) =>
        Boolean(part) &&
        typeof part === "object" &&
        part.type === "text" &&
        typeof part.text === "string"
    )
    .map((part) => part.text);
  return texts.length === 0 ? void 0 : texts.join("\n");
}
|
|
3704
|
+
/**
 * Collects the tool calls found in a message's content array.
 *
 * Every `tool_use` part with a string name becomes `{ tool, input, id }`.
 * A `tool_result` part attaches its `content` as `output` to the first
 * already-collected call whose id equals its `tool_use_id`. Returns an empty
 * array when `content` is not an array.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    if (part.type === "tool_use" && typeof part.name === "string") {
      calls.push({
        tool: part.name,
        input: part.input,
        id: typeof part.id === "string" ? part.id : void 0
      });
    } else if (part.type === "tool_result" && typeof part.tool_use_id === "string") {
      const position = calls.findIndex((call) => call.id === part.tool_use_id);
      if (position !== -1) {
        calls[position] = { ...calls[position], output: part.content };
      }
    }
  }
  return calls;
}
|
|
3734
|
+
/**
 * Returns the content of the last assistant message that has truthy content.
 *
 * String content is returned directly; any other content is JSON-serialized.
 * Returns an empty string when no such message exists.
 */
function extractAssistantText2(messages) {
  let index = messages.length;
  while (index > 0) {
    index -= 1;
    const entry = messages[index];
    if (entry.role !== "assistant" || !entry.content) {
      continue;
    }
    return typeof entry.content === "string" ? entry.content : JSON.stringify(entry.content);
  }
  return "";
}
|
|
3746
|
+
/**
 * Rewrites every `@[label]:` marker in `prompt` as `[[label]]:`.
 */
function escapeAtSymbols(prompt) {
  const marker = /@\[([^\]]+)\]:/g;
  return prompt.replace(marker, (_match, label) => `[[${label}]]:`);
}
|
|
3749
|
+
/**
 * Picks the text to attach to an error: trimmed stderr when it is non-empty,
 * otherwise trimmed stdout, otherwise `undefined`.
 */
function pickDetail2(stderr, stdout) {
  for (const text of [stderr, stdout]) {
    const detail = text.trim();
    if (detail.length > 0) {
      return detail;
    }
  }
  return void 0;
}
|
|
3757
|
+
/**
 * Returns ` after <n>s` for a positive timeout, with the milliseconds rounded
 * up to whole seconds, and an empty string for a missing or non-positive one.
 */
function formatTimeoutSuffix3(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && timeoutMs > 0;
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
|
|
3764
|
+
/**
 * Runs the Pi executable as a child process and collects its output.
 *
 * `options.executable` may contain extra whitespace-separated arguments
 * (for example "node cli.js"); they are placed in front of `options.args`.
 * The child is started without a shell, its stdin is closed immediately, and
 * it receives SIGTERM when `options.signal` aborts or `options.timeoutMs`
 * elapses.
 *
 * @param {object} options
 * @param {string} options.executable - Command, optionally followed by arguments.
 * @param {string[]} options.args - Arguments appended after the embedded ones.
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {object} [options.env] - Environment for the child.
 * @param {AbortSignal} [options.signal] - Aborting it terminates the child.
 * @param {number} [options.timeoutMs] - Positive values enable the timeout.
 * @param {(chunk: string) => void} [options.onStdoutChunk] - Called per stdout chunk.
 * @param {(chunk: string) => void} [options.onStderrChunk] - Called per stderr chunk.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   Resolves when the child closes; `exitCode` is -1 when the child reported
 *   no numeric exit code. Rejects when the child emits `error`.
 */
async function defaultPiRunner(options) {
  return await new Promise((resolve, reject) => {
    // Trim first: leading whitespace would otherwise yield an empty first
    // token and make spawn() fail on an empty command name.
    const [executable, ...executableArgs] = options.executable.trim().split(/\s+/);
    const allArgs = [...executableArgs, ...options.args];
    const child = spawn2(executable, allArgs, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: false
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let the pending timer keep the process alive by itself.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
      options.onStderrChunk?.(chunk);
    });
    // Nothing is sent on stdin; close it right away.
    child.stdin.end();
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
|
|
3831
|
+
|
|
3832
|
+
// src/evaluation/providers/vscode.ts
|
|
3833
|
+
import path11 from "node:path";
|
|
3834
|
+
import {
|
|
3835
|
+
dispatchAgentSession,
|
|
3836
|
+
dispatchBatchAgent,
|
|
3837
|
+
getSubagentRoot,
|
|
3838
|
+
provisionSubagents
|
|
3839
|
+
} from "subagent";
|
|
3840
|
+
|
|
3841
|
+
// src/evaluation/providers/vscode-templates.ts
|
|
3842
|
+
// Request template handed to dispatchAgentSession as `requestTemplate` for single requests.
// It carries the {{userQuery}}, {{responseFileTmp}} and {{responseFileFinal}} placeholders.
var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
3843
|
+
|
|
3844
|
+
{{userQuery}}
|
|
3845
|
+
|
|
3846
|
+
[[ ## system_instructions ## ]]
|
|
3847
|
+
|
|
3848
|
+
**IMPORTANT**: Follow these exact steps:
|
|
3849
|
+
1. Create and write your complete response to: {{responseFileTmp}}
|
|
3850
|
+
- Do NOT create any additional output files in the workspace.
|
|
3851
|
+
- All intended file outputs/changes MUST be written in your response file.
|
|
3852
|
+
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3853
|
+
2. When completely finished, run these PowerShell commands to signal completion:
|
|
3854
|
+
\`\`\`
|
|
3855
|
+
Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
|
|
3856
|
+
if (Test-Path subagent.lock) { del subagent.lock }
|
|
3857
|
+
\`\`\`
|
|
3858
|
+
|
|
3859
|
+
Do not proceed to step 2 until your response is completely written to the temporary file.
|
|
3860
|
+
`;
|
|
3861
|
+
// Request template handed to dispatchBatchAgent as `requestTemplate` for batch requests.
// It carries the {{userQuery}}, {{responseFileTmp}} and {{responseFileFinal}} placeholders.
var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
3862
|
+
|
|
3863
|
+
{{userQuery}}
|
|
3864
|
+
|
|
3865
|
+
[[ ## system_instructions ## ]]
|
|
3866
|
+
|
|
3867
|
+
**IMPORTANT**: Follow these exact steps:
|
|
3868
|
+
1. Create and write your complete response to: {{responseFileTmp}}
|
|
3869
|
+
- Do NOT create any additional output files in the workspace.
|
|
3870
|
+
- All intended file outputs/changes MUST be written in your response file.
|
|
3871
|
+
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
3872
|
+
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
3873
|
+
3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
|
|
3874
|
+
`;
|
|
3875
|
+
|
|
3876
|
+
// src/evaluation/providers/vscode.ts
|
|
3877
|
+
/**
 * Provider that dispatches evaluation requests to a VS Code subagent and
 * reads the answer back from the response file(s) the session reports.
 *
 * `invoke` handles one request; `invokeBatch` dispatches several requests in
 * one session and returns one response per request, in request order.
 */
var VSCodeProvider = class {
  id;
  kind;
  targetName;
  supportsBatch = true;
  config;
  constructor(targetName, config, kind) {
    this.id = `${kind}:${targetName}`;
    this.kind = kind;
    this.targetName = targetName;
    this.config = config;
  }
  /**
   * Dispatches a single request.
   *
   * Throws when the request is already aborted, when the session exits with
   * a non-zero code, or when it reports no response file. In dry-run mode no
   * file is read and `outputMessages` is empty.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("VS Code provider request was aborted before dispatch");
    }
    const { config } = this;
    const inputFiles = normalizeAttachments(request.inputFiles);
    const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
    const session = await dispatchAgentSession({
      userQuery: promptContent,
      extraAttachments: inputFiles,
      requestTemplate: AGENTV_REQUEST_TEMPLATE,
      wait: config.waitForResponse,
      dryRun: config.dryRun,
      vscodeCmd: config.command,
      subagentRoot: config.subagentRoot,
      workspaceTemplate: config.workspaceTemplate,
      silent: true
    });
    const succeeded = session.exitCode === 0 && Boolean(session.responseFile);
    if (!succeeded) {
      throw new Error(session.error ?? "VS Code subagent did not produce a response");
    }
    const raw = { session, inputFiles };
    if (config.dryRun) {
      return { outputMessages: [], raw };
    }
    const responseText = await readTextFile(session.responseFile);
    return {
      outputMessages: [{ role: "assistant", content: responseText }],
      raw
    };
  }
  /**
   * Dispatches all requests in one batch session.
   *
   * Throws when the session fails, reports no response files, or (outside
   * dry-run mode) returns a different number of response files than there
   * were requests.
   */
  async invokeBatch(requests) {
    if (requests.length === 0) {
      return [];
    }
    const { config } = this;
    const normalizedRequests = requests.map((request) => {
      return { request, inputFiles: normalizeAttachments(request.inputFiles) };
    });
    const combinedInputFiles = mergeAttachments(
      normalizedRequests.map((entry) => entry.inputFiles)
    );
    const userQueries = normalizedRequests.map((entry) =>
      buildPromptDocument2(entry.request, entry.inputFiles, entry.request.guideline_patterns)
    );
    const session = await dispatchBatchAgent({
      userQueries,
      extraAttachments: combinedInputFiles,
      requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
      wait: config.waitForResponse,
      dryRun: config.dryRun,
      vscodeCmd: config.command,
      subagentRoot: config.subagentRoot,
      workspaceTemplate: config.workspaceTemplate,
      silent: true
    });
    const succeeded = session.exitCode === 0 && Boolean(session.responseFiles);
    if (!succeeded) {
      throw new Error(session.error ?? "VS Code subagent did not produce batch responses");
    }
    if (config.dryRun) {
      return normalizedRequests.map((entry) => {
        return {
          outputMessages: [],
          raw: {
            session,
            inputFiles: entry.inputFiles,
            allInputFiles: combinedInputFiles
          }
        };
      });
    }
    const { responseFiles } = session;
    if (responseFiles.length !== requests.length) {
      throw new Error(
        `VS Code batch returned ${responseFiles.length} responses for ${requests.length} requests`
      );
    }
    const responses = [];
    // Files are read one after another, keeping responses in request order.
    for (let index = 0; index < responseFiles.length; index += 1) {
      const responseFile = responseFiles[index];
      const responseText = await readTextFile(responseFile);
      responses.push({
        outputMessages: [{ role: "assistant", content: responseText }],
        raw: {
          session,
          inputFiles: normalizedRequests[index]?.inputFiles,
          allInputFiles: combinedInputFiles,
          responseFile
        }
      });
    }
    return responses;
  }
};
|
|
3988
|
+
function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
3989
|
+
const parts = [];
|
|
3990
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
3991
|
+
parts.push(request.systemPrompt.trim());
|
|
3992
|
+
}
|
|
3993
|
+
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
3994
|
+
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
3995
|
+
const nonGuidelineAttachments = attachmentFiles.filter((file) => !guidelineFiles.includes(file));
|
|
3996
|
+
const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
|
|
3997
|
+
if (prereadBlock.length > 0) {
|
|
3208
3998
|
parts.push("\n", prereadBlock);
|
|
3209
3999
|
}
|
|
3210
4000
|
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
@@ -3215,7 +4005,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
3215
4005
|
return "";
|
|
3216
4006
|
}
|
|
3217
4007
|
const buildList = (files) => files.map((absolutePath) => {
|
|
3218
|
-
const fileName =
|
|
4008
|
+
const fileName = path11.basename(absolutePath);
|
|
3219
4009
|
const fileUri = pathToFileUri2(absolutePath);
|
|
3220
4010
|
return `* [${fileName}](${fileUri})`;
|
|
3221
4011
|
});
|
|
@@ -3240,8 +4030,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3240
4030
|
}
|
|
3241
4031
|
const unique = /* @__PURE__ */ new Map();
|
|
3242
4032
|
for (const attachment of attachments) {
|
|
3243
|
-
const absolutePath =
|
|
3244
|
-
const normalized = absolutePath.split(
|
|
4033
|
+
const absolutePath = path11.resolve(attachment);
|
|
4034
|
+
const normalized = absolutePath.split(path11.sep).join("/");
|
|
3245
4035
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3246
4036
|
if (!unique.has(absolutePath)) {
|
|
3247
4037
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3256,7 +4046,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3256
4046
|
}
|
|
3257
4047
|
const unique = /* @__PURE__ */ new Map();
|
|
3258
4048
|
for (const attachment of attachments) {
|
|
3259
|
-
const absolutePath =
|
|
4049
|
+
const absolutePath = path11.resolve(attachment);
|
|
3260
4050
|
if (!unique.has(absolutePath)) {
|
|
3261
4051
|
unique.set(absolutePath, absolutePath);
|
|
3262
4052
|
}
|
|
@@ -3264,7 +4054,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3264
4054
|
return Array.from(unique.values());
|
|
3265
4055
|
}
|
|
3266
4056
|
function pathToFileUri2(filePath) {
|
|
3267
|
-
const absolutePath =
|
|
4057
|
+
const absolutePath = path11.isAbsolute(filePath) ? filePath : path11.resolve(filePath);
|
|
3268
4058
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3269
4059
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3270
4060
|
return `file:///${normalizedPath}`;
|
|
@@ -3277,7 +4067,7 @@ function normalizeAttachments(attachments) {
|
|
|
3277
4067
|
}
|
|
3278
4068
|
const deduped = /* @__PURE__ */ new Set();
|
|
3279
4069
|
for (const attachment of attachments) {
|
|
3280
|
-
deduped.add(
|
|
4070
|
+
deduped.add(path11.resolve(attachment));
|
|
3281
4071
|
}
|
|
3282
4072
|
return Array.from(deduped);
|
|
3283
4073
|
}
|
|
@@ -3286,7 +4076,7 @@ function mergeAttachments(all) {
|
|
|
3286
4076
|
for (const list of all) {
|
|
3287
4077
|
if (!list) continue;
|
|
3288
4078
|
for (const inputFile of list) {
|
|
3289
|
-
deduped.add(
|
|
4079
|
+
deduped.add(path11.resolve(inputFile));
|
|
3290
4080
|
}
|
|
3291
4081
|
}
|
|
3292
4082
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -3335,7 +4125,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3335
4125
|
// src/evaluation/providers/targets-file.ts
|
|
3336
4126
|
import { constants as constants3 } from "node:fs";
|
|
3337
4127
|
import { access as access3, readFile as readFile6 } from "node:fs/promises";
|
|
3338
|
-
import
|
|
4128
|
+
import path12 from "node:path";
|
|
3339
4129
|
import { parse as parse3 } from "yaml";
|
|
3340
4130
|
function isRecord(value) {
|
|
3341
4131
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -3372,7 +4162,7 @@ async function fileExists3(filePath) {
|
|
|
3372
4162
|
}
|
|
3373
4163
|
}
|
|
3374
4164
|
async function readTargetDefinitions(filePath) {
|
|
3375
|
-
const absolutePath =
|
|
4165
|
+
const absolutePath = path12.resolve(filePath);
|
|
3376
4166
|
if (!await fileExists3(absolutePath)) {
|
|
3377
4167
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3378
4168
|
}
|
|
@@ -3404,6 +4194,8 @@ function createProvider(target) {
|
|
|
3404
4194
|
return new CliProvider(target.name, target.config);
|
|
3405
4195
|
case "codex":
|
|
3406
4196
|
return new CodexProvider(target.name, target.config);
|
|
4197
|
+
case "pi-coding-agent":
|
|
4198
|
+
return new PiCodingAgentProvider(target.name, target.config);
|
|
3407
4199
|
case "mock":
|
|
3408
4200
|
return new MockProvider(target.name, target.config);
|
|
3409
4201
|
case "vscode":
|
|
@@ -3423,6 +4215,74 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
3423
4215
|
// src/evaluation/evaluators.ts
|
|
3424
4216
|
import { generateText as generateText2 } from "ai";
|
|
3425
4217
|
import { z } from "zod";
|
|
4218
|
+
|
|
4219
|
+
// src/runtime/exec.ts
|
|
4220
|
+
/**
 * Returns `globalThis.Bun.spawn` when it exists and is a function, and
 * `undefined` otherwise.
 */
function getBunSpawn() {
  const candidate = globalThis.Bun?.spawn;
  if (typeof candidate !== "function") {
    return void 0;
  }
  return candidate;
}
|
|
4224
|
+
/**
 * Runs `command` through a shell, feeds `stdinPayload` to its stdin and
 * collects stdout, stderr and the exit code.
 *
 * Uses `Bun.spawn` (via `sh -c`) when it is available and
 * `node:child_process.spawn` with `shell: true` otherwise.
 *
 * @param {string} command - Shell command line to execute.
 * @param {string} stdinPayload - Text written to the child's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} [options] - Working directory
 *   and optional timeout; a falsy `timeoutMs` disables the timeout.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   On the Node path `exitCode` is -1 when the child ended without a numeric
 *   exit code (terminated by a signal).
 * @throws {Error} "Process timed out after <n>ms" when the timeout fires, on
 *   both runtimes; spawn errors are propagated.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const { cwd, timeoutMs } = options;
  const timeoutError = () => new Error(`Process timed out after ${timeoutMs}ms`);
  const bunSpawn = getBunSpawn();
  if (bunSpawn) {
    const proc = bunSpawn({
      cmd: ["sh", "-c", command],
      cwd,
      stdin: new TextEncoder().encode(stdinPayload),
      stdout: "pipe",
      stderr: "pipe"
    });
    let timedOut = false;
    const timeout = timeoutMs ? setTimeout(() => {
      timedOut = true;
      proc.kill();
    }, timeoutMs) : void 0;
    try {
      // Drain both pipes concurrently: reading stdout to the end before
      // touching stderr can stall a child that fills the stderr pipe first.
      const [stdout, stderr, exitCode] = await Promise.all([
        new Response(proc.stdout).text(),
        new Response(proc.stderr).text(),
        proc.exited
      ]);
      if (timedOut) {
        // Report timeouts the same way the Node path does.
        throw timeoutError();
      }
      return { stdout, stderr, exitCode };
    } finally {
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
    }
  }
  const { spawn: spawn3 } = await import("node:child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn3(command, {
      shell: true,
      cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    let stdout = "";
    let stderr = "";
    const timeout = timeoutMs ? setTimeout(() => {
      child.kill();
      reject(timeoutError());
    }, timeoutMs) : void 0;
    const clearTimer = () => {
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
    };
    child.stdout?.on("data", (data) => {
      stdout += data.toString();
    });
    child.stderr?.on("data", (data) => {
      stderr += data.toString();
    });
    child.on("error", (error) => {
      clearTimer();
      reject(error);
    });
    // "close" fires after the stdio streams have ended; "exit" may fire while
    // output is still buffered, which would truncate stdout/stderr.
    child.on("close", (code) => {
      clearTimer();
      // A null code means the child was terminated by a signal; that must not
      // be reported as a successful exit code 0.
      resolve({ stdout, stderr, exitCode: code ?? -1 });
    });
    // A child that exits without reading stdin makes the write fail with
    // EPIPE; the exit code already carries the outcome, so the stream error
    // is ignored instead of crashing the process as an unhandled 'error'.
    child.stdin?.on("error", () => {});
    child.stdin?.write(stdinPayload);
    child.stdin?.end();
  });
}
|
|
4284
|
+
|
|
4285
|
+
// src/evaluation/evaluators.ts
|
|
3426
4286
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3427
4287
|
|
|
3428
4288
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -3698,17 +4558,17 @@ var CodeEvaluator = class {
|
|
|
3698
4558
|
const inputPayload = JSON.stringify(
|
|
3699
4559
|
{
|
|
3700
4560
|
question: context.evalCase.question,
|
|
3701
|
-
|
|
3702
|
-
|
|
3703
|
-
|
|
3704
|
-
|
|
3705
|
-
|
|
3706
|
-
|
|
3707
|
-
|
|
3708
|
-
(
|
|
4561
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
4562
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
4563
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
4564
|
+
candidateAnswer: context.candidate,
|
|
4565
|
+
outputMessages: context.outputMessages ?? null,
|
|
4566
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
4567
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
4568
|
+
(path14) => !context.evalCase.guideline_paths.includes(path14)
|
|
3709
4569
|
),
|
|
3710
|
-
|
|
3711
|
-
|
|
4570
|
+
inputMessages: context.evalCase.input_messages,
|
|
4571
|
+
traceSummary: context.traceSummary ?? null
|
|
3712
4572
|
},
|
|
3713
4573
|
null,
|
|
3714
4574
|
2
|
|
@@ -3778,43 +4638,17 @@ function calculateRubricScore(result, rubrics) {
|
|
|
3778
4638
|
return { score, verdict, hits, misses };
|
|
3779
4639
|
}
|
|
3780
4640
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
3781
|
-
const {
|
|
3782
|
-
|
|
3783
|
-
|
|
3784
|
-
shell: true,
|
|
3785
|
-
cwd
|
|
3786
|
-
});
|
|
3787
|
-
let stdout = "";
|
|
3788
|
-
let stderr = "";
|
|
3789
|
-
const timeout = agentTimeoutMs ? setTimeout(() => {
|
|
3790
|
-
child.kill();
|
|
3791
|
-
reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
|
|
3792
|
-
}, agentTimeoutMs) : void 0;
|
|
3793
|
-
child.stdout?.on("data", (data) => {
|
|
3794
|
-
stdout += data.toString();
|
|
3795
|
-
});
|
|
3796
|
-
child.stderr?.on("data", (data) => {
|
|
3797
|
-
stderr += data.toString();
|
|
3798
|
-
});
|
|
3799
|
-
child.on("error", (error) => {
|
|
3800
|
-
if (timeout !== void 0) {
|
|
3801
|
-
clearTimeout(timeout);
|
|
3802
|
-
}
|
|
3803
|
-
reject(error);
|
|
3804
|
-
});
|
|
3805
|
-
child.on("exit", (code) => {
|
|
3806
|
-
if (timeout !== void 0) {
|
|
3807
|
-
clearTimeout(timeout);
|
|
3808
|
-
}
|
|
3809
|
-
if (code && code !== 0 && stderr.length > 0) {
|
|
3810
|
-
reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
|
|
3811
|
-
return;
|
|
3812
|
-
}
|
|
3813
|
-
resolve(stdout.trim());
|
|
3814
|
-
});
|
|
3815
|
-
child.stdin?.write(input);
|
|
3816
|
-
child.stdin?.end();
|
|
4641
|
+
const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
|
|
4642
|
+
cwd,
|
|
4643
|
+
timeoutMs: agentTimeoutMs
|
|
3817
4644
|
});
|
|
4645
|
+
if (exitCode !== 0) {
|
|
4646
|
+
const trimmedErr = stderr.trim();
|
|
4647
|
+
throw new Error(
|
|
4648
|
+
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
4649
|
+
);
|
|
4650
|
+
}
|
|
4651
|
+
return stdout.trim();
|
|
3818
4652
|
}
|
|
3819
4653
|
function parseJsonSafe(payload) {
|
|
3820
4654
|
try {
|
|
@@ -3828,6 +4662,33 @@ function substituteVariables(template, variables) {
|
|
|
3828
4662
|
return variables[varName] ?? match;
|
|
3829
4663
|
});
|
|
3830
4664
|
}
|
|
4665
|
+
/**
 * Structural equality for plain values, arrays and objects.
 *
 * Identical values (`===`) are equal. Otherwise both sides must be non-null
 * objects of the same kind: arrays need equal length and pairwise equal
 * elements, objects need the same number of own enumerable keys with every
 * key of `a` present in `b` and holding an equal value.
 */
function deepEqual(a, b) {
  if (a === b) {
    return true;
  }
  const isObject = (value) => typeof value === "object" && value !== null;
  if (!isObject(a) || !isObject(b)) {
    return false;
  }
  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) {
    return false;
  }
  if (aIsArray) {
    return a.length === b.length && a.every((item, index) => deepEqual(item, b[index]));
  }
  const keys = Object.keys(a);
  if (keys.length !== Object.keys(b).length) {
    return false;
  }
  for (const key of keys) {
    if (!Object.hasOwn(b, key)) {
      return false;
    }
    if (!deepEqual(a[key], b[key])) {
      return false;
    }
  }
  return true;
}
|
|
4682
|
+
/**
 * Checks whether a tool call's actual arguments satisfy the expected ones.
 *
 * @param {object | "any" | undefined} expected - Expected arguments.
 *   `undefined` and "any" accept every call.
 * @param {unknown} actual - Arguments recorded for the tool call.
 * @returns {boolean} `true` when every key of `expected` is an own property
 *   of `actual` with a deeply equal value (extra keys in `actual` are
 *   allowed); `false` when `actual` is `undefined` or `null`.
 */
function argsMatch(expected, actual) {
  if (expected === void 0 || expected === "any") {
    return true;
  }
  // A null input is treated like missing args; Object.hasOwn(null, key)
  // would otherwise throw a TypeError.
  if (actual === void 0 || actual === null) {
    return false;
  }
  return Object.keys(expected).every(
    (key) => Object.hasOwn(actual, key) && deepEqual(expected[key], actual[key])
  );
}
|
|
3831
4692
|
var ToolTrajectoryEvaluator = class {
|
|
3832
4693
|
kind = "tool_trajectory";
|
|
3833
4694
|
config;
|
|
@@ -3884,7 +4745,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3884
4745
|
for (const message of messages) {
|
|
3885
4746
|
if (message.toolCalls) {
|
|
3886
4747
|
for (const call of message.toolCalls) {
|
|
3887
|
-
toolCalls.push({
|
|
4748
|
+
toolCalls.push({
|
|
4749
|
+
name: call.tool,
|
|
4750
|
+
args: call.input
|
|
4751
|
+
});
|
|
3888
4752
|
}
|
|
3889
4753
|
}
|
|
3890
4754
|
}
|
|
@@ -3953,18 +4817,29 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3953
4817
|
const misses = [];
|
|
3954
4818
|
let actualIndex = 0;
|
|
3955
4819
|
for (let i = 0; i < expected.length; i++) {
|
|
3956
|
-
const
|
|
4820
|
+
const expectedItem = expected[i];
|
|
4821
|
+
const expectedTool = expectedItem.tool;
|
|
3957
4822
|
let found = false;
|
|
4823
|
+
let argsMismatch = false;
|
|
3958
4824
|
while (actualIndex < toolCalls.length) {
|
|
3959
|
-
|
|
3960
|
-
|
|
4825
|
+
const actualCall = toolCalls[actualIndex];
|
|
4826
|
+
if (actualCall.name === expectedTool) {
|
|
4827
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
4828
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
4829
|
+
actualIndex++;
|
|
4830
|
+
found = true;
|
|
4831
|
+
break;
|
|
4832
|
+
}
|
|
4833
|
+
misses.push(
|
|
4834
|
+
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
4835
|
+
);
|
|
3961
4836
|
actualIndex++;
|
|
3962
|
-
|
|
4837
|
+
argsMismatch = true;
|
|
3963
4838
|
break;
|
|
3964
4839
|
}
|
|
3965
4840
|
actualIndex++;
|
|
3966
4841
|
}
|
|
3967
|
-
if (!found) {
|
|
4842
|
+
if (!found && !argsMismatch) {
|
|
3968
4843
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
3969
4844
|
}
|
|
3970
4845
|
}
|
|
@@ -3995,10 +4870,16 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3995
4870
|
}
|
|
3996
4871
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
3997
4872
|
for (let i = 0; i < checkLength; i++) {
|
|
3998
|
-
const
|
|
3999
|
-
const
|
|
4873
|
+
const expectedItem = expected[i];
|
|
4874
|
+
const expectedTool = expectedItem.tool;
|
|
4875
|
+
const actualCall = toolCalls[i];
|
|
4876
|
+
const actualTool = actualCall.name;
|
|
4000
4877
|
if (actualTool === expectedTool) {
|
|
4001
|
-
|
|
4878
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
4879
|
+
hits.push(`Position ${i}: ${expectedTool}`);
|
|
4880
|
+
} else {
|
|
4881
|
+
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
4882
|
+
}
|
|
4002
4883
|
} else {
|
|
4003
4884
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
4004
4885
|
}
|
|
@@ -4242,9 +5123,9 @@ var CompositeEvaluator = class {
|
|
|
4242
5123
|
};
|
|
4243
5124
|
|
|
4244
5125
|
// src/evaluation/orchestrator.ts
|
|
4245
|
-
import { createHash, randomUUID as
|
|
4246
|
-
import { mkdir as
|
|
4247
|
-
import
|
|
5126
|
+
import { createHash, randomUUID as randomUUID3 } from "node:crypto";
|
|
5127
|
+
import { mkdir as mkdir3, writeFile as writeFile3 } from "node:fs/promises";
|
|
5128
|
+
import path13 from "node:path";
|
|
4248
5129
|
|
|
4249
5130
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
4250
5131
|
var Node = class {
|
|
@@ -4640,7 +5521,12 @@ async function runBatchEvaluation(options) {
|
|
|
4640
5521
|
const promptInputs = promptInputsList[i];
|
|
4641
5522
|
const providerResponse = batchResponse[i];
|
|
4642
5523
|
const outputMessages = providerResponse.outputMessages;
|
|
4643
|
-
const
|
|
5524
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
5525
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
5526
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
5527
|
+
costUsd: providerResponse.costUsd,
|
|
5528
|
+
durationMs: providerResponse.durationMs
|
|
5529
|
+
}) : void 0;
|
|
4644
5530
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
4645
5531
|
let result;
|
|
4646
5532
|
try {
|
|
@@ -4761,7 +5647,12 @@ async function runEvalCase(options) {
|
|
|
4761
5647
|
await cache.set(cacheKey, providerResponse);
|
|
4762
5648
|
}
|
|
4763
5649
|
const outputMessages = providerResponse.outputMessages;
|
|
4764
|
-
const
|
|
5650
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
5651
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
5652
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
5653
|
+
costUsd: providerResponse.costUsd,
|
|
5654
|
+
durationMs: providerResponse.durationMs
|
|
5655
|
+
}) : void 0;
|
|
4765
5656
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
4766
5657
|
try {
|
|
4767
5658
|
return await evaluateCandidate({
|
|
@@ -4834,21 +5725,21 @@ async function evaluateCandidate(options) {
|
|
|
4834
5725
|
}
|
|
4835
5726
|
return {
|
|
4836
5727
|
timestamp: completedAt.toISOString(),
|
|
4837
|
-
|
|
5728
|
+
evalId: evalCase.id,
|
|
4838
5729
|
dataset: evalCase.dataset,
|
|
4839
|
-
|
|
5730
|
+
conversationId: evalCase.conversation_id,
|
|
4840
5731
|
score: score.score,
|
|
4841
5732
|
hits: score.hits,
|
|
4842
5733
|
misses: score.misses,
|
|
4843
|
-
|
|
5734
|
+
candidateAnswer: candidate,
|
|
4844
5735
|
target: target.name,
|
|
4845
5736
|
reasoning: score.reasoning,
|
|
4846
|
-
|
|
4847
|
-
|
|
4848
|
-
|
|
4849
|
-
|
|
4850
|
-
|
|
4851
|
-
|
|
5737
|
+
rawAspects: score.rawAspects,
|
|
5738
|
+
agentProviderRequest,
|
|
5739
|
+
lmProviderRequest,
|
|
5740
|
+
evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
5741
|
+
evaluatorResults,
|
|
5742
|
+
traceSummary
|
|
4852
5743
|
};
|
|
4853
5744
|
}
|
|
4854
5745
|
async function runEvaluatorsForCase(options) {
|
|
@@ -4946,7 +5837,7 @@ async function runEvaluatorList(options) {
|
|
|
4946
5837
|
hits: score2.hits,
|
|
4947
5838
|
misses: score2.misses,
|
|
4948
5839
|
reasoning: score2.reasoning,
|
|
4949
|
-
|
|
5840
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
4950
5841
|
});
|
|
4951
5842
|
}
|
|
4952
5843
|
if (evaluator.type === "code") {
|
|
@@ -4977,11 +5868,11 @@ async function runEvaluatorList(options) {
|
|
|
4977
5868
|
hits: score2.hits,
|
|
4978
5869
|
misses: score2.misses,
|
|
4979
5870
|
reasoning: score2.reasoning,
|
|
4980
|
-
|
|
5871
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
4981
5872
|
});
|
|
4982
5873
|
}
|
|
4983
5874
|
if (evaluator.type === "composite") {
|
|
4984
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
5875
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path13.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
4985
5876
|
const createEvaluator = (memberConfig) => {
|
|
4986
5877
|
switch (memberConfig.type) {
|
|
4987
5878
|
case "llm_judge":
|
|
@@ -5034,8 +5925,8 @@ async function runEvaluatorList(options) {
|
|
|
5034
5925
|
hits: score2.hits,
|
|
5035
5926
|
misses: score2.misses,
|
|
5036
5927
|
reasoning: score2.reasoning,
|
|
5037
|
-
|
|
5038
|
-
|
|
5928
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
5929
|
+
evaluatorResults: mapChildResults(score2.evaluatorResults)
|
|
5039
5930
|
});
|
|
5040
5931
|
}
|
|
5041
5932
|
if (evaluator.type === "tool_trajectory") {
|
|
@@ -5193,22 +6084,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
5193
6084
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
5194
6085
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
5195
6086
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
5196
|
-
const filePath =
|
|
5197
|
-
await
|
|
6087
|
+
const filePath = path13.resolve(directory, filename);
|
|
6088
|
+
await mkdir3(path13.dirname(filePath), { recursive: true });
|
|
5198
6089
|
const payload = {
|
|
5199
6090
|
eval_id: evalCase.id,
|
|
5200
6091
|
question: promptInputs.question,
|
|
5201
6092
|
guidelines: promptInputs.guidelines,
|
|
5202
6093
|
guideline_paths: evalCase.guideline_paths
|
|
5203
6094
|
};
|
|
5204
|
-
await
|
|
6095
|
+
await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
5205
6096
|
}
|
|
5206
6097
|
function sanitizeFilename(value) {
|
|
5207
6098
|
if (!value) {
|
|
5208
6099
|
return "prompt";
|
|
5209
6100
|
}
|
|
5210
6101
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
5211
|
-
return sanitized.length > 0 ? sanitized :
|
|
6102
|
+
return sanitized.length > 0 ? sanitized : randomUUID3();
|
|
5212
6103
|
}
|
|
5213
6104
|
async function invokeProvider(provider, options) {
|
|
5214
6105
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -5265,17 +6156,17 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
5265
6156
|
}
|
|
5266
6157
|
return {
|
|
5267
6158
|
timestamp: timestamp.toISOString(),
|
|
5268
|
-
|
|
6159
|
+
evalId: evalCase.id,
|
|
5269
6160
|
dataset: evalCase.dataset,
|
|
5270
|
-
|
|
6161
|
+
conversationId: evalCase.conversation_id,
|
|
5271
6162
|
score: 0,
|
|
5272
6163
|
hits: [],
|
|
5273
6164
|
misses: [`Error: ${message}`],
|
|
5274
|
-
|
|
6165
|
+
candidateAnswer: `Error occurred: ${message}`,
|
|
5275
6166
|
target: targetName,
|
|
5276
|
-
|
|
5277
|
-
|
|
5278
|
-
|
|
6167
|
+
rawAspects: [],
|
|
6168
|
+
agentProviderRequest,
|
|
6169
|
+
lmProviderRequest,
|
|
5279
6170
|
error: message
|
|
5280
6171
|
};
|
|
5281
6172
|
}
|
|
@@ -5320,8 +6211,8 @@ function mapChildResults(children) {
|
|
|
5320
6211
|
hits: child.hits,
|
|
5321
6212
|
misses: child.misses,
|
|
5322
6213
|
reasoning: child.reasoning,
|
|
5323
|
-
|
|
5324
|
-
|
|
6214
|
+
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
6215
|
+
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
5325
6216
|
}));
|
|
5326
6217
|
}
|
|
5327
6218
|
function computeWeightedMean(entries) {
|
|
@@ -5422,17 +6313,21 @@ function createAgentKernel() {
|
|
|
5422
6313
|
export {
|
|
5423
6314
|
CodeEvaluator,
|
|
5424
6315
|
CompositeEvaluator,
|
|
6316
|
+
DEFAULT_EXPLORATION_TOOLS,
|
|
5425
6317
|
LlmJudgeEvaluator,
|
|
5426
6318
|
TEST_MESSAGE_ROLES,
|
|
5427
6319
|
ToolTrajectoryEvaluator,
|
|
6320
|
+
avgToolDurationMs,
|
|
5428
6321
|
buildDirectoryChain,
|
|
5429
6322
|
buildPromptInputs,
|
|
5430
6323
|
buildSearchRoots,
|
|
5431
6324
|
computeTraceSummary,
|
|
5432
6325
|
consumeCodexLogEntries,
|
|
6326
|
+
consumePiLogEntries,
|
|
5433
6327
|
createAgentKernel,
|
|
5434
6328
|
createProvider,
|
|
5435
6329
|
ensureVSCodeSubagents,
|
|
6330
|
+
explorationRatio,
|
|
5436
6331
|
extractCodeBlocks,
|
|
5437
6332
|
fileExists,
|
|
5438
6333
|
findGitRoot,
|
|
@@ -5446,6 +6341,7 @@ export {
|
|
|
5446
6341
|
isTestMessageRole,
|
|
5447
6342
|
listTargetNames,
|
|
5448
6343
|
loadEvalCases,
|
|
6344
|
+
mergeExecutionMetrics,
|
|
5449
6345
|
normalizeLineEndings,
|
|
5450
6346
|
readJsonFile,
|
|
5451
6347
|
readTargetDefinitions,
|
|
@@ -5456,6 +6352,8 @@ export {
|
|
|
5456
6352
|
resolveTargetDefinition,
|
|
5457
6353
|
runEvalCase,
|
|
5458
6354
|
runEvaluation,
|
|
5459
|
-
subscribeToCodexLogEntries
|
|
6355
|
+
subscribeToCodexLogEntries,
|
|
6356
|
+
subscribeToPiLogEntries,
|
|
6357
|
+
tokensPerTool
|
|
5460
6358
|
};
|
|
5461
6359
|
//# sourceMappingURL=index.js.map
|