@agentv/core 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10,7 +10,7 @@ import {
10
10
  readTextFile,
11
11
  resolveFileReference,
12
12
  resolveTargetDefinition
13
- } from "./chunk-KPHTMTZ3.js";
13
+ } from "./chunk-E2VSU4WZ.js";
14
14
 
15
15
  // src/evaluation/types.ts
16
16
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -93,6 +93,53 @@ function computeTraceSummary(messages) {
93
93
  errorCount: 0
94
94
  };
95
95
  }
96
+ var DEFAULT_EXPLORATION_TOOLS = [
97
+ "read",
98
+ "grep",
99
+ "glob",
100
+ "search",
101
+ "list",
102
+ "Read",
103
+ "Grep",
104
+ "Glob",
105
+ "WebSearch",
106
+ "WebFetch"
107
+ ];
108
/**
 * Computes the fraction of trace events that were calls to "exploration" tools.
 *
 * @param {object} summary - Trace summary; `eventCount` and `toolCallsByName` are read.
 * @param {Iterable<string>} [explorationTools] - Tool names that count as exploration.
 * @returns {number | undefined} Exploration calls divided by `eventCount`, or
 *   `undefined` when `eventCount` is 0.
 */
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
  if (summary.eventCount === 0) return void 0;
  // Tolerate a summary that carries no per-tool counters at all.
  const callsByName = summary.toolCallsByName ?? {};
  let explorationCalls = 0;
  // A Set keeps a tool name that is listed twice from being counted twice.
  for (const tool of new Set(explorationTools)) {
    const count = callsByName[tool];
    // Only real counters are summed; missing entries and inherited members
    // (e.g. "constructor") would otherwise corrupt the total.
    if (typeof count === "number") {
      explorationCalls += count;
    }
  }
  return explorationCalls / summary.eventCount;
}
116
/**
 * Average number of tokens (input + output) per trace event.
 *
 * @param {object} summary - Trace summary; `tokenUsage` and `eventCount` are read.
 * @returns {number | undefined} `undefined` when token usage is absent or
 *   `eventCount` is 0.
 */
function tokensPerTool(summary) {
  const { tokenUsage, eventCount } = summary;
  if (!tokenUsage || eventCount === 0) {
    return void 0;
  }
  return (tokenUsage.input + tokenUsage.output) / eventCount;
}
121
/**
 * Mean of every recorded tool-call duration, across all tools.
 *
 * @param {object} summary - Trace summary; `toolDurations` maps a tool name to
 *   a list of durations.
 * @returns {number | undefined} `undefined` when `toolDurations` is absent or
 *   holds no samples.
 */
function avgToolDurationMs(summary) {
  const { toolDurations } = summary;
  if (!toolDurations) {
    return void 0;
  }
  // Collect every sample first, keeping the original encounter order.
  const samples = [];
  for (const perTool of Object.values(toolDurations)) {
    for (const sample of perTool) {
      samples.push(sample);
    }
  }
  if (samples.length === 0) {
    return void 0;
  }
  const total = samples.reduce((acc, sample) => acc + sample, 0);
  return total / samples.length;
}
134
/**
 * Returns a copy of `summary` enriched with provider-reported execution metrics.
 *
 * A metric the provider did not report falls back to the value already on the
 * summary instead of overwriting it with `undefined`.
 *
 * @param {object} summary - Trace summary to enrich (not mutated).
 * @param {object} [metrics] - Optional `{ tokenUsage, costUsd, durationMs }`.
 * @returns {object} `summary` itself when `metrics` is falsy, otherwise a new object.
 */
function mergeExecutionMetrics(summary, metrics) {
  if (!metrics) return summary;
  return {
    ...summary,
    // `??` rather than `||`: a reported cost or duration of 0 is a real value.
    tokenUsage: metrics.tokenUsage ?? summary.tokenUsage,
    costUsd: metrics.costUsd ?? summary.costUsd,
    durationMs: metrics.durationMs ?? summary.durationMs
  };
}
96
143
 
97
144
  // src/evaluation/yaml-parser.ts
98
145
  import { readFile as readFile5 } from "node:fs/promises";
@@ -607,7 +654,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
607
654
  expected = [];
608
655
  for (const item of rawExpected) {
609
656
  if (isJsonObject2(item) && typeof item.tool === "string") {
610
- expected.push({ tool: item.tool });
657
+ let args;
658
+ if (item.args === "any") {
659
+ args = "any";
660
+ } else if (isJsonObject2(item.args)) {
661
+ args = item.args;
662
+ }
663
+ expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
611
664
  }
612
665
  }
613
666
  }
@@ -1767,12 +1820,14 @@ var CliProvider = class {
1767
1820
  `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1768
1821
  );
1769
1822
  }
1823
+ const startTime = Date.now();
1770
1824
  const result = await this.runCommand(renderedCommand, {
1771
1825
  cwd: this.config.cwd,
1772
1826
  env: process.env,
1773
1827
  timeoutMs: this.config.timeoutMs,
1774
1828
  signal: request.signal
1775
1829
  });
1830
+ const measuredDurationMs = Date.now() - startTime;
1776
1831
  if (result.failed || (result.exitCode ?? 0) !== 0) {
1777
1832
  if (request.signal?.aborted) {
1778
1833
  throw new Error("CLI provider request was aborted");
@@ -1791,6 +1846,9 @@ var CliProvider = class {
1791
1846
  const parsed = this.parseOutputContent(responseContent);
1792
1847
  return {
1793
1848
  outputMessages: parsed.outputMessages,
1849
+ tokenUsage: parsed.tokenUsage,
1850
+ costUsd: parsed.costUsd,
1851
+ durationMs: parsed.durationMs ?? measuredDurationMs,
1794
1852
  raw: {
1795
1853
  command: renderedCommand,
1796
1854
  stderr: result.stderr,
@@ -1838,12 +1896,14 @@ var CliProvider = class {
1838
1896
  `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1839
1897
  );
1840
1898
  }
1899
+ const startTime = Date.now();
1841
1900
  const result = await this.runCommand(renderedCommand, {
1842
1901
  cwd: this.config.cwd,
1843
1902
  env: process.env,
1844
1903
  timeoutMs: this.config.timeoutMs,
1845
1904
  signal: controller.signal
1846
1905
  });
1906
+ const measuredDurationMs = Date.now() - startTime;
1847
1907
  if (result.failed || (result.exitCode ?? 0) !== 0) {
1848
1908
  if (controller.signal.aborted) {
1849
1909
  throw new Error("CLI provider request was aborted");
@@ -1865,11 +1925,13 @@ var CliProvider = class {
1865
1925
  if (missingIds.length > 0) {
1866
1926
  throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
1867
1927
  }
1928
+ const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
1868
1929
  const responses = requests.map((request) => {
1869
1930
  const evalCaseId = request.evalCaseId;
1870
1931
  if (!evalCaseId) {
1871
1932
  return {
1872
1933
  outputMessages: [],
1934
+ durationMs: perRequestFallbackMs,
1873
1935
  raw: {
1874
1936
  command: renderedCommand,
1875
1937
  stderr: result.stderr,
@@ -1883,6 +1945,7 @@ var CliProvider = class {
1883
1945
  if (!parsed) {
1884
1946
  return {
1885
1947
  outputMessages: [],
1948
+ durationMs: perRequestFallbackMs,
1886
1949
  raw: {
1887
1950
  command: renderedCommand,
1888
1951
  stderr: result.stderr,
@@ -1894,6 +1957,9 @@ var CliProvider = class {
1894
1957
  }
1895
1958
  return {
1896
1959
  outputMessages: parsed.outputMessages,
1960
+ tokenUsage: parsed.tokenUsage,
1961
+ costUsd: parsed.costUsd,
1962
+ durationMs: parsed.durationMs ?? perRequestFallbackMs,
1897
1963
  raw: {
1898
1964
  command: renderedCommand,
1899
1965
  stderr: result.stderr,
@@ -1911,25 +1977,55 @@ var CliProvider = class {
1911
1977
  * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
1912
1978
  * If only 'text' is provided, wrap it in outputMessages.
1913
1979
  * Otherwise, treat the entire content as plain text wrapped in outputMessages.
1980
+ *
1981
+ * Also extracts optional execution metrics:
1982
+ * - token_usage: { input, output, cached? }
1983
+ * - cost_usd: number
1984
+ * - duration_ms: number
1914
1985
  */
1915
1986
  parseOutputContent(content) {
1916
1987
  try {
1917
1988
  const parsed = JSON.parse(content);
1918
1989
  if (typeof parsed === "object" && parsed !== null) {
1919
1990
  const obj = parsed;
1991
+ const tokenUsage = this.parseTokenUsage(obj.token_usage);
1992
+ const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
1993
+ const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
1920
1994
  const outputMessages = this.parseOutputMessages(obj.output_messages);
1921
1995
  if (outputMessages && outputMessages.length > 0) {
1922
- return { outputMessages };
1996
+ return { outputMessages, tokenUsage, costUsd, durationMs };
1923
1997
  }
1924
1998
  if ("text" in obj) {
1925
1999
  const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1926
- return { outputMessages: [{ role: "assistant", content: text }] };
2000
+ return {
2001
+ outputMessages: [{ role: "assistant", content: text }],
2002
+ tokenUsage,
2003
+ costUsd,
2004
+ durationMs
2005
+ };
1927
2006
  }
1928
2007
  }
1929
2008
  } catch {
1930
2009
  }
1931
2010
  return { outputMessages: [{ role: "assistant", content }] };
1932
2011
  }
2012
+ /**
2013
+ * Parse token_usage from CLI output.
2014
+ */
2015
+ parseTokenUsage(tokenUsage) {
2016
+ if (typeof tokenUsage !== "object" || tokenUsage === null) {
2017
+ return void 0;
2018
+ }
2019
+ const obj = tokenUsage;
2020
+ if (typeof obj.input !== "number" || typeof obj.output !== "number") {
2021
+ return void 0;
2022
+ }
2023
+ return {
2024
+ input: obj.input,
2025
+ output: obj.output,
2026
+ cached: typeof obj.cached === "number" ? obj.cached : void 0
2027
+ };
2028
+ }
1933
2029
  /**
1934
2030
  * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
1935
2031
  */
@@ -2006,6 +2102,9 @@ var CliProvider = class {
2006
2102
  if (records.has(id)) {
2007
2103
  throw new Error(`CLI batch output contains duplicate id: ${id}`);
2008
2104
  }
2105
+ const tokenUsage = this.parseTokenUsage(obj.token_usage);
2106
+ const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2107
+ const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2009
2108
  const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2010
2109
  let outputMessages;
2011
2110
  if (parsedOutputMessages && parsedOutputMessages.length > 0) {
@@ -2015,7 +2114,10 @@ var CliProvider = class {
2015
2114
  outputMessages = text ? [{ role: "assistant", content: text }] : [];
2016
2115
  }
2017
2116
  records.set(id, {
2018
- outputMessages
2117
+ outputMessages,
2118
+ tokenUsage,
2119
+ costUsd,
2120
+ durationMs
2019
2121
  });
2020
2122
  }
2021
2123
  return records;
@@ -2331,6 +2433,11 @@ var execAsync2 = promisify2(execCallback);
2331
2433
  var WORKSPACE_PREFIX = "agentv-codex-";
2332
2434
  var PROMPT_FILENAME = "prompt.md";
2333
2435
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
2436
+ var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
2437
+ - Do NOT create any additional output files in the workspace.
2438
+ - All intended file outputs/changes MUST be written in your response.
2439
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
2440
+ This is required for evaluation scoring.`;
2334
2441
  var CodexProvider = class {
2335
2442
  id;
2336
2443
  kind = "codex";
@@ -2355,7 +2462,11 @@ var CodexProvider = class {
2355
2462
  const workspaceRoot = await this.createWorkspace();
2356
2463
  const logger = await this.createStreamLogger(request).catch(() => void 0);
2357
2464
  try {
2358
- const promptContent = buildPromptDocument(request, inputFiles);
2465
+ const basePrompt = buildPromptDocument(request, inputFiles);
2466
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
2467
+ const promptContent = `${systemPrompt}
2468
+
2469
+ ${basePrompt}`;
2359
2470
  const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
2360
2471
  await writeFile(promptFile, promptContent, "utf8");
2361
2472
  const args = this.buildCodexArgs();
@@ -3039,172 +3150,851 @@ var MockProvider = class {
3039
3150
  }
3040
3151
  };
3041
3152
 
3042
- // src/evaluation/providers/vscode.ts
3153
+ // src/evaluation/providers/pi-coding-agent.ts
3154
+ import { spawn as spawn2 } from "node:child_process";
3155
+ import { randomUUID as randomUUID2 } from "node:crypto";
3156
+ import { createWriteStream as createWriteStream2 } from "node:fs";
3157
+ import { mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
3158
+ import { tmpdir as tmpdir2 } from "node:os";
3043
3159
  import path10 from "node:path";
3044
- import {
3045
- dispatchAgentSession,
3046
- dispatchBatchAgent,
3047
- getSubagentRoot,
3048
- provisionSubagents
3049
- } from "subagent";
3050
-
3051
- // src/evaluation/providers/vscode-templates.ts
3052
- var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
3053
-
3054
- {{userQuery}}
3055
-
3056
- [[ ## system_instructions ## ]]
3057
-
3058
- **IMPORTANT**: Follow these exact steps:
3059
- 1. Create and write your complete response to: {{responseFileTmp}}
3060
- - Do NOT create any additional output files in the workspace.
3061
- - All intended file outputs/changes MUST be written in your response file.
3062
- - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
3063
- 2. When completely finished, run these PowerShell commands to signal completion:
3064
- \`\`\`
3065
- Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
3066
- if (Test-Path subagent.lock) { del subagent.lock }
3067
- \`\`\`
3068
-
3069
- Do not proceed to step 2 until your response is completely written to the temporary file.
3070
- `;
3071
- var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
3072
-
3073
- {{userQuery}}
3074
-
3075
- [[ ## system_instructions ## ]]
3076
3160
 
3077
- **IMPORTANT**: Follow these exact steps:
3078
- 1. Create and write your complete response to: {{responseFileTmp}}
3079
- - Do NOT create any additional output files in the workspace.
3080
- - All intended file outputs/changes MUST be written in your response file.
3081
- - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
3082
- 2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
3083
- 3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
3084
- `;
3161
+ // src/evaluation/providers/pi-log-tracker.ts
3162
// Registry keys on globalThis. Symbol.for() returns the same symbol for the
// same key string, so every lookup below resolves to the same slot.
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");

/**
 * Returns the global array of recorded Pi log entries, creating it on first use.
 */
function getPiLogStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_LOGS_KEY2]) {
    registry[GLOBAL_LOGS_KEY2] = [];
  }
  return registry[GLOBAL_LOGS_KEY2];
}

/**
 * Returns the global Set of log-entry listeners, creating it on first use.
 */
function getSubscriberStore2() {
  const registry = globalThis;
  if (!registry[GLOBAL_SUBSCRIBERS_KEY2]) {
    registry[GLOBAL_SUBSCRIBERS_KEY2] = /* @__PURE__ */ new Set();
  }
  return registry[GLOBAL_SUBSCRIBERS_KEY2];
}

/**
 * Calls every listener with `entry`. A throwing listener is reported through
 * console.warn and does not stop the remaining listeners.
 */
function notifySubscribers2(entry) {
  // Iterate over a snapshot so listeners may (un)subscribe while being notified.
  [...getSubscriberStore2()].forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi log subscriber failed: ${message}`);
    }
  });
}

/**
 * Appends `entry` to the global store, then notifies the listeners.
 */
function recordPiLogEntry(entry) {
  getPiLogStore().push(entry);
  notifySubscribers2(entry);
}

/**
 * Removes and returns every recorded entry; the store is empty afterwards.
 */
function consumePiLogEntries() {
  // splice(0) drains the array and yields a fresh (possibly empty) array.
  return getPiLogStore().splice(0);
}

/**
 * Registers `listener` for future entries.
 *
 * @returns {() => void} Function that removes the listener again.
 */
function subscribeToPiLogEntries(listener) {
  const listeners = getSubscriberStore2();
  listeners.add(listener);
  return () => {
    listeners.delete(listener);
  };
}
3085
3213
 
3086
- // src/evaluation/providers/vscode.ts
3087
- var VSCodeProvider = class {
3214
+ // src/evaluation/providers/pi-coding-agent.ts
3215
+ var WORKSPACE_PREFIX2 = "agentv-pi-";
3216
+ var PROMPT_FILENAME2 = "prompt.md";
3217
+ var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
3218
+ - Do NOT create any additional output files in the workspace.
3219
+ - All intended file outputs/changes MUST be written in your response.
3220
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
3221
+ This is required for evaluation scoring.`;
3222
/**
 * Provider that answers a request by running the Pi coding agent CLI in JSON
 * print mode and converting its JSONL output into output messages.
 */
var PiCodingAgentProvider = class {
  id;
  kind = "pi-coding-agent";
  targetName;
  supportsBatch = false;
  config;
  runPi;
  /**
   * @param {string} targetName - Name of the target; becomes part of `id`.
   * @param {object} config - Target settings (executable, provider, model, apiKey, ...).
   * @param {Function} [runner] - Function that actually runs the executable.
   */
  constructor(targetName, config, runner = defaultPiRunner) {
    this.id = `pi-coding-agent:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runPi = runner;
  }
  /**
   * Runs the agent for one request.
   *
   * @param {object} request - Reads `question`, `inputFiles`, `signal`,
   *   `evalCaseId` and `attempt`.
   * @returns {Promise<object>} `{ raw, outputMessages }`. `raw.args` has the
   *   API key value masked.
   * @throws {Error} When the request is already aborted, the agent times out,
   *   exits non-zero, or produces no parseable output.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Pi coding agent request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: a failure to set it up must not fail the request.
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
      await writeFile2(promptFile, request.question, "utf8");
      const args = this.buildPiArgs(request.question, inputFiles);
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executePi(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail2(result.stderr, result.stdout);
        const prefix = `Pi coding agent exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parsePiJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          // The raw payload travels with the result, so the secret is masked here.
          args: this.redactSecretArgs(args),
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages
      };
    } finally {
      // Nested finally: the workspace is removed even if closing the log rejects.
      try {
        await logger?.close();
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }
  /**
   * Returns a copy of `args` in which the value following every `--api-key`
   * flag is replaced by a placeholder.
   */
  redactSecretArgs(args) {
    return args.map(
      (arg, index) => index > 0 && args[index - 1] === "--api-key" ? "[REDACTED]" : arg
    );
  }
  /**
   * Working directory for the agent: the configured `cwd` (resolved to an
   * absolute path) or, when none is configured, the temporary workspace.
   */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return path10.resolve(this.config.cwd);
  }
  /**
   * Builds the CLI argument list: configured options, fixed JSON/print/no-session
   * flags, extra args, `@file` attachments, and finally the prompt itself.
   */
  buildPiArgs(prompt, inputFiles) {
    const args = [];
    if (this.config.provider) {
      args.push("--provider", this.config.provider);
    }
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.apiKey) {
      args.push("--api-key", this.config.apiKey);
    }
    args.push("--mode", "json");
    args.push("--print");
    args.push("--no-session");
    if (this.config.tools) {
      args.push("--tools", this.config.tools);
    }
    if (this.config.thinking) {
      args.push("--thinking", this.config.thinking);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    if (inputFiles && inputFiles.length > 0) {
      for (const file of inputFiles) {
        args.push(`@${file}`);
      }
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
    const fullPrompt = `${systemPrompt}\n\n${prompt}`;
    // The prompt is passed through escapeAtSymbols before being appended
    // (presumably so "@" in the text is not read as an attachment — confirm in that helper).
    args.push(escapeAtSymbols(fullPrompt));
    return args;
  }
  /**
   * Invokes the runner and turns a missing executable (ENOENT) into an
   * actionable error; every other failure is rethrown unchanged.
   */
  async executePi(args, cwd, signal, logger) {
    try {
      return await this.runPi({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /**
   * Copy of process.env with the configured API key exported under the
   * variable for the configured provider ("google" when none is set).
   * Providers not listed below leave the environment unchanged.
   */
  buildEnv() {
    const env = { ...process.env };
    if (this.config.apiKey) {
      const provider = this.config.provider?.toLowerCase() ?? "google";
      switch (provider) {
        case "google":
        case "gemini":
          env.GEMINI_API_KEY = this.config.apiKey;
          break;
        case "anthropic":
          env.ANTHROPIC_API_KEY = this.config.apiKey;
          break;
        case "openai":
          env.OPENAI_API_KEY = this.config.apiKey;
          break;
        case "groq":
          env.GROQ_API_KEY = this.config.apiKey;
          break;
        case "xai":
          env.XAI_API_KEY = this.config.apiKey;
          break;
        case "openrouter":
          env.OPENROUTER_API_KEY = this.config.apiKey;
          break;
      }
    }
    return env;
  }
  /** Creates a fresh temporary directory for this request. */
  async createWorkspace() {
    return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
  }
  /** Removes the temporary workspace; failures are deliberately ignored (best-effort cleanup). */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await rm2(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
  /** Configured `logDir` (absolute), or `.agentv/logs/pi-coding-agent` under the current directory. */
  resolveLogDirectory() {
    if (this.config.logDir) {
      return path10.resolve(this.config.logDir);
    }
    return path10.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
  }
  /**
   * Creates the stream logger for a request and records its log entry.
   *
   * @returns {Promise<object | undefined>} The logger, or `undefined` (after a
   *   console warning) when the directory or log file cannot be created.
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    try {
      await mkdir2(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
    try {
      const logger = await PiStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordPiLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
3423
/**
 * Writes the agent's stdout/stderr to a log file, one formatted line per
 * complete input line, each prefixed with elapsed time and its source.
 */
var PiStreamLogger = class _PiStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  /**
   * @param {string} filePath - Log file; opened in append mode.
   * @param {string} format - "json" pretty-prints JSON lines; anything else summarizes them.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream2(filePath, { flags: "a" });
    // Without a listener an asynchronous open/write failure is an unhandled
    // 'error' event, which terminates the process. Logging is best-effort,
    // so report the failure instead.
    this.stream.on("error", (error) => {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi stream log write failed for ${filePath}: ${message}`);
    });
  }
  /**
   * Opens the log and writes its header (target, optional eval id and
   * 1-based attempt, start time) followed by a blank separator line.
   *
   * @param {object} options - `{ filePath, targetName, evalCaseId?, attempt?, format }`.
   * @returns {Promise<object>} The ready logger.
   */
  static async create(options) {
    const logger = new _PiStreamLogger(options.filePath, options.format);
    const header = [
      "# Pi Coding Agent stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional lines; the trailing "" is the blank separator.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and logs every line it completes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and logs every line it completes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Logs any buffered partial lines and ends the stream.
   *
   * @returns {Promise<void>} Resolves once the stream has ended; rejects if the
   *   stream emits 'error' while closing.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Writes each entry followed by a newline, without any formatting. */
  writeLines(lines) {
    for (const line of lines) {
      this.stream.write(`${line}\n`);
    }
  }
  /** Formats and writes one raw line; blank lines are skipped. */
  writeFormatted(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.stream.write(formatted);
      this.stream.write("\n");
    }
  }
  /** Logs every complete line buffered for `source`; the unterminated tail stays buffered. */
  flushBuffer(source) {
    const key = source === "stdout" ? "stdoutBuffer" : "stderrBuffer";
    const lines = this[key].split(/\r?\n/);
    // The last piece has no newline yet; keep it for the next chunk.
    this[key] = lines.pop() ?? "";
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /**
   * @returns {string | undefined} `[+elapsed] [source] message`, or `undefined`
   *   for a line that is empty after trimming.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
  }
  /** Logs whatever unterminated text is still buffered, stdout first, then clears both buffers. */
  flushRemainder() {
    this.writeFormatted(this.stdoutBuffer, "stdout");
    this.writeFormatted(this.stderrBuffer, "stderr");
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
3517
/**
 * Builds a log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 *
 * @param {object} request - Reads `evalCaseId` ("pi" when absent) and the
 *   0-based `attempt`, which is written 1-based.
 * @param {string} targetName - Target name; sanitized before use.
 * @returns {string} The file name (no directory).
 */
function buildLogFilename2(request, targetName) {
  // ":" and "." from the ISO timestamp are replaced with "-".
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const safeTarget = sanitizeForFilename2(targetName);
  const safeEvalId = sanitizeForFilename2(request.evalCaseId ?? "pi");
  let attemptPart = "";
  if (request.attempt !== void 0) {
    attemptPart = `_attempt-${request.attempt + 1}`;
  }
  const uniqueTail = randomUUID2().slice(0, 8);
  return `${stamp}_${safeTarget}_${safeEvalId}${attemptPart}_${uniqueTail}.log`;
}
3524
/**
 * Makes `value` safe for use in a file name: every run of characters other
 * than letters, digits, ".", "_" and "-" becomes a single "_".
 *
 * @param {string} value - Text to sanitize.
 * @returns {string} The sanitized text, or "pi" when the result is empty.
 */
function sanitizeForFilename2(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length === 0) {
    return "pi";
  }
  return cleaned;
}
3528
/**
 * Formats the time elapsed since `startedAt` as `MM:SS`, or `HH:MM:SS` once
 * at least one hour has passed.
 *
 * @param {number} startedAt - Start time in milliseconds since the epoch.
 * @returns {string} Zero-padded elapsed time (whole seconds, rounded down).
 */
function formatElapsed2(startedAt) {
  const pad = (part) => part.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutesPart = pad(Math.floor(totalSeconds % 3600 / 60));
  const secondsPart = pad(totalSeconds % 60);
  return hours > 0 ? `${pad(hours)}:${minutesPart}:${secondsPart}` : `${minutesPart}:${secondsPart}`;
}
3538
/**
 * Produces the summary-format text for one log line.
 *
 * @param {string} rawLine - Line as received from the agent.
 * @param {string} source - "stdout" or "stderr".
 * @returns {string} The event summary when the line is JSON that can be
 *   summarized; otherwise the raw line, prefixed with "stderr: " for stderr.
 */
function formatPiLogMessage(rawLine, source) {
  const event = tryParseJsonValue2(rawLine);
  // Falsy parse results (not JSON, or JSON null/0/""/false) are never summarized.
  const summary = event ? summarizePiEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
3551
/**
 * Produces the json-format text for one log line.
 *
 * @param {string} rawLine - Line as received from the agent.
 * @returns {string} The line pretty-printed with 2-space indentation when it
 *   parses to a truthy JSON value; otherwise the line unchanged.
 */
function formatPiJsonLog(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    return rawLine;
  }
  if (!value) {
    return rawLine;
  }
  try {
    return JSON.stringify(value, null, 2);
  } catch {
    return rawLine;
  }
}
3562
/**
 * Produces a one-line summary of a Pi JSON event for the stream log.
 *
 * - `message_start` / `message_end`: `<type>: <message.role>`
 * - `message_update`: `text_delta: <first 50 chars>` for string text deltas,
 *   otherwise `message_update: <assistantMessageEvent.type>`
 * - any other type: the type itself
 *
 * A missing role or update type is shown as "unknown" rather than the literal
 * text "undefined".
 *
 * @param {unknown} event - Parsed JSON value.
 * @returns {string | undefined} The summary, or `undefined` when `event` is not
 *   an object or has no non-empty string `type`.
 */
function summarizePiEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const { type } = event;
  if (typeof type !== "string" || type.length === 0) {
    return void 0;
  }
  if (type === "message_start" || type === "message_end") {
    return `${type}: ${event.message?.role ?? "unknown"}`;
  }
  if (type === "message_update") {
    const update = event.assistantMessageEvent;
    const updateType = update?.type;
    if (updateType === "text_delta" && typeof update.delta === "string") {
      const { delta } = update;
      // Keep streamed text short in the log.
      const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
      return `text_delta: ${preview}`;
    }
    return `message_update: ${updateType ?? "unknown"}`;
  }
  // agent_start, agent_end, turn_start, turn_end and unrecognized types alike.
  return type;
}
/**
 * Parse a string as JSON without throwing.
 *
 * @param {string} rawLine - Text to parse.
 * @returns {unknown} The parsed value, or `undefined` when `JSON.parse` throws.
 */
function tryParseJsonValue2(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
/**
 * Parse newline-delimited JSON output from the Pi coding agent.
 *
 * Blank lines and lines that are not valid JSON are skipped.
 *
 * @param {string} output - Raw process output.
 * @returns {unknown[]} The parsed values, in input order.
 * @throws {Error} When the output is empty/whitespace, or no line parses as JSON.
 */
function parsePiJsonl(output) {
  const body = output.trim();
  if (body.length === 0) {
    throw new Error("Pi coding agent produced no output");
  }
  const events = [];
  for (const rawLine of body.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Non-JSON lines are ignored on purpose.
    }
  }
  if (events.length === 0) {
    throw new Error("Pi coding agent produced no valid JSON output");
  }
  return events;
}
/**
 * Build the list of output messages from parsed Pi agent events.
 *
 * The last `agent_end` event that carries a `messages` array wins: its messages are
 * converted with `convertPiMessage` and unconvertible entries are dropped. When no such
 * event exists, the `message` of every `turn_end` event is converted instead, in order.
 *
 * @param {unknown[]} events - Parsed events.
 * @returns {object[]} Converted messages (possibly empty).
 */
function extractOutputMessages(events) {
  const isRecord = (value) => Boolean(value) && typeof value === "object";
  let cursor = events.length;
  // Scan backwards so the most recent agent_end is preferred.
  while (cursor-- > 0) {
    const candidate = events[cursor];
    if (isRecord(candidate) && candidate.type === "agent_end" && Array.isArray(candidate.messages)) {
      return candidate.messages.map(convertPiMessage).filter((m) => m !== void 0);
    }
  }
  return events
    .filter((entry) => isRecord(entry) && entry.type === "turn_end")
    .map((entry) => convertPiMessage(entry.message))
    .filter((converted) => Boolean(converted));
}
/**
 * Convert one Pi agent message into the output-message shape used by this module.
 *
 * @param {unknown} message - Candidate message object.
 * @returns {{role: string, content: (string|undefined), toolCalls: (object[]|undefined),
 *   timestamp: (string|undefined), metadata: (object|undefined)} | undefined}
 *   The converted message, or `undefined` when `message` is not an object with a
 *   string `role`.
 */
function convertPiMessage(message) {
  if (!message || typeof message !== "object") {
    return void 0;
  }
  const { role } = message;
  if (typeof role !== "string") {
    return void 0;
  }
  let timestamp;
  if (typeof message.timestamp === "number") {
    const parsedDate = new Date(message.timestamp);
    // toISOString() throws RangeError on an invalid Date (e.g. a number outside the
    // representable range); drop such timestamps instead of failing the conversion.
    timestamp = Number.isNaN(parsedDate.getTime()) ? void 0 : parsedDate.toISOString();
  } else if (typeof message.timestamp === "string") {
    timestamp = message.timestamp;
  }
  // Copy only the truthy provider-related fields, in a fixed order.
  const metadata = {};
  for (const key of ["api", "provider", "model", "usage", "stopReason"]) {
    if (message[key]) {
      metadata[key] = message[key];
    }
  }
  const content = extractTextContent(message.content);
  const toolCalls = extractToolCalls(message.content);
  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
    timestamp,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0
  };
}
/**
 * Extract plain text from a message's `content`.
 *
 * @param {unknown} content - Either a string or an array of content parts.
 * @returns {string | undefined} The string itself; or the `text` of every
 *   `{ type: "text", text: string }` part joined with newlines; or `undefined` when
 *   `content` is neither a string nor an array, or no text part is present.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const pieces = content
    .filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string")
    .map((part) => part.text);
  return pieces.length === 0 ? void 0 : pieces.join("\n");
}
/**
 * Collect tool calls from an array of message content parts.
 *
 * `tool_use` parts with a string `name` become `{ tool, input, id }` entries. A later
 * `tool_result` part whose `tool_use_id` matches an already-collected call's `id` adds
 * that part's `content` as the call's `output`.
 *
 * @param {unknown} content - Message content; anything but an array yields `[]`.
 * @returns {object[]} Tool calls in the order their `tool_use` parts appeared.
 */
function extractToolCalls(content) {
  const calls = [];
  if (!Array.isArray(content)) {
    return calls;
  }
  for (const entry of content) {
    if (!entry || typeof entry !== "object") {
      continue;
    }
    switch (entry.type) {
      case "tool_use":
        if (typeof entry.name === "string") {
          calls.push({
            tool: entry.name,
            input: entry.input,
            id: typeof entry.id === "string" ? entry.id : void 0
          });
        }
        break;
      case "tool_result": {
        if (typeof entry.tool_use_id !== "string") {
          break;
        }
        const position = calls.findIndex((call) => call.id === entry.tool_use_id);
        if (position !== -1) {
          // Replace rather than mutate, so the stored entry is a fresh object.
          calls[position] = {
            ...calls[position],
            output: entry.content
          };
        }
        break;
      }
      default:
        break;
    }
  }
  return calls;
}
/**
 * Return the content of the last assistant message that has truthy content.
 *
 * @param {Array<{role: string, content?: unknown}>} messages - Messages in order.
 * @returns {string} The content when it is a string, its JSON serialization when it is
 *   not, or `""` when no assistant message with content exists.
 */
function extractAssistantText2(messages) {
  let cursor = messages.length;
  while (cursor-- > 0) {
    const { role, content } = messages[cursor];
    if (role !== "assistant" || !content) {
      continue;
    }
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}
/**
 * Rewrite every `@[label]:` marker in a prompt to `[[label]]:`.
 *
 * @param {string} prompt - Prompt text.
 * @returns {string} The prompt with all such markers rewritten.
 */
function escapeAtSymbols(prompt) {
  const markerPattern = /@\[([^\]]+)\]:/g;
  const markerReplacement = "[[$1]]:";
  return prompt.replace(markerPattern, markerReplacement);
}
/**
 * Pick the text to use as error detail, preferring stderr over stdout.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output.
 * @returns {string | undefined} The first of the two that is non-empty after trimming
 *   (trimmed), or `undefined` when both are blank.
 */
function pickDetail2(stderr, stdout) {
  const fromStderr = stderr.trim();
  if (fromStderr.length > 0) {
    return fromStderr;
  }
  const fromStdout = stdout.trim();
  if (fromStdout.length === 0) {
    return void 0;
  }
  return fromStdout;
}
/**
 * Build the " after Ns" suffix used in timeout messages.
 *
 * @param {number | undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} ` after <seconds>s` with seconds rounded up, or `""` when the
 *   timeout is falsy or not positive.
 */
function formatTimeoutSuffix3(timeoutMs) {
  if (timeoutMs && !(timeoutMs <= 0)) {
    return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
  }
  return "";
}
/**
 * Spawn the Pi agent executable and collect its output.
 *
 * `options.executable` is split on whitespace: the first token is the program and the
 * remaining tokens are prepended to `options.args`. The process is started without a
 * shell and its stdin is closed immediately.
 *
 * @param {object} options
 * @param {string} options.executable - Program, optionally followed by fixed arguments.
 * @param {string[]} options.args - Arguments appended after the executable's own.
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {object} [options.env] - Environment for the child.
 * @param {AbortSignal} [options.signal] - Sends SIGTERM to the child when aborted.
 * @param {number} [options.timeoutMs] - When positive, SIGTERM is sent after this delay.
 * @param {(chunk: string) => void} [options.onStdoutChunk] - Called per stdout chunk.
 * @param {(chunk: string) => void} [options.onStderrChunk] - Called per stderr chunk.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   Resolves when the child's streams close; `exitCode` is -1 when the child did not
 *   exit with a numeric code. Rejects when the process cannot be spawned.
 */
async function defaultPiRunner(options) {
  return await new Promise((resolve, reject) => {
    // Trim before splitting: leading whitespace would otherwise yield an empty first
    // token and make spawn() fail on an empty command name.
    const [executable, ...executableArgs] = options.executable.trim().split(/\s+/);
    const child = spawn2(executable, [...executableArgs, ...options.args], {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: false
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        // Already aborted before we could listen: terminate right away.
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let the watchdog timer alone keep the event loop alive.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
      options.onStderrChunk?.(chunk);
    });
    // Nothing is sent on stdin; close it so the child sees EOF.
    child.stdin.end();
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
3831
+
3832
+ // src/evaluation/providers/vscode.ts
3833
+ import path11 from "node:path";
3834
+ import {
3835
+ dispatchAgentSession,
3836
+ dispatchBatchAgent,
3837
+ getSubagentRoot,
3838
+ provisionSubagents
3839
+ } from "subagent";
3840
+
3841
+ // src/evaluation/providers/vscode-templates.ts
// Request template handed to dispatchAgentSession as `requestTemplate` for single
// VS Code requests. It carries the {{userQuery}}, {{responseFileTmp}} and
// {{responseFileFinal}} placeholders (presumably substituted by the subagent
// dispatcher - confirm). The instructions tell the agent to write its full response
// to the temporary file, then run PowerShell commands that move it to the final path
// and delete subagent.lock.
3842
+ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
3843
+
3844
+ {{userQuery}}
3845
+
3846
+ [[ ## system_instructions ## ]]
3847
+
3848
+ **IMPORTANT**: Follow these exact steps:
3849
+ 1. Create and write your complete response to: {{responseFileTmp}}
3850
+ - Do NOT create any additional output files in the workspace.
3851
+ - All intended file outputs/changes MUST be written in your response file.
3852
+ - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
3853
+ 2. When completely finished, run these PowerShell commands to signal completion:
3854
+ \`\`\`
3855
+ Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
3856
+ if (Test-Path subagent.lock) { del subagent.lock }
3857
+ \`\`\`
3858
+
3859
+ Do not proceed to step 2 until your response is completely written to the temporary file.
3860
+ `;
// Request template handed to dispatchBatchAgent as `requestTemplate` for batched
// VS Code requests. It uses the same {{userQuery}}, {{responseFileTmp}} and
// {{responseFileFinal}} placeholders as the single-request template, but asks the
// agent only to rename the finished response file and explicitly not to unlock the
// workspace.
3861
+ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
3862
+
3863
+ {{userQuery}}
3864
+
3865
+ [[ ## system_instructions ## ]]
3866
+
3867
+ **IMPORTANT**: Follow these exact steps:
3868
+ 1. Create and write your complete response to: {{responseFileTmp}}
3869
+ - Do NOT create any additional output files in the workspace.
3870
+ - All intended file outputs/changes MUST be written in your response file.
3871
+ - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
3872
+ 2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
3873
+ 3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
3874
+ `;
3875
+
3876
+ // src/evaluation/providers/vscode.ts
/**
 * Provider that sends evaluation requests to a VS Code subagent and reads the answer
 * back from the response file(s) the dispatch call reports.
 */
var VSCodeProvider = class {
  id;
  kind;
  targetName;
  supportsBatch = true;
  config;
  /**
   * @param {string} targetName - Target name; combined with `kind` into `id`.
   * @param {object} config - Provider settings (`waitForResponse`, `dryRun`, `command`,
   *   `subagentRoot`, `workspaceTemplate` are read here).
   * @param {string} kind - Provider kind.
   */
  constructor(targetName, config, kind) {
    this.id = `${kind}:${targetName}`;
    this.kind = kind;
    this.targetName = targetName;
    this.config = config;
  }
  /**
   * Dispatch one request and return the assistant response.
   *
   * @param {object} request - Provider request (`signal`, `inputFiles`,
   *   `guideline_patterns` and the prompt fields are read).
   * @returns {Promise<object>} `{ outputMessages, raw }`; `outputMessages` is empty on a
   *   dry run.
   * @throws {Error} When the request is already aborted, or the session exits non-zero
   *   or reports no response file.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("VS Code provider request was aborted before dispatch");
    }
    const inputFiles = normalizeAttachments(request.inputFiles);
    const userQuery = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
    const session = await dispatchAgentSession({
      userQuery,
      extraAttachments: inputFiles,
      requestTemplate: AGENTV_REQUEST_TEMPLATE,
      wait: this.config.waitForResponse,
      dryRun: this.config.dryRun,
      vscodeCmd: this.config.command,
      subagentRoot: this.config.subagentRoot,
      workspaceTemplate: this.config.workspaceTemplate,
      silent: true
    });
    if (session.exitCode !== 0 || !session.responseFile) {
      throw new Error(session.error ?? "VS Code subagent did not produce a response");
    }
    if (this.config.dryRun) {
      return { outputMessages: [], raw: { session, inputFiles } };
    }
    const responseText = await readTextFile(session.responseFile);
    return {
      outputMessages: [{ role: "assistant", content: responseText }],
      raw: { session, inputFiles }
    };
  }
  /**
   * Dispatch several requests as one batch and return one response per request.
   *
   * @param {object[]} requests - Provider requests.
   * @returns {Promise<object[]>} Responses in request order; `[]` for an empty batch.
   * @throws {Error} When the session exits non-zero, reports no response files, or the
   *   number of response files differs from the number of requests.
   */
  async invokeBatch(requests) {
    if (requests.length === 0) {
      return [];
    }
    const prepared = requests.map((request) => ({
      request,
      inputFiles: normalizeAttachments(request.inputFiles)
    }));
    const combinedInputFiles = mergeAttachments(prepared.map((entry) => entry.inputFiles));
    const userQueries = prepared.map(
      (entry) => buildPromptDocument2(entry.request, entry.inputFiles, entry.request.guideline_patterns)
    );
    const session = await dispatchBatchAgent({
      userQueries,
      extraAttachments: combinedInputFiles,
      requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
      wait: this.config.waitForResponse,
      dryRun: this.config.dryRun,
      vscodeCmd: this.config.command,
      subagentRoot: this.config.subagentRoot,
      workspaceTemplate: this.config.workspaceTemplate,
      silent: true
    });
    if (session.exitCode !== 0 || !session.responseFiles) {
      throw new Error(session.error ?? "VS Code subagent did not produce batch responses");
    }
    if (this.config.dryRun) {
      return prepared.map((entry) => ({
        outputMessages: [],
        raw: {
          session,
          inputFiles: entry.inputFiles,
          allInputFiles: combinedInputFiles
        }
      }));
    }
    const { responseFiles } = session;
    if (responseFiles.length !== requests.length) {
      throw new Error(
        `VS Code batch returned ${responseFiles.length} responses for ${requests.length} requests`
      );
    }
    const responses = [];
    // Files are read one after another, in the order the session lists them.
    for (let index = 0; index < responseFiles.length; index += 1) {
      const responseFile = responseFiles[index];
      const responseText = await readTextFile(responseFile);
      responses.push({
        outputMessages: [{ role: "assistant", content: responseText }],
        raw: {
          session,
          inputFiles: prepared[index]?.inputFiles,
          allInputFiles: combinedInputFiles,
          responseFile
        }
      });
    }
    return responses;
  }
};
3988
+ function buildPromptDocument2(request, attachments, guidelinePatterns) {
3989
+ const parts = [];
3990
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
3991
+ parts.push(request.systemPrompt.trim());
3992
+ }
3993
+ const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
3994
+ const attachmentFiles = collectAttachmentFiles(attachments);
3995
+ const nonGuidelineAttachments = attachmentFiles.filter((file) => !guidelineFiles.includes(file));
3996
+ const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
3997
+ if (prereadBlock.length > 0) {
3208
3998
  parts.push("\n", prereadBlock);
3209
3999
  }
3210
4000
  parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
@@ -3215,7 +4005,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
3215
4005
  return "";
3216
4006
  }
3217
4007
  const buildList = (files) => files.map((absolutePath) => {
3218
- const fileName = path10.basename(absolutePath);
4008
+ const fileName = path11.basename(absolutePath);
3219
4009
  const fileUri = pathToFileUri2(absolutePath);
3220
4010
  return `* [${fileName}](${fileUri})`;
3221
4011
  });
@@ -3240,8 +4030,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3240
4030
  }
3241
4031
  const unique = /* @__PURE__ */ new Map();
3242
4032
  for (const attachment of attachments) {
3243
- const absolutePath = path10.resolve(attachment);
3244
- const normalized = absolutePath.split(path10.sep).join("/");
4033
+ const absolutePath = path11.resolve(attachment);
4034
+ const normalized = absolutePath.split(path11.sep).join("/");
3245
4035
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3246
4036
  if (!unique.has(absolutePath)) {
3247
4037
  unique.set(absolutePath, absolutePath);
@@ -3256,7 +4046,7 @@ function collectAttachmentFiles(attachments) {
3256
4046
  }
3257
4047
  const unique = /* @__PURE__ */ new Map();
3258
4048
  for (const attachment of attachments) {
3259
- const absolutePath = path10.resolve(attachment);
4049
+ const absolutePath = path11.resolve(attachment);
3260
4050
  if (!unique.has(absolutePath)) {
3261
4051
  unique.set(absolutePath, absolutePath);
3262
4052
  }
@@ -3264,7 +4054,7 @@ function collectAttachmentFiles(attachments) {
3264
4054
  return Array.from(unique.values());
3265
4055
  }
3266
4056
  function pathToFileUri2(filePath) {
3267
- const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
4057
+ const absolutePath = path11.isAbsolute(filePath) ? filePath : path11.resolve(filePath);
3268
4058
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3269
4059
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3270
4060
  return `file:///${normalizedPath}`;
@@ -3277,7 +4067,7 @@ function normalizeAttachments(attachments) {
3277
4067
  }
3278
4068
  const deduped = /* @__PURE__ */ new Set();
3279
4069
  for (const attachment of attachments) {
3280
- deduped.add(path10.resolve(attachment));
4070
+ deduped.add(path11.resolve(attachment));
3281
4071
  }
3282
4072
  return Array.from(deduped);
3283
4073
  }
@@ -3286,7 +4076,7 @@ function mergeAttachments(all) {
3286
4076
  for (const list of all) {
3287
4077
  if (!list) continue;
3288
4078
  for (const inputFile of list) {
3289
- deduped.add(path10.resolve(inputFile));
4079
+ deduped.add(path11.resolve(inputFile));
3290
4080
  }
3291
4081
  }
3292
4082
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3335,7 +4125,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
3335
4125
  // src/evaluation/providers/targets-file.ts
3336
4126
  import { constants as constants3 } from "node:fs";
3337
4127
  import { access as access3, readFile as readFile6 } from "node:fs/promises";
3338
- import path11 from "node:path";
4128
+ import path12 from "node:path";
3339
4129
  import { parse as parse3 } from "yaml";
3340
4130
  function isRecord(value) {
3341
4131
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -3372,7 +4162,7 @@ async function fileExists3(filePath) {
3372
4162
  }
3373
4163
  }
3374
4164
  async function readTargetDefinitions(filePath) {
3375
- const absolutePath = path11.resolve(filePath);
4165
+ const absolutePath = path12.resolve(filePath);
3376
4166
  if (!await fileExists3(absolutePath)) {
3377
4167
  throw new Error(`targets.yaml not found at ${absolutePath}`);
3378
4168
  }
@@ -3404,6 +4194,8 @@ function createProvider(target) {
3404
4194
  return new CliProvider(target.name, target.config);
3405
4195
  case "codex":
3406
4196
  return new CodexProvider(target.name, target.config);
4197
+ case "pi-coding-agent":
4198
+ return new PiCodingAgentProvider(target.name, target.config);
3407
4199
  case "mock":
3408
4200
  return new MockProvider(target.name, target.config);
3409
4201
  case "vscode":
@@ -3423,6 +4215,74 @@ function resolveAndCreateProvider(definition, env = process.env) {
3423
4215
  // src/evaluation/evaluators.ts
3424
4216
  import { generateText as generateText2 } from "ai";
3425
4217
  import { z } from "zod";
4218
+
4219
+ // src/runtime/exec.ts
/**
 * Look up `Bun.spawn` on the global object.
 *
 * @returns {Function | undefined} `globalThis.Bun.spawn` when it is a function,
 *   otherwise `undefined`.
 */
function getBunSpawn() {
  const candidate = globalThis.Bun?.spawn;
  if (typeof candidate !== "function") {
    return void 0;
  }
  return candidate;
}
/**
 * Run a shell command, feed it `stdinPayload` on stdin, and collect its output.
 *
 * Uses `Bun.spawn` (via `sh -c`) when available, otherwise Node's
 * `child_process.spawn` with `shell: true`.
 *
 * @param {string} command - Shell command line.
 * @param {string} stdinPayload - Text written to the child's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} [options] - Working directory and an
 *   optional timeout; when the timeout elapses the child is killed.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>} Captured output
 *   and exit code. On the Node path a child terminated by a signal is reported as -1.
 * @throws {Error} `Process timed out after <n>ms` on timeout, or the spawn error.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const bunSpawn = getBunSpawn();
  if (bunSpawn) {
    const proc = bunSpawn({
      cmd: ["sh", "-c", command],
      cwd: options.cwd,
      stdin: new TextEncoder().encode(stdinPayload),
      stdout: "pipe",
      stderr: "pipe"
    });
    let timedOut = false;
    const timeout = options.timeoutMs ? setTimeout(() => {
      timedOut = true;
      proc.kill();
    }, options.timeoutMs) : void 0;
    try {
      // Drain both pipes concurrently: reading them one after the other can stall
      // when the child blocks writing to the pipe that is not being read yet.
      const [stdout, stderr, exitCode] = await Promise.all([
        new Response(proc.stdout).text(),
        new Response(proc.stderr).text(),
        proc.exited
      ]);
      if (timedOut) {
        // Same failure as the Node path, so callers see one timeout behavior.
        throw new Error(`Process timed out after ${options.timeoutMs}ms`);
      }
      return { stdout, stderr, exitCode };
    } finally {
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
    }
  }
  const { spawn: spawn3 } = await import("node:child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn3(command, {
      shell: true,
      cwd: options.cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    let stdout = "";
    let stderr = "";
    const timeout = options.timeoutMs ? setTimeout(() => {
      child.kill();
      reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
    }, options.timeoutMs) : void 0;
    const clearTimer = () => {
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
    };
    // Decode at the stream level so a multi-byte character split across two chunks
    // is not corrupted.
    child.stdout?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr?.setEncoding("utf8");
    child.stderr?.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", (error) => {
      clearTimer();
      reject(error);
    });
    // "close" fires after the stdio streams have ended, so all output has been
    // received; "exit" can fire while data is still buffered.
    child.on("close", (code) => {
      clearTimer();
      // A null code means the child was terminated by a signal; report that as a
      // failure rather than as exit code 0.
      resolve({ stdout, stderr, exitCode: typeof code === "number" ? code : -1 });
    });
    // A child that exits without reading stdin makes the write fail (EPIPE). Without
    // a listener that becomes an unhandled "error" event; the exit code already
    // tells the caller what happened, so the write error itself is ignored.
    child.stdin?.on("error", () => {
    });
    child.stdin?.end(stdinPayload);
  });
}
4284
+
4285
+ // src/evaluation/evaluators.ts
3426
4286
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3427
4287
 
3428
4288
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3698,17 +4558,17 @@ var CodeEvaluator = class {
3698
4558
  const inputPayload = JSON.stringify(
3699
4559
  {
3700
4560
  question: context.evalCase.question,
3701
- expected_outcome: context.evalCase.expected_outcome,
3702
- expected_messages: context.evalCase.expected_messages,
3703
- reference_answer: context.evalCase.reference_answer,
3704
- candidate_answer: context.candidate,
3705
- output_messages: context.outputMessages ?? null,
3706
- guideline_files: context.evalCase.guideline_paths,
3707
- input_files: context.evalCase.file_paths.filter(
3708
- (path13) => !context.evalCase.guideline_paths.includes(path13)
4561
+ expectedOutcome: context.evalCase.expected_outcome,
4562
+ expectedMessages: context.evalCase.expected_messages,
4563
+ referenceAnswer: context.evalCase.reference_answer,
4564
+ candidateAnswer: context.candidate,
4565
+ outputMessages: context.outputMessages ?? null,
4566
+ guidelineFiles: context.evalCase.guideline_paths,
4567
+ inputFiles: context.evalCase.file_paths.filter(
4568
+ (path14) => !context.evalCase.guideline_paths.includes(path14)
3709
4569
  ),
3710
- input_messages: context.evalCase.input_messages,
3711
- candidate_trace_summary: context.traceSummary ?? null
4570
+ inputMessages: context.evalCase.input_messages,
4571
+ traceSummary: context.traceSummary ?? null
3712
4572
  },
3713
4573
  null,
3714
4574
  2
@@ -3778,43 +4638,17 @@ function calculateRubricScore(result, rubrics) {
3778
4638
  return { score, verdict, hits, misses };
3779
4639
  }
3780
4640
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
3781
- const { spawn: spawn2 } = await import("node:child_process");
3782
- return await new Promise((resolve, reject) => {
3783
- const child = spawn2(scriptPath, {
3784
- shell: true,
3785
- cwd
3786
- });
3787
- let stdout = "";
3788
- let stderr = "";
3789
- const timeout = agentTimeoutMs ? setTimeout(() => {
3790
- child.kill();
3791
- reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
3792
- }, agentTimeoutMs) : void 0;
3793
- child.stdout?.on("data", (data) => {
3794
- stdout += data.toString();
3795
- });
3796
- child.stderr?.on("data", (data) => {
3797
- stderr += data.toString();
3798
- });
3799
- child.on("error", (error) => {
3800
- if (timeout !== void 0) {
3801
- clearTimeout(timeout);
3802
- }
3803
- reject(error);
3804
- });
3805
- child.on("exit", (code) => {
3806
- if (timeout !== void 0) {
3807
- clearTimeout(timeout);
3808
- }
3809
- if (code && code !== 0 && stderr.length > 0) {
3810
- reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
3811
- return;
3812
- }
3813
- resolve(stdout.trim());
3814
- });
3815
- child.stdin?.write(input);
3816
- child.stdin?.end();
4641
+ const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
4642
+ cwd,
4643
+ timeoutMs: agentTimeoutMs
3817
4644
  });
4645
+ if (exitCode !== 0) {
4646
+ const trimmedErr = stderr.trim();
4647
+ throw new Error(
4648
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
4649
+ );
4650
+ }
4651
+ return stdout.trim();
3818
4652
  }
3819
4653
  function parseJsonSafe(payload) {
3820
4654
  try {
@@ -3828,6 +4662,33 @@ function substituteVariables(template, variables) {
3828
4662
  return variables[varName] ?? match;
3829
4663
  });
3830
4664
  }
/**
 * Structural equality for plain values, arrays and objects.
 *
 * Primitives are compared with `===`. Arrays must have equal length and pairwise-equal
 * elements. Objects must have the same number of own enumerable keys, with every key
 * of `a` present on `b` and holding a deep-equal value.
 *
 * @param {unknown} a
 * @param {unknown} b
 * @returns {boolean} Whether the two values are deeply equal.
 */
function deepEqual(a, b) {
  if (a === b) {
    return true;
  }
  // From here on the values are not identical, so anything that is not a pair of
  // non-null objects is unequal.
  const bothObjects = a !== null && b !== null && typeof a === "object" && typeof b === "object";
  if (!bothObjects) {
    return false;
  }
  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) {
    return false;
  }
  if (aIsArray) {
    if (a.length !== b.length) {
      return false;
    }
    return !a.some((val, i) => !deepEqual(val, b[i]));
  }
  const keysOfA = Object.keys(a);
  if (keysOfA.length !== Object.keys(b).length) {
    return false;
  }
  for (const key of keysOfA) {
    if (!Object.hasOwn(b, key) || !deepEqual(a[key], b[key])) {
      return false;
    }
  }
  return true;
}
/**
 * Check whether a tool call's actual arguments satisfy the expected arguments.
 *
 * Matching is partial: only the keys listed in `expected` are compared (deeply);
 * extra keys on `actual` are ignored.
 *
 * @param {object | "any" | undefined} expected - Expected arguments; `undefined` or the
 *   string "any" accepts everything.
 * @param {unknown} actual - Arguments recorded for the call.
 * @returns {boolean} `true` when every expected key is an own property of `actual` with
 *   a deep-equal value; `false` when `actual` is `undefined`, or is `null` while
 *   `expected` lists at least one key.
 */
function argsMatch(expected, actual) {
  if (expected === void 0 || expected === "any") {
    return true;
  }
  if (actual === void 0) {
    return false;
  }
  return Object.keys(expected).every(
    // Object.hasOwn throws on null, so a null input is treated as "key missing".
    (key) => actual !== null && Object.hasOwn(actual, key) && deepEqual(expected[key], actual[key])
  );
}
3831
4692
  var ToolTrajectoryEvaluator = class {
3832
4693
  kind = "tool_trajectory";
3833
4694
  config;
@@ -3884,7 +4745,10 @@ var ToolTrajectoryEvaluator = class {
3884
4745
  for (const message of messages) {
3885
4746
  if (message.toolCalls) {
3886
4747
  for (const call of message.toolCalls) {
3887
- toolCalls.push({ name: call.tool });
4748
+ toolCalls.push({
4749
+ name: call.tool,
4750
+ args: call.input
4751
+ });
3888
4752
  }
3889
4753
  }
3890
4754
  }
@@ -3953,18 +4817,29 @@ var ToolTrajectoryEvaluator = class {
3953
4817
  const misses = [];
3954
4818
  let actualIndex = 0;
3955
4819
  for (let i = 0; i < expected.length; i++) {
3956
- const expectedTool = expected[i].tool;
4820
+ const expectedItem = expected[i];
4821
+ const expectedTool = expectedItem.tool;
3957
4822
  let found = false;
4823
+ let argsMismatch = false;
3958
4824
  while (actualIndex < toolCalls.length) {
3959
- if (toolCalls[actualIndex].name === expectedTool) {
3960
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
4825
+ const actualCall = toolCalls[actualIndex];
4826
+ if (actualCall.name === expectedTool) {
4827
+ if (argsMatch(expectedItem.args, actualCall.args)) {
4828
+ hits.push(`Found ${expectedTool} at position ${actualIndex}`);
4829
+ actualIndex++;
4830
+ found = true;
4831
+ break;
4832
+ }
4833
+ misses.push(
4834
+ `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
4835
+ );
3961
4836
  actualIndex++;
3962
- found = true;
4837
+ argsMismatch = true;
3963
4838
  break;
3964
4839
  }
3965
4840
  actualIndex++;
3966
4841
  }
3967
- if (!found) {
4842
+ if (!found && !argsMismatch) {
3968
4843
  misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
3969
4844
  }
3970
4845
  }
@@ -3995,10 +4870,16 @@ var ToolTrajectoryEvaluator = class {
3995
4870
  }
3996
4871
  const checkLength = Math.min(expected.length, toolCalls.length);
3997
4872
  for (let i = 0; i < checkLength; i++) {
3998
- const expectedTool = expected[i].tool;
3999
- const actualTool = toolCalls[i].name;
4873
+ const expectedItem = expected[i];
4874
+ const expectedTool = expectedItem.tool;
4875
+ const actualCall = toolCalls[i];
4876
+ const actualTool = actualCall.name;
4000
4877
  if (actualTool === expectedTool) {
4001
- hits.push(`Position ${i}: ${expectedTool} \u2713`);
4878
+ if (argsMatch(expectedItem.args, actualCall.args)) {
4879
+ hits.push(`Position ${i}: ${expectedTool}`);
4880
+ } else {
4881
+ misses.push(`Position ${i}: ${expectedTool} args mismatch`);
4882
+ }
4002
4883
  } else {
4003
4884
  misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
4004
4885
  }
@@ -4242,9 +5123,9 @@ var CompositeEvaluator = class {
4242
5123
  };
4243
5124
 
4244
5125
  // src/evaluation/orchestrator.ts
4245
- import { createHash, randomUUID as randomUUID2 } from "node:crypto";
4246
- import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
4247
- import path12 from "node:path";
5126
+ import { createHash, randomUUID as randomUUID3 } from "node:crypto";
5127
+ import { mkdir as mkdir3, writeFile as writeFile3 } from "node:fs/promises";
5128
+ import path13 from "node:path";
4248
5129
 
4249
5130
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
4250
5131
  var Node = class {
@@ -4640,7 +5521,12 @@ async function runBatchEvaluation(options) {
4640
5521
  const promptInputs = promptInputsList[i];
4641
5522
  const providerResponse = batchResponse[i];
4642
5523
  const outputMessages = providerResponse.outputMessages;
4643
- const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5524
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5525
+ const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
5526
+ tokenUsage: providerResponse.tokenUsage,
5527
+ costUsd: providerResponse.costUsd,
5528
+ durationMs: providerResponse.durationMs
5529
+ }) : void 0;
4644
5530
  const candidate = extractLastAssistantContent(outputMessages);
4645
5531
  let result;
4646
5532
  try {
@@ -4761,7 +5647,12 @@ async function runEvalCase(options) {
4761
5647
  await cache.set(cacheKey, providerResponse);
4762
5648
  }
4763
5649
  const outputMessages = providerResponse.outputMessages;
4764
- const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5650
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5651
+ const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
5652
+ tokenUsage: providerResponse.tokenUsage,
5653
+ costUsd: providerResponse.costUsd,
5654
+ durationMs: providerResponse.durationMs
5655
+ }) : void 0;
4765
5656
  const candidate = extractLastAssistantContent(outputMessages);
4766
5657
  try {
4767
5658
  return await evaluateCandidate({
@@ -4834,21 +5725,21 @@ async function evaluateCandidate(options) {
4834
5725
  }
4835
5726
  return {
4836
5727
  timestamp: completedAt.toISOString(),
4837
- eval_id: evalCase.id,
5728
+ evalId: evalCase.id,
4838
5729
  dataset: evalCase.dataset,
4839
- conversation_id: evalCase.conversation_id,
5730
+ conversationId: evalCase.conversation_id,
4840
5731
  score: score.score,
4841
5732
  hits: score.hits,
4842
5733
  misses: score.misses,
4843
- candidate_answer: candidate,
5734
+ candidateAnswer: candidate,
4844
5735
  target: target.name,
4845
5736
  reasoning: score.reasoning,
4846
- raw_aspects: score.rawAspects,
4847
- agent_provider_request: agentProviderRequest,
4848
- lm_provider_request: lmProviderRequest,
4849
- evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4850
- evaluator_results: evaluatorResults,
4851
- trace_summary: traceSummary
5737
+ rawAspects: score.rawAspects,
5738
+ agentProviderRequest,
5739
+ lmProviderRequest,
5740
+ evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
5741
+ evaluatorResults,
5742
+ traceSummary
4852
5743
  };
4853
5744
  }
4854
5745
  async function runEvaluatorsForCase(options) {
@@ -4946,7 +5837,7 @@ async function runEvaluatorList(options) {
4946
5837
  hits: score2.hits,
4947
5838
  misses: score2.misses,
4948
5839
  reasoning: score2.reasoning,
4949
- evaluator_provider_request: score2.evaluatorRawRequest
5840
+ evaluatorProviderRequest: score2.evaluatorRawRequest
4950
5841
  });
4951
5842
  }
4952
5843
  if (evaluator.type === "code") {
@@ -4977,11 +5868,11 @@ async function runEvaluatorList(options) {
4977
5868
  hits: score2.hits,
4978
5869
  misses: score2.misses,
4979
5870
  reasoning: score2.reasoning,
4980
- evaluator_provider_request: score2.evaluatorRawRequest
5871
+ evaluatorProviderRequest: score2.evaluatorRawRequest
4981
5872
  });
4982
5873
  }
4983
5874
  if (evaluator.type === "composite") {
4984
- const evalFileDir = evalCase.guideline_paths[0] ? path12.dirname(evalCase.guideline_paths[0]) : process.cwd();
5875
+ const evalFileDir = evalCase.guideline_paths[0] ? path13.dirname(evalCase.guideline_paths[0]) : process.cwd();
4985
5876
  const createEvaluator = (memberConfig) => {
4986
5877
  switch (memberConfig.type) {
4987
5878
  case "llm_judge":
@@ -5034,8 +5925,8 @@ async function runEvaluatorList(options) {
5034
5925
  hits: score2.hits,
5035
5926
  misses: score2.misses,
5036
5927
  reasoning: score2.reasoning,
5037
- evaluator_provider_request: score2.evaluatorRawRequest,
5038
- evaluator_results: mapChildResults(score2.evaluatorResults)
5928
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
5929
+ evaluatorResults: mapChildResults(score2.evaluatorResults)
5039
5930
  });
5040
5931
  }
5041
5932
  if (evaluator.type === "tool_trajectory") {
@@ -5193,22 +6084,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
5193
6084
  async function dumpPrompt(directory, evalCase, promptInputs) {
5194
6085
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
5195
6086
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
5196
- const filePath = path12.resolve(directory, filename);
5197
- await mkdir2(path12.dirname(filePath), { recursive: true });
6087
+ const filePath = path13.resolve(directory, filename);
6088
+ await mkdir3(path13.dirname(filePath), { recursive: true });
5198
6089
  const payload = {
5199
6090
  eval_id: evalCase.id,
5200
6091
  question: promptInputs.question,
5201
6092
  guidelines: promptInputs.guidelines,
5202
6093
  guideline_paths: evalCase.guideline_paths
5203
6094
  };
5204
- await writeFile2(filePath, JSON.stringify(payload, null, 2), "utf8");
6095
+ await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
5205
6096
  }
5206
6097
  function sanitizeFilename(value) {
5207
6098
  if (!value) {
5208
6099
  return "prompt";
5209
6100
  }
5210
6101
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
5211
- return sanitized.length > 0 ? sanitized : randomUUID2();
6102
+ return sanitized.length > 0 ? sanitized : randomUUID3();
5212
6103
  }
5213
6104
  async function invokeProvider(provider, options) {
5214
6105
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -5265,17 +6156,17 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
5265
6156
  }
5266
6157
  return {
5267
6158
  timestamp: timestamp.toISOString(),
5268
- eval_id: evalCase.id,
6159
+ evalId: evalCase.id,
5269
6160
  dataset: evalCase.dataset,
5270
- conversation_id: evalCase.conversation_id,
6161
+ conversationId: evalCase.conversation_id,
5271
6162
  score: 0,
5272
6163
  hits: [],
5273
6164
  misses: [`Error: ${message}`],
5274
- candidate_answer: `Error occurred: ${message}`,
6165
+ candidateAnswer: `Error occurred: ${message}`,
5275
6166
  target: targetName,
5276
- raw_aspects: [],
5277
- agent_provider_request: agentProviderRequest,
5278
- lm_provider_request: lmProviderRequest,
6167
+ rawAspects: [],
6168
+ agentProviderRequest,
6169
+ lmProviderRequest,
5279
6170
  error: message
5280
6171
  };
5281
6172
  }
@@ -5320,8 +6211,8 @@ function mapChildResults(children) {
5320
6211
  hits: child.hits,
5321
6212
  misses: child.misses,
5322
6213
  reasoning: child.reasoning,
5323
- evaluator_provider_request: child.evaluatorRawRequest,
5324
- evaluator_results: mapChildResults(child.evaluatorResults)
6214
+ evaluatorProviderRequest: child.evaluatorRawRequest,
6215
+ evaluatorResults: mapChildResults(child.evaluatorResults)
5325
6216
  }));
5326
6217
  }
5327
6218
  function computeWeightedMean(entries) {
@@ -5422,17 +6313,21 @@ function createAgentKernel() {
5422
6313
  export {
5423
6314
  CodeEvaluator,
5424
6315
  CompositeEvaluator,
6316
+ DEFAULT_EXPLORATION_TOOLS,
5425
6317
  LlmJudgeEvaluator,
5426
6318
  TEST_MESSAGE_ROLES,
5427
6319
  ToolTrajectoryEvaluator,
6320
+ avgToolDurationMs,
5428
6321
  buildDirectoryChain,
5429
6322
  buildPromptInputs,
5430
6323
  buildSearchRoots,
5431
6324
  computeTraceSummary,
5432
6325
  consumeCodexLogEntries,
6326
+ consumePiLogEntries,
5433
6327
  createAgentKernel,
5434
6328
  createProvider,
5435
6329
  ensureVSCodeSubagents,
6330
+ explorationRatio,
5436
6331
  extractCodeBlocks,
5437
6332
  fileExists,
5438
6333
  findGitRoot,
@@ -5446,6 +6341,7 @@ export {
5446
6341
  isTestMessageRole,
5447
6342
  listTargetNames,
5448
6343
  loadEvalCases,
6344
+ mergeExecutionMetrics,
5449
6345
  normalizeLineEndings,
5450
6346
  readJsonFile,
5451
6347
  readTargetDefinitions,
@@ -5456,6 +6352,8 @@ export {
5456
6352
  resolveTargetDefinition,
5457
6353
  runEvalCase,
5458
6354
  runEvaluation,
5459
- subscribeToCodexLogEntries
6355
+ subscribeToCodexLogEntries,
6356
+ subscribeToPiLogEntries,
6357
+ tokensPerTool
5460
6358
  };
5461
6359
  //# sourceMappingURL=index.js.map