@agentv/core 1.3.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,17 +32,21 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  CompositeEvaluator: () => CompositeEvaluator,
35
+ DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
35
36
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
36
37
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
37
38
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
39
+ avgToolDurationMs: () => avgToolDurationMs,
38
40
  buildDirectoryChain: () => buildDirectoryChain2,
39
41
  buildPromptInputs: () => buildPromptInputs,
40
42
  buildSearchRoots: () => buildSearchRoots2,
41
43
  computeTraceSummary: () => computeTraceSummary,
42
44
  consumeCodexLogEntries: () => consumeCodexLogEntries,
45
+ consumePiLogEntries: () => consumePiLogEntries,
43
46
  createAgentKernel: () => createAgentKernel,
44
47
  createProvider: () => createProvider,
45
48
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
49
+ explorationRatio: () => explorationRatio,
46
50
  extractCodeBlocks: () => extractCodeBlocks,
47
51
  fileExists: () => fileExists2,
48
52
  findGitRoot: () => findGitRoot,
@@ -54,10 +58,9 @@ __export(index_exports, {
54
58
  isJsonValue: () => isJsonValue,
55
59
  isTestMessage: () => isTestMessage,
56
60
  isTestMessageRole: () => isTestMessageRole,
57
- isTraceEvent: () => isTraceEvent,
58
- isTraceEventType: () => isTraceEventType,
59
61
  listTargetNames: () => listTargetNames,
60
62
  loadEvalCases: () => loadEvalCases,
63
+ mergeExecutionMetrics: () => mergeExecutionMetrics,
61
64
  normalizeLineEndings: () => normalizeLineEndings,
62
65
  readJsonFile: () => readJsonFile,
63
66
  readTargetDefinitions: () => readTargetDefinitions,
@@ -68,7 +71,9 @@ __export(index_exports, {
68
71
  resolveTargetDefinition: () => resolveTargetDefinition,
69
72
  runEvalCase: () => runEvalCase,
70
73
  runEvaluation: () => runEvaluation,
71
- subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
74
+ subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
75
+ subscribeToPiLogEntries: () => subscribeToPiLogEntries,
76
+ tokensPerTool: () => tokensPerTool
72
77
  });
73
78
  module.exports = __toCommonJS(index_exports);
74
79
 
@@ -135,33 +140,69 @@ function getHitCount(result) {
135
140
  }
136
141
 
137
142
  // src/evaluation/trace.ts
138
- function isTraceEventType(value) {
139
- return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
140
- }
141
- function isTraceEvent(value) {
142
- if (typeof value !== "object" || value === null) {
143
- return false;
144
- }
145
- const candidate = value;
146
- return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
147
- }
148
- function computeTraceSummary(trace) {
143
+ function computeTraceSummary(messages) {
149
144
  const toolCallCounts = {};
150
- let errorCount = 0;
151
- for (const event of trace) {
152
- if (event.type === "tool_call" && event.name) {
153
- toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
154
- }
155
- if (event.type === "error") {
156
- errorCount++;
145
+ let totalToolCalls = 0;
146
+ for (const message of messages) {
147
+ if (!message.toolCalls) continue;
148
+ for (const toolCall of message.toolCalls) {
149
+ toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
150
+ totalToolCalls++;
157
151
  }
158
152
  }
159
153
  const toolNames = Object.keys(toolCallCounts).sort();
160
154
  return {
161
- eventCount: trace.length,
155
+ eventCount: totalToolCalls,
162
156
  toolNames,
163
157
  toolCallsByName: toolCallCounts,
164
- errorCount
158
+ errorCount: 0
159
+ };
160
+ }
161
+ var DEFAULT_EXPLORATION_TOOLS = [
162
+ "read",
163
+ "grep",
164
+ "glob",
165
+ "search",
166
+ "list",
167
+ "Read",
168
+ "Grep",
169
+ "Glob",
170
+ "WebSearch",
171
+ "WebFetch"
172
+ ];
173
+ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
174
+ if (summary.eventCount === 0) return void 0;
175
+ const explorationCalls = explorationTools.reduce(
176
+ (sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0),
177
+ 0
178
+ );
179
+ return explorationCalls / summary.eventCount;
180
+ }
181
+ function tokensPerTool(summary) {
182
+ if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
183
+ const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
184
+ return totalTokens / summary.eventCount;
185
+ }
186
+ function avgToolDurationMs(summary) {
187
+ if (!summary.toolDurations) return void 0;
188
+ let totalDuration = 0;
189
+ let totalCalls = 0;
190
+ for (const durations of Object.values(summary.toolDurations)) {
191
+ for (const duration of durations) {
192
+ totalDuration += duration;
193
+ totalCalls++;
194
+ }
195
+ }
196
+ if (totalCalls === 0) return void 0;
197
+ return totalDuration / totalCalls;
198
+ }
199
+ function mergeExecutionMetrics(summary, metrics) {
200
+ if (!metrics) return summary;
201
+ return {
202
+ ...summary,
203
+ tokenUsage: metrics.tokenUsage,
204
+ costUsd: metrics.costUsd,
205
+ durationMs: metrics.durationMs
165
206
  };
166
207
  }
167
208
 
@@ -437,7 +478,8 @@ var TEMPLATE_VARIABLES = {
437
478
  QUESTION: "question",
438
479
  EXPECTED_OUTCOME: "expected_outcome",
439
480
  REFERENCE_ANSWER: "reference_answer",
440
- INPUT_MESSAGES: "input_messages"
481
+ INPUT_MESSAGES: "input_messages",
482
+ OUTPUT_MESSAGES: "output_messages"
441
483
  };
442
484
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
443
485
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
@@ -677,7 +719,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
677
719
  expected = [];
678
720
  for (const item of rawExpected) {
679
721
  if (isJsonObject2(item) && typeof item.tool === "string") {
680
- expected.push({ tool: item.tool });
722
+ let args;
723
+ if (item.args === "any") {
724
+ args = "any";
725
+ } else if (isJsonObject2(item.args)) {
726
+ args = item.args;
727
+ }
728
+ expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
681
729
  }
682
730
  }
683
731
  }
@@ -1320,16 +1368,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1320
1368
  }) : [];
1321
1369
  const codeSnippets = extractCodeBlocks(inputSegments);
1322
1370
  let referenceAnswer = "";
1323
- if (outputSegments.length > 1) {
1324
- referenceAnswer = JSON.stringify(outputSegments, null, 2);
1325
- } else if (outputSegments.length === 1) {
1326
- const singleMessage = outputSegments[0];
1327
- if (typeof singleMessage.content === "string") {
1328
- referenceAnswer = singleMessage.content;
1329
- } else if (singleMessage.content) {
1330
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1331
- } else if (singleMessage.tool_calls) {
1332
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1371
+ if (outputSegments.length > 0) {
1372
+ const lastMessage = outputSegments[outputSegments.length - 1];
1373
+ const content = lastMessage.content;
1374
+ const toolCalls = lastMessage.tool_calls;
1375
+ if (typeof content === "string") {
1376
+ referenceAnswer = content;
1377
+ } else if (content !== void 0 && content !== null) {
1378
+ referenceAnswer = JSON.stringify(content, null, 2);
1379
+ } else if (toolCalls !== void 0 && toolCalls !== null) {
1380
+ referenceAnswer = JSON.stringify(toolCalls, null, 2);
1333
1381
  }
1334
1382
  }
1335
1383
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
@@ -1772,11 +1820,11 @@ async function invokeModel(options) {
1772
1820
  return mapResponse(result);
1773
1821
  }
1774
1822
  function mapResponse(result) {
1823
+ const content = result.text ?? "";
1775
1824
  return {
1776
- text: result.text ?? "",
1777
- reasoning: result.reasoningText ?? void 0,
1778
1825
  raw: result,
1779
- usage: toJsonObject(result.totalUsage ?? result.usage)
1826
+ usage: toJsonObject(result.totalUsage ?? result.usage),
1827
+ outputMessages: [{ role: "assistant", content }]
1780
1828
  };
1781
1829
  }
1782
1830
  function toJsonObject(value) {
@@ -1929,6 +1977,7 @@ var CliProvider = class {
1929
1977
  config;
1930
1978
  runCommand;
1931
1979
  verbose;
1980
+ keepTempFiles;
1932
1981
  healthcheckPromise;
1933
1982
  constructor(targetName, config, runner = defaultCommandRunner) {
1934
1983
  this.targetName = targetName;
@@ -1936,6 +1985,7 @@ var CliProvider = class {
1936
1985
  this.config = config;
1937
1986
  this.runCommand = runner;
1938
1987
  this.verbose = config.verbose ?? false;
1988
+ this.keepTempFiles = config.keepTempFiles ?? false;
1939
1989
  }
1940
1990
  async invoke(request) {
1941
1991
  if (request.signal?.aborted) {
@@ -1950,12 +2000,14 @@ var CliProvider = class {
1950
2000
  `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1951
2001
  );
1952
2002
  }
2003
+ const startTime = Date.now();
1953
2004
  const result = await this.runCommand(renderedCommand, {
1954
2005
  cwd: this.config.cwd,
1955
2006
  env: process.env,
1956
2007
  timeoutMs: this.config.timeoutMs,
1957
2008
  signal: request.signal
1958
2009
  });
2010
+ const measuredDurationMs = Date.now() - startTime;
1959
2011
  if (result.failed || (result.exitCode ?? 0) !== 0) {
1960
2012
  if (request.signal?.aborted) {
1961
2013
  throw new Error("CLI provider request was aborted");
@@ -1973,8 +2025,10 @@ var CliProvider = class {
1973
2025
  const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1974
2026
  const parsed = this.parseOutputContent(responseContent);
1975
2027
  return {
1976
- text: parsed.text,
1977
- trace: parsed.trace,
2028
+ outputMessages: parsed.outputMessages,
2029
+ tokenUsage: parsed.tokenUsage,
2030
+ costUsd: parsed.costUsd,
2031
+ durationMs: parsed.durationMs ?? measuredDurationMs,
1978
2032
  raw: {
1979
2033
  command: renderedCommand,
1980
2034
  stderr: result.stderr,
@@ -2022,12 +2076,14 @@ var CliProvider = class {
2022
2076
  `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
2023
2077
  );
2024
2078
  }
2079
+ const startTime = Date.now();
2025
2080
  const result = await this.runCommand(renderedCommand, {
2026
2081
  cwd: this.config.cwd,
2027
2082
  env: process.env,
2028
2083
  timeoutMs: this.config.timeoutMs,
2029
2084
  signal: controller.signal
2030
2085
  });
2086
+ const measuredDurationMs = Date.now() - startTime;
2031
2087
  if (result.failed || (result.exitCode ?? 0) !== 0) {
2032
2088
  if (controller.signal.aborted) {
2033
2089
  throw new Error("CLI provider request was aborted");
@@ -2049,11 +2105,13 @@ var CliProvider = class {
2049
2105
  if (missingIds.length > 0) {
2050
2106
  throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
2051
2107
  }
2108
+ const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
2052
2109
  const responses = requests.map((request) => {
2053
2110
  const evalCaseId = request.evalCaseId;
2054
2111
  if (!evalCaseId) {
2055
2112
  return {
2056
- text: "",
2113
+ outputMessages: [],
2114
+ durationMs: perRequestFallbackMs,
2057
2115
  raw: {
2058
2116
  command: renderedCommand,
2059
2117
  stderr: result.stderr,
@@ -2066,7 +2124,8 @@ var CliProvider = class {
2066
2124
  const parsed = recordsById.get(evalCaseId);
2067
2125
  if (!parsed) {
2068
2126
  return {
2069
- text: "",
2127
+ outputMessages: [],
2128
+ durationMs: perRequestFallbackMs,
2070
2129
  raw: {
2071
2130
  command: renderedCommand,
2072
2131
  stderr: result.stderr,
@@ -2077,9 +2136,10 @@ var CliProvider = class {
2077
2136
  };
2078
2137
  }
2079
2138
  return {
2080
- text: parsed.text,
2081
- trace: parsed.trace,
2082
- traceRef: parsed.traceRef,
2139
+ outputMessages: parsed.outputMessages,
2140
+ tokenUsage: parsed.tokenUsage,
2141
+ costUsd: parsed.costUsd,
2142
+ durationMs: parsed.durationMs ?? perRequestFallbackMs,
2083
2143
  raw: {
2084
2144
  command: renderedCommand,
2085
2145
  stderr: result.stderr,
@@ -2094,28 +2154,111 @@ var CliProvider = class {
2094
2154
  }
2095
2155
  /**
2096
2156
  * Parse output content from CLI.
2097
- * If the content is valid JSON with a 'text' field, extract text and optional trace.
2098
- * Otherwise, treat the entire content as plain text.
2157
+ * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
2158
+ * If only 'text' is provided, wrap it in outputMessages.
2159
+ * Otherwise, treat the entire content as plain text wrapped in outputMessages.
2160
+ *
2161
+ * Also extracts optional execution metrics:
2162
+ * - token_usage: { input, output, cached? }
2163
+ * - cost_usd: number
2164
+ * - duration_ms: number
2099
2165
  */
2100
2166
  parseOutputContent(content) {
2101
2167
  try {
2102
2168
  const parsed = JSON.parse(content);
2103
- if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
2169
+ if (typeof parsed === "object" && parsed !== null) {
2104
2170
  const obj = parsed;
2105
- const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2106
- const trace = this.parseTrace(obj.trace);
2107
- return { text, trace };
2171
+ const tokenUsage = this.parseTokenUsage(obj.token_usage);
2172
+ const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2173
+ const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2174
+ const outputMessages = this.parseOutputMessages(obj.output_messages);
2175
+ if (outputMessages && outputMessages.length > 0) {
2176
+ return { outputMessages, tokenUsage, costUsd, durationMs };
2177
+ }
2178
+ if ("text" in obj) {
2179
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2180
+ return {
2181
+ outputMessages: [{ role: "assistant", content: text }],
2182
+ tokenUsage,
2183
+ costUsd,
2184
+ durationMs
2185
+ };
2186
+ }
2108
2187
  }
2109
2188
  } catch {
2110
2189
  }
2111
- return { text: content };
2190
+ return { outputMessages: [{ role: "assistant", content }] };
2191
+ }
2192
+ /**
2193
+ * Parse token_usage from CLI output.
2194
+ */
2195
+ parseTokenUsage(tokenUsage) {
2196
+ if (typeof tokenUsage !== "object" || tokenUsage === null) {
2197
+ return void 0;
2198
+ }
2199
+ const obj = tokenUsage;
2200
+ if (typeof obj.input !== "number" || typeof obj.output !== "number") {
2201
+ return void 0;
2202
+ }
2203
+ return {
2204
+ input: obj.input,
2205
+ output: obj.output,
2206
+ cached: typeof obj.cached === "number" ? obj.cached : void 0
2207
+ };
2208
+ }
2209
+ /**
2210
+ * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
2211
+ */
2212
+ parseOutputMessages(outputMessages) {
2213
+ if (!Array.isArray(outputMessages)) {
2214
+ return void 0;
2215
+ }
2216
+ const messages = [];
2217
+ for (const msg of outputMessages) {
2218
+ if (typeof msg !== "object" || msg === null) {
2219
+ continue;
2220
+ }
2221
+ const rawMsg = msg;
2222
+ if (typeof rawMsg.role !== "string") {
2223
+ continue;
2224
+ }
2225
+ const message = {
2226
+ role: rawMsg.role,
2227
+ name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
2228
+ content: rawMsg.content,
2229
+ toolCalls: this.parseToolCalls(rawMsg.tool_calls),
2230
+ timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
2231
+ metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
2232
+ };
2233
+ messages.push(message);
2234
+ }
2235
+ return messages.length > 0 ? messages : void 0;
2112
2236
  }
2113
- parseTrace(trace) {
2114
- if (!Array.isArray(trace)) {
2237
+ /**
2238
+ * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
2239
+ */
2240
+ parseToolCalls(toolCalls) {
2241
+ if (!Array.isArray(toolCalls)) {
2115
2242
  return void 0;
2116
2243
  }
2117
- const validEvents = trace.filter(isTraceEvent);
2118
- return validEvents.length > 0 ? validEvents : void 0;
2244
+ const calls = [];
2245
+ for (const call of toolCalls) {
2246
+ if (typeof call !== "object" || call === null) {
2247
+ continue;
2248
+ }
2249
+ const rawCall = call;
2250
+ if (typeof rawCall.tool !== "string") {
2251
+ continue;
2252
+ }
2253
+ calls.push({
2254
+ tool: rawCall.tool,
2255
+ input: rawCall.input,
2256
+ output: rawCall.output,
2257
+ id: typeof rawCall.id === "string" ? rawCall.id : void 0,
2258
+ timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
2259
+ });
2260
+ }
2261
+ return calls.length > 0 ? calls : void 0;
2119
2262
  }
2120
2263
  parseJsonlBatchOutput(content) {
2121
2264
  const records = /* @__PURE__ */ new Map();
@@ -2139,12 +2282,22 @@ var CliProvider = class {
2139
2282
  if (records.has(id)) {
2140
2283
  throw new Error(`CLI batch output contains duplicate id: ${id}`);
2141
2284
  }
2142
- const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2143
- const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
2285
+ const tokenUsage = this.parseTokenUsage(obj.token_usage);
2286
+ const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2287
+ const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2288
+ const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2289
+ let outputMessages;
2290
+ if (parsedOutputMessages && parsedOutputMessages.length > 0) {
2291
+ outputMessages = parsedOutputMessages;
2292
+ } else {
2293
+ const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2294
+ outputMessages = text ? [{ role: "assistant", content: text }] : [];
2295
+ }
2144
2296
  records.set(id, {
2145
- text,
2146
- trace: this.parseTrace(obj.trace),
2147
- traceRef
2297
+ outputMessages,
2298
+ tokenUsage,
2299
+ costUsd,
2300
+ durationMs
2148
2301
  });
2149
2302
  }
2150
2303
  return records;
@@ -2157,8 +2310,10 @@ var CliProvider = class {
2157
2310
  const errorMsg = error instanceof Error ? error.message : String(error);
2158
2311
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
2159
2312
  } finally {
2160
- await import_promises8.default.unlink(filePath).catch(() => {
2161
- });
2313
+ if (!this.keepTempFiles) {
2314
+ await import_promises8.default.unlink(filePath).catch(() => {
2315
+ });
2316
+ }
2162
2317
  }
2163
2318
  }
2164
2319
  async ensureHealthy(signal) {
@@ -2458,6 +2613,11 @@ var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exe
2458
2613
  var WORKSPACE_PREFIX = "agentv-codex-";
2459
2614
  var PROMPT_FILENAME = "prompt.md";
2460
2615
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
2616
+ var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
2617
+ - Do NOT create any additional output files in the workspace.
2618
+ - All intended file outputs/changes MUST be written in your response.
2619
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
2620
+ This is required for evaluation scoring.`;
2461
2621
  var CodexProvider = class {
2462
2622
  id;
2463
2623
  kind = "codex";
@@ -2482,7 +2642,11 @@ var CodexProvider = class {
2482
2642
  const workspaceRoot = await this.createWorkspace();
2483
2643
  const logger = await this.createStreamLogger(request).catch(() => void 0);
2484
2644
  try {
2485
- const promptContent = buildPromptDocument(request, inputFiles);
2645
+ const basePrompt = buildPromptDocument(request, inputFiles);
2646
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
2647
+ const promptContent = `${systemPrompt}
2648
+
2649
+ ${basePrompt}`;
2486
2650
  const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
2487
2651
  await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
2488
2652
  const args = this.buildCodexArgs();
@@ -2501,7 +2665,6 @@ var CodexProvider = class {
2501
2665
  const parsed = parseCodexJson(result.stdout);
2502
2666
  const assistantText = extractAssistantText(parsed);
2503
2667
  return {
2504
- text: assistantText,
2505
2668
  raw: {
2506
2669
  response: parsed,
2507
2670
  stdout: result.stdout,
@@ -2513,7 +2676,8 @@ var CodexProvider = class {
2513
2676
  workspace: workspaceRoot,
2514
2677
  inputFiles,
2515
2678
  logFile: logger?.filePath
2516
- }
2679
+ },
2680
+ outputMessages: [{ role: "assistant", content: assistantText }]
2517
2681
  };
2518
2682
  } finally {
2519
2683
  await logger?.close();
@@ -3135,7 +3299,6 @@ var MockProvider = class {
3135
3299
  delayMs;
3136
3300
  delayMinMs;
3137
3301
  delayMaxMs;
3138
- trace;
3139
3302
  constructor(targetName, config) {
3140
3303
  this.id = `mock:${targetName}`;
3141
3304
  this.targetName = targetName;
@@ -3143,7 +3306,6 @@ var MockProvider = class {
3143
3306
  this.delayMs = config.delayMs ?? 0;
3144
3307
  this.delayMinMs = config.delayMinMs ?? 0;
3145
3308
  this.delayMaxMs = config.delayMaxMs ?? 0;
3146
- this.trace = config.trace;
3147
3309
  }
3148
3310
  async invoke(request) {
3149
3311
  const delay = this.calculateDelay();
@@ -3151,12 +3313,11 @@ var MockProvider = class {
3151
3313
  await new Promise((resolve) => setTimeout(resolve, delay));
3152
3314
  }
3153
3315
  return {
3154
- text: this.cannedResponse,
3316
+ outputMessages: [{ role: "assistant", content: this.cannedResponse }],
3155
3317
  raw: {
3156
3318
  question: request.question,
3157
3319
  guidelines: request.guidelines
3158
- },
3159
- trace: this.trace
3320
+ }
3160
3321
  };
3161
3322
  }
3162
3323
  calculateDelay() {
@@ -3169,182 +3330,1026 @@ var MockProvider = class {
3169
3330
  }
3170
3331
  };
3171
3332
 
3172
- // src/evaluation/providers/targets.ts
3333
+ // src/evaluation/providers/pi-coding-agent.ts
3334
+ var import_node_child_process3 = require("child_process");
3335
+ var import_node_crypto2 = require("crypto");
3336
+ var import_node_fs4 = require("fs");
3337
+ var import_promises10 = require("fs/promises");
3338
+ var import_node_os3 = require("os");
3173
3339
  var import_node_path11 = __toESM(require("path"), 1);
3174
- var import_zod = require("zod");
3175
- var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
3176
- "PROMPT",
3177
- "GUIDELINES",
3178
- "EVAL_ID",
3179
- "ATTEMPT",
3180
- "FILES",
3181
- "OUTPUT_FILE"
3182
- ]);
3183
- var BASE_TARGET_SCHEMA = import_zod.z.object({
3184
- name: import_zod.z.string().min(1, "target name is required"),
3185
- provider: import_zod.z.string().min(1, "provider is required"),
3186
- judge_target: import_zod.z.string().optional(),
3187
- workers: import_zod.z.number().int().min(1).optional()
3188
- }).passthrough();
3189
- var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
3190
- function normalizeAzureApiVersion(value) {
3191
- if (!value) {
3192
- return DEFAULT_AZURE_API_VERSION;
3340
+
3341
+ // src/evaluation/providers/pi-log-tracker.ts
3342
+ var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
3343
+ var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
3344
+ function getPiLogStore() {
3345
+ const globalObject = globalThis;
3346
+ const existing = globalObject[GLOBAL_LOGS_KEY2];
3347
+ if (existing) {
3348
+ return existing;
3193
3349
  }
3194
- const trimmed = value.trim();
3195
- if (trimmed.length === 0) {
3196
- return DEFAULT_AZURE_API_VERSION;
3350
+ const created = [];
3351
+ globalObject[GLOBAL_LOGS_KEY2] = created;
3352
+ return created;
3353
+ }
3354
+ function getSubscriberStore2() {
3355
+ const globalObject = globalThis;
3356
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
3357
+ if (existing) {
3358
+ return existing;
3197
3359
  }
3198
- const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
3199
- return withoutPrefix.length > 0 ? withoutPrefix : DEFAULT_AZURE_API_VERSION;
3360
+ const created = /* @__PURE__ */ new Set();
3361
+ globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
3362
+ return created;
3200
3363
  }
3201
- function resolveRetryConfig(target) {
3202
- const maxRetries = resolveOptionalNumber(
3203
- target.max_retries ?? target.maxRetries,
3204
- `${target.name} max retries`
3205
- );
3206
- const initialDelayMs = resolveOptionalNumber(
3207
- target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
3208
- `${target.name} retry initial delay`
3209
- );
3210
- const maxDelayMs = resolveOptionalNumber(
3211
- target.retry_max_delay_ms ?? target.retryMaxDelayMs,
3212
- `${target.name} retry max delay`
3213
- );
3214
- const backoffFactor = resolveOptionalNumber(
3215
- target.retry_backoff_factor ?? target.retryBackoffFactor,
3216
- `${target.name} retry backoff factor`
3217
- );
3218
- const retryableStatusCodes = resolveOptionalNumberArray(
3219
- target.retry_status_codes ?? target.retryStatusCodes,
3220
- `${target.name} retry status codes`
3221
- );
3222
- if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
3223
- return void 0;
3364
+ function notifySubscribers2(entry) {
3365
+ const subscribers = Array.from(getSubscriberStore2());
3366
+ for (const listener of subscribers) {
3367
+ try {
3368
+ listener(entry);
3369
+ } catch (error) {
3370
+ const message = error instanceof Error ? error.message : String(error);
3371
+ console.warn(`Pi log subscriber failed: ${message}`);
3372
+ }
3224
3373
  }
3225
- return {
3226
- maxRetries,
3227
- initialDelayMs,
3228
- maxDelayMs,
3229
- backoffFactor,
3230
- retryableStatusCodes
3231
- };
3232
3374
  }
3233
- function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
3234
- const parsed = BASE_TARGET_SCHEMA.parse(definition);
3235
- const provider = parsed.provider.toLowerCase();
3236
- const providerBatching = resolveOptionalBoolean(
3237
- parsed.provider_batching ?? parsed.providerBatching
3238
- );
3239
- switch (provider) {
3240
- case "azure":
3241
- case "azure-openai":
3242
- return {
3243
- kind: "azure",
3244
- name: parsed.name,
3245
- judgeTarget: parsed.judge_target,
3246
- workers: parsed.workers,
3247
- providerBatching,
3248
- config: resolveAzureConfig(parsed, env)
3249
- };
3250
- case "anthropic":
3251
- return {
3252
- kind: "anthropic",
3253
- name: parsed.name,
3254
- judgeTarget: parsed.judge_target,
3255
- workers: parsed.workers,
3256
- providerBatching,
3257
- config: resolveAnthropicConfig(parsed, env)
3258
- };
3259
- case "gemini":
3260
- case "google":
3261
- case "google-gemini":
3262
- return {
3263
- kind: "gemini",
3264
- name: parsed.name,
3265
- judgeTarget: parsed.judge_target,
3266
- workers: parsed.workers,
3267
- providerBatching,
3268
- config: resolveGeminiConfig(parsed, env)
3269
- };
3270
- case "codex":
3271
- case "codex-cli":
3272
- return {
3273
- kind: "codex",
3274
- name: parsed.name,
3275
- judgeTarget: parsed.judge_target,
3276
- workers: parsed.workers,
3277
- providerBatching,
3278
- config: resolveCodexConfig(parsed, env)
3279
- };
3280
- case "mock":
3281
- return {
3282
- kind: "mock",
3283
- name: parsed.name,
3284
- judgeTarget: parsed.judge_target,
3285
- workers: parsed.workers,
3286
- providerBatching,
3287
- config: resolveMockConfig(parsed)
3288
- };
3289
- case "vscode":
3290
- case "vscode-insiders":
3291
- return {
3292
- kind: provider,
3293
- name: parsed.name,
3294
- judgeTarget: parsed.judge_target,
3295
- workers: parsed.workers,
3296
- providerBatching,
3297
- config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
3298
- };
3299
- case "cli":
3300
- return {
3301
- kind: "cli",
3302
- name: parsed.name,
3303
- judgeTarget: parsed.judge_target,
3304
- workers: parsed.workers,
3305
- providerBatching,
3306
- config: resolveCliConfig(parsed, env, evalFilePath)
3307
- };
3308
- default:
3309
- throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
3375
+ function recordPiLogEntry(entry) {
3376
+ getPiLogStore().push(entry);
3377
+ notifySubscribers2(entry);
3378
+ }
3379
+ function consumePiLogEntries() {
3380
+ const store = getPiLogStore();
3381
+ if (store.length === 0) {
3382
+ return [];
3310
3383
  }
3384
+ return store.splice(0, store.length);
3311
3385
  }
3312
- function resolveAzureConfig(target, env) {
3313
- const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
3314
- const apiKeySource = target.api_key ?? target.apiKey;
3315
- const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
3316
- const versionSource = target.version ?? target.api_version;
3317
- const temperatureSource = target.temperature;
3318
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
3319
- const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
3320
- const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
3321
- const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
3322
- const version = normalizeAzureApiVersion(
3323
- resolveOptionalString(versionSource, env, `${target.name} api version`, {
3324
- allowLiteral: true,
3325
- optionalEnv: true
3326
- })
3327
- );
3328
- const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
3329
- const maxOutputTokens = resolveOptionalNumber(
3330
- maxTokensSource,
3331
- `${target.name} max output tokens`
3332
- );
3333
- const retry = resolveRetryConfig(target);
3334
- return {
3335
- resourceName,
3336
- deploymentName,
3337
- apiKey,
3338
- version,
3339
- temperature,
3340
- maxOutputTokens,
3341
- retry
3386
+ function subscribeToPiLogEntries(listener) {
3387
+ const store = getSubscriberStore2();
3388
+ store.add(listener);
3389
+ return () => {
3390
+ store.delete(listener);
3342
3391
  };
3343
3392
  }
3344
- function resolveAnthropicConfig(target, env) {
3345
- const apiKeySource = target.api_key ?? target.apiKey;
3346
- const modelSource = target.model ?? target.deployment ?? target.variant;
3347
- const temperatureSource = target.temperature;
3393
+
3394
+ // src/evaluation/providers/pi-coding-agent.ts
3395
+ var WORKSPACE_PREFIX2 = "agentv-pi-";
3396
+ var PROMPT_FILENAME2 = "prompt.md";
3397
+ var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
3398
+ - Do NOT create any additional output files in the workspace.
3399
+ - All intended file outputs/changes MUST be written in your response.
3400
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
3401
+ This is required for evaluation scoring.`;
3402
/**
 * Provider that runs the Pi coding agent CLI once per request inside a
 * temporary workspace and turns its JSONL output into output messages.
 */
var PiCodingAgentProvider = class {
  id;
  kind = "pi-coding-agent";
  targetName;
  supportsBatch = false;
  config;
  runPi;
  /**
   * @param targetName Target name; used in the provider id and in log file names.
   * @param config Resolved Pi settings read by this class (executable, provider,
   *   model, apiKey, tools, thinking, args, cwd, timeoutMs, logDir, logFormat, systemPrompt).
   * @param runner Function that actually executes the CLI; injectable for tests.
   */
  constructor(targetName, config, runner = defaultPiRunner) {
    this.id = `pi-coding-agent:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runPi = runner;
  }
  /**
   * Runs one request through the Pi CLI.
   *
   * @param request Object with `question` plus optional `inputFiles`, `signal`,
   *   `evalCaseId` and `attempt`.
   * @returns `{ raw, outputMessages }` where `raw` carries the parsed events and
   *   process details.
   * @throws Error when the request is already aborted, the run times out, or the
   *   process exits with a non-zero code.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Pi coding agent request was aborted before execution");
    }
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: failing to set it up must not fail the run.
    const logger = await this.createStreamLogger(request).catch(() => void 0);
    try {
      const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
      await (0, import_promises10.writeFile)(promptFile, request.question, "utf8");
      const args = this.buildPiArgs(request.question, inputFiles);
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executePi(args, cwd, request.signal, logger);
      if (result.timedOut) {
        throw new Error(
          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail2(result.stderr, result.stdout);
        const prefix = `Pi coding agent exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parsePiJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);
      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath
        },
        outputMessages
      };
    } finally {
      // Nested finally: a rejecting logger.close() must not skip workspace removal.
      try {
        await logger?.close();
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }
  /** Returns the configured cwd (resolved to an absolute path) or the temporary workspace. */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return import_node_path11.default.resolve(this.config.cwd);
  }
  /**
   * Builds the CLI argument list: configured flags, fixed JSON/print/no-session
   * flags, extra args, `@file` attachments and finally the full prompt text.
   */
  buildPiArgs(prompt, inputFiles) {
    const args = [];
    if (this.config.provider) {
      args.push("--provider", this.config.provider);
    }
    if (this.config.model) {
      args.push("--model", this.config.model);
    }
    if (this.config.apiKey) {
      args.push("--api-key", this.config.apiKey);
    }
    args.push("--mode", "json");
    args.push("--print");
    args.push("--no-session");
    if (this.config.tools) {
      args.push("--tools", this.config.tools);
    }
    if (this.config.thinking) {
      args.push("--thinking", this.config.thinking);
    }
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    if (inputFiles && inputFiles.length > 0) {
      for (const file of inputFiles) {
        args.push(`@${file}`);
      }
    }
    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
    const fullPrompt = `${systemPrompt}\n\n${prompt}`;
    // The prompt is passed through escapeAtSymbols() before being appended as the last argument.
    const escapedPrompt = escapeAtSymbols(fullPrompt);
    args.push(escapedPrompt);
    return args;
  }
  /**
   * Invokes the runner and rewrites an ENOENT failure into an actionable
   * "executable not found" error; all other errors are rethrown unchanged.
   */
  async executePi(args, cwd, signal, logger) {
    try {
      return await this.runPi({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /**
   * Copies process.env and, when an API key is configured, also exposes it under
   * the environment variable matching the provider (defaults to "google").
   * Unknown providers leave the copy unchanged.
   */
  buildEnv() {
    const env = { ...process.env };
    if (this.config.apiKey) {
      const provider = this.config.provider?.toLowerCase() ?? "google";
      switch (provider) {
        case "google":
        case "gemini":
          env.GEMINI_API_KEY = this.config.apiKey;
          break;
        case "anthropic":
          env.ANTHROPIC_API_KEY = this.config.apiKey;
          break;
        case "openai":
          env.OPENAI_API_KEY = this.config.apiKey;
          break;
        case "groq":
          env.GROQ_API_KEY = this.config.apiKey;
          break;
        case "xai":
          env.XAI_API_KEY = this.config.apiKey;
          break;
        case "openrouter":
          env.OPENROUTER_API_KEY = this.config.apiKey;
          break;
      }
    }
    return env;
  }
  /** Creates a fresh temporary directory under the OS temp dir. */
  async createWorkspace() {
    return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
  }
  /** Removes the workspace recursively; failures are deliberately ignored (best-effort cleanup). */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
  /** Returns the configured log directory, or `.agentv/logs/pi-coding-agent` under the current working directory. */
  resolveLogDirectory() {
    if (this.config.logDir) {
      return import_node_path11.default.resolve(this.config.logDir);
    }
    return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
  }
  /**
   * Creates the per-request stream logger and records its log entry.
   * Returns undefined (after a console warning) when the directory or the
   * logger cannot be created.
   */
  async createStreamLogger(request) {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return void 0;
    }
    try {
      await (0, import_promises10.mkdir)(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
      return void 0;
    }
    const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
    try {
      const logger = await PiStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
        format: this.config.logFormat ?? "summary"
      });
      recordPiLogEntry({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
      return void 0;
    }
  }
};
3603
/**
 * Appends a timestamped, line-oriented copy of the Pi process's stdout and
 * stderr to a log file. Chunks are buffered until a full line is available.
 */
var PiStreamLogger = class _PiStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  /**
   * @param filePath File opened in append mode.
   * @param format "json" selects formatPiJsonLog; any other value selects formatPiLogMessage.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
  }
  /**
   * Creates a logger and writes the header block.
   *
   * @param options `{ filePath, format, targetName, evalCaseId?, attempt? }`.
   * @returns The ready-to-use logger.
   */
  static async create(options) {
    const logger = new _PiStreamLogger(options.filePath, options.format);
    // Only omit the optional (undefined) entries; the trailing "" must survive so
    // the header is followed by a blank separator line.
    const header = [
      "# Pi Coding Agent stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes every complete line it now contains. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes every complete line it now contains. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /** Writes any buffered partial lines, then ends the stream; rejects if the stream emits "error". */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Writes each given line followed by a newline, without any formatting. */
  writeLines(lines) {
    for (const line of lines) {
      this.stream.write(`${line}\n`);
    }
  }
  /**
   * Splits the selected buffer on line breaks, keeps the unterminated tail in
   * the buffer and writes the formatted complete lines.
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    // The last element is whatever follows the final newline (possibly "").
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      const formatted = this.formatLine(line, source);
      if (formatted) {
        this.stream.write(formatted);
        this.stream.write("\n");
      }
    }
  }
  /** Returns the `[+elapsed] [source] message` form of a line, or undefined for blank lines. */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes the unterminated tail of each buffer (stdout first), then clears both buffers. */
  flushRemainder() {
    const pending = [
      ["stdout", this.stdoutBuffer.trim()],
      ["stderr", this.stderrBuffer.trim()]
    ];
    for (const [source, remainder] of pending) {
      if (remainder.length === 0) {
        continue;
      }
      const formatted = this.formatLine(remainder, source);
      if (formatted) {
        this.stream.write(formatted);
        this.stream.write("\n");
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
3697
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 *
 * @param request Object whose optional `evalCaseId` and `attempt` are used.
 * @param targetName Target name (sanitized before use).
 * @returns The file name (no directory).
 */
function buildLogFilename2(request, targetName) {
  // ':' and '.' in the ISO timestamp are replaced with '-'.
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename2(request.evalCaseId ?? "pi");
  const attemptPart = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
  const targetPart = sanitizeForFilename2(targetName);
  const uniquePart = (0, import_node_crypto2.randomUUID)().slice(0, 8);
  return `${stamp}_${targetPart}_${evalPart}${attemptPart}_${uniquePart}.log`;
}
3704
/**
 * Replaces every run of characters outside `A-Z a-z 0-9 . _ -` with a single
 * underscore.
 *
 * @param value String to sanitize.
 * @returns The sanitized string, or "pi" when the result is empty.
 */
function sanitizeForFilename2(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length === 0) {
    return "pi";
  }
  return cleaned;
}
3708
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or as
 * `HH:MM:SS` once at least one hour has passed.
 *
 * @param startedAt Start time in milliseconds (same clock as Date.now()).
 * @returns Zero-padded elapsed time string.
 */
function formatElapsed2(startedAt) {
  const pad = (part) => part.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const parts = hours > 0 ? [hours, minutes, seconds] : [minutes, seconds];
  return parts.map((part) => pad(part)).join(":");
}
3718
/**
 * Produces the "summary" log text for one output line.
 *
 * @param rawLine Trimmed line from the process output.
 * @param source "stdout" or "stderr".
 * @returns The event summary when the line parses as JSON and can be
 *   summarized; otherwise the raw line, prefixed with `stderr: ` for stderr.
 */
function formatPiLogMessage(rawLine, source) {
  const event = tryParseJsonValue2(rawLine);
  const summary = event ? summarizePiEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
3731
/**
 * Produces the "json" log text for one output line: the parsed value
 * pretty-printed with two-space indentation.
 *
 * @param rawLine Trimmed line from the process output.
 * @returns Pretty JSON, or the raw line when it does not parse to a truthy
 *   value or cannot be re-serialized.
 */
function formatPiJsonLog(rawLine) {
  const value = tryParseJsonValue2(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through to the raw line.
    }
  }
  return rawLine;
}
3742
/**
 * Turns a parsed Pi event into a short one-line summary.
 *
 * @param event Parsed JSON value.
 * @returns undefined when `event` is not an object with a non-empty string
 *   `type`; `"<type>: <role>"` for message_start/message_end; a (truncated)
 *   `text_delta` preview or `"message_update: <eventType>"` for message_update;
 *   the bare type for every other event.
 */
function summarizePiEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const { type } = event;
  if (typeof type !== "string" || type.length === 0) {
    return void 0;
  }
  if (type === "message_start" || type === "message_end") {
    return `${type}: ${event.message?.role}`;
  }
  if (type === "message_update") {
    const update = event.assistantMessageEvent;
    const updateType = update?.type;
    if (updateType === "text_delta" && typeof update?.delta === "string") {
      const { delta } = update;
      // Only the first 50 characters of a delta are shown.
      const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
      return `text_delta: ${preview}`;
    }
    return `message_update: ${updateType}`;
  }
  // agent_start, agent_end, turn_start, turn_end and unknown types all map to their own name.
  return type;
}
3782
/**
 * Parses a string as JSON without throwing.
 *
 * @param rawLine Text to parse.
 * @returns The parsed value, or undefined when JSON.parse throws.
 */
function tryParseJsonValue2(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
3789
/**
 * Parses the process's stdout as JSON Lines.
 *
 * @param output Full stdout text.
 * @returns The parsed values of all lines that are valid JSON, in order.
 * @throws Error when the output is blank or contains no valid JSON line.
 */
function parsePiJsonl(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Pi coding agent produced no output");
  }
  const events = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(line));
    } catch {
      // Lines that are not valid JSON are skipped.
    }
  }
  if (events.length === 0) {
    throw new Error("Pi coding agent produced no valid JSON output");
  }
  return events;
}
3807
/**
 * Extracts the conversation messages from a list of parsed Pi events.
 *
 * The last `agent_end` event whose `messages` is an array wins; when there is
 * none, the `message` of every `turn_end` event is collected instead.
 *
 * @param events Parsed JSONL events.
 * @returns Converted messages (entries that fail conversion are dropped).
 */
function extractOutputMessages(events) {
  const isRecord = (value) => Boolean(value) && typeof value === "object";
  const finalEvent = [...events].reverse().find(
    (event) => isRecord(event) && event.type === "agent_end" && Array.isArray(event.messages)
  );
  if (finalEvent) {
    return finalEvent.messages.map(convertPiMessage).filter((m) => m !== void 0);
  }
  const collected = [];
  for (const event of events) {
    if (!isRecord(event) || event.type !== "turn_end") {
      continue;
    }
    const converted = convertPiMessage(event.message);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
3839
/**
 * Converts one Pi message object into the provider's output-message shape.
 *
 * @param message Raw message from a Pi event.
 * @returns `{ role, content, toolCalls, timestamp, metadata }`, or undefined
 *   when `message` is not an object or has no string `role`. `toolCalls` and
 *   `metadata` are undefined when empty; a numeric timestamp is converted to
 *   an ISO string.
 */
function convertPiMessage(message) {
  if (!message || typeof message !== "object") {
    return void 0;
  }
  const { role } = message;
  if (typeof role !== "string") {
    return void 0;
  }
  const content = extractTextContent(message.content);
  const toolCalls = extractToolCalls(message.content);
  let timestamp;
  if (typeof message.timestamp === "number") {
    timestamp = new Date(message.timestamp).toISOString();
  } else if (typeof message.timestamp === "string") {
    timestamp = message.timestamp;
  }
  // Copy only the truthy pass-through fields, in a fixed order.
  const metadata = {};
  for (const key of ["api", "provider", "model", "usage", "stopReason"]) {
    if (message[key]) {
      metadata[key] = message[key];
    }
  }
  const hasMetadata = Object.keys(metadata).length > 0;
  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
    timestamp,
    metadata: hasMetadata ? metadata : void 0
  };
}
3865
/**
 * Extracts plain text from a message's `content`.
 *
 * @param content A string, or an array of content parts.
 * @returns The string itself; for arrays, the `text` of all
 *   `{ type: "text", text: string }` parts joined with newlines; undefined when
 *   there is no text or `content` is neither a string nor an array.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const texts = content
    .filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string")
    .map((part) => part.text);
  return texts.length > 0 ? texts.join("\n") : void 0;
}
3884
/**
 * Collects tool calls from a message's content parts.
 *
 * `tool_use` parts with a string `name` become `{ tool, input, id }`; a later
 * `tool_result` part whose `tool_use_id` matches an already collected call
 * adds its `content` as that call's `output`.
 *
 * @param content Message content; anything but an array yields [].
 * @returns Tool calls in the order their `tool_use` parts appeared.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    if (part.type === "tool_use" && typeof part.name === "string") {
      calls.push({
        tool: part.name,
        input: part.input,
        id: typeof part.id === "string" ? part.id : void 0
      });
    }
    if (part.type === "tool_result" && typeof part.tool_use_id === "string") {
      const at = calls.findIndex((call) => call.id === part.tool_use_id);
      if (at !== -1) {
        calls[at] = { ...calls[at], output: part.content };
      }
    }
  }
  return calls;
}
3914
/**
 * Returns the content of the last assistant message that has truthy content.
 *
 * @param messages Converted output messages.
 * @returns The content string (non-string content is JSON-stringified), or ""
 *   when no such message exists.
 */
function extractAssistantText2(messages) {
  for (const msg of [...messages].reverse()) {
    if (msg.role !== "assistant" || !msg.content) {
      continue;
    }
    return typeof msg.content === "string" ? msg.content : JSON.stringify(msg.content);
  }
  return "";
}
3926
/**
 * Rewrites every `@[label]:` marker in the prompt to `[[label]]:`.
 *
 * @param prompt Prompt text.
 * @returns The prompt with those markers rewritten; other `@` characters are untouched.
 */
function escapeAtSymbols(prompt) {
  return prompt.replace(/@\[([^\]]+)\]:/g, (_marker, label) => `[[${label}]]:`);
}
3929
/**
 * Picks the text used as error detail: trimmed stderr if non-empty, otherwise
 * trimmed stdout.
 *
 * @param stderr Captured stderr text.
 * @param stdout Captured stdout text.
 * @returns The chosen text, or undefined when both are blank.
 */
function pickDetail2(stderr, stdout) {
  // `||` keeps stdout untouched unless stderr turns out to be blank.
  return stderr.trim() || stdout.trim() || void 0;
}
3937
/**
 * Builds the ` after <N>s` suffix for timeout error messages.
 *
 * @param timeoutMs Timeout in milliseconds (may be undefined).
 * @returns The suffix with seconds rounded up, or "" when the timeout is
 *   missing, zero or negative.
 */
function formatTimeoutSuffix3(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && !(timeoutMs <= 0);
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
3944
/**
 * Default runner: spawns the Pi executable (without a shell) and collects its
 * output.
 *
 * `options.executable` may contain extra whitespace-separated arguments
 * (e.g. "npx pi"); they are placed before `options.args`.
 *
 * @param options `{ executable, args, cwd, env, timeoutMs?, signal?,
 *   onStdoutChunk?, onStderrChunk? }`.
 * @returns Resolves on process close with `{ stdout, stderr, exitCode, timedOut }`;
 *   `exitCode` is -1 when the process closed without a numeric code.
 * @throws Rejects with the spawn error when the child emits "error".
 */
async function defaultPiRunner(options) {
  return await new Promise((resolve, reject) => {
    // Trim first so leading/trailing whitespace cannot yield an empty command name.
    const [executable, ...executableArgs] = options.executable.trim().split(/\s+/);
    const allArgs = [...executableArgs, ...options.args];
    const child = (0, import_node_child_process3.spawn)(executable, allArgs, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: false
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let the pending timer keep the event loop alive on its own.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
      options.onStderrChunk?.(chunk);
    });
    // Nothing is written to the child's stdin; close it right away.
    child.stdin.end();
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
4011
+
4012
+ // src/evaluation/providers/targets.ts
4013
+ var import_node_path12 = __toESM(require("path"), 1);
4014
+ var import_zod = require("zod");
4015
// ---- Input schemas: shape of a CLI target as written by the user. ----
// Both snake_case and camelCase spellings of each option are accepted.

// HTTP healthcheck as written in the target definition.
var CliHealthcheckHttpInputSchema = import_zod.z.object({
  type: import_zod.z.literal("http"),
  url: import_zod.z.string().min(1, "healthcheck URL is required"),
  timeout_seconds: import_zod.z.number().positive().optional(),
  timeoutSeconds: import_zod.z.number().positive().optional()
});

// Command healthcheck as written in the target definition.
var CliHealthcheckCommandInputSchema = import_zod.z.object({
  type: import_zod.z.literal("command"),
  command_template: import_zod.z.string().optional(),
  commandTemplate: import_zod.z.string().optional(),
  cwd: import_zod.z.string().optional(),
  timeout_seconds: import_zod.z.number().positive().optional(),
  timeoutSeconds: import_zod.z.number().positive().optional()
});

// Healthcheck input, discriminated on `type`.
var CliHealthcheckInputSchema = import_zod.z.discriminatedUnion("type", [
  CliHealthcheckHttpInputSchema,
  CliHealthcheckCommandInputSchema
]);

// Full CLI target as written in the target definition.
var CliTargetInputSchema = import_zod.z.object({
  name: import_zod.z.string().min(1, "target name is required"),
  provider: import_zod.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
  // Command template - required (accept both naming conventions)
  command_template: import_zod.z.string().optional(),
  commandTemplate: import_zod.z.string().optional(),
  // Files format - optional
  files_format: import_zod.z.string().optional(),
  filesFormat: import_zod.z.string().optional(),
  attachments_format: import_zod.z.string().optional(),
  attachmentsFormat: import_zod.z.string().optional(),
  // Working directory - optional
  cwd: import_zod.z.string().optional(),
  // Timeout in seconds - optional
  timeout_seconds: import_zod.z.number().positive().optional(),
  timeoutSeconds: import_zod.z.number().positive().optional(),
  // Healthcheck configuration - optional
  healthcheck: CliHealthcheckInputSchema.optional(),
  // Verbose mode - optional
  verbose: import_zod.z.boolean().optional(),
  cli_verbose: import_zod.z.boolean().optional(),
  cliVerbose: import_zod.z.boolean().optional(),
  // Keep temp files - optional
  keep_temp_files: import_zod.z.boolean().optional(),
  keepTempFiles: import_zod.z.boolean().optional(),
  keep_output_files: import_zod.z.boolean().optional(),
  keepOutputFiles: import_zod.z.boolean().optional(),
  // Common target fields
  judge_target: import_zod.z.string().optional(),
  workers: import_zod.z.number().int().min(1).optional(),
  provider_batching: import_zod.z.boolean().optional(),
  providerBatching: import_zod.z.boolean().optional()
}).refine((data) => data.command_template !== void 0 || data.commandTemplate !== void 0, {
  // The two spellings are individually optional, so the "one of them" rule lives here.
  message: "Either command_template or commandTemplate is required"
});

// ---- Normalized schemas: camelCase only, timeouts in milliseconds, strict. ----

// Normalized HTTP healthcheck.
var CliHealthcheckHttpSchema = import_zod.z.object({
  type: import_zod.z.literal("http"),
  url: import_zod.z.string().min(1),
  timeoutMs: import_zod.z.number().positive().optional()
}).strict();

// Normalized command healthcheck.
var CliHealthcheckCommandSchema = import_zod.z.object({
  type: import_zod.z.literal("command"),
  commandTemplate: import_zod.z.string().min(1),
  cwd: import_zod.z.string().optional(),
  timeoutMs: import_zod.z.number().positive().optional()
}).strict();

// Normalized healthcheck, discriminated on `type`.
var CliHealthcheckSchema = import_zod.z.discriminatedUnion("type", [
  CliHealthcheckHttpSchema,
  CliHealthcheckCommandSchema
]);

// Normalized CLI target configuration.
var CliTargetConfigSchema = import_zod.z.object({
  commandTemplate: import_zod.z.string().min(1),
  filesFormat: import_zod.z.string().optional(),
  cwd: import_zod.z.string().optional(),
  timeoutMs: import_zod.z.number().positive().optional(),
  healthcheck: CliHealthcheckSchema.optional(),
  verbose: import_zod.z.boolean().optional(),
  keepTempFiles: import_zod.z.boolean().optional()
}).strict();
4092
/**
 * Converts a healthcheck as written by the user into its normalized form.
 *
 * @param input Healthcheck input (`type` is "http" or anything else, treated as command).
 * @param env Environment used by the string resolvers.
 * @param targetName Target name, used in error/description labels.
 * @param evalFilePath Optional eval file path; a relative `cwd` is resolved
 *   against its directory.
 * @returns `{ type: "http", url, timeoutMs }` or
 *   `{ type: "command", commandTemplate, cwd, timeoutMs }`.
 * @throws Error when a command healthcheck has no command template.
 */
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
  const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
  // Seconds -> whole milliseconds.
  const timeoutMs = timeoutSeconds === void 0 ? void 0 : Math.floor(timeoutSeconds * 1e3);
  if (input.type === "http") {
    return {
      type: "http",
      url: resolveString(input.url, env, `${targetName} healthcheck URL`),
      timeoutMs
    };
  }
  const templateSource = input.command_template ?? input.commandTemplate;
  if (templateSource === void 0) {
    throw new Error(
      `${targetName} healthcheck: Either command_template or commandTemplate is required for command healthcheck`
    );
  }
  const commandTemplate = resolveString(
    templateSource,
    env,
    `${targetName} healthcheck command template`,
    true
  );
  const configuredCwd = resolveOptionalString(input.cwd, env, `${targetName} healthcheck cwd`, {
    allowLiteral: true,
    optionalEnv: true
  });
  let cwd = configuredCwd;
  if (configuredCwd && evalFilePath && !import_node_path12.default.isAbsolute(configuredCwd)) {
    const evalDir = import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath));
    cwd = import_node_path12.default.resolve(evalDir, configuredCwd);
  }
  return {
    type: "command",
    commandTemplate,
    cwd,
    timeoutMs
  };
}
4129
/**
 * Converts a CLI target as written by the user into its normalized config.
 *
 * @param input CLI target input (snake_case or camelCase option names).
 * @param env Environment used by the string resolvers.
 * @param evalFilePath Optional eval file path. A relative `cwd` is resolved
 *   against its directory, and that directory is the default `cwd`.
 * @returns `{ commandTemplate, filesFormat, cwd, timeoutMs, healthcheck, verbose, keepTempFiles }`.
 * @throws Error when no command template is present.
 */
function normalizeCliTargetInput(input, env, evalFilePath) {
  const targetName = input.name;
  const templateSource = input.command_template ?? input.commandTemplate;
  if (templateSource === void 0) {
    throw new Error(`${targetName}: Either command_template or commandTemplate is required`);
  }
  const commandTemplate = resolveString(
    templateSource,
    env,
    `${targetName} CLI command template`,
    true
  );
  const filesFormat = resolveOptionalLiteralString(
    input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat
  );
  const configuredCwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
    allowLiteral: true,
    optionalEnv: true
  });
  let cwd = configuredCwd;
  if (evalFilePath) {
    const evalDir = () => import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath));
    if (!configuredCwd) {
      // No cwd configured: run next to the eval file.
      cwd = evalDir();
    } else if (!import_node_path12.default.isAbsolute(configuredCwd)) {
      cwd = import_node_path12.default.resolve(evalDir(), configuredCwd);
    }
  }
  const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
  // Seconds -> whole milliseconds.
  const timeoutMs = timeoutSeconds === void 0 ? void 0 : Math.floor(timeoutSeconds * 1e3);
  const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
  const keepTempFiles = resolveOptionalBoolean(
    input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
  );
  let healthcheck;
  if (input.healthcheck) {
    healthcheck = normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath);
  }
  return {
    commandTemplate,
    filesFormat,
    cwd,
    timeoutMs,
    healthcheck,
    verbose,
    keepTempFiles
  };
}
4170
// Set of placeholder names for CLI command templates.
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES", "OUTPUT_FILE"]);
4178
// Fields validated for every target; passthrough() keeps all other
// (provider-specific) keys on the parsed object.
var BASE_TARGET_SCHEMA = import_zod.z.object({
  name: import_zod.z.string().min(1, "target name is required"),
  provider: import_zod.z.string().min(1, "provider is required"),
  judge_target: import_zod.z.string().optional(),
  workers: import_zod.z.number().int().min(1).optional()
}).passthrough();
4184
// API version used when the target does not specify one.
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
/**
 * Normalizes an Azure API version setting.
 *
 * Accepts a bare version or an `api-version=<value>` style string (the prefix
 * is matched case-insensitively, with `-`, `_` or no separator).
 *
 * @param value Raw setting; may be undefined or blank.
 * @returns The trimmed version, or DEFAULT_AZURE_API_VERSION when nothing usable remains.
 */
function normalizeAzureApiVersion(value) {
  const trimmed = value ? value.trim() : "";
  const version = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
  return version.length > 0 ? version : DEFAULT_AZURE_API_VERSION;
}
4196
/**
 * Reads the retry settings of a target (snake_case or camelCase keys).
 *
 * @param target Parsed target definition.
 * @returns `{ maxRetries, initialDelayMs, maxDelayMs, backoffFactor, retryableStatusCodes }`,
 *   or undefined when none of the five settings is present.
 */
function resolveRetryConfig(target) {
  // Property order doubles as resolution order and as the key order of the result.
  const retry = {
    maxRetries: resolveOptionalNumber(
      target.max_retries ?? target.maxRetries,
      `${target.name} max retries`
    ),
    initialDelayMs: resolveOptionalNumber(
      target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
      `${target.name} retry initial delay`
    ),
    maxDelayMs: resolveOptionalNumber(
      target.retry_max_delay_ms ?? target.retryMaxDelayMs,
      `${target.name} retry max delay`
    ),
    backoffFactor: resolveOptionalNumber(
      target.retry_backoff_factor ?? target.retryBackoffFactor,
      `${target.name} retry backoff factor`
    ),
    retryableStatusCodes: resolveOptionalNumberArray(
      target.retry_status_codes ?? target.retryStatusCodes,
      `${target.name} retry status codes`
    )
  };
  const nothingConfigured = Object.values(retry).every((setting) => setting === void 0);
  return nothingConfigured ? void 0 : retry;
}
4228
/**
 * Validates a raw target definition and resolves it into a typed target.
 *
 * @param definition Raw target object (validated with BASE_TARGET_SCHEMA).
 * @param env Environment passed to the provider-specific resolvers.
 * @param evalFilePath Optional eval file path; only the CLI resolver receives it.
 * @returns `{ kind, name, judgeTarget, workers, providerBatching, config }`.
 * @throws Error for an unsupported provider (and whatever the schema or the
 *   provider-specific resolver throws).
 */
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
  // Provider names are matched case-insensitively.
  const provider = parsed.provider.toLowerCase();
  const providerBatching = resolveOptionalBoolean(
    parsed.provider_batching ?? parsed.providerBatching
  );
  // Every provider yields the same envelope; only `kind` and `config` differ.
  const describe = (kind, config) => ({
    kind,
    name: parsed.name,
    judgeTarget: parsed.judge_target,
    workers: parsed.workers,
    providerBatching,
    config
  });
  switch (provider) {
    case "azure":
    case "azure-openai":
      return describe("azure", resolveAzureConfig(parsed, env));
    case "anthropic":
      return describe("anthropic", resolveAnthropicConfig(parsed, env));
    case "gemini":
    case "google":
    case "google-gemini":
      return describe("gemini", resolveGeminiConfig(parsed, env));
    case "codex":
    case "codex-cli":
      return describe("codex", resolveCodexConfig(parsed, env));
    case "pi":
    case "pi-coding-agent":
      return describe("pi-coding-agent", resolvePiCodingAgentConfig(parsed, env));
    case "mock":
      return describe("mock", resolveMockConfig(parsed));
    case "vscode":
    case "vscode-insiders":
      // The kind keeps the exact variant; the resolver is told whether it is Insiders.
      return describe(provider, resolveVSCodeConfig(parsed, env, provider === "vscode-insiders"));
    case "cli":
      return describe("cli", resolveCliConfig(parsed, env, evalFilePath));
    default:
      throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
  }
}
4317
/**
 * Resolves the Azure-specific settings of a target.
 *
 * Each setting accepts several key spellings; the first one present wins.
 *
 * @param target Parsed target definition.
 * @param env Environment passed to the string resolvers.
 * @returns `{ resourceName, deploymentName, apiKey, version, temperature, maxOutputTokens, retry }`.
 */
function resolveAzureConfig(target, env) {
  const label = (what) => `${target.name} ${what}`;
  // Resolution order (endpoint, api key, deployment, ...) is kept as-is since
  // the resolvers are called in this sequence.
  const resourceName = resolveString(
    target.endpoint ?? target.resource ?? target.resourceName,
    env,
    label("endpoint")
  );
  const apiKey = resolveString(target.api_key ?? target.apiKey, env, label("api key"));
  const deploymentName = resolveString(
    target.deployment ?? target.deploymentName ?? target.model,
    env,
    label("deployment")
  );
  const rawVersion = resolveOptionalString(
    target.version ?? target.api_version,
    env,
    label("api version"),
    { allowLiteral: true, optionalEnv: true }
  );
  const version = normalizeAzureApiVersion(rawVersion);
  const temperature = resolveOptionalNumber(target.temperature, label("temperature"));
  const maxOutputTokens = resolveOptionalNumber(
    target.max_output_tokens ?? target.maxTokens,
    label("max output tokens")
  );
  const retry = resolveRetryConfig(target);
  return {
    resourceName,
    deploymentName,
    apiKey,
    version,
    temperature,
    maxOutputTokens,
    retry
  };
}
4349
+ function resolveAnthropicConfig(target, env) {
4350
+ const apiKeySource = target.api_key ?? target.apiKey;
4351
+ const modelSource = target.model ?? target.deployment ?? target.variant;
4352
+ const temperatureSource = target.temperature;
3348
4353
  const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
3349
4354
  const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
3350
4355
  const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
@@ -3385,6 +4390,7 @@ function resolveCodexConfig(target, env) {
3385
4390
  const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
3386
4391
  const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
3387
4392
  const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
4393
+ const systemPromptSource = target.system_prompt ?? target.systemPrompt;
3388
4394
  const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
3389
4395
  allowLiteral: true,
3390
4396
  optionalEnv: true
@@ -3400,13 +4406,15 @@ function resolveCodexConfig(target, env) {
3400
4406
  optionalEnv: true
3401
4407
  });
3402
4408
  const logFormat = normalizeCodexLogFormat(logFormatSource);
4409
+ const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
3403
4410
  return {
3404
4411
  executable,
3405
4412
  args,
3406
4413
  cwd,
3407
4414
  timeoutMs,
3408
4415
  logDir,
3409
- logFormat
4416
+ logFormat,
4417
+ systemPrompt
3410
4418
  };
3411
4419
  }
3412
4420
  function normalizeCodexLogFormat(value) {
@@ -3422,10 +4430,73 @@ function normalizeCodexLogFormat(value) {
3422
4430
  }
3423
4431
  throw new Error("codex log format must be 'summary' or 'json'");
3424
4432
  }
4433
/**
 * Resolves the settings of a `pi-coding-agent` target definition.
 *
 * Every setting accepts several aliases; the first alias that is not
 * null/undefined wins. The executable falls back to "pi" when it resolves to
 * null/undefined. The API key is the only string resolved with
 * `allowLiteral: false`. `logFormat` is kept only when it is exactly "json"
 * or "summary", and `systemPrompt` only when it is a non-blank string (it is
 * trimmed).
 *
 * @param target Raw target definition; `target.name` prefixes the
 *   descriptions passed to the resolver helpers.
 * @param env Environment map passed through to the resolver helpers.
 * @returns The resolved configuration object.
 */
function resolvePiCodingAgentConfig(target, env) {
  // Shared shape of the optional-string lookups; only the API key differs in allowLiteral.
  const optionalString = (source, what, allowLiteral = true) =>
    resolveOptionalString(source, env, `${target.name} ${what}`, {
      allowLiteral,
      optionalEnv: true
    });

  const executable =
    optionalString(target.executable ?? target.command ?? target.binary, "pi executable") ?? "pi";
  const provider = optionalString(
    target.pi_provider ?? target.piProvider ?? target.llm_provider,
    "pi provider"
  );
  const model = optionalString(target.model ?? target.pi_model ?? target.piModel, "pi model");
  const apiKey = optionalString(target.api_key ?? target.apiKey, "pi api key", false);
  const tools = optionalString(target.tools ?? target.pi_tools ?? target.piTools, "pi tools");
  const thinking = optionalString(
    target.thinking ?? target.pi_thinking ?? target.piThinking,
    "pi thinking"
  );
  const args = resolveOptionalStringArray(
    target.args ?? target.arguments,
    env,
    `${target.name} pi args`
  );
  const cwd = optionalString(target.cwd, "pi cwd");
  const timeoutMs = resolveTimeoutMs(
    target.timeout_seconds ?? target.timeoutSeconds,
    `${target.name} pi timeout`
  );
  const logDir = optionalString(
    target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory,
    "pi log directory"
  );

  const requestedLogFormat = target.log_format ?? target.logFormat;
  const logFormat = ["json", "summary"].includes(requestedLogFormat) ? requestedLogFormat : void 0;

  const rawSystemPrompt = target.system_prompt ?? target.systemPrompt;
  const trimmedSystemPrompt = typeof rawSystemPrompt === "string" ? rawSystemPrompt.trim() : "";
  const systemPrompt = trimmedSystemPrompt.length > 0 ? trimmedSystemPrompt : void 0;

  return {
    executable,
    provider,
    model,
    apiKey,
    tools,
    thinking,
    args,
    cwd,
    timeoutMs,
    logDir,
    logFormat,
    systemPrompt
  };
}
3425
4497
  function resolveMockConfig(target) {
3426
4498
  const response = typeof target.response === "string" ? target.response : void 0;
3427
- const trace = Array.isArray(target.trace) ? target.trace : void 0;
3428
- return { response, trace };
4499
+ return { response };
3429
4500
  }
3430
4501
  function resolveVSCodeConfig(target, env, insiders) {
3431
4502
  const workspaceTemplateEnvVar = resolveOptionalLiteralString(
@@ -3457,42 +4528,35 @@ function resolveVSCodeConfig(target, env, insiders) {
3457
4528
  workspaceTemplate
3458
4529
  };
3459
4530
  }
3460
- function resolveCliConfig(target, env, evalFilePath) {
3461
- const commandTemplateSource = target.command_template ?? target.commandTemplate;
3462
- const filesFormat = resolveOptionalLiteralString(
3463
- target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
3464
- );
3465
- const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
3466
- let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3467
- allowLiteral: true,
3468
- optionalEnv: true
3469
- });
3470
- if (cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd)) {
3471
- cwd = import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd);
4531
+ var cliErrorMap = (issue, ctx) => {
4532
+ if (issue.code === import_zod.z.ZodIssueCode.unrecognized_keys) {
4533
+ return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
3472
4534
  }
3473
- if (!cwd && evalFilePath) {
3474
- cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
4535
+ if (issue.code === import_zod.z.ZodIssueCode.invalid_union_discriminator) {
4536
+ return { message: "healthcheck type must be 'http' or 'command'" };
3475
4537
  }
3476
- const timeoutMs = resolveTimeoutMs(
3477
- target.timeout_seconds ?? target.timeoutSeconds,
3478
- `${target.name} timeout`
3479
- );
3480
- const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
3481
- const commandTemplate = resolveString(
3482
- commandTemplateSource,
3483
- env,
3484
- `${target.name} CLI command template`,
3485
- true
3486
- );
3487
- assertSupportedCliPlaceholders(commandTemplate, `${target.name} CLI command template`);
3488
- return {
3489
- commandTemplate,
3490
- filesFormat,
3491
- cwd,
3492
- timeoutMs,
3493
- healthcheck,
3494
- verbose
3495
- };
4538
+ if (issue.code === import_zod.z.ZodIssueCode.invalid_type && issue.expected === "string") {
4539
+ return { message: `${ctx.defaultError} (expected a string value)` };
4540
+ }
4541
+ return { message: ctx.defaultError };
4542
+ };
4543
+ function resolveCliConfig(target, env, evalFilePath) {
4544
+ const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
4545
+ if (!parseResult.success) {
4546
+ const firstError = parseResult.error.errors[0];
4547
+ const path16 = firstError?.path.join(".") || "";
4548
+ const prefix = path16 ? `${target.name} ${path16}: ` : `${target.name}: `;
4549
+ throw new Error(`${prefix}${firstError?.message}`);
4550
+ }
4551
+ const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
4552
+ assertSupportedCliPlaceholders(normalized.commandTemplate, `${target.name} CLI command template`);
4553
+ if (normalized.healthcheck?.type === "command") {
4554
+ assertSupportedCliPlaceholders(
4555
+ normalized.healthcheck.commandTemplate,
4556
+ `${target.name} healthcheck command template`
4557
+ );
4558
+ }
4559
+ return normalized;
3496
4560
  }
3497
4561
  function resolveTimeoutMs(source, description) {
3498
4562
  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
@@ -3504,49 +4568,6 @@ function resolveTimeoutMs(source, description) {
3504
4568
  }
3505
4569
  return Math.floor(seconds * 1e3);
3506
4570
  }
3507
- function resolveCliHealthcheck(source, env, targetName, evalFilePath) {
3508
- if (source === void 0 || source === null) {
3509
- return void 0;
3510
- }
3511
- if (typeof source !== "object" || Array.isArray(source)) {
3512
- throw new Error(`${targetName} healthcheck must be an object`);
3513
- }
3514
- const candidate = source;
3515
- const type = candidate.type;
3516
- const timeoutMs = resolveTimeoutMs(
3517
- candidate.timeout_seconds ?? candidate.timeoutSeconds,
3518
- `${targetName} healthcheck timeout`
3519
- );
3520
- if (type === "http") {
3521
- const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
3522
- return {
3523
- type: "http",
3524
- url,
3525
- timeoutMs
3526
- };
3527
- }
3528
- if (type === "command") {
3529
- const commandTemplate = resolveString(
3530
- candidate.command_template ?? candidate.commandTemplate,
3531
- env,
3532
- `${targetName} healthcheck command template`,
3533
- true
3534
- );
3535
- assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
3536
- const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
3537
- allowLiteral: true,
3538
- optionalEnv: true
3539
- });
3540
- const resolvedCwd = cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd) ? import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd) : cwd;
3541
- return {
3542
- type: "command",
3543
- commandTemplate,
3544
- timeoutMs,
3545
- cwd: resolvedCwd
3546
- };
3547
- }
3548
- throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
3549
- }
3550
4571
  function assertSupportedCliPlaceholders(template, description) {
3551
4572
  const placeholders = extractCliPlaceholders(template);
3552
4573
  for (const placeholder of placeholders) {
@@ -3712,7 +4733,7 @@ function resolveOptionalNumberArray(source, description) {
3712
4733
  }
3713
4734
 
3714
4735
  // src/evaluation/providers/vscode.ts
3715
- var import_node_path12 = __toESM(require("path"), 1);
4736
+ var import_node_path13 = __toESM(require("path"), 1);
3716
4737
  var import_subagent = require("subagent");
3717
4738
 
3718
4739
  // src/evaluation/providers/vscode-templates.ts
@@ -3786,7 +4807,7 @@ var VSCodeProvider = class {
3786
4807
  }
3787
4808
  if (this.config.dryRun) {
3788
4809
  return {
3789
- text: "",
4810
+ outputMessages: [],
3790
4811
  raw: {
3791
4812
  session,
3792
4813
  inputFiles
@@ -3795,7 +4816,7 @@ var VSCodeProvider = class {
3795
4816
  }
3796
4817
  const responseText = await readTextFile(session.responseFile);
3797
4818
  return {
3798
- text: responseText,
4819
+ outputMessages: [{ role: "assistant", content: responseText }],
3799
4820
  raw: {
3800
4821
  session,
3801
4822
  inputFiles
@@ -3833,7 +4854,7 @@ var VSCodeProvider = class {
3833
4854
  }
3834
4855
  if (this.config.dryRun) {
3835
4856
  return normalizedRequests.map(({ inputFiles }) => ({
3836
- text: "",
4857
+ outputMessages: [],
3837
4858
  raw: {
3838
4859
  session,
3839
4860
  inputFiles,
@@ -3850,7 +4871,7 @@ var VSCodeProvider = class {
3850
4871
  for (const [index, responseFile] of session.responseFiles.entries()) {
3851
4872
  const responseText = await readTextFile(responseFile);
3852
4873
  responses.push({
3853
- text: responseText,
4874
+ outputMessages: [{ role: "assistant", content: responseText }],
3854
4875
  raw: {
3855
4876
  session,
3856
4877
  inputFiles: normalizedRequests[index]?.inputFiles,
@@ -3882,7 +4903,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
3882
4903
  return "";
3883
4904
  }
3884
4905
  const buildList = (files) => files.map((absolutePath) => {
3885
- const fileName = import_node_path12.default.basename(absolutePath);
4906
+ const fileName = import_node_path13.default.basename(absolutePath);
3886
4907
  const fileUri = pathToFileUri2(absolutePath);
3887
4908
  return `* [${fileName}](${fileUri})`;
3888
4909
  });
@@ -3907,8 +4928,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3907
4928
  }
3908
4929
  const unique = /* @__PURE__ */ new Map();
3909
4930
  for (const attachment of attachments) {
3910
- const absolutePath = import_node_path12.default.resolve(attachment);
3911
- const normalized = absolutePath.split(import_node_path12.default.sep).join("/");
4931
+ const absolutePath = import_node_path13.default.resolve(attachment);
4932
+ const normalized = absolutePath.split(import_node_path13.default.sep).join("/");
3912
4933
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3913
4934
  if (!unique.has(absolutePath)) {
3914
4935
  unique.set(absolutePath, absolutePath);
@@ -3923,7 +4944,7 @@ function collectAttachmentFiles(attachments) {
3923
4944
  }
3924
4945
  const unique = /* @__PURE__ */ new Map();
3925
4946
  for (const attachment of attachments) {
3926
- const absolutePath = import_node_path12.default.resolve(attachment);
4947
+ const absolutePath = import_node_path13.default.resolve(attachment);
3927
4948
  if (!unique.has(absolutePath)) {
3928
4949
  unique.set(absolutePath, absolutePath);
3929
4950
  }
@@ -3931,7 +4952,7 @@ function collectAttachmentFiles(attachments) {
3931
4952
  return Array.from(unique.values());
3932
4953
  }
3933
4954
  function pathToFileUri2(filePath) {
3934
- const absolutePath = import_node_path12.default.isAbsolute(filePath) ? filePath : import_node_path12.default.resolve(filePath);
4955
+ const absolutePath = import_node_path13.default.isAbsolute(filePath) ? filePath : import_node_path13.default.resolve(filePath);
3935
4956
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3936
4957
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3937
4958
  return `file:///${normalizedPath}`;
@@ -3944,7 +4965,7 @@ function normalizeAttachments(attachments) {
3944
4965
  }
3945
4966
  const deduped = /* @__PURE__ */ new Set();
3946
4967
  for (const attachment of attachments) {
3947
- deduped.add(import_node_path12.default.resolve(attachment));
4968
+ deduped.add(import_node_path13.default.resolve(attachment));
3948
4969
  }
3949
4970
  return Array.from(deduped);
3950
4971
  }
@@ -3953,7 +4974,7 @@ function mergeAttachments(all) {
3953
4974
  for (const list of all) {
3954
4975
  if (!list) continue;
3955
4976
  for (const inputFile of list) {
3956
- deduped.add(import_node_path12.default.resolve(inputFile));
4977
+ deduped.add(import_node_path13.default.resolve(inputFile));
3957
4978
  }
3958
4979
  }
3959
4980
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -4000,9 +5021,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
4000
5021
  }
4001
5022
 
4002
5023
  // src/evaluation/providers/targets-file.ts
4003
- var import_node_fs4 = require("fs");
4004
- var import_promises10 = require("fs/promises");
4005
- var import_node_path13 = __toESM(require("path"), 1);
5024
+ var import_node_fs5 = require("fs");
5025
+ var import_promises11 = require("fs/promises");
5026
+ var import_node_path14 = __toESM(require("path"), 1);
4006
5027
  var import_yaml3 = require("yaml");
4007
5028
  function isRecord(value) {
4008
5029
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -4032,18 +5053,18 @@ function assertTargetDefinition(value, index, filePath) {
4032
5053
  }
4033
5054
  async function fileExists3(filePath) {
4034
5055
  try {
4035
- await (0, import_promises10.access)(filePath, import_node_fs4.constants.F_OK);
5056
+ await (0, import_promises11.access)(filePath, import_node_fs5.constants.F_OK);
4036
5057
  return true;
4037
5058
  } catch {
4038
5059
  return false;
4039
5060
  }
4040
5061
  }
4041
5062
  async function readTargetDefinitions(filePath) {
4042
- const absolutePath = import_node_path13.default.resolve(filePath);
5063
+ const absolutePath = import_node_path14.default.resolve(filePath);
4043
5064
  if (!await fileExists3(absolutePath)) {
4044
5065
  throw new Error(`targets.yaml not found at ${absolutePath}`);
4045
5066
  }
4046
- const raw = await (0, import_promises10.readFile)(absolutePath, "utf8");
5067
+ const raw = await (0, import_promises11.readFile)(absolutePath, "utf8");
4047
5068
  const parsed = (0, import_yaml3.parse)(raw);
4048
5069
  if (!isRecord(parsed)) {
4049
5070
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
@@ -4071,6 +5092,8 @@ function createProvider(target) {
4071
5092
  return new CliProvider(target.name, target.config);
4072
5093
  case "codex":
4073
5094
  return new CodexProvider(target.name, target.config);
5095
+ case "pi-coding-agent":
5096
+ return new PiCodingAgentProvider(target.name, target.config);
4074
5097
  case "mock":
4075
5098
  return new MockProvider(target.name, target.config);
4076
5099
  case "vscode":
@@ -4090,6 +5113,100 @@ function resolveAndCreateProvider(definition, env = process.env) {
4090
5113
  // src/evaluation/evaluators.ts
4091
5114
  var import_ai2 = require("ai");
4092
5115
  var import_zod2 = require("zod");
5116
+
5117
+ // src/runtime/exec.ts
5118
/**
 * Returns `Bun.spawn` when it exists on the global object and is a function,
 * otherwise `undefined`.
 */
function getBunSpawn() {
  const bunSpawn = globalThis.Bun?.spawn;
  return typeof bunSpawn === "function" ? bunSpawn : void 0;
}

/**
 * Runs `command` through a shell, writes `stdinPayload` to its stdin and
 * collects everything it prints.
 *
 * Uses `Bun.spawn` when available and falls back to `child_process.spawn`
 * with `shell: true` otherwise.
 *
 * @param command Shell command line to execute.
 * @param stdinPayload Text written to the child's stdin before it is closed.
 * @param options Optional `{ cwd, timeoutMs }`. A falsy `timeoutMs` disables
 *   the timeout.
 * @returns Promise of `{ stdout, stderr, exitCode }`. In the child_process
 *   path, a child terminated by a signal (no numeric exit code) is reported
 *   with exit code 1 so callers never mistake it for success.
 * @throws Error `Process timed out after <n>ms` when the timeout elapses (the
 *   child is killed first), or the spawn error if the process cannot start.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const { cwd, timeoutMs } = options;
  const bunSpawn = getBunSpawn();

  if (bunSpawn) {
    const proc = bunSpawn({
      cmd: ["sh", "-c", command],
      cwd,
      stdin: new TextEncoder().encode(stdinPayload),
      stdout: "pipe",
      stderr: "pipe"
    });
    let timedOut = false;
    const timer = timeoutMs
      ? setTimeout(() => {
          timedOut = true;
          proc.kill();
        }, timeoutMs)
      : void 0;
    try {
      // Drain both pipes concurrently: reading stdout to completion first can
      // deadlock when the child blocks on a full stderr pipe.
      const [stdout, stderr, exitCode] = await Promise.all([
        new Response(proc.stdout).text(),
        new Response(proc.stderr).text(),
        proc.exited
      ]);
      if (timedOut) {
        // Same failure the child_process path reports, so callers see one contract.
        throw new Error(`Process timed out after ${timeoutMs}ms`);
      }
      return { stdout, stderr, exitCode };
    } finally {
      if (timer !== void 0) {
        clearTimeout(timer);
      }
    }
  }

  const { spawn } = await import("child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn(command, {
      shell: true,
      cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    let stdout = "";
    let stderr = "";

    const timer = timeoutMs
      ? setTimeout(() => {
          child.kill();
          reject(new Error(`Process timed out after ${timeoutMs}ms`));
        }, timeoutMs)
      : void 0;
    const stopTimer = () => {
      if (timer !== void 0) {
        clearTimeout(timer);
      }
    };

    // Decode as a stream so multi-byte UTF-8 characters that straddle two
    // chunks are not corrupted.
    child.stdout?.setEncoding("utf8");
    child.stderr?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr?.on("data", (chunk) => {
      stderr += chunk;
    });

    child.on("error", (error) => {
      stopTimer();
      reject(error);
    });
    // "close" fires after the stdio streams have ended; "exit" can fire while
    // output is still buffered, which would truncate stdout/stderr.
    child.on("close", (code, signal) => {
      stopTimer();
      resolve({ stdout, stderr, exitCode: code ?? (signal ? 1 : 0) });
    });

    // A child that exits without reading its stdin makes the write fail with
    // EPIPE. That is not a failure of the command itself (its exit code is
    // still reported), so keep the error from becoming an unhandled "error" event.
    child.stdin?.on("error", () => {});
    child.stdin?.write(stdinPayload);
    child.stdin?.end();
  });
}
5182
+
5183
+ // src/evaluation/providers/types.ts
5184
// Provider kinds that `isAgentProvider` classifies as agent providers.
var AGENT_PROVIDER_KINDS = ["codex", "pi-coding-agent", "vscode", "vscode-insiders"];
5190
/**
 * Returns the content of the most recent assistant message as text.
 *
 * Messages are scanned from last to first. Assistant messages whose content
 * is `undefined` or `null` are skipped, so a content-less turn does not yield
 * the literal string "null". String content is returned as is; any other
 * content is serialized with `JSON.stringify`.
 *
 * @param messages Array of `{ role, content }` messages; may be null/undefined.
 * @returns The extracted text, or "" when no assistant message has content.
 */
function extractLastAssistantContent(messages) {
  if (!messages || messages.length === 0) {
    return "";
  }
  for (let i = messages.length - 1; i >= 0; i--) {
    const { role, content } = messages[i];
    if (role !== "assistant" || content === void 0 || content === null) {
      continue;
    }
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}
5205
/**
 * Tells whether `provider.kind` is listed in AGENT_PROVIDER_KINDS.
 *
 * @param provider Provider object with a `kind` property; may be null/undefined.
 * @returns `false` for a falsy provider, otherwise the membership result.
 */
function isAgentProvider(provider) {
  if (!provider) {
    return false;
  }
  return AGENT_PROVIDER_KINDS.includes(provider.kind);
}
5208
+
5209
+ // src/evaluation/evaluators.ts
4093
5210
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
4094
5211
 
4095
5212
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -4154,6 +5271,7 @@ var LlmJudgeEvaluator = class {
4154
5271
  null,
4155
5272
  2
4156
5273
  ),
5274
+ [TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
4157
5275
  [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
4158
5276
  [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
4159
5277
  [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
@@ -4178,7 +5296,7 @@ var LlmJudgeEvaluator = class {
4178
5296
  const score = clampScore(data.score);
4179
5297
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
4180
5298
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
4181
- const reasoning = data.reasoning ?? providerResponse?.reasoning;
5299
+ const reasoning = data.reasoning;
4182
5300
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
4183
5301
  return {
4184
5302
  score,
@@ -4280,7 +5398,9 @@ var LlmJudgeEvaluator = class {
4280
5398
  maxOutputTokens: this.maxOutputTokens,
4281
5399
  temperature: this.temperature
4282
5400
  });
4283
- const data = schema.parse(parseJsonFromText(response.text ?? ""));
5401
+ const data = schema.parse(
5402
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
5403
+ );
4284
5404
  return { data, providerResponse: response };
4285
5405
  } catch (e) {
4286
5406
  lastError = e instanceof Error ? e : new Error(String(e));
@@ -4362,17 +5482,17 @@ var CodeEvaluator = class {
4362
5482
  const inputPayload = JSON.stringify(
4363
5483
  {
4364
5484
  question: context.evalCase.question,
4365
- expected_outcome: context.evalCase.expected_outcome,
4366
- expected_messages: context.evalCase.expected_messages,
4367
- reference_answer: context.evalCase.reference_answer,
4368
- candidate_answer: context.candidate,
4369
- guideline_files: context.evalCase.guideline_paths,
4370
- input_files: context.evalCase.file_paths.filter(
4371
- (path15) => !context.evalCase.guideline_paths.includes(path15)
5485
+ expectedOutcome: context.evalCase.expected_outcome,
5486
+ expectedMessages: context.evalCase.expected_messages,
5487
+ referenceAnswer: context.evalCase.reference_answer,
5488
+ candidateAnswer: context.candidate,
5489
+ outputMessages: context.outputMessages ?? null,
5490
+ guidelineFiles: context.evalCase.guideline_paths,
5491
+ inputFiles: context.evalCase.file_paths.filter(
5492
+ (path16) => !context.evalCase.guideline_paths.includes(path16)
4372
5493
  ),
4373
- input_messages: context.evalCase.input_messages,
4374
- candidate_trace_file: context.candidateTraceRef ?? null,
4375
- candidate_trace_summary: context.candidateTraceSummary ?? null
5494
+ inputMessages: context.evalCase.input_messages,
5495
+ traceSummary: context.traceSummary ?? null
4376
5496
  },
4377
5497
  null,
4378
5498
  2
@@ -4442,43 +5562,17 @@ function calculateRubricScore(result, rubrics) {
4442
5562
  return { score, verdict, hits, misses };
4443
5563
  }
4444
5564
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
4445
- const { spawn: spawn2 } = await import("child_process");
4446
- return await new Promise((resolve, reject) => {
4447
- const child = spawn2(scriptPath, {
4448
- shell: true,
4449
- cwd
4450
- });
4451
- let stdout = "";
4452
- let stderr = "";
4453
- const timeout = agentTimeoutMs ? setTimeout(() => {
4454
- child.kill();
4455
- reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
4456
- }, agentTimeoutMs) : void 0;
4457
- child.stdout?.on("data", (data) => {
4458
- stdout += data.toString();
4459
- });
4460
- child.stderr?.on("data", (data) => {
4461
- stderr += data.toString();
4462
- });
4463
- child.on("error", (error) => {
4464
- if (timeout !== void 0) {
4465
- clearTimeout(timeout);
4466
- }
4467
- reject(error);
4468
- });
4469
- child.on("exit", (code) => {
4470
- if (timeout !== void 0) {
4471
- clearTimeout(timeout);
4472
- }
4473
- if (code && code !== 0 && stderr.length > 0) {
4474
- reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
4475
- return;
4476
- }
4477
- resolve(stdout.trim());
4478
- });
4479
- child.stdin?.write(input);
4480
- child.stdin?.end();
5565
+ const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
5566
+ cwd,
5567
+ timeoutMs: agentTimeoutMs
4481
5568
  });
5569
+ if (exitCode !== 0) {
5570
+ const trimmedErr = stderr.trim();
5571
+ throw new Error(
5572
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5573
+ );
5574
+ }
5575
+ return stdout.trim();
4482
5576
  }
4483
5577
  function parseJsonSafe(payload) {
4484
5578
  try {
@@ -4492,6 +5586,33 @@ function substituteVariables(template, variables) {
4492
5586
  return variables[varName] ?? match;
4493
5587
  });
4494
5588
  }
5589
/**
 * Structural equality for plain values, arrays and objects.
 *
 * Primitives are compared with `===`. Arrays must have the same length and
 * pairwise-equal elements. Objects must have the same number of own
 * enumerable keys, and each key of `a` must be an own property of `b` with a
 * deeply equal value. An array never equals a non-array object.
 *
 * @returns `true` when `a` and `b` are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) return true;
  // Past the identity check only two non-null objects can still be equal.
  if (a === null || b === null || typeof a !== "object" || typeof b !== "object") {
    return false;
  }
  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) return false;
  if (aIsArray) {
    return a.length === b.length && a.every((item, index) => deepEqual(item, b[index]));
  }
  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false;
  return aKeys.every((key) => Object.hasOwn(b, key) && deepEqual(a[key], b[key]));
}

/**
 * Checks the arguments of an actual tool call against an expectation.
 *
 * The match is partial: every key listed in `expected` must be an own
 * property of `actual` with a deeply equal value; extra keys in `actual` are
 * ignored.
 *
 * @param expected `undefined`, `null` or the string "any" to accept any
 *   arguments; otherwise an object of required key/value pairs.
 * @param actual Arguments of the actual call; may be missing.
 * @returns `true` when `actual` satisfies `expected`.
 */
function argsMatch(expected, actual) {
  // null is accepted like undefined so an empty `args:` entry in an
  // expectation does not throw in Object.keys.
  if (expected === void 0 || expected === null || expected === "any") return true;
  if (actual === void 0) return false;
  return Object.keys(expected).every(
    // Guard against null: Object.hasOwn(null, key) throws a TypeError.
    (key) => actual !== null && Object.hasOwn(actual, key) && deepEqual(expected[key], actual[key])
  );
}
4495
5616
  var ToolTrajectoryEvaluator = class {
4496
5617
  kind = "tool_trajectory";
4497
5618
  config;
@@ -4499,8 +5620,19 @@ var ToolTrajectoryEvaluator = class {
4499
5620
  this.config = options.config;
4500
5621
  }
4501
5622
  evaluate(context) {
4502
- const { candidateTrace, candidateTraceSummary } = context;
4503
- if (!candidateTrace || !candidateTraceSummary) {
5623
+ const { outputMessages, traceSummary } = context;
5624
+ const toolCalls = this.extractToolCallsFromMessages(outputMessages);
5625
+ if (toolCalls.length === 0 && !traceSummary) {
5626
+ return {
5627
+ score: 0,
5628
+ verdict: "fail",
5629
+ hits: [],
5630
+ misses: ["No trace available for evaluation"],
5631
+ expectedAspectCount: 1
5632
+ };
5633
+ }
5634
+ const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
5635
+ if (!summary) {
4504
5636
  return {
4505
5637
  score: 0,
4506
5638
  verdict: "fail",
@@ -4511,11 +5643,11 @@ var ToolTrajectoryEvaluator = class {
4511
5643
  }
4512
5644
  switch (this.config.mode) {
4513
5645
  case "any_order":
4514
- return this.evaluateAnyOrder(candidateTraceSummary);
5646
+ return this.evaluateAnyOrder(summary);
4515
5647
  case "in_order":
4516
- return this.evaluateInOrder(candidateTrace);
5648
+ return this.evaluateInOrder(toolCalls);
4517
5649
  case "exact":
4518
- return this.evaluateExact(candidateTrace);
5650
+ return this.evaluateExact(toolCalls);
4519
5651
  default:
4520
5652
  return {
4521
5653
  score: 0,
@@ -4526,6 +5658,42 @@ var ToolTrajectoryEvaluator = class {
4526
5658
  };
4527
5659
  }
4528
5660
  }
5661
+ /**
5662
+ * Extract tool calls from output messages.
5663
+ */
5664
+ extractToolCallsFromMessages(messages) {
5665
+ if (!messages) {
5666
+ return [];
5667
+ }
5668
+ const toolCalls = [];
5669
+ for (const message of messages) {
5670
+ if (message.toolCalls) {
5671
+ for (const call of message.toolCalls) {
5672
+ toolCalls.push({
5673
+ name: call.tool,
5674
+ args: call.input
5675
+ });
5676
+ }
5677
+ }
5678
+ }
5679
+ return toolCalls;
5680
+ }
5681
+ /**
5682
+ * Build a summary from extracted tool calls.
5683
+ */
5684
+ buildSummary(toolCalls) {
5685
+ const toolCallsByName = {};
5686
+ for (const call of toolCalls) {
5687
+ toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
5688
+ }
5689
+ const toolNames = Object.keys(toolCallsByName).sort();
5690
+ return {
5691
+ eventCount: toolCalls.length,
5692
+ toolNames,
5693
+ toolCallsByName,
5694
+ errorCount: 0
5695
+ };
5696
+ }
4529
5697
  evaluateAnyOrder(summary) {
4530
5698
  const minimums = this.config.minimums ?? {};
4531
5699
  const toolNames = Object.keys(minimums);
@@ -4558,7 +5726,7 @@ var ToolTrajectoryEvaluator = class {
4558
5726
  expectedAspectCount: toolNames.length
4559
5727
  };
4560
5728
  }
4561
- evaluateInOrder(trace) {
5729
+ evaluateInOrder(toolCalls) {
4562
5730
  const expected = this.config.expected ?? [];
4563
5731
  if (expected.length === 0) {
4564
5732
  return {
@@ -4569,23 +5737,33 @@ var ToolTrajectoryEvaluator = class {
4569
5737
  expectedAspectCount: 0
4570
5738
  };
4571
5739
  }
4572
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
4573
5740
  const hits = [];
4574
5741
  const misses = [];
4575
5742
  let actualIndex = 0;
4576
5743
  for (let i = 0; i < expected.length; i++) {
4577
- const expectedTool = expected[i].tool;
5744
+ const expectedItem = expected[i];
5745
+ const expectedTool = expectedItem.tool;
4578
5746
  let found = false;
4579
- while (actualIndex < actualToolCalls.length) {
4580
- if (actualToolCalls[actualIndex].name === expectedTool) {
4581
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
5747
+ let argsMismatch = false;
5748
+ while (actualIndex < toolCalls.length) {
5749
+ const actualCall = toolCalls[actualIndex];
5750
+ if (actualCall.name === expectedTool) {
5751
+ if (argsMatch(expectedItem.args, actualCall.args)) {
5752
+ hits.push(`Found ${expectedTool} at position ${actualIndex}`);
5753
+ actualIndex++;
5754
+ found = true;
5755
+ break;
5756
+ }
5757
+ misses.push(
5758
+ `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
5759
+ );
4582
5760
  actualIndex++;
4583
- found = true;
5761
+ argsMismatch = true;
4584
5762
  break;
4585
5763
  }
4586
5764
  actualIndex++;
4587
5765
  }
4588
- if (!found) {
5766
+ if (!found && !argsMismatch) {
4589
5767
  misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
4590
5768
  }
4591
5769
  }
@@ -4598,7 +5776,7 @@ var ToolTrajectoryEvaluator = class {
4598
5776
  expectedAspectCount: expected.length
4599
5777
  };
4600
5778
  }
4601
- evaluateExact(trace) {
5779
+ evaluateExact(toolCalls) {
4602
5780
  const expected = this.config.expected ?? [];
4603
5781
  if (expected.length === 0) {
4604
5782
  return {
@@ -4609,18 +5787,23 @@ var ToolTrajectoryEvaluator = class {
4609
5787
  expectedAspectCount: 0
4610
5788
  };
4611
5789
  }
4612
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
4613
5790
  const hits = [];
4614
5791
  const misses = [];
4615
- if (actualToolCalls.length !== expected.length) {
4616
- misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
5792
+ if (toolCalls.length !== expected.length) {
5793
+ misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
4617
5794
  }
4618
- const checkLength = Math.min(expected.length, actualToolCalls.length);
5795
+ const checkLength = Math.min(expected.length, toolCalls.length);
4619
5796
  for (let i = 0; i < checkLength; i++) {
4620
- const expectedTool = expected[i].tool;
4621
- const actualTool = actualToolCalls[i].name;
5797
+ const expectedItem = expected[i];
5798
+ const expectedTool = expectedItem.tool;
5799
+ const actualCall = toolCalls[i];
5800
+ const actualTool = actualCall.name;
4622
5801
  if (actualTool === expectedTool) {
4623
- hits.push(`Position ${i}: ${expectedTool} \u2713`);
5802
+ if (argsMatch(expectedItem.args, actualCall.args)) {
5803
+ hits.push(`Position ${i}: ${expectedTool}`);
5804
+ } else {
5805
+ misses.push(`Position ${i}: ${expectedTool} args mismatch`);
5806
+ }
4624
5807
  } else {
4625
5808
  misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
4626
5809
  }
@@ -4832,11 +6015,13 @@ var CompositeEvaluator = class {
4832
6015
  evalCaseId: context.evalCase.id,
4833
6016
  attempt: context.attempt
4834
6017
  });
4835
- const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
6018
+ const data = freeformEvaluationSchema.parse(
6019
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
6020
+ );
4836
6021
  const score = clampScore(data.score);
4837
6022
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
4838
6023
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
4839
- const reasoning = data.reasoning ?? response.reasoning;
6024
+ const reasoning = data.reasoning;
4840
6025
  return {
4841
6026
  score,
4842
6027
  verdict: scoreToVerdict(score),
@@ -4862,9 +6047,9 @@ var CompositeEvaluator = class {
4862
6047
  };
4863
6048
 
4864
6049
  // src/evaluation/orchestrator.ts
4865
- var import_node_crypto2 = require("crypto");
4866
- var import_promises11 = require("fs/promises");
4867
- var import_node_path14 = __toESM(require("path"), 1);
6050
+ var import_node_crypto3 = require("crypto");
6051
+ var import_promises12 = require("fs/promises");
6052
+ var import_node_path15 = __toESM(require("path"), 1);
4868
6053
 
4869
6054
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
4870
6055
  var Node = class {
@@ -5005,16 +6190,6 @@ function validateConcurrency(concurrency) {
5005
6190
  }
5006
6191
  }
5007
6192
 
5008
- // src/evaluation/providers/types.ts
5009
- var AGENT_PROVIDER_KINDS = [
5010
- "codex",
5011
- "vscode",
5012
- "vscode-insiders"
5013
- ];
5014
- function isAgentProvider(provider) {
5015
- return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
5016
- }
5017
-
5018
6193
  // src/evaluation/orchestrator.ts
5019
6194
  async function runEvaluation(options) {
5020
6195
  const {
@@ -5269,11 +6444,19 @@ async function runBatchEvaluation(options) {
5269
6444
  const evalCase = evalCases[i];
5270
6445
  const promptInputs = promptInputsList[i];
5271
6446
  const providerResponse = batchResponse[i];
6447
+ const outputMessages = providerResponse.outputMessages;
6448
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
6449
+ const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
6450
+ tokenUsage: providerResponse.tokenUsage,
6451
+ costUsd: providerResponse.costUsd,
6452
+ durationMs: providerResponse.durationMs
6453
+ }) : void 0;
6454
+ const candidate = extractLastAssistantContent(outputMessages);
5272
6455
  let result;
5273
6456
  try {
5274
6457
  result = await evaluateCandidate({
5275
6458
  evalCase,
5276
- candidate: providerResponse.text ?? "",
6459
+ candidate,
5277
6460
  target,
5278
6461
  provider,
5279
6462
  evaluators: evaluatorRegistry,
@@ -5281,7 +6464,9 @@ async function runBatchEvaluation(options) {
5281
6464
  nowFn,
5282
6465
  attempt: 0,
5283
6466
  judgeProvider: await resolveJudgeProvider(target),
5284
- agentTimeoutMs
6467
+ agentTimeoutMs,
6468
+ outputMessages,
6469
+ traceSummary
5285
6470
  });
5286
6471
  } catch (error) {
5287
6472
  const errorResult = buildErrorResult(
@@ -5385,21 +6570,18 @@ async function runEvalCase(options) {
5385
6570
  if (cacheKey && cache && !cachedResponse) {
5386
6571
  await cache.set(cacheKey, providerResponse);
5387
6572
  }
5388
- let candidateTrace = providerResponse.trace;
5389
- if (!candidateTrace && providerResponse.traceRef) {
5390
- try {
5391
- const rawTrace = await readJsonFile(providerResponse.traceRef);
5392
- if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
5393
- candidateTrace = rawTrace;
5394
- }
5395
- } catch {
5396
- }
5397
- }
5398
- const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
6573
+ const outputMessages = providerResponse.outputMessages;
6574
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
6575
+ const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
6576
+ tokenUsage: providerResponse.tokenUsage,
6577
+ costUsd: providerResponse.costUsd,
6578
+ durationMs: providerResponse.durationMs
6579
+ }) : void 0;
6580
+ const candidate = extractLastAssistantContent(outputMessages);
5399
6581
  try {
5400
6582
  return await evaluateCandidate({
5401
6583
  evalCase,
5402
- candidate: providerResponse.text ?? "",
6584
+ candidate,
5403
6585
  target,
5404
6586
  provider,
5405
6587
  evaluators,
@@ -5408,9 +6590,8 @@ async function runEvalCase(options) {
5408
6590
  attempt,
5409
6591
  judgeProvider,
5410
6592
  agentTimeoutMs,
5411
- candidateTrace,
5412
- candidateTraceRef: providerResponse.traceRef,
5413
- candidateTraceSummary
6593
+ outputMessages,
6594
+ traceSummary
5414
6595
  });
5415
6596
  } catch (error) {
5416
6597
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -5428,9 +6609,8 @@ async function evaluateCandidate(options) {
5428
6609
  attempt,
5429
6610
  judgeProvider,
5430
6611
  agentTimeoutMs,
5431
- candidateTrace,
5432
- candidateTraceRef,
5433
- candidateTraceSummary
6612
+ outputMessages,
6613
+ traceSummary
5434
6614
  } = options;
5435
6615
  const gradeTimestamp = nowFn();
5436
6616
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -5444,9 +6624,8 @@ async function evaluateCandidate(options) {
5444
6624
  now: gradeTimestamp,
5445
6625
  judgeProvider,
5446
6626
  agentTimeoutMs,
5447
- candidateTrace,
5448
- candidateTraceRef,
5449
- candidateTraceSummary
6627
+ outputMessages,
6628
+ traceSummary
5450
6629
  });
5451
6630
  const completedAt = nowFn();
5452
6631
  let agentProviderRequest;
@@ -5470,21 +6649,21 @@ async function evaluateCandidate(options) {
5470
6649
  }
5471
6650
  return {
5472
6651
  timestamp: completedAt.toISOString(),
5473
- eval_id: evalCase.id,
6652
+ evalId: evalCase.id,
5474
6653
  dataset: evalCase.dataset,
5475
- conversation_id: evalCase.conversation_id,
6654
+ conversationId: evalCase.conversation_id,
5476
6655
  score: score.score,
5477
6656
  hits: score.hits,
5478
6657
  misses: score.misses,
5479
- candidate_answer: candidate,
6658
+ candidateAnswer: candidate,
5480
6659
  target: target.name,
5481
6660
  reasoning: score.reasoning,
5482
- raw_aspects: score.rawAspects,
5483
- agent_provider_request: agentProviderRequest,
5484
- lm_provider_request: lmProviderRequest,
5485
- evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
5486
- evaluator_results: evaluatorResults,
5487
- trace_summary: candidateTraceSummary
6661
+ rawAspects: score.rawAspects,
6662
+ agentProviderRequest,
6663
+ lmProviderRequest,
6664
+ evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
6665
+ evaluatorResults,
6666
+ traceSummary
5488
6667
  };
5489
6668
  }
5490
6669
  async function runEvaluatorsForCase(options) {
@@ -5499,9 +6678,8 @@ async function runEvaluatorsForCase(options) {
5499
6678
  now,
5500
6679
  judgeProvider,
5501
6680
  agentTimeoutMs,
5502
- candidateTrace,
5503
- candidateTraceRef,
5504
- candidateTraceSummary
6681
+ outputMessages,
6682
+ traceSummary
5505
6683
  } = options;
5506
6684
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
5507
6685
  return runEvaluatorList({
@@ -5516,9 +6694,8 @@ async function runEvaluatorsForCase(options) {
5516
6694
  now,
5517
6695
  judgeProvider,
5518
6696
  agentTimeoutMs,
5519
- candidateTrace,
5520
- candidateTraceRef,
5521
- candidateTraceSummary
6697
+ outputMessages,
6698
+ traceSummary
5522
6699
  });
5523
6700
  }
5524
6701
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -5535,9 +6712,8 @@ async function runEvaluatorsForCase(options) {
5535
6712
  promptInputs,
5536
6713
  now,
5537
6714
  judgeProvider,
5538
- candidateTrace,
5539
- candidateTraceRef,
5540
- candidateTraceSummary
6715
+ outputMessages,
6716
+ traceSummary
5541
6717
  });
5542
6718
  return { score };
5543
6719
  }
@@ -5554,9 +6730,8 @@ async function runEvaluatorList(options) {
5554
6730
  now,
5555
6731
  judgeProvider,
5556
6732
  agentTimeoutMs,
5557
- candidateTrace,
5558
- candidateTraceRef,
5559
- candidateTraceSummary
6733
+ outputMessages,
6734
+ traceSummary
5560
6735
  } = options;
5561
6736
  const scored = [];
5562
6737
  const evaluatorResults = [];
@@ -5586,7 +6761,7 @@ async function runEvaluatorList(options) {
5586
6761
  hits: score2.hits,
5587
6762
  misses: score2.misses,
5588
6763
  reasoning: score2.reasoning,
5589
- evaluator_provider_request: score2.evaluatorRawRequest
6764
+ evaluatorProviderRequest: score2.evaluatorRawRequest
5590
6765
  });
5591
6766
  }
5592
6767
  if (evaluator.type === "code") {
@@ -5603,8 +6778,8 @@ async function runEvaluatorList(options) {
5603
6778
  attempt,
5604
6779
  promptInputs,
5605
6780
  now,
5606
- candidateTraceRef,
5607
- candidateTraceSummary
6781
+ outputMessages,
6782
+ traceSummary
5608
6783
  });
5609
6784
  const weight = evaluator.weight ?? 1;
5610
6785
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -5617,11 +6792,11 @@ async function runEvaluatorList(options) {
5617
6792
  hits: score2.hits,
5618
6793
  misses: score2.misses,
5619
6794
  reasoning: score2.reasoning,
5620
- evaluator_provider_request: score2.evaluatorRawRequest
6795
+ evaluatorProviderRequest: score2.evaluatorRawRequest
5621
6796
  });
5622
6797
  }
5623
6798
  if (evaluator.type === "composite") {
5624
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path14.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
6799
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path15.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
5625
6800
  const createEvaluator = (memberConfig) => {
5626
6801
  switch (memberConfig.type) {
5627
6802
  case "llm_judge":
@@ -5674,8 +6849,8 @@ async function runEvaluatorList(options) {
5674
6849
  hits: score2.hits,
5675
6850
  misses: score2.misses,
5676
6851
  reasoning: score2.reasoning,
5677
- evaluator_provider_request: score2.evaluatorRawRequest,
5678
- evaluator_results: mapChildResults(score2.evaluatorResults)
6852
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
6853
+ evaluatorResults: mapChildResults(score2.evaluatorResults)
5679
6854
  });
5680
6855
  }
5681
6856
  if (evaluator.type === "tool_trajectory") {
@@ -5690,9 +6865,8 @@ async function runEvaluatorList(options) {
5690
6865
  attempt,
5691
6866
  promptInputs,
5692
6867
  now,
5693
- candidateTrace,
5694
- candidateTraceRef,
5695
- candidateTraceSummary
6868
+ outputMessages,
6869
+ traceSummary
5696
6870
  });
5697
6871
  const weight = evaluator.weight ?? 1;
5698
6872
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -5834,22 +7008,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
5834
7008
  async function dumpPrompt(directory, evalCase, promptInputs) {
5835
7009
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
5836
7010
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
5837
- const filePath = import_node_path14.default.resolve(directory, filename);
5838
- await (0, import_promises11.mkdir)(import_node_path14.default.dirname(filePath), { recursive: true });
7011
+ const filePath = import_node_path15.default.resolve(directory, filename);
7012
+ await (0, import_promises12.mkdir)(import_node_path15.default.dirname(filePath), { recursive: true });
5839
7013
  const payload = {
5840
7014
  eval_id: evalCase.id,
5841
7015
  question: promptInputs.question,
5842
7016
  guidelines: promptInputs.guidelines,
5843
7017
  guideline_paths: evalCase.guideline_paths
5844
7018
  };
5845
- await (0, import_promises11.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
7019
+ await (0, import_promises12.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
5846
7020
  }
5847
7021
  function sanitizeFilename(value) {
5848
7022
  if (!value) {
5849
7023
  return "prompt";
5850
7024
  }
5851
7025
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
5852
- return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
7026
+ return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
5853
7027
  }
5854
7028
  async function invokeProvider(provider, options) {
5855
7029
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -5906,22 +7080,22 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
5906
7080
  }
5907
7081
  return {
5908
7082
  timestamp: timestamp.toISOString(),
5909
- eval_id: evalCase.id,
7083
+ evalId: evalCase.id,
5910
7084
  dataset: evalCase.dataset,
5911
- conversation_id: evalCase.conversation_id,
7085
+ conversationId: evalCase.conversation_id,
5912
7086
  score: 0,
5913
7087
  hits: [],
5914
7088
  misses: [`Error: ${message}`],
5915
- candidate_answer: `Error occurred: ${message}`,
7089
+ candidateAnswer: `Error occurred: ${message}`,
5916
7090
  target: targetName,
5917
- raw_aspects: [],
5918
- agent_provider_request: agentProviderRequest,
5919
- lm_provider_request: lmProviderRequest,
7091
+ rawAspects: [],
7092
+ agentProviderRequest,
7093
+ lmProviderRequest,
5920
7094
  error: message
5921
7095
  };
5922
7096
  }
5923
7097
  function createCacheKey(provider, target, evalCase, promptInputs) {
5924
- const hash = (0, import_node_crypto2.createHash)("sha256");
7098
+ const hash = (0, import_node_crypto3.createHash)("sha256");
5925
7099
  hash.update(provider.id);
5926
7100
  hash.update(target.name);
5927
7101
  hash.update(evalCase.id);
@@ -5961,8 +7135,8 @@ function mapChildResults(children) {
5961
7135
  hits: child.hits,
5962
7136
  misses: child.misses,
5963
7137
  reasoning: child.reasoning,
5964
- evaluator_provider_request: child.evaluatorRawRequest,
5965
- evaluator_results: mapChildResults(child.evaluatorResults)
7138
+ evaluatorProviderRequest: child.evaluatorRawRequest,
7139
+ evaluatorResults: mapChildResults(child.evaluatorResults)
5966
7140
  }));
5967
7141
  }
5968
7142
  function computeWeightedMean(entries) {
@@ -6064,17 +7238,21 @@ function createAgentKernel() {
6064
7238
  0 && (module.exports = {
6065
7239
  CodeEvaluator,
6066
7240
  CompositeEvaluator,
7241
+ DEFAULT_EXPLORATION_TOOLS,
6067
7242
  LlmJudgeEvaluator,
6068
7243
  TEST_MESSAGE_ROLES,
6069
7244
  ToolTrajectoryEvaluator,
7245
+ avgToolDurationMs,
6070
7246
  buildDirectoryChain,
6071
7247
  buildPromptInputs,
6072
7248
  buildSearchRoots,
6073
7249
  computeTraceSummary,
6074
7250
  consumeCodexLogEntries,
7251
+ consumePiLogEntries,
6075
7252
  createAgentKernel,
6076
7253
  createProvider,
6077
7254
  ensureVSCodeSubagents,
7255
+ explorationRatio,
6078
7256
  extractCodeBlocks,
6079
7257
  fileExists,
6080
7258
  findGitRoot,
@@ -6086,10 +7264,9 @@ function createAgentKernel() {
6086
7264
  isJsonValue,
6087
7265
  isTestMessage,
6088
7266
  isTestMessageRole,
6089
- isTraceEvent,
6090
- isTraceEventType,
6091
7267
  listTargetNames,
6092
7268
  loadEvalCases,
7269
+ mergeExecutionMetrics,
6093
7270
  normalizeLineEndings,
6094
7271
  readJsonFile,
6095
7272
  readTargetDefinitions,
@@ -6100,6 +7277,8 @@ function createAgentKernel() {
6100
7277
  resolveTargetDefinition,
6101
7278
  runEvalCase,
6102
7279
  runEvaluation,
6103
- subscribeToCodexLogEntries
7280
+ subscribeToCodexLogEntries,
7281
+ subscribeToPiLogEntries,
7282
+ tokensPerTool
6104
7283
  });
6105
7284
  //# sourceMappingURL=index.cjs.map