@agentv/core 1.3.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import {
2
2
  buildDirectoryChain,
3
3
  buildSearchRoots,
4
+ extractLastAssistantContent,
4
5
  fileExists,
5
6
  findGitRoot,
6
7
  isAgentProvider,
@@ -9,7 +10,7 @@ import {
9
10
  readTextFile,
10
11
  resolveFileReference,
11
12
  resolveTargetDefinition
12
- } from "./chunk-4A6L2F6L.js";
13
+ } from "./chunk-E2VSU4WZ.js";
13
14
 
14
15
  // src/evaluation/types.ts
15
16
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -74,33 +75,69 @@ function getHitCount(result) {
74
75
  }
75
76
 
76
77
  // src/evaluation/trace.ts
77
- function isTraceEventType(value) {
78
- return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
79
- }
80
- function isTraceEvent(value) {
81
- if (typeof value !== "object" || value === null) {
82
- return false;
83
- }
84
- const candidate = value;
85
- return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
86
- }
87
- function computeTraceSummary(trace) {
78
+ function computeTraceSummary(messages) {
88
79
  const toolCallCounts = {};
89
- let errorCount = 0;
90
- for (const event of trace) {
91
- if (event.type === "tool_call" && event.name) {
92
- toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
93
- }
94
- if (event.type === "error") {
95
- errorCount++;
80
+ let totalToolCalls = 0;
81
+ for (const message of messages) {
82
+ if (!message.toolCalls) continue;
83
+ for (const toolCall of message.toolCalls) {
84
+ toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
85
+ totalToolCalls++;
96
86
  }
97
87
  }
98
88
  const toolNames = Object.keys(toolCallCounts).sort();
99
89
  return {
100
- eventCount: trace.length,
90
+ eventCount: totalToolCalls,
101
91
  toolNames,
102
92
  toolCallsByName: toolCallCounts,
103
- errorCount
93
+ errorCount: 0
94
+ };
95
+ }
96
+ var DEFAULT_EXPLORATION_TOOLS = [
97
+ "read",
98
+ "grep",
99
+ "glob",
100
+ "search",
101
+ "list",
102
+ "Read",
103
+ "Grep",
104
+ "Glob",
105
+ "WebSearch",
106
+ "WebFetch"
107
+ ];
108
+ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
109
+ if (summary.eventCount === 0) return void 0;
110
+ const explorationCalls = explorationTools.reduce(
111
+ (sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0),
112
+ 0
113
+ );
114
+ return explorationCalls / summary.eventCount;
115
+ }
116
+ function tokensPerTool(summary) {
117
+ if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
118
+ const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
119
+ return totalTokens / summary.eventCount;
120
+ }
121
+ function avgToolDurationMs(summary) {
122
+ if (!summary.toolDurations) return void 0;
123
+ let totalDuration = 0;
124
+ let totalCalls = 0;
125
+ for (const durations of Object.values(summary.toolDurations)) {
126
+ for (const duration of durations) {
127
+ totalDuration += duration;
128
+ totalCalls++;
129
+ }
130
+ }
131
+ if (totalCalls === 0) return void 0;
132
+ return totalDuration / totalCalls;
133
+ }
134
+ function mergeExecutionMetrics(summary, metrics) {
135
+ if (!metrics) return summary;
136
+ return {
137
+ ...summary,
138
+ tokenUsage: metrics.tokenUsage,
139
+ costUsd: metrics.costUsd,
140
+ durationMs: metrics.durationMs
104
141
  };
105
142
  }
106
143
 
@@ -376,7 +413,8 @@ var TEMPLATE_VARIABLES = {
376
413
  QUESTION: "question",
377
414
  EXPECTED_OUTCOME: "expected_outcome",
378
415
  REFERENCE_ANSWER: "reference_answer",
379
- INPUT_MESSAGES: "input_messages"
416
+ INPUT_MESSAGES: "input_messages",
417
+ OUTPUT_MESSAGES: "output_messages"
380
418
  };
381
419
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
382
420
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
@@ -616,7 +654,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
616
654
  expected = [];
617
655
  for (const item of rawExpected) {
618
656
  if (isJsonObject2(item) && typeof item.tool === "string") {
619
- expected.push({ tool: item.tool });
657
+ let args;
658
+ if (item.args === "any") {
659
+ args = "any";
660
+ } else if (isJsonObject2(item.args)) {
661
+ args = item.args;
662
+ }
663
+ expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
620
664
  }
621
665
  }
622
666
  }
@@ -1259,16 +1303,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1259
1303
  }) : [];
1260
1304
  const codeSnippets = extractCodeBlocks(inputSegments);
1261
1305
  let referenceAnswer = "";
1262
- if (outputSegments.length > 1) {
1263
- referenceAnswer = JSON.stringify(outputSegments, null, 2);
1264
- } else if (outputSegments.length === 1) {
1265
- const singleMessage = outputSegments[0];
1266
- if (typeof singleMessage.content === "string") {
1267
- referenceAnswer = singleMessage.content;
1268
- } else if (singleMessage.content) {
1269
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1270
- } else if (singleMessage.tool_calls) {
1271
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1306
+ if (outputSegments.length > 0) {
1307
+ const lastMessage = outputSegments[outputSegments.length - 1];
1308
+ const content = lastMessage.content;
1309
+ const toolCalls = lastMessage.tool_calls;
1310
+ if (typeof content === "string") {
1311
+ referenceAnswer = content;
1312
+ } else if (content !== void 0 && content !== null) {
1313
+ referenceAnswer = JSON.stringify(content, null, 2);
1314
+ } else if (toolCalls !== void 0 && toolCalls !== null) {
1315
+ referenceAnswer = JSON.stringify(toolCalls, null, 2);
1272
1316
  }
1273
1317
  }
1274
1318
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
@@ -1596,11 +1640,11 @@ async function invokeModel(options) {
1596
1640
  return mapResponse(result);
1597
1641
  }
1598
1642
  function mapResponse(result) {
1643
+ const content = result.text ?? "";
1599
1644
  return {
1600
- text: result.text ?? "",
1601
- reasoning: result.reasoningText ?? void 0,
1602
1645
  raw: result,
1603
- usage: toJsonObject(result.totalUsage ?? result.usage)
1646
+ usage: toJsonObject(result.totalUsage ?? result.usage),
1647
+ outputMessages: [{ role: "assistant", content }]
1604
1648
  };
1605
1649
  }
1606
1650
  function toJsonObject(value) {
@@ -1753,6 +1797,7 @@ var CliProvider = class {
1753
1797
  config;
1754
1798
  runCommand;
1755
1799
  verbose;
1800
+ keepTempFiles;
1756
1801
  healthcheckPromise;
1757
1802
  constructor(targetName, config, runner = defaultCommandRunner) {
1758
1803
  this.targetName = targetName;
@@ -1760,6 +1805,7 @@ var CliProvider = class {
1760
1805
  this.config = config;
1761
1806
  this.runCommand = runner;
1762
1807
  this.verbose = config.verbose ?? false;
1808
+ this.keepTempFiles = config.keepTempFiles ?? false;
1763
1809
  }
1764
1810
  async invoke(request) {
1765
1811
  if (request.signal?.aborted) {
@@ -1774,12 +1820,14 @@ var CliProvider = class {
1774
1820
  `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1775
1821
  );
1776
1822
  }
1823
+ const startTime = Date.now();
1777
1824
  const result = await this.runCommand(renderedCommand, {
1778
1825
  cwd: this.config.cwd,
1779
1826
  env: process.env,
1780
1827
  timeoutMs: this.config.timeoutMs,
1781
1828
  signal: request.signal
1782
1829
  });
1830
+ const measuredDurationMs = Date.now() - startTime;
1783
1831
  if (result.failed || (result.exitCode ?? 0) !== 0) {
1784
1832
  if (request.signal?.aborted) {
1785
1833
  throw new Error("CLI provider request was aborted");
@@ -1797,8 +1845,10 @@ var CliProvider = class {
1797
1845
  const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1798
1846
  const parsed = this.parseOutputContent(responseContent);
1799
1847
  return {
1800
- text: parsed.text,
1801
- trace: parsed.trace,
1848
+ outputMessages: parsed.outputMessages,
1849
+ tokenUsage: parsed.tokenUsage,
1850
+ costUsd: parsed.costUsd,
1851
+ durationMs: parsed.durationMs ?? measuredDurationMs,
1802
1852
  raw: {
1803
1853
  command: renderedCommand,
1804
1854
  stderr: result.stderr,
@@ -1846,12 +1896,14 @@ var CliProvider = class {
1846
1896
  `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1847
1897
  );
1848
1898
  }
1899
+ const startTime = Date.now();
1849
1900
  const result = await this.runCommand(renderedCommand, {
1850
1901
  cwd: this.config.cwd,
1851
1902
  env: process.env,
1852
1903
  timeoutMs: this.config.timeoutMs,
1853
1904
  signal: controller.signal
1854
1905
  });
1906
+ const measuredDurationMs = Date.now() - startTime;
1855
1907
  if (result.failed || (result.exitCode ?? 0) !== 0) {
1856
1908
  if (controller.signal.aborted) {
1857
1909
  throw new Error("CLI provider request was aborted");
@@ -1873,11 +1925,13 @@ var CliProvider = class {
1873
1925
  if (missingIds.length > 0) {
1874
1926
  throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
1875
1927
  }
1928
+ const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
1876
1929
  const responses = requests.map((request) => {
1877
1930
  const evalCaseId = request.evalCaseId;
1878
1931
  if (!evalCaseId) {
1879
1932
  return {
1880
- text: "",
1933
+ outputMessages: [],
1934
+ durationMs: perRequestFallbackMs,
1881
1935
  raw: {
1882
1936
  command: renderedCommand,
1883
1937
  stderr: result.stderr,
@@ -1890,7 +1944,8 @@ var CliProvider = class {
1890
1944
  const parsed = recordsById.get(evalCaseId);
1891
1945
  if (!parsed) {
1892
1946
  return {
1893
- text: "",
1947
+ outputMessages: [],
1948
+ durationMs: perRequestFallbackMs,
1894
1949
  raw: {
1895
1950
  command: renderedCommand,
1896
1951
  stderr: result.stderr,
@@ -1901,9 +1956,10 @@ var CliProvider = class {
1901
1956
  };
1902
1957
  }
1903
1958
  return {
1904
- text: parsed.text,
1905
- trace: parsed.trace,
1906
- traceRef: parsed.traceRef,
1959
+ outputMessages: parsed.outputMessages,
1960
+ tokenUsage: parsed.tokenUsage,
1961
+ costUsd: parsed.costUsd,
1962
+ durationMs: parsed.durationMs ?? perRequestFallbackMs,
1907
1963
  raw: {
1908
1964
  command: renderedCommand,
1909
1965
  stderr: result.stderr,
@@ -1918,28 +1974,111 @@ var CliProvider = class {
1918
1974
  }
1919
1975
  /**
1920
1976
  * Parse output content from CLI.
1921
- * If the content is valid JSON with a 'text' field, extract text and optional trace.
1922
- * Otherwise, treat the entire content as plain text.
1977
+ * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
1978
+ * If only 'text' is provided, wrap it in outputMessages.
1979
+ * Otherwise, treat the entire content as plain text wrapped in outputMessages.
1980
+ *
1981
+ * Also extracts optional execution metrics:
1982
+ * - token_usage: { input, output, cached? }
1983
+ * - cost_usd: number
1984
+ * - duration_ms: number
1923
1985
  */
1924
1986
  parseOutputContent(content) {
1925
1987
  try {
1926
1988
  const parsed = JSON.parse(content);
1927
- if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
1989
+ if (typeof parsed === "object" && parsed !== null) {
1928
1990
  const obj = parsed;
1929
- const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1930
- const trace = this.parseTrace(obj.trace);
1931
- return { text, trace };
1991
+ const tokenUsage = this.parseTokenUsage(obj.token_usage);
1992
+ const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
1993
+ const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
1994
+ const outputMessages = this.parseOutputMessages(obj.output_messages);
1995
+ if (outputMessages && outputMessages.length > 0) {
1996
+ return { outputMessages, tokenUsage, costUsd, durationMs };
1997
+ }
1998
+ if ("text" in obj) {
1999
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2000
+ return {
2001
+ outputMessages: [{ role: "assistant", content: text }],
2002
+ tokenUsage,
2003
+ costUsd,
2004
+ durationMs
2005
+ };
2006
+ }
1932
2007
  }
1933
2008
  } catch {
1934
2009
  }
1935
- return { text: content };
2010
+ return { outputMessages: [{ role: "assistant", content }] };
2011
+ }
2012
+ /**
2013
+ * Parse token_usage from CLI output.
2014
+ */
2015
+ parseTokenUsage(tokenUsage) {
2016
+ if (typeof tokenUsage !== "object" || tokenUsage === null) {
2017
+ return void 0;
2018
+ }
2019
+ const obj = tokenUsage;
2020
+ if (typeof obj.input !== "number" || typeof obj.output !== "number") {
2021
+ return void 0;
2022
+ }
2023
+ return {
2024
+ input: obj.input,
2025
+ output: obj.output,
2026
+ cached: typeof obj.cached === "number" ? obj.cached : void 0
2027
+ };
2028
+ }
2029
+ /**
2030
+ * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
2031
+ */
2032
+ parseOutputMessages(outputMessages) {
2033
+ if (!Array.isArray(outputMessages)) {
2034
+ return void 0;
2035
+ }
2036
+ const messages = [];
2037
+ for (const msg of outputMessages) {
2038
+ if (typeof msg !== "object" || msg === null) {
2039
+ continue;
2040
+ }
2041
+ const rawMsg = msg;
2042
+ if (typeof rawMsg.role !== "string") {
2043
+ continue;
2044
+ }
2045
+ const message = {
2046
+ role: rawMsg.role,
2047
+ name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
2048
+ content: rawMsg.content,
2049
+ toolCalls: this.parseToolCalls(rawMsg.tool_calls),
2050
+ timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
2051
+ metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
2052
+ };
2053
+ messages.push(message);
2054
+ }
2055
+ return messages.length > 0 ? messages : void 0;
1936
2056
  }
1937
- parseTrace(trace) {
1938
- if (!Array.isArray(trace)) {
2057
+ /**
2058
+ * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
2059
+ */
2060
+ parseToolCalls(toolCalls) {
2061
+ if (!Array.isArray(toolCalls)) {
1939
2062
  return void 0;
1940
2063
  }
1941
- const validEvents = trace.filter(isTraceEvent);
1942
- return validEvents.length > 0 ? validEvents : void 0;
2064
+ const calls = [];
2065
+ for (const call of toolCalls) {
2066
+ if (typeof call !== "object" || call === null) {
2067
+ continue;
2068
+ }
2069
+ const rawCall = call;
2070
+ if (typeof rawCall.tool !== "string") {
2071
+ continue;
2072
+ }
2073
+ calls.push({
2074
+ tool: rawCall.tool,
2075
+ input: rawCall.input,
2076
+ output: rawCall.output,
2077
+ id: typeof rawCall.id === "string" ? rawCall.id : void 0,
2078
+ timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
2079
+ });
2080
+ }
2081
+ return calls.length > 0 ? calls : void 0;
1943
2082
  }
1944
2083
  parseJsonlBatchOutput(content) {
1945
2084
  const records = /* @__PURE__ */ new Map();
@@ -1963,12 +2102,22 @@ var CliProvider = class {
1963
2102
  if (records.has(id)) {
1964
2103
  throw new Error(`CLI batch output contains duplicate id: ${id}`);
1965
2104
  }
1966
- const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
1967
- const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
2105
+ const tokenUsage = this.parseTokenUsage(obj.token_usage);
2106
+ const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
2107
+ const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
2108
+ const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2109
+ let outputMessages;
2110
+ if (parsedOutputMessages && parsedOutputMessages.length > 0) {
2111
+ outputMessages = parsedOutputMessages;
2112
+ } else {
2113
+ const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2114
+ outputMessages = text ? [{ role: "assistant", content: text }] : [];
2115
+ }
1968
2116
  records.set(id, {
1969
- text,
1970
- trace: this.parseTrace(obj.trace),
1971
- traceRef
2117
+ outputMessages,
2118
+ tokenUsage,
2119
+ costUsd,
2120
+ durationMs
1972
2121
  });
1973
2122
  }
1974
2123
  return records;
@@ -1981,8 +2130,10 @@ var CliProvider = class {
1981
2130
  const errorMsg = error instanceof Error ? error.message : String(error);
1982
2131
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
1983
2132
  } finally {
1984
- await fs.unlink(filePath).catch(() => {
1985
- });
2133
+ if (!this.keepTempFiles) {
2134
+ await fs.unlink(filePath).catch(() => {
2135
+ });
2136
+ }
1986
2137
  }
1987
2138
  }
1988
2139
  async ensureHealthy(signal) {
@@ -2282,6 +2433,11 @@ var execAsync2 = promisify2(execCallback);
2282
2433
  var WORKSPACE_PREFIX = "agentv-codex-";
2283
2434
  var PROMPT_FILENAME = "prompt.md";
2284
2435
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
2436
+ var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
2437
+ - Do NOT create any additional output files in the workspace.
2438
+ - All intended file outputs/changes MUST be written in your response.
2439
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
2440
+ This is required for evaluation scoring.`;
2285
2441
  var CodexProvider = class {
2286
2442
  id;
2287
2443
  kind = "codex";
@@ -2306,7 +2462,11 @@ var CodexProvider = class {
2306
2462
  const workspaceRoot = await this.createWorkspace();
2307
2463
  const logger = await this.createStreamLogger(request).catch(() => void 0);
2308
2464
  try {
2309
- const promptContent = buildPromptDocument(request, inputFiles);
2465
+ const basePrompt = buildPromptDocument(request, inputFiles);
2466
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
2467
+ const promptContent = `${systemPrompt}
2468
+
2469
+ ${basePrompt}`;
2310
2470
  const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
2311
2471
  await writeFile(promptFile, promptContent, "utf8");
2312
2472
  const args = this.buildCodexArgs();
@@ -2325,7 +2485,6 @@ var CodexProvider = class {
2325
2485
  const parsed = parseCodexJson(result.stdout);
2326
2486
  const assistantText = extractAssistantText(parsed);
2327
2487
  return {
2328
- text: assistantText,
2329
2488
  raw: {
2330
2489
  response: parsed,
2331
2490
  stdout: result.stdout,
@@ -2337,7 +2496,8 @@ var CodexProvider = class {
2337
2496
  workspace: workspaceRoot,
2338
2497
  inputFiles,
2339
2498
  logFile: logger?.filePath
2340
- }
2499
+ },
2500
+ outputMessages: [{ role: "assistant", content: assistantText }]
2341
2501
  };
2342
2502
  } finally {
2343
2503
  await logger?.close();
@@ -2959,7 +3119,6 @@ var MockProvider = class {
2959
3119
  delayMs;
2960
3120
  delayMinMs;
2961
3121
  delayMaxMs;
2962
- trace;
2963
3122
  constructor(targetName, config) {
2964
3123
  this.id = `mock:${targetName}`;
2965
3124
  this.targetName = targetName;
@@ -2967,7 +3126,6 @@ var MockProvider = class {
2967
3126
  this.delayMs = config.delayMs ?? 0;
2968
3127
  this.delayMinMs = config.delayMinMs ?? 0;
2969
3128
  this.delayMaxMs = config.delayMaxMs ?? 0;
2970
- this.trace = config.trace;
2971
3129
  }
2972
3130
  async invoke(request) {
2973
3131
  const delay = this.calculateDelay();
@@ -2975,12 +3133,11 @@ var MockProvider = class {
2975
3133
  await new Promise((resolve) => setTimeout(resolve, delay));
2976
3134
  }
2977
3135
  return {
2978
- text: this.cannedResponse,
3136
+ outputMessages: [{ role: "assistant", content: this.cannedResponse }],
2979
3137
  raw: {
2980
3138
  question: request.question,
2981
3139
  guidelines: request.guidelines
2982
- },
2983
- trace: this.trace
3140
+ }
2984
3141
  };
2985
3142
  }
2986
3143
  calculateDelay() {
@@ -2993,163 +3150,842 @@ var MockProvider = class {
2993
3150
  }
2994
3151
  };
2995
3152
 
2996
- // src/evaluation/providers/vscode.ts
3153
+ // src/evaluation/providers/pi-coding-agent.ts
3154
+ import { spawn as spawn2 } from "node:child_process";
3155
+ import { randomUUID as randomUUID2 } from "node:crypto";
3156
+ import { createWriteStream as createWriteStream2 } from "node:fs";
3157
+ import { mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
3158
+ import { tmpdir as tmpdir2 } from "node:os";
2997
3159
  import path10 from "node:path";
2998
- import {
2999
- dispatchAgentSession,
3000
- dispatchBatchAgent,
3001
- getSubagentRoot,
3002
- provisionSubagents
3003
- } from "subagent";
3004
-
3005
- // src/evaluation/providers/vscode-templates.ts
3006
- var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
3007
-
3008
- {{userQuery}}
3009
-
3010
- [[ ## system_instructions ## ]]
3011
-
3012
- **IMPORTANT**: Follow these exact steps:
3013
- 1. Create and write your complete response to: {{responseFileTmp}}
3014
- - Do NOT create any additional output files in the workspace.
3015
- - All intended file outputs/changes MUST be written in your response file.
3016
- - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
3017
- 2. When completely finished, run these PowerShell commands to signal completion:
3018
- \`\`\`
3019
- Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
3020
- if (Test-Path subagent.lock) { del subagent.lock }
3021
- \`\`\`
3022
-
3023
- Do not proceed to step 2 until your response is completely written to the temporary file.
3024
- `;
3025
- var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
3026
-
3027
- {{userQuery}}
3028
-
3029
- [[ ## system_instructions ## ]]
3030
3160
 
3031
- **IMPORTANT**: Follow these exact steps:
3032
- 1. Create and write your complete response to: {{responseFileTmp}}
3033
- - Do NOT create any additional output files in the workspace.
3034
- - All intended file outputs/changes MUST be written in your response file.
3035
- - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
3036
- 2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
3037
- 3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
3038
- `;
3161
+ // src/evaluation/providers/pi-log-tracker.ts
3162
+ var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
3163
+ var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
3164
+ function getPiLogStore() {
3165
+ const globalObject = globalThis;
3166
+ const existing = globalObject[GLOBAL_LOGS_KEY2];
3167
+ if (existing) {
3168
+ return existing;
3169
+ }
3170
+ const created = [];
3171
+ globalObject[GLOBAL_LOGS_KEY2] = created;
3172
+ return created;
3173
+ }
3174
+ function getSubscriberStore2() {
3175
+ const globalObject = globalThis;
3176
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
3177
+ if (existing) {
3178
+ return existing;
3179
+ }
3180
+ const created = /* @__PURE__ */ new Set();
3181
+ globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
3182
+ return created;
3183
+ }
3184
+ function notifySubscribers2(entry) {
3185
+ const subscribers = Array.from(getSubscriberStore2());
3186
+ for (const listener of subscribers) {
3187
+ try {
3188
+ listener(entry);
3189
+ } catch (error) {
3190
+ const message = error instanceof Error ? error.message : String(error);
3191
+ console.warn(`Pi log subscriber failed: ${message}`);
3192
+ }
3193
+ }
3194
+ }
3195
+ function recordPiLogEntry(entry) {
3196
+ getPiLogStore().push(entry);
3197
+ notifySubscribers2(entry);
3198
+ }
3199
+ function consumePiLogEntries() {
3200
+ const store = getPiLogStore();
3201
+ if (store.length === 0) {
3202
+ return [];
3203
+ }
3204
+ return store.splice(0, store.length);
3205
+ }
3206
+ function subscribeToPiLogEntries(listener) {
3207
+ const store = getSubscriberStore2();
3208
+ store.add(listener);
3209
+ return () => {
3210
+ store.delete(listener);
3211
+ };
3212
+ }
3039
3213
 
3040
- // src/evaluation/providers/vscode.ts
3041
- var VSCodeProvider = class {
3214
+ // src/evaluation/providers/pi-coding-agent.ts
3215
+ var WORKSPACE_PREFIX2 = "agentv-pi-";
3216
+ var PROMPT_FILENAME2 = "prompt.md";
3217
+ var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
3218
+ - Do NOT create any additional output files in the workspace.
3219
+ - All intended file outputs/changes MUST be written in your response.
3220
+ - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
3221
+ This is required for evaluation scoring.`;
3222
+ var PiCodingAgentProvider = class {
3042
3223
  id;
3043
- kind;
3224
+ kind = "pi-coding-agent";
3044
3225
  targetName;
3045
- supportsBatch = true;
3226
+ supportsBatch = false;
3046
3227
  config;
3047
- constructor(targetName, config, kind) {
3048
- this.id = `${kind}:${targetName}`;
3049
- this.kind = kind;
3228
+ runPi;
3229
+ constructor(targetName, config, runner = defaultPiRunner) {
3230
+ this.id = `pi-coding-agent:${targetName}`;
3050
3231
  this.targetName = targetName;
3051
3232
  this.config = config;
3233
+ this.runPi = runner;
3052
3234
  }
3053
3235
  async invoke(request) {
3054
3236
  if (request.signal?.aborted) {
3055
- throw new Error("VS Code provider request was aborted before dispatch");
3056
- }
3057
- const inputFiles = normalizeAttachments(request.inputFiles);
3058
- const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
3059
- const session = await dispatchAgentSession({
3060
- userQuery: promptContent,
3061
- extraAttachments: inputFiles,
3062
- requestTemplate: AGENTV_REQUEST_TEMPLATE,
3063
- wait: this.config.waitForResponse,
3064
- dryRun: this.config.dryRun,
3065
- vscodeCmd: this.config.command,
3066
- subagentRoot: this.config.subagentRoot,
3067
- workspaceTemplate: this.config.workspaceTemplate,
3068
- silent: true
3069
- });
3070
- if (session.exitCode !== 0 || !session.responseFile) {
3071
- const failure = session.error ?? "VS Code subagent did not produce a response";
3072
- throw new Error(failure);
3237
+ throw new Error("Pi coding agent request was aborted before execution");
3073
3238
  }
3074
- if (this.config.dryRun) {
3239
+ const inputFiles = normalizeInputFiles2(request.inputFiles);
3240
+ const workspaceRoot = await this.createWorkspace();
3241
+ const logger = await this.createStreamLogger(request).catch(() => void 0);
3242
+ try {
3243
+ const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
3244
+ await writeFile2(promptFile, request.question, "utf8");
3245
+ const args = this.buildPiArgs(request.question, inputFiles);
3246
+ const cwd = this.resolveCwd(workspaceRoot);
3247
+ const result = await this.executePi(args, cwd, request.signal, logger);
3248
+ if (result.timedOut) {
3249
+ throw new Error(
3250
+ `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
3251
+ );
3252
+ }
3253
+ if (result.exitCode !== 0) {
3254
+ const detail = pickDetail2(result.stderr, result.stdout);
3255
+ const prefix = `Pi coding agent exited with code ${result.exitCode}`;
3256
+ throw new Error(detail ? `${prefix}: ${detail}` : prefix);
3257
+ }
3258
+ const parsed = parsePiJsonl(result.stdout);
3259
+ const outputMessages = extractOutputMessages(parsed);
3260
+ const assistantText = extractAssistantText2(outputMessages);
3075
3261
  return {
3076
- text: "",
3077
3262
  raw: {
3078
- session,
3079
- inputFiles
3080
- }
3263
+ response: parsed,
3264
+ stdout: result.stdout,
3265
+ stderr: result.stderr,
3266
+ exitCode: result.exitCode,
3267
+ args,
3268
+ executable: this.config.executable,
3269
+ promptFile,
3270
+ workspace: workspaceRoot,
3271
+ inputFiles,
3272
+ logFile: logger?.filePath
3273
+ },
3274
+ outputMessages
3081
3275
  };
3276
+ } finally {
3277
+ await logger?.close();
3278
+ await this.cleanupWorkspace(workspaceRoot);
3082
3279
  }
3083
- const responseText = await readTextFile(session.responseFile);
3084
- return {
3085
- text: responseText,
3086
- raw: {
3087
- session,
3088
- inputFiles
3089
- }
3090
- };
3091
3280
  }
3092
- async invokeBatch(requests) {
3093
- if (requests.length === 0) {
3094
- return [];
3281
+ resolveCwd(workspaceRoot) {
3282
+ if (!this.config.cwd) {
3283
+ return workspaceRoot;
3095
3284
  }
3096
- const normalizedRequests = requests.map((req) => ({
3097
- request: req,
3098
- inputFiles: normalizeAttachments(req.inputFiles)
3099
- }));
3100
- const combinedInputFiles = mergeAttachments(
3101
- normalizedRequests.map(({ inputFiles }) => inputFiles)
3102
- );
3103
- const userQueries = normalizedRequests.map(
3104
- ({ request, inputFiles }) => buildPromptDocument2(request, inputFiles, request.guideline_patterns)
3105
- );
3106
- const session = await dispatchBatchAgent({
3107
- userQueries,
3108
- extraAttachments: combinedInputFiles,
3109
- requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
3110
- wait: this.config.waitForResponse,
3111
- dryRun: this.config.dryRun,
3112
- vscodeCmd: this.config.command,
3113
- subagentRoot: this.config.subagentRoot,
3114
- workspaceTemplate: this.config.workspaceTemplate,
3115
- silent: true
3116
- });
3117
- if (session.exitCode !== 0 || !session.responseFiles) {
3118
- const failure = session.error ?? "VS Code subagent did not produce batch responses";
3119
- throw new Error(failure);
3285
+ return path10.resolve(this.config.cwd);
3286
+ }
3287
+ buildPiArgs(prompt, inputFiles) {
3288
+ const args = [];
3289
+ if (this.config.provider) {
3290
+ args.push("--provider", this.config.provider);
3120
3291
  }
3121
- if (this.config.dryRun) {
3122
- return normalizedRequests.map(({ inputFiles }) => ({
3123
- text: "",
3124
- raw: {
3125
- session,
3126
- inputFiles,
3127
- allInputFiles: combinedInputFiles
3128
- }
3129
- }));
3292
+ if (this.config.model) {
3293
+ args.push("--model", this.config.model);
3130
3294
  }
3131
- if (session.responseFiles.length !== requests.length) {
3132
- throw new Error(
3133
- `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
3134
- );
3295
+ if (this.config.apiKey) {
3296
+ args.push("--api-key", this.config.apiKey);
3135
3297
  }
3136
- const responses = [];
3137
- for (const [index, responseFile] of session.responseFiles.entries()) {
3138
- const responseText = await readTextFile(responseFile);
3139
- responses.push({
3140
- text: responseText,
3141
- raw: {
3142
- session,
3143
- inputFiles: normalizedRequests[index]?.inputFiles,
3144
- allInputFiles: combinedInputFiles,
3145
- responseFile
3146
- }
3147
- });
3298
+ args.push("--mode", "json");
3299
+ args.push("--print");
3300
+ args.push("--no-session");
3301
+ if (this.config.tools) {
3302
+ args.push("--tools", this.config.tools);
3148
3303
  }
3149
- return responses;
3150
- }
3151
- };
3152
- function buildPromptDocument2(request, attachments, guidelinePatterns) {
3304
+ if (this.config.thinking) {
3305
+ args.push("--thinking", this.config.thinking);
3306
+ }
3307
+ if (this.config.args && this.config.args.length > 0) {
3308
+ args.push(...this.config.args);
3309
+ }
3310
+ if (inputFiles && inputFiles.length > 0) {
3311
+ for (const file of inputFiles) {
3312
+ args.push(`@${file}`);
3313
+ }
3314
+ }
3315
+ const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
3316
+ const fullPrompt = `${systemPrompt}
3317
+
3318
+ ${prompt}`;
3319
+ const escapedPrompt = escapeAtSymbols(fullPrompt);
3320
+ args.push(escapedPrompt);
3321
+ return args;
3322
+ }
3323
+ async executePi(args, cwd, signal, logger) {
3324
+ try {
3325
+ return await this.runPi({
3326
+ executable: this.config.executable,
3327
+ args,
3328
+ cwd,
3329
+ timeoutMs: this.config.timeoutMs,
3330
+ env: this.buildEnv(),
3331
+ signal,
3332
+ onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
3333
+ onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
3334
+ });
3335
+ } catch (error) {
3336
+ const err = error;
3337
+ if (err.code === "ENOENT") {
3338
+ throw new Error(
3339
+ `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
3340
+ );
3341
+ }
3342
+ throw error;
3343
+ }
3344
+ }
3345
+ buildEnv() {
3346
+ const env = { ...process.env };
3347
+ if (this.config.apiKey) {
3348
+ const provider = this.config.provider?.toLowerCase() ?? "google";
3349
+ switch (provider) {
3350
+ case "google":
3351
+ case "gemini":
3352
+ env.GEMINI_API_KEY = this.config.apiKey;
3353
+ break;
3354
+ case "anthropic":
3355
+ env.ANTHROPIC_API_KEY = this.config.apiKey;
3356
+ break;
3357
+ case "openai":
3358
+ env.OPENAI_API_KEY = this.config.apiKey;
3359
+ break;
3360
+ case "groq":
3361
+ env.GROQ_API_KEY = this.config.apiKey;
3362
+ break;
3363
+ case "xai":
3364
+ env.XAI_API_KEY = this.config.apiKey;
3365
+ break;
3366
+ case "openrouter":
3367
+ env.OPENROUTER_API_KEY = this.config.apiKey;
3368
+ break;
3369
+ }
3370
+ }
3371
+ return env;
3372
+ }
3373
+ async createWorkspace() {
3374
+ return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
3375
+ }
3376
+ async cleanupWorkspace(workspaceRoot) {
3377
+ try {
3378
+ await rm2(workspaceRoot, { recursive: true, force: true });
3379
+ } catch {
3380
+ }
3381
+ }
3382
+ resolveLogDirectory() {
3383
+ if (this.config.logDir) {
3384
+ return path10.resolve(this.config.logDir);
3385
+ }
3386
+ return path10.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
3387
+ }
3388
+ async createStreamLogger(request) {
3389
+ const logDir = this.resolveLogDirectory();
3390
+ if (!logDir) {
3391
+ return void 0;
3392
+ }
3393
+ try {
3394
+ await mkdir2(logDir, { recursive: true });
3395
+ } catch (error) {
3396
+ const message = error instanceof Error ? error.message : String(error);
3397
+ console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
3398
+ return void 0;
3399
+ }
3400
+ const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
3401
+ try {
3402
+ const logger = await PiStreamLogger.create({
3403
+ filePath,
3404
+ targetName: this.targetName,
3405
+ evalCaseId: request.evalCaseId,
3406
+ attempt: request.attempt,
3407
+ format: this.config.logFormat ?? "summary"
3408
+ });
3409
+ recordPiLogEntry({
3410
+ filePath,
3411
+ targetName: this.targetName,
3412
+ evalCaseId: request.evalCaseId,
3413
+ attempt: request.attempt
3414
+ });
3415
+ return logger;
3416
+ } catch (error) {
3417
+ const message = error instanceof Error ? error.message : String(error);
3418
+ console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
3419
+ return void 0;
3420
+ }
3421
+ }
3422
+ };
3423
/**
 * Appends a timestamped, line-oriented log of a Pi process's stdout/stderr to
 * a file. Chunks are buffered per source until a newline arrives; each complete
 * line is formatted (JSON pretty-print or one-line summary, depending on
 * `format`) and written with an elapsed-time prefix.
 */
var PiStreamLogger = class _PiStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // Ensures a failing log stream is reported only once.
  streamErrorReported = false;
  /**
   * @param filePath Log file to append to.
   * @param format "json" selects pretty-printed JSON lines; anything else
   *   selects the summary formatter.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream2(filePath, { flags: "a" });
    // A Writable without an 'error' listener turns open/write failures into an
    // uncaught exception. Logging is best-effort, so report once and carry on.
    this.stream.on("error", (error) => {
      if (this.streamErrorReported) {
        return;
      }
      this.streamErrorReported = true;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi stream logging failed for ${this.filePath}: ${message}`);
    });
  }
  /**
   * Creates a logger and writes the header block (title, target, optional
   * eval id and 1-based attempt, start time) followed by one blank line.
   *
   * @param options `{ filePath, format, targetName, evalCaseId?, attempt? }`.
   * @returns The ready-to-use logger.
   */
  static async create(options) {
    const logger = new _PiStreamLogger(options.filePath, options.format);
    const header = [
      "# Pi Coding Agent stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional entries; the trailing "" must survive so
      // the header is separated from the log body by a blank line.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes any lines it completes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes any lines it completes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Writes everything still buffered (including unterminated last lines) and
   * ends the file stream. Rejects if the stream errors while closing.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve());
    });
  }
  /** Writes each given line followed by a newline, without formatting. */
  writeLines(lines) {
    for (const line of lines) {
      this.stream.write(`${line}
`);
    }
  }
  /**
   * Writes all complete lines held in the given source's buffer and keeps the
   * unterminated tail buffered for the next chunk.
   *
   * @param source "stdout" or "stderr".
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    // The last element is whatever follows the final newline (possibly "").
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      const formatted = this.formatLine(line, source);
      if (formatted) {
        this.stream.write(formatted);
        this.stream.write("\n");
      }
    }
  }
  /**
   * Formats one raw line as `[+elapsed] [source] message`; blank lines yield
   * `undefined` so they are skipped.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes the unterminated tail of both buffers, then clears them. */
  flushRemainder() {
    const stdoutRemainder = this.stdoutBuffer.trim();
    if (stdoutRemainder.length > 0) {
      const formatted = this.formatLine(stdoutRemainder, "stdout");
      if (formatted) {
        this.stream.write(formatted);
        this.stream.write("\n");
      }
    }
    const stderrRemainder = this.stderrBuffer.trim();
    if (stderrRemainder.length > 0) {
      const formatted = this.formatLine(stderrRemainder, "stderr");
      if (formatted) {
        this.stream.write(formatted);
        this.stream.write("\n");
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
3517
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 random chars>.log`.
 * The timestamp is ISO-8601 with `:` and `.` replaced by `-`; the attempt
 * suffix is 1-based and omitted when `request.attempt` is undefined.
 *
 * @param request Supplies `evalCaseId` (defaults to "pi") and `attempt`.
 * @param targetName Target name, sanitized for use in a file name.
 * @returns The file name (no directory part).
 */
function buildLogFilename2(request, targetName) {
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename2(request.evalCaseId ?? "pi");
  const attemptPart = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
  const targetPart = sanitizeForFilename2(targetName);
  const uniquePart = randomUUID2().slice(0, 8);
  return `${stamp}_${targetPart}_${evalPart}${attemptPart}_${uniquePart}.log`;
}
3524
/**
 * Makes a string safe for a file name by collapsing every run of characters
 * outside `A-Z a-z 0-9 . _ -` into a single underscore.
 *
 * @param value Text to sanitize.
 * @returns The sanitized text, or "pi" when the input is empty.
 */
function sanitizeForFilename2(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  return cleaned || "pi";
}
3528
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or as
 * `HH:MM:SS` once at least one hour has passed. Each field is zero-padded to
 * two digits.
 *
 * @param startedAt Start time in epoch milliseconds.
 * @returns The elapsed-time string.
 */
function formatElapsed2(startedAt) {
  const twoDigits = (value) => value.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const fields = [Math.floor(totalSeconds % 3600 / 60), totalSeconds % 60];
  if (hours > 0) {
    fields.unshift(hours);
  }
  return fields.map(twoDigits).join(":");
}
3538
/**
 * Produces the summary-mode text for one log line. A line that parses as JSON
 * and yields an event summary is replaced by that summary; otherwise the raw
 * line is returned, prefixed with `stderr: ` when it came from stderr.
 *
 * @param rawLine Trimmed line of process output.
 * @param source "stdout" or "stderr".
 * @returns The message to log.
 */
function formatPiLogMessage(rawLine, source) {
  const parsed = tryParseJsonValue2(rawLine);
  const summary = parsed ? summarizePiEvent(parsed) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
3551
/**
 * Produces the json-mode text for one log line: the line re-serialized as
 * pretty-printed JSON (2-space indent). Lines that are not valid JSON, or that
 * parse to a falsy value (`null`, `0`, `false`, `""`), are returned unchanged.
 *
 * @param rawLine Trimmed line of process output.
 * @returns Pretty-printed JSON or the original line.
 */
function formatPiJsonLog(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  if (!value) {
    return rawLine;
  }
  try {
    return JSON.stringify(value, null, 2);
  } catch {
    return rawLine;
  }
}
3562
/**
 * Turns a parsed Pi JSON event into a short one-line description for the
 * summary log.
 *
 * - `message_start` / `message_end` → `<type>: <role>`
 * - `message_update` with a string `text_delta` → `text_delta: <first 50 chars>`
 *   (with `...` appended when truncated)
 * - other `message_update` → `message_update: <inner event type>`
 * - any other event → its `type`
 *
 * A missing or non-string role / inner event type is rendered as `unknown`
 * rather than leaking the literal text "undefined" into the log.
 *
 * @param event Parsed JSON value.
 * @returns The summary, or `undefined` when `event` is not an object with a
 *   non-empty string `type`.
 */
function summarizePiEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const record = event;
  const type = typeof record.type === "string" ? record.type : void 0;
  if (!type) {
    return void 0;
  }
  const labelOf = (value) => typeof value === "string" && value.length > 0 ? value : "unknown";
  switch (type) {
    case "message_start":
    case "message_end":
      return `${type}: ${labelOf(record.message?.role)}`;
    case "message_update": {
      const update = record.assistantMessageEvent;
      const updateType = update?.type;
      if (updateType === "text_delta" && typeof update.delta === "string") {
        const { delta } = update;
        // Keep streamed text previews short so the log stays one line per event.
        const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
        return `text_delta: ${preview}`;
      }
      return `message_update: ${labelOf(updateType)}`;
    }
    default:
      // agent_start, agent_end, turn_start, turn_end and unknown events are
      // all summarized by their type name.
      return type;
  }
}
3602
/**
 * Parses a line as JSON without throwing.
 *
 * @param rawLine Text to parse.
 * @returns The parsed value, or `undefined` when the text is not valid JSON.
 */
function tryParseJsonValue2(rawLine) {
  let value = void 0;
  try {
    value = JSON.parse(rawLine);
  } catch {
    // Not JSON: fall through with `undefined`.
  }
  return value;
}
3609
/**
 * Parses Pi's JSON-lines output. Blank lines and lines that are not valid JSON
 * are skipped.
 *
 * @param output Raw stdout text.
 * @returns The parsed values, in order.
 * @throws {Error} When the output is blank, or contains no valid JSON line.
 */
function parsePiJsonl(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Pi coding agent produced no output");
  }
  const events = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(line));
    } catch {
      // Non-JSON noise between events is tolerated.
    }
  }
  if (events.length === 0) {
    throw new Error("Pi coding agent produced no valid JSON output");
  }
  return events;
}
3627
/**
 * Derives the output message list from parsed Pi events.
 *
 * Preferred source: the last `agent_end` event that carries a `messages`
 * array. Fallback when no such event exists: the `message` of every
 * `turn_end` event, in order. Messages that cannot be converted are dropped.
 *
 * @param events Parsed JSONL events.
 * @returns Converted messages (possibly empty).
 */
function extractOutputMessages(events) {
  const isObject = (value) => Boolean(value) && typeof value === "object";
  let index = events.length;
  while (index-- > 0) {
    const candidate = events[index];
    const hasMessages = isObject(candidate) && candidate.type === "agent_end" && Array.isArray(candidate.messages);
    if (hasMessages) {
      return candidate.messages.map(convertPiMessage).filter((m) => m !== void 0);
    }
  }
  const collected = [];
  for (const entry of events) {
    if (!isObject(entry) || entry.type !== "turn_end") {
      continue;
    }
    const converted = convertPiMessage(entry.message);
    if (converted) {
      collected.push(converted);
    }
  }
  return collected;
}
3659
/**
 * Converts one Pi message object into the output-message shape
 * `{ role, content, toolCalls, timestamp, metadata }`.
 *
 * - `content` / `toolCalls` are extracted from `message.content`; `toolCalls`
 *   is `undefined` when none were found.
 * - `timestamp`: a numeric value is converted to an ISO-8601 string, a string
 *   is passed through. A numeric value that is not a representable date
 *   (NaN, Infinity, out of range) yields `undefined` instead of throwing, so
 *   one malformed timestamp cannot abort conversion of the whole transcript.
 * - `metadata` collects the truthy `api`, `provider`, `model`, `usage` and
 *   `stopReason` fields, or is `undefined` when none are present.
 *
 * @param message Raw message value from a Pi event.
 * @returns The converted message, or `undefined` when `message` is not an
 *   object with a string `role`.
 */
function convertPiMessage(message) {
  if (!message || typeof message !== "object") {
    return void 0;
  }
  const msg = message;
  const role = msg.role;
  if (typeof role !== "string") {
    return void 0;
  }
  const content = extractTextContent(msg.content);
  const toolCalls = extractToolCalls(msg.content);
  let timestamp;
  if (typeof msg.timestamp === "number") {
    const date = new Date(msg.timestamp);
    // toISOString() throws RangeError on an invalid Date; guard against it.
    if (!Number.isNaN(date.getTime())) {
      timestamp = date.toISOString();
    }
  } else if (typeof msg.timestamp === "string") {
    timestamp = msg.timestamp;
  }
  const metadata = {};
  if (msg.api) metadata.api = msg.api;
  if (msg.provider) metadata.provider = msg.provider;
  if (msg.model) metadata.model = msg.model;
  if (msg.usage) metadata.usage = msg.usage;
  if (msg.stopReason) metadata.stopReason = msg.stopReason;
  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
    timestamp,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0
  };
}
3685
/**
 * Extracts the plain text of a message's `content`.
 *
 * @param content Either a string (returned as-is) or an array of content
 *   parts, of which the `{ type: "text", text: string }` parts are joined
 *   with newlines.
 * @returns The text, or `undefined` when `content` is neither a string nor an
 *   array, or the array holds no text parts.
 */
function extractTextContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const isTextPart = (part) => Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string";
  const pieces = content.filter(isTextPart).map((part) => part.text);
  return pieces.length === 0 ? void 0 : pieces.join("\n");
}
3704
/**
 * Collects tool calls from an array of message content parts.
 *
 * Each `{ type: "tool_use", name }` part becomes `{ tool, input, id }`. A
 * later `{ type: "tool_result", tool_use_id }` part whose id matches an
 * already-collected call attaches its `content` as that call's `output`.
 *
 * @param content Message content; anything other than an array yields `[]`.
 * @returns The tool calls in the order they were encountered.
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    const entry = part;
    if (entry.type === "tool_use" && typeof entry.name === "string") {
      const id = typeof entry.id === "string" ? entry.id : void 0;
      calls.push({ tool: entry.name, input: entry.input, id });
    }
    if (entry.type === "tool_result" && typeof entry.tool_use_id === "string") {
      const position = calls.findIndex((call) => call.id === entry.tool_use_id);
      if (position !== -1) {
        calls[position] = { ...calls[position], output: entry.content };
      }
    }
  }
  return calls;
}
3734
/**
 * Returns the content of the last assistant message that has truthy content.
 * String content is returned directly; any other content is JSON-serialized.
 *
 * @param messages Output messages, oldest first.
 * @returns The assistant text, or "" when no such message exists.
 */
function extractAssistantText2(messages) {
  let position = messages.length;
  while (position-- > 0) {
    const { role, content } = messages[position];
    if (role !== "assistant" || !content) {
      continue;
    }
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}
3746
/**
 * Rewrites every `@[label]:` marker in the prompt to `[[label]]:`. Other `@`
 * characters are left alone.
 *
 * @param prompt Prompt text.
 * @returns The prompt with the markers rewritten.
 */
function escapeAtSymbols(prompt) {
  const atLabelPattern = /@\[([^\]]+)\]:/g;
  return prompt.replace(atLabelPattern, (_marker, label) => `[[${label}]]:`);
}
3749
/**
 * Picks the most useful diagnostic text from a finished process: trimmed
 * stderr when it is non-empty, otherwise trimmed stdout.
 *
 * @param stderr Captured stderr.
 * @param stdout Captured stdout.
 * @returns The chosen text, or `undefined` when both are blank.
 */
function pickDetail2(stderr, stdout) {
  return stderr.trim() || stdout.trim() || void 0;
}
3757
/**
 * Builds the " after Ns" suffix for timeout messages, rounding the timeout up
 * to whole seconds.
 *
 * @param timeoutMs Timeout in milliseconds.
 * @returns The suffix, or "" when the timeout is missing, zero or negative.
 */
function formatTimeoutSuffix3(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && timeoutMs > 0;
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
3764
/**
 * Spawns the Pi executable (without a shell) and collects its output.
 *
 * `options.executable` may contain extra whitespace-separated arguments
 * (e.g. a launcher plus sub-command); the first token is the program and the
 * rest are prepended to `options.args`. Surrounding whitespace is ignored so a
 * padded value does not turn into an empty program name.
 *
 * The child is sent SIGTERM when `options.signal` aborts or when
 * `options.timeoutMs` elapses; in both cases the promise still resolves once
 * the child closes, with `timedOut` telling the two apart.
 *
 * @param options `{ executable, args, cwd, env, timeoutMs?, signal?,
 *   onStdoutChunk?, onStderrChunk? }`.
 * @returns `{ stdout, stderr, exitCode, timedOut }`; `exitCode` is -1 when the
 *   child did not exit with a numeric code (e.g. it was killed by a signal).
 * @throws Rejects when the executable is empty or the process cannot be
 *   spawned (e.g. `ENOENT`).
 */
async function defaultPiRunner(options) {
  return await new Promise((resolve, reject) => {
    const [executable, ...executableArgs] = options.executable.trim().split(/\s+/);
    if (!executable) {
      reject(new Error("Pi coding agent executable is empty"));
      return;
    }
    const allArgs = [...executableArgs, ...options.args];
    const child = spawn2(executable, allArgs, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: false
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // The timer alone must not keep the event loop alive.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
      options.onStderrChunk?.(chunk);
    });
    // Nothing is sent on stdin; close it right away.
    child.stdin.end();
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    // 'close' (not 'exit') guarantees stdout/stderr have been fully read.
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
3831
+
3832
+ // src/evaluation/providers/vscode.ts
3833
+ import path11 from "node:path";
3834
+ import {
3835
+ dispatchAgentSession,
3836
+ dispatchBatchAgent,
3837
+ getSubagentRoot,
3838
+ provisionSubagents
3839
+ } from "subagent";
3840
+
3841
+ // src/evaluation/providers/vscode-templates.ts
3842
+ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
3843
+
3844
+ {{userQuery}}
3845
+
3846
+ [[ ## system_instructions ## ]]
3847
+
3848
+ **IMPORTANT**: Follow these exact steps:
3849
+ 1. Create and write your complete response to: {{responseFileTmp}}
3850
+ - Do NOT create any additional output files in the workspace.
3851
+ - All intended file outputs/changes MUST be written in your response file.
3852
+ - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
3853
+ 2. When completely finished, run these PowerShell commands to signal completion:
3854
+ \`\`\`
3855
+ Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
3856
+ if (Test-Path subagent.lock) { del subagent.lock }
3857
+ \`\`\`
3858
+
3859
+ Do not proceed to step 2 until your response is completely written to the temporary file.
3860
+ `;
3861
+ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
3862
+
3863
+ {{userQuery}}
3864
+
3865
+ [[ ## system_instructions ## ]]
3866
+
3867
+ **IMPORTANT**: Follow these exact steps:
3868
+ 1. Create and write your complete response to: {{responseFileTmp}}
3869
+ - Do NOT create any additional output files in the workspace.
3870
+ - All intended file outputs/changes MUST be written in your response file.
3871
+ - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
3872
+ 2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
3873
+ 3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
3874
+ `;
3875
+
3876
+ // src/evaluation/providers/vscode.ts
3877
/**
 * Provider that dispatches eval requests to a VS Code subagent and reads the
 * answer back from the response file(s) the subagent writes. Supports both
 * single requests and batches.
 */
var VSCodeProvider = class {
  id;
  kind;
  targetName;
  supportsBatch = true;
  config;
  /**
   * @param targetName Name of the configured target.
   * @param config Provider settings (waitForResponse, dryRun, command,
   *   subagentRoot, workspaceTemplate).
   * @param kind Provider kind; combined with the target name to form `id`.
   */
  constructor(targetName, config, kind) {
    Object.assign(this, {
      id: `${kind}:${targetName}`,
      kind,
      targetName,
      config
    });
  }
  /**
   * Dispatches one request and returns the subagent's response as a single
   * assistant message. In dry-run mode no response is read and
   * `outputMessages` is empty.
   *
   * @throws {Error} When the request was already aborted, or the session
   *   failed / produced no response file.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("VS Code provider request was aborted before dispatch");
    }
    const inputFiles = normalizeAttachments(request.inputFiles);
    const userQuery = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
    const { waitForResponse, dryRun, command, subagentRoot, workspaceTemplate } = this.config;
    const session = await dispatchAgentSession({
      userQuery,
      extraAttachments: inputFiles,
      requestTemplate: AGENTV_REQUEST_TEMPLATE,
      wait: waitForResponse,
      dryRun,
      vscodeCmd: command,
      subagentRoot,
      workspaceTemplate,
      silent: true
    });
    const succeeded = session.exitCode === 0 && Boolean(session.responseFile);
    if (!succeeded) {
      throw new Error(session.error ?? "VS Code subagent did not produce a response");
    }
    const raw = { session, inputFiles };
    if (this.config.dryRun) {
      return { outputMessages: [], raw };
    }
    const responseText = await readTextFile(session.responseFile);
    return {
      outputMessages: [{ role: "assistant", content: responseText }],
      raw
    };
  }
  /**
   * Dispatches several requests in one batch session. Attachments of all
   * requests are merged for the session; responses are returned in request
   * order, one assistant message each (empty in dry-run mode).
   *
   * @throws {Error} When the session failed, produced no response files, or
   *   the number of response files differs from the number of requests.
   */
  async invokeBatch(requests) {
    if (requests.length === 0) {
      return [];
    }
    const normalizedRequests = requests.map((req) => ({
      request: req,
      inputFiles: normalizeAttachments(req.inputFiles)
    }));
    const combinedInputFiles = mergeAttachments(
      normalizedRequests.map((entry) => entry.inputFiles)
    );
    const userQueries = normalizedRequests.map(
      (entry) => buildPromptDocument2(entry.request, entry.inputFiles, entry.request.guideline_patterns)
    );
    const { waitForResponse, dryRun, command, subagentRoot, workspaceTemplate } = this.config;
    const session = await dispatchBatchAgent({
      userQueries,
      extraAttachments: combinedInputFiles,
      requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
      wait: waitForResponse,
      dryRun,
      vscodeCmd: command,
      subagentRoot,
      workspaceTemplate,
      silent: true
    });
    const succeeded = session.exitCode === 0 && Boolean(session.responseFiles);
    if (!succeeded) {
      throw new Error(session.error ?? "VS Code subagent did not produce batch responses");
    }
    if (this.config.dryRun) {
      return normalizedRequests.map((entry) => ({
        outputMessages: [],
        raw: {
          session,
          inputFiles: entry.inputFiles,
          allInputFiles: combinedInputFiles
        }
      }));
    }
    if (session.responseFiles.length !== requests.length) {
      throw new Error(
        `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
      );
    }
    const responses = [];
    // Response files are read one at a time, in request order.
    let index = 0;
    for (const responseFile of session.responseFiles) {
      const responseText = await readTextFile(responseFile);
      responses.push({
        outputMessages: [{ role: "assistant", content: responseText }],
        raw: {
          session,
          inputFiles: normalizedRequests[index]?.inputFiles,
          allInputFiles: combinedInputFiles,
          responseFile
        }
      });
      index += 1;
    }
    return responses;
  }
};
3988
+ function buildPromptDocument2(request, attachments, guidelinePatterns) {
3153
3989
  const parts = [];
3154
3990
  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
3155
3991
  parts.push(request.systemPrompt.trim());
@@ -3169,7 +4005,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
3169
4005
  return "";
3170
4006
  }
3171
4007
  const buildList = (files) => files.map((absolutePath) => {
3172
- const fileName = path10.basename(absolutePath);
4008
+ const fileName = path11.basename(absolutePath);
3173
4009
  const fileUri = pathToFileUri2(absolutePath);
3174
4010
  return `* [${fileName}](${fileUri})`;
3175
4011
  });
@@ -3194,8 +4030,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3194
4030
  }
3195
4031
  const unique = /* @__PURE__ */ new Map();
3196
4032
  for (const attachment of attachments) {
3197
- const absolutePath = path10.resolve(attachment);
3198
- const normalized = absolutePath.split(path10.sep).join("/");
4033
+ const absolutePath = path11.resolve(attachment);
4034
+ const normalized = absolutePath.split(path11.sep).join("/");
3199
4035
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3200
4036
  if (!unique.has(absolutePath)) {
3201
4037
  unique.set(absolutePath, absolutePath);
@@ -3210,7 +4046,7 @@ function collectAttachmentFiles(attachments) {
3210
4046
  }
3211
4047
  const unique = /* @__PURE__ */ new Map();
3212
4048
  for (const attachment of attachments) {
3213
- const absolutePath = path10.resolve(attachment);
4049
+ const absolutePath = path11.resolve(attachment);
3214
4050
  if (!unique.has(absolutePath)) {
3215
4051
  unique.set(absolutePath, absolutePath);
3216
4052
  }
@@ -3218,7 +4054,7 @@ function collectAttachmentFiles(attachments) {
3218
4054
  return Array.from(unique.values());
3219
4055
  }
3220
4056
  function pathToFileUri2(filePath) {
3221
- const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
4057
+ const absolutePath = path11.isAbsolute(filePath) ? filePath : path11.resolve(filePath);
3222
4058
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3223
4059
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3224
4060
  return `file:///${normalizedPath}`;
@@ -3231,7 +4067,7 @@ function normalizeAttachments(attachments) {
3231
4067
  }
3232
4068
  const deduped = /* @__PURE__ */ new Set();
3233
4069
  for (const attachment of attachments) {
3234
- deduped.add(path10.resolve(attachment));
4070
+ deduped.add(path11.resolve(attachment));
3235
4071
  }
3236
4072
  return Array.from(deduped);
3237
4073
  }
@@ -3240,7 +4076,7 @@ function mergeAttachments(all) {
3240
4076
  for (const list of all) {
3241
4077
  if (!list) continue;
3242
4078
  for (const inputFile of list) {
3243
- deduped.add(path10.resolve(inputFile));
4079
+ deduped.add(path11.resolve(inputFile));
3244
4080
  }
3245
4081
  }
3246
4082
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3289,7 +4125,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
3289
4125
  // src/evaluation/providers/targets-file.ts
3290
4126
  import { constants as constants3 } from "node:fs";
3291
4127
  import { access as access3, readFile as readFile6 } from "node:fs/promises";
3292
- import path11 from "node:path";
4128
+ import path12 from "node:path";
3293
4129
  import { parse as parse3 } from "yaml";
3294
4130
  function isRecord(value) {
3295
4131
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -3326,7 +4162,7 @@ async function fileExists3(filePath) {
3326
4162
  }
3327
4163
  }
3328
4164
  async function readTargetDefinitions(filePath) {
3329
- const absolutePath = path11.resolve(filePath);
4165
+ const absolutePath = path12.resolve(filePath);
3330
4166
  if (!await fileExists3(absolutePath)) {
3331
4167
  throw new Error(`targets.yaml not found at ${absolutePath}`);
3332
4168
  }
@@ -3358,6 +4194,8 @@ function createProvider(target) {
3358
4194
  return new CliProvider(target.name, target.config);
3359
4195
  case "codex":
3360
4196
  return new CodexProvider(target.name, target.config);
4197
+ case "pi-coding-agent":
4198
+ return new PiCodingAgentProvider(target.name, target.config);
3361
4199
  case "mock":
3362
4200
  return new MockProvider(target.name, target.config);
3363
4201
  case "vscode":
@@ -3377,6 +4215,74 @@ function resolveAndCreateProvider(definition, env = process.env) {
3377
4215
  // src/evaluation/evaluators.ts
3378
4216
  import { generateText as generateText2 } from "ai";
3379
4217
  import { z } from "zod";
4218
+
4219
+ // src/runtime/exec.ts
4220
/**
 * Returns `Bun.spawn` when a global `Bun` object exposing a `spawn` function
 * is present, otherwise `undefined`.
 */
function getBunSpawn() {
  const candidate = globalThis.Bun?.spawn;
  if (typeof candidate !== "function") {
    return void 0;
  }
  return candidate;
}
4224
/**
 * Runs a shell command, feeds `stdinPayload` to its stdin and collects its
 * output. Uses `Bun.spawn` when available, otherwise Node's `child_process`.
 *
 * Both runtimes behave the same way:
 * - a timeout kills the process and rejects with `Process timed out after Nms`;
 * - stdout and stderr are read to the end before the result is produced;
 * - a process that ends without a numeric exit code (killed by a signal) is
 *   reported with exit code -1, never as a successful 0.
 *
 * @param command Shell command line.
 * @param stdinPayload Text written to the process's stdin.
 * @param options `{ cwd?, timeoutMs? }`.
 * @returns `{ stdout, stderr, exitCode }`.
 * @throws {Error} On timeout, or when the process cannot be spawned.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const { cwd, timeoutMs } = options;
  const createTimeoutError = () => new Error(`Process timed out after ${timeoutMs}ms`);
  const bunSpawn = getBunSpawn();
  if (bunSpawn) {
    const encoder = new TextEncoder();
    const proc = bunSpawn({
      cmd: ["sh", "-c", command],
      cwd,
      stdin: encoder.encode(stdinPayload),
      stdout: "pipe",
      stderr: "pipe"
    });
    let timedOut = false;
    const timeout = timeoutMs ? setTimeout(() => {
      timedOut = true;
      proc.kill();
    }, timeoutMs) : void 0;
    try {
      // Drain both pipes concurrently: reading them one after the other can
      // deadlock when the process fills the pipe that is not being read yet.
      const [stdout, stderr, exitCode] = await Promise.all([
        new Response(proc.stdout).text(),
        new Response(proc.stderr).text(),
        proc.exited
      ]);
      if (timedOut) {
        throw createTimeoutError();
      }
      return { stdout, stderr, exitCode: typeof exitCode === "number" ? exitCode : -1 };
    } finally {
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
    }
  }
  const { spawn: spawn3 } = await import("node:child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn3(command, {
      shell: true,
      cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    let stdout = "";
    let stderr = "";
    const timeout = timeoutMs ? setTimeout(() => {
      child.kill();
      reject(createTimeoutError());
    }, timeoutMs) : void 0;
    const clearPendingTimeout = () => {
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
    };
    child.stdout?.on("data", (data) => {
      stdout += data.toString();
    });
    child.stderr?.on("data", (data) => {
      stderr += data.toString();
    });
    child.on("error", (error) => {
      clearPendingTimeout();
      reject(error);
    });
    // 'close' fires only after the stdio streams have ended, so the collected
    // output is complete; 'exit' may fire while data is still in flight.
    child.on("close", (code) => {
      clearPendingTimeout();
      // `code` is null when the process was terminated by a signal.
      resolve({ stdout, stderr, exitCode: typeof code === "number" ? code : -1 });
    });
    // A command that exits without reading stdin makes the write fail with
    // EPIPE; the exit code already describes the outcome, so ignore it here.
    child.stdin?.on("error", () => {
    });
    child.stdin?.write(stdinPayload);
    child.stdin?.end();
  });
}
4284
+
4285
+ // src/evaluation/evaluators.ts
3380
4286
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3381
4287
 
3382
4288
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3441,6 +4347,7 @@ var LlmJudgeEvaluator = class {
3441
4347
  null,
3442
4348
  2
3443
4349
  ),
4350
+ [TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
3444
4351
  [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
3445
4352
  [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
3446
4353
  [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
@@ -3465,7 +4372,7 @@ var LlmJudgeEvaluator = class {
3465
4372
  const score = clampScore(data.score);
3466
4373
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
3467
4374
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
3468
- const reasoning = data.reasoning ?? providerResponse?.reasoning;
4375
+ const reasoning = data.reasoning;
3469
4376
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3470
4377
  return {
3471
4378
  score,
@@ -3567,7 +4474,9 @@ var LlmJudgeEvaluator = class {
3567
4474
  maxOutputTokens: this.maxOutputTokens,
3568
4475
  temperature: this.temperature
3569
4476
  });
3570
- const data = schema.parse(parseJsonFromText(response.text ?? ""));
4477
+ const data = schema.parse(
4478
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
4479
+ );
3571
4480
  return { data, providerResponse: response };
3572
4481
  } catch (e) {
3573
4482
  lastError = e instanceof Error ? e : new Error(String(e));
@@ -3649,17 +4558,17 @@ var CodeEvaluator = class {
3649
4558
  const inputPayload = JSON.stringify(
3650
4559
  {
3651
4560
  question: context.evalCase.question,
3652
- expected_outcome: context.evalCase.expected_outcome,
3653
- expected_messages: context.evalCase.expected_messages,
3654
- reference_answer: context.evalCase.reference_answer,
3655
- candidate_answer: context.candidate,
3656
- guideline_files: context.evalCase.guideline_paths,
3657
- input_files: context.evalCase.file_paths.filter(
3658
- (path13) => !context.evalCase.guideline_paths.includes(path13)
4561
+ expectedOutcome: context.evalCase.expected_outcome,
4562
+ expectedMessages: context.evalCase.expected_messages,
4563
+ referenceAnswer: context.evalCase.reference_answer,
4564
+ candidateAnswer: context.candidate,
4565
+ outputMessages: context.outputMessages ?? null,
4566
+ guidelineFiles: context.evalCase.guideline_paths,
4567
+ inputFiles: context.evalCase.file_paths.filter(
4568
+ (path14) => !context.evalCase.guideline_paths.includes(path14)
3659
4569
  ),
3660
- input_messages: context.evalCase.input_messages,
3661
- candidate_trace_file: context.candidateTraceRef ?? null,
3662
- candidate_trace_summary: context.candidateTraceSummary ?? null
4570
+ inputMessages: context.evalCase.input_messages,
4571
+ traceSummary: context.traceSummary ?? null
3663
4572
  },
3664
4573
  null,
3665
4574
  2
@@ -3729,43 +4638,17 @@ function calculateRubricScore(result, rubrics) {
3729
4638
  return { score, verdict, hits, misses };
3730
4639
  }
3731
4640
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
3732
- const { spawn: spawn2 } = await import("node:child_process");
3733
- return await new Promise((resolve, reject) => {
3734
- const child = spawn2(scriptPath, {
3735
- shell: true,
3736
- cwd
3737
- });
3738
- let stdout = "";
3739
- let stderr = "";
3740
- const timeout = agentTimeoutMs ? setTimeout(() => {
3741
- child.kill();
3742
- reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
3743
- }, agentTimeoutMs) : void 0;
3744
- child.stdout?.on("data", (data) => {
3745
- stdout += data.toString();
3746
- });
3747
- child.stderr?.on("data", (data) => {
3748
- stderr += data.toString();
3749
- });
3750
- child.on("error", (error) => {
3751
- if (timeout !== void 0) {
3752
- clearTimeout(timeout);
3753
- }
3754
- reject(error);
3755
- });
3756
- child.on("exit", (code) => {
3757
- if (timeout !== void 0) {
3758
- clearTimeout(timeout);
3759
- }
3760
- if (code && code !== 0 && stderr.length > 0) {
3761
- reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
3762
- return;
3763
- }
3764
- resolve(stdout.trim());
3765
- });
3766
- child.stdin?.write(input);
3767
- child.stdin?.end();
4641
+ const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
4642
+ cwd,
4643
+ timeoutMs: agentTimeoutMs
3768
4644
  });
4645
+ if (exitCode !== 0) {
4646
+ const trimmedErr = stderr.trim();
4647
+ throw new Error(
4648
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
4649
+ );
4650
+ }
4651
+ return stdout.trim();
3769
4652
  }
3770
4653
  function parseJsonSafe(payload) {
3771
4654
  try {
@@ -3779,6 +4662,33 @@ function substituteVariables(template, variables) {
3779
4662
  return variables[varName] ?? match;
3780
4663
  });
3781
4664
  }
4665
/**
 * Structurally compares two JSON-like values.
 *
 * Primitives are compared with `===`, except that two `NaN` values are
 * considered equal. Arrays must have the same length and pairwise-equal
 * elements (holes are read as `undefined`). Other objects must have the same
 * number of own enumerable string keys, and every key of `a` must be an own
 * key of `b` with a deep-equal value.
 *
 * Only own enumerable keys are inspected, so objects that keep their state
 * elsewhere (e.g. `Date`, `Map`) are not meaningfully compared; presumably the
 * inputs are plain JSON data.
 *
 * @param {unknown} a - First value.
 * @param {unknown} b - Second value.
 * @returns {boolean} `true` when the values are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) return true;
  // NaN is the only value that is not === to itself; without this check two
  // identical payloads containing NaN would be reported as different.
  if (Number.isNaN(a) && Number.isNaN(b)) return true;
  // typeof null is "object", so rule it out before the object checks below.
  if (a === null || b === null) return false;
  // Distinct primitives (or a primitive versus an object) can never match here.
  if (typeof a !== "object" || typeof b !== "object") return false;

  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) return false;

  if (aIsArray) {
    if (a.length !== b.length) return false;
    // Index loop instead of every(): every() skips holes in sparse arrays and
    // would let [ <hole>, 1 ] compare equal to [ 5, 1 ].
    for (let i = 0; i < a.length; i++) {
      if (!deepEqual(a[i], b[i])) return false;
    }
    return true;
  }

  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false;
  return aKeys.every((key) => Object.hasOwn(b, key) && deepEqual(a[key], b[key]));
}
4682
/**
 * Checks whether a tool call's actual arguments satisfy the expected ones.
 *
 * Matching is partial: every key listed in `expected` must be an own property
 * of `actual` with a deep-equal value (via `deepEqual`); keys present only in
 * `actual` are ignored.
 *
 * @param {object | "any" | null | undefined} expected - Expected arguments.
 *   `undefined`, `null`, or the string `"any"` mean "no constraint".
 * @param {object | null | undefined} actual - Arguments the tool was called with.
 * @returns {boolean} `true` when `actual` satisfies `expected`.
 */
function argsMatch(expected, actual) {
  // No constraint configured. null is treated like undefined so that an empty
  // `args` entry does not crash Object.keys() below.
  if (expected === void 0 || expected === null || expected === "any") return true;
  // A call without arguments cannot satisfy a constraint. null must be
  // rejected explicitly because Object.hasOwn(null, key) throws a TypeError.
  if (actual === void 0 || actual === null) return false;
  for (const key of Object.keys(expected)) {
    if (!Object.hasOwn(actual, key)) return false;
    if (!deepEqual(expected[key], actual[key])) return false;
  }
  return true;
}
3782
4692
  var ToolTrajectoryEvaluator = class {
3783
4693
  kind = "tool_trajectory";
3784
4694
  config;
@@ -3786,8 +4696,19 @@ var ToolTrajectoryEvaluator = class {
3786
4696
  this.config = options.config;
3787
4697
  }
3788
4698
  evaluate(context) {
3789
- const { candidateTrace, candidateTraceSummary } = context;
3790
- if (!candidateTrace || !candidateTraceSummary) {
4699
+ const { outputMessages, traceSummary } = context;
4700
+ const toolCalls = this.extractToolCallsFromMessages(outputMessages);
4701
+ if (toolCalls.length === 0 && !traceSummary) {
4702
+ return {
4703
+ score: 0,
4704
+ verdict: "fail",
4705
+ hits: [],
4706
+ misses: ["No trace available for evaluation"],
4707
+ expectedAspectCount: 1
4708
+ };
4709
+ }
4710
+ const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
4711
+ if (!summary) {
3791
4712
  return {
3792
4713
  score: 0,
3793
4714
  verdict: "fail",
@@ -3798,11 +4719,11 @@ var ToolTrajectoryEvaluator = class {
3798
4719
  }
3799
4720
  switch (this.config.mode) {
3800
4721
  case "any_order":
3801
- return this.evaluateAnyOrder(candidateTraceSummary);
4722
+ return this.evaluateAnyOrder(summary);
3802
4723
  case "in_order":
3803
- return this.evaluateInOrder(candidateTrace);
4724
+ return this.evaluateInOrder(toolCalls);
3804
4725
  case "exact":
3805
- return this.evaluateExact(candidateTrace);
4726
+ return this.evaluateExact(toolCalls);
3806
4727
  default:
3807
4728
  return {
3808
4729
  score: 0,
@@ -3813,6 +4734,42 @@ var ToolTrajectoryEvaluator = class {
3813
4734
  };
3814
4735
  }
3815
4736
  }
4737
+ /**
4738
+ * Extract tool calls from output messages.
4739
+ */
4740
+ extractToolCallsFromMessages(messages) {
4741
+ if (!messages) {
4742
+ return [];
4743
+ }
4744
+ const toolCalls = [];
4745
+ for (const message of messages) {
4746
+ if (message.toolCalls) {
4747
+ for (const call of message.toolCalls) {
4748
+ toolCalls.push({
4749
+ name: call.tool,
4750
+ args: call.input
4751
+ });
4752
+ }
4753
+ }
4754
+ }
4755
+ return toolCalls;
4756
+ }
4757
+ /**
4758
+ * Build a summary from extracted tool calls.
4759
+ */
4760
+ buildSummary(toolCalls) {
4761
+ const toolCallsByName = {};
4762
+ for (const call of toolCalls) {
4763
+ toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
4764
+ }
4765
+ const toolNames = Object.keys(toolCallsByName).sort();
4766
+ return {
4767
+ eventCount: toolCalls.length,
4768
+ toolNames,
4769
+ toolCallsByName,
4770
+ errorCount: 0
4771
+ };
4772
+ }
3816
4773
  evaluateAnyOrder(summary) {
3817
4774
  const minimums = this.config.minimums ?? {};
3818
4775
  const toolNames = Object.keys(minimums);
@@ -3845,7 +4802,7 @@ var ToolTrajectoryEvaluator = class {
3845
4802
  expectedAspectCount: toolNames.length
3846
4803
  };
3847
4804
  }
3848
- evaluateInOrder(trace) {
4805
+ evaluateInOrder(toolCalls) {
3849
4806
  const expected = this.config.expected ?? [];
3850
4807
  if (expected.length === 0) {
3851
4808
  return {
@@ -3856,23 +4813,33 @@ var ToolTrajectoryEvaluator = class {
3856
4813
  expectedAspectCount: 0
3857
4814
  };
3858
4815
  }
3859
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
3860
4816
  const hits = [];
3861
4817
  const misses = [];
3862
4818
  let actualIndex = 0;
3863
4819
  for (let i = 0; i < expected.length; i++) {
3864
- const expectedTool = expected[i].tool;
4820
+ const expectedItem = expected[i];
4821
+ const expectedTool = expectedItem.tool;
3865
4822
  let found = false;
3866
- while (actualIndex < actualToolCalls.length) {
3867
- if (actualToolCalls[actualIndex].name === expectedTool) {
3868
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
4823
+ let argsMismatch = false;
4824
+ while (actualIndex < toolCalls.length) {
4825
+ const actualCall = toolCalls[actualIndex];
4826
+ if (actualCall.name === expectedTool) {
4827
+ if (argsMatch(expectedItem.args, actualCall.args)) {
4828
+ hits.push(`Found ${expectedTool} at position ${actualIndex}`);
4829
+ actualIndex++;
4830
+ found = true;
4831
+ break;
4832
+ }
4833
+ misses.push(
4834
+ `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
4835
+ );
3869
4836
  actualIndex++;
3870
- found = true;
4837
+ argsMismatch = true;
3871
4838
  break;
3872
4839
  }
3873
4840
  actualIndex++;
3874
4841
  }
3875
- if (!found) {
4842
+ if (!found && !argsMismatch) {
3876
4843
  misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
3877
4844
  }
3878
4845
  }
@@ -3885,7 +4852,7 @@ var ToolTrajectoryEvaluator = class {
3885
4852
  expectedAspectCount: expected.length
3886
4853
  };
3887
4854
  }
3888
- evaluateExact(trace) {
4855
+ evaluateExact(toolCalls) {
3889
4856
  const expected = this.config.expected ?? [];
3890
4857
  if (expected.length === 0) {
3891
4858
  return {
@@ -3896,18 +4863,23 @@ var ToolTrajectoryEvaluator = class {
3896
4863
  expectedAspectCount: 0
3897
4864
  };
3898
4865
  }
3899
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
3900
4866
  const hits = [];
3901
4867
  const misses = [];
3902
- if (actualToolCalls.length !== expected.length) {
3903
- misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
4868
+ if (toolCalls.length !== expected.length) {
4869
+ misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
3904
4870
  }
3905
- const checkLength = Math.min(expected.length, actualToolCalls.length);
4871
+ const checkLength = Math.min(expected.length, toolCalls.length);
3906
4872
  for (let i = 0; i < checkLength; i++) {
3907
- const expectedTool = expected[i].tool;
3908
- const actualTool = actualToolCalls[i].name;
4873
+ const expectedItem = expected[i];
4874
+ const expectedTool = expectedItem.tool;
4875
+ const actualCall = toolCalls[i];
4876
+ const actualTool = actualCall.name;
3909
4877
  if (actualTool === expectedTool) {
3910
- hits.push(`Position ${i}: ${expectedTool} \u2713`);
4878
+ if (argsMatch(expectedItem.args, actualCall.args)) {
4879
+ hits.push(`Position ${i}: ${expectedTool}`);
4880
+ } else {
4881
+ misses.push(`Position ${i}: ${expectedTool} args mismatch`);
4882
+ }
3911
4883
  } else {
3912
4884
  misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
3913
4885
  }
@@ -4119,11 +5091,13 @@ var CompositeEvaluator = class {
4119
5091
  evalCaseId: context.evalCase.id,
4120
5092
  attempt: context.attempt
4121
5093
  });
4122
- const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
5094
+ const data = freeformEvaluationSchema.parse(
5095
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
5096
+ );
4123
5097
  const score = clampScore(data.score);
4124
5098
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
4125
5099
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
4126
- const reasoning = data.reasoning ?? response.reasoning;
5100
+ const reasoning = data.reasoning;
4127
5101
  return {
4128
5102
  score,
4129
5103
  verdict: scoreToVerdict(score),
@@ -4149,9 +5123,9 @@ var CompositeEvaluator = class {
4149
5123
  };
4150
5124
 
4151
5125
  // src/evaluation/orchestrator.ts
4152
- import { createHash, randomUUID as randomUUID2 } from "node:crypto";
4153
- import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
4154
- import path12 from "node:path";
5126
+ import { createHash, randomUUID as randomUUID3 } from "node:crypto";
5127
+ import { mkdir as mkdir3, writeFile as writeFile3 } from "node:fs/promises";
5128
+ import path13 from "node:path";
4155
5129
 
4156
5130
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
4157
5131
  var Node = class {
@@ -4546,11 +5520,19 @@ async function runBatchEvaluation(options) {
4546
5520
  const evalCase = evalCases[i];
4547
5521
  const promptInputs = promptInputsList[i];
4548
5522
  const providerResponse = batchResponse[i];
5523
+ const outputMessages = providerResponse.outputMessages;
5524
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5525
+ const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
5526
+ tokenUsage: providerResponse.tokenUsage,
5527
+ costUsd: providerResponse.costUsd,
5528
+ durationMs: providerResponse.durationMs
5529
+ }) : void 0;
5530
+ const candidate = extractLastAssistantContent(outputMessages);
4549
5531
  let result;
4550
5532
  try {
4551
5533
  result = await evaluateCandidate({
4552
5534
  evalCase,
4553
- candidate: providerResponse.text ?? "",
5535
+ candidate,
4554
5536
  target,
4555
5537
  provider,
4556
5538
  evaluators: evaluatorRegistry,
@@ -4558,7 +5540,9 @@ async function runBatchEvaluation(options) {
4558
5540
  nowFn,
4559
5541
  attempt: 0,
4560
5542
  judgeProvider: await resolveJudgeProvider(target),
4561
- agentTimeoutMs
5543
+ agentTimeoutMs,
5544
+ outputMessages,
5545
+ traceSummary
4562
5546
  });
4563
5547
  } catch (error) {
4564
5548
  const errorResult = buildErrorResult(
@@ -4662,21 +5646,18 @@ async function runEvalCase(options) {
4662
5646
  if (cacheKey && cache && !cachedResponse) {
4663
5647
  await cache.set(cacheKey, providerResponse);
4664
5648
  }
4665
- let candidateTrace = providerResponse.trace;
4666
- if (!candidateTrace && providerResponse.traceRef) {
4667
- try {
4668
- const rawTrace = await readJsonFile(providerResponse.traceRef);
4669
- if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
4670
- candidateTrace = rawTrace;
4671
- }
4672
- } catch {
4673
- }
4674
- }
4675
- const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
5649
+ const outputMessages = providerResponse.outputMessages;
5650
+ const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5651
+ const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
5652
+ tokenUsage: providerResponse.tokenUsage,
5653
+ costUsd: providerResponse.costUsd,
5654
+ durationMs: providerResponse.durationMs
5655
+ }) : void 0;
5656
+ const candidate = extractLastAssistantContent(outputMessages);
4676
5657
  try {
4677
5658
  return await evaluateCandidate({
4678
5659
  evalCase,
4679
- candidate: providerResponse.text ?? "",
5660
+ candidate,
4680
5661
  target,
4681
5662
  provider,
4682
5663
  evaluators,
@@ -4685,9 +5666,8 @@ async function runEvalCase(options) {
4685
5666
  attempt,
4686
5667
  judgeProvider,
4687
5668
  agentTimeoutMs,
4688
- candidateTrace,
4689
- candidateTraceRef: providerResponse.traceRef,
4690
- candidateTraceSummary
5669
+ outputMessages,
5670
+ traceSummary
4691
5671
  });
4692
5672
  } catch (error) {
4693
5673
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4705,9 +5685,8 @@ async function evaluateCandidate(options) {
4705
5685
  attempt,
4706
5686
  judgeProvider,
4707
5687
  agentTimeoutMs,
4708
- candidateTrace,
4709
- candidateTraceRef,
4710
- candidateTraceSummary
5688
+ outputMessages,
5689
+ traceSummary
4711
5690
  } = options;
4712
5691
  const gradeTimestamp = nowFn();
4713
5692
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4721,9 +5700,8 @@ async function evaluateCandidate(options) {
4721
5700
  now: gradeTimestamp,
4722
5701
  judgeProvider,
4723
5702
  agentTimeoutMs,
4724
- candidateTrace,
4725
- candidateTraceRef,
4726
- candidateTraceSummary
5703
+ outputMessages,
5704
+ traceSummary
4727
5705
  });
4728
5706
  const completedAt = nowFn();
4729
5707
  let agentProviderRequest;
@@ -4747,21 +5725,21 @@ async function evaluateCandidate(options) {
4747
5725
  }
4748
5726
  return {
4749
5727
  timestamp: completedAt.toISOString(),
4750
- eval_id: evalCase.id,
5728
+ evalId: evalCase.id,
4751
5729
  dataset: evalCase.dataset,
4752
- conversation_id: evalCase.conversation_id,
5730
+ conversationId: evalCase.conversation_id,
4753
5731
  score: score.score,
4754
5732
  hits: score.hits,
4755
5733
  misses: score.misses,
4756
- candidate_answer: candidate,
5734
+ candidateAnswer: candidate,
4757
5735
  target: target.name,
4758
5736
  reasoning: score.reasoning,
4759
- raw_aspects: score.rawAspects,
4760
- agent_provider_request: agentProviderRequest,
4761
- lm_provider_request: lmProviderRequest,
4762
- evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4763
- evaluator_results: evaluatorResults,
4764
- trace_summary: candidateTraceSummary
5737
+ rawAspects: score.rawAspects,
5738
+ agentProviderRequest,
5739
+ lmProviderRequest,
5740
+ evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
5741
+ evaluatorResults,
5742
+ traceSummary
4765
5743
  };
4766
5744
  }
4767
5745
  async function runEvaluatorsForCase(options) {
@@ -4776,9 +5754,8 @@ async function runEvaluatorsForCase(options) {
4776
5754
  now,
4777
5755
  judgeProvider,
4778
5756
  agentTimeoutMs,
4779
- candidateTrace,
4780
- candidateTraceRef,
4781
- candidateTraceSummary
5757
+ outputMessages,
5758
+ traceSummary
4782
5759
  } = options;
4783
5760
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
4784
5761
  return runEvaluatorList({
@@ -4793,9 +5770,8 @@ async function runEvaluatorsForCase(options) {
4793
5770
  now,
4794
5771
  judgeProvider,
4795
5772
  agentTimeoutMs,
4796
- candidateTrace,
4797
- candidateTraceRef,
4798
- candidateTraceSummary
5773
+ outputMessages,
5774
+ traceSummary
4799
5775
  });
4800
5776
  }
4801
5777
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4812,9 +5788,8 @@ async function runEvaluatorsForCase(options) {
4812
5788
  promptInputs,
4813
5789
  now,
4814
5790
  judgeProvider,
4815
- candidateTrace,
4816
- candidateTraceRef,
4817
- candidateTraceSummary
5791
+ outputMessages,
5792
+ traceSummary
4818
5793
  });
4819
5794
  return { score };
4820
5795
  }
@@ -4831,9 +5806,8 @@ async function runEvaluatorList(options) {
4831
5806
  now,
4832
5807
  judgeProvider,
4833
5808
  agentTimeoutMs,
4834
- candidateTrace,
4835
- candidateTraceRef,
4836
- candidateTraceSummary
5809
+ outputMessages,
5810
+ traceSummary
4837
5811
  } = options;
4838
5812
  const scored = [];
4839
5813
  const evaluatorResults = [];
@@ -4863,7 +5837,7 @@ async function runEvaluatorList(options) {
4863
5837
  hits: score2.hits,
4864
5838
  misses: score2.misses,
4865
5839
  reasoning: score2.reasoning,
4866
- evaluator_provider_request: score2.evaluatorRawRequest
5840
+ evaluatorProviderRequest: score2.evaluatorRawRequest
4867
5841
  });
4868
5842
  }
4869
5843
  if (evaluator.type === "code") {
@@ -4880,8 +5854,8 @@ async function runEvaluatorList(options) {
4880
5854
  attempt,
4881
5855
  promptInputs,
4882
5856
  now,
4883
- candidateTraceRef,
4884
- candidateTraceSummary
5857
+ outputMessages,
5858
+ traceSummary
4885
5859
  });
4886
5860
  const weight = evaluator.weight ?? 1;
4887
5861
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -4894,11 +5868,11 @@ async function runEvaluatorList(options) {
4894
5868
  hits: score2.hits,
4895
5869
  misses: score2.misses,
4896
5870
  reasoning: score2.reasoning,
4897
- evaluator_provider_request: score2.evaluatorRawRequest
5871
+ evaluatorProviderRequest: score2.evaluatorRawRequest
4898
5872
  });
4899
5873
  }
4900
5874
  if (evaluator.type === "composite") {
4901
- const evalFileDir = evalCase.guideline_paths[0] ? path12.dirname(evalCase.guideline_paths[0]) : process.cwd();
5875
+ const evalFileDir = evalCase.guideline_paths[0] ? path13.dirname(evalCase.guideline_paths[0]) : process.cwd();
4902
5876
  const createEvaluator = (memberConfig) => {
4903
5877
  switch (memberConfig.type) {
4904
5878
  case "llm_judge":
@@ -4951,8 +5925,8 @@ async function runEvaluatorList(options) {
4951
5925
  hits: score2.hits,
4952
5926
  misses: score2.misses,
4953
5927
  reasoning: score2.reasoning,
4954
- evaluator_provider_request: score2.evaluatorRawRequest,
4955
- evaluator_results: mapChildResults(score2.evaluatorResults)
5928
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
5929
+ evaluatorResults: mapChildResults(score2.evaluatorResults)
4956
5930
  });
4957
5931
  }
4958
5932
  if (evaluator.type === "tool_trajectory") {
@@ -4967,9 +5941,8 @@ async function runEvaluatorList(options) {
4967
5941
  attempt,
4968
5942
  promptInputs,
4969
5943
  now,
4970
- candidateTrace,
4971
- candidateTraceRef,
4972
- candidateTraceSummary
5944
+ outputMessages,
5945
+ traceSummary
4973
5946
  });
4974
5947
  const weight = evaluator.weight ?? 1;
4975
5948
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -5111,22 +6084,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
5111
6084
  async function dumpPrompt(directory, evalCase, promptInputs) {
5112
6085
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
5113
6086
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
5114
- const filePath = path12.resolve(directory, filename);
5115
- await mkdir2(path12.dirname(filePath), { recursive: true });
6087
+ const filePath = path13.resolve(directory, filename);
6088
+ await mkdir3(path13.dirname(filePath), { recursive: true });
5116
6089
  const payload = {
5117
6090
  eval_id: evalCase.id,
5118
6091
  question: promptInputs.question,
5119
6092
  guidelines: promptInputs.guidelines,
5120
6093
  guideline_paths: evalCase.guideline_paths
5121
6094
  };
5122
- await writeFile2(filePath, JSON.stringify(payload, null, 2), "utf8");
6095
+ await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
5123
6096
  }
5124
6097
  function sanitizeFilename(value) {
5125
6098
  if (!value) {
5126
6099
  return "prompt";
5127
6100
  }
5128
6101
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
5129
- return sanitized.length > 0 ? sanitized : randomUUID2();
6102
+ return sanitized.length > 0 ? sanitized : randomUUID3();
5130
6103
  }
5131
6104
  async function invokeProvider(provider, options) {
5132
6105
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -5183,17 +6156,17 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
5183
6156
  }
5184
6157
  return {
5185
6158
  timestamp: timestamp.toISOString(),
5186
- eval_id: evalCase.id,
6159
+ evalId: evalCase.id,
5187
6160
  dataset: evalCase.dataset,
5188
- conversation_id: evalCase.conversation_id,
6161
+ conversationId: evalCase.conversation_id,
5189
6162
  score: 0,
5190
6163
  hits: [],
5191
6164
  misses: [`Error: ${message}`],
5192
- candidate_answer: `Error occurred: ${message}`,
6165
+ candidateAnswer: `Error occurred: ${message}`,
5193
6166
  target: targetName,
5194
- raw_aspects: [],
5195
- agent_provider_request: agentProviderRequest,
5196
- lm_provider_request: lmProviderRequest,
6167
+ rawAspects: [],
6168
+ agentProviderRequest,
6169
+ lmProviderRequest,
5197
6170
  error: message
5198
6171
  };
5199
6172
  }
@@ -5238,8 +6211,8 @@ function mapChildResults(children) {
5238
6211
  hits: child.hits,
5239
6212
  misses: child.misses,
5240
6213
  reasoning: child.reasoning,
5241
- evaluator_provider_request: child.evaluatorRawRequest,
5242
- evaluator_results: mapChildResults(child.evaluatorResults)
6214
+ evaluatorProviderRequest: child.evaluatorRawRequest,
6215
+ evaluatorResults: mapChildResults(child.evaluatorResults)
5243
6216
  }));
5244
6217
  }
5245
6218
  function computeWeightedMean(entries) {
@@ -5340,17 +6313,21 @@ function createAgentKernel() {
5340
6313
  export {
5341
6314
  CodeEvaluator,
5342
6315
  CompositeEvaluator,
6316
+ DEFAULT_EXPLORATION_TOOLS,
5343
6317
  LlmJudgeEvaluator,
5344
6318
  TEST_MESSAGE_ROLES,
5345
6319
  ToolTrajectoryEvaluator,
6320
+ avgToolDurationMs,
5346
6321
  buildDirectoryChain,
5347
6322
  buildPromptInputs,
5348
6323
  buildSearchRoots,
5349
6324
  computeTraceSummary,
5350
6325
  consumeCodexLogEntries,
6326
+ consumePiLogEntries,
5351
6327
  createAgentKernel,
5352
6328
  createProvider,
5353
6329
  ensureVSCodeSubagents,
6330
+ explorationRatio,
5354
6331
  extractCodeBlocks,
5355
6332
  fileExists,
5356
6333
  findGitRoot,
@@ -5362,10 +6339,9 @@ export {
5362
6339
  isJsonValue,
5363
6340
  isTestMessage,
5364
6341
  isTestMessageRole,
5365
- isTraceEvent,
5366
- isTraceEventType,
5367
6342
  listTargetNames,
5368
6343
  loadEvalCases,
6344
+ mergeExecutionMetrics,
5369
6345
  normalizeLineEndings,
5370
6346
  readJsonFile,
5371
6347
  readTargetDefinitions,
@@ -5376,6 +6352,8 @@ export {
5376
6352
  resolveTargetDefinition,
5377
6353
  runEvalCase,
5378
6354
  runEvaluation,
5379
- subscribeToCodexLogEntries
6355
+ subscribeToCodexLogEntries,
6356
+ subscribeToPiLogEntries,
6357
+ tokensPerTool
5380
6358
  };
5381
6359
  //# sourceMappingURL=index.js.map