npm - @agentv/core - Versions diffs - 1.3.1 → 1.5.0 - Mend

@agentv/core 1.3.1 → 1.5.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (15) hide show

package/README.md +77 -77
package/dist/{chunk-4A6L2F6L.js → chunk-E2VSU4WZ.js} +282 -81
package/dist/chunk-E2VSU4WZ.js.map +1 -0
package/dist/evaluation/validation/index.cjs +82 -67
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +3 -68
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +1668 -489
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +279 -77
package/dist/index.d.ts +279 -77
package/dist/index.js +1334 -356
package/dist/index.js.map +1 -1
package/package.json +2 -5
package/dist/chunk-4A6L2F6L.js.map +0 -1

package/dist/index.js CHANGED Viewed

@@ -1,6 +1,7 @@
 import {
   buildDirectoryChain,
   buildSearchRoots,
+  extractLastAssistantContent,
   fileExists,
   findGitRoot,
   isAgentProvider,
@@ -9,7 +10,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-4A6L2F6L.js";
+} from "./chunk-E2VSU4WZ.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -74,33 +75,69 @@ function getHitCount(result) {
 }
 // src/evaluation/trace.ts
-function isTraceEventType(value) {
-  return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
-}
-function isTraceEvent(value) {
-  if (typeof value !== "object" || value === null) {
-    return false;
-  }
-  const candidate = value;
-  return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
-}
-function computeTraceSummary(trace) {
+function computeTraceSummary(messages) {
   const toolCallCounts = {};
-  let errorCount = 0;
-  for (const event of trace) {
-    if (event.type === "tool_call" && event.name) {
-      toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
-    }
-    if (event.type === "error") {
-      errorCount++;
+  let totalToolCalls = 0;
+  for (const message of messages) {
+    if (!message.toolCalls) continue;
+    for (const toolCall of message.toolCalls) {
+      toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
+      totalToolCalls++;
     }
   }
   const toolNames = Object.keys(toolCallCounts).sort();
   return {
-    eventCount: trace.length,
+    eventCount: totalToolCalls,
     toolNames,
     toolCallsByName: toolCallCounts,
-    errorCount
+    errorCount: 0
+  };
+}
+var DEFAULT_EXPLORATION_TOOLS = [
+  "read",
+  "grep",
+  "glob",
+  "search",
+  "list",
+  "Read",
+  "Grep",
+  "Glob",
+  "WebSearch",
+  "WebFetch"
+];
+function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
+  if (summary.eventCount === 0) return void 0;
+  const explorationCalls = explorationTools.reduce(
+    (sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0),
+    0
+  );
+  return explorationCalls / summary.eventCount;
+}
+function tokensPerTool(summary) {
+  if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
+  const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
+  return totalTokens / summary.eventCount;
+}
+function avgToolDurationMs(summary) {
+  if (!summary.toolDurations) return void 0;
+  let totalDuration = 0;
+  let totalCalls = 0;
+  for (const durations of Object.values(summary.toolDurations)) {
+    for (const duration of durations) {
+      totalDuration += duration;
+      totalCalls++;
+    }
+  }
+  if (totalCalls === 0) return void 0;
+  return totalDuration / totalCalls;
+}
+function mergeExecutionMetrics(summary, metrics) {
+  if (!metrics) return summary;
+  return {
+    ...summary,
+    tokenUsage: metrics.tokenUsage,
+    costUsd: metrics.costUsd,
+    durationMs: metrics.durationMs
   };
 }
@@ -376,7 +413,8 @@ var TEMPLATE_VARIABLES = {
   QUESTION: "question",
   EXPECTED_OUTCOME: "expected_outcome",
   REFERENCE_ANSWER: "reference_answer",
-  INPUT_MESSAGES: "input_messages"
+  INPUT_MESSAGES: "input_messages",
+  OUTPUT_MESSAGES: "output_messages"
 };
 var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
 var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
@@ -616,7 +654,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         expected = [];
         for (const item of rawExpected) {
           if (isJsonObject2(item) && typeof item.tool === "string") {
-            expected.push({ tool: item.tool });
+            let args;
+            if (item.args === "any") {
+              args = "any";
+            } else if (isJsonObject2(item.args)) {
+              args = item.args;
+            }
+            expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
           }
         }
       }
@@ -1259,16 +1303,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
     }) : [];
     const codeSnippets = extractCodeBlocks(inputSegments);
     let referenceAnswer = "";
-    if (outputSegments.length > 1) {
-      referenceAnswer = JSON.stringify(outputSegments, null, 2);
-    } else if (outputSegments.length === 1) {
-      const singleMessage = outputSegments[0];
-      if (typeof singleMessage.content === "string") {
-        referenceAnswer = singleMessage.content;
-      } else if (singleMessage.content) {
-        referenceAnswer = JSON.stringify(singleMessage, null, 2);
-      } else if (singleMessage.tool_calls) {
-        referenceAnswer = JSON.stringify(singleMessage, null, 2);
+    if (outputSegments.length > 0) {
+      const lastMessage = outputSegments[outputSegments.length - 1];
+      const content = lastMessage.content;
+      const toolCalls = lastMessage.tool_calls;
+      if (typeof content === "string") {
+        referenceAnswer = content;
+      } else if (content !== void 0 && content !== null) {
+        referenceAnswer = JSON.stringify(content, null, 2);
+      } else if (toolCalls !== void 0 && toolCalls !== null) {
+        referenceAnswer = JSON.stringify(toolCalls, null, 2);
       }
     }
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
@@ -1596,11 +1640,11 @@ async function invokeModel(options) {
   return mapResponse(result);
 }
 function mapResponse(result) {
+  const content = result.text ?? "";
   return {
-    text: result.text ?? "",
-    reasoning: result.reasoningText ?? void 0,
     raw: result,
-    usage: toJsonObject(result.totalUsage ?? result.usage)
+    usage: toJsonObject(result.totalUsage ?? result.usage),
+    outputMessages: [{ role: "assistant", content }]
   };
 }
 function toJsonObject(value) {
@@ -1753,6 +1797,7 @@ var CliProvider = class {
   config;
   runCommand;
   verbose;
+  keepTempFiles;
   healthcheckPromise;
   constructor(targetName, config, runner = defaultCommandRunner) {
     this.targetName = targetName;
@@ -1760,6 +1805,7 @@ var CliProvider = class {
     this.config = config;
     this.runCommand = runner;
     this.verbose = config.verbose ?? false;
+    this.keepTempFiles = config.keepTempFiles ?? false;
   }
   async invoke(request) {
     if (request.signal?.aborted) {
@@ -1774,12 +1820,14 @@ var CliProvider = class {
         `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
       );
     }
+    const startTime = Date.now();
     const result = await this.runCommand(renderedCommand, {
       cwd: this.config.cwd,
       env: process.env,
       timeoutMs: this.config.timeoutMs,
       signal: request.signal
     });
+    const measuredDurationMs = Date.now() - startTime;
     if (result.failed || (result.exitCode ?? 0) !== 0) {
       if (request.signal?.aborted) {
         throw new Error("CLI provider request was aborted");
@@ -1797,8 +1845,10 @@ var CliProvider = class {
     const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
     const parsed = this.parseOutputContent(responseContent);
     return {
-      text: parsed.text,
-      trace: parsed.trace,
+      outputMessages: parsed.outputMessages,
+      tokenUsage: parsed.tokenUsage,
+      costUsd: parsed.costUsd,
+      durationMs: parsed.durationMs ?? measuredDurationMs,
       raw: {
         command: renderedCommand,
         stderr: result.stderr,
@@ -1846,12 +1896,14 @@ var CliProvider = class {
         `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
       );
     }
+    const startTime = Date.now();
     const result = await this.runCommand(renderedCommand, {
       cwd: this.config.cwd,
       env: process.env,
       timeoutMs: this.config.timeoutMs,
       signal: controller.signal
     });
+    const measuredDurationMs = Date.now() - startTime;
     if (result.failed || (result.exitCode ?? 0) !== 0) {
       if (controller.signal.aborted) {
         throw new Error("CLI provider request was aborted");
@@ -1873,11 +1925,13 @@ var CliProvider = class {
     if (missingIds.length > 0) {
       throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
     }
+    const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
     const responses = requests.map((request) => {
       const evalCaseId = request.evalCaseId;
       if (!evalCaseId) {
         return {
-          text: "",
+          outputMessages: [],
+          durationMs: perRequestFallbackMs,
           raw: {
             command: renderedCommand,
             stderr: result.stderr,
@@ -1890,7 +1944,8 @@ var CliProvider = class {
       const parsed = recordsById.get(evalCaseId);
       if (!parsed) {
         return {
-          text: "",
+          outputMessages: [],
+          durationMs: perRequestFallbackMs,
           raw: {
             command: renderedCommand,
             stderr: result.stderr,
@@ -1901,9 +1956,10 @@ var CliProvider = class {
         };
       }
       return {
-        text: parsed.text,
-        trace: parsed.trace,
-        traceRef: parsed.traceRef,
+        outputMessages: parsed.outputMessages,
+        tokenUsage: parsed.tokenUsage,
+        costUsd: parsed.costUsd,
+        durationMs: parsed.durationMs ?? perRequestFallbackMs,
         raw: {
           command: renderedCommand,
           stderr: result.stderr,
@@ -1918,28 +1974,111 @@ var CliProvider = class {
   }
   /**
    * Parse output content from CLI.
-   * If the content is valid JSON with a 'text' field, extract text and optional trace.
-   * Otherwise, treat the entire content as plain text.
+   * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
+   * If only 'text' is provided, wrap it in outputMessages.
+   * Otherwise, treat the entire content as plain text wrapped in outputMessages.
+   *
+   * Also extracts optional execution metrics:
+   * - token_usage: { input, output, cached? }
+   * - cost_usd: number
+   * - duration_ms: number
    */
   parseOutputContent(content) {
     try {
       const parsed = JSON.parse(content);
-      if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
+      if (typeof parsed === "object" && parsed !== null) {
         const obj = parsed;
-        const text = typeof obj.text === "string" ? obj.text : String(obj.text);
-        const trace = this.parseTrace(obj.trace);
-        return { text, trace };
+        const tokenUsage = this.parseTokenUsage(obj.token_usage);
+        const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
+        const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
+        const outputMessages = this.parseOutputMessages(obj.output_messages);
+        if (outputMessages && outputMessages.length > 0) {
+          return { outputMessages, tokenUsage, costUsd, durationMs };
+        }
+        if ("text" in obj) {
+          const text = typeof obj.text === "string" ? obj.text : String(obj.text);
+          return {
+            outputMessages: [{ role: "assistant", content: text }],
+            tokenUsage,
+            costUsd,
+            durationMs
+          };
+        }
       }
     } catch {
     }
-    return { text: content };
+    return { outputMessages: [{ role: "assistant", content }] };
+  }
+  /**
+   * Parse token_usage from CLI output.
+   */
+  parseTokenUsage(tokenUsage) {
+    if (typeof tokenUsage !== "object" || tokenUsage === null) {
+      return void 0;
+    }
+    const obj = tokenUsage;
+    if (typeof obj.input !== "number" || typeof obj.output !== "number") {
+      return void 0;
+    }
+    return {
+      input: obj.input,
+      output: obj.output,
+      cached: typeof obj.cached === "number" ? obj.cached : void 0
+    };
+  }
+  /**
+   * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
+   */
+  parseOutputMessages(outputMessages) {
+    if (!Array.isArray(outputMessages)) {
+      return void 0;
+    }
+    const messages = [];
+    for (const msg of outputMessages) {
+      if (typeof msg !== "object" || msg === null) {
+        continue;
+      }
+      const rawMsg = msg;
+      if (typeof rawMsg.role !== "string") {
+        continue;
+      }
+      const message = {
+        role: rawMsg.role,
+        name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
+        content: rawMsg.content,
+        toolCalls: this.parseToolCalls(rawMsg.tool_calls),
+        timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
+        metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
+      };
+      messages.push(message);
+    }
+    return messages.length > 0 ? messages : void 0;
   }
-  parseTrace(trace) {
-    if (!Array.isArray(trace)) {
+  /**
+   * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
+   */
+  parseToolCalls(toolCalls) {
+    if (!Array.isArray(toolCalls)) {
       return void 0;
     }
-    const validEvents = trace.filter(isTraceEvent);
-    return validEvents.length > 0 ? validEvents : void 0;
+    const calls = [];
+    for (const call of toolCalls) {
+      if (typeof call !== "object" || call === null) {
+        continue;
+      }
+      const rawCall = call;
+      if (typeof rawCall.tool !== "string") {
+        continue;
+      }
+      calls.push({
+        tool: rawCall.tool,
+        input: rawCall.input,
+        output: rawCall.output,
+        id: typeof rawCall.id === "string" ? rawCall.id : void 0,
+        timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
+      });
+    }
+    return calls.length > 0 ? calls : void 0;
   }
   parseJsonlBatchOutput(content) {
     const records = /* @__PURE__ */ new Map();
@@ -1963,12 +2102,22 @@ var CliProvider = class {
       if (records.has(id)) {
         throw new Error(`CLI batch output contains duplicate id: ${id}`);
       }
-      const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
-      const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
+      const tokenUsage = this.parseTokenUsage(obj.token_usage);
+      const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
+      const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
+      const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
+      let outputMessages;
+      if (parsedOutputMessages && parsedOutputMessages.length > 0) {
+        outputMessages = parsedOutputMessages;
+      } else {
+        const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
+        outputMessages = text ? [{ role: "assistant", content: text }] : [];
+      }
       records.set(id, {
-        text,
-        trace: this.parseTrace(obj.trace),
-        traceRef
+        outputMessages,
+        tokenUsage,
+        costUsd,
+        durationMs
       });
     }
     return records;
@@ -1981,8 +2130,10 @@ var CliProvider = class {
       const errorMsg = error instanceof Error ? error.message : String(error);
       throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
     } finally {
-      await fs.unlink(filePath).catch(() => {
-      });
+      if (!this.keepTempFiles) {
+        await fs.unlink(filePath).catch(() => {
+        });
+      }
     }
   }
   async ensureHealthy(signal) {
@@ -2282,6 +2433,11 @@ var execAsync2 = promisify2(execCallback);
 var WORKSPACE_PREFIX = "agentv-codex-";
 var PROMPT_FILENAME = "prompt.md";
 var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
+var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
+- Do NOT create any additional output files in the workspace.
+- All intended file outputs/changes MUST be written in your response.
+- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
+This is required for evaluation scoring.`;
 var CodexProvider = class {
   id;
   kind = "codex";
@@ -2306,7 +2462,11 @@ var CodexProvider = class {
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
-      const promptContent = buildPromptDocument(request, inputFiles);
+      const basePrompt = buildPromptDocument(request, inputFiles);
+      const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
+      const promptContent = `${systemPrompt}
+${basePrompt}`;
       const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
       await writeFile(promptFile, promptContent, "utf8");
       const args = this.buildCodexArgs();
@@ -2325,7 +2485,6 @@ var CodexProvider = class {
       const parsed = parseCodexJson(result.stdout);
       const assistantText = extractAssistantText(parsed);
       return {
-        text: assistantText,
         raw: {
           response: parsed,
           stdout: result.stdout,
@@ -2337,7 +2496,8 @@ var CodexProvider = class {
           workspace: workspaceRoot,
           inputFiles,
           logFile: logger?.filePath
-        }
+        },
+        outputMessages: [{ role: "assistant", content: assistantText }]
       };
     } finally {
       await logger?.close();
@@ -2959,7 +3119,6 @@ var MockProvider = class {
   delayMs;
   delayMinMs;
   delayMaxMs;
-  trace;
   constructor(targetName, config) {
     this.id = `mock:${targetName}`;
     this.targetName = targetName;
@@ -2967,7 +3126,6 @@ var MockProvider = class {
     this.delayMs = config.delayMs ?? 0;
     this.delayMinMs = config.delayMinMs ?? 0;
     this.delayMaxMs = config.delayMaxMs ?? 0;
-    this.trace = config.trace;
   }
   async invoke(request) {
     const delay = this.calculateDelay();
@@ -2975,12 +3133,11 @@ var MockProvider = class {
       await new Promise((resolve) => setTimeout(resolve, delay));
     }
     return {
-      text: this.cannedResponse,
+      outputMessages: [{ role: "assistant", content: this.cannedResponse }],
       raw: {
         question: request.question,
         guidelines: request.guidelines
-      },
-      trace: this.trace
+      }
     };
   }
   calculateDelay() {
@@ -2993,163 +3150,842 @@ var MockProvider = class {
   }
 };
-// src/evaluation/providers/vscode.ts
+// src/evaluation/providers/pi-coding-agent.ts
+import { spawn as spawn2 } from "node:child_process";
+import { randomUUID as randomUUID2 } from "node:crypto";
+import { createWriteStream as createWriteStream2 } from "node:fs";
+import { mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
+import { tmpdir as tmpdir2 } from "node:os";
 import path10 from "node:path";
-import {
-  dispatchAgentSession,
-  dispatchBatchAgent,
-  getSubagentRoot,
-  provisionSubagents
-} from "subagent";
-// src/evaluation/providers/vscode-templates.ts
-var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
-{{userQuery}}
-[[ ## system_instructions ## ]]
-**IMPORTANT**: Follow these exact steps:
-1. Create and write your complete response to: {{responseFileTmp}}
-    - Do NOT create any additional output files in the workspace.
-    - All intended file outputs/changes MUST be written in your response file.
-    - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
-2. When completely finished, run these PowerShell commands to signal completion:
-\`\`\`
-Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
-if (Test-Path subagent.lock) { del subagent.lock }
-\`\`\`
-Do not proceed to step 2 until your response is completely written to the temporary file.
-`;
-var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
-{{userQuery}}
-[[ ## system_instructions ## ]]
-**IMPORTANT**: Follow these exact steps:
-1. Create and write your complete response to: {{responseFileTmp}}
-    - Do NOT create any additional output files in the workspace.
-    - All intended file outputs/changes MUST be written in your response file.
-    - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
-2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
-3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
-`;
+// src/evaluation/providers/pi-log-tracker.ts
+var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
+var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
+function getPiLogStore() {
+  const globalObject = globalThis;
+  const existing = globalObject[GLOBAL_LOGS_KEY2];
+  if (existing) {
+    return existing;
+  }
+  const created = [];
+  globalObject[GLOBAL_LOGS_KEY2] = created;
+  return created;
+}
+function getSubscriberStore2() {
+  const globalObject = globalThis;
+  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
+  if (existing) {
+    return existing;
+  }
+  const created = /* @__PURE__ */ new Set();
+  globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
+  return created;
+}
+function notifySubscribers2(entry) {
+  const subscribers = Array.from(getSubscriberStore2());
+  for (const listener of subscribers) {
+    try {
+      listener(entry);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Pi log subscriber failed: ${message}`);
+    }
+  }
+}
+function recordPiLogEntry(entry) {
+  getPiLogStore().push(entry);
+  notifySubscribers2(entry);
+}
+function consumePiLogEntries() {
+  const store = getPiLogStore();
+  if (store.length === 0) {
+    return [];
+  }
+  return store.splice(0, store.length);
+}
+function subscribeToPiLogEntries(listener) {
+  const store = getSubscriberStore2();
+  store.add(listener);
+  return () => {
+    store.delete(listener);
+  };
+}
-// src/evaluation/providers/vscode.ts
-var VSCodeProvider = class {
+// src/evaluation/providers/pi-coding-agent.ts
+var WORKSPACE_PREFIX2 = "agentv-pi-";
+var PROMPT_FILENAME2 = "prompt.md";
+var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
+- Do NOT create any additional output files in the workspace.
+- All intended file outputs/changes MUST be written in your response.
+- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
+This is required for evaluation scoring.`;
+var PiCodingAgentProvider = class {
   id;
-  kind;
+  kind = "pi-coding-agent";
   targetName;
-  supportsBatch = true;
+  supportsBatch = false;
   config;
-  constructor(targetName, config, kind) {
-    this.id = `${kind}:${targetName}`;
-    this.kind = kind;
+  runPi;
+  constructor(targetName, config, runner = defaultPiRunner) {
+    this.id = `pi-coding-agent:${targetName}`;
     this.targetName = targetName;
     this.config = config;
+    this.runPi = runner;
   }
   async invoke(request) {
     if (request.signal?.aborted) {
-      throw new Error("VS Code provider request was aborted before dispatch");
-    }
-    const inputFiles = normalizeAttachments(request.inputFiles);
-    const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
-    const session = await dispatchAgentSession({
-      userQuery: promptContent,
-      extraAttachments: inputFiles,
-      requestTemplate: AGENTV_REQUEST_TEMPLATE,
-      wait: this.config.waitForResponse,
-      dryRun: this.config.dryRun,
-      vscodeCmd: this.config.command,
-      subagentRoot: this.config.subagentRoot,
-      workspaceTemplate: this.config.workspaceTemplate,
-      silent: true
-    });
-    if (session.exitCode !== 0 || !session.responseFile) {
-      const failure = session.error ?? "VS Code subagent did not produce a response";
-      throw new Error(failure);
+      throw new Error("Pi coding agent request was aborted before execution");
     }
-    if (this.config.dryRun) {
+    const inputFiles = normalizeInputFiles2(request.inputFiles);
+    const workspaceRoot = await this.createWorkspace();
+    const logger = await this.createStreamLogger(request).catch(() => void 0);
+    try {
+      const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
+      await writeFile2(promptFile, request.question, "utf8");
+      const args = this.buildPiArgs(request.question, inputFiles);
+      const cwd = this.resolveCwd(workspaceRoot);
+      const result = await this.executePi(args, cwd, request.signal, logger);
+      if (result.timedOut) {
+        throw new Error(
+          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
+        );
+      }
+      if (result.exitCode !== 0) {
+        const detail = pickDetail2(result.stderr, result.stdout);
+        const prefix = `Pi coding agent exited with code ${result.exitCode}`;
+        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
+      }
+      const parsed = parsePiJsonl(result.stdout);
+      const outputMessages = extractOutputMessages(parsed);
+      const assistantText = extractAssistantText2(outputMessages);
       return {
-        text: "",
         raw: {
-          session,
-          inputFiles
-        }
+          response: parsed,
+          stdout: result.stdout,
+          stderr: result.stderr,
+          exitCode: result.exitCode,
+          args,
+          executable: this.config.executable,
+          promptFile,
+          workspace: workspaceRoot,
+          inputFiles,
+          logFile: logger?.filePath
+        },
+        outputMessages
       };
+    } finally {
+      await logger?.close();
+      await this.cleanupWorkspace(workspaceRoot);
     }
-    const responseText = await readTextFile(session.responseFile);
-    return {
-      text: responseText,
-      raw: {
-        session,
-        inputFiles
-      }
-    };
   }
-  async invokeBatch(requests) {
-    if (requests.length === 0) {
-      return [];
+  resolveCwd(workspaceRoot) {
+    if (!this.config.cwd) {
+      return workspaceRoot;
     }
-    const normalizedRequests = requests.map((req) => ({
-      request: req,
-      inputFiles: normalizeAttachments(req.inputFiles)
-    }));
-    const combinedInputFiles = mergeAttachments(
-      normalizedRequests.map(({ inputFiles }) => inputFiles)
-    );
-    const userQueries = normalizedRequests.map(
-      ({ request, inputFiles }) => buildPromptDocument2(request, inputFiles, request.guideline_patterns)
-    );
-    const session = await dispatchBatchAgent({
-      userQueries,
-      extraAttachments: combinedInputFiles,
-      requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
-      wait: this.config.waitForResponse,
-      dryRun: this.config.dryRun,
-      vscodeCmd: this.config.command,
-      subagentRoot: this.config.subagentRoot,
-      workspaceTemplate: this.config.workspaceTemplate,
-      silent: true
-    });
-    if (session.exitCode !== 0 || !session.responseFiles) {
-      const failure = session.error ?? "VS Code subagent did not produce batch responses";
-      throw new Error(failure);
+    return path10.resolve(this.config.cwd);
+  }
+  buildPiArgs(prompt, inputFiles) {
+    const args = [];
+    if (this.config.provider) {
+      args.push("--provider", this.config.provider);
     }
-    if (this.config.dryRun) {
-      return normalizedRequests.map(({ inputFiles }) => ({
-        text: "",
-        raw: {
-          session,
-          inputFiles,
-          allInputFiles: combinedInputFiles
-        }
-      }));
+    if (this.config.model) {
+      args.push("--model", this.config.model);
     }
-    if (session.responseFiles.length !== requests.length) {
-      throw new Error(
-        `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
-      );
+    if (this.config.apiKey) {
+      args.push("--api-key", this.config.apiKey);
     }
-    const responses = [];
-    for (const [index, responseFile] of session.responseFiles.entries()) {
-      const responseText = await readTextFile(responseFile);
-      responses.push({
-        text: responseText,
-        raw: {
-          session,
-          inputFiles: normalizedRequests[index]?.inputFiles,
-          allInputFiles: combinedInputFiles,
-          responseFile
-        }
-      });
+    args.push("--mode", "json");
+    args.push("--print");
+    args.push("--no-session");
+    if (this.config.tools) {
+      args.push("--tools", this.config.tools);
     }
-    return responses;
-  }
-};
-function buildPromptDocument2(request, attachments, guidelinePatterns) {
+    if (this.config.thinking) {
+      args.push("--thinking", this.config.thinking);
+    }
+    if (this.config.args && this.config.args.length > 0) {
+      args.push(...this.config.args);
+    }
+    if (inputFiles && inputFiles.length > 0) {
+      for (const file of inputFiles) {
+        args.push(`@${file}`);
+      }
+    }
+    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
+    const fullPrompt = `${systemPrompt}
+${prompt}`;
+    const escapedPrompt = escapeAtSymbols(fullPrompt);
+    args.push(escapedPrompt);
+    return args;
+  }
+  async executePi(args, cwd, signal, logger) {
+    try {
+      return await this.runPi({
+        executable: this.config.executable,
+        args,
+        cwd,
+        timeoutMs: this.config.timeoutMs,
+        env: this.buildEnv(),
+        signal,
+        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
+        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
+      });
+    } catch (error) {
+      const err = error;
+      if (err.code === "ENOENT") {
+        throw new Error(
+          `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
+        );
+      }
+      throw error;
+    }
+  }
+  /**
+   * Build the environment for the child process: a shallow copy of `process.env` plus,
+   * when `config.apiKey` is set, that key stored under the variable for the configured
+   * provider. The provider name is lower-cased and defaults to "google"; a provider that
+   * is not in the table leaves the environment untouched.
+   */
+  buildEnv() {
+    const childEnv = { ...process.env };
+    if (!this.config.apiKey) {
+      return childEnv;
+    }
+    const keyVariableByProvider = /* @__PURE__ */ new Map([
+      ["google", "GEMINI_API_KEY"],
+      ["gemini", "GEMINI_API_KEY"],
+      ["anthropic", "ANTHROPIC_API_KEY"],
+      ["openai", "OPENAI_API_KEY"],
+      ["groq", "GROQ_API_KEY"],
+      ["xai", "XAI_API_KEY"],
+      ["openrouter", "OPENROUTER_API_KEY"]
+    ]);
+    const providerName = this.config.provider?.toLowerCase() ?? "google";
+    const variableName = keyVariableByProvider.get(providerName);
+    if (variableName !== void 0) {
+      childEnv[variableName] = this.config.apiKey;
+    }
+    return childEnv;
+  }
+  /**
+   * Create a fresh temporary directory whose name starts with WORKSPACE_PREFIX2, inside
+   * the directory reported by `tmpdir2()`, and resolve with its path.
+   */
+  async createWorkspace() {
+    const prefixPath = path10.join(tmpdir2(), WORKSPACE_PREFIX2);
+    const workspaceRoot = await mkdtemp2(prefixPath);
+    return workspaceRoot;
+  }
+  /**
+   * Remove a workspace directory recursively. Cleanup is best-effort: any failure
+   * is deliberately ignored so it cannot mask the result of the run itself.
+   */
+  async cleanupWorkspace(workspaceRoot) {
+    const removalOptions = { recursive: true, force: true };
+    try {
+      await rm2(workspaceRoot, removalOptions);
+    } catch {
+      // Intentionally ignored (see above).
+    }
+  }
+  /**
+   * Resolve the directory for stream logs: the configured `logDir` made absolute, or
+   * `<cwd>/.agentv/logs/pi-coding-agent` when none is configured.
+   */
+  resolveLogDirectory() {
+    const { logDir } = this.config;
+    return logDir ? path10.resolve(logDir) : path10.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
+  }
+  /**
+   * Create a stream logger for one request, or resolve with `undefined` when logging
+   * cannot be set up. Failures are reported through `console.warn` and never thrown:
+   * logging is optional and must not fail the run.
+   *
+   * @param request - Request whose `evalCaseId` / `attempt` label the log file.
+   * @returns A PiStreamLogger, or `undefined` if the directory or file could not be prepared.
+   */
+  async createStreamLogger(request) {
+    const describe = (failure) => failure instanceof Error ? failure.message : String(failure);
+    const logDir = this.resolveLogDirectory();
+    if (!logDir) {
+      return void 0;
+    }
+    try {
+      await mkdir2(logDir, { recursive: true });
+    } catch (mkdirFailure) {
+      console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${describe(mkdirFailure)}`);
+      return void 0;
+    }
+    const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
+    try {
+      const targetName = this.targetName;
+      const { evalCaseId, attempt } = request;
+      const streamLogger = await PiStreamLogger.create({
+        filePath,
+        targetName,
+        evalCaseId,
+        attempt,
+        format: this.config.logFormat ?? "summary"
+      });
+      // Register the file so it can be reported alongside the run.
+      recordPiLogEntry({ filePath, targetName, evalCaseId, attempt });
+      return streamLogger;
+    } catch (loggerFailure) {
+      console.warn(`Skipping Pi stream logging for ${filePath}: ${describe(loggerFailure)}`);
+      return void 0;
+    }
+  }
+};
+/**
+ * Appends a line-oriented log of a Pi process's stdout/stderr to a file.
+ *
+ * Chunks are buffered per source until a newline arrives; each complete, non-blank line is
+ * written as `[+elapsed] [source] message`. With format "json" the message is the
+ * pretty-printed JSON of the line; with any other format it is a short summary.
+ */
+var PiStreamLogger = class _PiStreamLogger {
+  filePath;
+  stream;
+  startedAt = Date.now();
+  stdoutBuffer = "";
+  stderrBuffer = "";
+  format;
+  /**
+   * @param filePath - Log file to append to (opened immediately).
+   * @param format - "json" for pretty-printed events, anything else for summaries.
+   */
+  constructor(filePath, format) {
+    this.filePath = filePath;
+    this.format = format;
+    this.stream = createWriteStream2(filePath, { flags: "a" });
+  }
+  /**
+   * Open a logger and write its header (title, target, optional eval id and 1-based
+   * attempt, start time) followed by a blank separator line.
+   *
+   * @param options - `{ filePath, targetName, evalCaseId?, attempt?, format }`.
+   * @returns The ready-to-use logger.
+   */
+  static async create(options) {
+    const logger = new _PiStreamLogger(options.filePath, options.format);
+    const header = [
+      "# Pi Coding Agent stream log",
+      `# target: ${options.targetName}`,
+      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
+      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
+      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
+      ""
+      // Drop only the omitted optional entries; a truthiness filter would also drop
+      // the trailing "" and lose the blank separator line.
+    ].filter((line) => line !== void 0);
+    logger.writeLines(header);
+    return logger;
+  }
+  /** Buffer a stdout chunk and log every complete line it finishes. */
+  handleStdoutChunk(chunk) {
+    this.stdoutBuffer += chunk;
+    this.flushBuffer("stdout");
+  }
+  /** Buffer a stderr chunk and log every complete line it finishes. */
+  handleStderrChunk(chunk) {
+    this.stderrBuffer += chunk;
+    this.flushBuffer("stderr");
+  }
+  /**
+   * Flush everything still buffered (including a final unterminated line) and end the
+   * stream. Rejects if the stream emits an error while closing.
+   */
+  async close() {
+    this.flushBuffer("stdout");
+    this.flushBuffer("stderr");
+    this.flushRemainder();
+    await new Promise((resolve, reject) => {
+      this.stream.once("error", reject);
+      this.stream.end(() => resolve());
+    });
+  }
+  /** Write each given line verbatim, terminated by a newline. */
+  writeLines(lines) {
+    for (const line of lines) {
+      this.stream.write(`${line}\n`);
+    }
+  }
+  /** Log all complete lines in the buffer for `source`; keep the unterminated tail buffered. */
+  flushBuffer(source) {
+    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
+    const lines = buffer.split(/\r?\n/);
+    // The last element is whatever follows the final newline, i.e. an incomplete line.
+    const remainder = lines.pop() ?? "";
+    if (source === "stdout") {
+      this.stdoutBuffer = remainder;
+    } else {
+      this.stderrBuffer = remainder;
+    }
+    for (const line of lines) {
+      this.writeFormatted(line, source);
+    }
+  }
+  /**
+   * Turn one raw line into a log entry.
+   *
+   * @returns The formatted entry, or `undefined` when the line is blank.
+   */
+  formatLine(rawLine, source) {
+    const trimmed = rawLine.trim();
+    if (trimmed.length === 0) {
+      return void 0;
+    }
+    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
+    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
+  }
+  /** Log the unterminated tail of both buffers (stdout first) and clear them. */
+  flushRemainder() {
+    this.writeFormatted(this.stdoutBuffer, "stdout");
+    this.writeFormatted(this.stderrBuffer, "stderr");
+    this.stdoutBuffer = "";
+    this.stderrBuffer = "";
+  }
+  /** Format one raw line and append it to the file; blank lines are skipped. */
+  writeFormatted(rawLine, source) {
+    const formatted = this.formatLine(rawLine, source);
+    if (formatted) {
+      this.stream.write(formatted);
+      this.stream.write("\n");
+    }
+  }
+};
+/**
+ * Build a unique log file name of the form
+ * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
+ * The timestamp is the current ISO time with ":" and "." replaced by "-"; the eval id
+ * falls back to "pi"; the attempt suffix is 1-based and omitted when `attempt` is undefined.
+ */
+function buildLogFilename2(request, targetName) {
+  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+  const evalPart = sanitizeForFilename2(request.evalCaseId ?? "pi");
+  const attemptPart = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
+  const targetPart = sanitizeForFilename2(targetName);
+  const uniquePart = randomUUID2().slice(0, 8);
+  return `${stamp}_${targetPart}_${evalPart}${attemptPart}_${uniquePart}.log`;
+}
+/**
+ * Make a string safe for use in a file name: every run of characters outside
+ * `A-Z a-z 0-9 . _ -` becomes a single "_". An empty result falls back to "pi".
+ */
+function sanitizeForFilename2(value) {
+  return value.replace(/[^A-Za-z0-9._-]+/g, "_") || "pi";
+}
+/**
+ * Format the whole seconds elapsed since `startedAt` (epoch milliseconds) as `MM:SS`,
+ * or as `HH:MM:SS` once at least one hour has passed. Each field is zero-padded to 2 digits.
+ */
+function formatElapsed2(startedAt) {
+  const twoDigits = (n) => n.toString().padStart(2, "0");
+  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
+  const hours = Math.floor(totalSeconds / 3600);
+  const minutes = Math.floor(totalSeconds % 3600 / 60);
+  const seconds = totalSeconds % 60;
+  const fields = hours > 0 ? [hours, minutes, seconds] : [minutes, seconds];
+  return fields.map(twoDigits).join(":");
+}
+/**
+ * Produce the summary-mode message for one output line: the event summary when the line
+ * is JSON that `summarizePiEvent` can summarize, otherwise the raw line (prefixed with
+ * "stderr: " for stderr output).
+ */
+function formatPiLogMessage(rawLine, source) {
+  const event = tryParseJsonValue2(rawLine);
+  const summary = event ? summarizePiEvent(event) : void 0;
+  if (summary) {
+    return summary;
+  }
+  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
+}
+/**
+ * Produce the json-mode message for one output line: the line re-serialized as JSON with
+ * 2-space indentation, or the raw line when it does not parse to a truthy value or
+ * cannot be serialized.
+ */
+function formatPiJsonLog(rawLine) {
+  const value = tryParseJsonValue2(rawLine);
+  if (value) {
+    try {
+      return JSON.stringify(value, null, 2);
+    } catch {
+      return rawLine;
+    }
+  }
+  return rawLine;
+}
+/**
+ * Summarize one parsed Pi event as a short log string.
+ *
+ * - `message_start` / `message_end` -> `"<type>: <message.role>"`
+ * - `message_update` with a string `text_delta` -> `"text_delta: <first 50 chars>[...]"`
+ * - other `message_update` -> `"message_update: <assistantMessageEvent.type>"`
+ * - any other typed event -> the type itself
+ *
+ * @returns The summary, or `undefined` when the value is not an object with a
+ *   non-empty string `type`.
+ */
+function summarizePiEvent(event) {
+  if (!event || typeof event !== "object") {
+    return void 0;
+  }
+  const kind = typeof event.type === "string" ? event.type : void 0;
+  if (!kind) {
+    return void 0;
+  }
+  if (kind === "message_start" || kind === "message_end") {
+    return `${kind}: ${event.message?.role}`;
+  }
+  if (kind === "message_update") {
+    const update = event.assistantMessageEvent;
+    const updateKind = update?.type;
+    if (updateKind === "text_delta" && typeof update.delta === "string") {
+      const text = update.delta;
+      // Keep the log compact: show at most the first 50 characters of a delta.
+      return `text_delta: ${text.length > 50 ? `${text.slice(0, 50)}...` : text}`;
+    }
+    return `message_update: ${updateKind}`;
+  }
+  return kind;
+}
+/** Parse a string as JSON, returning `undefined` instead of throwing when it is not valid JSON. */
+function tryParseJsonValue2(rawLine) {
+  let value;
+  try {
+    value = JSON.parse(rawLine);
+  } catch {
+    value = void 0;
+  }
+  return value;
+}
+/**
+ * Parse JSON-lines output into an array of values. Blank lines and lines that are not
+ * valid JSON are skipped, so non-JSON output mixed into the stream does not abort parsing.
+ *
+ * @param output - Text holding one JSON document per line.
+ * @returns The parsed values, in line order.
+ * @throws Error when the output is blank or contains no parseable line.
+ */
+function parsePiJsonl(output) {
+  const text = output.trim();
+  if (text.length === 0) {
+    throw new Error("Pi coding agent produced no output");
+  }
+  const events = [];
+  for (const rawLine of text.split(/\r?\n/)) {
+    const candidate = rawLine.trim();
+    if (candidate.length === 0) {
+      continue;
+    }
+    try {
+      events.push(JSON.parse(candidate));
+    } catch {
+      // Not JSON: skip this line (see above).
+    }
+  }
+  if (events.length === 0) {
+    throw new Error("Pi coding agent produced no valid JSON output");
+  }
+  return events;
+}
+/**
+ * Derive the output messages from a parsed event stream.
+ *
+ * The last `agent_end` event that carries a `messages` array wins: its messages are
+ * converted and returned. Without such an event, the `message` of every `turn_end`
+ * event is converted instead, in stream order. Messages that cannot be converted are dropped.
+ */
+function extractOutputMessages(events) {
+  const isObjectLike = (value) => Boolean(value) && typeof value === "object";
+  let index = events.length;
+  while (index-- > 0) {
+    const candidate = events[index];
+    if (isObjectLike(candidate) && candidate.type === "agent_end" && Array.isArray(candidate.messages)) {
+      return candidate.messages.map(convertPiMessage).filter((m) => m !== void 0);
+    }
+  }
+  return events.filter((entry) => isObjectLike(entry) && entry.type === "turn_end").map((entry) => convertPiMessage(entry.message)).filter((converted) => Boolean(converted));
+}
+/**
+ * Convert one raw Pi message into the provider-neutral output-message shape.
+ *
+ * @param message - Raw message; anything that is not an object with a string `role` is rejected.
+ * @returns `{ role, content, toolCalls, timestamp, metadata }`, where `toolCalls` and
+ *   `metadata` are `undefined` when empty, or `undefined` for an unusable message.
+ */
+function convertPiMessage(message) {
+  if (!message || typeof message !== "object") {
+    return void 0;
+  }
+  const msg = message;
+  const role = msg.role;
+  if (typeof role !== "string") {
+    return void 0;
+  }
+  const content = extractTextContent(msg.content);
+  const toolCalls = extractToolCalls(msg.content);
+  let timestamp;
+  if (typeof msg.timestamp === "number") {
+    const date = new Date(msg.timestamp);
+    // toISOString() throws RangeError on an invalid date (e.g. a number outside the
+    // representable range); drop such a timestamp instead of failing the whole conversion.
+    timestamp = Number.isNaN(date.getTime()) ? void 0 : date.toISOString();
+  } else if (typeof msg.timestamp === "string") {
+    timestamp = msg.timestamp;
+  }
+  const metadata = {};
+  // Copy only the optional fields the message actually carries.
+  if (msg.api) metadata.api = msg.api;
+  if (msg.provider) metadata.provider = msg.provider;
+  if (msg.model) metadata.model = msg.model;
+  if (msg.usage) metadata.usage = msg.usage;
+  if (msg.stopReason) metadata.stopReason = msg.stopReason;
+  return {
+    role,
+    content,
+    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
+    timestamp,
+    metadata: Object.keys(metadata).length > 0 ? metadata : void 0
+  };
+}
+/**
+ * Extract the plain text of a message's content.
+ *
+ * @param content - Either a string or an array of content parts.
+ * @returns The string itself; or the `text` of all `{ type: "text" }` parts joined with
+ *   newlines; or `undefined` when there is no text or the content has another shape.
+ */
+function extractTextContent(content) {
+  if (typeof content === "string") {
+    return content;
+  }
+  if (!Array.isArray(content)) {
+    return void 0;
+  }
+  const isTextPart = (part) => Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string";
+  const texts = content.filter(isTextPart).map((part) => part.text);
+  return texts.length > 0 ? texts.join("\n") : void 0;
+}
+/**
+ * Collect the tool calls described by a message's content parts.
+ *
+ * Each `tool_use` part with a string `name` yields `{ tool, input, id }`. A later
+ * `tool_result` part whose `tool_use_id` matches an already collected call adds its
+ * `content` to that call as `output`; unmatched results are ignored.
+ *
+ * @returns The collected calls in order of appearance; `[]` when content is not an array.
+ */
+function extractToolCalls(content) {
+  if (!Array.isArray(content)) {
+    return [];
+  }
+  const calls = [];
+  for (const part of content) {
+    if (!part || typeof part !== "object") {
+      continue;
+    }
+    if (part.type === "tool_use" && typeof part.name === "string") {
+      const id = typeof part.id === "string" ? part.id : void 0;
+      calls.push({ tool: part.name, input: part.input, id });
+    }
+    if (part.type === "tool_result" && typeof part.tool_use_id === "string") {
+      const position = calls.findIndex((call) => call.id === part.tool_use_id);
+      if (position !== -1) {
+        calls[position] = { ...calls[position], output: part.content };
+      }
+    }
+  }
+  return calls;
+}
+/**
+ * Return the content of the last assistant message that has any content: strings are
+ * returned as-is, other values are JSON-serialized. Returns "" when there is none.
+ */
+function extractAssistantText2(messages) {
+  let index = messages.length;
+  while (index-- > 0) {
+    const { role, content } = messages[index];
+    if (role !== "assistant" || !content) {
+      continue;
+    }
+    return typeof content === "string" ? content : JSON.stringify(content);
+  }
+  return "";
+}
+/**
+ * Rewrite every `@[label]:` marker in the prompt as `[[label]]:` so the marker no
+ * longer begins with "@".
+ */
+function escapeAtSymbols(prompt) {
+  const atMarker = /@\[([^\]]+)\]:/g;
+  return prompt.replace(atMarker, (_whole, label) => `[[${label}]]:`);
+}
+/**
+ * Pick the text to report for a failed run: trimmed stderr when it is non-empty,
+ * otherwise trimmed stdout, otherwise `undefined`.
+ */
+function pickDetail2(stderr, stdout) {
+  return stderr.trim() || stdout.trim() || void 0;
+}
+/**
+ * Build the " after Ns" suffix for timeout messages, rounding the timeout up to whole
+ * seconds. Returns "" when no positive timeout is configured.
+ */
+function formatTimeoutSuffix3(timeoutMs) {
+  const hasTimeout = Boolean(timeoutMs) && !(timeoutMs <= 0);
+  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
+}
+/**
+ * Default runner: spawn the Pi executable without a shell and collect its output.
+ *
+ * @param options - `{ executable, args, cwd, env, timeoutMs?, signal?, onStdoutChunk?, onStderrChunk? }`.
+ * @returns Resolves with `{ stdout, stderr, exitCode, timedOut }` once the child's
+ *   `close` event fires; `exitCode` is -1 when the child reported no numeric code.
+ *   Rejects only when the child emits `error` (for example, the executable cannot be spawned).
+ */
+async function defaultPiRunner(options) {
+  return await new Promise((resolve, reject) => {
+    // `executable` may carry leading arguments ("npx pi"): the first whitespace-separated
+    // token is the program, the rest is prepended to the argument list.
+    const parts = options.executable.split(/\s+/);
+    const executable = parts[0];
+    const executableArgs = parts.slice(1);
+    const allArgs = [...executableArgs, ...options.args];
+    const child = spawn2(executable, allArgs, {
+      cwd: options.cwd,
+      env: options.env,
+      stdio: ["pipe", "pipe", "pipe"],
+      shell: false
+    });
+    let stdout = "";
+    let stderr = "";
+    let timedOut = false;
+    const onAbort = () => {
+      child.kill("SIGTERM");
+    };
+    if (options.signal) {
+      // An already-aborted signal never fires "abort" again, so terminate right away.
+      if (options.signal.aborted) {
+        onAbort();
+      } else {
+        options.signal.addEventListener("abort", onAbort, { once: true });
+      }
+    }
+    let timeoutHandle;
+    if (options.timeoutMs && options.timeoutMs > 0) {
+      timeoutHandle = setTimeout(() => {
+        // Only flag the timeout here; the promise still settles from the "close" handler.
+        timedOut = true;
+        child.kill("SIGTERM");
+      }, options.timeoutMs);
+      // Where supported, keep the pending timer from holding the process open.
+      timeoutHandle.unref?.();
+    }
+    child.stdout.setEncoding("utf8");
+    child.stdout.on("data", (chunk) => {
+      stdout += chunk;
+      options.onStdoutChunk?.(chunk);
+    });
+    child.stderr.setEncoding("utf8");
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk;
+      options.onStderrChunk?.(chunk);
+    });
+    // Nothing is sent on stdin; close it immediately.
+    child.stdin.end();
+    // Release the timer and the abort listener once the child is done.
+    const cleanup = () => {
+      if (timeoutHandle) {
+        clearTimeout(timeoutHandle);
+      }
+      if (options.signal) {
+        options.signal.removeEventListener("abort", onAbort);
+      }
+    };
+    child.on("error", (error) => {
+      cleanup();
+      reject(error);
+    });
+    child.on("close", (code) => {
+      cleanup();
+      resolve({
+        stdout,
+        stderr,
+        exitCode: typeof code === "number" ? code : -1,
+        timedOut
+      });
+    });
+  });
+}
+// src/evaluation/providers/vscode.ts
+import path11 from "node:path";
+import {
+  dispatchAgentSession,
+  dispatchBatchAgent,
+  getSubagentRoot,
+  provisionSubagents
+} from "subagent";
+// src/evaluation/providers/vscode-templates.ts
+// Request template for a single dispatch (passed as `requestTemplate` to
+// dispatchAgentSession). Placeholders: {{userQuery}}, {{responseFileTmp}},
+// {{responseFileFinal}}. The agent writes to the temporary file, then renames it and
+// removes subagent.lock to signal completion.
+var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
+{{userQuery}}
+[[ ## system_instructions ## ]]
+**IMPORTANT**: Follow these exact steps:
+1. Create and write your complete response to: {{responseFileTmp}}
+    - Do NOT create any additional output files in the workspace.
+    - All intended file outputs/changes MUST be written in your response file.
+    - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
+2. When completely finished, run these PowerShell commands to signal completion:
+\`\`\`
+Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
+if (Test-Path subagent.lock) { del subagent.lock }
+\`\`\`
+Do not proceed to step 2 until your response is completely written to the temporary file.
+`;
+// Request template for batch dispatch (passed as `requestTemplate` to
+// dispatchBatchAgent). Same placeholders as the single-request template, but the agent
+// only renames its response file; it is told not to unlock the workspace itself.
+var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
+{{userQuery}}
+[[ ## system_instructions ## ]]
+**IMPORTANT**: Follow these exact steps:
+1. Create and write your complete response to: {{responseFileTmp}}
+    - Do NOT create any additional output files in the workspace.
+    - All intended file outputs/changes MUST be written in your response file.
+    - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
+2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
+3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
+`;
+// src/evaluation/providers/vscode.ts
+/**
+ * Provider that answers eval requests by dispatching them to a VS Code subagent and
+ * reading the response file(s) the agent produces.
+ */
+var VSCodeProvider = class {
+  id;
+  kind;
+  targetName;
+  supportsBatch = true;
+  config;
+  /**
+   * @param targetName - Name of the configured target.
+   * @param config - Provider settings (waitForResponse, dryRun, command, subagentRoot, workspaceTemplate).
+   * @param kind - Provider kind; combined with the target name to form `id`.
+   */
+  constructor(targetName, config, kind) {
+    this.id = `${kind}:${targetName}`;
+    this.kind = kind;
+    this.targetName = targetName;
+    this.config = config;
+  }
+  /**
+   * Dispatch a single request and return the agent's response as one assistant message.
+   * In dry-run mode no response file is read and `outputMessages` is empty.
+   *
+   * @throws Error when the request was aborted before dispatch, or when the session
+   *   exits non-zero or yields no response file.
+   */
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("VS Code provider request was aborted before dispatch");
+    }
+    const inputFiles = normalizeAttachments(request.inputFiles);
+    const userQuery = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
+    const session = await dispatchAgentSession({
+      userQuery,
+      extraAttachments: inputFiles,
+      requestTemplate: AGENTV_REQUEST_TEMPLATE,
+      wait: this.config.waitForResponse,
+      dryRun: this.config.dryRun,
+      vscodeCmd: this.config.command,
+      subagentRoot: this.config.subagentRoot,
+      workspaceTemplate: this.config.workspaceTemplate,
+      silent: true
+    });
+    const dispatchFailed = session.exitCode !== 0 || !session.responseFile;
+    if (dispatchFailed) {
+      throw new Error(session.error ?? "VS Code subagent did not produce a response");
+    }
+    const raw = { session, inputFiles };
+    if (this.config.dryRun) {
+      return { outputMessages: [], raw };
+    }
+    const content = await readTextFile(session.responseFile);
+    return { outputMessages: [{ role: "assistant", content }], raw };
+  }
+  /**
+   * Dispatch several requests as one batch and return one response per request, in
+   * request order. Attachments of all requests are merged for the dispatch.
+   *
+   * @throws Error when the session exits non-zero, yields no response files, or
+   *   returns a different number of files than there were requests.
+   */
+  async invokeBatch(requests) {
+    if (requests.length === 0) {
+      return [];
+    }
+    const prepared = requests.map((req) => ({
+      request: req,
+      inputFiles: normalizeAttachments(req.inputFiles)
+    }));
+    const allInputFiles = mergeAttachments(prepared.map((entry) => entry.inputFiles));
+    const userQueries = prepared.map(
+      (entry) => buildPromptDocument2(entry.request, entry.inputFiles, entry.request.guideline_patterns)
+    );
+    const session = await dispatchBatchAgent({
+      userQueries,
+      extraAttachments: allInputFiles,
+      requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
+      wait: this.config.waitForResponse,
+      dryRun: this.config.dryRun,
+      vscodeCmd: this.config.command,
+      subagentRoot: this.config.subagentRoot,
+      workspaceTemplate: this.config.workspaceTemplate,
+      silent: true
+    });
+    const dispatchFailed = session.exitCode !== 0 || !session.responseFiles;
+    if (dispatchFailed) {
+      throw new Error(session.error ?? "VS Code subagent did not produce batch responses");
+    }
+    if (this.config.dryRun) {
+      return prepared.map((entry) => ({
+        outputMessages: [],
+        raw: {
+          session,
+          inputFiles: entry.inputFiles,
+          allInputFiles
+        }
+      }));
+    }
+    const responseFiles = session.responseFiles;
+    if (responseFiles.length !== requests.length) {
+      throw new Error(
+        `VS Code batch returned ${responseFiles.length} responses for ${requests.length} requests`
+      );
+    }
+    const results = [];
+    // Files are read one after another; the i-th file answers the i-th request.
+    for (let index = 0; index < responseFiles.length; index++) {
+      const responseFile = responseFiles[index];
+      const content = await readTextFile(responseFile);
+      results.push({
+        outputMessages: [{ role: "assistant", content }],
+        raw: {
+          session,
+          inputFiles: prepared[index]?.inputFiles,
+          allInputFiles,
+          responseFile
+        }
+      });
+    }
+    return results;
+  }
+};
+function buildPromptDocument2(request, attachments, guidelinePatterns) {
   const parts = [];
   if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
     parts.push(request.systemPrompt.trim());
@@ -3169,7 +4005,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = path10.basename(absolutePath);
+    const fileName = path11.basename(absolutePath);
     const fileUri = pathToFileUri2(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -3194,8 +4030,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path10.resolve(attachment);
-    const normalized = absolutePath.split(path10.sep).join("/");
+    const absolutePath = path11.resolve(attachment);
+    const normalized = absolutePath.split(path11.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -3210,7 +4046,7 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path10.resolve(attachment);
+    const absolutePath = path11.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -3218,7 +4054,7 @@ function collectAttachmentFiles(attachments) {
   return Array.from(unique.values());
 }
 function pathToFileUri2(filePath) {
-  const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
+  const absolutePath = path11.isAbsolute(filePath) ? filePath : path11.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -3231,7 +4067,7 @@ function normalizeAttachments(attachments) {
   }
   const deduped = /* @__PURE__ */ new Set();
   for (const attachment of attachments) {
-    deduped.add(path10.resolve(attachment));
+    deduped.add(path11.resolve(attachment));
   }
   return Array.from(deduped);
 }
@@ -3240,7 +4076,7 @@ function mergeAttachments(all) {
   for (const list of all) {
     if (!list) continue;
     for (const inputFile of list) {
-      deduped.add(path10.resolve(inputFile));
+      deduped.add(path11.resolve(inputFile));
     }
   }
   return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3289,7 +4125,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 import { constants as constants3 } from "node:fs";
 import { access as access3, readFile as readFile6 } from "node:fs/promises";
-import path11 from "node:path";
+import path12 from "node:path";
 import { parse as parse3 } from "yaml";
 function isRecord(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -3326,7 +4162,7 @@ async function fileExists3(filePath) {
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = path11.resolve(filePath);
+  const absolutePath = path12.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
@@ -3358,6 +4194,8 @@ function createProvider(target) {
       return new CliProvider(target.name, target.config);
     case "codex":
       return new CodexProvider(target.name, target.config);
+    case "pi-coding-agent":
+      return new PiCodingAgentProvider(target.name, target.config);
     case "mock":
       return new MockProvider(target.name, target.config);
     case "vscode":
@@ -3377,6 +4215,74 @@ function resolveAndCreateProvider(definition, env = process.env) {
 // src/evaluation/evaluators.ts
 import { generateText as generateText2 } from "ai";
 import { z } from "zod";
+// src/runtime/exec.ts
+/** Return `Bun.spawn` when it exists on the global object as a function, otherwise `undefined`. */
+function getBunSpawn() {
+  const candidate = globalThis.Bun?.spawn;
+  if (typeof candidate !== "function") {
+    return void 0;
+  }
+  return candidate;
+}
+/**
+ * Run a shell command, feed it `stdinPayload` on stdin, and collect its output.
+ * Uses `Bun.spawn` when available and `node:child_process` otherwise.
+ *
+ * @param command - Shell command line to execute.
+ * @param stdinPayload - Text written to the command's stdin, which is then closed.
+ * @param options - `{ cwd?, timeoutMs? }`; a falsy `timeoutMs` disables the timeout.
+ * @returns `{ stdout, stderr, exitCode }`. On the node path `exitCode` is -1 when the
+ *   process ended without a numeric exit code (terminated by a signal).
+ * @throws Error "Process timed out after <n>ms" when the timeout elapses (both runtimes),
+ *   or the spawn error when the command cannot be started.
+ */
+async function execShellWithStdin(command, stdinPayload, options = {}) {
+  const bunSpawn = getBunSpawn();
+  if (bunSpawn) {
+    const encoder = new TextEncoder();
+    const proc = bunSpawn({
+      cmd: ["sh", "-c", command],
+      cwd: options.cwd,
+      stdin: encoder.encode(stdinPayload),
+      stdout: "pipe",
+      stderr: "pipe"
+    });
+    let timedOut = false;
+    const timeout = options.timeoutMs ? setTimeout(() => {
+      timedOut = true;
+      proc.kill();
+    }, options.timeoutMs) : void 0;
+    try {
+      // Drain both pipes concurrently: reading them one after the other can stall
+      // when the child fills the pipe that is not being read yet.
+      const [stdout, stderr, exitCode] = await Promise.all([
+        new Response(proc.stdout).text(),
+        new Response(proc.stderr).text(),
+        proc.exited
+      ]);
+      if (timedOut) {
+        // Report a timeout the same way the node path below does.
+        throw new Error(`Process timed out after ${options.timeoutMs}ms`);
+      }
+      return { stdout, stderr, exitCode };
+    } finally {
+      if (timeout !== void 0) {
+        clearTimeout(timeout);
+      }
+    }
+  }
+  const { spawn: spawn3 } = await import("node:child_process");
+  return await new Promise((resolve, reject) => {
+    const child = spawn3(command, {
+      shell: true,
+      cwd: options.cwd,
+      stdio: ["pipe", "pipe", "pipe"]
+    });
+    let stdout = "";
+    let stderr = "";
+    const timeout = options.timeoutMs ? setTimeout(() => {
+      child.kill();
+      reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
+    }, options.timeoutMs) : void 0;
+    const clearPendingTimeout = () => {
+      if (timeout !== void 0) {
+        clearTimeout(timeout);
+      }
+    };
+    child.stdout?.on("data", (data) => {
+      stdout += data.toString();
+    });
+    child.stderr?.on("data", (data) => {
+      stderr += data.toString();
+    });
+    child.on("error", (error) => {
+      clearPendingTimeout();
+      reject(error);
+    });
+    // "close" fires only after the stdio streams have ended; "exit" can fire while
+    // output is still buffered, which would truncate stdout/stderr.
+    child.on("close", (code) => {
+      clearPendingTimeout();
+      // `code` is null when a signal ended the process; that must not read as success (0).
+      resolve({ stdout, stderr, exitCode: code ?? -1 });
+    });
+    // A child that exits without reading its input makes the write fail (EPIPE). Without
+    // a listener that error would be thrown as an unhandled "error" event; the outcome
+    // is still reported through "close" / "error" above.
+    child.stdin?.on("error", () => {
+    });
+    child.stdin?.write(stdinPayload);
+    child.stdin?.end();
+  });
+}
+// src/evaluation/evaluators.ts
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3441,6 +4347,7 @@ var LlmJudgeEvaluator = class {
         null,
         2
       ),
+      [TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
       [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
       [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
       [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
@@ -3465,7 +4372,7 @@ var LlmJudgeEvaluator = class {
       const score = clampScore(data.score);
       const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
       const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
-      const reasoning = data.reasoning ?? providerResponse?.reasoning;
+      const reasoning = data.reasoning;
       const expectedAspectCount = Math.max(hits.length + misses.length, 1);
       return {
         score,
@@ -3567,7 +4474,9 @@ var LlmJudgeEvaluator = class {
           maxOutputTokens: this.maxOutputTokens,
           temperature: this.temperature
         });
-        const data = schema.parse(parseJsonFromText(response.text ?? ""));
+        const data = schema.parse(
+          parseJsonFromText(extractLastAssistantContent(response.outputMessages))
+        );
         return { data, providerResponse: response };
       } catch (e) {
         lastError = e instanceof Error ? e : new Error(String(e));
@@ -3649,17 +4558,17 @@ var CodeEvaluator = class {
     const inputPayload = JSON.stringify(
       {
         question: context.evalCase.question,
-        expected_outcome: context.evalCase.expected_outcome,
-        expected_messages: context.evalCase.expected_messages,
-        reference_answer: context.evalCase.reference_answer,
-        candidate_answer: context.candidate,
-        guideline_files: context.evalCase.guideline_paths,
-        input_files: context.evalCase.file_paths.filter(
-          (path13) => !context.evalCase.guideline_paths.includes(path13)
+        expectedOutcome: context.evalCase.expected_outcome,
+        expectedMessages: context.evalCase.expected_messages,
+        referenceAnswer: context.evalCase.reference_answer,
+        candidateAnswer: context.candidate,
+        outputMessages: context.outputMessages ?? null,
+        guidelineFiles: context.evalCase.guideline_paths,
+        inputFiles: context.evalCase.file_paths.filter(
+          (path14) => !context.evalCase.guideline_paths.includes(path14)
         ),
-        input_messages: context.evalCase.input_messages,
-        candidate_trace_file: context.candidateTraceRef ?? null,
-        candidate_trace_summary: context.candidateTraceSummary ?? null
+        inputMessages: context.evalCase.input_messages,
+        traceSummary: context.traceSummary ?? null
       },
       null,
       2
@@ -3729,43 +4638,17 @@ function calculateRubricScore(result, rubrics) {
   return { score, verdict, hits, misses };
 }
 async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
-  const { spawn: spawn2 } = await import("node:child_process");
-  return await new Promise((resolve, reject) => {
-    const child = spawn2(scriptPath, {
-      shell: true,
-      cwd
-    });
-    let stdout = "";
-    let stderr = "";
-    const timeout = agentTimeoutMs ? setTimeout(() => {
-      child.kill();
-      reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
-    }, agentTimeoutMs) : void 0;
-    child.stdout?.on("data", (data) => {
-      stdout += data.toString();
-    });
-    child.stderr?.on("data", (data) => {
-      stderr += data.toString();
-    });
-    child.on("error", (error) => {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
-      }
-      reject(error);
-    });
-    child.on("exit", (code) => {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
-      }
-      if (code && code !== 0 && stderr.length > 0) {
-        reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
-        return;
-      }
-      resolve(stdout.trim());
-    });
-    child.stdin?.write(input);
-    child.stdin?.end();
+  const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
+    cwd,
+    timeoutMs: agentTimeoutMs
   });
+  if (exitCode !== 0) {
+    const trimmedErr = stderr.trim();
+    throw new Error(
+      trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
+    );
+  }
+  return stdout.trim();
 }
 function parseJsonSafe(payload) {
   try {
@@ -3779,6 +4662,33 @@ function substituteVariables(template, variables) {
     return variables[varName] ?? match;
   });
 }
+/**
+ * Structural equality for JSON-like values. Primitives compare with `===`; arrays must
+ * have equal length and pairwise-equal elements; plain objects must have the same own
+ * enumerable keys with equal values. An array never equals a non-array object.
+ */
+function deepEqual(a, b) {
+  if (a === b) return true;
+  const bothObjects = a !== null && b !== null && typeof a === "object" && typeof b === "object";
+  // Anything that is not a pair of objects was already settled by the identity check.
+  if (!bothObjects) return false;
+  const aIsArray = Array.isArray(a);
+  if (aIsArray !== Array.isArray(b)) return false;
+  if (aIsArray) {
+    return a.length === b.length && a.every((item, index) => deepEqual(item, b[index]));
+  }
+  const keys = Object.keys(a);
+  if (keys.length !== Object.keys(b).length) return false;
+  for (const key of keys) {
+    if (!Object.hasOwn(b, key) || !deepEqual(a[key], b[key])) return false;
+  }
+  return true;
+}
+/**
+ * Check whether a tool call's actual arguments satisfy the expected ones.
+ *
+ * Matching is partial: every key listed in `expected` must exist in `actual` with a
+ * deeply equal value; extra keys in `actual` are allowed.
+ *
+ * @param expected - Expected arguments; `undefined` or the string "any" matches anything.
+ * @param actual - Arguments the tool was actually called with.
+ * @returns true when the arguments match.
+ */
+function argsMatch(expected, actual) {
+  if (expected === void 0 || expected === "any") return true;
+  if (actual === void 0) return false;
+  for (const key of Object.keys(expected)) {
+    // A null `actual` cannot contain any expected key; Object.hasOwn(null, key) would throw.
+    if (actual === null) return false;
+    if (!Object.hasOwn(actual, key)) return false;
+    if (!deepEqual(expected[key], actual[key])) return false;
+  }
+  return true;
+}
 var ToolTrajectoryEvaluator = class {
   kind = "tool_trajectory";
   config;
@@ -3786,8 +4696,19 @@ var ToolTrajectoryEvaluator = class {
     this.config = options.config;
   }
   evaluate(context) {
-    const { candidateTrace, candidateTraceSummary } = context;
-    if (!candidateTrace || !candidateTraceSummary) {
+    const { outputMessages, traceSummary } = context;
+    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
+    if (toolCalls.length === 0 && !traceSummary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
+    if (!summary) {
       return {
         score: 0,
         verdict: "fail",
@@ -3798,11 +4719,11 @@ var ToolTrajectoryEvaluator = class {
     }
     switch (this.config.mode) {
       case "any_order":
-        return this.evaluateAnyOrder(candidateTraceSummary);
+        return this.evaluateAnyOrder(summary);
       case "in_order":
-        return this.evaluateInOrder(candidateTrace);
+        return this.evaluateInOrder(toolCalls);
       case "exact":
-        return this.evaluateExact(candidateTrace);
+        return this.evaluateExact(toolCalls);
       default:
         return {
           score: 0,
@@ -3813,6 +4734,42 @@ var ToolTrajectoryEvaluator = class {
         };
     }
   }
+  /**
+   * Extract tool calls from output messages.
+   */
+  extractToolCallsFromMessages(messages) {
+    if (!messages) {
+      return [];
+    }
+    const toolCalls = [];
+    for (const message of messages) {
+      if (message.toolCalls) {
+        for (const call of message.toolCalls) {
+          toolCalls.push({
+            name: call.tool,
+            args: call.input
+          });
+        }
+      }
+    }
+    return toolCalls;
+  }
+  /**
+   * Build a summary from extracted tool calls.
+   */
+  buildSummary(toolCalls) {
+    const toolCallsByName = {};
+    for (const call of toolCalls) {
+      toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
+    }
+    const toolNames = Object.keys(toolCallsByName).sort();
+    return {
+      eventCount: toolCalls.length,
+      toolNames,
+      toolCallsByName,
+      errorCount: 0
+    };
+  }
   evaluateAnyOrder(summary) {
     const minimums = this.config.minimums ?? {};
     const toolNames = Object.keys(minimums);
@@ -3845,7 +4802,7 @@ var ToolTrajectoryEvaluator = class {
       expectedAspectCount: toolNames.length
     };
   }
-  evaluateInOrder(trace) {
+  evaluateInOrder(toolCalls) {
     const expected = this.config.expected ?? [];
     if (expected.length === 0) {
       return {
@@ -3856,23 +4813,33 @@ var ToolTrajectoryEvaluator = class {
         expectedAspectCount: 0
       };
     }
-    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
     const hits = [];
     const misses = [];
     let actualIndex = 0;
     for (let i = 0; i < expected.length; i++) {
-      const expectedTool = expected[i].tool;
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
       let found = false;
-      while (actualIndex < actualToolCalls.length) {
-        if (actualToolCalls[actualIndex].name === expectedTool) {
-          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+      let argsMismatch = false;
+      while (actualIndex < toolCalls.length) {
+        const actualCall = toolCalls[actualIndex];
+        if (actualCall.name === expectedTool) {
+          if (argsMatch(expectedItem.args, actualCall.args)) {
+            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            actualIndex++;
+            found = true;
+            break;
+          }
+          misses.push(
+            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
+          );
           actualIndex++;
-          found = true;
+          argsMismatch = true;
           break;
         }
         actualIndex++;
       }
-      if (!found) {
+      if (!found && !argsMismatch) {
         misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
       }
     }
@@ -3885,7 +4852,7 @@ var ToolTrajectoryEvaluator = class {
       expectedAspectCount: expected.length
     };
   }
-  evaluateExact(trace) {
+  evaluateExact(toolCalls) {
     const expected = this.config.expected ?? [];
     if (expected.length === 0) {
       return {
@@ -3896,18 +4863,23 @@ var ToolTrajectoryEvaluator = class {
         expectedAspectCount: 0
       };
     }
-    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
     const hits = [];
     const misses = [];
-    if (actualToolCalls.length !== expected.length) {
-      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
+    if (toolCalls.length !== expected.length) {
+      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
     }
-    const checkLength = Math.min(expected.length, actualToolCalls.length);
+    const checkLength = Math.min(expected.length, toolCalls.length);
     for (let i = 0; i < checkLength; i++) {
-      const expectedTool = expected[i].tool;
-      const actualTool = actualToolCalls[i].name;
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      const actualCall = toolCalls[i];
+      const actualTool = actualCall.name;
       if (actualTool === expectedTool) {
-        hits.push(`Position ${i}: ${expectedTool} \u2713`);
+        if (argsMatch(expectedItem.args, actualCall.args)) {
+          hits.push(`Position ${i}: ${expectedTool}`);
+        } else {
+          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
+        }
       } else {
         misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
       }
@@ -4119,11 +5091,13 @@ var CompositeEvaluator = class {
         evalCaseId: context.evalCase.id,
         attempt: context.attempt
       });
-      const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
+      const data = freeformEvaluationSchema.parse(
+        parseJsonFromText(extractLastAssistantContent(response.outputMessages))
+      );
       const score = clampScore(data.score);
       const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
       const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
-      const reasoning = data.reasoning ?? response.reasoning;
+      const reasoning = data.reasoning;
       return {
         score,
         verdict: scoreToVerdict(score),
@@ -4149,9 +5123,9 @@ var CompositeEvaluator = class {
 };
 // src/evaluation/orchestrator.ts
-import { createHash, randomUUID as randomUUID2 } from "node:crypto";
-import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
-import path12 from "node:path";
+import { createHash, randomUUID as randomUUID3 } from "node:crypto";
+import { mkdir as mkdir3, writeFile as writeFile3 } from "node:fs/promises";
+import path13 from "node:path";
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
 var Node = class {
@@ -4546,11 +5520,19 @@ async function runBatchEvaluation(options) {
     const evalCase = evalCases[i];
     const promptInputs = promptInputsList[i];
     const providerResponse = batchResponse[i];
+    const outputMessages = providerResponse.outputMessages;
+    const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+    const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
+      tokenUsage: providerResponse.tokenUsage,
+      costUsd: providerResponse.costUsd,
+      durationMs: providerResponse.durationMs
+    }) : void 0;
+    const candidate = extractLastAssistantContent(outputMessages);
     let result;
     try {
       result = await evaluateCandidate({
         evalCase,
-        candidate: providerResponse.text ?? "",
+        candidate,
         target,
         provider,
         evaluators: evaluatorRegistry,
@@ -4558,7 +5540,9 @@ async function runBatchEvaluation(options) {
         nowFn,
         attempt: 0,
         judgeProvider: await resolveJudgeProvider(target),
-        agentTimeoutMs
+        agentTimeoutMs,
+        outputMessages,
+        traceSummary
       });
     } catch (error) {
       const errorResult = buildErrorResult(
@@ -4662,21 +5646,18 @@ async function runEvalCase(options) {
   if (cacheKey && cache && !cachedResponse) {
     await cache.set(cacheKey, providerResponse);
   }
-  let candidateTrace = providerResponse.trace;
-  if (!candidateTrace && providerResponse.traceRef) {
-    try {
-      const rawTrace = await readJsonFile(providerResponse.traceRef);
-      if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
-        candidateTrace = rawTrace;
-      }
-    } catch {
-    }
-  }
-  const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
+  const outputMessages = providerResponse.outputMessages;
+  const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+  const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
+    tokenUsage: providerResponse.tokenUsage,
+    costUsd: providerResponse.costUsd,
+    durationMs: providerResponse.durationMs
+  }) : void 0;
+  const candidate = extractLastAssistantContent(outputMessages);
   try {
     return await evaluateCandidate({
       evalCase,
-      candidate: providerResponse.text ?? "",
+      candidate,
       target,
       provider,
       evaluators,
@@ -4685,9 +5666,8 @@ async function runEvalCase(options) {
       attempt,
       judgeProvider,
       agentTimeoutMs,
-      candidateTrace,
-      candidateTraceRef: providerResponse.traceRef,
-      candidateTraceSummary
+      outputMessages,
+      traceSummary
     });
   } catch (error) {
     return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4705,9 +5685,8 @@ async function evaluateCandidate(options) {
     attempt,
     judgeProvider,
     agentTimeoutMs,
-    candidateTrace,
-    candidateTraceRef,
-    candidateTraceSummary
+    outputMessages,
+    traceSummary
   } = options;
   const gradeTimestamp = nowFn();
   const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4721,9 +5700,8 @@ async function evaluateCandidate(options) {
     now: gradeTimestamp,
     judgeProvider,
     agentTimeoutMs,
-    candidateTrace,
-    candidateTraceRef,
-    candidateTraceSummary
+    outputMessages,
+    traceSummary
   });
   const completedAt = nowFn();
   let agentProviderRequest;
@@ -4747,21 +5725,21 @@ async function evaluateCandidate(options) {
   }
   return {
     timestamp: completedAt.toISOString(),
-    eval_id: evalCase.id,
+    evalId: evalCase.id,
     dataset: evalCase.dataset,
-    conversation_id: evalCase.conversation_id,
+    conversationId: evalCase.conversation_id,
     score: score.score,
     hits: score.hits,
     misses: score.misses,
-    candidate_answer: candidate,
+    candidateAnswer: candidate,
     target: target.name,
     reasoning: score.reasoning,
-    raw_aspects: score.rawAspects,
-    agent_provider_request: agentProviderRequest,
-    lm_provider_request: lmProviderRequest,
-    evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
-    evaluator_results: evaluatorResults,
-    trace_summary: candidateTraceSummary
+    rawAspects: score.rawAspects,
+    agentProviderRequest,
+    lmProviderRequest,
+    evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
+    evaluatorResults,
+    traceSummary
   };
 }
 async function runEvaluatorsForCase(options) {
@@ -4776,9 +5754,8 @@ async function runEvaluatorsForCase(options) {
     now,
     judgeProvider,
     agentTimeoutMs,
-    candidateTrace,
-    candidateTraceRef,
-    candidateTraceSummary
+    outputMessages,
+    traceSummary
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
     return runEvaluatorList({
@@ -4793,9 +5770,8 @@ async function runEvaluatorsForCase(options) {
       now,
       judgeProvider,
       agentTimeoutMs,
-      candidateTrace,
-      candidateTraceRef,
-      candidateTraceSummary
+      outputMessages,
+      traceSummary
     });
   }
   const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4812,9 +5788,8 @@ async function runEvaluatorsForCase(options) {
     promptInputs,
     now,
     judgeProvider,
-    candidateTrace,
-    candidateTraceRef,
-    candidateTraceSummary
+    outputMessages,
+    traceSummary
   });
   return { score };
 }
@@ -4831,9 +5806,8 @@ async function runEvaluatorList(options) {
     now,
     judgeProvider,
     agentTimeoutMs,
-    candidateTrace,
-    candidateTraceRef,
-    candidateTraceSummary
+    outputMessages,
+    traceSummary
   } = options;
   const scored = [];
   const evaluatorResults = [];
@@ -4863,7 +5837,7 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_provider_request: score2.evaluatorRawRequest
+          evaluatorProviderRequest: score2.evaluatorRawRequest
         });
       }
       if (evaluator.type === "code") {
@@ -4880,8 +5854,8 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
-          candidateTraceRef,
-          candidateTraceSummary
+          outputMessages,
+          traceSummary
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -4894,11 +5868,11 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_provider_request: score2.evaluatorRawRequest
+          evaluatorProviderRequest: score2.evaluatorRawRequest
         });
       }
       if (evaluator.type === "composite") {
-        const evalFileDir = evalCase.guideline_paths[0] ? path12.dirname(evalCase.guideline_paths[0]) : process.cwd();
+        const evalFileDir = evalCase.guideline_paths[0] ? path13.dirname(evalCase.guideline_paths[0]) : process.cwd();
         const createEvaluator = (memberConfig) => {
           switch (memberConfig.type) {
             case "llm_judge":
@@ -4951,8 +5925,8 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_provider_request: score2.evaluatorRawRequest,
-          evaluator_results: mapChildResults(score2.evaluatorResults)
+          evaluatorProviderRequest: score2.evaluatorRawRequest,
+          evaluatorResults: mapChildResults(score2.evaluatorResults)
         });
       }
       if (evaluator.type === "tool_trajectory") {
@@ -4967,9 +5941,8 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
-          candidateTrace,
-          candidateTraceRef,
-          candidateTraceSummary
+          outputMessages,
+          traceSummary
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -5111,22 +6084,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
 async function dumpPrompt(directory, evalCase, promptInputs) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
   const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
-  const filePath = path12.resolve(directory, filename);
-  await mkdir2(path12.dirname(filePath), { recursive: true });
+  const filePath = path13.resolve(directory, filename);
+  await mkdir3(path13.dirname(filePath), { recursive: true });
   const payload = {
     eval_id: evalCase.id,
     question: promptInputs.question,
     guidelines: promptInputs.guidelines,
     guideline_paths: evalCase.guideline_paths
   };
-  await writeFile2(filePath, JSON.stringify(payload, null, 2), "utf8");
+  await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
 }
 function sanitizeFilename(value) {
   if (!value) {
     return "prompt";
   }
   const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
-  return sanitized.length > 0 ? sanitized : randomUUID2();
+  return sanitized.length > 0 ? sanitized : randomUUID3();
 }
 async function invokeProvider(provider, options) {
   const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -5183,17 +6156,17 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
   }
   return {
     timestamp: timestamp.toISOString(),
-    eval_id: evalCase.id,
+    evalId: evalCase.id,
     dataset: evalCase.dataset,
-    conversation_id: evalCase.conversation_id,
+    conversationId: evalCase.conversation_id,
     score: 0,
     hits: [],
     misses: [`Error: ${message}`],
-    candidate_answer: `Error occurred: ${message}`,
+    candidateAnswer: `Error occurred: ${message}`,
     target: targetName,
-    raw_aspects: [],
-    agent_provider_request: agentProviderRequest,
-    lm_provider_request: lmProviderRequest,
+    rawAspects: [],
+    agentProviderRequest,
+    lmProviderRequest,
     error: message
   };
 }
@@ -5238,8 +6211,8 @@ function mapChildResults(children) {
     hits: child.hits,
     misses: child.misses,
     reasoning: child.reasoning,
-    evaluator_provider_request: child.evaluatorRawRequest,
-    evaluator_results: mapChildResults(child.evaluatorResults)
+    evaluatorProviderRequest: child.evaluatorRawRequest,
+    evaluatorResults: mapChildResults(child.evaluatorResults)
   }));
 }
 function computeWeightedMean(entries) {
@@ -5340,17 +6313,21 @@ function createAgentKernel() {
 export {
   CodeEvaluator,
   CompositeEvaluator,
+  DEFAULT_EXPLORATION_TOOLS,
   LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
   ToolTrajectoryEvaluator,
+  avgToolDurationMs,
   buildDirectoryChain,
   buildPromptInputs,
   buildSearchRoots,
   computeTraceSummary,
   consumeCodexLogEntries,
+  consumePiLogEntries,
   createAgentKernel,
   createProvider,
   ensureVSCodeSubagents,
+  explorationRatio,
   extractCodeBlocks,
   fileExists,
   findGitRoot,
@@ -5362,10 +6339,9 @@ export {
   isJsonValue,
   isTestMessage,
   isTestMessageRole,
-  isTraceEvent,
-  isTraceEventType,
   listTargetNames,
   loadEvalCases,
+  mergeExecutionMetrics,
   normalizeLineEndings,
   readJsonFile,
   readTargetDefinitions,
@@ -5376,6 +6352,8 @@ export {
   resolveTargetDefinition,
   runEvalCase,
   runEvaluation,
-  subscribeToCodexLogEntries
+  subscribeToCodexLogEntries,
+  subscribeToPiLogEntries,
+  tokensPerTool
 };
 //# sourceMappingURL=index.js.map