npm - @agentv/core - Versions diffs - 1.4.0 → 1.5.0 - Mend

@agentv/core 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-KPHTMTZ3.js → chunk-E2VSU4WZ.js} +265 -83
package/dist/chunk-E2VSU4WZ.js.map +1 -0
package/dist/evaluation/validation/index.cjs +82 -71
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +3 -72
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +1475 -393
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +227 -33
package/dist/index.d.ts +227 -33
package/dist/index.js +1142 -244
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-KPHTMTZ3.js.map +0 -1

package/dist/index.js CHANGED

@@ -10,7 +10,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-KPHTMTZ3.js";
+} from "./chunk-E2VSU4WZ.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -93,6 +93,53 @@ function computeTraceSummary(messages) {
     errorCount: 0
   };
 }
+var DEFAULT_EXPLORATION_TOOLS = [
+  "read",
+  "grep",
+  "glob",
+  "search",
+  "list",
+  "Read",
+  "Grep",
+  "Glob",
+  "WebSearch",
+  "WebFetch"
+];
+function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
+  if (summary.eventCount === 0) return void 0;
+  const explorationCalls = explorationTools.reduce(
+    (sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0),
+    0
+  );
+  return explorationCalls / summary.eventCount;
+}
+function tokensPerTool(summary) {
+  if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
+  const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
+  return totalTokens / summary.eventCount;
+}
+function avgToolDurationMs(summary) {
+  if (!summary.toolDurations) return void 0;
+  let totalDuration = 0;
+  let totalCalls = 0;
+  for (const durations of Object.values(summary.toolDurations)) {
+    for (const duration of durations) {
+      totalDuration += duration;
+      totalCalls++;
+    }
+  }
+  if (totalCalls === 0) return void 0;
+  return totalDuration / totalCalls;
+}
+function mergeExecutionMetrics(summary, metrics) {
+  if (!metrics) return summary;
+  return {
+    ...summary,
+    tokenUsage: metrics.tokenUsage,
+    costUsd: metrics.costUsd,
+    durationMs: metrics.durationMs
+  };
+}
 // src/evaluation/yaml-parser.ts
 import { readFile as readFile5 } from "node:fs/promises";
@@ -607,7 +654,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         expected = [];
         for (const item of rawExpected) {
           if (isJsonObject2(item) && typeof item.tool === "string") {
-            expected.push({ tool: item.tool });
+            let args;
+            if (item.args === "any") {
+              args = "any";
+            } else if (isJsonObject2(item.args)) {
+              args = item.args;
+            }
+            expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
           }
         }
       }
@@ -1767,12 +1820,14 @@ var CliProvider = class {
         `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
       );
     }
+    const startTime = Date.now();
     const result = await this.runCommand(renderedCommand, {
       cwd: this.config.cwd,
       env: process.env,
       timeoutMs: this.config.timeoutMs,
       signal: request.signal
     });
+    const measuredDurationMs = Date.now() - startTime;
     if (result.failed || (result.exitCode ?? 0) !== 0) {
       if (request.signal?.aborted) {
         throw new Error("CLI provider request was aborted");
@@ -1791,6 +1846,9 @@ var CliProvider = class {
     const parsed = this.parseOutputContent(responseContent);
     return {
       outputMessages: parsed.outputMessages,
+      tokenUsage: parsed.tokenUsage,
+      costUsd: parsed.costUsd,
+      durationMs: parsed.durationMs ?? measuredDurationMs,
       raw: {
         command: renderedCommand,
         stderr: result.stderr,
@@ -1838,12 +1896,14 @@ var CliProvider = class {
         `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
       );
     }
+    const startTime = Date.now();
     const result = await this.runCommand(renderedCommand, {
       cwd: this.config.cwd,
       env: process.env,
       timeoutMs: this.config.timeoutMs,
       signal: controller.signal
     });
+    const measuredDurationMs = Date.now() - startTime;
     if (result.failed || (result.exitCode ?? 0) !== 0) {
       if (controller.signal.aborted) {
         throw new Error("CLI provider request was aborted");
@@ -1865,11 +1925,13 @@ var CliProvider = class {
     if (missingIds.length > 0) {
       throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
     }
+    const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
     const responses = requests.map((request) => {
       const evalCaseId = request.evalCaseId;
       if (!evalCaseId) {
         return {
           outputMessages: [],
+          durationMs: perRequestFallbackMs,
           raw: {
             command: renderedCommand,
             stderr: result.stderr,
@@ -1883,6 +1945,7 @@ var CliProvider = class {
       if (!parsed) {
         return {
           outputMessages: [],
+          durationMs: perRequestFallbackMs,
           raw: {
             command: renderedCommand,
             stderr: result.stderr,
@@ -1894,6 +1957,9 @@ var CliProvider = class {
       }
       return {
         outputMessages: parsed.outputMessages,
+        tokenUsage: parsed.tokenUsage,
+        costUsd: parsed.costUsd,
+        durationMs: parsed.durationMs ?? perRequestFallbackMs,
         raw: {
           command: renderedCommand,
           stderr: result.stderr,
@@ -1911,25 +1977,55 @@ var CliProvider = class {
    * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
    * If only 'text' is provided, wrap it in outputMessages.
    * Otherwise, treat the entire content as plain text wrapped in outputMessages.
+   *
+   * Also extracts optional execution metrics:
+   * - token_usage: { input, output, cached? }
+   * - cost_usd: number
+   * - duration_ms: number
    */
   parseOutputContent(content) {
     try {
       const parsed = JSON.parse(content);
       if (typeof parsed === "object" && parsed !== null) {
         const obj = parsed;
+        const tokenUsage = this.parseTokenUsage(obj.token_usage);
+        const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
+        const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
         const outputMessages = this.parseOutputMessages(obj.output_messages);
         if (outputMessages && outputMessages.length > 0) {
-          return { outputMessages };
+          return { outputMessages, tokenUsage, costUsd, durationMs };
         }
         if ("text" in obj) {
           const text = typeof obj.text === "string" ? obj.text : String(obj.text);
-          return { outputMessages: [{ role: "assistant", content: text }] };
+          return {
+            outputMessages: [{ role: "assistant", content: text }],
+            tokenUsage,
+            costUsd,
+            durationMs
+          };
         }
       }
     } catch {
     }
     return { outputMessages: [{ role: "assistant", content }] };
   }
+  /**
+   * Parse token_usage from CLI output.
+   */
+  parseTokenUsage(tokenUsage) {
+    if (typeof tokenUsage !== "object" || tokenUsage === null) {
+      return void 0;
+    }
+    const obj = tokenUsage;
+    if (typeof obj.input !== "number" || typeof obj.output !== "number") {
+      return void 0;
+    }
+    return {
+      input: obj.input,
+      output: obj.output,
+      cached: typeof obj.cached === "number" ? obj.cached : void 0
+    };
+  }
   /**
    * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
    */
@@ -2006,6 +2102,9 @@ var CliProvider = class {
       if (records.has(id)) {
         throw new Error(`CLI batch output contains duplicate id: ${id}`);
       }
+      const tokenUsage = this.parseTokenUsage(obj.token_usage);
+      const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
+      const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
       const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
       let outputMessages;
       if (parsedOutputMessages && parsedOutputMessages.length > 0) {
@@ -2015,7 +2114,10 @@ var CliProvider = class {
         outputMessages = text ? [{ role: "assistant", content: text }] : [];
       }
       records.set(id, {
-        outputMessages
+        outputMessages,
+        tokenUsage,
+        costUsd,
+        durationMs
       });
     }
     return records;
@@ -2331,6 +2433,11 @@ var execAsync2 = promisify2(execCallback);
 var WORKSPACE_PREFIX = "agentv-codex-";
 var PROMPT_FILENAME = "prompt.md";
 var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
+var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
+- Do NOT create any additional output files in the workspace.
+- All intended file outputs/changes MUST be written in your response.
+- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
+This is required for evaluation scoring.`;
 var CodexProvider = class {
   id;
   kind = "codex";
@@ -2355,7 +2462,11 @@ var CodexProvider = class {
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
-      const promptContent = buildPromptDocument(request, inputFiles);
+      const basePrompt = buildPromptDocument(request, inputFiles);
+      const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
+      const promptContent = `${systemPrompt}
+${basePrompt}`;
       const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
       await writeFile(promptFile, promptContent, "utf8");
       const args = this.buildCodexArgs();
@@ -3039,172 +3150,851 @@ var MockProvider = class {
   }
 };
-// src/evaluation/providers/vscode.ts
+// src/evaluation/providers/pi-coding-agent.ts
+import { spawn as spawn2 } from "node:child_process";
+import { randomUUID as randomUUID2 } from "node:crypto";
+import { createWriteStream as createWriteStream2 } from "node:fs";
+import { mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
+import { tmpdir as tmpdir2 } from "node:os";
 import path10 from "node:path";
-import {
-  dispatchAgentSession,
-  dispatchBatchAgent,
-  getSubagentRoot,
-  provisionSubagents
-} from "subagent";
-// src/evaluation/providers/vscode-templates.ts
-var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
-{{userQuery}}
-[[ ## system_instructions ## ]]
-**IMPORTANT**: Follow these exact steps:
-1. Create and write your complete response to: {{responseFileTmp}}
-    - Do NOT create any additional output files in the workspace.
-    - All intended file outputs/changes MUST be written in your response file.
-    - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
-2. When completely finished, run these PowerShell commands to signal completion:
-\`\`\`
-Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
-if (Test-Path subagent.lock) { del subagent.lock }
-\`\`\`
-Do not proceed to step 2 until your response is completely written to the temporary file.
-`;
-var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
-{{userQuery}}
-[[ ## system_instructions ## ]]
-**IMPORTANT**: Follow these exact steps:
-1. Create and write your complete response to: {{responseFileTmp}}
-    - Do NOT create any additional output files in the workspace.
-    - All intended file outputs/changes MUST be written in your response file.
-    - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
-2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
-3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
-`;
+// src/evaluation/providers/pi-log-tracker.ts
+var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
+var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
+function getPiLogStore() {
+  const globalObject = globalThis;
+  const existing = globalObject[GLOBAL_LOGS_KEY2];
+  if (existing) {
+    return existing;
+  }
+  const created = [];
+  globalObject[GLOBAL_LOGS_KEY2] = created;
+  return created;
+}
+function getSubscriberStore2() {
+  const globalObject = globalThis;
+  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
+  if (existing) {
+    return existing;
+  }
+  const created = /* @__PURE__ */ new Set();
+  globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
+  return created;
+}
+function notifySubscribers2(entry) {
+  const subscribers = Array.from(getSubscriberStore2());
+  for (const listener of subscribers) {
+    try {
+      listener(entry);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Pi log subscriber failed: ${message}`);
+    }
+  }
+}
+function recordPiLogEntry(entry) {
+  getPiLogStore().push(entry);
+  notifySubscribers2(entry);
+}
+function consumePiLogEntries() {
+  const store = getPiLogStore();
+  if (store.length === 0) {
+    return [];
+  }
+  return store.splice(0, store.length);
+}
+function subscribeToPiLogEntries(listener) {
+  const store = getSubscriberStore2();
+  store.add(listener);
+  return () => {
+    store.delete(listener);
+  };
+}
-// src/evaluation/providers/vscode.ts
-var VSCodeProvider = class {
+// src/evaluation/providers/pi-coding-agent.ts
+var WORKSPACE_PREFIX2 = "agentv-pi-";
+var PROMPT_FILENAME2 = "prompt.md";
+var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
+- Do NOT create any additional output files in the workspace.
+- All intended file outputs/changes MUST be written in your response.
+- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
+This is required for evaluation scoring.`;
+var PiCodingAgentProvider = class {
   id;
-  kind;
+  kind = "pi-coding-agent";
   targetName;
-  supportsBatch = true;
+  supportsBatch = false;
   config;
-  constructor(targetName, config, kind) {
-    this.id = `${kind}:${targetName}`;
-    this.kind = kind;
+  runPi;
+  constructor(targetName, config, runner = defaultPiRunner) {
+    this.id = `pi-coding-agent:${targetName}`;
     this.targetName = targetName;
     this.config = config;
+    this.runPi = runner;
   }
   async invoke(request) {
     if (request.signal?.aborted) {
-      throw new Error("VS Code provider request was aborted before dispatch");
-    }
-    const inputFiles = normalizeAttachments(request.inputFiles);
-    const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
-    const session = await dispatchAgentSession({
-      userQuery: promptContent,
-      extraAttachments: inputFiles,
-      requestTemplate: AGENTV_REQUEST_TEMPLATE,
-      wait: this.config.waitForResponse,
-      dryRun: this.config.dryRun,
-      vscodeCmd: this.config.command,
-      subagentRoot: this.config.subagentRoot,
-      workspaceTemplate: this.config.workspaceTemplate,
-      silent: true
-    });
-    if (session.exitCode !== 0 || !session.responseFile) {
-      const failure = session.error ?? "VS Code subagent did not produce a response";
-      throw new Error(failure);
+      throw new Error("Pi coding agent request was aborted before execution");
     }
-    if (this.config.dryRun) {
+    const inputFiles = normalizeInputFiles2(request.inputFiles);
+    const workspaceRoot = await this.createWorkspace();
+    const logger = await this.createStreamLogger(request).catch(() => void 0);
+    try {
+      const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
+      await writeFile2(promptFile, request.question, "utf8");
+      const args = this.buildPiArgs(request.question, inputFiles);
+      const cwd = this.resolveCwd(workspaceRoot);
+      const result = await this.executePi(args, cwd, request.signal, logger);
+      if (result.timedOut) {
+        throw new Error(
+          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
+        );
+      }
+      if (result.exitCode !== 0) {
+        const detail = pickDetail2(result.stderr, result.stdout);
+        const prefix = `Pi coding agent exited with code ${result.exitCode}`;
+        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
+      }
+      const parsed = parsePiJsonl(result.stdout);
+      const outputMessages = extractOutputMessages(parsed);
+      const assistantText = extractAssistantText2(outputMessages);
       return {
-        outputMessages: [],
         raw: {
-          session,
-          inputFiles
-        }
+          response: parsed,
+          stdout: result.stdout,
+          stderr: result.stderr,
+          exitCode: result.exitCode,
+          args,
+          executable: this.config.executable,
+          promptFile,
+          workspace: workspaceRoot,
+          inputFiles,
+          logFile: logger?.filePath
+        },
+        outputMessages
       };
+    } finally {
+      await logger?.close();
+      await this.cleanupWorkspace(workspaceRoot);
     }
-    const responseText = await readTextFile(session.responseFile);
-    return {
-      outputMessages: [{ role: "assistant", content: responseText }],
-      raw: {
-        session,
-        inputFiles
-      }
-    };
   }
-  async invokeBatch(requests) {
-    if (requests.length === 0) {
-      return [];
+  resolveCwd(workspaceRoot) {
+    if (!this.config.cwd) {
+      return workspaceRoot;
     }
-    const normalizedRequests = requests.map((req) => ({
-      request: req,
-      inputFiles: normalizeAttachments(req.inputFiles)
-    }));
-    const combinedInputFiles = mergeAttachments(
-      normalizedRequests.map(({ inputFiles }) => inputFiles)
-    );
-    const userQueries = normalizedRequests.map(
-      ({ request, inputFiles }) => buildPromptDocument2(request, inputFiles, request.guideline_patterns)
-    );
-    const session = await dispatchBatchAgent({
-      userQueries,
-      extraAttachments: combinedInputFiles,
-      requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
-      wait: this.config.waitForResponse,
-      dryRun: this.config.dryRun,
-      vscodeCmd: this.config.command,
-      subagentRoot: this.config.subagentRoot,
-      workspaceTemplate: this.config.workspaceTemplate,
-      silent: true
-    });
-    if (session.exitCode !== 0 || !session.responseFiles) {
-      const failure = session.error ?? "VS Code subagent did not produce batch responses";
-      throw new Error(failure);
+    return path10.resolve(this.config.cwd);
+  }
+  buildPiArgs(prompt, inputFiles) {
+    const args = [];
+    if (this.config.provider) {
+      args.push("--provider", this.config.provider);
     }
-    if (this.config.dryRun) {
-      return normalizedRequests.map(({ inputFiles }) => ({
-        outputMessages: [],
-        raw: {
-          session,
-          inputFiles,
-          allInputFiles: combinedInputFiles
-        }
-      }));
+    if (this.config.model) {
+      args.push("--model", this.config.model);
     }
-    if (session.responseFiles.length !== requests.length) {
-      throw new Error(
-        `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
-      );
+    if (this.config.apiKey) {
+      args.push("--api-key", this.config.apiKey);
     }
-    const responses = [];
-    for (const [index, responseFile] of session.responseFiles.entries()) {
-      const responseText = await readTextFile(responseFile);
-      responses.push({
-        outputMessages: [{ role: "assistant", content: responseText }],
-        raw: {
-          session,
-          inputFiles: normalizedRequests[index]?.inputFiles,
-          allInputFiles: combinedInputFiles,
-          responseFile
-        }
-      });
+    args.push("--mode", "json");
+    args.push("--print");
+    args.push("--no-session");
+    if (this.config.tools) {
+      args.push("--tools", this.config.tools);
     }
-    return responses;
+    if (this.config.thinking) {
+      args.push("--thinking", this.config.thinking);
+    }
+    if (this.config.args && this.config.args.length > 0) {
+      args.push(...this.config.args);
+    }
+    if (inputFiles && inputFiles.length > 0) {
+      for (const file of inputFiles) {
+        args.push(`@${file}`);
+      }
+    }
+    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
+    const fullPrompt = `${systemPrompt}
+${prompt}`;
+    const escapedPrompt = escapeAtSymbols(fullPrompt);
+    args.push(escapedPrompt);
+    return args;
   }
-};
-function buildPromptDocument2(request, attachments, guidelinePatterns) {
-  const parts = [];
-  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
-    parts.push(request.systemPrompt.trim());
+  async executePi(args, cwd, signal, logger) {
+    try {
+      return await this.runPi({
+        executable: this.config.executable,
+        args,
+        cwd,
+        timeoutMs: this.config.timeoutMs,
+        env: this.buildEnv(),
+        signal,
+        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
+        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
+      });
+    } catch (error) {
+      const err = error;
+      if (err.code === "ENOENT") {
+        throw new Error(
+          `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
+        );
+      }
+      throw error;
+    }
   }
-  const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
-  const attachmentFiles = collectAttachmentFiles(attachments);
-  const nonGuidelineAttachments = attachmentFiles.filter((file) => !guidelineFiles.includes(file));
-  const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
-  if (prereadBlock.length > 0) {
+  buildEnv() {
+    const env = { ...process.env };
+    if (this.config.apiKey) {
+      const provider = this.config.provider?.toLowerCase() ?? "google";
+      switch (provider) {
+        case "google":
+        case "gemini":
+          env.GEMINI_API_KEY = this.config.apiKey;
+          break;
+        case "anthropic":
+          env.ANTHROPIC_API_KEY = this.config.apiKey;
+          break;
+        case "openai":
+          env.OPENAI_API_KEY = this.config.apiKey;
+          break;
+        case "groq":
+          env.GROQ_API_KEY = this.config.apiKey;
+          break;
+        case "xai":
+          env.XAI_API_KEY = this.config.apiKey;
+          break;
+        case "openrouter":
+          env.OPENROUTER_API_KEY = this.config.apiKey;
+          break;
+      }
+    }
+    return env;
+  }
+  async createWorkspace() {
+    return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
+  }
+  async cleanupWorkspace(workspaceRoot) {
+    try {
+      await rm2(workspaceRoot, { recursive: true, force: true });
+    } catch {
+    }
+  }
+  resolveLogDirectory() {
+    if (this.config.logDir) {
+      return path10.resolve(this.config.logDir);
+    }
+    return path10.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
+  }
+  async createStreamLogger(request) {
+    const logDir = this.resolveLogDirectory();
+    if (!logDir) {
+      return void 0;
+    }
+    try {
+      await mkdir2(logDir, { recursive: true });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
+      return void 0;
+    }
+    const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
+    try {
+      const logger = await PiStreamLogger.create({
+        filePath,
+        targetName: this.targetName,
+        evalCaseId: request.evalCaseId,
+        attempt: request.attempt,
+        format: this.config.logFormat ?? "summary"
+      });
+      recordPiLogEntry({
+        filePath,
+        targetName: this.targetName,
+        evalCaseId: request.evalCaseId,
+        attempt: request.attempt
+      });
+      return logger;
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
+      return void 0;
+    }
+  }
+};
+var PiStreamLogger = class _PiStreamLogger {
+  filePath;
+  stream;
+  startedAt = Date.now();
+  stdoutBuffer = "";
+  stderrBuffer = "";
+  format;
+  constructor(filePath, format) {
+    this.filePath = filePath;
+    this.format = format;
+    this.stream = createWriteStream2(filePath, { flags: "a" });
+  }
+  static async create(options) {
+    const logger = new _PiStreamLogger(options.filePath, options.format);
+    const header = [
+      "# Pi Coding Agent stream log",
+      `# target: ${options.targetName}`,
+      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
+      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
+      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
+      ""
+    ].filter((line) => Boolean(line));
+    logger.writeLines(header);
+    return logger;
+  }
+  handleStdoutChunk(chunk) {
+    this.stdoutBuffer += chunk;
+    this.flushBuffer("stdout");
+  }
+  handleStderrChunk(chunk) {
+    this.stderrBuffer += chunk;
+    this.flushBuffer("stderr");
+  }
+  async close() {
+    this.flushBuffer("stdout");
+    this.flushBuffer("stderr");
+    this.flushRemainder();
+    await new Promise((resolve, reject) => {
+      this.stream.once("error", reject);
+      this.stream.end(() => resolve());
+    });
+  }
+  writeLines(lines) {
+    for (const line of lines) {
+      this.stream.write(`${line}
+`);
+    }
+  }
+  flushBuffer(source) {
+    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
+    const lines = buffer.split(/\r?\n/);
+    const remainder = lines.pop() ?? "";
+    if (source === "stdout") {
+      this.stdoutBuffer = remainder;
+    } else {
+      this.stderrBuffer = remainder;
+    }
+    for (const line of lines) {
+      const formatted = this.formatLine(line, source);
+      if (formatted) {
+        this.stream.write(formatted);
+        this.stream.write("\n");
+      }
+    }
+  }
+  formatLine(rawLine, source) {
+    const trimmed = rawLine.trim();
+    if (trimmed.length === 0) {
+      return void 0;
+    }
+    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
+    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
+  }
+  flushRemainder() {
+    const stdoutRemainder = this.stdoutBuffer.trim();
+    if (stdoutRemainder.length > 0) {
+      const formatted = this.formatLine(stdoutRemainder, "stdout");
+      if (formatted) {
+        this.stream.write(formatted);
+        this.stream.write("\n");
+      }
+    }
+    const stderrRemainder = this.stderrBuffer.trim();
+    if (stderrRemainder.length > 0) {
+      const formatted = this.formatLine(stderrRemainder, "stderr");
+      if (formatted) {
+        this.stream.write(formatted);
+        this.stream.write("\n");
+      }
+    }
+    this.stdoutBuffer = "";
+    this.stderrBuffer = "";
+  }
+};
+function buildLogFilename2(request, targetName) {
+  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+  const evalId = sanitizeForFilename2(request.evalCaseId ?? "pi");
+  const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
+  const target = sanitizeForFilename2(targetName);
+  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID2().slice(0, 8)}.log`;
+}
+function sanitizeForFilename2(value) {
+  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
+  return sanitized.length > 0 ? sanitized : "pi";
+}
+function formatElapsed2(startedAt) {
+  const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
+  const hours = Math.floor(elapsedSeconds / 3600);
+  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
+  const seconds = elapsedSeconds % 60;
+  if (hours > 0) {
+    return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
+  }
+  return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
+}
+function formatPiLogMessage(rawLine, source) {
+  const parsed = tryParseJsonValue2(rawLine);
+  if (parsed) {
+    const summary = summarizePiEvent(parsed);
+    if (summary) {
+      return summary;
+    }
+  }
+  if (source === "stderr") {
+    return `stderr: ${rawLine}`;
+  }
+  return rawLine;
+}
+function formatPiJsonLog(rawLine) {
+  const parsed = tryParseJsonValue2(rawLine);
+  if (!parsed) {
+    return rawLine;
+  }
+  try {
+    return JSON.stringify(parsed, null, 2);
+  } catch {
+    return rawLine;
+  }
+}
+function summarizePiEvent(event) {
+  if (!event || typeof event !== "object") {
+    return void 0;
+  }
+  const record = event;
+  const type = typeof record.type === "string" ? record.type : void 0;
+  if (!type) {
+    return void 0;
+  }
+  switch (type) {
+    case "agent_start":
+      return "agent_start";
+    case "agent_end":
+      return "agent_end";
+    case "turn_start":
+      return "turn_start";
+    case "turn_end":
+      return "turn_end";
+    case "message_start":
+    case "message_end": {
+      const message = record.message;
+      const role = message?.role;
+      return `${type}: ${role}`;
+    }
+    case "message_update": {
+      const event2 = record.assistantMessageEvent;
+      const eventType = event2?.type;
+      if (eventType === "text_delta") {
+        const delta = event2?.delta;
+        if (typeof delta === "string") {
+          const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
+          return `text_delta: ${preview}`;
+        }
+      }
+      return `message_update: ${eventType}`;
+    }
+    default:
+      return type;
+  }
+}
+function tryParseJsonValue2(rawLine) {
+  try {
+    return JSON.parse(rawLine);
+  } catch {
+    return void 0;
+  }
+}
+function parsePiJsonl(output) {
+  const trimmed = output.trim();
+  if (trimmed.length === 0) {
+    throw new Error("Pi coding agent produced no output");
+  }
+  const lines = trimmed.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
+  const parsed = [];
+  for (const line of lines) {
+    try {
+      parsed.push(JSON.parse(line));
+    } catch {
+    }
+  }
+  if (parsed.length === 0) {
+    throw new Error("Pi coding agent produced no valid JSON output");
+  }
+  return parsed;
+}
+/**
+ * Derives the output message list from a sequence of parsed events.
+ *
+ * The last `agent_end` event that carries a `messages` array wins. When there
+ * is none, the `message` of every `turn_end` event is collected in order.
+ *
+ * @param {unknown[]} events - Parsed events.
+ * @returns {object[]} Converted messages; entries that fail conversion are dropped.
+ */
+function extractOutputMessages(events) {
+  const isObjectLike = (value) => Boolean(value) && typeof value === "object";
+  // Scan from the end so the most recent agent_end is preferred.
+  let cursor = events.length;
+  while (cursor-- > 0) {
+    const candidate = events[cursor];
+    if (isObjectLike(candidate) && candidate.type === "agent_end" && Array.isArray(candidate.messages)) {
+      return candidate.messages.map(convertPiMessage).filter((m) => m !== void 0);
+    }
+  }
+  // Fallback: assemble the conversation from the per-turn messages.
+  const collected = [];
+  for (const candidate of events) {
+    if (!isObjectLike(candidate) || candidate.type !== "turn_end") {
+      continue;
+    }
+    const converted = convertPiMessage(candidate.message);
+    if (converted) {
+      collected.push(converted);
+    }
+  }
+  return collected;
+}
+/**
+ * Converts one raw message into the normalized output-message shape.
+ *
+ * @param {unknown} message - Raw message; must be an object with a string `role`.
+ * @returns {{role: string, content: (string|undefined), toolCalls: (object[]|undefined), timestamp: (string|undefined), metadata: (object|undefined)} | undefined}
+ *   The converted message, or `undefined` when the input is not a usable message.
+ */
+function convertPiMessage(message) {
+  if (!message || typeof message !== "object") {
+    return void 0;
+  }
+  const msg = message;
+  const role = msg.role;
+  if (typeof role !== "string") {
+    return void 0;
+  }
+  const content = extractTextContent(msg.content);
+  const toolCalls = extractToolCalls(msg.content);
+  let timestamp;
+  if (typeof msg.timestamp === "number") {
+    const parsedDate = new Date(msg.timestamp);
+    // toISOString() throws RangeError on an invalid date (NaN, Infinity,
+    // out-of-range epoch); drop the timestamp instead of failing the message.
+    timestamp = Number.isNaN(parsedDate.getTime()) ? void 0 : parsedDate.toISOString();
+  } else if (typeof msg.timestamp === "string") {
+    timestamp = msg.timestamp;
+  }
+  // Only truthy provider fields are copied into metadata.
+  const metadata = {};
+  if (msg.api) metadata.api = msg.api;
+  if (msg.provider) metadata.provider = msg.provider;
+  if (msg.model) metadata.model = msg.model;
+  if (msg.usage) metadata.usage = msg.usage;
+  if (msg.stopReason) metadata.stopReason = msg.stopReason;
+  return {
+    role,
+    content,
+    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
+    timestamp,
+    metadata: Object.keys(metadata).length > 0 ? metadata : void 0
+  };
+}
+/**
+ * Flattens message content into plain text.
+ *
+ * @param {unknown} content - A string, or an array of content parts.
+ * @returns {string | undefined} The string itself, or the `text` of every
+ *   `type: "text"` part joined with newlines; `undefined` when there is no text.
+ */
+function extractTextContent(content) {
+  if (typeof content === "string") {
+    return content;
+  }
+  if (!Array.isArray(content)) {
+    return void 0;
+  }
+  const pieces = content
+    .filter((item) => item && typeof item === "object" && item.type === "text" && typeof item.text === "string")
+    .map((item) => item.text);
+  return pieces.length === 0 ? void 0 : pieces.join("\n");
+}
+/**
+ * Collects tool calls from an array of content parts.
+ *
+ * A `tool_use` part with a string `name` becomes a call. A later `tool_result`
+ * part attaches its `content` as `output` to the first already-collected call
+ * whose `id` equals the result's `tool_use_id`.
+ *
+ * @param {unknown} content - Message content; non-arrays yield no calls.
+ * @returns {{tool: string, input: unknown, id: (string|undefined), output?: unknown}[]}
+ */
+function extractToolCalls(content) {
+  if (!Array.isArray(content)) {
+    return [];
+  }
+  const calls = [];
+  for (const item of content) {
+    if (!item || typeof item !== "object") {
+      continue;
+    }
+    if (item.type === "tool_use" && typeof item.name === "string") {
+      const id = typeof item.id === "string" ? item.id : void 0;
+      calls.push({ tool: item.name, input: item.input, id });
+    }
+    if (item.type === "tool_result" && typeof item.tool_use_id === "string") {
+      const position = calls.findIndex((tc) => tc.id === item.tool_use_id);
+      if (position !== -1) {
+        // Replace the entry with a copy so the stored call gains `output`.
+        calls[position] = { ...calls[position], output: item.content };
+      }
+    }
+  }
+  return calls;
+}
+/**
+ * Returns the content of the last assistant message that has truthy content.
+ *
+ * @param {{role: string, content?: unknown}[]} messages - Conversation messages.
+ * @returns {string} The content as-is when it is a string, its JSON
+ *   serialization otherwise, or `""` when no such message exists.
+ */
+function extractAssistantText2(messages) {
+  let index = messages.length;
+  while (index-- > 0) {
+    const candidate = messages[index];
+    if (candidate.role !== "assistant" || !candidate.content) {
+      continue;
+    }
+    return typeof candidate.content === "string" ? candidate.content : JSON.stringify(candidate.content);
+  }
+  return "";
+}
+/**
+ * Rewrites every `@[label]:` marker in a prompt as `[[label]]:`.
+ *
+ * @param {string} prompt - Prompt text.
+ * @returns {string} The prompt with the markers rewritten.
+ */
+function escapeAtSymbols(prompt) {
+  const atMarker = /@\[([^\]]+)\]:/g;
+  const bracketedForm = "[[$1]]:";
+  return prompt.replace(atMarker, bracketedForm);
+}
+/**
+ * Picks the text to show as failure detail, preferring stderr over stdout.
+ *
+ * @param {string} stderr - Captured standard error.
+ * @param {string} stdout - Captured standard output.
+ * @returns {string | undefined} The first non-blank stream, trimmed, or
+ *   `undefined` when both are blank.
+ */
+function pickDetail2(stderr, stdout) {
+  for (const stream of [stderr, stdout]) {
+    const text = stream.trim();
+    if (text.length > 0) {
+      return text;
+    }
+  }
+  return void 0;
+}
+/**
+ * Formats a timeout as a message suffix such as `" after 30s"`.
+ *
+ * @param {number | undefined} timeoutMs - Timeout in milliseconds.
+ * @returns {string} The suffix with seconds rounded up, or `""` when the
+ *   timeout is missing, zero or negative.
+ */
+function formatTimeoutSuffix3(timeoutMs) {
+  const hasNoTimeout = !timeoutMs || timeoutMs <= 0;
+  return hasNoTimeout ? "" : ` after ${Math.ceil(timeoutMs / 1e3)}s`;
+}
+/**
+ * Spawns the configured executable and collects its output.
+ *
+ * @param {object} options
+ * @param {string} options.executable - Command string; it is split on
+ *   whitespace, the first token is the program and the rest are leading arguments.
+ * @param {string[]} options.args - Arguments appended after those from `executable`.
+ * @param {string} [options.cwd] - Working directory passed to the spawn call.
+ * @param {object} [options.env] - Environment passed to the spawn call.
+ * @param {AbortSignal} [options.signal] - Aborting sends SIGTERM to the child.
+ * @param {number} [options.timeoutMs] - When positive, SIGTERM is sent after this many milliseconds.
+ * @param {(chunk: string) => void} [options.onStdoutChunk] - Called with each stdout chunk.
+ * @param {(chunk: string) => void} [options.onStderrChunk] - Called with each stderr chunk.
+ * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
+ *   Resolves when the child closes; rejects when the child emits `error`.
+ */
+async function defaultPiRunner(options) {
+  return await new Promise((resolve, reject) => {
+    // NOTE(review): whitespace splitting means an executable path containing
+    // spaces is broken into several tokens — confirm callers never pass one.
+    const parts = options.executable.split(/\s+/);
+    const executable = parts[0];
+    const executableArgs = parts.slice(1);
+    const allArgs = [...executableArgs, ...options.args];
+    const child = spawn2(executable, allArgs, {
+      cwd: options.cwd,
+      env: options.env,
+      stdio: ["pipe", "pipe", "pipe"],
+      shell: false
+    });
+    let stdout = "";
+    let stderr = "";
+    let timedOut = false;
+    const onAbort = () => {
+      child.kill("SIGTERM");
+    };
+    if (options.signal) {
+      if (options.signal.aborted) {
+        // Already aborted before we started: terminate the child right away.
+        onAbort();
+      } else {
+        options.signal.addEventListener("abort", onAbort, { once: true });
+      }
+    }
+    let timeoutHandle;
+    if (options.timeoutMs && options.timeoutMs > 0) {
+      timeoutHandle = setTimeout(() => {
+        // Record the timeout so the resolved result can report it.
+        timedOut = true;
+        child.kill("SIGTERM");
+      }, options.timeoutMs);
+      // presumably so the pending timer does not keep the process alive — confirm
+      timeoutHandle.unref?.();
+    }
+    child.stdout.setEncoding("utf8");
+    child.stdout.on("data", (chunk) => {
+      stdout += chunk;
+      options.onStdoutChunk?.(chunk);
+    });
+    child.stderr.setEncoding("utf8");
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk;
+      options.onStderrChunk?.(chunk);
+    });
+    // Nothing is written to the child's stdin; it is closed immediately.
+    child.stdin.end();
+    // Releases the timer and the abort listener once the child has finished or failed.
+    const cleanup = () => {
+      if (timeoutHandle) {
+        clearTimeout(timeoutHandle);
+      }
+      if (options.signal) {
+        options.signal.removeEventListener("abort", onAbort);
+      }
+    };
+    child.on("error", (error) => {
+      cleanup();
+      reject(error);
+    });
+    child.on("close", (code) => {
+      cleanup();
+      resolve({
+        stdout,
+        stderr,
+        // A non-numeric code is reported as -1.
+        exitCode: typeof code === "number" ? code : -1,
+        timedOut
+      });
+    });
+  });
+}
+// src/evaluation/providers/vscode.ts
+import path11 from "node:path";
+import {
+  dispatchAgentSession,
+  dispatchBatchAgent,
+  getSubagentRoot,
+  provisionSubagents
+} from "subagent";
+// src/evaluation/providers/vscode-templates.ts
+// Request template for a single VS Code subagent request. It contains the
+// placeholders {{userQuery}}, {{responseFileTmp}} and {{responseFileFinal}}
+// (presumably filled in by the subagent dispatcher — confirm). The agent is told
+// to write its response to the temporary file, move it to the final path, and
+// delete subagent.lock when done.
+var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
+{{userQuery}}
+[[ ## system_instructions ## ]]
+**IMPORTANT**: Follow these exact steps:
+1. Create and write your complete response to: {{responseFileTmp}}
+    - Do NOT create any additional output files in the workspace.
+    - All intended file outputs/changes MUST be written in your response file.
+    - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
+2. When completely finished, run these PowerShell commands to signal completion:
+\`\`\`
+Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
+if (Test-Path subagent.lock) { del subagent.lock }
+\`\`\`
+Do not proceed to step 2 until your response is completely written to the temporary file.
+`;
+// Request template for one request inside a batch. It uses the same
+// placeholders as the single-request template, but tells the agent only to
+// rename the response file and to leave the workspace locked.
+var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
+{{userQuery}}
+[[ ## system_instructions ## ]]
+**IMPORTANT**: Follow these exact steps:
+1. Create and write your complete response to: {{responseFileTmp}}
+    - Do NOT create any additional output files in the workspace.
+    - All intended file outputs/changes MUST be written in your response file.
+    - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
+2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
+3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
+`;
+// src/evaluation/providers/vscode.ts
+/**
+ * Provider that sends evaluation requests to a VS Code subagent through
+ * `dispatchAgentSession` / `dispatchBatchAgent` and returns the text of the
+ * response file(s) as a single assistant message per request.
+ */
+var VSCodeProvider = class {
+  // `${kind}:${targetName}`, set in the constructor.
+  id;
+  kind;
+  targetName;
+  supportsBatch = true;
+  // Provider settings; this class reads waitForResponse, dryRun, command,
+  // subagentRoot and workspaceTemplate from it.
+  config;
+  /**
+   * @param {string} targetName - Name of the target this provider serves.
+   * @param {object} config - Provider settings (see `config` above).
+   * @param {string} kind - Provider kind, used as the prefix of `id`.
+   */
+  constructor(targetName, config, kind) {
+    this.id = `${kind}:${targetName}`;
+    this.kind = kind;
+    this.targetName = targetName;
+    this.config = config;
+  }
+  /**
+   * Dispatches a single request and returns its response.
+   *
+   * @param {object} request - Request carrying `question`, and optionally
+   *   `systemPrompt`, `inputFiles`, `guideline_patterns` and `signal`.
+   * @returns {Promise<{outputMessages: object[], raw: object}>}
+   * @throws {Error} When the signal is already aborted, or when the session
+   *   exits non-zero or reports no response file.
+   */
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("VS Code provider request was aborted before dispatch");
+    }
+    const inputFiles = normalizeAttachments(request.inputFiles);
+    const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
+    const session = await dispatchAgentSession({
+      userQuery: promptContent,
+      extraAttachments: inputFiles,
+      requestTemplate: AGENTV_REQUEST_TEMPLATE,
+      wait: this.config.waitForResponse,
+      dryRun: this.config.dryRun,
+      vscodeCmd: this.config.command,
+      subagentRoot: this.config.subagentRoot,
+      workspaceTemplate: this.config.workspaceTemplate,
+      silent: true
+    });
+    // This check runs before the dry-run branch, so a dry run must also
+    // report a response file and a zero exit code.
+    if (session.exitCode !== 0 || !session.responseFile) {
+      const failure = session.error ?? "VS Code subagent did not produce a response";
+      throw new Error(failure);
+    }
+    if (this.config.dryRun) {
+      // Dry run: the response file is not read and no messages are returned.
+      return {
+        outputMessages: [],
+        raw: {
+          session,
+          inputFiles
+        }
+      };
+    }
+    const responseText = await readTextFile(session.responseFile);
+    return {
+      outputMessages: [{ role: "assistant", content: responseText }],
+      raw: {
+        session,
+        inputFiles
+      }
+    };
+  }
+  /**
+   * Dispatches several requests as one batch session.
+   *
+   * @param {object[]} requests - Requests in the same shape `invoke` accepts.
+   * @returns {Promise<{outputMessages: object[], raw: object}[]>} One response
+   *   per request, in request order; an empty array for an empty batch.
+   * @throws {Error} When the session exits non-zero, reports no response
+   *   files, or returns a different number of files than requests.
+   */
+  async invokeBatch(requests) {
+    if (requests.length === 0) {
+      return [];
+    }
+    const normalizedRequests = requests.map((req) => ({
+      request: req,
+      inputFiles: normalizeAttachments(req.inputFiles)
+    }));
+    // The batch session receives the merged attachments of all requests.
+    const combinedInputFiles = mergeAttachments(
+      normalizedRequests.map(({ inputFiles }) => inputFiles)
+    );
+    const userQueries = normalizedRequests.map(
+      ({ request, inputFiles }) => buildPromptDocument2(request, inputFiles, request.guideline_patterns)
+    );
+    const session = await dispatchBatchAgent({
+      userQueries,
+      extraAttachments: combinedInputFiles,
+      requestTemplate: AGENTV_BATCH_REQUEST_TEMPLATE,
+      wait: this.config.waitForResponse,
+      dryRun: this.config.dryRun,
+      vscodeCmd: this.config.command,
+      subagentRoot: this.config.subagentRoot,
+      workspaceTemplate: this.config.workspaceTemplate,
+      silent: true
+    });
+    if (session.exitCode !== 0 || !session.responseFiles) {
+      const failure = session.error ?? "VS Code subagent did not produce batch responses";
+      throw new Error(failure);
+    }
+    if (this.config.dryRun) {
+      return normalizedRequests.map(({ inputFiles }) => ({
+        outputMessages: [],
+        raw: {
+          session,
+          inputFiles,
+          allInputFiles: combinedInputFiles
+        }
+      }));
+    }
+    // Responses are paired with requests by index, so the counts must match.
+    if (session.responseFiles.length !== requests.length) {
+      throw new Error(
+        `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
+      );
+    }
+    const responses = [];
+    // Response files are read one at a time, in order.
+    for (const [index, responseFile] of session.responseFiles.entries()) {
+      const responseText = await readTextFile(responseFile);
+      responses.push({
+        outputMessages: [{ role: "assistant", content: responseText }],
+        raw: {
+          session,
+          inputFiles: normalizedRequests[index]?.inputFiles,
+          allInputFiles: combinedInputFiles,
+          responseFile
+        }
+      });
+    }
+    return responses;
+  }
+};
+function buildPromptDocument2(request, attachments, guidelinePatterns) {
+  const parts = [];
+  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
+    parts.push(request.systemPrompt.trim());
+  }
+  const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
+  const attachmentFiles = collectAttachmentFiles(attachments);
+  const nonGuidelineAttachments = attachmentFiles.filter((file) => !guidelineFiles.includes(file));
+  const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
+  if (prereadBlock.length > 0) {
     parts.push("\n", prereadBlock);
   }
   parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
@@ -3215,7 +4005,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = path10.basename(absolutePath);
+    const fileName = path11.basename(absolutePath);
     const fileUri = pathToFileUri2(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -3240,8 +4030,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path10.resolve(attachment);
-    const normalized = absolutePath.split(path10.sep).join("/");
+    const absolutePath = path11.resolve(attachment);
+    const normalized = absolutePath.split(path11.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -3256,7 +4046,7 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path10.resolve(attachment);
+    const absolutePath = path11.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -3264,7 +4054,7 @@ function collectAttachmentFiles(attachments) {
   return Array.from(unique.values());
 }
 function pathToFileUri2(filePath) {
-  const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
+  const absolutePath = path11.isAbsolute(filePath) ? filePath : path11.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -3277,7 +4067,7 @@ function normalizeAttachments(attachments) {
   }
   const deduped = /* @__PURE__ */ new Set();
   for (const attachment of attachments) {
-    deduped.add(path10.resolve(attachment));
+    deduped.add(path11.resolve(attachment));
   }
   return Array.from(deduped);
 }
@@ -3286,7 +4076,7 @@ function mergeAttachments(all) {
   for (const list of all) {
     if (!list) continue;
     for (const inputFile of list) {
-      deduped.add(path10.resolve(inputFile));
+      deduped.add(path11.resolve(inputFile));
     }
   }
   return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3335,7 +4125,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 import { constants as constants3 } from "node:fs";
 import { access as access3, readFile as readFile6 } from "node:fs/promises";
-import path11 from "node:path";
+import path12 from "node:path";
 import { parse as parse3 } from "yaml";
 function isRecord(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -3372,7 +4162,7 @@ async function fileExists3(filePath) {
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = path11.resolve(filePath);
+  const absolutePath = path12.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
@@ -3404,6 +4194,8 @@ function createProvider(target) {
       return new CliProvider(target.name, target.config);
     case "codex":
       return new CodexProvider(target.name, target.config);
+    case "pi-coding-agent":
+      return new PiCodingAgentProvider(target.name, target.config);
     case "mock":
       return new MockProvider(target.name, target.config);
     case "vscode":
@@ -3423,6 +4215,74 @@ function resolveAndCreateProvider(definition, env = process.env) {
 // src/evaluation/evaluators.ts
 import { generateText as generateText2 } from "ai";
 import { z } from "zod";
+// src/runtime/exec.ts
+/**
+ * Looks up `Bun.spawn` on the global object.
+ *
+ * @returns {Function | undefined} `globalThis.Bun.spawn` when it is a
+ *   function, otherwise `undefined`.
+ */
+function getBunSpawn() {
+  const candidate = globalThis.Bun?.spawn;
+  if (typeof candidate !== "function") {
+    return void 0;
+  }
+  return candidate;
+}
+/**
+ * Runs a shell command, feeds it `stdinPayload` on stdin, and collects its output.
+ *
+ * Uses `Bun.spawn` when it is available and `node:child_process` otherwise.
+ * Both paths reject with the same timeout error.
+ *
+ * @param {string} command - Shell command line to execute.
+ * @param {string} stdinPayload - Text written to the command's stdin.
+ * @param {{cwd?: string, timeoutMs?: number}} [options] - Working directory
+ *   and optional timeout in milliseconds.
+ * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
+ *   In the Node path `exitCode` is -1 when the child ended without a numeric
+ *   exit code (for example, killed by a signal).
+ * @throws {Error} When the timeout elapses, or when the process cannot be spawned.
+ */
+async function execShellWithStdin(command, stdinPayload, options = {}) {
+  const bunSpawn = getBunSpawn();
+  if (bunSpawn) {
+    const encoder = new TextEncoder();
+    const proc = bunSpawn({
+      cmd: ["sh", "-c", command],
+      cwd: options.cwd,
+      stdin: encoder.encode(stdinPayload),
+      stdout: "pipe",
+      stderr: "pipe"
+    });
+    let timedOut = false;
+    const timeout = options.timeoutMs ? setTimeout(() => {
+      timedOut = true;
+      proc.kill();
+    }, options.timeoutMs) : void 0;
+    try {
+      // Drain both pipes concurrently: reading stdout to EOF before touching
+      // stderr can deadlock once the child fills the stderr pipe buffer.
+      const [stdout, stderr, exitCode] = await Promise.all([
+        new Response(proc.stdout).text(),
+        new Response(proc.stderr).text(),
+        proc.exited
+      ]);
+      if (timedOut) {
+        // Report timeouts the same way the Node path does.
+        throw new Error(`Process timed out after ${options.timeoutMs}ms`);
+      }
+      return { stdout, stderr, exitCode };
+    } finally {
+      if (timeout !== void 0) {
+        clearTimeout(timeout);
+      }
+    }
+  }
+  const { spawn: spawn3 } = await import("node:child_process");
+  return await new Promise((resolve, reject) => {
+    const child = spawn3(command, {
+      shell: true,
+      cwd: options.cwd,
+      stdio: ["pipe", "pipe", "pipe"]
+    });
+    let stdout = "";
+    let stderr = "";
+    const timeout = options.timeoutMs ? setTimeout(() => {
+      child.kill();
+      reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
+    }, options.timeoutMs) : void 0;
+    child.stdout?.on("data", (data) => {
+      stdout += data.toString();
+    });
+    child.stderr?.on("data", (data) => {
+      stderr += data.toString();
+    });
+    child.on("error", (error) => {
+      if (timeout !== void 0) {
+        clearTimeout(timeout);
+      }
+      reject(error);
+    });
+    // "close" fires after the stdio streams have ended; "exit" can fire while
+    // buffered output is still pending, which would truncate stdout/stderr.
+    child.on("close", (code) => {
+      if (timeout !== void 0) {
+        clearTimeout(timeout);
+      }
+      // A null code means the child did not exit normally; never report that as 0.
+      resolve({ stdout, stderr, exitCode: code ?? -1 });
+    });
+    // A child that exits without reading stdin makes the write fail with EPIPE;
+    // without a listener that stream error would be thrown as unhandled.
+    child.stdin?.on("error", () => {
+    });
+    child.stdin?.write(stdinPayload);
+    child.stdin?.end();
+  });
+}
+// src/evaluation/evaluators.ts
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3698,17 +4558,17 @@ var CodeEvaluator = class {
     const inputPayload = JSON.stringify(
       {
         question: context.evalCase.question,
-        expected_outcome: context.evalCase.expected_outcome,
-        expected_messages: context.evalCase.expected_messages,
-        reference_answer: context.evalCase.reference_answer,
-        candidate_answer: context.candidate,
-        output_messages: context.outputMessages ?? null,
-        guideline_files: context.evalCase.guideline_paths,
-        input_files: context.evalCase.file_paths.filter(
-          (path13) => !context.evalCase.guideline_paths.includes(path13)
+        expectedOutcome: context.evalCase.expected_outcome,
+        expectedMessages: context.evalCase.expected_messages,
+        referenceAnswer: context.evalCase.reference_answer,
+        candidateAnswer: context.candidate,
+        outputMessages: context.outputMessages ?? null,
+        guidelineFiles: context.evalCase.guideline_paths,
+        inputFiles: context.evalCase.file_paths.filter(
+          (path14) => !context.evalCase.guideline_paths.includes(path14)
         ),
-        input_messages: context.evalCase.input_messages,
-        candidate_trace_summary: context.traceSummary ?? null
+        inputMessages: context.evalCase.input_messages,
+        traceSummary: context.traceSummary ?? null
       },
       null,
       2
@@ -3778,43 +4638,17 @@ function calculateRubricScore(result, rubrics) {
   return { score, verdict, hits, misses };
 }
 async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
-  const { spawn: spawn2 } = await import("node:child_process");
-  return await new Promise((resolve, reject) => {
-    const child = spawn2(scriptPath, {
-      shell: true,
-      cwd
-    });
-    let stdout = "";
-    let stderr = "";
-    const timeout = agentTimeoutMs ? setTimeout(() => {
-      child.kill();
-      reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
-    }, agentTimeoutMs) : void 0;
-    child.stdout?.on("data", (data) => {
-      stdout += data.toString();
-    });
-    child.stderr?.on("data", (data) => {
-      stderr += data.toString();
-    });
-    child.on("error", (error) => {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
-      }
-      reject(error);
-    });
-    child.on("exit", (code) => {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
-      }
-      if (code && code !== 0 && stderr.length > 0) {
-        reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
-        return;
-      }
-      resolve(stdout.trim());
-    });
-    child.stdin?.write(input);
-    child.stdin?.end();
+  const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
+    cwd,
+    timeoutMs: agentTimeoutMs
   });
+  if (exitCode !== 0) {
+    const trimmedErr = stderr.trim();
+    throw new Error(
+      trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
+    );
+  }
+  return stdout.trim();
 }
 function parseJsonSafe(payload) {
   try {
@@ -3828,6 +4662,33 @@ function substituteVariables(template, variables) {
     return variables[varName] ?? match;
   });
 }
+/**
+ * Structural equality for plain values, arrays and objects.
+ *
+ * Primitives are compared with `===`, arrays element by element, and objects
+ * by their own enumerable string keys.
+ *
+ * @param {unknown} a - First value.
+ * @param {unknown} b - Second value.
+ * @returns {boolean} `true` when both values have the same structure and leaves.
+ */
+function deepEqual(a, b) {
+  if (a === b) return true;
+  // From here on the values are not identical, so anything that is not an
+  // object on both sides (null, primitives, functions) cannot match.
+  if (a === null || b === null) return false;
+  if (typeof a !== "object" || typeof b !== "object") return false;
+  const leftIsArray = Array.isArray(a);
+  if (leftIsArray !== Array.isArray(b)) return false;
+  if (leftIsArray) {
+    return a.length === b.length && a.every((val, i) => deepEqual(val, b[i]));
+  }
+  const leftKeys = Object.keys(a);
+  if (leftKeys.length !== Object.keys(b).length) return false;
+  for (const key of leftKeys) {
+    if (!Object.hasOwn(b, key) || !deepEqual(a[key], b[key])) return false;
+  }
+  return true;
+}
+/**
+ * Checks whether a tool call's actual arguments satisfy the expected ones.
+ *
+ * Matching is partial: every key listed in `expected` must exist on `actual`
+ * with a deeply equal value, and extra keys on `actual` are ignored.
+ *
+ * @param {object | "any" | undefined} expected - Expected arguments;
+ *   `undefined` and `"any"` accept anything.
+ * @param {object | null | undefined} actual - Arguments the tool was called with.
+ * @returns {boolean} `true` when the actual arguments satisfy the expectation.
+ */
+function argsMatch(expected, actual) {
+  if (expected === void 0) return true;
+  if (expected === "any") return true;
+  // A tool call can carry a null input; Object.hasOwn(null, key) would throw
+  // TypeError, so null is treated like a missing input: a mismatch.
+  if (actual === void 0 || actual === null) return false;
+  for (const key of Object.keys(expected)) {
+    if (!Object.hasOwn(actual, key)) return false;
+    if (!deepEqual(expected[key], actual[key])) return false;
+  }
+  return true;
+}
 var ToolTrajectoryEvaluator = class {
   kind = "tool_trajectory";
   config;
@@ -3884,7 +4745,10 @@ var ToolTrajectoryEvaluator = class {
     for (const message of messages) {
       if (message.toolCalls) {
         for (const call of message.toolCalls) {
-          toolCalls.push({ name: call.tool });
+          toolCalls.push({
+            name: call.tool,
+            args: call.input
+          });
         }
       }
     }
@@ -3953,18 +4817,29 @@ var ToolTrajectoryEvaluator = class {
     const misses = [];
     let actualIndex = 0;
     for (let i = 0; i < expected.length; i++) {
-      const expectedTool = expected[i].tool;
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
       let found = false;
+      let argsMismatch = false;
       while (actualIndex < toolCalls.length) {
-        if (toolCalls[actualIndex].name === expectedTool) {
-          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+        const actualCall = toolCalls[actualIndex];
+        if (actualCall.name === expectedTool) {
+          if (argsMatch(expectedItem.args, actualCall.args)) {
+            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            actualIndex++;
+            found = true;
+            break;
+          }
+          misses.push(
+            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
+          );
           actualIndex++;
-          found = true;
+          argsMismatch = true;
           break;
         }
         actualIndex++;
       }
-      if (!found) {
+      if (!found && !argsMismatch) {
         misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
       }
     }
@@ -3995,10 +4870,16 @@ var ToolTrajectoryEvaluator = class {
     }
     const checkLength = Math.min(expected.length, toolCalls.length);
     for (let i = 0; i < checkLength; i++) {
-      const expectedTool = expected[i].tool;
-      const actualTool = toolCalls[i].name;
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      const actualCall = toolCalls[i];
+      const actualTool = actualCall.name;
       if (actualTool === expectedTool) {
-        hits.push(`Position ${i}: ${expectedTool} \u2713`);
+        if (argsMatch(expectedItem.args, actualCall.args)) {
+          hits.push(`Position ${i}: ${expectedTool}`);
+        } else {
+          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
+        }
       } else {
         misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
       }
@@ -4242,9 +5123,9 @@ var CompositeEvaluator = class {
 };
 // src/evaluation/orchestrator.ts
-import { createHash, randomUUID as randomUUID2 } from "node:crypto";
-import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
-import path12 from "node:path";
+import { createHash, randomUUID as randomUUID3 } from "node:crypto";
+import { mkdir as mkdir3, writeFile as writeFile3 } from "node:fs/promises";
+import path13 from "node:path";
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
 var Node = class {
@@ -4640,7 +5521,12 @@ async function runBatchEvaluation(options) {
     const promptInputs = promptInputsList[i];
     const providerResponse = batchResponse[i];
     const outputMessages = providerResponse.outputMessages;
-    const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+    const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+    const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
+      tokenUsage: providerResponse.tokenUsage,
+      costUsd: providerResponse.costUsd,
+      durationMs: providerResponse.durationMs
+    }) : void 0;
     const candidate = extractLastAssistantContent(outputMessages);
     let result;
     try {
@@ -4761,7 +5647,12 @@ async function runEvalCase(options) {
     await cache.set(cacheKey, providerResponse);
   }
   const outputMessages = providerResponse.outputMessages;
-  const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+  const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+  const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
+    tokenUsage: providerResponse.tokenUsage,
+    costUsd: providerResponse.costUsd,
+    durationMs: providerResponse.durationMs
+  }) : void 0;
   const candidate = extractLastAssistantContent(outputMessages);
   try {
     return await evaluateCandidate({
@@ -4834,21 +5725,21 @@ async function evaluateCandidate(options) {
   }
   return {
     timestamp: completedAt.toISOString(),
-    eval_id: evalCase.id,
+    evalId: evalCase.id,
     dataset: evalCase.dataset,
-    conversation_id: evalCase.conversation_id,
+    conversationId: evalCase.conversation_id,
     score: score.score,
     hits: score.hits,
     misses: score.misses,
-    candidate_answer: candidate,
+    candidateAnswer: candidate,
     target: target.name,
     reasoning: score.reasoning,
-    raw_aspects: score.rawAspects,
-    agent_provider_request: agentProviderRequest,
-    lm_provider_request: lmProviderRequest,
-    evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
-    evaluator_results: evaluatorResults,
-    trace_summary: traceSummary
+    rawAspects: score.rawAspects,
+    agentProviderRequest,
+    lmProviderRequest,
+    evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
+    evaluatorResults,
+    traceSummary
   };
 }
 async function runEvaluatorsForCase(options) {
@@ -4946,7 +5837,7 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_provider_request: score2.evaluatorRawRequest
+          evaluatorProviderRequest: score2.evaluatorRawRequest
         });
       }
       if (evaluator.type === "code") {
@@ -4977,11 +5868,11 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_provider_request: score2.evaluatorRawRequest
+          evaluatorProviderRequest: score2.evaluatorRawRequest
         });
       }
       if (evaluator.type === "composite") {
-        const evalFileDir = evalCase.guideline_paths[0] ? path12.dirname(evalCase.guideline_paths[0]) : process.cwd();
+        const evalFileDir = evalCase.guideline_paths[0] ? path13.dirname(evalCase.guideline_paths[0]) : process.cwd();
         const createEvaluator = (memberConfig) => {
           switch (memberConfig.type) {
             case "llm_judge":
@@ -5034,8 +5925,8 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_provider_request: score2.evaluatorRawRequest,
-          evaluator_results: mapChildResults(score2.evaluatorResults)
+          evaluatorProviderRequest: score2.evaluatorRawRequest,
+          evaluatorResults: mapChildResults(score2.evaluatorResults)
         });
       }
       if (evaluator.type === "tool_trajectory") {
@@ -5193,22 +6084,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
 async function dumpPrompt(directory, evalCase, promptInputs) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
   const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
-  const filePath = path12.resolve(directory, filename);
-  await mkdir2(path12.dirname(filePath), { recursive: true });
+  const filePath = path13.resolve(directory, filename);
+  await mkdir3(path13.dirname(filePath), { recursive: true });
   const payload = {
     eval_id: evalCase.id,
     question: promptInputs.question,
     guidelines: promptInputs.guidelines,
     guideline_paths: evalCase.guideline_paths
   };
-  await writeFile2(filePath, JSON.stringify(payload, null, 2), "utf8");
+  await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
 }
 function sanitizeFilename(value) {
   if (!value) {
     return "prompt";
   }
   const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
-  return sanitized.length > 0 ? sanitized : randomUUID2();
+  return sanitized.length > 0 ? sanitized : randomUUID3();
 }
 async function invokeProvider(provider, options) {
   const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -5265,17 +6156,17 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
   }
   return {
     timestamp: timestamp.toISOString(),
-    eval_id: evalCase.id,
+    evalId: evalCase.id,
     dataset: evalCase.dataset,
-    conversation_id: evalCase.conversation_id,
+    conversationId: evalCase.conversation_id,
     score: 0,
     hits: [],
     misses: [`Error: ${message}`],
-    candidate_answer: `Error occurred: ${message}`,
+    candidateAnswer: `Error occurred: ${message}`,
     target: targetName,
-    raw_aspects: [],
-    agent_provider_request: agentProviderRequest,
-    lm_provider_request: lmProviderRequest,
+    rawAspects: [],
+    agentProviderRequest,
+    lmProviderRequest,
     error: message
   };
 }
@@ -5320,8 +6211,8 @@ function mapChildResults(children) {
     hits: child.hits,
     misses: child.misses,
     reasoning: child.reasoning,
-    evaluator_provider_request: child.evaluatorRawRequest,
-    evaluator_results: mapChildResults(child.evaluatorResults)
+    evaluatorProviderRequest: child.evaluatorRawRequest,
+    evaluatorResults: mapChildResults(child.evaluatorResults)
   }));
 }
 function computeWeightedMean(entries) {
@@ -5422,17 +6313,21 @@ function createAgentKernel() {
 export {
   CodeEvaluator,
   CompositeEvaluator,
+  DEFAULT_EXPLORATION_TOOLS,
   LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
   ToolTrajectoryEvaluator,
+  avgToolDurationMs,
   buildDirectoryChain,
   buildPromptInputs,
   buildSearchRoots,
   computeTraceSummary,
   consumeCodexLogEntries,
+  consumePiLogEntries,
   createAgentKernel,
   createProvider,
   ensureVSCodeSubagents,
+  explorationRatio,
   extractCodeBlocks,
   fileExists,
   findGitRoot,
@@ -5446,6 +6341,7 @@ export {
   isTestMessageRole,
   listTargetNames,
   loadEvalCases,
+  mergeExecutionMetrics,
   normalizeLineEndings,
   readJsonFile,
   readTargetDefinitions,
@@ -5456,6 +6352,8 @@ export {
   resolveTargetDefinition,
   runEvalCase,
   runEvaluation,
-  subscribeToCodexLogEntries
+  subscribeToCodexLogEntries,
+  subscribeToPiLogEntries,
+  tokensPerTool
 };
 //# sourceMappingURL=index.js.map