npm - @agentv/core - Versions diffs - 4.14.0-next.1 → 4.15.0-next.1 - Mend

@agentv/core 4.14.0-next.1 → 4.15.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/dist/{chunk-A3HYVKTI.js → chunk-AOOU6PLC.js} +70 -2
package/dist/chunk-AOOU6PLC.js.map +1 -0
package/dist/evaluation/validation/index.cjs +89 -11
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +90 -12
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +774 -189
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +68 -14
package/dist/index.d.ts +68 -14
package/dist/index.js +705 -189
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-A3HYVKTI.js.map +0 -1

package/dist/index.cjs CHANGED Viewed

@@ -5856,10 +5856,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
     const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
     const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
     const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
-    const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
+    const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0 || Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0;
     if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
       logError3(
-        `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
+        `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`
       );
       continue;
     }
@@ -5936,6 +5936,14 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
     ) : void 0;
     const onDependencyFailureRaw = asString5(testCaseConfig.on_dependency_failure);
     const onDependencyFailure = onDependencyFailureRaw === "skip" || onDependencyFailureRaw === "fail" || onDependencyFailureRaw === "run" ? onDependencyFailureRaw : void 0;
+    const modeRaw = asString5(testCaseConfig.mode);
+    const mode = modeRaw === "conversation" ? "conversation" : void 0;
+    const turns = Array.isArray(testCaseConfig.turns) ? parseTurns(testCaseConfig.turns) : void 0;
+    const aggregationRaw = asString5(testCaseConfig.aggregation);
+    const aggregation = aggregationRaw === "mean" || aggregationRaw === "min" || aggregationRaw === "max" ? aggregationRaw : void 0;
+    const onTurnFailureRaw = asString5(testCaseConfig.on_turn_failure);
+    const onTurnFailure = onTurnFailureRaw === "continue" || onTurnFailureRaw === "stop" ? onTurnFailureRaw : void 0;
+    const windowSize = typeof testCaseConfig.window_size === "number" && testCaseConfig.window_size >= 1 ? testCaseConfig.window_size : void 0;
     const testCase = {
       id,
       suite: suiteName,
@@ -5954,6 +5962,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
       metadata,
       targets: caseTargets,
       ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {},
+      ...mode ? { mode } : {},
+      ...turns && turns.length > 0 ? { turns } : {},
+      ...aggregation ? { aggregation } : {},
+      ...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
+      ...windowSize !== void 0 ? { window_size: windowSize } : {},
       ...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
       ...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
     };
@@ -5971,6 +5984,25 @@ async function loadTestById(evalFilePath, repoRoot, evalId) {
   return match;
 }
 var loadEvalCaseById = loadTestById;
+function parseTurns(rawTurns) {
+  return rawTurns.map((rawTurn) => {
+    const turn = rawTurn;
+    const input = turn.input;
+    const expectedOutput = turn.expected_output;
+    let assertions;
+    if (Array.isArray(turn.assertions)) {
+      assertions = turn.assertions.map((a) => {
+        if (typeof a === "string") return a;
+        return a;
+      });
+    }
+    return {
+      input,
+      ...expectedOutput !== void 0 ? { expected_output: expectedOutput } : {},
+      ...assertions && assertions.length > 0 ? { assertions } : {}
+    };
+  });
+}
 function parseCommandArray(source) {
   if (typeof source === "string") {
     const parts = source.trim().split(/\s+/);
@@ -7053,6 +7085,155 @@ function subscribeToClaudeLogEntries(listener) {
   };
 }
+// src/evaluation/providers/normalize-tool-call.ts
+init_cjs_shims();
+var TOOL_NAME_MAP = /* @__PURE__ */ new Map([
+  // --- Claude (already canonical) ---
+  ["claude::Skill", "Skill"],
+  ["claude::Read", "Read"],
+  ["claude::Write", "Write"],
+  ["claude::Edit", "Edit"],
+  ["claude::Bash", "Bash"],
+  ["claude-cli::Skill", "Skill"],
+  ["claude-cli::Read", "Read"],
+  ["claude-cli::Write", "Write"],
+  ["claude-cli::Edit", "Edit"],
+  ["claude-cli::Bash", "Bash"],
+  ["claude-sdk::Skill", "Skill"],
+  ["claude-sdk::Read", "Read"],
+  ["claude-sdk::Write", "Write"],
+  ["claude-sdk::Edit", "Edit"],
+  ["claude-sdk::Bash", "Bash"],
+  // --- Copilot ---
+  ["copilot-cli::Skill", "Skill"],
+  ["copilot-cli::skill", "Skill"],
+  ["copilot-cli::Read File", "Read"],
+  ["copilot-cli::readFile", "Read"],
+  ["copilot-cli::Read", "Read"],
+  ["copilot-cli::readTextFile", "Read"],
+  ["copilot-cli::writeTextFile", "Write"],
+  ["copilot-cli::Write File", "Write"],
+  ["copilot-cli::editFile", "Edit"],
+  ["copilot-cli::Edit File", "Edit"],
+  ["copilot-cli::runTerminalCommand", "Bash"],
+  ["copilot-sdk::Skill", "Skill"],
+  ["copilot-sdk::skill", "Skill"],
+  ["copilot-sdk::Read File", "Read"],
+  ["copilot-sdk::readFile", "Read"],
+  ["copilot-sdk::Read", "Read"],
+  ["copilot-sdk::readTextFile", "Read"],
+  ["copilot-sdk::writeTextFile", "Write"],
+  ["copilot-sdk::Write File", "Write"],
+  ["copilot-sdk::editFile", "Edit"],
+  ["copilot-sdk::Edit File", "Edit"],
+  ["copilot-sdk::runTerminalCommand", "Bash"],
+  ["copilot-log::Skill", "Skill"],
+  ["copilot-log::skill", "Skill"],
+  ["copilot-log::Read File", "Read"],
+  ["copilot-log::readFile", "Read"],
+  ["copilot-log::Read", "Read"],
+  ["copilot-log::readTextFile", "Read"],
+  ["copilot-log::writeTextFile", "Write"],
+  ["copilot-log::Write File", "Write"],
+  ["copilot-log::editFile", "Edit"],
+  ["copilot-log::Edit File", "Edit"],
+  ["copilot-log::runTerminalCommand", "Bash"],
+  ["vscode::Skill", "Skill"],
+  ["vscode::skill", "Skill"],
+  ["vscode::Read File", "Read"],
+  ["vscode::readFile", "Read"],
+  ["vscode::Read", "Read"],
+  ["vscode::readTextFile", "Read"],
+  ["vscode::writeTextFile", "Write"],
+  ["vscode::Write File", "Write"],
+  ["vscode::editFile", "Edit"],
+  ["vscode::Edit File", "Edit"],
+  ["vscode::runTerminalCommand", "Bash"],
+  ["vscode-insiders::Skill", "Skill"],
+  ["vscode-insiders::skill", "Skill"],
+  ["vscode-insiders::Read File", "Read"],
+  ["vscode-insiders::readFile", "Read"],
+  ["vscode-insiders::Read", "Read"],
+  ["vscode-insiders::readTextFile", "Read"],
+  ["vscode-insiders::writeTextFile", "Write"],
+  ["vscode-insiders::Write File", "Write"],
+  ["vscode-insiders::editFile", "Edit"],
+  ["vscode-insiders::Edit File", "Edit"],
+  ["vscode-insiders::runTerminalCommand", "Bash"],
+  // --- Codex ---
+  ["codex::command_execution", "Bash"],
+  ["codex::file_change", "Edit"],
+  // --- Pi ---
+  ["pi-coding-agent::read", "Read"],
+  ["pi-coding-agent::bash", "Bash"],
+  ["pi-cli::read", "Read"],
+  ["pi-cli::bash", "Bash"]
+]);
+var COPILOT_PREFIXES = [
+  { prefix: "Using skill: ", canonical: "Skill", extractSkillFromName: true },
+  { prefix: "Viewing ", canonical: "Read" }
+];
+var CODEX_PREFIXES = [
+  { prefix: "mcp:", canonical: "Skill", extractSkillFromName: true }
+];
+var TOOL_PREFIX_MAP = /* @__PURE__ */ new Map([
+  ["copilot-cli", COPILOT_PREFIXES],
+  ["copilot-sdk", COPILOT_PREFIXES],
+  ["copilot-log", COPILOT_PREFIXES],
+  ["vscode", COPILOT_PREFIXES],
+  ["vscode-insiders", COPILOT_PREFIXES],
+  ["codex", CODEX_PREFIXES]
+]);
+var normalizeSkillInput = (input) => {
+  if (input.skill !== void 0) return input;
+  return input;
+};
+var normalizeReadInput = (input) => {
+  if (input.file_path !== void 0) return input;
+  if (input.path !== void 0) return { ...input, file_path: input.path };
+  if (input.filePath !== void 0) return { ...input, file_path: input.filePath };
+  return input;
+};
+var INPUT_NORMALIZERS = /* @__PURE__ */ new Map([
+  ["Skill", normalizeSkillInput],
+  ["Read", normalizeReadInput]
+]);
+function normalizeToolCall(providerKind, tc) {
+  const nativeName = tc.tool;
+  const exactKey = `${providerKind}::${nativeName}`;
+  const canonical = TOOL_NAME_MAP.get(exactKey);
+  if (canonical) {
+    return applyInputNormalization(canonical, { ...tc, tool: canonical });
+  }
+  const prefixRules = TOOL_PREFIX_MAP.get(providerKind);
+  if (prefixRules) {
+    for (const rule of prefixRules) {
+      if (nativeName.startsWith(rule.prefix)) {
+        const suffix = nativeName.slice(rule.prefix.length);
+        let normalizedInput = tc.input;
+        if (rule.extractSkillFromName && suffix) {
+          const existingInput = tc.input ?? {};
+          normalizedInput = { ...existingInput, skill: suffix };
+        }
+        const normalized = {
+          ...tc,
+          tool: rule.canonical,
+          input: normalizedInput
+        };
+        return applyInputNormalization(rule.canonical, normalized);
+      }
+    }
+  }
+  return tc;
+}
+function applyInputNormalization(canonical, tc) {
+  const normalizer = INPUT_NORMALIZERS.get(canonical);
+  if (!normalizer || tc.input === void 0 || tc.input === null) return tc;
+  const input = tc.input;
+  const normalized = normalizer(input);
+  return normalized === input ? tc : { ...tc, input: normalized };
+}
 // src/evaluation/providers/preread.ts
 init_cjs_shims();
 var import_node_path12 = __toESM(require("path"), 1);
@@ -7521,11 +7702,13 @@ function extractToolCalls(content) {
     }
     const p = part;
     if (p.type === "tool_use" && typeof p.name === "string") {
-      toolCalls.push({
-        tool: p.name,
-        input: p.input,
-        id: typeof p.id === "string" ? p.id : void 0
-      });
+      toolCalls.push(
+        normalizeToolCall("claude-cli", {
+          tool: p.name,
+          input: p.input,
+          id: typeof p.id === "string" ? p.id : void 0
+        })
+      );
     }
   }
   return toolCalls;
@@ -7817,11 +8000,13 @@ function extractToolCalls2(content) {
     }
     const p = part;
     if (p.type === "tool_use" && typeof p.name === "string") {
-      toolCalls.push({
-        tool: p.name,
-        input: p.input,
-        id: typeof p.id === "string" ? p.id : void 0
-      });
+      toolCalls.push(
+        normalizeToolCall("claude-sdk", {
+          tool: p.name,
+          input: p.input,
+          id: typeof p.id === "string" ? p.id : void 0
+        })
+      );
     }
   }
   return toolCalls;
@@ -8739,27 +8924,33 @@ ${basePrompt}` : basePrompt;
       }
     }
     if (itemType === "command_execution") {
-      completedToolCalls.push({
-        tool: "command_execution",
-        input: { command: item.command },
-        output: item.aggregated_output,
-        id: item.id
-      });
+      completedToolCalls.push(
+        normalizeToolCall("codex", {
+          tool: "command_execution",
+          input: { command: item.command },
+          output: item.aggregated_output,
+          id: item.id
+        })
+      );
     }
     if (itemType === "file_change") {
-      completedToolCalls.push({
-        tool: "file_change",
-        input: item.changes,
-        id: item.id
-      });
+      completedToolCalls.push(
+        normalizeToolCall("codex", {
+          tool: "file_change",
+          input: item.changes,
+          id: item.id
+        })
+      );
     }
     if (itemType === "mcp_tool_call") {
-      completedToolCalls.push({
-        tool: `mcp:${item.server}/${item.tool}`,
-        input: item.arguments,
-        output: item.result ?? item.error,
-        id: item.id
-      });
+      completedToolCalls.push(
+        normalizeToolCall("codex", {
+          tool: `mcp:${item.server}/${item.tool}`,
+          input: item.arguments,
+          output: item.result ?? item.error,
+          id: item.id
+        })
+      );
     }
   }
   resolveCwd(cwdOverride) {
@@ -9299,12 +9490,6 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
     return logger;
   }
   handleEvent(eventType, data) {
-    if (this.format === "json") {
-      const elapsed2 = formatElapsed4(this.startedAt);
-      this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
-`);
-      return;
-    }
     if (this.chunkExtractor) {
       const chunkText = this.chunkExtractor(eventType, data);
       if (chunkText === null) {
@@ -9317,6 +9502,12 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
       }
       this.flushPendingText();
     }
+    if (this.format === "json") {
+      const elapsed2 = formatElapsed4(this.startedAt);
+      this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
+`);
+      return;
+    }
     const elapsed = formatElapsed4(this.startedAt);
     const summary = this.summarize(eventType, data);
     if (summary) {
@@ -9327,14 +9518,19 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
   flushPendingText() {
     if (!this.pendingText) return;
     const elapsed = formatElapsed4(this.startedAt);
-    this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
+    if (this.format === "json") {
+      this.stream.write(
+        `${JSON.stringify({ time: elapsed, event: "assistant_message", data: { content: this.pendingText } })}
+`
+      );
+    } else {
+      this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
 `);
+    }
     this.pendingText = "";
   }
   async close() {
-    if (this.format !== "json") {
-      this.flushPendingText();
-    }
+    this.flushPendingText();
     await new Promise((resolve, reject) => {
       this.stream.once("error", reject);
       this.stream.end(() => resolve());
@@ -9409,15 +9605,17 @@ var CopilotCliProvider = class {
           }
           if (update.status === "completed" || update.status === "failed") {
             const toolName = update.title ?? update.kind ?? "unknown";
-            completedToolCalls.push({
-              tool: toolName,
-              input: update.rawInput,
-              output: update.rawOutput,
-              id: callId,
-              startTime: (/* @__PURE__ */ new Date()).toISOString(),
-              endTime: (/* @__PURE__ */ new Date()).toISOString(),
-              durationMs: 0
-            });
+            completedToolCalls.push(
+              normalizeToolCall("copilot-cli", {
+                tool: toolName,
+                input: update.rawInput,
+                output: update.rawOutput,
+                id: callId,
+                startTime: (/* @__PURE__ */ new Date()).toISOString(),
+                endTime: (/* @__PURE__ */ new Date()).toISOString(),
+                durationMs: 0
+              })
+            );
             request.streamCallbacks?.onToolCallEnd?.(
               toolName,
               update.rawInput,
@@ -9434,15 +9632,17 @@ var CopilotCliProvider = class {
             if (inProgress) {
               toolCallsInProgress.delete(callId);
               const duration = Date.now() - inProgress.startMs;
-              completedToolCalls.push({
-                tool: inProgress.tool,
-                input: inProgress.input,
-                output: update.rawOutput,
-                id: inProgress.id,
-                startTime: inProgress.startTime,
-                endTime: (/* @__PURE__ */ new Date()).toISOString(),
-                durationMs: duration
-              });
+              completedToolCalls.push(
+                normalizeToolCall("copilot-cli", {
+                  tool: inProgress.tool,
+                  input: inProgress.input,
+                  output: update.rawOutput,
+                  id: inProgress.id,
+                  startTime: inProgress.startTime,
+                  endTime: (/* @__PURE__ */ new Date()).toISOString(),
+                  durationMs: duration
+                })
+              );
               request.streamCallbacks?.onToolCallEnd?.(
                 inProgress.tool,
                 inProgress.input,
@@ -9788,11 +9988,13 @@ function parseCopilotEvents(eventsJsonl) {
       }
       case "assistant.message": {
         const toolRequests = data.toolRequests;
-        const toolCalls = (toolRequests ?? []).map((req) => ({
-          tool: String(req.name ?? req.toolName ?? ""),
-          input: req.arguments,
-          id: req.toolCallId ? String(req.toolCallId) : void 0
-        }));
+        const toolCalls = (toolRequests ?? []).map(
+          (req) => normalizeToolCall("copilot-log", {
+            tool: String(req.name ?? req.toolName ?? ""),
+            input: req.arguments,
+            id: req.toolCallId ? String(req.toolCallId) : void 0
+          })
+        );
         messages.push({
           role: "assistant",
           content: data.content != null ? String(data.content) : void 0,
@@ -9832,12 +10034,12 @@ function parseCopilotEvents(eventsJsonl) {
           messages.push({
             role: "assistant",
             toolCalls: [
-              {
+              normalizeToolCall("copilot-log", {
                 tool: started.toolName,
                 input: started.input,
                 output: data.result,
                 id: toolCallId
-              }
+              })
             ]
           });
         }
@@ -10186,15 +10388,17 @@ var CopilotSdkProvider = class {
           if (inProgress) {
             toolCallsInProgress.delete(callId);
             const endMs = Date.now();
-            completedToolCalls.push({
-              tool: inProgress.tool,
-              input: inProgress.input,
-              output: data?.output ?? data?.result,
-              id: inProgress.id,
-              startTime: inProgress.startTime,
-              endTime: (/* @__PURE__ */ new Date()).toISOString(),
-              durationMs: endMs - inProgress.startMs
-            });
+            completedToolCalls.push(
+              normalizeToolCall("copilot-sdk", {
+                tool: inProgress.tool,
+                input: inProgress.input,
+                output: data?.output ?? data?.result,
+                id: inProgress.id,
+                startTime: inProgress.startTime,
+                endTime: (/* @__PURE__ */ new Date()).toISOString(),
+                durationMs: endMs - inProgress.startMs
+              })
+            );
           }
         }
         if (eventType === "assistant.message") {
@@ -11178,12 +11382,14 @@ function extractToolCallsFromEvents(events) {
   }
   const toolCalls = [];
   for (const [id, { tool: tool2, input }] of starts) {
-    toolCalls.push({
-      tool: tool2,
-      input,
-      id: id.startsWith("anon-") ? void 0 : id,
-      output: results.get(id)
-    });
+    toolCalls.push(
+      normalizeToolCall("pi-cli", {
+        tool: tool2,
+        input,
+        id: id.startsWith("anon-") ? void 0 : id,
+        output: results.get(id)
+      })
+    );
   }
   return toolCalls;
 }
@@ -11305,17 +11511,21 @@ function extractToolCalls3(content) {
     if (!part || typeof part !== "object") continue;
     const p = part;
     if (p.type === "tool_use" && typeof p.name === "string") {
-      toolCalls.push({
-        tool: p.name,
-        input: p.input,
-        id: typeof p.id === "string" ? p.id : void 0
-      });
+      toolCalls.push(
+        normalizeToolCall("pi-cli", {
+          tool: p.name,
+          input: p.input,
+          id: typeof p.id === "string" ? p.id : void 0
+        })
+      );
     } else if ((p.type === "toolCall" || p.type === "tool_call") && typeof p.name === "string") {
-      toolCalls.push({
-        tool: p.name,
-        input: p.arguments ?? p.input,
-        id: typeof p.id === "string" ? p.id : void 0
-      });
+      toolCalls.push(
+        normalizeToolCall("pi-cli", {
+          tool: p.name,
+          input: p.arguments ?? p.input,
+          id: typeof p.id === "string" ? p.id : void 0
+        })
+      );
     } else if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
       const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
       if (existing) {
@@ -12776,6 +12986,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
   const logDirSource = target.log_dir ?? target.log_directory;
   const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
   const systemPromptSource = target.system_prompt;
+  const streamLogResult = resolveStreamLog(target, env.AGENTV_CODEX_LOG_FORMAT);
+  if (streamLogResult.deprecationWarning) {
+    process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
+`);
+  }
   const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
     allowLiteral: true,
     optionalEnv: true
@@ -12822,6 +13037,7 @@ function resolveCodexConfig(target, env, evalFilePath) {
     timeoutMs,
     logDir,
     logFormat,
+    streamLog: streamLogResult.streamLog,
     systemPrompt
   };
 }
@@ -12838,6 +13054,38 @@ function normalizeCodexLogFormat(value) {
   }
   throw new Error("codex log format must be 'summary' or 'json'");
 }
+function resolveStreamLog(target, envFallback) {
+  if (target.stream_log !== void 0 && target.stream_log !== null) {
+    const val = target.stream_log;
+    if (val === false || val === "false") {
+      return { streamLog: false, logFormat: void 0 };
+    }
+    if (val === "raw") {
+      return { streamLog: "raw", logFormat: "json" };
+    }
+    if (val === "summary") {
+      return { streamLog: "summary", logFormat: "summary" };
+    }
+    throw new Error(`${target.name}: stream_log must be false, 'raw', or 'summary'`);
+  }
+  const logFormatRaw = target.log_format ?? target.log_output_format ?? envFallback;
+  if (logFormatRaw === void 0 || logFormatRaw === null) {
+    return { streamLog: void 0, logFormat: void 0 };
+  }
+  if (typeof logFormatRaw !== "string") {
+    throw new Error(`${target.name}: log_format must be 'summary' or 'json'`);
+  }
+  const normalized = logFormatRaw.trim().toLowerCase();
+  if (normalized !== "json" && normalized !== "summary") {
+    throw new Error(`${target.name}: log_format must be 'summary' or 'json'`);
+  }
+  const streamLogEquivalent = normalized === "json" ? "raw" : "summary";
+  return {
+    streamLog: streamLogEquivalent,
+    logFormat: normalized,
+    deprecationWarning: `${target.name}: 'log_format' is deprecated and will be removed in v4.16. Use 'stream_log: ${streamLogEquivalent}' instead (log_format: '${normalized}' \u2192 stream_log: '${streamLogEquivalent}').`
+  };
+}
 function resolveCopilotSdkConfig(target, env, evalFilePath) {
   const cliUrlSource = target.cli_url;
   const cliPathSource = target.cli_path;
@@ -12849,6 +13097,11 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
   const logDirSource = target.log_dir ?? target.log_directory;
   const logFormatSource = target.log_format;
   const systemPromptSource = target.system_prompt;
+  const streamLogResult = resolveStreamLog(target);
+  if (streamLogResult.deprecationWarning) {
+    process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
+`);
+  }
   const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
     allowLiteral: true,
     optionalEnv: true
@@ -12959,6 +13212,7 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
     timeoutMs,
     logDir,
     logFormat,
+    streamLog: streamLogResult.streamLog,
     systemPrompt,
     byokType,
     byokBaseUrl,
@@ -12978,6 +13232,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
   const logDirSource = target.log_dir ?? target.log_directory;
   const logFormatSource = target.log_format;
   const systemPromptSource = target.system_prompt;
+  const streamLogResult = resolveStreamLog(target);
+  if (streamLogResult.deprecationWarning) {
+    process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
+`);
+  }
   const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
     allowLiteral: true,
     optionalEnv: true
@@ -13029,6 +13288,7 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
     timeoutMs,
     logDir,
     logFormat,
+    streamLog: streamLogResult.streamLog,
     systemPrompt
   };
 }
@@ -13051,6 +13311,11 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
   const logDirSource = target.log_dir ?? target.log_directory;
   const logFormatSource = target.log_format;
   const systemPromptSource = target.system_prompt;
+  const streamLogResult = resolveStreamLog(target);
+  if (streamLogResult.deprecationWarning) {
+    process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
+`);
+  }
   const subprovider = resolveOptionalString(
     subproviderSource,
     env,
@@ -13121,6 +13386,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
     timeoutMs,
     logDir,
     logFormat,
+    streamLog: streamLogResult.streamLog,
     systemPrompt
   };
 }
@@ -13137,6 +13403,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
   const logDirSource = target.log_dir ?? target.log_directory;
   const logFormatSource = target.log_format;
   const systemPromptSource = target.system_prompt;
+  const streamLogResult = resolveStreamLog(target);
+  if (streamLogResult.deprecationWarning) {
+    process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
+`);
+  }
   const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
     allowLiteral: true,
     optionalEnv: true
@@ -13207,6 +13478,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
     timeoutMs,
     logDir,
     logFormat,
+    streamLog: streamLogResult.streamLog,
     systemPrompt
   };
 }
@@ -13218,6 +13490,11 @@ function resolveClaudeConfig(target, env, evalFilePath) {
   const logDirSource = target.log_dir ?? target.log_directory;
   const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
   const systemPromptSource = target.system_prompt;
+  const streamLogResult = resolveStreamLog(target);
+  if (streamLogResult.deprecationWarning) {
+    process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
+`);
+  }
   const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
     allowLiteral: true,
     optionalEnv: true
@@ -13261,7 +13538,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
     maxTurns,
     maxBudgetUsd,
     logDir,
-    logFormat
+    logFormat,
+    streamLog: streamLogResult.streamLog
   };
 }
 function normalizeClaudeLogFormat(value) {
@@ -17946,100 +18224,35 @@ var LatencyEvaluator = class {
 // src/evaluation/evaluators/skill-trigger.ts
 init_cjs_shims();
-var CLAUDE_MATCHER = {
-  skillTools: ["Skill"],
-  skillInputField: "skill",
-  readTools: ["Read"],
-  readInputField: "file_path"
-};
-var COPILOT_MATCHER = {
-  skillTools: ["Skill", "skill"],
-  skillInputField: "skill",
-  readTools: ["Read File", "readFile", "Read", "readTextFile"],
-  readInputField: "file_path",
-  skillToolPrefixes: ["Using skill: "],
-  readToolPrefixes: ["Viewing "],
-  readInputFields: ["file_path", "path"]
-};
-var PI_CODING_AGENT_MATCHER = {
-  skillTools: [],
-  skillInputField: "skill",
-  readTools: ["read"],
-  readInputField: "path",
-  readInputFields: ["path", "file_path", "filePath"]
-};
-var CODEX_MATCHER = {
-  skillTools: [],
-  skillInputField: "skill",
-  readTools: ["command_execution"],
-  readInputField: "command",
-  skillToolPrefixes: ["mcp:"],
-  readToolPrefixes: ["mcp:"],
-  readInputFields: ["command", "path", "file_path", "filePath"]
-};
-var PROVIDER_TOOL_SEMANTICS = {
-  claude: CLAUDE_MATCHER,
-  "claude-cli": CLAUDE_MATCHER,
-  "claude-sdk": CLAUDE_MATCHER,
-  codex: CODEX_MATCHER,
-  "pi-coding-agent": PI_CODING_AGENT_MATCHER,
-  "pi-cli": PI_CODING_AGENT_MATCHER,
-  "copilot-cli": COPILOT_MATCHER,
-  "copilot-log": COPILOT_MATCHER,
-  "copilot-sdk": COPILOT_MATCHER,
-  vscode: COPILOT_MATCHER,
-  "vscode-insiders": COPILOT_MATCHER
-};
 var SkillTriggerEvaluator = class {
   kind = "skill-trigger";
   config;
   constructor(config) {
     this.config = config;
   }
-  resolveMatcher(providerKind) {
-    if (providerKind) {
-      const match = PROVIDER_TOOL_SEMANTICS[providerKind];
-      if (match) return match;
-    }
-    return CLAUDE_MATCHER;
-  }
   evaluate(context2) {
     const skillName = this.config.skill;
     const shouldTrigger = this.config.should_trigger !== false;
-    const providerKind = context2.provider?.kind;
-    const matcher = this.resolveMatcher(providerKind);
     const allToolCalls = (context2.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
     let triggered = false;
     let evidence = "";
     for (const toolCall of allToolCalls) {
       const toolName = toolCall.tool ?? "";
       const input = toolCall.input ?? {};
-      if (matcher.skillTools.includes(toolName)) {
-        const skillArg = String(input[matcher.skillInputField] ?? "");
+      if (toolName === "Skill") {
+        const skillArg = String(input.skill ?? "");
         if (skillArg.includes(skillName)) {
           triggered = true;
-          evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
+          evidence = `Skill tool invoked with skill="${skillArg}"`;
           break;
         }
-      } else if (matcher.skillToolPrefixes?.some(
-        (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
-      )) {
-        triggered = true;
-        evidence = `Skill tool invoked via tool name "${toolName}"`;
-        break;
-      } else if (matcher.readTools.includes(toolName)) {
-        const filePath = this.readPathFromInput(input, matcher);
-        if (filePath.includes(skillName)) {
+      } else if (toolName === "Read") {
+        const filePath = String(input.file_path ?? "");
+        if (filePath.includes(`skills/${skillName}/`)) {
           triggered = true;
           evidence = `Read tool loaded skill file: ${filePath}`;
           break;
         }
-      } else if (matcher.readToolPrefixes?.some(
-        (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
-      )) {
-        triggered = true;
-        evidence = `Read tool loaded skill file via tool name "${toolName}"`;
-        break;
       }
       if (!triggered && toolCall.output != null) {
         const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
@@ -18076,16 +18289,6 @@ var SkillTriggerEvaluator = class {
       expectedAspectCount: 1
     };
   }
-  readPathFromInput(input, matcher) {
-    const fields = matcher.readInputFields ?? [matcher.readInputField];
-    for (const field of fields) {
-      const value = input[field];
-      if (value !== void 0 && value !== null) {
-        return String(value);
-      }
-    }
-    return "";
-  }
 };
 // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -18935,10 +19138,12 @@ function runEqualsAssertion(output, value) {
 // src/evaluation/orchestrator.ts
 init_cjs_shims();
+var import_node_child_process11 = require("child_process");
 var import_node_crypto11 = require("crypto");
 var import_node_fs16 = require("fs");
 var import_promises36 = require("fs/promises");
 var import_node_path49 = __toESM(require("path"), 1);
+var import_node_util7 = require("util");
 var import_micromatch3 = __toESM(require("micromatch"), 1);
 // ../../node_modules/.bun/p-limit@6.2.0/node_modules/p-limit/index.js
@@ -20414,6 +20619,8 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
 }
 // src/evaluation/orchestrator.ts
+var execFileAsync3 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
+var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
 function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
   return score >= threshold ? "ok" : "quality_failure";
 }
@@ -20451,6 +20658,35 @@ function hasHookCommand(hook) {
 function hooksEnabled(workspace) {
   return workspace?.hooks?.enabled !== false;
 }
+function workspaceGitEnv() {
+  const env = { ...process.env };
+  for (const key of Object.keys(env)) {
+    if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
+      delete env[key];
+    }
+  }
+  return {
+    ...env,
+    GIT_TERMINAL_PROMPT: "0",
+    GIT_ASKPASS: "",
+    GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
+  };
+}
+async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
+  if (!(0, import_node_fs16.existsSync)(import_node_path49.default.join(workspacePath, ".git"))) {
+    return false;
+  }
+  const cleanFlag = resetMode === "strict" ? "-fdx" : "-fd";
+  const opts = {
+    cwd: workspacePath,
+    timeout: WORKSPACE_GIT_TIMEOUT_MS,
+    env: workspaceGitEnv(),
+    maxBuffer: 50 * 1024 * 1024
+  };
+  await execFileAsync3("git", ["reset", "--hard", baselineRef ?? "HEAD"], opts);
+  await execFileAsync3("git", ["clean", cleanFlag], opts);
+  return true;
+}
 function getWorkspaceTemplate(target) {
   const config = target.config;
   if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
@@ -21712,6 +21948,37 @@ async function runEvalCase(options) {
       }
     }
   }
+  let beforeEachNeedsFreshBaseline = false;
+  if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
+    try {
+      if (repoManager && evalCase.workspace.repos?.length) {
+        await repoManager.reset(
+          evalCase.workspace.repos,
+          workspacePath,
+          evalCase.workspace.hooks.before_each.reset
+        );
+      } else {
+        await resetWorkspaceRoot(
+          workspacePath,
+          evalCase.workspace.hooks.before_each.reset,
+          sharedBaselineCommit
+        );
+      }
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return buildErrorResult(
+        evalCase,
+        target.name,
+        nowFn(),
+        new Error(`before_each reset failed: ${message}`),
+        promptInputs,
+        provider,
+        "setup",
+        "script_error",
+        verbose
+      );
+    }
+  }
   const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
   if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
     const beforeEachHook = caseBeforeEachHook;
@@ -21728,6 +21995,7 @@ async function runEvalCase(options) {
         toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
         scriptContext
       );
+      beforeEachNeedsFreshBaseline = true;
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       return buildErrorResult(
@@ -21743,7 +22011,7 @@ async function runEvalCase(options) {
       );
     }
   }
-  let baselineCommit = sharedBaselineCommit;
+  let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
   if (!baselineCommit && workspacePath) {
     try {
       baselineCommit = await initializeBaseline(workspacePath);
@@ -21754,6 +22022,35 @@ async function runEvalCase(options) {
       }
     }
   }
+  if (evalCase.mode === "conversation" && evalCase.turns?.length) {
+    const conversationResult = await runConversationMode({
+      evalCase,
+      provider,
+      target,
+      evaluators,
+      typeRegistry,
+      graderProvider,
+      promptInputs,
+      nowFn,
+      signal,
+      workspacePath,
+      caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
+      agentTimeoutMs,
+      streamCallbacks: options.streamCallbacks,
+      verbose,
+      threshold: evalCase.threshold ?? caseThreshold,
+      targetResolver,
+      availableTargets
+    });
+    if (workspacePath && !isSharedWorkspace) {
+      const shouldRetain = conversationResult.executionStatus === "ok" ? retainOnSuccess === "keep" || keepWorkspaces : retainOnFailure === "keep" || !forceCleanup && !keepWorkspaces;
+      if (!shouldRetain) {
+        await cleanupWorkspace(workspacePath).catch(() => {
+        });
+      }
+    }
+    return conversationResult;
+  }
   const caseStartMs = Date.now();
   const attemptBudget = (maxRetries ?? 0) + 1;
   let attempt = 0;
@@ -21868,13 +22165,21 @@ async function runEvalCase(options) {
 ${providerFileChanges}` : providerFileChanges;
   }
   const providerError = extractProviderError(providerResponse);
-  if (caseHooksEnabled && repoManager && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none" && evalCase.workspace.repos) {
+  if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
     try {
-      await repoManager.reset(
-        evalCase.workspace.repos,
-        workspacePath,
-        evalCase.workspace.hooks.after_each.reset
-      );
+      if (repoManager && evalCase.workspace.repos?.length) {
+        await repoManager.reset(
+          evalCase.workspace.repos,
+          workspacePath,
+          evalCase.workspace.hooks.after_each.reset
+        );
+      } else {
+        await resetWorkspaceRoot(
+          workspacePath,
+          evalCase.workspace.hooks.after_each.reset,
+          baselineCommit
+        );
+      }
     } catch {
     }
   }
@@ -22490,6 +22795,276 @@ function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
     "llm-grader": llmGrader
   };
 }
+async function runConversationMode(options) {
+  const {
+    evalCase,
+    provider,
+    target,
+    evaluators,
+    typeRegistry,
+    graderProvider,
+    promptInputs,
+    nowFn,
+    signal,
+    workspacePath,
+    caseWorkspaceFile,
+    agentTimeoutMs,
+    streamCallbacks,
+    verbose,
+    threshold,
+    targetResolver,
+    availableTargets
+  } = options;
+  const turns = evalCase.turns;
+  const aggregation = evalCase.aggregation ?? "mean";
+  const onTurnFailure = evalCase.on_turn_failure ?? "continue";
+  const windowSize = evalCase.window_size;
+  const history = [];
+  for (const msg of evalCase.input) {
+    const content = typeof msg.content === "string" ? msg.content : JSON.stringify(msg.content);
+    history.push({ role: msg.role, content });
+  }
+  const turnScores = [];
+  const allTurnScoreValues = [];
+  let stopped = false;
+  const caseStartMs = Date.now();
+  for (let i = 0; i < turns.length; i++) {
+    const turn = turns[i];
+    const turnIndex = i + 1;
+    if (stopped) {
+      turnScores.push({
+        name: `turn-${turnIndex}`,
+        type: "rubrics",
+        score: 0,
+        verdict: "skip",
+        assertions: [{ text: "Skipped due to previous turn failure", passed: false }]
+      });
+      allTurnScoreValues.push(0);
+      continue;
+    }
+    const userContent = typeof turn.input === "string" ? turn.input : JSON.stringify(turn.input);
+    history.push({ role: "user", content: userContent });
+    const chatPromptForProvider = windowSize ? buildWindowedHistory(history, windowSize) : [...history];
+    let response;
+    try {
+      response = await provider.invoke({
+        question: userContent,
+        chatPrompt: chatPromptForProvider,
+        evalCaseId: `${evalCase.id}/turn-${turnIndex}`,
+        signal,
+        cwd: workspacePath,
+        workspaceFile: caseWorkspaceFile,
+        streamCallbacks
+      });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      turnScores.push({
+        name: `turn-${turnIndex}`,
+        type: "rubrics",
+        score: 0,
+        verdict: "fail",
+        assertions: [{ text: `Provider error: ${message}`, passed: false }]
+      });
+      allTurnScoreValues.push(0);
+      if (onTurnFailure === "stop") stopped = true;
+      continue;
+    }
+    const assistantContent = extractLastAssistantContent(response.output);
+    history.push({ role: "assistant", content: assistantContent });
+    if (!turn.assertions?.length && !turn.expected_output) {
+      turnScores.push({
+        name: `turn-${turnIndex}`,
+        type: "rubrics",
+        score: 1,
+        verdict: "pass",
+        assertions: []
+      });
+      allTurnScoreValues.push(1);
+      continue;
+    }
+    const turnAssertions = buildTurnAssertions(turn);
+    const turnEvalCase = {
+      ...evalCase,
+      id: `${evalCase.id}/turn-${turnIndex}`,
+      assertions: turnAssertions,
+      input: buildTurnGraderInput(history, windowSize),
+      expected_output: turn.expected_output ? [
+        typeof turn.expected_output === "string" ? { content: turn.expected_output } : turn.expected_output
+      ] : [],
+      // Clear conversation fields to prevent recursion
+      mode: void 0,
+      turns: void 0
+    };
+    const turnResult = await evaluateCandidate({
+      evalCase: turnEvalCase,
+      candidate: assistantContent,
+      target,
+      provider,
+      evaluators,
+      typeRegistry,
+      promptInputs: {
+        question: buildConversationContext(history, windowSize),
+        chatPrompt: windowSize ? buildWindowedHistory(history, windowSize) : [...history]
+      },
+      nowFn,
+      attempt: 0,
+      graderProvider,
+      agentTimeoutMs,
+      output: response.output,
+      verbose,
+      threshold,
+      targetResolver,
+      availableTargets
+    });
+    const turnScore = turnResult.score;
+    allTurnScoreValues.push(turnScore);
+    turnScores.push({
+      name: `turn-${turnIndex}`,
+      type: "rubrics",
+      score: turnScore,
+      verdict: scoreToVerdict(turnScore, threshold ?? DEFAULT_THRESHOLD),
+      assertions: turnResult.assertions ? [...turnResult.assertions] : [],
+      scores: turnResult.scores
+    });
+    if (onTurnFailure === "stop" && turnScore < (threshold ?? DEFAULT_THRESHOLD)) {
+      stopped = true;
+    }
+  }
+  let conversationScores = [];
+  if (evalCase.assertions?.length) {
+    const conversationEvalCase = {
+      ...evalCase,
+      id: `${evalCase.id}/conversation`,
+      input: history.map((m) => ({
+        role: m.role,
+        content: m.content
+      })),
+      expected_output: [],
+      mode: void 0,
+      turns: void 0
+    };
+    const fullTranscript = history.map((m) => {
+      const content = typeof m.content === "string" ? m.content : JSON.stringify(m.content);
+      return `${m.role}: ${content}`;
+    }).join("\n\n");
+    const conversationResult = await evaluateCandidate({
+      evalCase: conversationEvalCase,
+      candidate: fullTranscript,
+      target,
+      provider,
+      evaluators,
+      typeRegistry,
+      promptInputs: {
+        question: fullTranscript,
+        chatPrompt: [...history]
+      },
+      nowFn,
+      attempt: 0,
+      graderProvider,
+      agentTimeoutMs,
+      verbose,
+      threshold,
+      targetResolver,
+      availableTargets
+    });
+    conversationScores = [
+      {
+        name: "conversation",
+        type: "rubrics",
+        score: conversationResult.score,
+        verdict: scoreToVerdict(
+          conversationResult.score,
+          threshold ?? DEFAULT_THRESHOLD
+        ),
+        assertions: conversationResult.assertions ? [...conversationResult.assertions] : [],
+        scores: conversationResult.scores
+      }
+    ];
+  }
+  const allScoreValues = [...allTurnScoreValues, ...conversationScores.map((s) => s.score)];
+  const finalScore = aggregateConversationScores(allScoreValues, aggregation);
+  const allResultScores = [...turnScores, ...conversationScores];
+  const outputMessages = history.map((m) => ({
+    role: m.role,
+    content: m.content
+  }));
+  const flatAssertions = allResultScores.flatMap((s) => [...s.assertions]);
+  const totalDurationMs = Date.now() - caseStartMs;
+  return {
+    timestamp: nowFn().toISOString(),
+    testId: evalCase.id,
+    suite: evalCase.suite,
+    category: evalCase.category,
+    score: finalScore,
+    assertions: flatAssertions,
+    target: target.name,
+    output: outputMessages,
+    scores: allResultScores,
+    executionStatus: classifyQualityStatus(finalScore, threshold ?? DEFAULT_THRESHOLD),
+    input: evalCase.input.map((m) => ({
+      role: m.role,
+      content: typeof m.content === "string" ? m.content : JSON.stringify(m.content)
+    })),
+    evalRun: { durationMs: totalDurationMs }
+  };
+}
+function buildWindowedHistory(history, windowSize) {
+  const systemMessages = history.filter((m) => m.role === "system");
+  const nonSystem = history.filter((m) => m.role !== "system");
+  const windowed = nonSystem.slice(-windowSize * 2);
+  return [...systemMessages, ...windowed];
+}
+function buildConversationContext(history, windowSize) {
+  const msgs = windowSize ? buildWindowedHistory(history, windowSize) : history;
+  return msgs.map((m) => {
+    const content = typeof m.content === "string" ? m.content : JSON.stringify(m.content);
+    return `${m.role}: ${content}`;
+  }).join("\n\n");
+}
+function buildTurnGraderInput(history, windowSize) {
+  const msgs = windowSize ? buildWindowedHistory(history, windowSize) : history;
+  return msgs.map((m) => ({
+    role: m.role,
+    content: m.content
+  }));
+}
+function buildTurnAssertions(turn) {
+  if (!turn.assertions?.length) return [];
+  const stringCriteria = [];
+  const structured = [];
+  for (const a of turn.assertions) {
+    if (typeof a === "string") {
+      stringCriteria.push(a);
+    } else {
+      structured.push(a);
+    }
+  }
+  const result = [];
+  if (stringCriteria.length > 0) {
+    result.push({
+      name: "turn-rubrics",
+      type: "llm-grader",
+      rubrics: stringCriteria.map((text, idx) => ({
+        id: `criterion-${idx + 1}`,
+        outcome: text,
+        weight: 1
+      }))
+    });
+  }
+  result.push(...structured);
+  return result;
+}
+function aggregateConversationScores(scores, aggregation) {
+  if (scores.length === 0) return 1;
+  switch (aggregation) {
+    case "min":
+      return Math.min(...scores);
+    case "max":
+      return Math.max(...scores);
+    default:
+      return scores.reduce((sum, s) => sum + s, 0) / scores.length;
+  }
+}
 async function invokeProvider(provider, options) {
   const {
     evalCase,
@@ -23222,13 +23797,13 @@ function shouldSkipCacheForTemperature(targetConfig) {
 // src/evaluation/results-repo.ts
 init_cjs_shims();
-var import_node_child_process11 = require("child_process");
+var import_node_child_process12 = require("child_process");
 var import_node_fs18 = require("fs");
 var import_promises39 = require("fs/promises");
 var import_node_os9 = __toESM(require("os"), 1);
 var import_node_path53 = __toESM(require("path"), 1);
-var import_node_util7 = require("util");
-var execFileAsync3 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
+var import_node_util8 = require("util");
+var execFileAsync4 = (0, import_node_util8.promisify)(import_node_child_process12.execFile);
 function sanitizeRepoSlug(repo) {
   return repo.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
 }
@@ -23279,7 +23854,7 @@ function writePersistedStatus(statusFile, status) {
 }
 async function runCommand(executable, args, options) {
   try {
-    const { stdout, stderr } = await execFileAsync3(executable, [...args], {
+    const { stdout, stderr } = await execFileAsync4(executable, [...args], {
       cwd: options?.cwd,
       env: process.env
     });
@@ -24341,11 +24916,13 @@ function extractAssistantContent(content) {
         break;
       case "tool_use":
         if (block.name) {
-          toolCalls.push({
-            tool: block.name,
-            input: block.input,
-            id: block.id
-          });
+          toolCalls.push(
+            normalizeToolCall("claude", {
+              tool: block.name,
+              input: block.input,
+              id: block.id
+            })
+          );
         }
         break;
     }
@@ -24438,7 +25015,11 @@ function parseCodexSession(jsonl) {
             } else {
               input = payload.arguments;
             }
-            const toolCall = { tool: toolName, input, id: callId };
+            const toolCall = normalizeToolCall("codex", {
+              tool: toolName,
+              input,
+              id: callId
+            });
             const msgIdx = messages.length;
             messages.push({
               role: "assistant",
@@ -24462,7 +25043,11 @@ function parseCodexSession(jsonl) {
             } else {
               input = payload.arguments;
             }
-            const toolCall = { tool: toolName, input, id: callId };
+            const toolCall = normalizeToolCall("codex", {
+              tool: toolName,
+              input,
+              id: callId
+            });
             const msgIdx = messages.length;
             messages.push({
               role: "assistant",