npm - @agentv/core - Versions diffs - 4.14.0 → 4.15.0-next.1 - Mend

@agentv/core 4.14.0 → 4.15.0-next.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Files changed (14):

package/dist/{chunk-A3HYVKTI.js → chunk-AOOU6PLC.js} +70 -2
package/dist/chunk-AOOU6PLC.js.map +1 -0
package/dist/evaluation/validation/index.cjs +89 -11
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +90 -12
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +774 -189
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +68 -14
package/dist/index.d.ts +68 -14
package/dist/index.js +705 -189
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-A3HYVKTI.js.map +0 -1

package/dist/index.js (changed)

@@ -25,7 +25,7 @@ import {
   resolveDelegatedTargetDefinition,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-A3HYVKTI.js";
+} from "./chunk-AOOU6PLC.js";
 import {
   execFileWithStdin,
   execShellWithStdin
@@ -3673,10 +3673,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
     const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
     const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
     const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
-    const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
+    const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0 || Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0;
     if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
       logError3(
-        `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
+        `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`
       );
       continue;
     }
@@ -3753,6 +3753,14 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
     ) : void 0;
     const onDependencyFailureRaw = asString5(testCaseConfig.on_dependency_failure);
     const onDependencyFailure = onDependencyFailureRaw === "skip" || onDependencyFailureRaw === "fail" || onDependencyFailureRaw === "run" ? onDependencyFailureRaw : void 0;
+    const modeRaw = asString5(testCaseConfig.mode);
+    const mode = modeRaw === "conversation" ? "conversation" : void 0;
+    const turns = Array.isArray(testCaseConfig.turns) ? parseTurns(testCaseConfig.turns) : void 0;
+    const aggregationRaw = asString5(testCaseConfig.aggregation);
+    const aggregation = aggregationRaw === "mean" || aggregationRaw === "min" || aggregationRaw === "max" ? aggregationRaw : void 0;
+    const onTurnFailureRaw = asString5(testCaseConfig.on_turn_failure);
+    const onTurnFailure = onTurnFailureRaw === "continue" || onTurnFailureRaw === "stop" ? onTurnFailureRaw : void 0;
+    const windowSize = typeof testCaseConfig.window_size === "number" && testCaseConfig.window_size >= 1 ? testCaseConfig.window_size : void 0;
     const testCase = {
       id,
       suite: suiteName,
@@ -3771,6 +3779,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
       metadata,
       targets: caseTargets,
       ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {},
+      ...mode ? { mode } : {},
+      ...turns && turns.length > 0 ? { turns } : {},
+      ...aggregation ? { aggregation } : {},
+      ...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
+      ...windowSize !== void 0 ? { window_size: windowSize } : {},
       ...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
       ...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
     };
@@ -3788,6 +3801,25 @@ async function loadTestById(evalFilePath, repoRoot, evalId) {
   return match;
 }
 var loadEvalCaseById = loadTestById;
+function parseTurns(rawTurns) {
+  return rawTurns.map((rawTurn) => {
+    const turn = rawTurn;
+    const input = turn.input;
+    const expectedOutput = turn.expected_output;
+    let assertions;
+    if (Array.isArray(turn.assertions)) {
+      assertions = turn.assertions.map((a) => {
+        if (typeof a === "string") return a;
+        return a;
+      });
+    }
+    return {
+      input,
+      ...expectedOutput !== void 0 ? { expected_output: expectedOutput } : {},
+      ...assertions && assertions.length > 0 ? { assertions } : {}
+    };
+  });
+}
 function parseCommandArray(source) {
   if (typeof source === "string") {
     const parts = source.trim().split(/\s+/);
@@ -4745,6 +4777,154 @@ function subscribeToClaudeLogEntries(listener) {
   };
 }
+// src/evaluation/providers/normalize-tool-call.ts
+var TOOL_NAME_MAP = /* @__PURE__ */ new Map([
+  // --- Claude (already canonical) ---
+  ["claude::Skill", "Skill"],
+  ["claude::Read", "Read"],
+  ["claude::Write", "Write"],
+  ["claude::Edit", "Edit"],
+  ["claude::Bash", "Bash"],
+  ["claude-cli::Skill", "Skill"],
+  ["claude-cli::Read", "Read"],
+  ["claude-cli::Write", "Write"],
+  ["claude-cli::Edit", "Edit"],
+  ["claude-cli::Bash", "Bash"],
+  ["claude-sdk::Skill", "Skill"],
+  ["claude-sdk::Read", "Read"],
+  ["claude-sdk::Write", "Write"],
+  ["claude-sdk::Edit", "Edit"],
+  ["claude-sdk::Bash", "Bash"],
+  // --- Copilot ---
+  ["copilot-cli::Skill", "Skill"],
+  ["copilot-cli::skill", "Skill"],
+  ["copilot-cli::Read File", "Read"],
+  ["copilot-cli::readFile", "Read"],
+  ["copilot-cli::Read", "Read"],
+  ["copilot-cli::readTextFile", "Read"],
+  ["copilot-cli::writeTextFile", "Write"],
+  ["copilot-cli::Write File", "Write"],
+  ["copilot-cli::editFile", "Edit"],
+  ["copilot-cli::Edit File", "Edit"],
+  ["copilot-cli::runTerminalCommand", "Bash"],
+  ["copilot-sdk::Skill", "Skill"],
+  ["copilot-sdk::skill", "Skill"],
+  ["copilot-sdk::Read File", "Read"],
+  ["copilot-sdk::readFile", "Read"],
+  ["copilot-sdk::Read", "Read"],
+  ["copilot-sdk::readTextFile", "Read"],
+  ["copilot-sdk::writeTextFile", "Write"],
+  ["copilot-sdk::Write File", "Write"],
+  ["copilot-sdk::editFile", "Edit"],
+  ["copilot-sdk::Edit File", "Edit"],
+  ["copilot-sdk::runTerminalCommand", "Bash"],
+  ["copilot-log::Skill", "Skill"],
+  ["copilot-log::skill", "Skill"],
+  ["copilot-log::Read File", "Read"],
+  ["copilot-log::readFile", "Read"],
+  ["copilot-log::Read", "Read"],
+  ["copilot-log::readTextFile", "Read"],
+  ["copilot-log::writeTextFile", "Write"],
+  ["copilot-log::Write File", "Write"],
+  ["copilot-log::editFile", "Edit"],
+  ["copilot-log::Edit File", "Edit"],
+  ["copilot-log::runTerminalCommand", "Bash"],
+  ["vscode::Skill", "Skill"],
+  ["vscode::skill", "Skill"],
+  ["vscode::Read File", "Read"],
+  ["vscode::readFile", "Read"],
+  ["vscode::Read", "Read"],
+  ["vscode::readTextFile", "Read"],
+  ["vscode::writeTextFile", "Write"],
+  ["vscode::Write File", "Write"],
+  ["vscode::editFile", "Edit"],
+  ["vscode::Edit File", "Edit"],
+  ["vscode::runTerminalCommand", "Bash"],
+  ["vscode-insiders::Skill", "Skill"],
+  ["vscode-insiders::skill", "Skill"],
+  ["vscode-insiders::Read File", "Read"],
+  ["vscode-insiders::readFile", "Read"],
+  ["vscode-insiders::Read", "Read"],
+  ["vscode-insiders::readTextFile", "Read"],
+  ["vscode-insiders::writeTextFile", "Write"],
+  ["vscode-insiders::Write File", "Write"],
+  ["vscode-insiders::editFile", "Edit"],
+  ["vscode-insiders::Edit File", "Edit"],
+  ["vscode-insiders::runTerminalCommand", "Bash"],
+  // --- Codex ---
+  ["codex::command_execution", "Bash"],
+  ["codex::file_change", "Edit"],
+  // --- Pi ---
+  ["pi-coding-agent::read", "Read"],
+  ["pi-coding-agent::bash", "Bash"],
+  ["pi-cli::read", "Read"],
+  ["pi-cli::bash", "Bash"]
+]);
+var COPILOT_PREFIXES = [
+  { prefix: "Using skill: ", canonical: "Skill", extractSkillFromName: true },
+  { prefix: "Viewing ", canonical: "Read" }
+];
+var CODEX_PREFIXES = [
+  { prefix: "mcp:", canonical: "Skill", extractSkillFromName: true }
+];
+var TOOL_PREFIX_MAP = /* @__PURE__ */ new Map([
+  ["copilot-cli", COPILOT_PREFIXES],
+  ["copilot-sdk", COPILOT_PREFIXES],
+  ["copilot-log", COPILOT_PREFIXES],
+  ["vscode", COPILOT_PREFIXES],
+  ["vscode-insiders", COPILOT_PREFIXES],
+  ["codex", CODEX_PREFIXES]
+]);
+var normalizeSkillInput = (input) => {
+  if (input.skill !== void 0) return input;
+  return input;
+};
+var normalizeReadInput = (input) => {
+  if (input.file_path !== void 0) return input;
+  if (input.path !== void 0) return { ...input, file_path: input.path };
+  if (input.filePath !== void 0) return { ...input, file_path: input.filePath };
+  return input;
+};
+var INPUT_NORMALIZERS = /* @__PURE__ */ new Map([
+  ["Skill", normalizeSkillInput],
+  ["Read", normalizeReadInput]
+]);
+function normalizeToolCall(providerKind, tc) {
+  const nativeName = tc.tool;
+  const exactKey = `${providerKind}::${nativeName}`;
+  const canonical = TOOL_NAME_MAP.get(exactKey);
+  if (canonical) {
+    return applyInputNormalization(canonical, { ...tc, tool: canonical });
+  }
+  const prefixRules = TOOL_PREFIX_MAP.get(providerKind);
+  if (prefixRules) {
+    for (const rule of prefixRules) {
+      if (nativeName.startsWith(rule.prefix)) {
+        const suffix = nativeName.slice(rule.prefix.length);
+        let normalizedInput = tc.input;
+        if (rule.extractSkillFromName && suffix) {
+          const existingInput = tc.input ?? {};
+          normalizedInput = { ...existingInput, skill: suffix };
+        }
+        const normalized = {
+          ...tc,
+          tool: rule.canonical,
+          input: normalizedInput
+        };
+        return applyInputNormalization(rule.canonical, normalized);
+      }
+    }
+  }
+  return tc;
+}
+function applyInputNormalization(canonical, tc) {
+  const normalizer = INPUT_NORMALIZERS.get(canonical);
+  if (!normalizer || tc.input === void 0 || tc.input === null) return tc;
+  const input = tc.input;
+  const normalized = normalizer(input);
+  return normalized === input ? tc : { ...tc, input: normalized };
+}
 // src/evaluation/providers/preread.ts
 import path10 from "node:path";
 function buildPromptDocument(request, inputFiles) {
@@ -5212,11 +5392,13 @@ function extractToolCalls(content) {
     }
     const p = part;
     if (p.type === "tool_use" && typeof p.name === "string") {
-      toolCalls.push({
-        tool: p.name,
-        input: p.input,
-        id: typeof p.id === "string" ? p.id : void 0
-      });
+      toolCalls.push(
+        normalizeToolCall("claude-cli", {
+          tool: p.name,
+          input: p.input,
+          id: typeof p.id === "string" ? p.id : void 0
+        })
+      );
     }
   }
   return toolCalls;
@@ -5507,11 +5689,13 @@ function extractToolCalls2(content) {
     }
     const p = part;
     if (p.type === "tool_use" && typeof p.name === "string") {
-      toolCalls.push({
-        tool: p.name,
-        input: p.input,
-        id: typeof p.id === "string" ? p.id : void 0
-      });
+      toolCalls.push(
+        normalizeToolCall("claude-sdk", {
+          tool: p.name,
+          input: p.input,
+          id: typeof p.id === "string" ? p.id : void 0
+        })
+      );
     }
   }
   return toolCalls;
@@ -6426,27 +6610,33 @@ ${basePrompt}` : basePrompt;
       }
     }
     if (itemType === "command_execution") {
-      completedToolCalls.push({
-        tool: "command_execution",
-        input: { command: item.command },
-        output: item.aggregated_output,
-        id: item.id
-      });
+      completedToolCalls.push(
+        normalizeToolCall("codex", {
+          tool: "command_execution",
+          input: { command: item.command },
+          output: item.aggregated_output,
+          id: item.id
+        })
+      );
     }
     if (itemType === "file_change") {
-      completedToolCalls.push({
-        tool: "file_change",
-        input: item.changes,
-        id: item.id
-      });
+      completedToolCalls.push(
+        normalizeToolCall("codex", {
+          tool: "file_change",
+          input: item.changes,
+          id: item.id
+        })
+      );
     }
     if (itemType === "mcp_tool_call") {
-      completedToolCalls.push({
-        tool: `mcp:${item.server}/${item.tool}`,
-        input: item.arguments,
-        output: item.result ?? item.error,
-        id: item.id
-      });
+      completedToolCalls.push(
+        normalizeToolCall("codex", {
+          tool: `mcp:${item.server}/${item.tool}`,
+          input: item.arguments,
+          output: item.result ?? item.error,
+          id: item.id
+        })
+      );
     }
   }
   resolveCwd(cwdOverride) {
@@ -6981,12 +7171,6 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
     return logger;
   }
   handleEvent(eventType, data) {
-    if (this.format === "json") {
-      const elapsed2 = formatElapsed4(this.startedAt);
-      this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
-`);
-      return;
-    }
     if (this.chunkExtractor) {
       const chunkText = this.chunkExtractor(eventType, data);
       if (chunkText === null) {
@@ -6999,6 +7183,12 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
       }
       this.flushPendingText();
     }
+    if (this.format === "json") {
+      const elapsed2 = formatElapsed4(this.startedAt);
+      this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
+`);
+      return;
+    }
     const elapsed = formatElapsed4(this.startedAt);
     const summary = this.summarize(eventType, data);
     if (summary) {
@@ -7009,14 +7199,19 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
   flushPendingText() {
     if (!this.pendingText) return;
     const elapsed = formatElapsed4(this.startedAt);
-    this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
+    if (this.format === "json") {
+      this.stream.write(
+        `${JSON.stringify({ time: elapsed, event: "assistant_message", data: { content: this.pendingText } })}
+`
+      );
+    } else {
+      this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
 `);
+    }
     this.pendingText = "";
   }
   async close() {
-    if (this.format !== "json") {
-      this.flushPendingText();
-    }
+    this.flushPendingText();
     await new Promise((resolve, reject) => {
       this.stream.once("error", reject);
       this.stream.end(() => resolve());
@@ -7091,15 +7286,17 @@ var CopilotCliProvider = class {
           }
           if (update.status === "completed" || update.status === "failed") {
             const toolName = update.title ?? update.kind ?? "unknown";
-            completedToolCalls.push({
-              tool: toolName,
-              input: update.rawInput,
-              output: update.rawOutput,
-              id: callId,
-              startTime: (/* @__PURE__ */ new Date()).toISOString(),
-              endTime: (/* @__PURE__ */ new Date()).toISOString(),
-              durationMs: 0
-            });
+            completedToolCalls.push(
+              normalizeToolCall("copilot-cli", {
+                tool: toolName,
+                input: update.rawInput,
+                output: update.rawOutput,
+                id: callId,
+                startTime: (/* @__PURE__ */ new Date()).toISOString(),
+                endTime: (/* @__PURE__ */ new Date()).toISOString(),
+                durationMs: 0
+              })
+            );
             request.streamCallbacks?.onToolCallEnd?.(
               toolName,
               update.rawInput,
@@ -7116,15 +7313,17 @@ var CopilotCliProvider = class {
             if (inProgress) {
               toolCallsInProgress.delete(callId);
               const duration = Date.now() - inProgress.startMs;
-              completedToolCalls.push({
-                tool: inProgress.tool,
-                input: inProgress.input,
-                output: update.rawOutput,
-                id: inProgress.id,
-                startTime: inProgress.startTime,
-                endTime: (/* @__PURE__ */ new Date()).toISOString(),
-                durationMs: duration
-              });
+              completedToolCalls.push(
+                normalizeToolCall("copilot-cli", {
+                  tool: inProgress.tool,
+                  input: inProgress.input,
+                  output: update.rawOutput,
+                  id: inProgress.id,
+                  startTime: inProgress.startTime,
+                  endTime: (/* @__PURE__ */ new Date()).toISOString(),
+                  durationMs: duration
+                })
+              );
               request.streamCallbacks?.onToolCallEnd?.(
                 inProgress.tool,
                 inProgress.input,
@@ -7468,11 +7667,13 @@ function parseCopilotEvents(eventsJsonl) {
       }
       case "assistant.message": {
         const toolRequests = data.toolRequests;
-        const toolCalls = (toolRequests ?? []).map((req) => ({
-          tool: String(req.name ?? req.toolName ?? ""),
-          input: req.arguments,
-          id: req.toolCallId ? String(req.toolCallId) : void 0
-        }));
+        const toolCalls = (toolRequests ?? []).map(
+          (req) => normalizeToolCall("copilot-log", {
+            tool: String(req.name ?? req.toolName ?? ""),
+            input: req.arguments,
+            id: req.toolCallId ? String(req.toolCallId) : void 0
+          })
+        );
         messages.push({
           role: "assistant",
           content: data.content != null ? String(data.content) : void 0,
@@ -7512,12 +7713,12 @@ function parseCopilotEvents(eventsJsonl) {
           messages.push({
             role: "assistant",
             toolCalls: [
-              {
+              normalizeToolCall("copilot-log", {
                 tool: started.toolName,
                 input: started.input,
                 output: data.result,
                 id: toolCallId
-              }
+              })
             ]
           });
         }
@@ -7863,15 +8064,17 @@ var CopilotSdkProvider = class {
           if (inProgress) {
             toolCallsInProgress.delete(callId);
             const endMs = Date.now();
-            completedToolCalls.push({
-              tool: inProgress.tool,
-              input: inProgress.input,
-              output: data?.output ?? data?.result,
-              id: inProgress.id,
-              startTime: inProgress.startTime,
-              endTime: (/* @__PURE__ */ new Date()).toISOString(),
-              durationMs: endMs - inProgress.startMs
-            });
+            completedToolCalls.push(
+              normalizeToolCall("copilot-sdk", {
+                tool: inProgress.tool,
+                input: inProgress.input,
+                output: data?.output ?? data?.result,
+                id: inProgress.id,
+                startTime: inProgress.startTime,
+                endTime: (/* @__PURE__ */ new Date()).toISOString(),
+                durationMs: endMs - inProgress.startMs
+              })
+            );
           }
         }
         if (eventType === "assistant.message") {
@@ -8850,12 +9053,14 @@ function extractToolCallsFromEvents(events) {
   }
   const toolCalls = [];
   for (const [id, { tool: tool2, input }] of starts) {
-    toolCalls.push({
-      tool: tool2,
-      input,
-      id: id.startsWith("anon-") ? void 0 : id,
-      output: results.get(id)
-    });
+    toolCalls.push(
+      normalizeToolCall("pi-cli", {
+        tool: tool2,
+        input,
+        id: id.startsWith("anon-") ? void 0 : id,
+        output: results.get(id)
+      })
+    );
   }
   return toolCalls;
 }
@@ -8977,17 +9182,21 @@ function extractToolCalls3(content) {
     if (!part || typeof part !== "object") continue;
     const p = part;
     if (p.type === "tool_use" && typeof p.name === "string") {
-      toolCalls.push({
-        tool: p.name,
-        input: p.input,
-        id: typeof p.id === "string" ? p.id : void 0
-      });
+      toolCalls.push(
+        normalizeToolCall("pi-cli", {
+          tool: p.name,
+          input: p.input,
+          id: typeof p.id === "string" ? p.id : void 0
+        })
+      );
     } else if ((p.type === "toolCall" || p.type === "tool_call") && typeof p.name === "string") {
-      toolCalls.push({
-        tool: p.name,
-        input: p.arguments ?? p.input,
-        id: typeof p.id === "string" ? p.id : void 0
-      });
+      toolCalls.push(
+        normalizeToolCall("pi-cli", {
+          tool: p.name,
+          input: p.arguments ?? p.input,
+          id: typeof p.id === "string" ? p.id : void 0
+        })
+      );
     } else if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
       const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
       if (existing) {
@@ -14066,100 +14275,35 @@ var LatencyEvaluator = class {
 };
 // src/evaluation/evaluators/skill-trigger.ts
-var CLAUDE_MATCHER = {
-  skillTools: ["Skill"],
-  skillInputField: "skill",
-  readTools: ["Read"],
-  readInputField: "file_path"
-};
-var COPILOT_MATCHER = {
-  skillTools: ["Skill", "skill"],
-  skillInputField: "skill",
-  readTools: ["Read File", "readFile", "Read", "readTextFile"],
-  readInputField: "file_path",
-  skillToolPrefixes: ["Using skill: "],
-  readToolPrefixes: ["Viewing "],
-  readInputFields: ["file_path", "path"]
-};
-var PI_CODING_AGENT_MATCHER = {
-  skillTools: [],
-  skillInputField: "skill",
-  readTools: ["read"],
-  readInputField: "path",
-  readInputFields: ["path", "file_path", "filePath"]
-};
-var CODEX_MATCHER = {
-  skillTools: [],
-  skillInputField: "skill",
-  readTools: ["command_execution"],
-  readInputField: "command",
-  skillToolPrefixes: ["mcp:"],
-  readToolPrefixes: ["mcp:"],
-  readInputFields: ["command", "path", "file_path", "filePath"]
-};
-var PROVIDER_TOOL_SEMANTICS = {
-  claude: CLAUDE_MATCHER,
-  "claude-cli": CLAUDE_MATCHER,
-  "claude-sdk": CLAUDE_MATCHER,
-  codex: CODEX_MATCHER,
-  "pi-coding-agent": PI_CODING_AGENT_MATCHER,
-  "pi-cli": PI_CODING_AGENT_MATCHER,
-  "copilot-cli": COPILOT_MATCHER,
-  "copilot-log": COPILOT_MATCHER,
-  "copilot-sdk": COPILOT_MATCHER,
-  vscode: COPILOT_MATCHER,
-  "vscode-insiders": COPILOT_MATCHER
-};
 var SkillTriggerEvaluator = class {
   kind = "skill-trigger";
   config;
   constructor(config) {
     this.config = config;
   }
-  resolveMatcher(providerKind) {
-    if (providerKind) {
-      const match = PROVIDER_TOOL_SEMANTICS[providerKind];
-      if (match) return match;
-    }
-    return CLAUDE_MATCHER;
-  }
   evaluate(context) {
     const skillName = this.config.skill;
     const shouldTrigger = this.config.should_trigger !== false;
-    const providerKind = context.provider?.kind;
-    const matcher = this.resolveMatcher(providerKind);
     const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
     let triggered = false;
     let evidence = "";
     for (const toolCall of allToolCalls) {
       const toolName = toolCall.tool ?? "";
       const input = toolCall.input ?? {};
-      if (matcher.skillTools.includes(toolName)) {
-        const skillArg = String(input[matcher.skillInputField] ?? "");
+      if (toolName === "Skill") {
+        const skillArg = String(input.skill ?? "");
         if (skillArg.includes(skillName)) {
           triggered = true;
-          evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
+          evidence = `Skill tool invoked with skill="${skillArg}"`;
           break;
         }
-      } else if (matcher.skillToolPrefixes?.some(
-        (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
-      )) {
-        triggered = true;
-        evidence = `Skill tool invoked via tool name "${toolName}"`;
-        break;
-      } else if (matcher.readTools.includes(toolName)) {
-        const filePath = this.readPathFromInput(input, matcher);
-        if (filePath.includes(skillName)) {
+      } else if (toolName === "Read") {
+        const filePath = String(input.file_path ?? "");
+        if (filePath.includes(`skills/${skillName}/`)) {
           triggered = true;
           evidence = `Read tool loaded skill file: ${filePath}`;
           break;
         }
-      } else if (matcher.readToolPrefixes?.some(
-        (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
-      )) {
-        triggered = true;
-        evidence = `Read tool loaded skill file via tool name "${toolName}"`;
-        break;
       }
       if (!triggered && toolCall.output != null) {
         const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
@@ -14196,16 +14340,6 @@ var SkillTriggerEvaluator = class {
       expectedAspectCount: 1
     };
   }
-  readPathFromInput(input, matcher) {
-    const fields = matcher.readInputFields ?? [matcher.readInputField];
-    for (const field of fields) {
-      const value = input[field];
-      if (value !== void 0 && value !== null) {
-        return String(value);
-      }
-    }
-    return "";
-  }
 };
 // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -15050,10 +15184,12 @@ function runEqualsAssertion(output, value) {
 }
 // src/evaluation/orchestrator.ts
+import { execFile as execFile3 } from "node:child_process";
 import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
 import { existsSync as existsSync5 } from "node:fs";
 import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat9 } from "node:fs/promises";
 import path45 from "node:path";
+import { promisify as promisify7 } from "node:util";
 import micromatch3 from "micromatch";
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -16507,6 +16643,8 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
 }
 // src/evaluation/orchestrator.ts
+var execFileAsync3 = promisify7(execFile3);
+var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
 function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
   return score >= threshold ? "ok" : "quality_failure";
 }
@@ -16544,6 +16682,35 @@ function hasHookCommand(hook) {
 function hooksEnabled(workspace) {
   return workspace?.hooks?.enabled !== false;
 }
+function workspaceGitEnv() {
+  const env = { ...process.env };
+  for (const key of Object.keys(env)) {
+    if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
+      delete env[key];
+    }
+  }
+  return {
+    ...env,
+    GIT_TERMINAL_PROMPT: "0",
+    GIT_ASKPASS: "",
+    GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
+  };
+}
+async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
+  if (!existsSync5(path45.join(workspacePath, ".git"))) {
+    return false;
+  }
+  const cleanFlag = resetMode === "strict" ? "-fdx" : "-fd";
+  const opts = {
+    cwd: workspacePath,
+    timeout: WORKSPACE_GIT_TIMEOUT_MS,
+    env: workspaceGitEnv(),
+    maxBuffer: 50 * 1024 * 1024
+  };
+  await execFileAsync3("git", ["reset", "--hard", baselineRef ?? "HEAD"], opts);
+  await execFileAsync3("git", ["clean", cleanFlag], opts);
+  return true;
+}
 function getWorkspaceTemplate(target) {
   const config = target.config;
   if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
@@ -17805,6 +17972,37 @@ async function runEvalCase(options) {
       }
     }
   }
+  let beforeEachNeedsFreshBaseline = false;
+  if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
+    try {
+      if (repoManager && evalCase.workspace.repos?.length) {
+        await repoManager.reset(
+          evalCase.workspace.repos,
+          workspacePath,
+          evalCase.workspace.hooks.before_each.reset
+        );
+      } else {
+        await resetWorkspaceRoot(
+          workspacePath,
+          evalCase.workspace.hooks.before_each.reset,
+          sharedBaselineCommit
+        );
+      }
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return buildErrorResult(
+        evalCase,
+        target.name,
+        nowFn(),
+        new Error(`before_each reset failed: ${message}`),
+        promptInputs,
+        provider,
+        "setup",
+        "script_error",
+        verbose
+      );
+    }
+  }
   const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
   if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
     const beforeEachHook = caseBeforeEachHook;
@@ -17821,6 +18019,7 @@ async function runEvalCase(options) {
         toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
         scriptContext
       );
+      beforeEachNeedsFreshBaseline = true;
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       return buildErrorResult(
@@ -17836,7 +18035,7 @@ async function runEvalCase(options) {
       );
     }
   }
-  let baselineCommit = sharedBaselineCommit;
+  let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
   if (!baselineCommit && workspacePath) {
     try {
       baselineCommit = await initializeBaseline(workspacePath);
@@ -17847,6 +18046,35 @@ async function runEvalCase(options) {
       }
     }
   }
+  if (evalCase.mode === "conversation" && evalCase.turns?.length) {
+    const conversationResult = await runConversationMode({
+      evalCase,
+      provider,
+      target,
+      evaluators,
+      typeRegistry,
+      graderProvider,
+      promptInputs,
+      nowFn,
+      signal,
+      workspacePath,
+      caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
+      agentTimeoutMs,
+      streamCallbacks: options.streamCallbacks,
+      verbose,
+      threshold: evalCase.threshold ?? caseThreshold,
+      targetResolver,
+      availableTargets
+    });
+    if (workspacePath && !isSharedWorkspace) {
+      const shouldRetain = conversationResult.executionStatus === "ok" ? retainOnSuccess === "keep" || keepWorkspaces : retainOnFailure === "keep" || !forceCleanup && !keepWorkspaces;
+      if (!shouldRetain) {
+        await cleanupWorkspace(workspacePath).catch(() => {
+        });
+      }
+    }
+    return conversationResult;
+  }
   const caseStartMs = Date.now();
   const attemptBudget = (maxRetries ?? 0) + 1;
   let attempt = 0;
@@ -17961,13 +18189,21 @@ async function runEvalCase(options) {
 ${providerFileChanges}` : providerFileChanges;
   }
   const providerError = extractProviderError(providerResponse);
-  if (caseHooksEnabled && repoManager && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none" && evalCase.workspace.repos) {
+  if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
     try {
-      await repoManager.reset(
-        evalCase.workspace.repos,
-        workspacePath,
-        evalCase.workspace.hooks.after_each.reset
-      );
+      if (repoManager && evalCase.workspace.repos?.length) {
+        await repoManager.reset(
+          evalCase.workspace.repos,
+          workspacePath,
+          evalCase.workspace.hooks.after_each.reset
+        );
+      } else {
+        await resetWorkspaceRoot(
+          workspacePath,
+          evalCase.workspace.hooks.after_each.reset,
+          baselineCommit
+        );
+      }
     } catch {
     }
   }
@@ -18583,6 +18819,276 @@ function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
     "llm-grader": llmGrader
   };
 }
+async function runConversationMode(options) { /**
+   * Runs a multi-turn ("conversation") eval case.
+   *
+   * Each entry of `evalCase.turns` is appended to a running history as a user message and
+   * sent to `provider.invoke`; the reply extracted by `extractLastAssistantContent` is
+   * appended as the assistant message. Turns that declare `assertions` or `expected_output`
+   * are graded through `evaluateCandidate`; turns that declare neither are recorded as a
+   * pass with score 1. When `evalCase.assertions` is non-empty the whole transcript is
+   * graded once more and reported under the name "conversation". All collected scores are
+   * combined by `aggregateConversationScores` using `evalCase.aggregation`.
+   *
+   * @param {object} options - Bag destructured below. `promptInputs` is destructured but
+   *   never read in this function.
+   * @returns {Promise<object>} Result record holding the aggregated `score`, the per-turn and
+   *   conversation entries in `scores`, their flattened `assertions`, the transcript as
+   *   `output`, an `executionStatus` derived from the final score, the stringified original
+   *   `input`, and `evalRun.durationMs`.
+   */
+  const {
+    evalCase,
+    provider,
+    target,
+    evaluators,
+    typeRegistry,
+    graderProvider,
+    promptInputs,
+    nowFn,
+    signal,
+    workspacePath,
+    caseWorkspaceFile,
+    agentTimeoutMs,
+    streamCallbacks,
+    verbose,
+    threshold,
+    targetResolver,
+    availableTargets
+  } = options;
+  const turns = evalCase.turns;
+  const aggregation = evalCase.aggregation ?? "mean"; // default aggregation is the mean
+  const onTurnFailure = evalCase.on_turn_failure ?? "continue"; // only "stop" changes control flow below
+  const windowSize = evalCase.window_size; // falsy (undefined, 0) disables windowing everywhere below
+  const history = [];
+  for (const msg of evalCase.input) { // seed the transcript with the case's initial messages
+    const content = typeof msg.content === "string" ? msg.content : JSON.stringify(msg.content); // non-string content is stored as JSON text
+    history.push({ role: msg.role, content });
+  }
+  const turnScores = []; // one score entry per turn, in turn order
+  const allTurnScoreValues = []; // numeric scores kept in parallel with turnScores for aggregation
+  let stopped = false;
+  const caseStartMs = Date.now();
+  for (let i = 0; i < turns.length; i++) {
+    const turn = turns[i];
+    const turnIndex = i + 1; // turn names and ids are 1-based
+    if (stopped) { // after a stop, remaining turns are not sent to the provider; each is recorded as "skip" with score 0
+      turnScores.push({
+        name: `turn-${turnIndex}`,
+        type: "rubrics",
+        score: 0,
+        verdict: "skip",
+        assertions: [{ text: "Skipped due to previous turn failure", passed: false }]
+      });
+      allTurnScoreValues.push(0);
+      continue;
+    }
+    const userContent = typeof turn.input === "string" ? turn.input : JSON.stringify(turn.input);
+    history.push({ role: "user", content: userContent });
+    const chatPromptForProvider = windowSize ? buildWindowedHistory(history, windowSize) : [...history]; // provider receives a copy, never the live history array
+    let response;
+    try {
+      response = await provider.invoke({
+        question: userContent,
+        chatPrompt: chatPromptForProvider,
+        evalCaseId: `${evalCase.id}/turn-${turnIndex}`,
+        signal,
+        cwd: workspacePath,
+        workspaceFile: caseWorkspaceFile,
+        streamCallbacks
+      });
+    } catch (error) { // a provider failure is scored as a failed turn instead of aborting the case
+      const message = error instanceof Error ? error.message : String(error);
+      turnScores.push({
+        name: `turn-${turnIndex}`,
+        type: "rubrics",
+        score: 0,
+        verdict: "fail",
+        assertions: [{ text: `Provider error: ${message}`, passed: false }]
+      });
+      allTurnScoreValues.push(0);
+      if (onTurnFailure === "stop") stopped = true;
+      continue; // NOTE(review): the user message pushed above stays in history with no assistant reply
+    }
+    const assistantContent = extractLastAssistantContent(response.output);
+    history.push({ role: "assistant", content: assistantContent });
+    if (!turn.assertions?.length && !turn.expected_output) { // nothing to grade: the turn counts as a pass with score 1
+      turnScores.push({
+        name: `turn-${turnIndex}`,
+        type: "rubrics",
+        score: 1,
+        verdict: "pass",
+        assertions: []
+      });
+      allTurnScoreValues.push(1);
+      continue;
+    }
+    const turnAssertions = buildTurnAssertions(turn);
+    const turnEvalCase = { // per-turn copy of the case with turn-specific id, assertions, input and expected_output
+      ...evalCase,
+      id: `${evalCase.id}/turn-${turnIndex}`,
+      assertions: turnAssertions,
+      input: buildTurnGraderInput(history, windowSize),
+      expected_output: turn.expected_output ? [
+        typeof turn.expected_output === "string" ? { content: turn.expected_output } : turn.expected_output
+      ] : [],
+      // Clear conversation fields to prevent recursion
+      mode: void 0,
+      turns: void 0
+    };
+    const turnResult = await evaluateCandidate({
+      evalCase: turnEvalCase,
+      candidate: assistantContent,
+      target,
+      provider,
+      evaluators,
+      typeRegistry,
+      promptInputs: {
+        question: buildConversationContext(history, windowSize),
+        chatPrompt: windowSize ? buildWindowedHistory(history, windowSize) : [...history]
+      },
+      nowFn,
+      attempt: 0,
+      graderProvider,
+      agentTimeoutMs,
+      output: response.output,
+      verbose,
+      threshold,
+      targetResolver,
+      availableTargets
+    });
+    const turnScore = turnResult.score;
+    allTurnScoreValues.push(turnScore);
+    turnScores.push({
+      name: `turn-${turnIndex}`,
+      type: "rubrics",
+      score: turnScore,
+      verdict: scoreToVerdict(turnScore, threshold ?? DEFAULT_THRESHOLD),
+      assertions: turnResult.assertions ? [...turnResult.assertions] : [],
+      scores: turnResult.scores
+    });
+    if (onTurnFailure === "stop" && turnScore < (threshold ?? DEFAULT_THRESHOLD)) { // a below-threshold turn halts the remaining turns in "stop" mode
+      stopped = true;
+    }
+  }
+  let conversationScores = [];
+  if (evalCase.assertions?.length) { // case-level assertions are graded once against the full, unwindowed transcript
+    const conversationEvalCase = {
+      ...evalCase,
+      id: `${evalCase.id}/conversation`,
+      input: history.map((m) => ({
+        role: m.role,
+        content: m.content
+      })),
+      expected_output: [],
+      mode: void 0,
+      turns: void 0
+    };
+    const fullTranscript = history.map((m) => {
+      const content = typeof m.content === "string" ? m.content : JSON.stringify(m.content);
+      return `${m.role}: ${content}`;
+    }).join("\n\n");
+    const conversationResult = await evaluateCandidate({
+      evalCase: conversationEvalCase,
+      candidate: fullTranscript,
+      target,
+      provider,
+      evaluators,
+      typeRegistry,
+      promptInputs: {
+        question: fullTranscript,
+        chatPrompt: [...history]
+      },
+      nowFn,
+      attempt: 0,
+      graderProvider,
+      agentTimeoutMs,
+      verbose,
+      threshold,
+      targetResolver,
+      availableTargets
+    });
+    conversationScores = [
+      {
+        name: "conversation",
+        type: "rubrics",
+        score: conversationResult.score,
+        verdict: scoreToVerdict(
+          conversationResult.score,
+          threshold ?? DEFAULT_THRESHOLD
+        ),
+        assertions: conversationResult.assertions ? [...conversationResult.assertions] : [],
+        scores: conversationResult.scores
+      }
+    ];
+  }
+  const allScoreValues = [...allTurnScoreValues, ...conversationScores.map((s) => s.score)]; // the conversation score is aggregated as one more value alongside the turn scores
+  const finalScore = aggregateConversationScores(allScoreValues, aggregation);
+  const allResultScores = [...turnScores, ...conversationScores];
+  const outputMessages = history.map((m) => ({
+    role: m.role,
+    content: m.content
+  }));
+  const flatAssertions = allResultScores.flatMap((s) => [...s.assertions]);
+  const totalDurationMs = Date.now() - caseStartMs;
+  return {
+    timestamp: nowFn().toISOString(),
+    testId: evalCase.id,
+    suite: evalCase.suite,
+    category: evalCase.category,
+    score: finalScore,
+    assertions: flatAssertions,
+    target: target.name,
+    output: outputMessages, // the whole transcript, including the seeded input messages
+    scores: allResultScores,
+    executionStatus: classifyQualityStatus(finalScore, threshold ?? DEFAULT_THRESHOLD),
+    input: evalCase.input.map((m) => ({
+      role: m.role,
+      content: typeof m.content === "string" ? m.content : JSON.stringify(m.content)
+    })),
+    evalRun: { durationMs: totalDurationMs }
+  };
+}
+function buildWindowedHistory(history, windowSize) {
+  const systemMessages = history.filter((m) => m.role === "system");
+  const nonSystem = history.filter((m) => m.role !== "system");
+  const windowed = nonSystem.slice(-windowSize * 2);
+  return [...systemMessages, ...windowed];
+}
+function buildConversationContext(history, windowSize) {
+  const msgs = windowSize ? buildWindowedHistory(history, windowSize) : history;
+  return msgs.map((m) => {
+    const content = typeof m.content === "string" ? m.content : JSON.stringify(m.content);
+    return `${m.role}: ${content}`;
+  }).join("\n\n");
+}
+function buildTurnGraderInput(history, windowSize) {
+  const msgs = windowSize ? buildWindowedHistory(history, windowSize) : history;
+  return msgs.map((m) => ({
+    role: m.role,
+    content: m.content
+  }));
+}
+function buildTurnAssertions(turn) {
+  if (!turn.assertions?.length) return [];
+  const stringCriteria = [];
+  const structured = [];
+  for (const a of turn.assertions) {
+    if (typeof a === "string") {
+      stringCriteria.push(a);
+    } else {
+      structured.push(a);
+    }
+  }
+  const result = [];
+  if (stringCriteria.length > 0) {
+    result.push({
+      name: "turn-rubrics",
+      type: "llm-grader",
+      rubrics: stringCriteria.map((text, idx) => ({
+        id: `criterion-${idx + 1}`,
+        outcome: text,
+        weight: 1
+      }))
+    });
+  }
+  result.push(...structured);
+  return result;
+}
+function aggregateConversationScores(scores, aggregation) {
+  if (scores.length === 0) return 1;
+  switch (aggregation) {
+    case "min":
+      return Math.min(...scores);
+    case "max":
+      return Math.max(...scores);
+    default:
+      return scores.reduce((sum, s) => sum + s, 0) / scores.length;
+  }
+}
 async function invokeProvider(provider, options) {
   const {
     evalCase,
@@ -19299,13 +19805,13 @@ function shouldSkipCacheForTemperature(targetConfig) {
 }
 // src/evaluation/results-repo.ts
-import { execFile as execFile3 } from "node:child_process";
+import { execFile as execFile4 } from "node:child_process";
 import { existsSync as existsSync7, mkdirSync as mkdirSync2, readFileSync as readFileSync3, rmSync, writeFileSync } from "node:fs";
 import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir9, rm as rm6, stat as stat10 } from "node:fs/promises";
 import os3 from "node:os";
 import path49 from "node:path";
-import { promisify as promisify7 } from "node:util";
-var execFileAsync3 = promisify7(execFile3);
+import { promisify as promisify8 } from "node:util";
+var execFileAsync4 = promisify8(execFile4);
 /**
  * Turns a repository identifier into a slug: surrounding whitespace is trimmed, then
  * every run of characters other than ASCII letters, digits, ".", "_" and "-" is
  * replaced by a single "-".
  *
  * @param {string} repo - Repository identifier.
  * @returns {string} The sanitized slug.
  */
 function sanitizeRepoSlug(repo) {
   const trimmed = repo.trim();
   const slug = trimmed.replace(/[^A-Za-z0-9._-]+/g, "-");
   return slug;
 }
@@ -19356,7 +19862,7 @@ function writePersistedStatus(statusFile, status) {
 }
 async function runCommand(executable, args, options) {
   try {
-    const { stdout, stderr } = await execFileAsync3(executable, [...args], {
+    const { stdout, stderr } = await execFileAsync4(executable, [...args], {
       cwd: options?.cwd,
       env: process.env
     });
@@ -20404,11 +20910,13 @@ function extractAssistantContent(content) {
         break;
       case "tool_use":
         if (block.name) {
-          toolCalls.push({
-            tool: block.name,
-            input: block.input,
-            id: block.id
-          });
+          toolCalls.push(
+            normalizeToolCall("claude", {
+              tool: block.name,
+              input: block.input,
+              id: block.id
+            })
+          );
         }
         break;
     }
@@ -20500,7 +21008,11 @@ function parseCodexSession(jsonl) {
             } else {
               input = payload.arguments;
             }
-            const toolCall = { tool: toolName, input, id: callId };
+            const toolCall = normalizeToolCall("codex", {
+              tool: toolName,
+              input,
+              id: callId
+            });
             const msgIdx = messages.length;
             messages.push({
               role: "assistant",
@@ -20524,7 +21036,11 @@ function parseCodexSession(jsonl) {
             } else {
               input = payload.arguments;
             }
-            const toolCall = { tool: toolName, input, id: callId };
+            const toolCall = normalizeToolCall("codex", {
+              tool: toolName,
+              input,
+              id: callId
+            });
             const msgIdx = messages.length;
             messages.push({
               role: "assistant",