npm - @agentv/core - Versions diffs - 2.1.1 → 2.5.1 - Mend

@agentv/core 2.1.1 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +77 -77
package/dist/{chunk-KDEP4I7G.js → chunk-RP3M7COZ.js} +1 -1
package/dist/{chunk-KDEP4I7G.js.map → chunk-RP3M7COZ.js.map} +1 -1
package/dist/evaluation/validation/index.cjs +38 -4
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +39 -5
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +1070 -281
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +82 -7
package/dist/index.d.ts +82 -7
package/dist/index.js +1018 -230
package/dist/index.js.map +1 -1
package/package.json +1 -1

package/dist/index.cjs CHANGED Viewed

@@ -53,6 +53,7 @@ __export(index_exports, {
   createAgentKernel: () => createAgentKernel,
   createProvider: () => createProvider,
   deepEqual: () => deepEqual,
+  detectFormat: () => detectFormat,
   ensureVSCodeSubagents: () => ensureVSCodeSubagents,
   executeScript: () => executeScript,
   explorationRatio: () => explorationRatio,
@@ -226,9 +227,10 @@ function mergeExecutionMetrics(summary, metrics) {
 }
 // src/evaluation/yaml-parser.ts
-var import_promises6 = require("fs/promises");
-var import_node_path6 = __toESM(require("path"), 1);
-var import_yaml2 = require("yaml");
+var import_promises7 = require("fs/promises");
+var import_node_path7 = __toESM(require("path"), 1);
+var import_micromatch3 = __toESM(require("micromatch"), 1);
+var import_yaml3 = require("yaml");
 // src/evaluation/loaders/config-loader.ts
 var import_promises2 = require("fs/promises");
@@ -542,11 +544,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           );
         }
       }
-      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
-      const config = {};
+      const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
+      const config2 = {};
       for (const [key, value] of Object.entries(rawEvaluator)) {
-        if (!knownProps.has(key) && value !== void 0) {
-          config[key] = value;
+        if (!knownProps2.has(key) && value !== void 0) {
+          config2[key] = value;
         }
       }
       evaluators.push({
@@ -556,7 +558,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         cwd,
         resolvedCwd,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
-        ...Object.keys(config).length > 0 ? { config } : {},
+        ...Object.keys(config2).length > 0 ? { config: config2 } : {},
         ...targetConfig !== void 0 ? { target: targetConfig } : {}
       });
       continue;
@@ -721,7 +723,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const config = {
+      const config2 = {
         name,
         type: "tool_trajectory",
         mode,
@@ -729,7 +731,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         ...expected ? { expected } : {},
         ...weight2 !== void 0 ? { weight: weight2 } : {}
       };
-      evaluators.push(config);
+      evaluators.push(config2);
       continue;
     }
     if (typeValue === "field_accuracy") {
@@ -866,9 +868,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    const prompt = asString(rawEvaluator.prompt);
+    const rawPrompt = rawEvaluator.prompt;
+    let prompt;
     let promptPath;
-    if (prompt) {
+    let resolvedPromptScript;
+    let promptScriptConfig;
+    if (isJsonObject2(rawPrompt)) {
+      const scriptArray = asStringArray(
+        rawPrompt.script,
+        `prompt.script for evaluator '${name}' in '${evalId}'`
+      );
+      if (!scriptArray) {
+        throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
+      }
+      const scriptPath = scriptArray[scriptArray.length - 1];
+      const resolved = await resolveFileReference(scriptPath, searchRoots);
+      if (resolved.resolvedPath) {
+        resolvedPromptScript = [...scriptArray.slice(0, -1), import_node_path3.default.resolve(resolved.resolvedPath)];
+      } else {
+        throw new Error(
+          `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
+        );
+      }
+      if (isJsonObject2(rawPrompt.config)) {
+        promptScriptConfig = rawPrompt.config;
+      }
+    } else if (typeof rawPrompt === "string") {
+      prompt = rawPrompt;
       const resolved = await resolveFileReference(prompt, searchRoots);
       if (resolved.resolvedPath) {
         promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
@@ -887,12 +913,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
     }
     const _model = asString(rawEvaluator.model);
     const rawRubrics = rawEvaluator.rubrics;
-    const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-      id: asString(rubric.id) ?? `rubric-${index + 1}`,
-      description: asString(rubric.description) ?? "",
-      weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-      required: typeof rubric.required === "boolean" ? rubric.required : true
-    })).filter((r) => r.description.length > 0) : void 0;
+    const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
     if (typeValue === "rubric") {
       if (!parsedRubrics) {
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
@@ -912,13 +933,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       continue;
     }
     const weight = validateWeight(rawEvaluator.weight, name, evalId);
+    const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
+    const config = {};
+    for (const [key, value] of Object.entries(rawEvaluator)) {
+      if (!knownProps.has(key) && value !== void 0) {
+        config[key] = value;
+      }
+    }
+    const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
+    const mergedConfig = { ...config, ...topLevelConfig };
+    const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
     evaluators.push({
       name,
       type: "llm_judge",
       prompt,
       promptPath,
+      ...promptPath ? { resolvedPromptPath: promptPath } : {},
+      ...resolvedPromptScript ? { resolvedPromptScript } : {},
       ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
-      ...weight !== void 0 ? { weight } : {}
+      ...weight !== void 0 ? { weight } : {},
+      ...finalConfig ? { config: finalConfig } : {}
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
@@ -1005,6 +1039,191 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
 function isValidFieldAggregationType(value) {
   return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
 }
+/**
+ * Normalizes the raw `rubrics` array of an evaluator into a list of rubric items.
+ *
+ * Two shapes are produced:
+ *  - entries that declare `score_ranges` keep the validated ranges;
+ *    `expected_outcome` and `required` are only emitted when present;
+ *  - entries without `score_ranges` need a non-empty `expected_outcome`
+ *    (the `description` key is accepted as a fallback) and default to
+ *    `required: true`.
+ *
+ * Non-object entries and entries without an outcome are skipped with a warning.
+ *
+ * @param {unknown[]} rawRubrics - Raw rubric entries from the eval file.
+ * @param {string} evaluatorName - Evaluator name, used in diagnostics.
+ * @param {string} evalId - Eval case id, used in diagnostics.
+ * @returns {object[] | undefined} Parsed rubric items, or `undefined` when none survive.
+ * @throws {Error} When a numeric `required_min_score` is not an integer in 0-10,
+ *   when `score_ranges` is present but not an array, or when `parseScoreRanges`
+ *   rejects the ranges.
+ */
+function parseRubricItems(rawRubrics, evaluatorName, evalId) {
+  const items = [];
+  for (const [index, rawRubric] of rawRubrics.entries()) {
+    if (!isJsonObject2(rawRubric)) {
+      // Include the eval case id like every other diagnostic in this function,
+      // otherwise the warning cannot be traced back to a case.
+      logWarning2(
+        `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' in '${evalId}' (expected object)`
+      );
+      continue;
+    }
+    const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
+    const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
+    const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
+    let requiredMinScore;
+    let required;
+    // Non-numeric required_min_score values are ignored rather than rejected.
+    if (typeof rawRubric.required_min_score === "number") {
+      const minScore = rawRubric.required_min_score;
+      if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
+        throw new Error(
+          `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
+        );
+      }
+      requiredMinScore = minScore;
+    }
+    if (typeof rawRubric.required === "boolean") {
+      required = rawRubric.required;
+    }
+    const rawScoreRanges = rawRubric.score_ranges;
+    if (rawScoreRanges !== void 0) {
+      if (!Array.isArray(rawScoreRanges)) {
+        throw new Error(
+          `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
+        );
+      }
+      const scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
+      // Range-scored rubrics may omit the rubric-level outcome: each range
+      // carries its own expected_outcome.
+      items.push({
+        id,
+        weight,
+        ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
+        ...required !== void 0 ? { required } : {},
+        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
+        score_ranges: scoreRanges
+      });
+    } else {
+      if (expectedOutcome.length === 0) {
+        logWarning2(
+          `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
+        );
+        continue;
+      }
+      items.push({
+        id,
+        expected_outcome: expectedOutcome,
+        weight,
+        // Default to required: true if not specified (backward compatibility)
+        required: required ?? true,
+        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
+      });
+    }
+  }
+  return items.length > 0 ? items : void 0;
+}
+/**
+ * Validates the `score_ranges` entries of a single rubric.
+ *
+ * Every entry must be an object whose `score_range` is an integer pair
+ * `[min, max]` with `0 <= min <= max <= 10` and which carries a non-empty
+ * `expected_outcome` (the `description` key is accepted as a fallback).
+ * Taken together the ranges must not overlap and must cover every integer
+ * from 0 to 10.
+ *
+ * @param {unknown[]} rawRanges - Raw `score_ranges` array.
+ * @param {string} rubricId - Rubric id, used in diagnostics.
+ * @param {string} evaluatorName - Evaluator name, used in diagnostics.
+ * @param {string} evalId - Eval case id, used in diagnostics.
+ * @returns {{score_range: number[], expected_outcome: string}[]} The validated
+ *   ranges, in their original input order.
+ * @throws {Error} On the first rule violated.
+ */
+function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
+  // Shared tail of every diagnostic raised below.
+  const where = `rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`;
+  const ranges = [];
+  for (let index = 0; index < rawRanges.length; index += 1) {
+    const rawRange = rawRanges[index];
+    if (!isJsonObject2(rawRange)) {
+      throw new Error(`Invalid score_range entry at index ${index} for ${where}: expected object`);
+    }
+    const invalidAt = `Invalid score_range at index ${index} for ${where}`;
+    const bounds = rawRange.score_range;
+    const isNumberPair = Array.isArray(bounds) && bounds.length === 2 && typeof bounds[0] === "number" && typeof bounds[1] === "number";
+    if (!isNumberPair) {
+      throw new Error(`${invalidAt}: must be [min, max] array of two numbers`);
+    }
+    const low = bounds[0];
+    const high = bounds[1];
+    if (!(Number.isInteger(low) && Number.isInteger(high))) {
+      throw new Error(`${invalidAt}: values must be integers (got [${low}, ${high}])`);
+    }
+    if ([low, high].some((bound) => bound < 0 || bound > 10)) {
+      throw new Error(`${invalidAt}: values must be 0-10 (got [${low}, ${high}])`);
+    }
+    if (low > high) {
+      throw new Error(`${invalidAt}: min must be <= max (got [${low}, ${high}])`);
+    }
+    const outcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
+    if (outcome.length === 0) {
+      throw new Error(`Missing expected_outcome for score_range [${low}, ${high}] in ${where}`);
+    }
+    ranges.push({ score_range: [low, high], expected_outcome: outcome });
+  }
+  // Overlap detection works on a copy ordered by lower bound, so the returned
+  // list keeps the caller's ordering.
+  const ordered = ranges.slice().sort((left, right) => left.score_range[0] - right.score_range[0]);
+  for (let next = 1; next < ordered.length; next += 1) {
+    const [prevLow, prevHigh] = ordered[next - 1].score_range;
+    const [currLow, currHigh] = ordered[next].score_range;
+    if (currLow <= prevHigh) {
+      throw new Error(
+        `Overlapping score_ranges in ${where}: [${prevLow}, ${prevHigh}] overlaps with [${currLow}, ${currHigh}]`
+      );
+    }
+  }
+  // Every integer score 0..10 has to fall inside some range.
+  const covered = /* @__PURE__ */ new Set();
+  ranges.forEach(({ score_range: [from, to] }) => {
+    for (let score = from; score <= to; score += 1) {
+      covered.add(score);
+    }
+  });
+  const missing = Array.from({ length: 11 }, (_, score) => score).filter((score) => !covered.has(score));
+  if (missing.length > 0) {
+    throw new Error(
+      `Incomplete score_ranges coverage in ${where}: missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
+    );
+  }
+  return ranges;
+}
+/**
+ * Turns a case-level `rubrics` array into a single `llm_judge` evaluator
+ * named "rubric".
+ *
+ * Entries may be plain strings (treated as a required outcome with weight 1)
+ * or objects. Other entry types are dropped. Generated ids (`rubric-N`) are
+ * numbered by position among the accepted entries. This function is lenient:
+ * it does not throw, a non-array `score_range` falls back to `[0, 10]`, and
+ * range bounds are not checked.
+ *
+ * @param {unknown[]} rawRubrics - Raw inline rubric entries.
+ * @returns {{name: string, type: string, rubrics: object[]} | undefined}
+ *   The evaluator definition, or `undefined` when no usable rubric remains.
+ */
+function parseInlineRubrics(rawRubrics) {
+  const accepted = rawRubrics.filter((entry) => isJsonObject2(entry) || typeof entry === "string");
+  const rubricItems = [];
+  accepted.forEach((entry, position) => {
+    const generatedId = `rubric-${position + 1}`;
+    let item;
+    if (typeof entry === "string") {
+      item = { id: generatedId, expected_outcome: entry, weight: 1, required: true };
+    } else {
+      const outcome = asString(entry.expected_outcome) ?? asString(entry.description) ?? "";
+      const declaredRanges = entry.score_ranges;
+      let ranges;
+      if (Array.isArray(declaredRanges) && declaredRanges.length > 0) {
+        // Keep only object entries that end up with a non-empty outcome.
+        ranges = declaredRanges.reduce((kept, rawRange) => {
+          if (isJsonObject2(rawRange)) {
+            const candidate = {
+              score_range: Array.isArray(rawRange.score_range) ? rawRange.score_range : [0, 10],
+              expected_outcome: asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? ""
+            };
+            if (candidate.expected_outcome.length > 0) {
+              kept.push(candidate);
+            }
+          }
+          return kept;
+        }, []);
+      }
+      item = {
+        id: asString(entry.id) ?? generatedId,
+        weight: typeof entry.weight === "number" ? entry.weight : 1
+      };
+      const hasMinScore = typeof entry.required_min_score === "number";
+      if (ranges && ranges.length > 0) {
+        // Range-scored rubric: outcome and required are optional.
+        if (outcome.length > 0) {
+          item.expected_outcome = outcome;
+        }
+        if (typeof entry.required === "boolean") {
+          item.required = entry.required;
+        }
+        if (hasMinScore) {
+          item.required_min_score = entry.required_min_score;
+        }
+        item.score_ranges = ranges;
+      } else {
+        // Checklist rubric: required defaults to true.
+        item.expected_outcome = outcome;
+        item.required = typeof entry.required === "boolean" ? entry.required : true;
+        if (hasMinScore) {
+          item.required_min_score = entry.required_min_score;
+        }
+      }
+    }
+    // A rubric is usable when it states an outcome or carries score ranges.
+    if (item.expected_outcome && item.expected_outcome.length > 0 || "score_ranges" in item && item.score_ranges) {
+      rubricItems.push(item);
+    }
+  });
+  if (rubricItems.length === 0) {
+    return void 0;
+  }
+  return { name: "rubric", type: "llm_judge", rubrics: rubricItems };
+}
+// src/evaluation/loaders/jsonl-parser.ts
+var import_promises5 = require("fs/promises");
+var import_node_path5 = __toESM(require("path"), 1);
+var import_micromatch2 = __toESM(require("micromatch"), 1);
+var import_yaml2 = require("yaml");
 // src/evaluation/loaders/message-processor.ts
 var import_promises4 = require("fs/promises");
@@ -1266,28 +1485,302 @@ async function processExpectedMessages(options) {
   return segments;
 }
-// src/evaluation/formatting/prompt-builder.ts
-var import_promises5 = require("fs/promises");
-var import_node_path5 = __toESM(require("path"), 1);
+// src/evaluation/loaders/shorthand-expansion.ts
+/**
+ * Expands the `input` shorthand of an eval case into a message list.
+ *
+ * A string becomes a single user message. An array is filtered down to the
+ * entries accepted by `isTestMessage`. Anything else yields `undefined`.
+ *
+ * @param {unknown} value - Raw `input` value.
+ * @returns {object[] | undefined} Messages, or `undefined` when none result.
+ */
+function expandInputShorthand(value) {
+  if (typeof value === "string") {
+    return [{ role: "user", content: value }];
+  }
+  if (!Array.isArray(value)) {
+    // Covers null/undefined as well as unsupported types.
+    return void 0;
+  }
+  const valid = value.filter((candidate) => isTestMessage(candidate));
+  return valid.length === 0 ? void 0 : valid;
+}
+/**
+ * Expands the `expected_output` shorthand of an eval case into a message list.
+ *
+ * - string: one assistant message with that text;
+ * - array whose first element is an object with a `role` key: treated as a
+ *   message list and filtered with `isTestMessage`;
+ * - any other array: wrapped as the content of one assistant message;
+ * - object with a `role` key: used as the only message if `isTestMessage`
+ *   accepts it, otherwise `undefined`;
+ * - any other object: wrapped as the content of one assistant message.
+ *
+ * @param {unknown} value - Raw `expected_output` value.
+ * @returns {object[] | undefined} Messages, or `undefined` when none result.
+ */
+function expandExpectedOutputShorthand(value) {
+  const asAssistantTurn = (content) => [{ role: "assistant", content }];
+  if (value == null) {
+    return void 0;
+  }
+  if (typeof value === "string") {
+    return asAssistantTurn(value);
+  }
+  if (Array.isArray(value)) {
+    const looksLikeMessages = value.length > 0 && isJsonObject(value[0]) && "role" in value[0];
+    if (!looksLikeMessages) {
+      return asAssistantTurn(value);
+    }
+    const valid = value.filter((candidate) => isTestMessage(candidate));
+    return valid.length === 0 ? void 0 : valid;
+  }
+  if (!isJsonObject(value)) {
+    return void 0;
+  }
+  if (!("role" in value)) {
+    return asAssistantTurn(value);
+  }
+  return isTestMessage(value) ? [value] : void 0;
+}
+/**
+ * Picks the input messages of a raw eval case.
+ *
+ * An explicit `input_messages` key takes precedence: when it is defined, the
+ * `input` shorthand is never consulted, even if `input_messages` is not an
+ * array or contains no valid message.
+ *
+ * @param {object} raw - Raw eval case.
+ * @returns {object[] | undefined} Valid messages, or `undefined` when none.
+ */
+function resolveInputMessages(raw) {
+  const explicit = raw.input_messages;
+  if (explicit === void 0) {
+    return expandInputShorthand(raw.input);
+  }
+  if (!Array.isArray(explicit)) {
+    return void 0;
+  }
+  const valid = explicit.filter((candidate) => isTestMessage(candidate));
+  return valid.length === 0 ? void 0 : valid;
+}
+/**
+ * Picks the expected messages of a raw eval case.
+ *
+ * An explicit `expected_messages` key takes precedence: when it is defined,
+ * the `expected_output` shorthand is never consulted, even if
+ * `expected_messages` is not an array or contains no valid message.
+ *
+ * @param {object} raw - Raw eval case.
+ * @returns {object[] | undefined} Valid messages, or `undefined` when none.
+ */
+function resolveExpectedMessages(raw) {
+  const explicit = raw.expected_messages;
+  if (explicit === void 0) {
+    return expandExpectedOutputShorthand(raw.expected_output);
+  }
+  if (!Array.isArray(explicit)) {
+    return void 0;
+  }
+  const valid = explicit.filter((candidate) => isTestMessage(candidate));
+  return valid.length === 0 ? void 0 : valid;
+}
+// src/evaluation/loaders/jsonl-parser.ts
 var ANSI_YELLOW5 = "\x1B[33m";
+var ANSI_RED = "\x1B[31m";
 var ANSI_RESET5 = "\x1B[0m";
+/**
+ * Determines the eval file format from the file extension (case-insensitive).
+ *
+ * @param {string} filePath - Path of the eval file.
+ * @returns {"jsonl" | "yaml"} The detected format.
+ * @throws {Error} When the extension is not `.jsonl`, `.yaml` or `.yml`.
+ */
+function detectFormat(filePath) {
+  const extension = import_node_path5.default.extname(filePath).toLowerCase();
+  switch (extension) {
+    case ".jsonl":
+      return "jsonl";
+    case ".yaml":
+    case ".yml":
+      return "yaml";
+    default:
+      throw new Error(`Unsupported file format: '${extension}'. Supported formats: .yaml, .yml, .jsonl`);
+  }
+}
+/**
+ * Loads the optional YAML sidecar that sits next to a JSONL dataset
+ * (`<name>.jsonl` -> `<name>.yaml` in the same directory).
+ *
+ * This is best-effort: a missing, unreadable or malformed sidecar yields an
+ * empty object (with a warning) instead of an error.
+ *
+ * @param {string} jsonlPath - Path of the JSONL dataset.
+ * @param {boolean} verbose - When true, also warn if the sidecar is missing.
+ * @returns {Promise<object>} `{ description, dataset, execution, evaluator }`
+ *   read from the sidecar, or `{}`. `evaluator` is passed through unvalidated.
+ */
+async function loadSidecarMetadata(jsonlPath, verbose) {
+  const dir = import_node_path5.default.dirname(jsonlPath);
+  // detectFormat() lower-cases the extension, so "cases.JSONL" reaches this
+  // code too. Strip the extension case-insensitively so the sidecar is
+  // "cases.yaml", not "cases.JSONL.yaml".
+  const base = import_node_path5.default.basename(jsonlPath).replace(/\.jsonl$/i, "");
+  const sidecarPath = import_node_path5.default.join(dir, `${base}.yaml`);
+  if (!await fileExists(sidecarPath)) {
+    if (verbose) {
+      logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
+    }
+    return {};
+  }
+  try {
+    const content = await (0, import_promises5.readFile)(sidecarPath, "utf8");
+    const parsed = (0, import_yaml2.parse)(content);
+    if (!isJsonObject(parsed)) {
+      logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
+      return {};
+    }
+    return {
+      description: asString4(parsed.description),
+      dataset: asString4(parsed.dataset),
+      execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
+      evaluator: parsed.evaluator
+    };
+  } catch (error) {
+    // A thrown value is not guaranteed to be an Error instance.
+    const reason = error instanceof Error ? error.message : String(error);
+    logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${reason}`);
+    return {};
+  }
+}
+/**
+ * Parses JSONL text into an array of objects, one per non-blank line.
+ *
+ * Lines are trimmed before parsing and blank lines are skipped.
+ *
+ * @param {string} content - Raw file content.
+ * @param {string} filePath - Path reported in error messages.
+ * @returns {object[]} Parsed records in file order.
+ * @throws {Error} On the first line that is not valid JSON or not a JSON
+ *   object. The message carries the 1-based physical line number and the
+ *   file path.
+ */
+function parseJsonlContent(content, filePath) {
+  const cases = [];
+  let lineNumber = 0;
+  for (const rawLine of content.split("\n")) {
+    lineNumber += 1;
+    const text = rawLine.trim();
+    if (text.length === 0) {
+      continue;
+    }
+    let failure;
+    try {
+      const record = JSON.parse(text);
+      if (isJsonObject(record)) {
+        cases.push(record);
+        continue;
+      }
+      failure = "Expected JSON object";
+    } catch (error) {
+      failure = error instanceof Error ? error.message : String(error);
+    }
+    // Syntax errors and non-object values are reported in the same format.
+    throw new Error(`Line ${lineNumber}: Invalid JSON - ${failure}\n  File: ${filePath}`);
+  }
+  return cases;
+}
+/**
+ * Loads eval cases from a JSONL dataset (one JSON object per non-blank line).
+ *
+ * Dataset-level settings come from the optional YAML sidecar: dataset name
+ * (falling back to the file name without `.jsonl`, then "eval"), default
+ * evaluator (falling back to "llm_judge") and default execution block. A
+ * case-level `execution` object replaces the sidecar one; the two are not merged.
+ *
+ * Cases are skipped (with a logged error) when `id`, the expected outcome or
+ * the input messages are missing, or when their evaluators fail to parse.
+ * A case-level `rubrics` array is converted into an evaluator and placed
+ * ahead of the explicitly configured evaluators.
+ *
+ * @param {string} evalFilePath - Path of the `.jsonl` file.
+ * @param {string} repoRoot - Repository root used to resolve file references.
+ * @param {{verbose?: boolean, filter?: string}} [options] - `filter` is a
+ *   micromatch pattern applied to the case id. Cases without an id never
+ *   match an active filter. `verbose` prints a summary to stdout.
+ * @returns {Promise<object[]>} The loaded eval cases, in file order.
+ * @throws {Error} When the file cannot be read or contains an invalid line.
+ */
+async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
+  const verbose = options?.verbose ?? false;
+  const filterPattern = options?.filter;
+  const absoluteTestPath = import_node_path5.default.resolve(evalFilePath);
+  const repoRootPath = resolveToAbsolutePath(repoRoot);
+  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
+  const config = await loadConfig(absoluteTestPath, repoRootPath);
+  const guidelinePatterns = config?.guideline_patterns;
+  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
+  const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
+  const rawCases = parseJsonlContent(rawFile, evalFilePath);
+  // Strip the extension case-insensitively, like the YAML loader does for
+  // .yaml/.yml: detectFormat() also accepts ".JSONL".
+  const fallbackDataset = import_node_path5.default.basename(absoluteTestPath).replace(/\.jsonl$/i, "") || "eval";
+  const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
+  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
+  const globalExecution = sidecar.execution;
+  if (verbose) {
+    console.log(`\n[JSONL Dataset: ${evalFilePath}]`);
+    console.log(`  Cases: ${rawCases.length}`);
+    console.log(`  Dataset name: ${datasetName}`);
+    if (sidecar.description) {
+      console.log(`  Description: ${sidecar.description}`);
+    }
+  }
+  const results = [];
+  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
+    const evalcase = rawCases[lineIndex];
+    // NOTE: this is the 1-based position among parsed records.
+    // parseJsonlContent() drops blank lines, so it can be lower than the
+    // physical line number in the file.
+    const lineNumber = lineIndex + 1;
+    const id = asString4(evalcase.id);
+    if (filterPattern && (!id || !import_micromatch2.default.isMatch(id, filterPattern))) {
+      continue;
+    }
+    const conversationId = asString4(evalcase.conversation_id);
+    const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
+    const inputMessages = resolveInputMessages(evalcase);
+    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+    if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
+      logError(
+        `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
+      );
+      continue;
+    }
+    const hasExpectedMessages = expectedMessages.length > 0;
+    // Both arrays are filled in by processMessages() as it walks the input.
+    const guidelinePaths = [];
+    const inputTextParts = [];
+    const inputSegments = await processMessages({
+      messages: inputMessages,
+      searchRoots,
+      repoRootPath,
+      guidelinePatterns,
+      guidelinePaths,
+      textParts: inputTextParts,
+      messageType: "input",
+      verbose
+    });
+    const outputSegments = hasExpectedMessages ? await processExpectedMessages({
+      messages: expectedMessages,
+      searchRoots,
+      repoRootPath,
+      verbose
+    }) : [];
+    // The reference answer is taken from the last expected message: its text,
+    // else its structured content, else its tool calls.
+    let referenceAnswer = "";
+    if (outputSegments.length > 0) {
+      const lastMessage = outputSegments[outputSegments.length - 1];
+      const content = lastMessage.content;
+      const toolCalls = lastMessage.tool_calls;
+      if (typeof content === "string") {
+        referenceAnswer = content;
+      } else if (content !== void 0 && content !== null) {
+        referenceAnswer = JSON.stringify(content, null, 2);
+      } else if (toolCalls !== void 0 && toolCalls !== null) {
+        referenceAnswer = JSON.stringify(toolCalls, null, 2);
+      }
+    }
+    const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
+    const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
+    const mergedExecution = caseExecution ?? globalExecution;
+    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+    let evaluators;
+    try {
+      evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`);
+      continue;
+    }
+    const inlineRubrics = evalcase.rubrics;
+    if (Array.isArray(inlineRubrics)) {
+      const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+      if (rubricEvaluator) {
+        evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
+      }
+    }
+    const userFilePaths = [];
+    for (const segment of inputSegments) {
+      if (segment.type === "file" && typeof segment.resolvedPath === "string") {
+        userFilePaths.push(segment.resolvedPath);
+      }
+    }
+    // Resolve once; reused for guideline_paths and as the head of file_paths.
+    const resolvedGuidelinePaths = guidelinePaths.map(
+      (guidelinePath) => import_node_path5.default.resolve(guidelinePath)
+    );
+    const testCase = {
+      id,
+      dataset: datasetName,
+      conversation_id: conversationId,
+      question,
+      input_messages: inputMessages,
+      input_segments: inputSegments,
+      expected_messages: outputSegments,
+      reference_answer: referenceAnswer,
+      guideline_paths: resolvedGuidelinePaths,
+      guideline_patterns: guidelinePatterns,
+      file_paths: [...resolvedGuidelinePaths, ...userFilePaths],
+      expected_outcome: outcome,
+      evaluator: evalCaseEvaluatorKind,
+      evaluators
+    };
+    if (verbose) {
+      console.log(`\n[Eval Case: ${id}]`);
+      if (testCase.guideline_paths.length > 0) {
+        console.log(`  Guidelines used: ${testCase.guideline_paths.length}`);
+        for (const guidelinePath of testCase.guideline_paths) {
+          console.log(`    - ${guidelinePath}`);
+        }
+      } else {
+        console.log("  No guidelines found");
+      }
+    }
+    results.push(testCase);
+  }
+  return results;
+}
+/**
+ * Returns `value` unchanged when it is a string, otherwise `undefined`.
+ *
+ * @param {unknown} value - Any value.
+ * @returns {string | undefined} The string, or `undefined`.
+ */
+function asString4(value) {
+  if (typeof value !== "string") {
+    return void 0;
+  }
+  return value;
+}
+/**
+ * Prints a yellow "Warning: ..." line to stderr via `console.warn`.
+ *
+ * @param {string} message - Warning text.
+ * @param {string[]} [details] - Optional extra lines, printed below the
+ *   message inside the same colored block.
+ */
+function logWarning4(message, details) {
+  const detailSuffix = details && details.length > 0 ? `\n${details.join("\n")}` : "";
+  console.warn(`${ANSI_YELLOW5}Warning: ${message}${detailSuffix}${ANSI_RESET5}`);
+}
+/**
+ * Prints a red "Error: ..." line to stderr via `console.error`.
+ *
+ * @param {string} message - Error text.
+ * @param {string[]} [details] - Optional extra lines, printed below the
+ *   message inside the same colored block.
+ */
+function logError(message, details) {
+  const detailSuffix = details && details.length > 0 ? `\n${details.join("\n")}` : "";
+  console.error(`${ANSI_RED}Error: ${message}${detailSuffix}${ANSI_RESET5}`);
+}
+// src/evaluation/formatting/prompt-builder.ts
+var import_promises6 = require("fs/promises");
+var import_node_path6 = __toESM(require("path"), 1);
+var ANSI_YELLOW6 = "\x1B[33m";
+var ANSI_RESET6 = "\x1B[0m";
 async function buildPromptInputs(testCase, mode = "lm") {
   const guidelineParts = [];
   for (const rawPath of testCase.guideline_paths) {
-    const absolutePath = import_node_path5.default.resolve(rawPath);
+    const absolutePath = import_node_path6.default.resolve(rawPath);
     if (!await fileExists(absolutePath)) {
-      logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
+      logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
       continue;
     }
     try {
-      const content = (await (0, import_promises5.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
+      const content = (await (0, import_promises6.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
       guidelineParts.push({
         content,
         isFile: true,
-        displayPath: import_node_path5.default.basename(absolutePath)
+        displayPath: import_node_path6.default.basename(absolutePath)
       });
     } catch (error) {
-      logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
+      logWarning5(`Could not read guideline file ${absolutePath}: ${error.message}`);
     }
   }
   const guidelines = formatFileContents(guidelineParts);
@@ -1311,9 +1804,9 @@ async function buildPromptInputs(testCase, mode = "lm") {
             messageSegments.push({ type: "text", value: segment });
           }
         } else if (isJsonObject(segment)) {
-          const type = asString4(segment.type);
+          const type = asString5(segment.type);
           if (type === "file") {
-            const value = asString4(segment.value);
+            const value = asString5(segment.value);
             if (!value) continue;
             if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
               messageSegments.push({ type: "guideline_ref", path: value });
@@ -1324,7 +1817,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
               messageSegments.push({ type: "file", text: fileText, path: value });
             }
           } else if (type === "text") {
-            const textValue = asString4(segment.value);
+            const textValue = asString5(segment.value);
             if (textValue && textValue.trim().length > 0) {
               messageSegments.push({ type: "text", value: textValue });
             }
@@ -1478,22 +1971,22 @@ ${guidelineContent.trim()}`);
   }
   return chatPrompt.length > 0 ? chatPrompt : void 0;
 }
-function asString4(value) {
+function asString5(value) {
   return typeof value === "string" ? value : void 0;
 }
-function logWarning4(message) {
-  console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
+function logWarning5(message) {
+  console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
 }
 // src/evaluation/yaml-parser.ts
-var ANSI_YELLOW6 = "\x1B[33m";
-var ANSI_RED = "\x1B[31m";
-var ANSI_RESET6 = "\x1B[0m";
+var ANSI_YELLOW7 = "\x1B[33m";
+var ANSI_RED2 = "\x1B[31m";
+var ANSI_RESET7 = "\x1B[0m";
 async function readTestSuiteMetadata(testFilePath) {
   try {
-    const absolutePath = import_node_path6.default.resolve(testFilePath);
-    const content = await (0, import_promises6.readFile)(absolutePath, "utf8");
-    const parsed = (0, import_yaml2.parse)(content);
+    const absolutePath = import_node_path7.default.resolve(testFilePath);
+    const content = await (0, import_promises7.readFile)(absolutePath, "utf8");
+    const parsed = (0, import_yaml3.parse)(content);
     if (!isJsonObject(parsed)) {
       return {};
     }
@@ -1503,21 +1996,25 @@ async function readTestSuiteMetadata(testFilePath) {
   }
 }
 async function loadEvalCases(evalFilePath, repoRoot, options) {
+  const format = detectFormat(evalFilePath);
+  if (format === "jsonl") {
+    return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
+  }
   const verbose = options?.verbose ?? false;
-  const evalIdFilter = options?.evalId;
-  const absoluteTestPath = import_node_path6.default.resolve(evalFilePath);
+  const filterPattern = options?.filter;
+  const absoluteTestPath = import_node_path7.default.resolve(evalFilePath);
   const repoRootPath = resolveToAbsolutePath(repoRoot);
   const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
   const config = await loadConfig(absoluteTestPath, repoRootPath);
   const guidelinePatterns = config?.guideline_patterns;
-  const rawFile = await (0, import_promises6.readFile)(absoluteTestPath, "utf8");
-  const parsed = (0, import_yaml2.parse)(rawFile);
+  const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
+  const parsed = (0, import_yaml3.parse)(rawFile);
   if (!isJsonObject(parsed)) {
     throw new Error(`Invalid test file format: ${evalFilePath}`);
   }
   const suite = parsed;
-  const datasetNameFromSuite = asString5(suite.dataset)?.trim();
-  const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
+  const datasetNameFromSuite = asString6(suite.dataset)?.trim();
+  const fallbackDataset = import_node_path7.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
   const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
   const rawTestcases = suite.evalcases;
   if (!Array.isArray(rawTestcases)) {
@@ -1525,37 +2022,29 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
   }
   const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
   const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
-  const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
+  const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
   const results = [];
   for (const rawEvalcase of rawTestcases) {
     if (!isJsonObject(rawEvalcase)) {
-      logWarning5("Skipping invalid eval case entry (expected object)");
+      logWarning6("Skipping invalid eval case entry (expected object)");
       continue;
     }
     const evalcase = rawEvalcase;
-    const id = asString5(evalcase.id);
-    if (evalIdFilter && id !== evalIdFilter) {
+    const id = asString6(evalcase.id);
+    if (filterPattern && (!id || !import_micromatch3.default.isMatch(id, filterPattern))) {
       continue;
     }
-    const conversationId = asString5(evalcase.conversation_id);
-    const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
-    const inputMessagesValue = evalcase.input_messages;
-    const expectedMessagesValue = evalcase.expected_messages;
-    if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
-      logError(
-        `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
+    const conversationId = asString6(evalcase.conversation_id);
+    const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
+    const inputMessages = resolveInputMessages(evalcase);
+    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+    if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
+      logError2(
+        `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
       );
       continue;
     }
-    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
-    const inputMessages = inputMessagesValue.filter(
-      (msg) => isTestMessage(msg)
-    );
-    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
-    if (hasExpectedMessages && expectedMessages.length === 0) {
-      logError(`No valid expected message found for eval case: ${id}`);
-      continue;
-    }
+    const hasExpectedMessages = expectedMessages.length > 0;
     const guidelinePaths = [];
     const inputTextParts = [];
     const inputSegments = await processMessages({
@@ -1594,33 +2083,13 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
-      logError(`Skipping eval case '${id}': ${message}`);
+      logError2(`Skipping eval case '${id}': ${message}`);
       continue;
     }
     const inlineRubrics = evalcase.rubrics;
     if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
-      const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
-        if (typeof rubric === "string") {
-          return {
-            id: `rubric-${index + 1}`,
-            description: rubric,
-            weight: 1,
-            required: true
-          };
-        }
-        return {
-          id: asString5(rubric.id) ?? `rubric-${index + 1}`,
-          description: asString5(rubric.description) ?? "",
-          weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-          required: typeof rubric.required === "boolean" ? rubric.required : true
-        };
-      }).filter((r) => r.description.length > 0);
-      if (rubricItems.length > 0) {
-        const rubricEvaluator = {
-          name: "rubric",
-          type: "llm_judge",
-          rubrics: rubricItems
-        };
+      const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+      if (rubricEvaluator) {
         evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
       }
     }
@@ -1631,7 +2100,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       }
     }
     const allFilePaths = [
-      ...guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
+      ...guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
       ...userFilePaths
     ];
     const testCase = {
@@ -1643,7 +2112,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       input_segments: inputSegments,
       expected_messages: outputSegments,
       reference_answer: referenceAnswer,
-      guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
+      guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
       file_paths: allFilePaths,
       expected_outcome: outcome,
@@ -1666,35 +2135,35 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
   }
   return results;
 }
-function asString5(value) {
+function asString6(value) {
   return typeof value === "string" ? value : void 0;
 }
-function logWarning5(message, details) {
+function logWarning6(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW6}Warning: ${message}
-${detailBlock}${ANSI_RESET6}`);
+    console.warn(`${ANSI_YELLOW7}Warning: ${message}
+${detailBlock}${ANSI_RESET7}`);
   } else {
-    console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
+    console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
   }
 }
-function logError(message, details) {
+function logError2(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.error(`${ANSI_RED}Error: ${message}
-${detailBlock}${ANSI_RESET6}`);
+    console.error(`${ANSI_RED2}Error: ${message}
+${detailBlock}${ANSI_RESET7}`);
   } else {
-    console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
+    console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
   }
 }
 // src/evaluation/file-utils.ts
 var import_node_fs2 = require("fs");
-var import_promises7 = require("fs/promises");
-var import_node_path7 = __toESM(require("path"), 1);
+var import_promises8 = require("fs/promises");
+var import_node_path8 = __toESM(require("path"), 1);
 async function fileExists2(filePath) {
   try {
-    await (0, import_promises7.access)(filePath, import_node_fs2.constants.F_OK);
+    await (0, import_promises8.access)(filePath, import_node_fs2.constants.F_OK);
     return true;
   } catch {
     return false;
@@ -1704,22 +2173,22 @@ function normalizeLineEndings(content) {
   return content.replace(/\r\n/g, "\n");
 }
 async function readTextFile(filePath) {
-  const content = await (0, import_promises7.readFile)(filePath, "utf8");
+  const content = await (0, import_promises8.readFile)(filePath, "utf8");
   return normalizeLineEndings(content);
 }
 async function readJsonFile(filePath) {
-  const content = await (0, import_promises7.readFile)(filePath, "utf8");
+  const content = await (0, import_promises8.readFile)(filePath, "utf8");
   return JSON.parse(content);
 }
 async function findGitRoot(startPath) {
-  let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
-  const root = import_node_path7.default.parse(currentDir).root;
+  let currentDir = import_node_path8.default.dirname(import_node_path8.default.resolve(startPath));
+  const root = import_node_path8.default.parse(currentDir).root;
   while (currentDir !== root) {
-    const gitPath = import_node_path7.default.join(currentDir, ".git");
+    const gitPath = import_node_path8.default.join(currentDir, ".git");
     if (await fileExists2(gitPath)) {
       return currentDir;
     }
-    const parentDir = import_node_path7.default.dirname(currentDir);
+    const parentDir = import_node_path8.default.dirname(currentDir);
     if (parentDir === currentDir) {
       break;
     }
@@ -1730,8 +2199,8 @@ async function findGitRoot(startPath) {
 function buildDirectoryChain2(filePath, repoRoot) {
   const directories = [];
   const seen = /* @__PURE__ */ new Set();
-  const boundary = import_node_path7.default.resolve(repoRoot);
-  let current = import_node_path7.default.resolve(import_node_path7.default.dirname(filePath));
+  const boundary = import_node_path8.default.resolve(repoRoot);
+  let current = import_node_path8.default.resolve(import_node_path8.default.dirname(filePath));
   while (current !== void 0) {
     if (!seen.has(current)) {
       directories.push(current);
@@ -1740,7 +2209,7 @@ function buildDirectoryChain2(filePath, repoRoot) {
     if (current === boundary) {
       break;
     }
-    const parent = import_node_path7.default.dirname(current);
+    const parent = import_node_path8.default.dirname(current);
     if (parent === current) {
       break;
     }
@@ -1754,16 +2223,16 @@ function buildDirectoryChain2(filePath, repoRoot) {
 function buildSearchRoots2(evalPath, repoRoot) {
   const uniqueRoots = [];
   const addRoot = (root) => {
-    const normalized = import_node_path7.default.resolve(root);
+    const normalized = import_node_path8.default.resolve(root);
     if (!uniqueRoots.includes(normalized)) {
       uniqueRoots.push(normalized);
     }
   };
-  let currentDir = import_node_path7.default.dirname(evalPath);
+  let currentDir = import_node_path8.default.dirname(evalPath);
   let reachedBoundary = false;
   while (!reachedBoundary) {
     addRoot(currentDir);
-    const parentDir = import_node_path7.default.dirname(currentDir);
+    const parentDir = import_node_path8.default.dirname(currentDir);
     if (currentDir === repoRoot || parentDir === currentDir) {
       reachedBoundary = true;
     } else {
@@ -1781,16 +2250,16 @@ function trimLeadingSeparators2(value) {
 async function resolveFileReference2(rawValue, searchRoots) {
   const displayPath = trimLeadingSeparators2(rawValue);
   const potentialPaths = [];
-  if (import_node_path7.default.isAbsolute(rawValue)) {
-    potentialPaths.push(import_node_path7.default.normalize(rawValue));
+  if (import_node_path8.default.isAbsolute(rawValue)) {
+    potentialPaths.push(import_node_path8.default.normalize(rawValue));
   }
   for (const base of searchRoots) {
-    potentialPaths.push(import_node_path7.default.resolve(base, displayPath));
+    potentialPaths.push(import_node_path8.default.resolve(base, displayPath));
   }
   const attempted = [];
   const seen = /* @__PURE__ */ new Set();
   for (const candidate of potentialPaths) {
-    const absoluteCandidate = import_node_path7.default.resolve(candidate);
+    const absoluteCandidate = import_node_path8.default.resolve(candidate);
     if (seen.has(absoluteCandidate)) {
       continue;
     }
@@ -2140,9 +2609,9 @@ async function withRetry(fn, retryConfig, signal) {
 var import_node_child_process = require("child_process");
 var import_node_crypto = require("crypto");
 var import_node_fs3 = require("fs");
-var import_promises8 = require("fs/promises");
+var import_promises9 = require("fs/promises");
 var import_node_os = require("os");
-var import_node_path9 = __toESM(require("path"), 1);
+var import_node_path10 = __toESM(require("path"), 1);
 // src/evaluation/providers/claude-code-log-tracker.ts
 var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
@@ -2198,7 +2667,7 @@ function subscribeToClaudeCodeLogEntries(listener) {
 }
 // src/evaluation/providers/preread.ts
-var import_node_path8 = __toESM(require("path"), 1);
+var import_node_path9 = __toESM(require("path"), 1);
 function buildPromptDocument(request, inputFiles, options) {
   const parts = [];
   const guidelineFiles = collectGuidelineFiles(
@@ -2221,7 +2690,7 @@ function normalizeInputFiles(inputFiles) {
   }
   const deduped = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path8.default.resolve(inputFile);
+    const absolutePath = import_node_path9.default.resolve(inputFile);
     if (!deduped.has(absolutePath)) {
       deduped.set(absolutePath, absolutePath);
     }
@@ -2234,14 +2703,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path8.default.resolve(inputFile);
+    const absolutePath = import_node_path9.default.resolve(inputFile);
     if (overrides?.has(absolutePath)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
       }
       continue;
     }
-    const normalized = absolutePath.split(import_node_path8.default.sep).join("/");
+    const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -2256,7 +2725,7 @@ function collectInputFiles(inputFiles) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path8.default.resolve(inputFile);
+    const absolutePath = import_node_path9.default.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -2268,7 +2737,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = import_node_path8.default.basename(absolutePath);
+    const fileName = import_node_path9.default.basename(absolutePath);
     const fileUri = pathToFileUri(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -2288,7 +2757,7 @@ ${buildList(inputFiles).join("\n")}.`);
   return sections.join("\n");
 }
 function pathToFileUri(filePath) {
-  const absolutePath = import_node_path8.default.isAbsolute(filePath) ? filePath : import_node_path8.default.resolve(filePath);
+  const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -2325,8 +2794,8 @@ var ClaudeCodeProvider = class {
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
-      const promptFile = import_node_path9.default.join(workspaceRoot, PROMPT_FILENAME);
-      await (0, import_promises8.writeFile)(promptFile, request.question, "utf8");
+      const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
+      await (0, import_promises9.writeFile)(promptFile, request.question, "utf8");
       const args = this.buildClaudeCodeArgs(request.question, inputFiles);
       const cwd = this.resolveCwd();
       const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
@@ -2373,7 +2842,7 @@ var ClaudeCodeProvider = class {
     if (!this.config.cwd) {
       return process.cwd();
     }
-    return import_node_path9.default.resolve(this.config.cwd);
+    return import_node_path10.default.resolve(this.config.cwd);
   }
   buildClaudeCodeArgs(prompt, inputFiles) {
     const args = [];
@@ -2430,11 +2899,11 @@ ${filesContext}`;
     }
   }
   async createWorkspace() {
-    return await (0, import_promises8.mkdtemp)(import_node_path9.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
+    return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
-      await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
+      await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
     } catch {
     }
   }
@@ -2444,9 +2913,9 @@ ${filesContext}`;
       return void 0;
     }
     if (this.config.logDir) {
-      return import_node_path9.default.resolve(this.config.logDir);
+      return import_node_path10.default.resolve(this.config.logDir);
     }
-    return import_node_path9.default.join(process.cwd(), ".agentv", "logs", "claude-code");
+    return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "claude-code");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -2454,13 +2923,13 @@ ${filesContext}`;
       return void 0;
     }
     try {
-      await (0, import_promises8.mkdir)(logDir, { recursive: true });
+      await (0, import_promises9.mkdir)(logDir, { recursive: true });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = import_node_path9.default.join(logDir, buildLogFilename(request, this.targetName));
+    const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
     try {
       const logger = await ClaudeCodeStreamLogger.create({
         filePath,
@@ -2865,16 +3334,16 @@ function escapeShellArg(arg) {
 }
 async function defaultClaudeCodeRunner(options) {
   const tempId = (0, import_node_crypto.randomUUID)();
-  const stdoutFile = import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stdout`);
-  const stderrFile = import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stderr`);
-  const exitFile = import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-exit`);
-  const pidFile = import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-pid`);
+  const stdoutFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stdout`);
+  const stderrFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stderr`);
+  const exitFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-exit`);
+  const pidFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-pid`);
   try {
     return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
   } finally {
     for (const file of [stdoutFile, stderrFile, exitFile, pidFile]) {
       try {
-        await (0, import_promises8.rm)(file, { force: true });
+        await (0, import_promises9.rm)(file, { force: true });
       } catch {
       }
     }
@@ -2908,8 +3377,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
   let lastStdoutSize = 0;
   const readFileIfExists = async (filePath) => {
     try {
-      const { readFile: readFile8 } = await import("fs/promises");
-      return await readFile8(filePath, "utf8");
+      const { readFile: readFile9 } = await import("fs/promises");
+      return await readFile9(filePath, "utf8");
     } catch {
       return "";
     }
@@ -2982,9 +3451,9 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
 // src/evaluation/providers/cli.ts
 var import_node_child_process2 = require("child_process");
-var import_promises9 = __toESM(require("fs/promises"), 1);
+var import_promises10 = __toESM(require("fs/promises"), 1);
 var import_node_os2 = __toESM(require("os"), 1);
-var import_node_path10 = __toESM(require("path"), 1);
+var import_node_path11 = __toESM(require("path"), 1);
 var import_node_util = require("util");
 var import_zod = require("zod");
 var ToolCallSchema = import_zod.z.object({
@@ -2992,7 +3461,8 @@ var ToolCallSchema = import_zod.z.object({
   input: import_zod.z.unknown().optional(),
   output: import_zod.z.unknown().optional(),
   id: import_zod.z.string().optional(),
-  timestamp: import_zod.z.string().optional()
+  timestamp: import_zod.z.string().optional(),
+  duration_ms: import_zod.z.number().optional()
 });
 var OutputMessageInputSchema = import_zod.z.object({
   role: import_zod.z.string(),
@@ -3000,6 +3470,7 @@ var OutputMessageInputSchema = import_zod.z.object({
   content: import_zod.z.unknown().optional(),
   tool_calls: import_zod.z.array(ToolCallSchema).optional(),
   timestamp: import_zod.z.string().optional(),
+  duration_ms: import_zod.z.number().optional(),
   metadata: import_zod.z.record(import_zod.z.unknown()).optional()
 });
 var TokenUsageSchema = import_zod.z.object({
@@ -3038,8 +3509,16 @@ function convertOutputMessages(messages) {
     role: msg.role,
     name: msg.name,
     content: msg.content,
-    toolCalls: msg.tool_calls,
+    toolCalls: msg.tool_calls?.map((tc) => ({
+      tool: tc.tool,
+      input: tc.input,
+      output: tc.output,
+      id: tc.id,
+      timestamp: tc.timestamp,
+      durationMs: tc.duration_ms
+    })),
     timestamp: msg.timestamp,
+    durationMs: msg.duration_ms,
     metadata: msg.metadata
   }));
 }
@@ -3353,7 +3832,7 @@ var CliProvider = class {
       throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
     } finally {
       if (!this.keepTempFiles) {
-        await import_promises9.default.unlink(filePath).catch(() => {
+        await import_promises10.default.unlink(filePath).catch(() => {
         });
       }
     }
@@ -3441,7 +3920,7 @@ function normalizeInputFiles2(inputFiles) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path10.default.resolve(inputFile);
+    const absolutePath = import_node_path11.default.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -3455,7 +3934,7 @@ function formatFileList(files, template) {
   const formatter = template ?? "{path}";
   return files.map((filePath) => {
     const escapedPath = shellEscape(filePath);
-    const escapedName = shellEscape(import_node_path10.default.basename(filePath));
+    const escapedName = shellEscape(import_node_path11.default.basename(filePath));
     return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
   }).join(" ");
 }
@@ -3479,7 +3958,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
   const safeEvalId = evalCaseId || "unknown";
   const timestamp = Date.now();
   const random = Math.random().toString(36).substring(2, 9);
-  return import_node_path10.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
+  return import_node_path11.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
 }
 function formatTimeoutSuffix2(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
@@ -3493,9 +3972,9 @@ function formatTimeoutSuffix2(timeoutMs) {
 var import_node_child_process3 = require("child_process");
 var import_node_crypto2 = require("crypto");
 var import_node_fs4 = require("fs");
-var import_promises10 = require("fs/promises");
+var import_promises11 = require("fs/promises");
 var import_node_os3 = require("os");
-var import_node_path11 = __toESM(require("path"), 1);
+var import_node_path12 = __toESM(require("path"), 1);
 var import_node_util2 = require("util");
 // src/evaluation/providers/codex-log-tracker.ts
@@ -3590,8 +4069,8 @@ var CodexProvider = class {
       const promptContent = `${systemPrompt}
 ${basePrompt}`;
-      const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
-      await (0, import_promises10.writeFile)(promptFile, promptContent, "utf8");
+      const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME2);
+      await (0, import_promises11.writeFile)(promptFile, promptContent, "utf8");
       const args = this.buildCodexArgs();
       const cwd = this.resolveCwd(workspaceRoot);
       const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
@@ -3640,7 +4119,7 @@ ${basePrompt}`;
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return import_node_path11.default.resolve(this.config.cwd);
+    return import_node_path12.default.resolve(this.config.cwd);
   }
   buildCodexArgs() {
     const args = [
@@ -3682,11 +4161,11 @@ ${basePrompt}`;
     }
   }
   async createWorkspace() {
-    return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
+    return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
-      await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
+      await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
     } catch {
     }
   }
@@ -3696,9 +4175,9 @@ ${basePrompt}`;
       return void 0;
     }
     if (this.config.logDir) {
-      return import_node_path11.default.resolve(this.config.logDir);
+      return import_node_path12.default.resolve(this.config.logDir);
     }
-    return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "codex");
+    return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "codex");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -3706,13 +4185,13 @@ ${basePrompt}`;
       return void 0;
     }
     try {
-      await (0, import_promises10.mkdir)(logDir, { recursive: true });
+      await (0, import_promises11.mkdir)(logDir, { recursive: true });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
+    const filePath = import_node_path12.default.join(logDir, buildLogFilename2(request, this.targetName));
     try {
       const logger = await CodexStreamLogger.create({
         filePath,
@@ -3927,9 +4406,9 @@ function tryParseJsonValue2(rawLine) {
 async function locateExecutable(candidate) {
   const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
   if (includesPathSeparator) {
-    const resolved = import_node_path11.default.isAbsolute(candidate) ? candidate : import_node_path11.default.resolve(candidate);
+    const resolved = import_node_path12.default.isAbsolute(candidate) ? candidate : import_node_path12.default.resolve(candidate);
     const executablePath = await ensureWindowsExecutableVariant(resolved);
-    await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
+    await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
     return executablePath;
   }
   const locator = process.platform === "win32" ? "where" : "which";
@@ -3939,7 +4418,7 @@ async function locateExecutable(candidate) {
     const preferred = selectExecutableCandidate(lines);
     if (preferred) {
       const executablePath = await ensureWindowsExecutableVariant(preferred);
-      await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
+      await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
       return executablePath;
     }
   } catch {
@@ -3973,7 +4452,7 @@ async function ensureWindowsExecutableVariant(candidate) {
   for (const ext of extensions) {
     const withExtension = `${candidate}${ext}`;
     try {
-      await (0, import_promises10.access)(withExtension, import_node_fs4.constants.F_OK);
+      await (0, import_promises11.access)(withExtension, import_node_fs4.constants.F_OK);
       return withExtension;
     } catch {
     }
@@ -4438,9 +4917,9 @@ function extractToolCalls2(content) {
 var import_node_child_process4 = require("child_process");
 var import_node_crypto3 = require("crypto");
 var import_node_fs5 = require("fs");
-var import_promises11 = require("fs/promises");
+var import_promises12 = require("fs/promises");
 var import_node_os4 = require("os");
-var import_node_path12 = __toESM(require("path"), 1);
+var import_node_path13 = __toESM(require("path"), 1);
 // src/evaluation/providers/pi-log-tracker.ts
 var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
@@ -4524,8 +5003,8 @@ var PiCodingAgentProvider = class {
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
-      const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME3);
-      await (0, import_promises11.writeFile)(promptFile, request.question, "utf8");
+      const promptFile = import_node_path13.default.join(workspaceRoot, PROMPT_FILENAME3);
+      await (0, import_promises12.writeFile)(promptFile, request.question, "utf8");
       const args = this.buildPiArgs(request.question, inputFiles);
       const cwd = this.resolveCwd(workspaceRoot);
       const result = await this.executePi(args, cwd, request.signal, logger);
@@ -4566,7 +5045,7 @@ var PiCodingAgentProvider = class {
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return import_node_path12.default.resolve(this.config.cwd);
+    return import_node_path13.default.resolve(this.config.cwd);
   }
   buildPiArgs(prompt, inputFiles) {
     const args = [];
@@ -4655,19 +5134,19 @@ ${prompt}`;
     return env;
   }
   async createWorkspace() {
-    return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
+    return await (0, import_promises12.mkdtemp)(import_node_path13.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
-      await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
+      await (0, import_promises12.rm)(workspaceRoot, { recursive: true, force: true });
     } catch {
     }
   }
   resolveLogDirectory() {
     if (this.config.logDir) {
-      return import_node_path12.default.resolve(this.config.logDir);
+      return import_node_path13.default.resolve(this.config.logDir);
     }
-    return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
+    return import_node_path13.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -4675,13 +5154,13 @@ ${prompt}`;
       return void 0;
     }
     try {
-      await (0, import_promises11.mkdir)(logDir, { recursive: true });
+      await (0, import_promises12.mkdir)(logDir, { recursive: true });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = import_node_path12.default.join(logDir, buildLogFilename3(request, this.targetName));
+    const filePath = import_node_path13.default.join(logDir, buildLogFilename3(request, this.targetName));
     try {
       const logger = await PiStreamLogger.create({
         filePath,
@@ -5114,7 +5593,7 @@ async function defaultPiRunner(options) {
 }
 // src/evaluation/providers/targets.ts
-var import_node_path13 = __toESM(require("path"), 1);
+var import_node_path14 = __toESM(require("path"), 1);
 var import_zod2 = require("zod");
 var CliHealthcheckHttpInputSchema = import_zod2.z.object({
   type: import_zod2.z.literal("http"),
@@ -5220,11 +5699,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
     allowLiteral: true,
     optionalEnv: true
   });
-  if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
-    cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
+  if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
+    cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
   }
   if (!cwd && evalFilePath) {
-    cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
+    cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
   }
   return {
     type: "command",
@@ -5251,11 +5730,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
     allowLiteral: true,
     optionalEnv: true
   });
-  if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
-    cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
+  if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
+    cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
   }
   if (!cwd && evalFilePath) {
-    cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
+    cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
   }
   const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
   const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
@@ -5760,8 +6239,8 @@ function resolveCliConfig(target, env, evalFilePath) {
   const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
   if (!parseResult.success) {
     const firstError = parseResult.error.errors[0];
-    const path17 = firstError?.path.join(".") || "";
-    const prefix = path17 ? `${target.name} ${path17}: ` : `${target.name}: `;
+    const path18 = firstError?.path.join(".") || "";
+    const prefix = path18 ? `${target.name} ${path18}: ` : `${target.name}: `;
     throw new Error(`${prefix}${firstError?.message}`);
   }
   const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -5949,7 +6428,7 @@ function resolveOptionalNumberArray(source, description) {
 }
 // src/evaluation/providers/vscode.ts
-var import_node_path14 = __toESM(require("path"), 1);
+var import_node_path15 = __toESM(require("path"), 1);
 var import_subagent = require("subagent");
 // src/evaluation/providers/vscode-templates.ts
@@ -6119,7 +6598,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = import_node_path14.default.basename(absolutePath);
+    const fileName = import_node_path15.default.basename(absolutePath);
     const fileUri = pathToFileUri2(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -6144,8 +6623,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path14.default.resolve(attachment);
-    const normalized = absolutePath.split(import_node_path14.default.sep).join("/");
+    const absolutePath = import_node_path15.default.resolve(attachment);
+    const normalized = absolutePath.split(import_node_path15.default.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -6160,7 +6639,7 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path14.default.resolve(attachment);
+    const absolutePath = import_node_path15.default.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -6168,7 +6647,7 @@ function collectAttachmentFiles(attachments) {
   return Array.from(unique.values());
 }
 function pathToFileUri2(filePath) {
-  const absolutePath = import_node_path14.default.isAbsolute(filePath) ? filePath : import_node_path14.default.resolve(filePath);
+  const absolutePath = import_node_path15.default.isAbsolute(filePath) ? filePath : import_node_path15.default.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -6181,7 +6660,7 @@ function normalizeAttachments(attachments) {
   }
   const deduped = /* @__PURE__ */ new Set();
   for (const attachment of attachments) {
-    deduped.add(import_node_path14.default.resolve(attachment));
+    deduped.add(import_node_path15.default.resolve(attachment));
   }
   return Array.from(deduped);
 }
@@ -6190,7 +6669,7 @@ function mergeAttachments(all) {
   for (const list of all) {
     if (!list) continue;
     for (const inputFile of list) {
-      deduped.add(import_node_path14.default.resolve(inputFile));
+      deduped.add(import_node_path15.default.resolve(inputFile));
     }
   }
   return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -6238,9 +6717,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 var import_node_fs6 = require("fs");
-var import_promises12 = require("fs/promises");
-var import_node_path15 = __toESM(require("path"), 1);
-var import_yaml3 = require("yaml");
+var import_promises13 = require("fs/promises");
+var import_node_path16 = __toESM(require("path"), 1);
+var import_yaml4 = require("yaml");
 function isRecord(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
 }
@@ -6269,19 +6748,19 @@ function assertTargetDefinition(value, index, filePath) {
 }
 async function fileExists3(filePath) {
   try {
-    await (0, import_promises12.access)(filePath, import_node_fs6.constants.F_OK);
+    await (0, import_promises13.access)(filePath, import_node_fs6.constants.F_OK);
     return true;
   } catch {
     return false;
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = import_node_path15.default.resolve(filePath);
+  const absolutePath = import_node_path16.default.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
-  const raw = await (0, import_promises12.readFile)(absolutePath, "utf8");
-  const parsed = (0, import_yaml3.parse)(raw);
+  const raw = await (0, import_promises13.readFile)(absolutePath, "utf8");
+  const parsed = (0, import_yaml4.parse)(raw);
   if (!isRecord(parsed)) {
     throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
   }
@@ -6487,15 +6966,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
   });
 }
 async function execShellWithStdin(command, stdinPayload, options = {}) {
-  const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
+  const { mkdir: mkdir4, readFile: readFile9, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
   const { tmpdir: tmpdir4 } = await import("os");
-  const path17 = await import("path");
+  const path18 = await import("path");
   const { randomUUID: randomUUID4 } = await import("crypto");
-  const dir = path17.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
+  const dir = path18.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
   await mkdir4(dir, { recursive: true });
-  const stdinPath = path17.join(dir, "stdin.txt");
-  const stdoutPath = path17.join(dir, "stdout.txt");
-  const stderrPath = path17.join(dir, "stderr.txt");
+  const stdinPath = path18.join(dir, "stdin.txt");
+  const stdoutPath = path18.join(dir, "stdout.txt");
+  const stderrPath = path18.join(dir, "stderr.txt");
   await writeFile4(stdinPath, stdinPayload, "utf8");
   const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
   const { spawn: spawn4 } = await import("child_process");
@@ -6525,8 +7004,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
         resolve(code ?? 0);
       });
     });
-    const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
-    const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
+    const stdout = (await readFile9(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
+    const stderr = (await readFile9(stderrPath, "utf8")).replace(/\r\n/g, "\n");
     return { stdout, stderr, exitCode };
   } finally {
     await rm4(dir, { recursive: true, force: true });
@@ -6798,7 +7277,7 @@ var CodeEvaluator = class {
       outputMessages: context.outputMessages ?? null,
       guidelineFiles: context.evalCase.guideline_paths,
       inputFiles: context.evalCase.file_paths.filter(
-        (path17) => !context.evalCase.guideline_paths.includes(path17)
+        (path18) => !context.evalCase.guideline_paths.includes(path18)
       ),
       inputMessages: context.evalCase.input_messages,
       traceSummary: context.traceSummary ?? null,
@@ -6973,6 +7452,15 @@ var rubricEvaluationSchema = import_zod3.z.object({
   checks: import_zod3.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
   overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)")
 });
+var scoreRangeCheckResultSchema = import_zod3.z.object({
+  id: import_zod3.z.string().describe("The ID of the rubric criterion being scored"),
+  score: import_zod3.z.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
+  reasoning: import_zod3.z.string().describe("Brief explanation (1-2 sentences) for this score").optional()
+});
+var scoreRangeEvaluationSchema = import_zod3.z.object({
+  checks: import_zod3.z.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
+  overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)").optional()
+});
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
@@ -7058,6 +7546,10 @@ var LlmJudgeEvaluator = class {
         `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
       );
     }
+    const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
+    if (hasScoreRanges) {
+      return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
+    }
     const prompt = this.buildRubricPrompt(context, rubrics);
     const systemPrompt = buildRubricOutputSchema();
     const evaluatorRawRequest = {
@@ -7083,6 +7575,84 @@ var LlmJudgeEvaluator = class {
       evaluatorRawRequest
     };
   }
+  /**
+   * Evaluate using score-range rubrics (analytic rubric scoring).
+   * Each criterion is scored 0-10 and normalized to 0-1.
+   */
+  async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
+    const prompt = this.buildScoreRangePrompt(context, rubrics);
+    const systemPrompt = buildScoreRangeOutputSchema();
+    const evaluatorRawRequest = {
+      userPrompt: prompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    const { data } = await this.runWithRetry({
+      context,
+      judgeProvider,
+      systemPrompt,
+      userPrompt: prompt,
+      schema: scoreRangeEvaluationSchema
+    });
+    const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
+    return {
+      score,
+      verdict,
+      hits,
+      misses,
+      expectedAspectCount: rubrics.length,
+      reasoning: data.overall_reasoning,
+      evaluatorRawRequest,
+      details
+    };
+  }
+  /**
+   * Build prompt for score-range rubric evaluation.
+   */
+  buildScoreRangePrompt(context, rubrics) {
+    const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+    const parts = [
+      "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
+      "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
+      "",
+      "[[ ## question ## ]]",
+      formattedQuestion,
+      "",
+      "[[ ## expected_outcome ## ]]",
+      context.evalCase.expected_outcome,
+      ""
+    ];
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+    }
+    parts.push(
+      "[[ ## candidate_answer ## ]]",
+      context.candidate,
+      "",
+      "[[ ## scoring_criteria ## ]]"
+    );
+    for (const rubric of rubrics) {
+      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+      const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
+      parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
+      if (rubric.expected_outcome) {
+        parts.push(`Description: ${rubric.expected_outcome}`);
+      }
+      if (rubric.score_ranges && rubric.score_ranges.length > 0) {
+        parts.push("Score ranges:");
+        for (const range of rubric.score_ranges) {
+          const [min, max] = range.score_range;
+          const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
+          parts.push(`  - Score ${rangeLabel}: ${range.expected_outcome}`);
+        }
+      }
+    }
+    parts.push(
+      "",
+      "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
+    );
+    return parts.join("\n");
+  }
   buildRubricPrompt(context, rubrics) {
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
     const parts = [
@@ -7102,7 +7672,7 @@ var LlmJudgeEvaluator = class {
     for (const rubric of rubrics) {
       const requiredLabel = rubric.required ? " (REQUIRED)" : "";
       const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
-      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
     }
     parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
     return parts.join("\n");
@@ -7189,9 +7759,9 @@ function calculateRubricScore(result, rubrics) {
     totalWeight += rubric.weight;
     if (check.satisfied) {
       earnedWeight += rubric.weight;
-      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
     } else {
-      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
       if (rubric.required) {
         failedRequired = true;
       }
@@ -7201,6 +7771,76 @@ function calculateRubricScore(result, rubrics) {
   const verdict = failedRequired ? "fail" : scoreToVerdict(score);
   return { score, verdict, hits, misses };
 }
+function buildScoreRangeOutputSchema() {
+  return `You are an expert evaluator. Score the candidate answer on each criterion.
+You must return a valid JSON object matching this schema:
+{
+  "checks": [
+    {
+      "id": "string (criterion id)",
+      "score": integer (0-10),
+      "reasoning": "string (brief explanation for score)"
+    }
+  ],
+  "overall_reasoning": "string (summary, optional)"
+}
+Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
+}
+function calculateScoreRangeResult(result, rubrics) {
+  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+  const hits = [];
+  const misses = [];
+  const rawScores = {};
+  let totalWeight = 0;
+  let weightedScoreSum = 0;
+  let failedRequired = false;
+  for (const check of result.checks) {
+    const rubric = rubricMap.get(check.id);
+    if (!rubric) {
+      continue;
+    }
+    const rawScore = Math.max(0, Math.min(10, check.score));
+    const normalizedScore = rawScore / 10;
+    rawScores[rubric.id] = rawScore;
+    totalWeight += rubric.weight;
+    weightedScoreSum += normalizedScore * rubric.weight;
+    let requiredMinScore;
+    if (rubric.required_min_score !== void 0) {
+      requiredMinScore = rubric.required_min_score;
+    } else if (rubric.required === true) {
+      requiredMinScore = 10;
+    }
+    const matchingRange = rubric.score_ranges?.find(
+      (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
+    );
+    const rangeDescription = matchingRange?.expected_outcome ?? "";
+    const criterionLabel = rubric.expected_outcome ?? rubric.id;
+    const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
+    const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
+    if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
+      failedRequired = true;
+      misses.push(scoreInfo);
+    } else if (rawScore >= 7) {
+      hits.push(scoreInfo);
+    } else {
+      misses.push(scoreInfo);
+    }
+  }
+  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
+  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+  return {
+    score,
+    verdict,
+    hits,
+    misses,
+    details: {
+      raw_scores: rawScores,
+      normalization: "score / 10",
+      aggregation: "weighted_average"
+    }
+  };
+}
 // src/evaluation/evaluators/composite.ts
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
@@ -7584,115 +8224,115 @@ var FieldAccuracyEvaluator = class {
    * Evaluate a single field against the expected value.
    */
   evaluateField(fieldConfig, candidateData, expectedData) {
-    const { path: path17, match, required = true, weight = 1 } = fieldConfig;
-    const candidateValue = resolvePath(candidateData, path17);
-    const expectedValue = resolvePath(expectedData, path17);
+    const { path: path18, match, required = true, weight = 1 } = fieldConfig;
+    const candidateValue = resolvePath(candidateData, path18);
+    const expectedValue = resolvePath(expectedData, path18);
     if (expectedValue === void 0) {
       return {
-        path: path17,
+        path: path18,
         score: 1,
         // No expected value means no comparison needed
         weight,
         hit: true,
-        message: `${path17}: no expected value`
+        message: `${path18}: no expected value`
       };
     }
     if (candidateValue === void 0) {
       if (required) {
         return {
-          path: path17,
+          path: path18,
           score: 0,
           weight,
           hit: false,
-          message: `${path17} (required, missing)`
+          message: `${path18} (required, missing)`
         };
       }
       return {
-        path: path17,
+        path: path18,
         score: 1,
         // Don't penalize missing optional fields
         weight: 0,
         // Zero weight means it won't affect the score
         hit: true,
-        message: `${path17}: optional field missing`
+        message: `${path18}: optional field missing`
       };
     }
     switch (match) {
       case "exact":
-        return this.compareExact(path17, candidateValue, expectedValue, weight);
+        return this.compareExact(path18, candidateValue, expectedValue, weight);
       case "numeric_tolerance":
         return this.compareNumericTolerance(
-          path17,
+          path18,
           candidateValue,
           expectedValue,
           fieldConfig,
           weight
         );
       case "date":
-        return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
+        return this.compareDate(path18, candidateValue, expectedValue, fieldConfig, weight);
       default:
         return {
-          path: path17,
+          path: path18,
           score: 0,
           weight,
           hit: false,
-          message: `${path17}: unknown match type "${match}"`
+          message: `${path18}: unknown match type "${match}"`
         };
     }
   }
   /**
    * Exact equality comparison.
    */
-  compareExact(path17, candidateValue, expectedValue, weight) {
+  compareExact(path18, candidateValue, expectedValue, weight) {
     if (deepEqual(candidateValue, expectedValue)) {
       return {
-        path: path17,
+        path: path18,
         score: 1,
         weight,
         hit: true,
-        message: path17
+        message: path18
       };
     }
     if (typeof candidateValue !== typeof expectedValue) {
       return {
-        path: path17,
+        path: path18,
         score: 0,
         weight,
         hit: false,
-        message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
+        message: `${path18} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
       };
     }
     return {
-      path: path17,
+      path: path18,
       score: 0,
       weight,
       hit: false,
-      message: `${path17} (value mismatch)`
+      message: `${path18} (value mismatch)`
     };
   }
   /**
    * Numeric comparison with absolute or relative tolerance.
    */
-  compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
+  compareNumericTolerance(path18, candidateValue, expectedValue, fieldConfig, weight) {
     const { tolerance = 0, relative = false } = fieldConfig;
     const candidateNum = toNumber(candidateValue);
     const expectedNum = toNumber(expectedValue);
     if (candidateNum === null || expectedNum === null) {
       return {
-        path: path17,
+        path: path18,
         score: 0,
         weight,
         hit: false,
-        message: `${path17} (non-numeric value)`
+        message: `${path18} (non-numeric value)`
       };
     }
     if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
       return {
-        path: path17,
+        path: path18,
         score: 0,
         weight,
         hit: false,
-        message: `${path17} (invalid numeric value)`
+        message: `${path18} (invalid numeric value)`
       };
     }
     const diff = Math.abs(candidateNum - expectedNum);
@@ -7705,61 +8345,61 @@ var FieldAccuracyEvaluator = class {
     }
     if (withinTolerance) {
       return {
-        path: path17,
+        path: path18,
         score: 1,
         weight,
         hit: true,
-        message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
+        message: `${path18} (within tolerance: diff=${diff.toFixed(2)})`
       };
     }
     return {
-      path: path17,
+      path: path18,
       score: 0,
       weight,
       hit: false,
-      message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
+      message: `${path18} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
     };
   }
   /**
    * Date comparison with format normalization.
    */
-  compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
+  compareDate(path18, candidateValue, expectedValue, fieldConfig, weight) {
     const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
     const candidateDate = parseDate(String(candidateValue), formats);
     const expectedDate = parseDate(String(expectedValue), formats);
     if (candidateDate === null) {
       return {
-        path: path17,
+        path: path18,
         score: 0,
         weight,
         hit: false,
-        message: `${path17} (unparseable candidate date)`
+        message: `${path18} (unparseable candidate date)`
       };
     }
     if (expectedDate === null) {
       return {
-        path: path17,
+        path: path18,
         score: 0,
         weight,
         hit: false,
-        message: `${path17} (unparseable expected date)`
+        message: `${path18} (unparseable expected date)`
       };
     }
     if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
       return {
-        path: path17,
+        path: path18,
         score: 1,
         weight,
         hit: true,
-        message: path17
+        message: path18
       };
     }
     return {
-      path: path17,
+      path: path18,
       score: 0,
       weight,
       hit: false,
-      message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
+      message: `${path18} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
     };
   }
   /**
@@ -7799,11 +8439,11 @@ var FieldAccuracyEvaluator = class {
     };
   }
 };
-function resolvePath(obj, path17) {
-  if (!path17 || !obj) {
+function resolvePath(obj, path18) {
+  if (!path18 || !obj) {
     return void 0;
   }
-  const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
+  const parts = path18.split(/\.|\[|\]/).filter((p) => p.length > 0);
   let current = obj;
   for (const part of parts) {
     if (current === null || current === void 0) {
@@ -8028,6 +8668,27 @@ function argsMatch(expected, actual) {
   }
   return true;
 }
+function checkLatency(toolName, maxDurationMs, actualDurationMs) {
+  if (maxDurationMs === void 0) {
+    return { status: "skip", message: "" };
+  }
+  if (actualDurationMs === void 0) {
+    return {
+      status: "skip",
+      message: `No duration data for ${toolName}; latency assertion skipped`
+    };
+  }
+  if (actualDurationMs <= maxDurationMs) {
+    return {
+      status: "pass",
+      message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
+    };
+  }
+  return {
+    status: "fail",
+    message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
+  };
+}
 var ToolTrajectoryEvaluator = class {
   kind = "tool_trajectory";
   config;
@@ -8086,7 +8747,8 @@ var ToolTrajectoryEvaluator = class {
         for (const call of message.toolCalls) {
           toolCalls.push({
             name: call.tool,
-            args: call.input
+            args: call.input,
+            durationMs: call.durationMs
           });
         }
       }
@@ -8154,17 +8816,27 @@ var ToolTrajectoryEvaluator = class {
     }
     const hits = [];
     const misses = [];
+    const warnings = [];
     let actualIndex = 0;
+    let sequenceHits = 0;
+    let latencyHits = 0;
+    let latencySkips = 0;
+    const latencyAssertionCount = expected.filter(
+      (item) => item.maxDurationMs !== void 0
+    ).length;
     for (let i = 0; i < expected.length; i++) {
       const expectedItem = expected[i];
       const expectedTool = expectedItem.tool;
       let found = false;
       let argsMismatch = false;
+      let matchedCall;
       while (actualIndex < toolCalls.length) {
         const actualCall = toolCalls[actualIndex];
         if (actualCall.name === expectedTool) {
           if (argsMatch(expectedItem.args, actualCall.args)) {
             hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            sequenceHits++;
+            matchedCall = actualCall;
             actualIndex++;
             found = true;
             break;
@@ -8181,14 +8853,35 @@ var ToolTrajectoryEvaluator = class {
       if (!found && !argsMismatch) {
         misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
       }
+      if (found && matchedCall) {
+        const latencyResult = checkLatency(
+          expectedTool,
+          expectedItem.maxDurationMs,
+          matchedCall.durationMs
+        );
+        if (latencyResult.status === "pass") {
+          hits.push(latencyResult.message);
+          latencyHits++;
+        } else if (latencyResult.status === "fail") {
+          misses.push(latencyResult.message);
+        } else if (latencyResult.message) {
+          warnings.push(latencyResult.message);
+          latencySkips++;
+        }
+      }
     }
-    const score = hits.length / expected.length;
+    for (const warning of warnings) {
+      console.warn(`[tool_trajectory] ${warning}`);
+    }
+    const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
+    const totalAssertions = expected.length + effectiveLatencyAssertions;
+    const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
     return {
       score,
       verdict: scoreToVerdict(score),
       hits,
       misses,
-      expectedAspectCount: expected.length
+      expectedAspectCount: totalAssertions
     };
   }
   evaluateExact(toolCalls) {
@@ -8204,6 +8897,13 @@ var ToolTrajectoryEvaluator = class {
     }
     const hits = [];
     const misses = [];
+    const warnings = [];
+    let sequenceHits = 0;
+    let latencyHits = 0;
+    let latencySkips = 0;
+    const latencyAssertionCount = expected.filter(
+      (item) => item.maxDurationMs !== void 0
+    ).length;
     if (toolCalls.length !== expected.length) {
       misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
     }
@@ -8213,33 +8913,58 @@ var ToolTrajectoryEvaluator = class {
       const expectedTool = expectedItem.tool;
       const actualCall = toolCalls[i];
       const actualTool = actualCall.name;
+      let sequenceMatched = false;
       if (actualTool === expectedTool) {
         if (argsMatch(expectedItem.args, actualCall.args)) {
           hits.push(`Position ${i}: ${expectedTool}`);
+          sequenceHits++;
+          sequenceMatched = true;
         } else {
           misses.push(`Position ${i}: ${expectedTool} args mismatch`);
         }
       } else {
         misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
       }
+      if (sequenceMatched) {
+        const latencyResult = checkLatency(
+          expectedTool,
+          expectedItem.maxDurationMs,
+          actualCall.durationMs
+        );
+        if (latencyResult.status === "pass") {
+          hits.push(latencyResult.message);
+          latencyHits++;
+        } else if (latencyResult.status === "fail") {
+          misses.push(latencyResult.message);
+        } else if (latencyResult.message) {
+          warnings.push(latencyResult.message);
+          latencySkips++;
+        }
+      }
     }
     for (let i = checkLength; i < expected.length; i++) {
       misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
     }
-    const score = hits.length / expected.length;
+    for (const warning of warnings) {
+      console.warn(`[tool_trajectory] ${warning}`);
+    }
+    const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
+    const totalAssertions = expected.length + effectiveLatencyAssertions;
+    const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
     return {
       score,
       verdict: scoreToVerdict(score),
       hits,
       misses,
-      expectedAspectCount: expected.length
+      expectedAspectCount: totalAssertions
     };
   }
 };
 // src/evaluation/orchestrator.ts
 var import_node_crypto5 = require("crypto");
-var import_node_path16 = __toESM(require("path"), 1);
+var import_node_path17 = __toESM(require("path"), 1);
+var import_micromatch4 = __toESM(require("micromatch"), 1);
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
 var Node = class {
@@ -8398,17 +9123,17 @@ async function runEvaluation(options) {
     cache,
     useCache,
     now,
-    evalId,
+    filter,
     verbose,
     evalCases: preloadedEvalCases,
     onResult,
     onProgress
   } = options;
-  const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
-  const filteredEvalCases = filterEvalCases(evalCases, evalId);
+  const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
+  const filteredEvalCases = filterEvalCases(evalCases, filter);
   if (filteredEvalCases.length === 0) {
-    if (evalId) {
-      throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
+    if (filter) {
+      throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
     }
     return [];
   }
@@ -8984,7 +9709,10 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
-          judgeProvider
+          judgeProvider,
+          outputMessages,
+          traceSummary,
+          agentTimeoutMs
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -9038,7 +9766,7 @@ async function runEvaluatorList(options) {
         });
       }
       if (evaluator.type === "composite") {
-        const evalFileDir = evalCase.guideline_paths[0] ? import_node_path16.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
+        const evalFileDir = evalCase.guideline_paths[0] ? import_node_path17.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
         const createEvaluator = (memberConfig) => {
           switch (memberConfig.type) {
             case "llm_judge":
@@ -9319,9 +10047,22 @@ async function runLlmJudgeEvaluator(options) {
     attempt,
     promptInputs,
     now,
-    judgeProvider
+    judgeProvider,
+    outputMessages,
+    traceSummary,
+    agentTimeoutMs
   } = options;
-  const customPrompt = await resolveCustomPrompt(config);
+  const customPrompt = await resolveCustomPrompt(
+    config,
+    {
+      evalCase,
+      candidate,
+      outputMessages,
+      traceSummary,
+      config: config.config
+    },
+    agentTimeoutMs
+  );
   return evaluatorRegistry.llm_judge.evaluate({
     evalCase,
     candidate,
@@ -9335,23 +10076,70 @@ async function runLlmJudgeEvaluator(options) {
     evaluator: config
   });
 }
-async function resolveCustomPrompt(config) {
-  if (config.promptPath) {
+async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
+  if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
+    if (!context) {
+      throw new Error("Context required for executable prompt templates");
+    }
+    return executePromptTemplate(
+      promptConfig.resolvedPromptScript,
+      context,
+      promptConfig.config,
+      timeoutMs
+    );
+  }
+  const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
+  if (promptPath) {
     try {
-      const content = await readTextFile(config.promptPath);
+      const content = await readTextFile(promptPath);
       return content;
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
-      console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
+      console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
+    }
+  }
+  const promptValue = promptConfig.prompt;
+  if (typeof promptValue === "string") {
+    return promptValue;
+  }
+  return void 0;
+}
+async function executePromptTemplate(script, context, config, timeoutMs) {
+  const payload = {
+    question: context.evalCase.question,
+    expectedOutcome: context.evalCase.expected_outcome,
+    expectedMessages: context.evalCase.expected_messages,
+    referenceAnswer: context.evalCase.reference_answer,
+    candidateAnswer: context.candidate,
+    outputMessages: context.outputMessages ?? null,
+    guidelineFiles: context.evalCase.guideline_paths,
+    inputFiles: context.evalCase.file_paths.filter(
+      (p) => !context.evalCase.guideline_paths.includes(p)
+    ),
+    inputMessages: context.evalCase.input_messages,
+    traceSummary: context.traceSummary ?? null,
+    config: config ?? context.config ?? null
+  };
+  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
+  const scriptPath = script[script.length - 1];
+  const cwd = import_node_path17.default.dirname(scriptPath);
+  try {
+    const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
+    const prompt = stdout.trim();
+    if (!prompt) {
+      throw new Error("Prompt template produced empty output");
     }
+    return prompt;
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    throw new Error(`Prompt template execution failed: ${message}`);
   }
-  return config.prompt;
 }
-function filterEvalCases(evalCases, evalId) {
-  if (!evalId) {
+function filterEvalCases(evalCases, filter) {
+  if (!filter) {
     return evalCases;
   }
-  return evalCases.filter((evalCase) => evalCase.id === evalId);
+  return evalCases.filter((evalCase) => import_micromatch4.default.isMatch(evalCase.id, filter));
 }
 function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
   const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
@@ -9509,7 +10297,7 @@ var import_ai4 = require("ai");
 var import_zod4 = require("zod");
 var rubricItemSchema = import_zod4.z.object({
   id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
-  description: import_zod4.z.string().describe("What this rubric checks for"),
+  expected_outcome: import_zod4.z.string().describe("Concrete expected outcome for this rubric item"),
   weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
   required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
 });
@@ -9529,7 +10317,7 @@ You must return a valid JSON object matching this schema:
   "rubrics": [
     {
       "id": "string (short identifier)",
-      "description": "string (what to check)",
+      "expected_outcome": "string (concrete expected outcome for this rubric item)",
       "weight": number (default 1.0),
       "required": boolean (default true)
     }
@@ -9565,7 +10353,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
     "Each rubric should:",
     "- Be specific and testable",
     "- Have a short, descriptive ID",
-    "- Include a clear description of what to check",
+    "- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
     "- Indicate if it is required (mandatory) or optional",
     "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
     "",
@@ -9613,6 +10401,7 @@ function createAgentKernel() {
   createAgentKernel,
   createProvider,
   deepEqual,
+  detectFormat,
   ensureVSCodeSubagents,
   executeScript,
   explorationRatio,