npm - @agentv/core - Versions diffs - 2.1.1 → 2.5.1 - Mend

@agentv/core 2.1.1 → 2.5.1

This diff shows the differences in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (14)

package/README.md +77 -77
package/dist/{chunk-KDEP4I7G.js → chunk-RP3M7COZ.js} +1 -1
package/dist/{chunk-KDEP4I7G.js.map → chunk-RP3M7COZ.js.map} +1 -1
package/dist/evaluation/validation/index.cjs +38 -4
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +39 -5
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +1070 -281
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +82 -7
package/dist/index.d.ts +82 -7
package/dist/index.js +1018 -230
package/dist/index.js.map +1 -1
package/package.json +1 -1

package/dist/index.js (changed)

@@ -10,7 +10,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-KDEP4I7G.js";
+} from "./chunk-RP3M7COZ.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -146,8 +146,9 @@ function mergeExecutionMetrics(summary, metrics) {
 }
 // src/evaluation/yaml-parser.ts
-import { readFile as readFile5 } from "node:fs/promises";
-import path6 from "node:path";
+import { readFile as readFile6 } from "node:fs/promises";
+import path7 from "node:path";
+import micromatch3 from "micromatch";
 import { parse as parse2 } from "yaml";
 // src/evaluation/loaders/config-loader.ts
@@ -462,11 +463,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           );
         }
       }
-      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
-      const config = {};
+      const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
+      const config2 = {};
       for (const [key, value] of Object.entries(rawEvaluator)) {
-        if (!knownProps.has(key) && value !== void 0) {
-          config[key] = value;
+        if (!knownProps2.has(key) && value !== void 0) {
+          config2[key] = value;
         }
       }
       evaluators.push({
@@ -476,7 +477,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         cwd,
         resolvedCwd,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
-        ...Object.keys(config).length > 0 ? { config } : {},
+        ...Object.keys(config2).length > 0 ? { config: config2 } : {},
         ...targetConfig !== void 0 ? { target: targetConfig } : {}
       });
       continue;
@@ -641,7 +642,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const config = {
+      const config2 = {
         name,
         type: "tool_trajectory",
         mode,
@@ -649,7 +650,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         ...expected ? { expected } : {},
         ...weight2 !== void 0 ? { weight: weight2 } : {}
       };
-      evaluators.push(config);
+      evaluators.push(config2);
       continue;
     }
     if (typeValue === "field_accuracy") {
@@ -786,9 +787,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    const prompt = asString(rawEvaluator.prompt);
+    const rawPrompt = rawEvaluator.prompt;
+    let prompt;
     let promptPath;
-    if (prompt) {
+    let resolvedPromptScript;
+    let promptScriptConfig;
+    if (isJsonObject2(rawPrompt)) {
+      const scriptArray = asStringArray(
+        rawPrompt.script,
+        `prompt.script for evaluator '${name}' in '${evalId}'`
+      );
+      if (!scriptArray) {
+        throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
+      }
+      const scriptPath = scriptArray[scriptArray.length - 1];
+      const resolved = await resolveFileReference2(scriptPath, searchRoots);
+      if (resolved.resolvedPath) {
+        resolvedPromptScript = [...scriptArray.slice(0, -1), path3.resolve(resolved.resolvedPath)];
+      } else {
+        throw new Error(
+          `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
+        );
+      }
+      if (isJsonObject2(rawPrompt.config)) {
+        promptScriptConfig = rawPrompt.config;
+      }
+    } else if (typeof rawPrompt === "string") {
+      prompt = rawPrompt;
       const resolved = await resolveFileReference2(prompt, searchRoots);
       if (resolved.resolvedPath) {
         promptPath = path3.resolve(resolved.resolvedPath);
@@ -807,12 +832,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
     }
     const _model = asString(rawEvaluator.model);
     const rawRubrics = rawEvaluator.rubrics;
-    const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-      id: asString(rubric.id) ?? `rubric-${index + 1}`,
-      description: asString(rubric.description) ?? "",
-      weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-      required: typeof rubric.required === "boolean" ? rubric.required : true
-    })).filter((r) => r.description.length > 0) : void 0;
+    const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
     if (typeValue === "rubric") {
       if (!parsedRubrics) {
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
@@ -832,13 +852,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       continue;
     }
     const weight = validateWeight(rawEvaluator.weight, name, evalId);
+    const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
+    const config = {};
+    for (const [key, value] of Object.entries(rawEvaluator)) {
+      if (!knownProps.has(key) && value !== void 0) {
+        config[key] = value;
+      }
+    }
+    const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
+    const mergedConfig = { ...config, ...topLevelConfig };
+    const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
     evaluators.push({
       name,
       type: "llm_judge",
       prompt,
       promptPath,
+      ...promptPath ? { resolvedPromptPath: promptPath } : {},
+      ...resolvedPromptScript ? { resolvedPromptScript } : {},
       ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
-      ...weight !== void 0 ? { weight } : {}
+      ...weight !== void 0 ? { weight } : {},
+      ...finalConfig ? { config: finalConfig } : {}
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
@@ -925,6 +958,191 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
 function isValidFieldAggregationType(value) {
   return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
 }
+function parseRubricItems(rawRubrics, evaluatorName, evalId) {
+  const items = [];
+  for (const [index, rawRubric] of rawRubrics.entries()) {
+    if (!isJsonObject2(rawRubric)) {
+      logWarning2(
+        `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
+      );
+      continue;
+    }
+    const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
+    const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
+    const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
+    let requiredMinScore;
+    let required;
+    if (typeof rawRubric.required_min_score === "number") {
+      const minScore = rawRubric.required_min_score;
+      if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
+        throw new Error(
+          `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
+        );
+      }
+      requiredMinScore = minScore;
+    }
+    if (typeof rawRubric.required === "boolean") {
+      required = rawRubric.required;
+    }
+    let scoreRanges;
+    const rawScoreRanges = rawRubric.score_ranges;
+    if (rawScoreRanges !== void 0) {
+      if (!Array.isArray(rawScoreRanges)) {
+        throw new Error(
+          `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
+        );
+      }
+      scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
+      items.push({
+        id,
+        weight,
+        ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
+        ...required !== void 0 ? { required } : {},
+        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
+        score_ranges: scoreRanges
+      });
+    } else {
+      if (expectedOutcome.length === 0) {
+        logWarning2(
+          `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
+        );
+        continue;
+      }
+      items.push({
+        id,
+        expected_outcome: expectedOutcome,
+        weight,
+        // Default to required: true if not specified (backward compatibility)
+        required: required ?? true,
+        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
+      });
+    }
+  }
+  return items.length > 0 ? items : void 0;
+}
+function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
+  const ranges = [];
+  for (const [index, rawRange] of rawRanges.entries()) {
+    if (!isJsonObject2(rawRange)) {
+      throw new Error(
+        `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
+      );
+    }
+    const scoreRangeValue = rawRange.score_range;
+    if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
+      );
+    }
+    const [min, max] = scoreRangeValue;
+    if (!Number.isInteger(min) || !Number.isInteger(max)) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
+      );
+    }
+    if (min < 0 || min > 10 || max < 0 || max > 10) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
+      );
+    }
+    if (min > max) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
+      );
+    }
+    const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
+    if (expectedOutcome.length === 0) {
+      throw new Error(
+        `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
+      );
+    }
+    ranges.push({
+      score_range: [min, max],
+      expected_outcome: expectedOutcome
+    });
+  }
+  const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
+  for (let i = 1; i < sortedRanges.length; i++) {
+    const prev = sortedRanges[i - 1];
+    const curr = sortedRanges[i];
+    if (curr.score_range[0] <= prev.score_range[1]) {
+      throw new Error(
+        `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
+      );
+    }
+  }
+  const covered = /* @__PURE__ */ new Set();
+  for (const range of ranges) {
+    for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
+      covered.add(i);
+    }
+  }
+  const missing = [];
+  for (let i = 0; i <= 10; i++) {
+    if (!covered.has(i)) {
+      missing.push(i);
+    }
+  }
+  if (missing.length > 0) {
+    throw new Error(
+      `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
+    );
+  }
+  return ranges;
+}
+function parseInlineRubrics(rawRubrics) {
+  const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
+    if (typeof rubric === "string") {
+      return {
+        id: `rubric-${index + 1}`,
+        expected_outcome: rubric,
+        weight: 1,
+        required: true
+      };
+    }
+    const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
+    const rawScoreRanges = rubric.score_ranges;
+    const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
+      score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
+      expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
+    })).filter((r) => r.expected_outcome.length > 0) : void 0;
+    const baseRubric = {
+      id: asString(rubric.id) ?? `rubric-${index + 1}`,
+      weight: typeof rubric.weight === "number" ? rubric.weight : 1
+    };
+    if (scoreRanges && scoreRanges.length > 0) {
+      return {
+        ...baseRubric,
+        ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
+        ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
+        ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
+        score_ranges: scoreRanges
+      };
+    }
+    return {
+      ...baseRubric,
+      expected_outcome: expectedOutcome,
+      required: typeof rubric.required === "boolean" ? rubric.required : true,
+      ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
+    };
+  }).filter(
+    (r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
+  );
+  if (rubricItems.length === 0) {
+    return void 0;
+  }
+  return {
+    name: "rubric",
+    type: "llm_judge",
+    rubrics: rubricItems
+  };
+}
+// src/evaluation/loaders/jsonl-parser.ts
+import { readFile as readFile4 } from "node:fs/promises";
+import path5 from "node:path";
+import micromatch2 from "micromatch";
+import { parse as parseYaml } from "yaml";
 // src/evaluation/loaders/message-processor.ts
 import { readFile as readFile3 } from "node:fs/promises";
@@ -1186,28 +1404,302 @@ async function processExpectedMessages(options) {
   return segments;
 }
-// src/evaluation/formatting/prompt-builder.ts
-import { readFile as readFile4 } from "node:fs/promises";
-import path5 from "node:path";
+// src/evaluation/loaders/shorthand-expansion.ts
+function expandInputShorthand(value) {
+  if (value === void 0 || value === null) {
+    return void 0;
+  }
+  if (typeof value === "string") {
+    return [{ role: "user", content: value }];
+  }
+  if (Array.isArray(value)) {
+    const messages = value.filter((msg) => isTestMessage(msg));
+    return messages.length > 0 ? messages : void 0;
+  }
+  return void 0;
+}
+function expandExpectedOutputShorthand(value) {
+  if (value === void 0 || value === null) {
+    return void 0;
+  }
+  if (typeof value === "string") {
+    return [{ role: "assistant", content: value }];
+  }
+  if (Array.isArray(value)) {
+    if (value.length > 0 && isJsonObject(value[0]) && "role" in value[0]) {
+      const messages = value.filter((msg) => isTestMessage(msg));
+      return messages.length > 0 ? messages : void 0;
+    }
+    return [{ role: "assistant", content: value }];
+  }
+  if (isJsonObject(value)) {
+    if ("role" in value) {
+      return isTestMessage(value) ? [value] : void 0;
+    }
+    return [{ role: "assistant", content: value }];
+  }
+  return void 0;
+}
+function resolveInputMessages(raw) {
+  if (raw.input_messages !== void 0) {
+    if (Array.isArray(raw.input_messages)) {
+      const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
+      return messages.length > 0 ? messages : void 0;
+    }
+    return void 0;
+  }
+  return expandInputShorthand(raw.input);
+}
+function resolveExpectedMessages(raw) {
+  if (raw.expected_messages !== void 0) {
+    if (Array.isArray(raw.expected_messages)) {
+      const messages = raw.expected_messages.filter(
+        (msg) => isTestMessage(msg)
+      );
+      return messages.length > 0 ? messages : void 0;
+    }
+    return void 0;
+  }
+  return expandExpectedOutputShorthand(raw.expected_output);
+}
+// src/evaluation/loaders/jsonl-parser.ts
 var ANSI_YELLOW5 = "\x1B[33m";
+var ANSI_RED = "\x1B[31m";
 var ANSI_RESET5 = "\x1B[0m";
+function detectFormat(filePath) {
+  const ext = path5.extname(filePath).toLowerCase();
+  if (ext === ".jsonl") return "jsonl";
+  if (ext === ".yaml" || ext === ".yml") return "yaml";
+  throw new Error(`Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl`);
+}
+async function loadSidecarMetadata(jsonlPath, verbose) {
+  const dir = path5.dirname(jsonlPath);
+  const base = path5.basename(jsonlPath, ".jsonl");
+  const sidecarPath = path5.join(dir, `${base}.yaml`);
+  if (!await fileExists2(sidecarPath)) {
+    if (verbose) {
+      logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
+    }
+    return {};
+  }
+  try {
+    const content = await readFile4(sidecarPath, "utf8");
+    const parsed = parseYaml(content);
+    if (!isJsonObject(parsed)) {
+      logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
+      return {};
+    }
+    return {
+      description: asString4(parsed.description),
+      dataset: asString4(parsed.dataset),
+      execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
+      evaluator: parsed.evaluator
+    };
+  } catch (error) {
+    logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${error.message}`);
+    return {};
+  }
+}
+function parseJsonlContent(content, filePath) {
+  const lines = content.split("\n");
+  const cases = [];
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i].trim();
+    if (line === "") continue;
+    try {
+      const parsed = JSON.parse(line);
+      if (!isJsonObject(parsed)) {
+        throw new Error("Expected JSON object");
+      }
+      cases.push(parsed);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      throw new Error(`Line ${i + 1}: Invalid JSON - ${message}
+  File: ${filePath}`);
+    }
+  }
+  return cases;
+}
+async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
+  const verbose = options?.verbose ?? false;
+  const filterPattern = options?.filter;
+  const absoluteTestPath = path5.resolve(evalFilePath);
+  const repoRootPath = resolveToAbsolutePath(repoRoot);
+  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
+  const config = await loadConfig(absoluteTestPath, repoRootPath);
+  const guidelinePatterns = config?.guideline_patterns;
+  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
+  const rawFile = await readFile4(absoluteTestPath, "utf8");
+  const rawCases = parseJsonlContent(rawFile, evalFilePath);
+  const fallbackDataset = path5.basename(absoluteTestPath, ".jsonl") || "eval";
+  const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
+  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
+  const globalExecution = sidecar.execution;
+  if (verbose) {
+    console.log(`
+[JSONL Dataset: ${evalFilePath}]`);
+    console.log(`  Cases: ${rawCases.length}`);
+    console.log(`  Dataset name: ${datasetName}`);
+    if (sidecar.description) {
+      console.log(`  Description: ${sidecar.description}`);
+    }
+  }
+  const results = [];
+  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
+    const evalcase = rawCases[lineIndex];
+    const lineNumber = lineIndex + 1;
+    const id = asString4(evalcase.id);
+    if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
+      continue;
+    }
+    const conversationId = asString4(evalcase.conversation_id);
+    const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
+    const inputMessages = resolveInputMessages(evalcase);
+    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+    if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
+      logError(
+        `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
+      );
+      continue;
+    }
+    const hasExpectedMessages = expectedMessages.length > 0;
+    const guidelinePaths = [];
+    const inputTextParts = [];
+    const inputSegments = await processMessages({
+      messages: inputMessages,
+      searchRoots,
+      repoRootPath,
+      guidelinePatterns,
+      guidelinePaths,
+      textParts: inputTextParts,
+      messageType: "input",
+      verbose
+    });
+    const outputSegments = hasExpectedMessages ? await processExpectedMessages({
+      messages: expectedMessages,
+      searchRoots,
+      repoRootPath,
+      verbose
+    }) : [];
+    let referenceAnswer = "";
+    if (outputSegments.length > 0) {
+      const lastMessage = outputSegments[outputSegments.length - 1];
+      const content = lastMessage.content;
+      const toolCalls = lastMessage.tool_calls;
+      if (typeof content === "string") {
+        referenceAnswer = content;
+      } else if (content !== void 0 && content !== null) {
+        referenceAnswer = JSON.stringify(content, null, 2);
+      } else if (toolCalls !== void 0 && toolCalls !== null) {
+        referenceAnswer = JSON.stringify(toolCalls, null, 2);
+      }
+    }
+    const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
+    const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
+    const mergedExecution = caseExecution ?? globalExecution;
+    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+    let evaluators;
+    try {
+      evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`);
+      continue;
+    }
+    const inlineRubrics = evalcase.rubrics;
+    if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
+      const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+      if (rubricEvaluator) {
+        evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
+      }
+    }
+    const userFilePaths = [];
+    for (const segment of inputSegments) {
+      if (segment.type === "file" && typeof segment.resolvedPath === "string") {
+        userFilePaths.push(segment.resolvedPath);
+      }
+    }
+    const allFilePaths = [
+      ...guidelinePaths.map((guidelinePath) => path5.resolve(guidelinePath)),
+      ...userFilePaths
+    ];
+    const testCase = {
+      id,
+      dataset: datasetName,
+      conversation_id: conversationId,
+      question,
+      input_messages: inputMessages,
+      input_segments: inputSegments,
+      expected_messages: outputSegments,
+      reference_answer: referenceAnswer,
+      guideline_paths: guidelinePaths.map((guidelinePath) => path5.resolve(guidelinePath)),
+      guideline_patterns: guidelinePatterns,
+      file_paths: allFilePaths,
+      expected_outcome: outcome,
+      evaluator: evalCaseEvaluatorKind,
+      evaluators
+    };
+    if (verbose) {
+      console.log(`
+[Eval Case: ${id}]`);
+      if (testCase.guideline_paths.length > 0) {
+        console.log(`  Guidelines used: ${testCase.guideline_paths.length}`);
+        for (const guidelinePath of testCase.guideline_paths) {
+          console.log(`    - ${guidelinePath}`);
+        }
+      } else {
+        console.log("  No guidelines found");
+      }
+    }
+    results.push(testCase);
+  }
+  return results;
+}
+function asString4(value) {
+  return typeof value === "string" ? value : void 0;
+}
+function logWarning4(message, details) {
+  if (details && details.length > 0) {
+    const detailBlock = details.join("\n");
+    console.warn(`${ANSI_YELLOW5}Warning: ${message}
+${detailBlock}${ANSI_RESET5}`);
+  } else {
+    console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
+  }
+}
+function logError(message, details) {
+  if (details && details.length > 0) {
+    const detailBlock = details.join("\n");
+    console.error(`${ANSI_RED}Error: ${message}
+${detailBlock}${ANSI_RESET5}`);
+  } else {
+    console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET5}`);
+  }
+}
+// src/evaluation/formatting/prompt-builder.ts
+import { readFile as readFile5 } from "node:fs/promises";
+import path6 from "node:path";
+var ANSI_YELLOW6 = "\x1B[33m";
+var ANSI_RESET6 = "\x1B[0m";
 async function buildPromptInputs(testCase, mode = "lm") {
   const guidelineParts = [];
   for (const rawPath of testCase.guideline_paths) {
-    const absolutePath = path5.resolve(rawPath);
+    const absolutePath = path6.resolve(rawPath);
     if (!await fileExists2(absolutePath)) {
-      logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
+      logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
       continue;
     }
     try {
-      const content = (await readFile4(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
+      const content = (await readFile5(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
       guidelineParts.push({
         content,
         isFile: true,
-        displayPath: path5.basename(absolutePath)
+        displayPath: path6.basename(absolutePath)
       });
     } catch (error) {
-      logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
+      logWarning5(`Could not read guideline file ${absolutePath}: ${error.message}`);
     }
   }
   const guidelines = formatFileContents(guidelineParts);
@@ -1231,9 +1723,9 @@ async function buildPromptInputs(testCase, mode = "lm") {
             messageSegments.push({ type: "text", value: segment });
           }
         } else if (isJsonObject(segment)) {
-          const type = asString4(segment.type);
+          const type = asString5(segment.type);
           if (type === "file") {
-            const value = asString4(segment.value);
+            const value = asString5(segment.value);
             if (!value) continue;
             if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
               messageSegments.push({ type: "guideline_ref", path: value });
@@ -1244,7 +1736,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
               messageSegments.push({ type: "file", text: fileText, path: value });
             }
           } else if (type === "text") {
-            const textValue = asString4(segment.value);
+            const textValue = asString5(segment.value);
             if (textValue && textValue.trim().length > 0) {
               messageSegments.push({ type: "text", value: textValue });
             }
@@ -1398,21 +1890,21 @@ ${guidelineContent.trim()}`);
   }
   return chatPrompt.length > 0 ? chatPrompt : void 0;
 }
-function asString4(value) {
+function asString5(value) {
   return typeof value === "string" ? value : void 0;
 }
-function logWarning4(message) {
-  console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
+function logWarning5(message) {
+  console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
 }
 // src/evaluation/yaml-parser.ts
-var ANSI_YELLOW6 = "\x1B[33m";
-var ANSI_RED = "\x1B[31m";
-var ANSI_RESET6 = "\x1B[0m";
+var ANSI_YELLOW7 = "\x1B[33m";
+var ANSI_RED2 = "\x1B[31m";
+var ANSI_RESET7 = "\x1B[0m";
 async function readTestSuiteMetadata(testFilePath) {
   try {
-    const absolutePath = path6.resolve(testFilePath);
-    const content = await readFile5(absolutePath, "utf8");
+    const absolutePath = path7.resolve(testFilePath);
+    const content = await readFile6(absolutePath, "utf8");
     const parsed = parse2(content);
     if (!isJsonObject(parsed)) {
       return {};
@@ -1423,21 +1915,25 @@ async function readTestSuiteMetadata(testFilePath) {
   }
 }
 async function loadEvalCases(evalFilePath, repoRoot, options) {
+  const format = detectFormat(evalFilePath);
+  if (format === "jsonl") {
+    return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
+  }
   const verbose = options?.verbose ?? false;
-  const evalIdFilter = options?.evalId;
-  const absoluteTestPath = path6.resolve(evalFilePath);
+  const filterPattern = options?.filter;
+  const absoluteTestPath = path7.resolve(evalFilePath);
   const repoRootPath = resolveToAbsolutePath(repoRoot);
   const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
   const config = await loadConfig(absoluteTestPath, repoRootPath);
   const guidelinePatterns = config?.guideline_patterns;
-  const rawFile = await readFile5(absoluteTestPath, "utf8");
+  const rawFile = await readFile6(absoluteTestPath, "utf8");
   const parsed = parse2(rawFile);
   if (!isJsonObject(parsed)) {
     throw new Error(`Invalid test file format: ${evalFilePath}`);
   }
   const suite = parsed;
-  const datasetNameFromSuite = asString5(suite.dataset)?.trim();
-  const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
+  const datasetNameFromSuite = asString6(suite.dataset)?.trim();
+  const fallbackDataset = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
   const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
   const rawTestcases = suite.evalcases;
   if (!Array.isArray(rawTestcases)) {
@@ -1445,37 +1941,29 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
   }
   const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
   const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
-  const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
+  const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
   const results = [];
   for (const rawEvalcase of rawTestcases) {
     if (!isJsonObject(rawEvalcase)) {
-      logWarning5("Skipping invalid eval case entry (expected object)");
+      logWarning6("Skipping invalid eval case entry (expected object)");
       continue;
     }
     const evalcase = rawEvalcase;
-    const id = asString5(evalcase.id);
-    if (evalIdFilter && id !== evalIdFilter) {
+    const id = asString6(evalcase.id);
+    if (filterPattern && (!id || !micromatch3.isMatch(id, filterPattern))) {
       continue;
     }
-    const conversationId = asString5(evalcase.conversation_id);
-    const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
-    const inputMessagesValue = evalcase.input_messages;
-    const expectedMessagesValue = evalcase.expected_messages;
-    if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
-      logError(
-        `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
+    const conversationId = asString6(evalcase.conversation_id);
+    const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
+    const inputMessages = resolveInputMessages(evalcase);
+    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+    if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
+      logError2(
+        `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
       );
       continue;
     }
-    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
-    const inputMessages = inputMessagesValue.filter(
-      (msg) => isTestMessage(msg)
-    );
-    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
-    if (hasExpectedMessages && expectedMessages.length === 0) {
-      logError(`No valid expected message found for eval case: ${id}`);
-      continue;
-    }
+    const hasExpectedMessages = expectedMessages.length > 0;
     const guidelinePaths = [];
     const inputTextParts = [];
     const inputSegments = await processMessages({
@@ -1514,33 +2002,13 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
-      logError(`Skipping eval case '${id}': ${message}`);
+      logError2(`Skipping eval case '${id}': ${message}`);
       continue;
     }
     const inlineRubrics = evalcase.rubrics;
     if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
-      const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
-        if (typeof rubric === "string") {
-          return {
-            id: `rubric-${index + 1}`,
-            description: rubric,
-            weight: 1,
-            required: true
-          };
-        }
-        return {
-          id: asString5(rubric.id) ?? `rubric-${index + 1}`,
-          description: asString5(rubric.description) ?? "",
-          weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-          required: typeof rubric.required === "boolean" ? rubric.required : true
-        };
-      }).filter((r) => r.description.length > 0);
-      if (rubricItems.length > 0) {
-        const rubricEvaluator = {
-          name: "rubric",
-          type: "llm_judge",
-          rubrics: rubricItems
-        };
+      const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+      if (rubricEvaluator) {
         evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
       }
     }
@@ -1551,7 +2019,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       }
     }
     const allFilePaths = [
-      ...guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
+      ...guidelinePaths.map((guidelinePath) => path7.resolve(guidelinePath)),
       ...userFilePaths
     ];
     const testCase = {
@@ -1563,7 +2031,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       input_segments: inputSegments,
       expected_messages: outputSegments,
       reference_answer: referenceAnswer,
-      guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
+      guideline_paths: guidelinePaths.map((guidelinePath) => path7.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
       file_paths: allFilePaths,
       expected_outcome: outcome,
@@ -1586,25 +2054,25 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
   }
   return results;
 }
-function asString5(value) {
+function asString6(value) {
   return typeof value === "string" ? value : void 0;
 }
-function logWarning5(message, details) {
+function logWarning6(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW6}Warning: ${message}
-${detailBlock}${ANSI_RESET6}`);
+    console.warn(`${ANSI_YELLOW7}Warning: ${message}
+${detailBlock}${ANSI_RESET7}`);
   } else {
-    console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
+    console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
   }
 }
-function logError(message, details) {
+function logError2(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.error(`${ANSI_RED}Error: ${message}
-${detailBlock}${ANSI_RESET6}`);
+    console.error(`${ANSI_RED2}Error: ${message}
+${detailBlock}${ANSI_RESET7}`);
   } else {
-    console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
+    console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
   }
 }
@@ -1947,7 +2415,7 @@ import { randomUUID } from "node:crypto";
 import { createWriteStream } from "node:fs";
 import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
-import path8 from "node:path";
+import path9 from "node:path";
 // src/evaluation/providers/claude-code-log-tracker.ts
 var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
@@ -2003,7 +2471,7 @@ function subscribeToClaudeCodeLogEntries(listener) {
 }
 // src/evaluation/providers/preread.ts
-import path7 from "node:path";
+import path8 from "node:path";
 function buildPromptDocument(request, inputFiles, options) {
   const parts = [];
   const guidelineFiles = collectGuidelineFiles(
@@ -2026,7 +2494,7 @@ function normalizeInputFiles(inputFiles) {
   }
   const deduped = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = path7.resolve(inputFile);
+    const absolutePath = path8.resolve(inputFile);
     if (!deduped.has(absolutePath)) {
       deduped.set(absolutePath, absolutePath);
     }
@@ -2039,14 +2507,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = path7.resolve(inputFile);
+    const absolutePath = path8.resolve(inputFile);
     if (overrides?.has(absolutePath)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
       }
       continue;
     }
-    const normalized = absolutePath.split(path7.sep).join("/");
+    const normalized = absolutePath.split(path8.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -2061,7 +2529,7 @@ function collectInputFiles(inputFiles) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = path7.resolve(inputFile);
+    const absolutePath = path8.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -2073,7 +2541,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = path7.basename(absolutePath);
+    const fileName = path8.basename(absolutePath);
     const fileUri = pathToFileUri(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -2093,7 +2561,7 @@ ${buildList(inputFiles).join("\n")}.`);
   return sections.join("\n");
 }
 function pathToFileUri(filePath) {
-  const absolutePath = path7.isAbsolute(filePath) ? filePath : path7.resolve(filePath);
+  const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -2130,7 +2598,7 @@ var ClaudeCodeProvider = class {
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
-      const promptFile = path8.join(workspaceRoot, PROMPT_FILENAME);
+      const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
       await writeFile(promptFile, request.question, "utf8");
       const args = this.buildClaudeCodeArgs(request.question, inputFiles);
       const cwd = this.resolveCwd();
@@ -2178,7 +2646,7 @@ var ClaudeCodeProvider = class {
     if (!this.config.cwd) {
       return process.cwd();
     }
-    return path8.resolve(this.config.cwd);
+    return path9.resolve(this.config.cwd);
   }
   buildClaudeCodeArgs(prompt, inputFiles) {
     const args = [];
@@ -2235,7 +2703,7 @@ ${filesContext}`;
     }
   }
   async createWorkspace() {
-    return await mkdtemp(path8.join(tmpdir(), WORKSPACE_PREFIX));
+    return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
@@ -2249,9 +2717,9 @@ ${filesContext}`;
       return void 0;
     }
     if (this.config.logDir) {
-      return path8.resolve(this.config.logDir);
+      return path9.resolve(this.config.logDir);
     }
-    return path8.join(process.cwd(), ".agentv", "logs", "claude-code");
+    return path9.join(process.cwd(), ".agentv", "logs", "claude-code");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -2265,7 +2733,7 @@ ${filesContext}`;
       console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = path8.join(logDir, buildLogFilename(request, this.targetName));
+    const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
     try {
       const logger = await ClaudeCodeStreamLogger.create({
         filePath,
@@ -2670,10 +3138,10 @@ function escapeShellArg(arg) {
 }
 async function defaultClaudeCodeRunner(options) {
   const tempId = randomUUID();
-  const stdoutFile = path8.join(tmpdir(), `agentv-cc-${tempId}-stdout`);
-  const stderrFile = path8.join(tmpdir(), `agentv-cc-${tempId}-stderr`);
-  const exitFile = path8.join(tmpdir(), `agentv-cc-${tempId}-exit`);
-  const pidFile = path8.join(tmpdir(), `agentv-cc-${tempId}-pid`);
+  const stdoutFile = path9.join(tmpdir(), `agentv-cc-${tempId}-stdout`);
+  const stderrFile = path9.join(tmpdir(), `agentv-cc-${tempId}-stderr`);
+  const exitFile = path9.join(tmpdir(), `agentv-cc-${tempId}-exit`);
+  const pidFile = path9.join(tmpdir(), `agentv-cc-${tempId}-pid`);
   try {
     return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
   } finally {
@@ -2713,8 +3181,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
   let lastStdoutSize = 0;
   const readFileIfExists = async (filePath) => {
     try {
-      const { readFile: readFile7 } = await import("node:fs/promises");
-      return await readFile7(filePath, "utf8");
+      const { readFile: readFile8 } = await import("node:fs/promises");
+      return await readFile8(filePath, "utf8");
     } catch {
       return "";
     }
@@ -2789,7 +3257,7 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
 import { exec as execWithCallback } from "node:child_process";
 import fs from "node:fs/promises";
 import os from "node:os";
-import path9 from "node:path";
+import path10 from "node:path";
 import { promisify } from "node:util";
 import { z } from "zod";
 var ToolCallSchema = z.object({
@@ -2797,7 +3265,8 @@ var ToolCallSchema = z.object({
   input: z.unknown().optional(),
   output: z.unknown().optional(),
   id: z.string().optional(),
-  timestamp: z.string().optional()
+  timestamp: z.string().optional(),
+  duration_ms: z.number().optional()
 });
 var OutputMessageInputSchema = z.object({
   role: z.string(),
@@ -2805,6 +3274,7 @@ var OutputMessageInputSchema = z.object({
   content: z.unknown().optional(),
   tool_calls: z.array(ToolCallSchema).optional(),
   timestamp: z.string().optional(),
+  duration_ms: z.number().optional(),
   metadata: z.record(z.unknown()).optional()
 });
 var TokenUsageSchema = z.object({
@@ -2843,8 +3313,16 @@ function convertOutputMessages(messages) {
     role: msg.role,
     name: msg.name,
     content: msg.content,
-    toolCalls: msg.tool_calls,
+    toolCalls: msg.tool_calls?.map((tc) => ({
+      tool: tc.tool,
+      input: tc.input,
+      output: tc.output,
+      id: tc.id,
+      timestamp: tc.timestamp,
+      durationMs: tc.duration_ms
+    })),
     timestamp: msg.timestamp,
+    durationMs: msg.duration_ms,
     metadata: msg.metadata
   }));
 }
@@ -3246,7 +3724,7 @@ function normalizeInputFiles2(inputFiles) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = path9.resolve(inputFile);
+    const absolutePath = path10.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -3260,7 +3738,7 @@ function formatFileList(files, template) {
   const formatter = template ?? "{path}";
   return files.map((filePath) => {
     const escapedPath = shellEscape(filePath);
-    const escapedName = shellEscape(path9.basename(filePath));
+    const escapedName = shellEscape(path10.basename(filePath));
     return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
   }).join(" ");
 }
@@ -3284,7 +3762,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
   const safeEvalId = evalCaseId || "unknown";
   const timestamp = Date.now();
   const random = Math.random().toString(36).substring(2, 9);
-  return path9.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
+  return path10.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
 }
 function formatTimeoutSuffix2(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
@@ -3300,7 +3778,7 @@ import { randomUUID as randomUUID2 } from "node:crypto";
 import { constants as constants2, createWriteStream as createWriteStream2 } from "node:fs";
 import { access as access2, mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
 import { tmpdir as tmpdir2 } from "node:os";
-import path10 from "node:path";
+import path11 from "node:path";
 import { promisify as promisify2 } from "node:util";
 // src/evaluation/providers/codex-log-tracker.ts
@@ -3395,7 +3873,7 @@ var CodexProvider = class {
       const promptContent = `${systemPrompt}
 ${basePrompt}`;
-      const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
+      const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME2);
       await writeFile2(promptFile, promptContent, "utf8");
       const args = this.buildCodexArgs();
       const cwd = this.resolveCwd(workspaceRoot);
@@ -3445,7 +3923,7 @@ ${basePrompt}`;
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return path10.resolve(this.config.cwd);
+    return path11.resolve(this.config.cwd);
   }
   buildCodexArgs() {
     const args = [
@@ -3487,7 +3965,7 @@ ${basePrompt}`;
     }
   }
   async createWorkspace() {
-    return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
+    return await mkdtemp2(path11.join(tmpdir2(), WORKSPACE_PREFIX2));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
@@ -3501,9 +3979,9 @@ ${basePrompt}`;
       return void 0;
     }
     if (this.config.logDir) {
-      return path10.resolve(this.config.logDir);
+      return path11.resolve(this.config.logDir);
     }
-    return path10.join(process.cwd(), ".agentv", "logs", "codex");
+    return path11.join(process.cwd(), ".agentv", "logs", "codex");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -3517,7 +3995,7 @@ ${basePrompt}`;
       console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
+    const filePath = path11.join(logDir, buildLogFilename2(request, this.targetName));
     try {
       const logger = await CodexStreamLogger.create({
         filePath,
@@ -3732,7 +4210,7 @@ function tryParseJsonValue2(rawLine) {
 async function locateExecutable(candidate) {
   const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
   if (includesPathSeparator) {
-    const resolved = path10.isAbsolute(candidate) ? candidate : path10.resolve(candidate);
+    const resolved = path11.isAbsolute(candidate) ? candidate : path11.resolve(candidate);
     const executablePath = await ensureWindowsExecutableVariant(resolved);
     await access2(executablePath, constants2.F_OK);
     return executablePath;
@@ -4245,7 +4723,7 @@ import { randomUUID as randomUUID3 } from "node:crypto";
 import { createWriteStream as createWriteStream3 } from "node:fs";
 import { mkdir as mkdir3, mkdtemp as mkdtemp3, rm as rm3, writeFile as writeFile3 } from "node:fs/promises";
 import { tmpdir as tmpdir3 } from "node:os";
-import path11 from "node:path";
+import path12 from "node:path";
 // src/evaluation/providers/pi-log-tracker.ts
 var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
@@ -4329,7 +4807,7 @@ var PiCodingAgentProvider = class {
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
-      const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME3);
+      const promptFile = path12.join(workspaceRoot, PROMPT_FILENAME3);
       await writeFile3(promptFile, request.question, "utf8");
       const args = this.buildPiArgs(request.question, inputFiles);
       const cwd = this.resolveCwd(workspaceRoot);
@@ -4371,7 +4849,7 @@ var PiCodingAgentProvider = class {
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return path11.resolve(this.config.cwd);
+    return path12.resolve(this.config.cwd);
   }
   buildPiArgs(prompt, inputFiles) {
     const args = [];
@@ -4460,7 +4938,7 @@ ${prompt}`;
     return env;
   }
   async createWorkspace() {
-    return await mkdtemp3(path11.join(tmpdir3(), WORKSPACE_PREFIX3));
+    return await mkdtemp3(path12.join(tmpdir3(), WORKSPACE_PREFIX3));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
@@ -4470,9 +4948,9 @@ ${prompt}`;
   }
   resolveLogDirectory() {
     if (this.config.logDir) {
-      return path11.resolve(this.config.logDir);
+      return path12.resolve(this.config.logDir);
     }
-    return path11.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
+    return path12.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -4486,7 +4964,7 @@ ${prompt}`;
       console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = path11.join(logDir, buildLogFilename3(request, this.targetName));
+    const filePath = path12.join(logDir, buildLogFilename3(request, this.targetName));
     try {
       const logger = await PiStreamLogger.create({
         filePath,
@@ -4919,7 +5397,7 @@ async function defaultPiRunner(options) {
 }
 // src/evaluation/providers/vscode.ts
-import path12 from "node:path";
+import path13 from "node:path";
 import {
   dispatchAgentSession,
   dispatchBatchAgent,
@@ -5094,7 +5572,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = path12.basename(absolutePath);
+    const fileName = path13.basename(absolutePath);
     const fileUri = pathToFileUri2(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -5119,8 +5597,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path12.resolve(attachment);
-    const normalized = absolutePath.split(path12.sep).join("/");
+    const absolutePath = path13.resolve(attachment);
+    const normalized = absolutePath.split(path13.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -5135,7 +5613,7 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path12.resolve(attachment);
+    const absolutePath = path13.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -5143,7 +5621,7 @@ function collectAttachmentFiles(attachments) {
   return Array.from(unique.values());
 }
 function pathToFileUri2(filePath) {
-  const absolutePath = path12.isAbsolute(filePath) ? filePath : path12.resolve(filePath);
+  const absolutePath = path13.isAbsolute(filePath) ? filePath : path13.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -5156,7 +5634,7 @@ function normalizeAttachments(attachments) {
   }
   const deduped = /* @__PURE__ */ new Set();
   for (const attachment of attachments) {
-    deduped.add(path12.resolve(attachment));
+    deduped.add(path13.resolve(attachment));
   }
   return Array.from(deduped);
 }
@@ -5165,7 +5643,7 @@ function mergeAttachments(all) {
   for (const list of all) {
     if (!list) continue;
     for (const inputFile of list) {
-      deduped.add(path12.resolve(inputFile));
+      deduped.add(path13.resolve(inputFile));
     }
   }
   return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -5213,8 +5691,8 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 import { constants as constants3 } from "node:fs";
-import { access as access3, readFile as readFile6 } from "node:fs/promises";
-import path13 from "node:path";
+import { access as access3, readFile as readFile7 } from "node:fs/promises";
+import path14 from "node:path";
 import { parse as parse3 } from "yaml";
 function isRecord(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -5251,11 +5729,11 @@ async function fileExists3(filePath) {
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = path13.resolve(filePath);
+  const absolutePath = path14.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
-  const raw = await readFile6(absolutePath, "utf8");
+  const raw = await readFile7(absolutePath, "utf8");
   const parsed = parse3(raw);
   if (!isRecord(parsed)) {
     throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
@@ -5462,15 +5940,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
   });
 }
 async function execShellWithStdin(command, stdinPayload, options = {}) {
-  const { mkdir: mkdir4, readFile: readFile7, rm: rm4, writeFile: writeFile4 } = await import("node:fs/promises");
+  const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("node:fs/promises");
   const { tmpdir: tmpdir4 } = await import("node:os");
-  const path15 = await import("node:path");
+  const path16 = await import("node:path");
   const { randomUUID: randomUUID4 } = await import("node:crypto");
-  const dir = path15.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
+  const dir = path16.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
   await mkdir4(dir, { recursive: true });
-  const stdinPath = path15.join(dir, "stdin.txt");
-  const stdoutPath = path15.join(dir, "stdout.txt");
-  const stderrPath = path15.join(dir, "stderr.txt");
+  const stdinPath = path16.join(dir, "stdin.txt");
+  const stdoutPath = path16.join(dir, "stdout.txt");
+  const stderrPath = path16.join(dir, "stderr.txt");
   await writeFile4(stdinPath, stdinPayload, "utf8");
   const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
   const { spawn: spawn4 } = await import("node:child_process");
@@ -5500,8 +5978,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
         resolve(code ?? 0);
       });
     });
-    const stdout = (await readFile7(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
-    const stderr = (await readFile7(stderrPath, "utf8")).replace(/\r\n/g, "\n");
+    const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
+    const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
     return { stdout, stderr, exitCode };
   } finally {
     await rm4(dir, { recursive: true, force: true });
@@ -5773,7 +6251,7 @@ var CodeEvaluator = class {
       outputMessages: context.outputMessages ?? null,
       guidelineFiles: context.evalCase.guideline_paths,
       inputFiles: context.evalCase.file_paths.filter(
-        (path15) => !context.evalCase.guideline_paths.includes(path15)
+        (path16) => !context.evalCase.guideline_paths.includes(path16)
       ),
       inputMessages: context.evalCase.input_messages,
       traceSummary: context.traceSummary ?? null,
@@ -5921,6 +6399,15 @@ var rubricEvaluationSchema = z2.object({
   checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
   overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
 });
+var scoreRangeCheckResultSchema = z2.object({
+  id: z2.string().describe("The ID of the rubric criterion being scored"),
+  score: z2.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
+  reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this score").optional()
+});
+var scoreRangeEvaluationSchema = z2.object({
+  checks: z2.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
+  overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)").optional()
+});
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
@@ -6006,6 +6493,10 @@ var LlmJudgeEvaluator = class {
         `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
       );
     }
+    const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
+    if (hasScoreRanges) {
+      return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
+    }
     const prompt = this.buildRubricPrompt(context, rubrics);
     const systemPrompt = buildRubricOutputSchema();
     const evaluatorRawRequest = {
@@ -6031,6 +6522,84 @@ var LlmJudgeEvaluator = class {
       evaluatorRawRequest
     };
   }
+  /**
+   * Evaluate using score-range rubrics (analytic rubric scoring).
+   * Each criterion is scored 0-10 and normalized to 0-1.
+   */
+  async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
+    const prompt = this.buildScoreRangePrompt(context, rubrics);
+    const systemPrompt = buildScoreRangeOutputSchema();
+    const evaluatorRawRequest = {
+      userPrompt: prompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    const { data } = await this.runWithRetry({
+      context,
+      judgeProvider,
+      systemPrompt,
+      userPrompt: prompt,
+      schema: scoreRangeEvaluationSchema
+    });
+    const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
+    return {
+      score,
+      verdict,
+      hits,
+      misses,
+      expectedAspectCount: rubrics.length,
+      reasoning: data.overall_reasoning,
+      evaluatorRawRequest,
+      details
+    };
+  }
+  /**
+   * Build prompt for score-range rubric evaluation.
+   */
+  buildScoreRangePrompt(context, rubrics) {
+    const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+    const parts = [
+      "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
+      "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
+      "",
+      "[[ ## question ## ]]",
+      formattedQuestion,
+      "",
+      "[[ ## expected_outcome ## ]]",
+      context.evalCase.expected_outcome,
+      ""
+    ];
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+    }
+    parts.push(
+      "[[ ## candidate_answer ## ]]",
+      context.candidate,
+      "",
+      "[[ ## scoring_criteria ## ]]"
+    );
+    for (const rubric of rubrics) {
+      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+      const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
+      parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
+      if (rubric.expected_outcome) {
+        parts.push(`Description: ${rubric.expected_outcome}`);
+      }
+      if (rubric.score_ranges && rubric.score_ranges.length > 0) {
+        parts.push("Score ranges:");
+        for (const range of rubric.score_ranges) {
+          const [min, max] = range.score_range;
+          const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
+          parts.push(`  - Score ${rangeLabel}: ${range.expected_outcome}`);
+        }
+      }
+    }
+    parts.push(
+      "",
+      "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
+    );
+    return parts.join("\n");
+  }
   buildRubricPrompt(context, rubrics) {
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
     const parts = [
@@ -6050,7 +6619,7 @@ var LlmJudgeEvaluator = class {
     for (const rubric of rubrics) {
       const requiredLabel = rubric.required ? " (REQUIRED)" : "";
       const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
-      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
     }
     parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
     return parts.join("\n");
@@ -6137,9 +6706,9 @@ function calculateRubricScore(result, rubrics) {
     totalWeight += rubric.weight;
     if (check.satisfied) {
       earnedWeight += rubric.weight;
-      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
     } else {
-      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
       if (rubric.required) {
         failedRequired = true;
       }
@@ -6149,6 +6718,76 @@ function calculateRubricScore(result, rubrics) {
   const verdict = failedRequired ? "fail" : scoreToVerdict(score);
   return { score, verdict, hits, misses };
 }
+function buildScoreRangeOutputSchema() {
+  return `You are an expert evaluator. Score the candidate answer on each criterion.
+You must return a valid JSON object matching this schema:
+{
+  "checks": [
+    {
+      "id": "string (criterion id)",
+      "score": integer (0-10),
+      "reasoning": "string (brief explanation for score)"
+    }
+  ],
+  "overall_reasoning": "string (summary, optional)"
+}
+Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
+}
+function calculateScoreRangeResult(result, rubrics) {
+  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+  const hits = [];
+  const misses = [];
+  const rawScores = {};
+  let totalWeight = 0;
+  let weightedScoreSum = 0;
+  let failedRequired = false;
+  for (const check of result.checks) {
+    const rubric = rubricMap.get(check.id);
+    if (!rubric) {
+      continue;
+    }
+    const rawScore = Math.max(0, Math.min(10, check.score));
+    const normalizedScore = rawScore / 10;
+    rawScores[rubric.id] = rawScore;
+    totalWeight += rubric.weight;
+    weightedScoreSum += normalizedScore * rubric.weight;
+    let requiredMinScore;
+    if (rubric.required_min_score !== void 0) {
+      requiredMinScore = rubric.required_min_score;
+    } else if (rubric.required === true) {
+      requiredMinScore = 10;
+    }
+    const matchingRange = rubric.score_ranges?.find(
+      (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
+    );
+    const rangeDescription = matchingRange?.expected_outcome ?? "";
+    const criterionLabel = rubric.expected_outcome ?? rubric.id;
+    const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
+    const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
+    if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
+      failedRequired = true;
+      misses.push(scoreInfo);
+    } else if (rawScore >= 7) {
+      hits.push(scoreInfo);
+    } else {
+      misses.push(scoreInfo);
+    }
+  }
+  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
+  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+  return {
+    score,
+    verdict,
+    hits,
+    misses,
+    details: {
+      raw_scores: rawScores,
+      normalization: "score / 10",
+      aggregation: "weighted_average"
+    }
+  };
+}
 // src/evaluation/evaluators/composite.ts
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
@@ -6532,115 +7171,115 @@ var FieldAccuracyEvaluator = class {
    * Evaluate a single field against the expected value.
    */
   evaluateField(fieldConfig, candidateData, expectedData) {
-    const { path: path15, match, required = true, weight = 1 } = fieldConfig;
-    const candidateValue = resolvePath(candidateData, path15);
-    const expectedValue = resolvePath(expectedData, path15);
+    const { path: path16, match, required = true, weight = 1 } = fieldConfig;
+    const candidateValue = resolvePath(candidateData, path16);
+    const expectedValue = resolvePath(expectedData, path16);
     if (expectedValue === void 0) {
       return {
-        path: path15,
+        path: path16,
         score: 1,
         // No expected value means no comparison needed
         weight,
         hit: true,
-        message: `${path15}: no expected value`
+        message: `${path16}: no expected value`
       };
     }
     if (candidateValue === void 0) {
       if (required) {
         return {
-          path: path15,
+          path: path16,
           score: 0,
           weight,
           hit: false,
-          message: `${path15} (required, missing)`
+          message: `${path16} (required, missing)`
         };
       }
       return {
-        path: path15,
+        path: path16,
         score: 1,
         // Don't penalize missing optional fields
         weight: 0,
         // Zero weight means it won't affect the score
         hit: true,
-        message: `${path15}: optional field missing`
+        message: `${path16}: optional field missing`
       };
     }
     switch (match) {
       case "exact":
-        return this.compareExact(path15, candidateValue, expectedValue, weight);
+        return this.compareExact(path16, candidateValue, expectedValue, weight);
       case "numeric_tolerance":
         return this.compareNumericTolerance(
-          path15,
+          path16,
           candidateValue,
           expectedValue,
           fieldConfig,
           weight
         );
       case "date":
-        return this.compareDate(path15, candidateValue, expectedValue, fieldConfig, weight);
+        return this.compareDate(path16, candidateValue, expectedValue, fieldConfig, weight);
       default:
         return {
-          path: path15,
+          path: path16,
           score: 0,
           weight,
           hit: false,
-          message: `${path15}: unknown match type "${match}"`
+          message: `${path16}: unknown match type "${match}"`
         };
     }
   }
   /**
    * Exact equality comparison.
    */
-  compareExact(path15, candidateValue, expectedValue, weight) {
+  compareExact(path16, candidateValue, expectedValue, weight) {
     if (deepEqual(candidateValue, expectedValue)) {
       return {
-        path: path15,
+        path: path16,
         score: 1,
         weight,
         hit: true,
-        message: path15
+        message: path16
       };
     }
     if (typeof candidateValue !== typeof expectedValue) {
       return {
-        path: path15,
+        path: path16,
         score: 0,
         weight,
         hit: false,
-        message: `${path15} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
+        message: `${path16} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
       };
     }
     return {
-      path: path15,
+      path: path16,
       score: 0,
       weight,
       hit: false,
-      message: `${path15} (value mismatch)`
+      message: `${path16} (value mismatch)`
     };
   }
   /**
    * Numeric comparison with absolute or relative tolerance.
    */
-  compareNumericTolerance(path15, candidateValue, expectedValue, fieldConfig, weight) {
+  compareNumericTolerance(path16, candidateValue, expectedValue, fieldConfig, weight) {
     const { tolerance = 0, relative = false } = fieldConfig;
     const candidateNum = toNumber(candidateValue);
     const expectedNum = toNumber(expectedValue);
     if (candidateNum === null || expectedNum === null) {
       return {
-        path: path15,
+        path: path16,
         score: 0,
         weight,
         hit: false,
-        message: `${path15} (non-numeric value)`
+        message: `${path16} (non-numeric value)`
       };
     }
     if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
       return {
-        path: path15,
+        path: path16,
         score: 0,
         weight,
         hit: false,
-        message: `${path15} (invalid numeric value)`
+        message: `${path16} (invalid numeric value)`
       };
     }
     const diff = Math.abs(candidateNum - expectedNum);
@@ -6653,61 +7292,61 @@ var FieldAccuracyEvaluator = class {
     }
     if (withinTolerance) {
       return {
-        path: path15,
+        path: path16,
         score: 1,
         weight,
         hit: true,
-        message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
+        message: `${path16} (within tolerance: diff=${diff.toFixed(2)})`
       };
     }
     return {
-      path: path15,
+      path: path16,
       score: 0,
       weight,
       hit: false,
-      message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
+      message: `${path16} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
     };
   }
   /**
    * Date comparison with format normalization.
    */
-  compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
+  compareDate(path16, candidateValue, expectedValue, fieldConfig, weight) {
     const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
     const candidateDate = parseDate(String(candidateValue), formats);
     const expectedDate = parseDate(String(expectedValue), formats);
     if (candidateDate === null) {
       return {
-        path: path15,
+        path: path16,
         score: 0,
         weight,
         hit: false,
-        message: `${path15} (unparseable candidate date)`
+        message: `${path16} (unparseable candidate date)`
       };
     }
     if (expectedDate === null) {
       return {
-        path: path15,
+        path: path16,
         score: 0,
         weight,
         hit: false,
-        message: `${path15} (unparseable expected date)`
+        message: `${path16} (unparseable expected date)`
       };
     }
     if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
       return {
-        path: path15,
+        path: path16,
         score: 1,
         weight,
         hit: true,
-        message: path15
+        message: path16
       };
     }
     return {
-      path: path15,
+      path: path16,
       score: 0,
       weight,
       hit: false,
-      message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
+      message: `${path16} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
     };
   }
   /**
@@ -6747,11 +7386,11 @@ var FieldAccuracyEvaluator = class {
     };
   }
 };
-function resolvePath(obj, path15) {
-  if (!path15 || !obj) {
+function resolvePath(obj, path16) {
+  if (!path16 || !obj) {
     return void 0;
   }
-  const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
+  const parts = path16.split(/\.|\[|\]/).filter((p) => p.length > 0);
   let current = obj;
   for (const part of parts) {
     if (current === null || current === void 0) {
@@ -6976,6 +7615,27 @@ function argsMatch(expected, actual) {
   }
   return true;
 }
+function checkLatency(toolName, maxDurationMs, actualDurationMs) {
+  if (maxDurationMs === void 0) {
+    return { status: "skip", message: "" };
+  }
+  if (actualDurationMs === void 0) {
+    return {
+      status: "skip",
+      message: `No duration data for ${toolName}; latency assertion skipped`
+    };
+  }
+  if (actualDurationMs <= maxDurationMs) {
+    return {
+      status: "pass",
+      message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
+    };
+  }
+  return {
+    status: "fail",
+    message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
+  };
+}
 var ToolTrajectoryEvaluator = class {
   kind = "tool_trajectory";
   config;
@@ -7034,7 +7694,8 @@ var ToolTrajectoryEvaluator = class {
         for (const call of message.toolCalls) {
           toolCalls.push({
             name: call.tool,
-            args: call.input
+            args: call.input,
+            durationMs: call.durationMs
           });
         }
       }
@@ -7102,17 +7763,27 @@ var ToolTrajectoryEvaluator = class {
     }
     const hits = [];
     const misses = [];
+    const warnings = [];
     let actualIndex = 0;
+    let sequenceHits = 0;
+    let latencyHits = 0;
+    let latencySkips = 0;
+    const latencyAssertionCount = expected.filter(
+      (item) => item.maxDurationMs !== void 0
+    ).length;
     for (let i = 0; i < expected.length; i++) {
       const expectedItem = expected[i];
       const expectedTool = expectedItem.tool;
       let found = false;
       let argsMismatch = false;
+      let matchedCall;
       while (actualIndex < toolCalls.length) {
         const actualCall = toolCalls[actualIndex];
         if (actualCall.name === expectedTool) {
           if (argsMatch(expectedItem.args, actualCall.args)) {
             hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            sequenceHits++;
+            matchedCall = actualCall;
             actualIndex++;
             found = true;
             break;
@@ -7129,14 +7800,35 @@ var ToolTrajectoryEvaluator = class {
       if (!found && !argsMismatch) {
         misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
       }
+      if (found && matchedCall) {
+        const latencyResult = checkLatency(
+          expectedTool,
+          expectedItem.maxDurationMs,
+          matchedCall.durationMs
+        );
+        if (latencyResult.status === "pass") {
+          hits.push(latencyResult.message);
+          latencyHits++;
+        } else if (latencyResult.status === "fail") {
+          misses.push(latencyResult.message);
+        } else if (latencyResult.message) {
+          warnings.push(latencyResult.message);
+          latencySkips++;
+        }
+      }
     }
-    const score = hits.length / expected.length;
+    for (const warning of warnings) {
+      console.warn(`[tool_trajectory] ${warning}`);
+    }
+    const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
+    const totalAssertions = expected.length + effectiveLatencyAssertions;
+    const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
     return {
       score,
       verdict: scoreToVerdict(score),
       hits,
       misses,
-      expectedAspectCount: expected.length
+      expectedAspectCount: totalAssertions
     };
   }
   evaluateExact(toolCalls) {
@@ -7152,6 +7844,13 @@ var ToolTrajectoryEvaluator = class {
     }
     const hits = [];
     const misses = [];
+    const warnings = [];
+    let sequenceHits = 0;
+    let latencyHits = 0;
+    let latencySkips = 0;
+    const latencyAssertionCount = expected.filter(
+      (item) => item.maxDurationMs !== void 0
+    ).length;
     if (toolCalls.length !== expected.length) {
       misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
     }
@@ -7161,33 +7860,58 @@ var ToolTrajectoryEvaluator = class {
       const expectedTool = expectedItem.tool;
       const actualCall = toolCalls[i];
       const actualTool = actualCall.name;
+      let sequenceMatched = false;
       if (actualTool === expectedTool) {
         if (argsMatch(expectedItem.args, actualCall.args)) {
           hits.push(`Position ${i}: ${expectedTool}`);
+          sequenceHits++;
+          sequenceMatched = true;
         } else {
           misses.push(`Position ${i}: ${expectedTool} args mismatch`);
         }
       } else {
         misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
       }
+      if (sequenceMatched) {
+        const latencyResult = checkLatency(
+          expectedTool,
+          expectedItem.maxDurationMs,
+          actualCall.durationMs
+        );
+        if (latencyResult.status === "pass") {
+          hits.push(latencyResult.message);
+          latencyHits++;
+        } else if (latencyResult.status === "fail") {
+          misses.push(latencyResult.message);
+        } else if (latencyResult.message) {
+          warnings.push(latencyResult.message);
+          latencySkips++;
+        }
+      }
     }
     for (let i = checkLength; i < expected.length; i++) {
       misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
     }
-    const score = hits.length / expected.length;
+    for (const warning of warnings) {
+      console.warn(`[tool_trajectory] ${warning}`);
+    }
+    const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
+    const totalAssertions = expected.length + effectiveLatencyAssertions;
+    const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
     return {
       score,
       verdict: scoreToVerdict(score),
       hits,
       misses,
-      expectedAspectCount: expected.length
+      expectedAspectCount: totalAssertions
     };
   }
 };
 // src/evaluation/orchestrator.ts
 import { createHash } from "node:crypto";
-import path14 from "node:path";
+import path15 from "node:path";
+import micromatch4 from "micromatch";
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
 var Node = class {
@@ -7346,17 +8070,17 @@ async function runEvaluation(options) {
     cache,
     useCache,
     now,
-    evalId,
+    filter,
     verbose,
     evalCases: preloadedEvalCases,
     onResult,
     onProgress
   } = options;
-  const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
-  const filteredEvalCases = filterEvalCases(evalCases, evalId);
+  const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
+  const filteredEvalCases = filterEvalCases(evalCases, filter);
   if (filteredEvalCases.length === 0) {
-    if (evalId) {
-      throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
+    if (filter) {
+      throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
     }
     return [];
   }
@@ -7932,7 +8656,10 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
-          judgeProvider
+          judgeProvider,
+          outputMessages,
+          traceSummary,
+          agentTimeoutMs
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -7986,7 +8713,7 @@ async function runEvaluatorList(options) {
         });
       }
       if (evaluator.type === "composite") {
-        const evalFileDir = evalCase.guideline_paths[0] ? path14.dirname(evalCase.guideline_paths[0]) : process.cwd();
+        const evalFileDir = evalCase.guideline_paths[0] ? path15.dirname(evalCase.guideline_paths[0]) : process.cwd();
         const createEvaluator = (memberConfig) => {
           switch (memberConfig.type) {
             case "llm_judge":
@@ -8267,9 +8994,22 @@ async function runLlmJudgeEvaluator(options) {
     attempt,
     promptInputs,
     now,
-    judgeProvider
+    judgeProvider,
+    outputMessages,
+    traceSummary,
+    agentTimeoutMs
   } = options;
-  const customPrompt = await resolveCustomPrompt(config);
+  const customPrompt = await resolveCustomPrompt(
+    config,
+    {
+      evalCase,
+      candidate,
+      outputMessages,
+      traceSummary,
+      config: config.config
+    },
+    agentTimeoutMs
+  );
   return evaluatorRegistry.llm_judge.evaluate({
     evalCase,
     candidate,
@@ -8283,23 +9023,70 @@ async function runLlmJudgeEvaluator(options) {
     evaluator: config
   });
 }
-async function resolveCustomPrompt(config) {
-  if (config.promptPath) {
+async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
+  if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
+    if (!context) {
+      throw new Error("Context required for executable prompt templates");
+    }
+    return executePromptTemplate(
+      promptConfig.resolvedPromptScript,
+      context,
+      promptConfig.config,
+      timeoutMs
+    );
+  }
+  const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
+  if (promptPath) {
     try {
-      const content = await readTextFile(config.promptPath);
+      const content = await readTextFile(promptPath);
       return content;
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
-      console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
+      console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
     }
   }
-  return config.prompt;
+  const promptValue = promptConfig.prompt;
+  if (typeof promptValue === "string") {
+    return promptValue;
+  }
+  return void 0;
+}
+async function executePromptTemplate(script, context, config, timeoutMs) {
+  const payload = {
+    question: context.evalCase.question,
+    expectedOutcome: context.evalCase.expected_outcome,
+    expectedMessages: context.evalCase.expected_messages,
+    referenceAnswer: context.evalCase.reference_answer,
+    candidateAnswer: context.candidate,
+    outputMessages: context.outputMessages ?? null,
+    guidelineFiles: context.evalCase.guideline_paths,
+    inputFiles: context.evalCase.file_paths.filter(
+      (p) => !context.evalCase.guideline_paths.includes(p)
+    ),
+    inputMessages: context.evalCase.input_messages,
+    traceSummary: context.traceSummary ?? null,
+    config: config ?? context.config ?? null
+  };
+  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
+  const scriptPath = script[script.length - 1];
+  const cwd = path15.dirname(scriptPath);
+  try {
+    const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
+    const prompt = stdout.trim();
+    if (!prompt) {
+      throw new Error("Prompt template produced empty output");
+    }
+    return prompt;
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    throw new Error(`Prompt template execution failed: ${message}`);
+  }
 }
-function filterEvalCases(evalCases, evalId) {
-  if (!evalId) {
+function filterEvalCases(evalCases, filter) {
+  if (!filter) {
     return evalCases;
   }
-  return evalCases.filter((evalCase) => evalCase.id === evalId);
+  return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
 }
 function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
   const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
@@ -8457,7 +9244,7 @@ import { generateText as generateText4 } from "ai";
 import { z as z3 } from "zod";
 var rubricItemSchema = z3.object({
   id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
-  description: z3.string().describe("What this rubric checks for"),
+  expected_outcome: z3.string().describe("Concrete expected outcome for this rubric item"),
   weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
   required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
 });
@@ -8477,7 +9264,7 @@ You must return a valid JSON object matching this schema:
   "rubrics": [
     {
       "id": "string (short identifier)",
-      "description": "string (what to check)",
+      "expected_outcome": "string (concrete expected outcome for this rubric item)",
       "weight": number (default 1.0),
       "required": boolean (default true)
     }
@@ -8513,7 +9300,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
     "Each rubric should:",
     "- Be specific and testable",
     "- Have a short, descriptive ID",
-    "- Include a clear description of what to check",
+    "- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
     "- Indicate if it is required (mandatory) or optional",
     "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
     "",
@@ -8560,6 +9347,7 @@ export {
   createAgentKernel,
   createProvider,
   deepEqual,
+  detectFormat,
   ensureVSCodeSubagents,
   executeScript,
   explorationRatio,