npm - @agentv/core - Versions diffs - 2.2.0 → 2.5.1 - Mend

@agentv/core 2.2.0 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/README.md +77 -77
package/dist/{chunk-KDEP4I7G.js → chunk-RP3M7COZ.js} +1 -1
package/dist/{chunk-KDEP4I7G.js.map → chunk-RP3M7COZ.js.map} +1 -1
package/dist/evaluation/validation/index.cjs +38 -4
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +39 -5
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +654 -119
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +75 -6
package/dist/index.d.ts +75 -6
package/dist/index.js +655 -120
package/dist/index.js.map +1 -1
package/package.json +1 -1

package/dist/index.js CHANGED

@@ -10,7 +10,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-KDEP4I7G.js";
+} from "./chunk-RP3M7COZ.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -148,6 +148,7 @@ function mergeExecutionMetrics(summary, metrics) {
 // src/evaluation/yaml-parser.ts
 import { readFile as readFile6 } from "node:fs/promises";
 import path7 from "node:path";
+import micromatch3 from "micromatch";
 import { parse as parse2 } from "yaml";
 // src/evaluation/loaders/config-loader.ts
@@ -462,11 +463,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           );
         }
       }
-      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
-      const config = {};
+      const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
+      const config2 = {};
       for (const [key, value] of Object.entries(rawEvaluator)) {
-        if (!knownProps.has(key) && value !== void 0) {
-          config[key] = value;
+        if (!knownProps2.has(key) && value !== void 0) {
+          config2[key] = value;
         }
       }
       evaluators.push({
@@ -476,7 +477,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         cwd,
         resolvedCwd,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
-        ...Object.keys(config).length > 0 ? { config } : {},
+        ...Object.keys(config2).length > 0 ? { config: config2 } : {},
         ...targetConfig !== void 0 ? { target: targetConfig } : {}
       });
       continue;
@@ -641,7 +642,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const config = {
+      const config2 = {
         name,
         type: "tool_trajectory",
         mode,
@@ -649,7 +650,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         ...expected ? { expected } : {},
         ...weight2 !== void 0 ? { weight: weight2 } : {}
       };
-      evaluators.push(config);
+      evaluators.push(config2);
       continue;
     }
     if (typeValue === "field_accuracy") {
@@ -786,9 +787,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    const prompt = asString(rawEvaluator.prompt);
+    const rawPrompt = rawEvaluator.prompt;
+    let prompt;
     let promptPath;
-    if (prompt) {
+    let resolvedPromptScript;
+    let promptScriptConfig;
+    if (isJsonObject2(rawPrompt)) {
+      const scriptArray = asStringArray(
+        rawPrompt.script,
+        `prompt.script for evaluator '${name}' in '${evalId}'`
+      );
+      if (!scriptArray) {
+        throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
+      }
+      const scriptPath = scriptArray[scriptArray.length - 1];
+      const resolved = await resolveFileReference2(scriptPath, searchRoots);
+      if (resolved.resolvedPath) {
+        resolvedPromptScript = [...scriptArray.slice(0, -1), path3.resolve(resolved.resolvedPath)];
+      } else {
+        throw new Error(
+          `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
+        );
+      }
+      if (isJsonObject2(rawPrompt.config)) {
+        promptScriptConfig = rawPrompt.config;
+      }
+    } else if (typeof rawPrompt === "string") {
+      prompt = rawPrompt;
       const resolved = await resolveFileReference2(prompt, searchRoots);
       if (resolved.resolvedPath) {
         promptPath = path3.resolve(resolved.resolvedPath);
@@ -807,12 +832,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
     }
     const _model = asString(rawEvaluator.model);
     const rawRubrics = rawEvaluator.rubrics;
-    const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-      id: asString(rubric.id) ?? `rubric-${index + 1}`,
-      description: asString(rubric.description) ?? "",
-      weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-      required: typeof rubric.required === "boolean" ? rubric.required : true
-    })).filter((r) => r.description.length > 0) : void 0;
+    const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name, evalId) : void 0;
     if (typeValue === "rubric") {
       if (!parsedRubrics) {
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
@@ -832,13 +852,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       continue;
     }
     const weight = validateWeight(rawEvaluator.weight, name, evalId);
+    const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
+    const config = {};
+    for (const [key, value] of Object.entries(rawEvaluator)) {
+      if (!knownProps.has(key) && value !== void 0) {
+        config[key] = value;
+      }
+    }
+    const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
+    const mergedConfig = { ...config, ...topLevelConfig };
+    const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
     evaluators.push({
       name,
       type: "llm_judge",
       prompt,
       promptPath,
+      ...promptPath ? { resolvedPromptPath: promptPath } : {},
+      ...resolvedPromptScript ? { resolvedPromptScript } : {},
       ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
-      ...weight !== void 0 ? { weight } : {}
+      ...weight !== void 0 ? { weight } : {},
+      ...finalConfig ? { config: finalConfig } : {}
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
@@ -925,10 +958,190 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
 function isValidFieldAggregationType(value) {
   return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
 }
+function parseRubricItems(rawRubrics, evaluatorName, evalId) {
+  const items = [];
+  for (const [index, rawRubric] of rawRubrics.entries()) {
+    if (!isJsonObject2(rawRubric)) {
+      logWarning2(
+        `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`
+      );
+      continue;
+    }
+    const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
+    const expectedOutcome = asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? "";
+    const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
+    let requiredMinScore;
+    let required;
+    if (typeof rawRubric.required_min_score === "number") {
+      const minScore = rawRubric.required_min_score;
+      if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
+        throw new Error(
+          `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
+        );
+      }
+      requiredMinScore = minScore;
+    }
+    if (typeof rawRubric.required === "boolean") {
+      required = rawRubric.required;
+    }
+    let scoreRanges;
+    const rawScoreRanges = rawRubric.score_ranges;
+    if (rawScoreRanges !== void 0) {
+      if (!Array.isArray(rawScoreRanges)) {
+        throw new Error(
+          `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
+        );
+      }
+      scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
+      items.push({
+        id,
+        weight,
+        ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
+        ...required !== void 0 ? { required } : {},
+        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
+        score_ranges: scoreRanges
+      });
+    } else {
+      if (expectedOutcome.length === 0) {
+        logWarning2(
+          `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
+        );
+        continue;
+      }
+      items.push({
+        id,
+        expected_outcome: expectedOutcome,
+        weight,
+        // Default to required: true if not specified (backward compatibility)
+        required: required ?? true,
+        ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
+      });
+    }
+  }
+  return items.length > 0 ? items : void 0;
+}
+function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
+  const ranges = [];
+  for (const [index, rawRange] of rawRanges.entries()) {
+    if (!isJsonObject2(rawRange)) {
+      throw new Error(
+        `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
+      );
+    }
+    const scoreRangeValue = rawRange.score_range;
+    if (!Array.isArray(scoreRangeValue) || scoreRangeValue.length !== 2 || typeof scoreRangeValue[0] !== "number" || typeof scoreRangeValue[1] !== "number") {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
+      );
+    }
+    const [min, max] = scoreRangeValue;
+    if (!Number.isInteger(min) || !Number.isInteger(max)) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
+      );
+    }
+    if (min < 0 || min > 10 || max < 0 || max > 10) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
+      );
+    }
+    if (min > max) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
+      );
+    }
+    const expectedOutcome = asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? "";
+    if (expectedOutcome.length === 0) {
+      throw new Error(
+        `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
+      );
+    }
+    ranges.push({
+      score_range: [min, max],
+      expected_outcome: expectedOutcome
+    });
+  }
+  const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
+  for (let i = 1; i < sortedRanges.length; i++) {
+    const prev = sortedRanges[i - 1];
+    const curr = sortedRanges[i];
+    if (curr.score_range[0] <= prev.score_range[1]) {
+      throw new Error(
+        `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`
+      );
+    }
+  }
+  const covered = /* @__PURE__ */ new Set();
+  for (const range of ranges) {
+    for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
+      covered.add(i);
+    }
+  }
+  const missing = [];
+  for (let i = 0; i <= 10; i++) {
+    if (!covered.has(i)) {
+      missing.push(i);
+    }
+  }
+  if (missing.length > 0) {
+    throw new Error(
+      `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
+    );
+  }
+  return ranges;
+}
+function parseInlineRubrics(rawRubrics) {
+  const rubricItems = rawRubrics.filter((r) => isJsonObject2(r) || typeof r === "string").map((rubric, index) => {
+    if (typeof rubric === "string") {
+      return {
+        id: `rubric-${index + 1}`,
+        expected_outcome: rubric,
+        weight: 1,
+        required: true
+      };
+    }
+    const expectedOutcome = asString(rubric.expected_outcome) ?? asString(rubric.description) ?? "";
+    const rawScoreRanges = rubric.score_ranges;
+    const scoreRanges = Array.isArray(rawScoreRanges) && rawScoreRanges.length > 0 ? rawScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
+      score_range: Array.isArray(range.score_range) ? range.score_range : [0, 10],
+      expected_outcome: asString(range.expected_outcome) ?? asString(range.description) ?? ""
+    })).filter((r) => r.expected_outcome.length > 0) : void 0;
+    const baseRubric = {
+      id: asString(rubric.id) ?? `rubric-${index + 1}`,
+      weight: typeof rubric.weight === "number" ? rubric.weight : 1
+    };
+    if (scoreRanges && scoreRanges.length > 0) {
+      return {
+        ...baseRubric,
+        ...expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {},
+        ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
+        ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
+        score_ranges: scoreRanges
+      };
+    }
+    return {
+      ...baseRubric,
+      expected_outcome: expectedOutcome,
+      required: typeof rubric.required === "boolean" ? rubric.required : true,
+      ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
+    };
+  }).filter(
+    (r) => r.expected_outcome && r.expected_outcome.length > 0 || "score_ranges" in r && r.score_ranges
+  );
+  if (rubricItems.length === 0) {
+    return void 0;
+  }
+  return {
+    name: "rubric",
+    type: "llm_judge",
+    rubrics: rubricItems
+  };
+}
 // src/evaluation/loaders/jsonl-parser.ts
 import { readFile as readFile4 } from "node:fs/promises";
 import path5 from "node:path";
+import micromatch2 from "micromatch";
 import { parse as parseYaml } from "yaml";
 // src/evaluation/loaders/message-processor.ts
@@ -1191,6 +1404,65 @@ async function processExpectedMessages(options) {
   return segments;
 }
+// src/evaluation/loaders/shorthand-expansion.ts
+function expandInputShorthand(value) {
+  if (value === void 0 || value === null) {
+    return void 0;
+  }
+  if (typeof value === "string") {
+    return [{ role: "user", content: value }];
+  }
+  if (Array.isArray(value)) {
+    const messages = value.filter((msg) => isTestMessage(msg));
+    return messages.length > 0 ? messages : void 0;
+  }
+  return void 0;
+}
+function expandExpectedOutputShorthand(value) {
+  if (value === void 0 || value === null) {
+    return void 0;
+  }
+  if (typeof value === "string") {
+    return [{ role: "assistant", content: value }];
+  }
+  if (Array.isArray(value)) {
+    if (value.length > 0 && isJsonObject(value[0]) && "role" in value[0]) {
+      const messages = value.filter((msg) => isTestMessage(msg));
+      return messages.length > 0 ? messages : void 0;
+    }
+    return [{ role: "assistant", content: value }];
+  }
+  if (isJsonObject(value)) {
+    if ("role" in value) {
+      return isTestMessage(value) ? [value] : void 0;
+    }
+    return [{ role: "assistant", content: value }];
+  }
+  return void 0;
+}
+function resolveInputMessages(raw) {
+  if (raw.input_messages !== void 0) {
+    if (Array.isArray(raw.input_messages)) {
+      const messages = raw.input_messages.filter((msg) => isTestMessage(msg));
+      return messages.length > 0 ? messages : void 0;
+    }
+    return void 0;
+  }
+  return expandInputShorthand(raw.input);
+}
+function resolveExpectedMessages(raw) {
+  if (raw.expected_messages !== void 0) {
+    if (Array.isArray(raw.expected_messages)) {
+      const messages = raw.expected_messages.filter(
+        (msg) => isTestMessage(msg)
+      );
+      return messages.length > 0 ? messages : void 0;
+    }
+    return void 0;
+  }
+  return expandExpectedOutputShorthand(raw.expected_output);
+}
 // src/evaluation/loaders/jsonl-parser.ts
 var ANSI_YELLOW5 = "\x1B[33m";
 var ANSI_RED = "\x1B[31m";
@@ -1251,7 +1523,7 @@ function parseJsonlContent(content, filePath) {
 }
 async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
   const verbose = options?.verbose ?? false;
-  const evalIdFilter = options?.evalId;
+  const filterPattern = options?.filter;
   const absoluteTestPath = path5.resolve(evalFilePath);
   const repoRootPath = resolveToAbsolutePath(repoRoot);
   const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
@@ -1278,28 +1550,20 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
     const evalcase = rawCases[lineIndex];
     const lineNumber = lineIndex + 1;
     const id = asString4(evalcase.id);
-    if (evalIdFilter && id !== evalIdFilter) {
+    if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
       continue;
     }
     const conversationId = asString4(evalcase.conversation_id);
     const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
-    const inputMessagesValue = evalcase.input_messages;
-    const expectedMessagesValue = evalcase.expected_messages;
-    if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
+    const inputMessages = resolveInputMessages(evalcase);
+    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+    if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
       logError(
-        `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
+        `Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
       );
       continue;
     }
-    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
-    const inputMessages = inputMessagesValue.filter(
-      (msg) => isTestMessage(msg)
-    );
-    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
-    if (hasExpectedMessages && expectedMessages.length === 0) {
-      logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
-      continue;
-    }
+    const hasExpectedMessages = expectedMessages.length > 0;
     const guidelinePaths = [];
     const inputTextParts = [];
     const inputSegments = await processMessages({
@@ -1345,28 +1609,8 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
     }
     const inlineRubrics = evalcase.rubrics;
     if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
-      const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
-        if (typeof rubric === "string") {
-          return {
-            id: `rubric-${index + 1}`,
-            description: rubric,
-            weight: 1,
-            required: true
-          };
-        }
-        return {
-          id: asString4(rubric.id) ?? `rubric-${index + 1}`,
-          description: asString4(rubric.description) ?? "",
-          weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-          required: typeof rubric.required === "boolean" ? rubric.required : true
-        };
-      }).filter((r) => r.description.length > 0);
-      if (rubricItems.length > 0) {
-        const rubricEvaluator = {
-          name: "rubric",
-          type: "llm_judge",
-          rubrics: rubricItems
-        };
+      const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+      if (rubricEvaluator) {
         evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
       }
     }
@@ -1676,7 +1920,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
     return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
   }
   const verbose = options?.verbose ?? false;
-  const evalIdFilter = options?.evalId;
+  const filterPattern = options?.filter;
   const absoluteTestPath = path7.resolve(evalFilePath);
   const repoRootPath = resolveToAbsolutePath(repoRoot);
   const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
@@ -1706,28 +1950,20 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
     }
     const evalcase = rawEvalcase;
     const id = asString6(evalcase.id);
-    if (evalIdFilter && id !== evalIdFilter) {
+    if (filterPattern && (!id || !micromatch3.isMatch(id, filterPattern))) {
       continue;
     }
     const conversationId = asString6(evalcase.conversation_id);
     const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
-    const inputMessagesValue = evalcase.input_messages;
-    const expectedMessagesValue = evalcase.expected_messages;
-    if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
+    const inputMessages = resolveInputMessages(evalcase);
+    const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
+    if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
       logError2(
-        `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
+        `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
       );
       continue;
     }
-    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
-    const inputMessages = inputMessagesValue.filter(
-      (msg) => isTestMessage(msg)
-    );
-    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
-    if (hasExpectedMessages && expectedMessages.length === 0) {
-      logError2(`No valid expected message found for eval case: ${id}`);
-      continue;
-    }
+    const hasExpectedMessages = expectedMessages.length > 0;
     const guidelinePaths = [];
     const inputTextParts = [];
     const inputSegments = await processMessages({
@@ -1771,28 +2007,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
     }
     const inlineRubrics = evalcase.rubrics;
     if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
-      const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
-        if (typeof rubric === "string") {
-          return {
-            id: `rubric-${index + 1}`,
-            description: rubric,
-            weight: 1,
-            required: true
-          };
-        }
-        return {
-          id: asString6(rubric.id) ?? `rubric-${index + 1}`,
-          description: asString6(rubric.description) ?? "",
-          weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-          required: typeof rubric.required === "boolean" ? rubric.required : true
-        };
-      }).filter((r) => r.description.length > 0);
-      if (rubricItems.length > 0) {
-        const rubricEvaluator = {
-          name: "rubric",
-          type: "llm_judge",
-          rubrics: rubricItems
-        };
+      const rubricEvaluator = parseInlineRubrics(inlineRubrics);
+      if (rubricEvaluator) {
         evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
       }
     }
@@ -3049,7 +3265,8 @@ var ToolCallSchema = z.object({
   input: z.unknown().optional(),
   output: z.unknown().optional(),
   id: z.string().optional(),
-  timestamp: z.string().optional()
+  timestamp: z.string().optional(),
+  duration_ms: z.number().optional()
 });
 var OutputMessageInputSchema = z.object({
   role: z.string(),
@@ -3057,6 +3274,7 @@ var OutputMessageInputSchema = z.object({
   content: z.unknown().optional(),
   tool_calls: z.array(ToolCallSchema).optional(),
   timestamp: z.string().optional(),
+  duration_ms: z.number().optional(),
   metadata: z.record(z.unknown()).optional()
 });
 var TokenUsageSchema = z.object({
@@ -3095,8 +3313,16 @@ function convertOutputMessages(messages) {
     role: msg.role,
     name: msg.name,
     content: msg.content,
-    toolCalls: msg.tool_calls,
+    toolCalls: msg.tool_calls?.map((tc) => ({
+      tool: tc.tool,
+      input: tc.input,
+      output: tc.output,
+      id: tc.id,
+      timestamp: tc.timestamp,
+      durationMs: tc.duration_ms
+    })),
     timestamp: msg.timestamp,
+    durationMs: msg.duration_ms,
     metadata: msg.metadata
   }));
 }
@@ -6173,6 +6399,15 @@ var rubricEvaluationSchema = z2.object({
   checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
   overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
 });
+var scoreRangeCheckResultSchema = z2.object({
+  id: z2.string().describe("The ID of the rubric criterion being scored"),
+  score: z2.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
+  reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this score").optional()
+});
+var scoreRangeEvaluationSchema = z2.object({
+  checks: z2.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
+  overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)").optional()
+});
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
@@ -6258,6 +6493,10 @@ var LlmJudgeEvaluator = class {
         `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
       );
     }
+    const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
+    if (hasScoreRanges) {
+      return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
+    }
     const prompt = this.buildRubricPrompt(context, rubrics);
     const systemPrompt = buildRubricOutputSchema();
     const evaluatorRawRequest = {
@@ -6283,6 +6522,84 @@ var LlmJudgeEvaluator = class {
       evaluatorRawRequest
     };
   }
+  /**
+   * Evaluate using score-range rubrics (analytic rubric scoring).
+   * Each criterion is scored 0-10 and normalized to 0-1.
+   */
+  async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
+    const prompt = this.buildScoreRangePrompt(context, rubrics);
+    const systemPrompt = buildScoreRangeOutputSchema();
+    const evaluatorRawRequest = {
+      userPrompt: prompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    const { data } = await this.runWithRetry({
+      context,
+      judgeProvider,
+      systemPrompt,
+      userPrompt: prompt,
+      schema: scoreRangeEvaluationSchema
+    });
+    const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
+    return {
+      score,
+      verdict,
+      hits,
+      misses,
+      expectedAspectCount: rubrics.length,
+      reasoning: data.overall_reasoning,
+      evaluatorRawRequest,
+      details
+    };
+  }
+  /**
+   * Build prompt for score-range rubric evaluation.
+   */
+  buildScoreRangePrompt(context, rubrics) {
+    const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+    const parts = [
+      "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
+      "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
+      "",
+      "[[ ## question ## ]]",
+      formattedQuestion,
+      "",
+      "[[ ## expected_outcome ## ]]",
+      context.evalCase.expected_outcome,
+      ""
+    ];
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+    }
+    parts.push(
+      "[[ ## candidate_answer ## ]]",
+      context.candidate,
+      "",
+      "[[ ## scoring_criteria ## ]]"
+    );
+    for (const rubric of rubrics) {
+      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+      const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
+      parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
+      if (rubric.expected_outcome) {
+        parts.push(`Description: ${rubric.expected_outcome}`);
+      }
+      if (rubric.score_ranges && rubric.score_ranges.length > 0) {
+        parts.push("Score ranges:");
+        for (const range of rubric.score_ranges) {
+          const [min, max] = range.score_range;
+          const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
+          parts.push(`  - Score ${rangeLabel}: ${range.expected_outcome}`);
+        }
+      }
+    }
+    parts.push(
+      "",
+      "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
+    );
+    return parts.join("\n");
+  }
   buildRubricPrompt(context, rubrics) {
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
     const parts = [
@@ -6302,7 +6619,7 @@ var LlmJudgeEvaluator = class {
     for (const rubric of rubrics) {
       const requiredLabel = rubric.required ? " (REQUIRED)" : "";
       const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
-      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
     }
     parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
     return parts.join("\n");
@@ -6389,9 +6706,9 @@ function calculateRubricScore(result, rubrics) {
     totalWeight += rubric.weight;
     if (check.satisfied) {
       earnedWeight += rubric.weight;
-      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
     } else {
-      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
       if (rubric.required) {
         failedRequired = true;
       }
@@ -6401,6 +6718,76 @@ function calculateRubricScore(result, rubrics) {
   const verdict = failedRequired ? "fail" : scoreToVerdict(score);
   return { score, verdict, hits, misses };
 }
+/**
+ * Builds the fixed instruction text that asks the judge for one 0-10 integer
+ * score per criterion, returned as a JSON object with a `checks` array and an
+ * optional `overall_reasoning` summary.
+ *
+ * @returns {string} The instruction text, one entry of the list below per line.
+ */
+function buildScoreRangeOutputSchema() {
+  // Kept as an explicit list so the indentation of the embedded JSON sample is
+  // visible at a glance; the entries are joined with "\n" and nothing else.
+  const schemaPromptLines = [
+    "You are an expert evaluator. Score the candidate answer on each criterion.",
+    "You must return a valid JSON object matching this schema:",
+    "{",
+    '  "checks": [',
+    "    {",
+    '      "id": "string (criterion id)",',
+    '      "score": integer (0-10),',
+    '      "reasoning": "string (brief explanation for score)"',
+    "    }",
+    "  ],",
+    '  "overall_reasoning": "string (summary, optional)"',
+    "}",
+    'Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.'
+  ];
+  return schemaPromptLines.join("\n");
+}
+/**
+ * Aggregates per-criterion 0-10 judge scores into a single weighted result.
+ *
+ * Each check is matched to a rubric by id; checks with an unknown id are
+ * ignored. A criterion's score is clamped to 0-10, normalized by dividing by
+ * 10, and averaged using the rubric weights.
+ *
+ * @param {{checks: Array<{id: string, score: number, reasoning?: string}>}} result
+ *   Judge output holding one check per scored criterion.
+ * @param {Array<object>} rubrics Rubric items; this function reads `id`,
+ *   `weight`, `expected_outcome`, `score_ranges`, `required` and
+ *   `required_min_score`.
+ * @returns {{score: number, verdict: string, hits: string[], misses: string[], details: object}}
+ *   `score` is in [0, 1] (0 when no check matched a rubric). `verdict` is
+ *   "fail" when any required minimum was missed, otherwise whatever
+ *   `scoreToVerdict(score)` returns. `details.raw_scores` maps rubric id to
+ *   the clamped 0-10 score.
+ */
+function calculateScoreRangeResult(result, rubrics) {
+  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+  const hits = [];
+  const misses = [];
+  const rawScores = {};
+  let totalWeight = 0;
+  let weightedScoreSum = 0;
+  let failedRequired = false;
+  for (const check of result.checks) {
+    const rubric = rubricMap.get(check.id);
+    if (!rubric) {
+      continue;
+    }
+    // A missing or non-numeric score would otherwise become NaN and poison the
+    // weighted average and raw_scores; count it as the lowest score instead.
+    const reportedScore = Number(check.score);
+    const rawScore = Number.isNaN(reportedScore) ? 0 : Math.max(0, Math.min(10, reportedScore));
+    const normalizedScore = rawScore / 10;
+    rawScores[rubric.id] = rawScore;
+    totalWeight += rubric.weight;
+    weightedScoreSum += normalizedScore * rubric.weight;
+    // An explicit required_min_score wins; a bare `required: true` demands a
+    // perfect 10.
+    let requiredMinScore;
+    if (rubric.required_min_score !== void 0) {
+      requiredMinScore = rubric.required_min_score;
+    } else if (rubric.required === true) {
+      requiredMinScore = 10;
+    }
+    const matchingRange = rubric.score_ranges?.find(
+      (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
+    );
+    const rangeDescription = matchingRange?.expected_outcome ?? "";
+    // Omit the parenthetical when no range matched, so the message does not
+    // end in a dangling " ()".
+    const rangeSuffix = rangeDescription ? ` (${rangeDescription})` : "";
+    const criterionLabel = rubric.expected_outcome ?? rubric.id;
+    const reasoningText = check.reasoning ? `: ${check.reasoning}` : "";
+    const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10${rangeSuffix}${reasoningText}`;
+    if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
+      failedRequired = true;
+      misses.push(scoreInfo);
+    } else if (rawScore >= 7) {
+      // Scores of 7 and above are reported as hits, everything else as misses.
+      hits.push(scoreInfo);
+    } else {
+      misses.push(scoreInfo);
+    }
+  }
+  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
+  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+  return {
+    score,
+    verdict,
+    hits,
+    misses,
+    details: {
+      raw_scores: rawScores,
+      normalization: "score / 10",
+      aggregation: "weighted_average"
+    }
+  };
+}
 // src/evaluation/evaluators/composite.ts
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
@@ -7228,6 +7615,27 @@ function argsMatch(expected, actual) {
   }
   return true;
 }
+/**
+ * Compares a tool call's measured duration against an optional upper bound.
+ *
+ * @param {string} toolName Tool name used in the result message.
+ * @param {number|undefined} maxDurationMs Allowed maximum; `undefined` means
+ *   no latency assertion was configured for this tool.
+ * @param {number|null|undefined} actualDurationMs Measured duration, if any.
+ * @returns {{status: "skip"|"pass"|"fail", message: string}}
+ *   "skip" with an empty message when no assertion is configured; "skip" with
+ *   an explanatory message when there is no usable duration; otherwise "pass"
+ *   when the duration is at most the maximum and "fail" when it exceeds it.
+ */
+function checkLatency(toolName, maxDurationMs, actualDurationMs) {
+  if (maxDurationMs === void 0) {
+    return { status: "skip", message: "" };
+  }
+  // Treat null and NaN like a missing measurement: null would coerce to 0 and
+  // report a bogus pass, and NaN would report a "took NaNms" failure.
+  if (actualDurationMs == null || Number.isNaN(actualDurationMs)) {
+    return {
+      status: "skip",
+      message: `No duration data for ${toolName}; latency assertion skipped`
+    };
+  }
+  if (actualDurationMs <= maxDurationMs) {
+    return {
+      status: "pass",
+      message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
+    };
+  }
+  return {
+    status: "fail",
+    message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
+  };
+}
 var ToolTrajectoryEvaluator = class {
   kind = "tool_trajectory";
   config;
@@ -7286,7 +7694,8 @@ var ToolTrajectoryEvaluator = class {
         for (const call of message.toolCalls) {
           toolCalls.push({
             name: call.tool,
-            args: call.input
+            args: call.input,
+            durationMs: call.durationMs
           });
         }
       }
@@ -7354,17 +7763,27 @@ var ToolTrajectoryEvaluator = class {
     }
     const hits = [];
     const misses = [];
+    const warnings = [];
     let actualIndex = 0;
+    let sequenceHits = 0;
+    let latencyHits = 0;
+    let latencySkips = 0;
+    const latencyAssertionCount = expected.filter(
+      (item) => item.maxDurationMs !== void 0
+    ).length;
     for (let i = 0; i < expected.length; i++) {
       const expectedItem = expected[i];
       const expectedTool = expectedItem.tool;
       let found = false;
       let argsMismatch = false;
+      let matchedCall;
       while (actualIndex < toolCalls.length) {
         const actualCall = toolCalls[actualIndex];
         if (actualCall.name === expectedTool) {
           if (argsMatch(expectedItem.args, actualCall.args)) {
             hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            sequenceHits++;
+            matchedCall = actualCall;
             actualIndex++;
             found = true;
             break;
@@ -7381,14 +7800,35 @@ var ToolTrajectoryEvaluator = class {
       if (!found && !argsMismatch) {
         misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
       }
+      if (found && matchedCall) {
+        const latencyResult = checkLatency(
+          expectedTool,
+          expectedItem.maxDurationMs,
+          matchedCall.durationMs
+        );
+        if (latencyResult.status === "pass") {
+          hits.push(latencyResult.message);
+          latencyHits++;
+        } else if (latencyResult.status === "fail") {
+          misses.push(latencyResult.message);
+        } else if (latencyResult.message) {
+          warnings.push(latencyResult.message);
+          latencySkips++;
+        }
+      }
     }
-    const score = hits.length / expected.length;
+    for (const warning of warnings) {
+      console.warn(`[tool_trajectory] ${warning}`);
+    }
+    const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
+    const totalAssertions = expected.length + effectiveLatencyAssertions;
+    const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
     return {
       score,
       verdict: scoreToVerdict(score),
       hits,
       misses,
-      expectedAspectCount: expected.length
+      expectedAspectCount: totalAssertions
     };
   }
   evaluateExact(toolCalls) {
@@ -7404,6 +7844,13 @@ var ToolTrajectoryEvaluator = class {
     }
     const hits = [];
     const misses = [];
+    const warnings = [];
+    let sequenceHits = 0;
+    let latencyHits = 0;
+    let latencySkips = 0;
+    const latencyAssertionCount = expected.filter(
+      (item) => item.maxDurationMs !== void 0
+    ).length;
     if (toolCalls.length !== expected.length) {
       misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
     }
@@ -7413,26 +7860,50 @@ var ToolTrajectoryEvaluator = class {
       const expectedTool = expectedItem.tool;
       const actualCall = toolCalls[i];
       const actualTool = actualCall.name;
+      let sequenceMatched = false;
       if (actualTool === expectedTool) {
         if (argsMatch(expectedItem.args, actualCall.args)) {
           hits.push(`Position ${i}: ${expectedTool}`);
+          sequenceHits++;
+          sequenceMatched = true;
         } else {
           misses.push(`Position ${i}: ${expectedTool} args mismatch`);
         }
       } else {
         misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
       }
+      if (sequenceMatched) {
+        const latencyResult = checkLatency(
+          expectedTool,
+          expectedItem.maxDurationMs,
+          actualCall.durationMs
+        );
+        if (latencyResult.status === "pass") {
+          hits.push(latencyResult.message);
+          latencyHits++;
+        } else if (latencyResult.status === "fail") {
+          misses.push(latencyResult.message);
+        } else if (latencyResult.message) {
+          warnings.push(latencyResult.message);
+          latencySkips++;
+        }
+      }
     }
     for (let i = checkLength; i < expected.length; i++) {
       misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
     }
-    const score = hits.length / expected.length;
+    for (const warning of warnings) {
+      console.warn(`[tool_trajectory] ${warning}`);
+    }
+    const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
+    const totalAssertions = expected.length + effectiveLatencyAssertions;
+    const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
     return {
       score,
       verdict: scoreToVerdict(score),
       hits,
       misses,
-      expectedAspectCount: expected.length
+      expectedAspectCount: totalAssertions
     };
   }
 };
@@ -7440,6 +7911,7 @@ var ToolTrajectoryEvaluator = class {
 // src/evaluation/orchestrator.ts
 import { createHash } from "node:crypto";
 import path15 from "node:path";
+import micromatch4 from "micromatch";
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
 var Node = class {
@@ -7598,17 +8070,17 @@ async function runEvaluation(options) {
     cache,
     useCache,
     now,
-    evalId,
+    filter,
     verbose,
     evalCases: preloadedEvalCases,
     onResult,
     onProgress
   } = options;
-  const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
-  const filteredEvalCases = filterEvalCases(evalCases, evalId);
+  const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter });
+  const filteredEvalCases = filterEvalCases(evalCases, filter);
   if (filteredEvalCases.length === 0) {
-    if (evalId) {
-      throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
+    if (filter) {
+      throw new Error(`No eval cases matched filter '${filter}' in ${evalFilePath}`);
     }
     return [];
   }
@@ -8184,7 +8656,10 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
-          judgeProvider
+          judgeProvider,
+          outputMessages,
+          traceSummary,
+          agentTimeoutMs
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -8519,9 +8994,22 @@ async function runLlmJudgeEvaluator(options) {
     attempt,
     promptInputs,
     now,
-    judgeProvider
+    judgeProvider,
+    outputMessages,
+    traceSummary,
+    agentTimeoutMs
   } = options;
-  const customPrompt = await resolveCustomPrompt(config);
+  const customPrompt = await resolveCustomPrompt(
+    config,
+    {
+      evalCase,
+      candidate,
+      outputMessages,
+      traceSummary,
+      config: config.config
+    },
+    agentTimeoutMs
+  );
   return evaluatorRegistry.llm_judge.evaluate({
     evalCase,
     candidate,
@@ -8535,23 +9023,70 @@ async function runLlmJudgeEvaluator(options) {
     evaluator: config
   });
 }
-async function resolveCustomPrompt(config) {
-  if (config.promptPath) {
+/**
+ * Resolves the custom judge prompt for an evaluator configuration.
+ *
+ * Sources are tried in this order:
+ *   1. `resolvedPromptScript` (non-empty): the result of
+ *      `executePromptTemplate` is returned; `context` is mandatory here.
+ *   2. `resolvedPromptPath`, falling back to `promptPath`: the file content.
+ *   3. `prompt`, but only when it is a string.
+ *
+ * @param promptConfig Evaluator config carrying the prompt settings above.
+ * @param context Data forwarded to the prompt script (required for case 1).
+ * @param timeoutMs Forwarded to `executePromptTemplate`.
+ * @returns The prompt text, or `undefined` when no source yields one.
+ * @throws {Error} When a prompt script is configured but `context` is missing.
+ */
+async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
+  if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
+    if (!context) {
+      throw new Error("Context required for executable prompt templates");
+    }
+    return executePromptTemplate(
+      promptConfig.resolvedPromptScript,
+      context,
+      promptConfig.config,
+      timeoutMs
+    );
+  }
+  const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
+  if (promptPath) {
     try {
-      const content = await readTextFile(config.promptPath);
+      const content = await readTextFile(promptPath);
       return content;
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
-      console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
+      // A failed read is only warned about; control falls through to `prompt`.
+      console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
     }
   }
-  return config.prompt;
+  const promptValue = promptConfig.prompt;
+  if (typeof promptValue === "string") {
+    return promptValue;
+  }
+  return void 0;
+}
+/**
+ * Runs an executable prompt template and returns the prompt it prints.
+ *
+ * The evaluation context is assembled into a payload, passed through
+ * `toSnakeCaseDeep`, serialized as pretty-printed JSON and handed to
+ * `executeScript` together with the timeout. The working directory is the
+ * directory of the last element of `script`.
+ *
+ * @param {string[]} script Command to run; the last element is used as the
+ *   script path when deriving the working directory.
+ * @param {object} context Supplies `evalCase`, `candidate`, and optionally
+ *   `outputMessages`, `traceSummary` and `config`.
+ * @param {object} [config] Template config; takes precedence over
+ *   `context.config`.
+ * @param {number} [timeoutMs] Forwarded to `executeScript`.
+ * @returns {Promise<string>} The script's trimmed stdout.
+ * @throws {Error} "Prompt template execution failed: ..." when the script
+ *   fails or prints only whitespace; the underlying error is attached as
+ *   `cause`.
+ */
+async function executePromptTemplate(script, context, config, timeoutMs) {
+  const payload = {
+    question: context.evalCase.question,
+    expectedOutcome: context.evalCase.expected_outcome,
+    expectedMessages: context.evalCase.expected_messages,
+    referenceAnswer: context.evalCase.reference_answer,
+    candidateAnswer: context.candidate,
+    outputMessages: context.outputMessages ?? null,
+    guidelineFiles: context.evalCase.guideline_paths,
+    // Input files are the case's file paths minus the guideline files.
+    inputFiles: context.evalCase.file_paths.filter(
+      (p) => !context.evalCase.guideline_paths.includes(p)
+    ),
+    inputMessages: context.evalCase.input_messages,
+    traceSummary: context.traceSummary ?? null,
+    config: config ?? context.config ?? null
+  };
+  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
+  const scriptPath = script[script.length - 1];
+  const cwd = path15.dirname(scriptPath);
+  try {
+    const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
+    const prompt = stdout.trim();
+    if (!prompt) {
+      throw new Error("Prompt template produced empty output");
+    }
+    return prompt;
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    // Keep the original error (and its stack) reachable for debugging.
+    throw new Error(`Prompt template execution failed: ${message}`, { cause: error });
+  }
 }
-function filterEvalCases(evalCases, evalId) {
-  if (!evalId) {
+/**
+ * Narrows a list of eval cases to those whose id matches `filter`.
+ *
+ * @param evalCases Eval cases to choose from.
+ * @param filter Pattern handed to `micromatch4.isMatch` together with each
+ *   case id; when falsy, no filtering is applied.
+ * @returns The input array itself when `filter` is falsy, otherwise a new
+ *   array holding only the matching cases.
+ */
+function filterEvalCases(evalCases, filter) {
+  if (!filter) {
     return evalCases;
   }
-  return evalCases.filter((evalCase) => evalCase.id === evalId);
+  return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
 }
 function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
   const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
@@ -8709,7 +9244,7 @@ import { generateText as generateText4 } from "ai";
 import { z as z3 } from "zod";
 var rubricItemSchema = z3.object({
   id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
-  description: z3.string().describe("What this rubric checks for"),
+  expected_outcome: z3.string().describe("Concrete expected outcome for this rubric item"),
   weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
   required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
 });
@@ -8729,7 +9264,7 @@ You must return a valid JSON object matching this schema:
   "rubrics": [
     {
       "id": "string (short identifier)",
-      "description": "string (what to check)",
+      "expected_outcome": "string (concrete expected outcome for this rubric item)",
       "weight": number (default 1.0),
       "required": boolean (default true)
     }
@@ -8765,7 +9300,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
     "Each rubric should:",
     "- Be specific and testable",
     "- Have a short, descriptive ID",
-    "- Include a clear description of what to check",
+    "- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
     "- Indicate if it is required (mandatory) or optional",
     "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
     "",