npm - @agentv/core - Version diff - 0.23.0 → 0.26.0 - Mend

@agentv/core 0.23.0 → 0.26.0

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-B2J23S7D.js → chunk-NDEN3H2B.js} +28 -17
package/dist/chunk-NDEN3H2B.js.map +1 -0
package/dist/evaluation/validation/index.cjs +64 -17
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +48 -2
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +674 -62
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +157 -4
package/dist/index.d.ts +157 -4
package/dist/index.js +629 -33
package/dist/index.js.map +1 -1
package/package.json +5 -2
package/dist/chunk-B2J23S7D.js.map +0 -1

package/dist/index.js CHANGED

@@ -5,10 +5,11 @@ import {
   findGitRoot,
   isAgentProvider,
   normalizeLineEndings,
+  readJsonFile,
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-B2J23S7D.js";
+} from "./chunk-NDEN3H2B.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -51,7 +52,14 @@ function isTestMessage(value) {
   }
   return candidate.content.every(isJsonObject);
 }
-var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
+var EVALUATOR_KIND_VALUES = [
+  "code_judge",
+  "llm_judge",
+  "rubric",
+  "composite",
+  "tool_trajectory",
+  "expected_messages"
+];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
   return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -60,6 +68,44 @@ function getHitCount(result) {
   return result.hits.length;
 }
+// src/evaluation/trace.ts
+function isTraceEventType(value) {
+  return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
+}
+function isTraceEvent(value) {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const candidate = value;
+  return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
+}
+function isExpectedToolCall(value) {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const candidate = value;
+  return typeof candidate.tool === "string";
+}
+function computeTraceSummary(trace) {
+  const toolCallCounts = {};
+  let errorCount = 0;
+  for (const event of trace) {
+    if (event.type === "tool_call" && event.name) {
+      toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
+    }
+    if (event.type === "error") {
+      errorCount++;
+    }
+  }
+  const toolNames = Object.keys(toolCallCounts).sort();
+  return {
+    eventCount: trace.length,
+    toolNames,
+    toolCallsByName: toolCallCounts,
+    errorCount
+  };
+}
 // src/evaluation/yaml-parser.ts
 import { readFile as readFile5 } from "node:fs/promises";
 import path6 from "node:path";
@@ -409,6 +455,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
         continue;
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       const cwd = asString2(rawEvaluator.cwd);
       let resolvedCwd;
       if (cwd) {
@@ -429,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         type: "code",
         script,
         cwd,
-        resolvedCwd
+        resolvedCwd,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
@@ -524,14 +572,89 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           ...promptPath2 ? { promptPath: promptPath2 } : {}
         };
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
         name,
         type: "composite",
         evaluators: memberEvaluators,
-        aggregator
+        aggregator,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      });
+      continue;
+    }
+    if (typeValue === "expected_messages") {
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      evaluators.push({
+        name,
+        type: "expected_messages",
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
+    if (typeValue === "tool_trajectory") {
+      const mode = asString2(rawEvaluator.mode);
+      if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
+        logWarning2(
+          `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
+        );
+        continue;
+      }
+      const rawMinimums = rawEvaluator.minimums;
+      let minimums;
+      if (rawMinimums !== void 0) {
+        if (!isJsonObject2(rawMinimums)) {
+          logWarning2(
+            `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
+          );
+          continue;
+        }
+        minimums = {};
+        for (const [toolName, count] of Object.entries(rawMinimums)) {
+          if (typeof count === "number" && count >= 0) {
+            minimums[toolName] = count;
+          }
+        }
+      }
+      const rawExpected = rawEvaluator.expected;
+      let expected;
+      if (rawExpected !== void 0) {
+        if (!Array.isArray(rawExpected)) {
+          logWarning2(
+            `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
+          );
+          continue;
+        }
+        expected = [];
+        for (const item of rawExpected) {
+          if (isJsonObject2(item) && typeof item.tool === "string") {
+            expected.push({ tool: item.tool });
+          }
+        }
+      }
+      if (mode === "any_order" && !minimums) {
+        logWarning2(
+          `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
+        );
+        continue;
+      }
+      if ((mode === "in_order" || mode === "exact") && !expected) {
+        logWarning2(
+          `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
+        );
+        continue;
+      }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      const config = {
+        name,
+        type: "tool_trajectory",
+        mode,
+        ...minimums ? { minimums } : {},
+        ...expected ? { expected } : {},
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      };
+      evaluators.push(config);
+      continue;
+    }
     const prompt = asString2(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
@@ -568,19 +691,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
         continue;
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
         name,
         type: "llm_judge",
-        rubrics: parsedRubrics
+        rubrics: parsedRubrics,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
+    const weight = validateWeight(rawEvaluator.weight, name, evalId);
     evaluators.push({
       name,
       type: "llm_judge",
       prompt,
       promptPath,
-      ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
+      ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
+      ...weight !== void 0 ? { weight } : {}
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
@@ -610,6 +737,27 @@ ${detailBlock}${ANSI_RESET3}`);
     console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
   }
 }
+function validateWeight(rawWeight, evaluatorName, evalId) {
+  if (rawWeight === void 0) {
+    return void 0;
+  }
+  if (typeof rawWeight !== "number") {
+    throw new Error(
+      `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
+    );
+  }
+  if (!Number.isFinite(rawWeight)) {
+    throw new Error(
+      `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
+    );
+  }
+  if (rawWeight < 0) {
+    throw new Error(
+      `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
+    );
+  }
+  return rawWeight;
+}
 // src/evaluation/loaders/message-processor.ts
 import { readFile as readFile3 } from "node:fs/promises";
@@ -785,6 +933,67 @@ ${detailBlock}${ANSI_RESET4}`);
     console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
   }
 }
+async function processExpectedMessages(options) {
+  const { messages, searchRoots, repoRootPath, verbose } = options;
+  const segments = [];
+  for (const message of messages) {
+    const segment = {
+      role: message.role
+    };
+    if (message.role === "assistant" && message.tool_calls !== void 0) {
+      segment.tool_calls = message.tool_calls;
+    }
+    const content = message.content;
+    if (typeof content === "string") {
+      segment.content = content;
+    } else if (Array.isArray(content)) {
+      const processedContent = [];
+      for (const rawSegment of content) {
+        if (!isJsonObject(rawSegment)) {
+          continue;
+        }
+        const segmentType = asString3(rawSegment.type);
+        if (segmentType === "file") {
+          const rawValue = asString3(rawSegment.value);
+          if (!rawValue) {
+            continue;
+          }
+          const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
+            rawValue,
+            searchRoots
+          );
+          if (!resolvedPath) {
+            const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
+            logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
+            continue;
+          }
+          try {
+            const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
+            processedContent.push({
+              type: "file",
+              path: displayPath,
+              text: fileContent,
+              resolvedPath: path4.resolve(resolvedPath)
+            });
+            if (verbose) {
+              console.log(`  [Expected Output File] Found: ${displayPath}`);
+              console.log(`    Resolved to: ${resolvedPath}`);
+            }
+          } catch (error) {
+            logWarning3(
+              `Could not read expected output file ${resolvedPath}: ${error.message}`
+            );
+          }
+          continue;
+        }
+        processedContent.push(cloneJsonObject(rawSegment));
+      }
+      segment.content = processedContent;
+    }
+    segments.push(segment);
+  }
+  return segments;
+}
 // src/evaluation/formatting/prompt-builder.ts
 import { readFile as readFile4 } from "node:fs/promises";
@@ -1089,12 +1298,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       messageType: "input",
       verbose
     });
-    const outputSegments = hasExpectedMessages ? await processMessages({
+    const outputSegments = hasExpectedMessages ? await processExpectedMessages({
       messages: expectedMessages,
       searchRoots,
       repoRootPath,
-      guidelinePatterns,
-      messageType: "output",
       verbose
     }) : [];
     const codeSnippets = extractCodeBlocks(inputSegments);
@@ -1618,9 +1825,11 @@ var CliProvider = class {
       const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
       throw new Error(message);
     }
-    const responseText = await this.readAndCleanupOutputFile(outputFilePath);
+    const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
+    const parsed = this.parseOutputContent(responseContent);
     return {
-      text: responseText,
+      text: parsed.text,
+      trace: parsed.trace,
       raw: {
         command: renderedCommand,
         stderr: result.stderr,
@@ -1630,6 +1839,31 @@ var CliProvider = class {
       }
     };
   }
+  /**
+   * Parse output content from CLI.
+   * If the content is valid JSON with a 'text' field, extract text and optional trace.
+   * Otherwise, treat the entire content as plain text.
+   */
+  parseOutputContent(content) {
+    try {
+      const parsed = JSON.parse(content);
+      if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
+        const obj = parsed;
+        const text = typeof obj.text === "string" ? obj.text : String(obj.text);
+        const trace = this.parseTrace(obj.trace);
+        return { text, trace };
+      }
+    } catch {
+    }
+    return { text: content };
+  }
+  parseTrace(trace) {
+    if (!Array.isArray(trace)) {
+      return void 0;
+    }
+    const validEvents = trace.filter(isTraceEvent);
+    return validEvents.length > 0 ? validEvents : void 0;
+  }
   async readAndCleanupOutputFile(filePath) {
     try {
       const content = await readTextFile(filePath);
@@ -2616,6 +2850,7 @@ var MockProvider = class {
   delayMs;
   delayMinMs;
   delayMaxMs;
+  trace;
   constructor(targetName, config) {
     this.id = `mock:${targetName}`;
     this.targetName = targetName;
@@ -2623,6 +2858,7 @@ var MockProvider = class {
     this.delayMs = config.delayMs ?? 0;
     this.delayMinMs = config.delayMinMs ?? 0;
     this.delayMaxMs = config.delayMaxMs ?? 0;
+    this.trace = config.trace;
   }
   async invoke(request) {
     const delay = this.calculateDelay();
@@ -2634,7 +2870,8 @@ var MockProvider = class {
       raw: {
         question: request.question,
         guidelines: request.guidelines
-      }
+      },
+      trace: this.trace
     };
   }
   calculateDelay() {
@@ -3306,9 +3543,11 @@ var CodeEvaluator = class {
         expected_outcome: context.evalCase.expected_outcome,
         reference_answer: context.evalCase.reference_answer,
         candidate_answer: context.candidate,
-        guideline_paths: context.evalCase.guideline_paths,
-        input_files: context.evalCase.file_paths,
-        input_segments: context.evalCase.input_segments
+        guideline_files: context.evalCase.guideline_paths,
+        input_files: context.evalCase.file_paths.filter(
+          (path13) => !context.evalCase.guideline_paths.includes(path13)
+        ),
+        input_messages: context.evalCase.input_messages
       },
       null,
       2
@@ -3428,6 +3667,251 @@ function substituteVariables(template, variables) {
     return variables[varName] ?? match;
   });
 }
+var ToolTrajectoryEvaluator = class {
+  kind = "tool_trajectory";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  evaluate(context) {
+    const { candidateTrace, candidateTraceSummary } = context;
+    if (!candidateTrace || !candidateTraceSummary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    switch (this.config.mode) {
+      case "any_order":
+        return this.evaluateAnyOrder(candidateTraceSummary);
+      case "in_order":
+        return this.evaluateInOrder(candidateTrace);
+      case "exact":
+        return this.evaluateExact(candidateTrace);
+      default:
+        return {
+          score: 0,
+          verdict: "fail",
+          hits: [],
+          misses: [`Unknown mode: ${this.config.mode}`],
+          expectedAspectCount: 1
+        };
+    }
+  }
+  evaluateAnyOrder(summary) {
+    const minimums = this.config.minimums ?? {};
+    const toolNames = Object.keys(minimums);
+    if (toolNames.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool requirements specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    for (const toolName of toolNames) {
+      const required = minimums[toolName];
+      const actual = summary.toolCallsByName[toolName] ?? 0;
+      if (actual >= required) {
+        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      } else {
+        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      }
+    }
+    const score = hits.length / toolNames.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: toolNames.length
+    };
+  }
+  evaluateInOrder(trace) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
+    const hits = [];
+    const misses = [];
+    let actualIndex = 0;
+    for (let i = 0; i < expected.length; i++) {
+      const expectedTool = expected[i].tool;
+      let found = false;
+      while (actualIndex < actualToolCalls.length) {
+        if (actualToolCalls[actualIndex].name === expectedTool) {
+          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+          actualIndex++;
+          found = true;
+          break;
+        }
+        actualIndex++;
+      }
+      if (!found) {
+        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
+      }
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+  evaluateExact(trace) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
+    const hits = [];
+    const misses = [];
+    if (actualToolCalls.length !== expected.length) {
+      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
+    }
+    const checkLength = Math.min(expected.length, actualToolCalls.length);
+    for (let i = 0; i < checkLength; i++) {
+      const expectedTool = expected[i].tool;
+      const actualTool = actualToolCalls[i].name;
+      if (actualTool === expectedTool) {
+        hits.push(`Position ${i}: ${expectedTool} \u2713`);
+      } else {
+        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+      }
+    }
+    for (let i = checkLength; i < expected.length; i++) {
+      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+};
+var ExpectedMessagesEvaluator = class {
+  kind = "expected_messages";
+  evaluate(context) {
+    const { candidateTrace, evalCase } = context;
+    const expectedSegments = evalCase.expected_segments;
+    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
+    if (expectedToolCalls.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool_calls specified in expected_messages"],
+        misses: [],
+        expectedAspectCount: 1
+      };
+    }
+    if (!candidateTrace || candidateTrace.length === 0) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available to validate tool_calls"],
+        expectedAspectCount: expectedToolCalls.length
+      };
+    }
+    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
+    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
+  }
+  extractExpectedToolCalls(segments) {
+    if (!segments) {
+      return [];
+    }
+    const toolCalls = [];
+    for (const segment of segments) {
+      const role = segment.role;
+      const segmentToolCalls = segment.tool_calls;
+      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
+        for (const tc of segmentToolCalls) {
+          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
+            const toolCall = tc;
+            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
+          }
+        }
+      }
+    }
+    return toolCalls;
+  }
+  validateToolCalls(expected, actual) {
+    const hits = [];
+    const misses = [];
+    for (let i = 0; i < expected.length; i++) {
+      const expectedCall = expected[i];
+      const actualCall = actual[i];
+      if (!actualCall) {
+        misses.push(
+          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
+        );
+        continue;
+      }
+      if (actualCall.name !== expectedCall.tool) {
+        misses.push(
+          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
+        );
+        continue;
+      }
+      if (expectedCall.input !== void 0) {
+        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
+          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
+          continue;
+        }
+      }
+      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
+    }
+    const totalChecks = expected.length || 1;
+    const score = hits.length / totalChecks;
+    return {
+      score,
+      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
+      hits,
+      misses,
+      expectedAspectCount: totalChecks
+    };
+  }
+  deepEquals(a, b) {
+    if (a === b) return true;
+    if (typeof a !== typeof b) return false;
+    if (typeof a !== "object" || a === null || b === null) return false;
+    if (Array.isArray(a) && Array.isArray(b)) {
+      if (a.length !== b.length) return false;
+      return a.every((val, i) => this.deepEquals(val, b[i]));
+    }
+    if (Array.isArray(a) || Array.isArray(b)) return false;
+    const aObj = a;
+    const bObj = b;
+    const aKeys = Object.keys(aObj);
+    const bKeys = Object.keys(bObj);
+    if (aKeys.length !== bKeys.length) return false;
+    return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
+  }
+};
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
 {{EVALUATOR_RESULTS_JSON}}
@@ -3851,7 +4335,7 @@ async function runEvaluation(options) {
     if (!definition) {
       return void 0;
     }
-    const resolved = resolveTargetDefinition(definition, envLookup);
+    const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
     resolvedTargetsByName.set(name, resolved);
     return resolved;
   };
@@ -4165,6 +4649,17 @@ async function runEvalCase(options) {
   if (cacheKey && cache && !cachedResponse) {
     await cache.set(cacheKey, providerResponse);
   }
+  let candidateTrace = providerResponse.trace;
+  if (!candidateTrace && providerResponse.traceRef) {
+    try {
+      const rawTrace = await readJsonFile(providerResponse.traceRef);
+      if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
+        candidateTrace = rawTrace;
+      }
+    } catch {
+    }
+  }
+  const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
   try {
     return await evaluateCandidate({
       evalCase,
@@ -4176,7 +4671,9 @@ async function runEvalCase(options) {
       nowFn,
       attempt,
       judgeProvider,
-      agentTimeoutMs
+      agentTimeoutMs,
+      candidateTrace,
+      candidateTraceSummary
     });
   } catch (error) {
     return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4193,7 +4690,9 @@ async function evaluateCandidate(options) {
     nowFn,
     attempt,
     judgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    candidateTrace,
+    candidateTraceSummary
   } = options;
   const gradeTimestamp = nowFn();
   const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4206,7 +4705,9 @@ async function evaluateCandidate(options) {
     promptInputs,
     now: gradeTimestamp,
     judgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    candidateTrace,
+    candidateTraceSummary
   });
   const completedAt = nowFn();
   let agentProviderRequest;
@@ -4219,14 +4720,12 @@ async function evaluateCandidate(options) {
   } else {
     if (promptInputs.chatPrompt) {
       lmProviderRequest = {
-        chat_prompt: promptInputs.chatPrompt,
-        guideline_paths: evalCase.guideline_paths
+        chat_prompt: promptInputs.chatPrompt
       };
     } else {
       lmProviderRequest = {
         question: promptInputs.question,
-        guidelines: promptInputs.guidelines,
-        guideline_paths: evalCase.guideline_paths
+        guidelines: promptInputs.guidelines
       };
     }
   }
@@ -4245,7 +4744,8 @@ async function evaluateCandidate(options) {
     agent_provider_request: agentProviderRequest,
     lm_provider_request: lmProviderRequest,
     evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
-    evaluator_results: evaluatorResults
+    evaluator_results: evaluatorResults,
+    trace_summary: candidateTraceSummary
   };
 }
 async function runEvaluatorsForCase(options) {
@@ -4259,7 +4759,9 @@ async function runEvaluatorsForCase(options) {
     promptInputs,
     now,
     judgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    candidateTrace,
+    candidateTraceSummary
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
     return runEvaluatorList({
@@ -4273,7 +4775,9 @@ async function runEvaluatorsForCase(options) {
       promptInputs,
       now,
       judgeProvider,
-      agentTimeoutMs
+      agentTimeoutMs,
+      candidateTrace,
+      candidateTraceSummary
     });
   }
   const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4289,7 +4793,9 @@ async function runEvaluatorsForCase(options) {
     attempt,
     promptInputs,
     now,
-    judgeProvider
+    judgeProvider,
+    candidateTrace,
+    candidateTraceSummary
   });
   return { score };
 }
@@ -4305,7 +4811,9 @@ async function runEvaluatorList(options) {
     promptInputs,
     now,
     judgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    candidateTrace,
+    candidateTraceSummary
   } = options;
   const scored = [];
   const evaluatorResults = [];
@@ -4324,11 +4832,13 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4351,11 +4861,13 @@ async function runEvaluatorList(options) {
           promptInputs,
           now
         });
-        scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: "code_judge",
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4381,6 +4893,12 @@ async function runEvaluatorList(options) {
                 cwd: evalFileDir,
                 evaluatorFactory: { create: createEvaluator }
               });
+            case "tool_trajectory":
+              return new ToolTrajectoryEvaluator({
+                config: memberConfig
+              });
+            case "expected_messages":
+              return new ExpectedMessagesEvaluator();
             default: {
               const unknownConfig = memberConfig;
               throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -4402,11 +4920,13 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4415,6 +4935,60 @@ async function runEvaluatorList(options) {
           evaluator_results: mapChildResults(score2.evaluatorResults)
         });
       }
+      if (evaluator.type === "tool_trajectory") {
+        const trajectoryEvaluator = new ToolTrajectoryEvaluator({
+          config: evaluator
+        });
+        const score2 = trajectoryEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          candidateTrace,
+          candidateTraceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
+      if (evaluator.type === "expected_messages") {
+        const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
+        const score2 = expectedMessagesEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          candidateTrace,
+          candidateTraceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       const fallbackScore = {
@@ -4426,15 +5000,18 @@ async function runEvaluatorList(options) {
         reasoning: message
       };
       const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
+      const weight = evaluator.weight ?? 1;
       scored.push({
         score: fallbackScore,
         name: evaluator.name ?? "unknown",
-        type: resultType ?? "llm_judge"
+        type: resultType ?? "llm_judge",
+        weight
       });
       evaluatorResults.push({
         name: evaluator.name ?? "unknown",
         type: resultType ?? "llm_judge",
         score: 0,
+        weight,
         verdict: "fail",
         hits: [],
         misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
@@ -4442,7 +5019,9 @@ async function runEvaluatorList(options) {
       });
     }
   }
-  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
+  const aggregateScore = scored.length > 0 ? computeWeightedMean(
+    scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
+  ) : 0;
   const hits = scored.flatMap((entry) => entry.score.hits);
   const misses = scored.flatMap((entry) => entry.score.misses);
   const expectedAspectCount = scored.reduce(
@@ -4668,6 +5247,16 @@ function mapChildResults(children) {
     evaluator_results: mapChildResults(child.evaluatorResults)
   }));
 }
+function computeWeightedMean(entries) {
+  let totalWeight = 0;
+  let weightedSum = 0;
+  for (const entry of entries) {
+    const weight = entry.weight ?? 1;
+    totalWeight += weight;
+    weightedSum += entry.score * weight;
+  }
+  return totalWeight > 0 ? weightedSum / totalWeight : 0;
+}
 // src/evaluation/generators/rubric-generator.ts
 import { generateText as generateText3 } from "ai";
@@ -4756,11 +5345,14 @@ function createAgentKernel() {
 export {
   CodeEvaluator,
   CompositeEvaluator,
+  ExpectedMessagesEvaluator,
   LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
+  ToolTrajectoryEvaluator,
   buildDirectoryChain,
   buildPromptInputs,
   buildSearchRoots,
+  computeTraceSummary,
   consumeCodexLogEntries,
   createAgentKernel,
   createProvider,
@@ -4771,14 +5363,18 @@ export {
   generateRubrics,
   getHitCount,
   isEvaluatorKind,
+  isExpectedToolCall,
   isGuidelineFile,
   isJsonObject,
   isJsonValue,
   isTestMessage,
   isTestMessageRole,
+  isTraceEvent,
+  isTraceEventType,
   listTargetNames,
   loadEvalCases,
   normalizeLineEndings,
+  readJsonFile,
   readTargetDefinitions,
   readTestSuiteMetadata,
   readTextFile,